o
    ;/b!                     @   sn  d Z ddlZddlZddlmZ ddlmZ ddlm	Z	 ddl
mZmZmZ zeZW n ey7   eefZY nw zddlmZ W n eyO   ddlmZ Y nw zddlmZ W n eyg   ddlmZ Y nw G d	d
 d
eZzddlmZ W n	 ey   Y nw G dd deZe Zdd ZdddZ		dddZ		dddZdddZ dddZ!dd Z"e Z#dS )z?
An interface to html5lib that mimics the lxml.html interface.
    N)
HTMLParser)TreeBuilder)etree)ElementXHTML_NAMESPACE_contains_block_level_tag)urlopen)urlparsec                   @      e Zd ZdZdddZdS )r   z*An html5lib HTML parser with lxml as tree.Fc                 K      t j| f|td| d S N)stricttree)_HTMLParser__init__r   selfr   kwargs r   7/usr/lib/python3/dist-packages/lxml/html/html5parser.pyr         zHTMLParser.__init__NF__name__
__module____qualname____doc__r   r   r   r   r   r          r   )XHTMLParserc                   @   r
   )r   z+An html5lib XHTML Parser with lxml as tree.Fc                 K   r   r   )_XHTMLParserr   r   r   r   r   r   r   *   r   zXHTMLParser.__init__Nr   r   r   r   r   r   r   '   r   r   c                 C   s(   |  |}|d ur|S |  dt|f S )Nz{%s}%s)findr   )r   tagelemr   r   r   	_find_tag0   s   
r#   c                 C   s^   t | ts	td|du rt}i }|du rt | trd}|dur$||d< |j| fi | S )z
    Parse a whole document into a string.

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    string requiredNT
useChardet)
isinstance_strings	TypeErrorhtml_parserbytesparseZgetroot)htmlguess_charsetparseroptionsr   r   r   document_fromstring7   s   
r0   Fc                 C   s   t | ts	td|du rt}i }|du rt | trd}|dur$||d< |j| dfi |}|rKt |d trK|rK|d  rHtd|d  |d= |S )a`  Parses several HTML elements, returning a list of elements.

    The first item in the list may be a string.  If no_leading_text is true,
    then it will be an error if there is leading text, and it will always be
    a list of only elements.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    r$   NFr%   divr   zThere is leading text: %r)	r&   r'   r(   r)   r*   ZparseFragmentstripr   ParserError)r,   no_leading_textr-   r.   r/   childrenr   r   r   fragments_fromstringO   s$   
r6   c                 C   s   t | ts	tdt|}t| ||| d}|r;t |tsd}t|}|r9t |d tr4|d |_|d= || |S |sBt	dt
|dkrMt	d|d }|jra|j rat	d|j d	|_|S )
a  Parses a single HTML element; it is an error if there is more than
    one element, or if anything but whitespace precedes or follows the
    element.

    If 'create_parent' is true (or is a tag name) then a parent node
    will be created to encapsulate the HTML in a single element.  In
    this case, leading or trailing text is allowed.

    If `guess_charset` is true, the `chardet` library will perform charset
    guessing on the string.
    r$   )r-   r.   r4   r1   r   zNo elements found   zMultiple elements foundzElement followed by text: %rN)r&   r'   r(   boolr6   r   textextendr   r3   lentailr2   )r,   Zcreate_parentr-   r.   Zaccept_leading_textelementsZnew_rootresultr   r   r   fragment_fromstringq   s4   





r?   c                 C   s   t | ts	tdt| ||d}| dd }t |tr!|dd}|  }|ds1|dr3|S t	|d	}t
|r>|S t	|d
}t
|dkra|jrQ|j sa|d jr]|d j sa|d S t|rjd|_|S d|_|S )a  Parse the html, returning a single element/document.

    This tries to minimally parse the chunk of text, without knowing if it
    is a fragment or a document.

    'base_url' will set the document's base_url attribute (and the tree's
    docinfo.URL)

    If `guess_charset` is true, or if the input is not Unicode but a
    byte string, the `chardet` library will perform charset guessing
    on the string.
    r$   )r.   r-   N2   asciireplacez<htmlz	<!doctypeheadbodyr7   r   r1   span)r&   r'   r(   r0   r*   decodelstriplower
startswithr#   r;   r9   r2   r<   r   r!   )r,   r-   r.   docstartrC   rD   r   r   r   
fromstring   s4   



rM   c                 C   s~   |du rt }t| ts| }|du rd}nt| r#t| }|du r"d}nt| d}|du r.d}i }|r6||d< |j|fi |S )a*  Parse a filename, URL, or file-like object into an HTML document
    tree.  Note: this returns a tree, not an element.  Use
    ``parse(...).getroot()`` to get the document root.

    If ``guess_charset`` is true, the ``useChardet`` option is passed into
    html5lib to enable character detection.  This option is on by default
    when parsing from URLs, off by default when parsing from file(-like)
    objects (which tend to return Unicode more often than not), and on by
    default when parsing from a file path (which is read in binary mode).
    NFTrbr%   )r)   r&   r'   _looks_like_urlr   openr+   )Zfilename_url_or_filer-   r.   fpr/   r   r   r   r+      s&   

r+   c                 C   s<   t | d }|s
dS tjdkr|tjv rt|dkrdS dS )Nr   Fwin32r7   T)r	   sysplatformstringZascii_lettersr;   )strZschemer   r   r   rO      s   

rO   )NN)FNN)$r   rS   rU   Zhtml5libr   r   Z html5lib.treebuilders.etree_lxmlr   Zlxmlr   Z	lxml.htmlr   r   r   Z
basestringr'   	NameErrorr*   rV   Zurllib2r   ImportErrorZurllib.requestr	   Zurllib.parser   r   Zxhtml_parserr#   r0   r6   r?   rM   r+   rO   r)   r   r   r   r   <module>   sT    

"

,
6$
