o
    V=^,~                     @   s  d dl mZmZmZ d dlmZ d dlmZmZ d dl	Z	d dl
Z
d dlmZmZ d dlZddlmZmZmZmZ ddlmZ dd	lmZ ed
d eD Zedd eD Zedd eD ZeeddgB ZdZejred dkryeddks{J e
edd e d d Z!ne
eZ!h dZ"e
dZ#i Z$G dd de%Z&dd Z'G dd de%Z(G dd de(Z)G dd de*Z+G d d! d!e%Z,G d"d# d#e%Z-d$d% Z.dS )&    )absolute_importdivisionunicode_literals)	text_type)http_clienturllibN)BytesIOStringIO   )EOFspaceCharactersasciiLettersasciiUppercase)_ReparseException)_utilsc                 C      g | ]}| d qS asciiencode.0item r   7/usr/lib/python3/dist-packages/html5lib/_inputstream.py
<listcomp>       r   c                 C   r   r   r   r   r   r   r   r      r   c                 C   r   r   r   r   r   r   r   r      r      >   <u   [---﷐-﷯￾￿🿾🿿𯿾𯿿𿿾𿿿񏿾񏿿񟿾񟿿񯿾񯿿񿿾񿿿򏿾򏿿򟿾򟿿򯿾򯿿򿿾򿿿󏿾󏿿󟿾󟿿󯿾󯿿󿿾󿿿􏿾􏿿]]z"\uD800-\uDFFF">            	 
               	 
       z[	- -/:-@\[-`{-~]c                   @   sH   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dS )BufferedStreamzBuffering for streams that do not have buffering of their own

    The buffer is implemented as a list of chunks on the assumption that
    joining many strings will be slow since it is O(n**2)
    c                 C   s   || _ g | _ddg| _d S )Nr   r   )streambufferposition)selfrB   r   r   r   __init__:   s   zBufferedStream.__init__c                 C   s<   d}| j d | jd  D ]}|t|7 }q|| jd 7 }|S Nr   r
   )rC   rD   len)rE   poschunkr   r   r   tell?   s
   zBufferedStream.tellc                 C   sd   ||   ksJ |}d}t| j| |k r+|t| j| 8 }|d7 }t| j| |k s||g| _d S rG   )_bufferedBytesrH   rC   rD   )rE   rI   offsetir   r   r   seekF   s   zBufferedStream.seekc                 C   sP   | j s| |S | jd t| j kr#| jd t| j d kr#| |S | |S )Nr   r
   r   )rC   _readStreamrD   rH   _readFromBufferrE   bytesr   r   r   readO   s   


zBufferedStream.readc                 C   s   t dd | jD S )Nc                 S   s   g | ]}t |qS r   )rH   r   r   r   r   r   Y   s    z1BufferedStream._bufferedBytes.<locals>.<listcomp>)sumrC   rE   r   r   r   rL   X      zBufferedStream._bufferedBytesc                 C   s<   | j |}| j| | jd  d7  < t|| jd< |S rG   )rB   rT   rC   appendrD   rH   )rE   rS   datar   r   r   rP   [   s
   zBufferedStream._readStreamc                 C   s   |}g }| j d }| j d }|t| jk rc|dkrc|dksJ | j| }|t|| kr6|}||| g| _ nt|| }|t|g| _ |d7 }|||||   ||8 }d}|t| jk rc|dks|rm|| | d|S )Nr   r
       )rD   rH   rC   rX   rP   join)rE   rS   ZremainingBytesrvZbufferIndexZbufferOffsetZbufferedDataZbytesToReadr   r   r   rQ   b   s(   



zBufferedStream._readFromBufferN)__name__
__module____qualname____doc__rF   rK   rO   rT   rL   rP   rQ   r   r   r   r   rA   3   s    		rA   c                 K   s   t | tjst | tjjrt | jtjrd}nt| dr%t | dt	}nt | t	}|rCdd |D }|r;t
d| t| fi |S t| fi |S )NFrT   r   c                 S   s   g | ]	}| d r|qS )	_encoding)endswith)r   xr   r   r   r      s    z#HTMLInputStream.<locals>.<listcomp>z3Cannot set an encoding with a unicode input, set %r)
isinstancer   ZHTTPResponser   ZresponseZaddbasefphasattrrT   r   	TypeErrorHTMLUnicodeInputStreamHTMLBinaryInputStream)sourcekwargsZ	isUnicode	encodingsr   r   r   HTMLInputStream}   s   

rm   c                   @   sp   e Zd ZdZdZdd Zdd Zdd Zd	d
 Zdd Z	dd Z
dddZdd Zdd ZdddZdd ZdS )rh   Provides a unicode stream of characters to the HTMLTokenizer.

    This class takes care of character encoding and removing or replacing
    incorrect byte-sequences and also provides column and line tracking.

    i (  c                 C   sZ   t jsd| _ntddkr| j| _n| j| _dg| _tddf| _| 	|| _
|   dS )  Initialises the HTMLInputStream.

        HTMLInputStream(source, [encoding]) -> Normalized stream from source
        for use by html5lib.

        source can be either a file-object, local filename or a string.

        The optional encoding parameter must be a string that indicates
        the encoding.  If specified, that encoding will be used,
        regardless of any BOM or later declaration (such as in a meta
        element)

        Nu   􏿿r
   r   utf-8certain)r   supports_lone_surrogatesreportCharacterErrorsrH   characterErrorsUCS4characterErrorsUCS2ZnewLineslookupEncodingcharEncoding
openStream
dataStreamreset)rE   rj   r   r   r   rF      s   
zHTMLUnicodeInputStream.__init__c                 C   s.   d| _ d| _d| _g | _d| _d| _d | _d S )N r   )rJ   	chunkSizechunkOffseterrorsprevNumLinesprevNumCols_bufferedCharacterrV   r   r   r   rz      s   
zHTMLUnicodeInputStream.resetc                 C   s   t |dr	|}|S t|}|S zvProduces a file object from source.

        source can be either a file object, local filename or a string.

        rT   )rf   r	   rE   rj   rB   r   r   r   rx      s
   
z!HTMLUnicodeInputStream.openStreamc                 C   sZ   | j }|dd|}| j| }|dd|}|dkr#| j| }||fS ||d  }||fS )N
r   r   r
   )rJ   countr   rfindr   )rE   rM   rJ   ZnLinesZpositionLineZlastLinePosZpositionColumnr   r   r   	_position   s   

z HTMLUnicodeInputStream._positionc                 C   s   |  | j\}}|d |fS )z:Returns (line, col) of the current position in the stream.r
   )r   r}   )rE   linecolr   r   r   rD      s   zHTMLUnicodeInputStream.positionc                 C   s6   | j | jkr|  stS | j }| j| }|d | _ |S )zo Read one character from the stream or queue if available. Return
            EOF when EOF is reached.
        r
   )r}   r|   	readChunkr   rJ   )rE   r}   charr   r   r   r      s   

zHTMLUnicodeInputStream.charNc                 C   s   |d u r| j }| | j\| _| _d| _d| _d| _| j|}| j	r,| j	| }d | _	n|s0dS t
|dkrWt|d }|dksLd|  krJdkrWn n|d | _	|d d }| jr_| | |d	d
}|dd
}|| _t
|| _dS )Nr{   r   Fr
   r         i  z
r   T)_defaultChunkSizer   r|   r   r   rJ   r}   ry   rT   r   rH   ordrs   replace)rE   r|   rY   Zlastvr   r   r   r      s0   
 


z HTMLUnicodeInputStream.readChunkc                 C   s(   t tt|D ]}| jd q	d S )Ninvalid-codepoint)rangerH   invalid_unicode_refindallr~   rX   )rE   rY   _r   r   r   rt     s   z*HTMLUnicodeInputStream.characterErrorsUCS4c                 C   s   d}t |D ]Q}|rqt| }| }t|||d  r9t|||d  }|tv r6| j	
d d}q|dkrP|dkrP|t|d krP| j	
d qd}| j	
d qd S )NF   r   Tr   i  r
   )r   finditerr   groupstartr   ZisSurrogatePairZsurrogatePairToCodepointnon_bmp_invalid_codepointsr~   rX   rH   )rE   rY   skipmatchZ	codepointrI   Zchar_valr   r   r   ru   #  s$   z*HTMLUnicodeInputStream.characterErrorsUCS2Fc           
      C   s  zt ||f }W n4 ty<   	 |D ]
}t|dk sJ qddd |D }|s-d| }td|  }t ||f< Y nw g }	 || j| j}|du rT| j| j	krSn*n|
 }|| j	krl|| j| j|  || _n|| j| jd  |  s|nq@d|}	|	S )	z Returns a string of characters from the stream up to but not
        including any character in 'characters' or EOF. 'characters' must be
        a container that supports the 'in' method and iteration over its
        characters.
        T   r{   c                 S   s   g | ]}d t | qS )z\x%02x)r   )r   cr   r   r   r   H  s    z5HTMLUnicodeInputStream.charsUntil.<locals>.<listcomp>z^%sz[%s]+N)charsUntilRegExKeyErrorr   r[   recompiler   rJ   r}   r|   endrX   r   )
rE   Z
charactersZoppositecharsr   Zregexr\   mr   rr   r   r   
charsUntil:  s:   	

z!HTMLUnicodeInputStream.charsUntilc                 C   sZ   |t ur)| jdkr|| j | _|  jd7  _d S |  jd8  _| j| j |ks+J d S d S rG   )r   r}   rJ   r|   )rE   r   r   r   r   ungeti  s   
zHTMLUnicodeInputStream.ungetN)F)r]   r^   r_   r`   r   rF   rz   rx   r   rD   r   r   rt   ru   r   r   r   r   r   r   rh      s     
&
/rh   c                   @   sR   e Zd ZdZ			dddZdd Zd	d
 ZdddZdd Zdd Z	dd Z
dS )ri   rn   Nwindows-1252Tc                 C   sn   |  || _t| | j d| _d| _|| _|| _|| _|| _	|| _
| || _| jd dus1J |   dS )ro   i   d   r   N)rx   	rawStreamrh   rF   numBytesMetanumBytesChardetoverride_encodingtransport_encodingsame_origin_parent_encodinglikely_encodingdefault_encodingdetermineEncodingrw   rz   )rE   rj   r   r   r   r   r   Z
useChardetr   r   r   rF     s   zHTMLBinaryInputStream.__init__c                 C   s&   | j d j| jd| _t|  d S )Nr   r   )rw   Z
codec_infostreamreaderr   ry   rh   rz   rV   r   r   r   rz     s   zHTMLBinaryInputStream.resetc                 C   sJ   t |dr|}nt|}z
||  W |S  ty$   t|}Y |S w r   )rf   r   rO   rK   	ExceptionrA   r   r   r   r   rx     s   

z HTMLBinaryInputStream.openStreamc                 C   s  |   df}|d d ur|S t| jdf}|d d ur|S t| jdf}|d d ur,|S |  df}|d d ur:|S t| jdf}|d d urQ|d jdsQ|S t| jdf}|d d ur`|S |rzddl	m
} W n	 tys   Y n@w g }| }|js| j| j}t|tsJ |sn|| || |jr||  t|jd }| jd |d ur|dfS t| jdf}|d d ur|S tddfS )Nrq   r   Z	tentativezutf-16)UniversalDetectorencodingr   )	detectBOMrv   r   r   detectEncodingMetar   name
startswithr   Zchardet.universaldetectorr   ImportErrorZdoner   rT   r   rd   rS   rX   ZfeedcloseresultrO   r   )rE   Zchardetrw   r   ZbuffersZdetectorrC   r   r   r   r   r     sV   

z'HTMLBinaryInputStream.determineEncodingc                 C   s   | j d dks	J t|}|d u rd S |jdv r$td}|d us"J d S || j d kr5| j d df| _ d S | jd |df| _ |   td| j d |f )Nr
   rq   utf-16beutf-16lerp   r   zEncoding changed from %s to %s)rw   rv   r   r   rO   rz   r   )rE   ZnewEncodingr   r   r   changeEncoding  s   

z$HTMLBinaryInputStream.changeEncodingc              
   C   s   t jdt jdt jdt jdt jdi}| jd}t|t	sJ |
|dd }d}|s?|
|}d}|s?|
|dd	 }d	}|rK| j| t|S | jd
 dS )zAttempts to detect at BOM at the start of the stream. If
        an encoding can be determined from the BOM return the name of the
        encoding otherwise return Nonerp   r   r   zutf-32lezutf-32be   N   r   r   )codecsBOM_UTF8BOM_UTF16_LEBOM_UTF16_BEBOM_UTF32_LEBOM_UTF32_BEr   rT   rd   rS   getrO   rv   )rE   ZbomDictstringr   rO   r   r   r   r     s&   
zHTMLBinaryInputStream.detectBOMc                 C   sV   | j | j}t|tsJ t|}| j d | }|dur)|jdv r)t	d}|S )z9Report the encoding declared by the meta element
        r   Nr   rp   )
r   rT   r   rd   rS   EncodingParserrO   getEncodingr   rv   )rE   rC   parserr   r   r   r   r   3  s   z(HTMLBinaryInputStream.detectEncodingMeta)NNNNr   T)T)r]   r^   r_   r`   rF   rz   rx   r   r   r   r   r   r   r   r   ri   z  s    
*
>"ri   c                   @   s   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zeee
Zdd ZeeZefddZdd Zdd Zdd ZdS )EncodingByteszString-like object with an associated position and various extra methods
    If the position is ever greater than the string length then an exception is
    raisedc                 C   s   t |tsJ t| | S r   )rd   rS   __new__lowerrE   valuer   r   r   r   F  s   zEncodingBytes.__new__c                 C   s
   d| _ d S )Nr   )r   r   r   r   r   rF   J  s   
zEncodingBytes.__init__c                 C   s   | S r   r   rV   r   r   r   __iter__N  s   zEncodingBytes.__iter__c                 C   s<   | j d  }| _ |t| krt|dk rt| ||d  S )Nr
   r   r   rH   StopIterationrg   rE   pr   r   r   __next__Q  s   zEncodingBytes.__next__c                 C   s   |   S r   )r   rV   r   r   r   nextY  s   zEncodingBytes.nextc                 C   s@   | j }|t| krt|dk rt|d  | _ }| ||d  S rG   r   r   r   r   r   previous]  s   zEncodingBytes.previousc                 C   s   | j t| kr	t|| _ d S r   r   rH   r   )rE   rD   r   r   r   setPositionf  s   
zEncodingBytes.setPositionc                 C   s&   | j t| kr	t| j dkr| j S d S )Nr   r   rV   r   r   r   getPositionk  s
   
zEncodingBytes.getPositionc                 C   s   | | j | j d  S Nr
   )rD   rV   r   r   r   getCurrentByteu  rW   zEncodingBytes.getCurrentBytec                 C   sR   | j }|t| k r$| ||d  }||vr|| _|S |d7 }|t| k s	|| _dS )zSkip past a list of charactersr
   NrD   rH   r   rE   r   r   r   r   r   r   r   z  s   zEncodingBytes.skipc                 C   sR   | j }|t| k r$| ||d  }||v r|| _|S |d7 }|t| k s	|| _d S r   r   r   r   r   r   	skipUntil  s   zEncodingBytes.skipUntilc                 C   s(   |  || j}|r|  jt|7  _|S )zLook for a sequence of bytes at the start of a string. If the bytes
        are found return True and advance the position to the byte after the
        match. Otherwise return False and leave the position alone)r   rD   rH   )rE   rS   r\   r   r   r   
matchBytes  s   zEncodingBytes.matchBytesc                 C   s6   z|  || jt| d | _W dS  ty   tw )zLook for the next sequence of bytes matching a given sequence. If
        a match is found advance the position to the last byte of the matchr
   T)indexrD   rH   r   
ValueErrorr   rR   r   r   r   jumpTo  s   zEncodingBytes.jumpToN)r]   r^   r_   r`   r   rF   r   r   r   r   r   r   propertyrD   r   currentBytespaceCharactersBytesr   r   r   r   r   r   r   r   r   B  s"    	
	r   c                   @   sX   e Zd ZdZdd Zdd Zdd Zdd	 Zd
d Zdd Z	dd Z
dd Zdd ZdS )r   z?Mini parser for detecting character encoding from meta elementsc                 C   s   t || _d| _dS )z3string - the data to work on for encoding detectionN)r   rY   r   rE   rY   r   r   r   rF     s   

zEncodingParser.__init__c              
   C   s   d| j vrd S d| jfd| jfd| jfd| jfd| jfd| jff}| j D ]?}d}z| j d W n ty<   Y  | j	S w |D ]\}}| j |r\z| }W  n ty[   d}Y  nw q?|sc | j	S q$| j	S )	Ns   <metas   <!--s   </s   <!s   <?r   TF)
rY   handleComment
handleMetahandlePossibleEndTaghandleOtherhandlePossibleStartTagr   r   r   r   )rE   ZmethodDispatchr   ZkeepParsingkeymethodr   r   r   r     s@   

zEncodingParser.getEncodingc                 C      | j dS )zSkip over commentss   -->rY   r   rV   r   r   r   r     s   zEncodingParser.handleCommentc                 C   s   | j jtvrdS d}d }	 |  }|d u rdS |d dkr/|d dk}|r.|d ur.|| _dS n?|d dkrG|d }t|}|d urF|| _dS n'|d dkrntt|d }| }|d urnt|}|d urn|rl|| _dS |}q)	NTFr   s
   http-equivr
   s   content-type   charsets   content)	rY   r   r   getAttributer   rv   ContentAttrParserr   parse)rE   Z	hasPragmaZpendingEncodingattrZtentativeEncodingcodecZcontentParserr   r   r   r     s@   zEncodingParser.handleMetac                 C   s
   |  dS )NF)handlePossibleTagrV   r   r   r   r     s   
z%EncodingParser.handlePossibleStartTagc                 C   s   t | j | dS )NT)r   rY   r   rV   r   r   r   r     s   

z#EncodingParser.handlePossibleEndTagc                 C   sj   | j }|jtvr|r|  |   dS |t}|dkr#|  dS |  }|d ur3|  }|d us+dS )NTr   )rY   r   asciiLettersBytesr   r   r   spacesAngleBracketsr   )rE   ZendTagrY   r   r   r   r   r   r     s   

z EncodingParser.handlePossibleTagc                 C   r   )Nr   r   rV   r   r   r   r     s   zEncodingParser.handleOtherc                 C   s  | j }|ttdgB }|du st|dksJ |dv rdS g }g }	 |dkr+|r+n0|tv r4| }n'|dv r?d|dfS |tv rK||  n|du rQdS || t	|}q$|dkrj|
  d|dfS t	| | }|d	v r|}	 t	|}||krt	| d|d|fS |tv r||  n|| qy|d
krd|dfS |tv r||  n|du rdS || 	 t	|}|tv rd|d|fS |tv r||  n|du rdS || q)z_Return a name,value pair for the next attribute in the stream,
        if one is found, or None   /Nr
   )r   NT   =)r  r   rZ   )   '   "r   )rY   r   r   	frozensetrH   r[   asciiUppercaseBytesrX   r   r   r   r   )rE   rY   r   ZattrNameZ	attrValueZ	quoteCharr   r   r   r     sn   



zEncodingParser.getAttributeN)r]   r^   r_   r`   rF   r   r   r   r   r   r   r   r   r   r   r   r   r     s    $r   c                   @   s   e Zd Zdd Zdd ZdS )r   c                 C   s   t |tsJ || _d S r   )rd   rS   rY   r   r   r   r   rF   a  s   
zContentAttrParser.__init__c                 C   s  zy| j d | j  jd7  _| j   | j jdksW d S | j  jd7  _| j   | j jdv rS| j j}| j  jd7  _| j j}| j |rP| j || j j W S W d S | j j}z| j t | j || j j W W S  tyy   | j |d   Y W S w  ty   Y d S w )Nr   r
   r  )r  r  )rY   r   rD   r   r   r   r   r   )rE   Z	quoteMarkZoldPositionr   r   r   r   e  s2   

zContentAttrParser.parseN)r]   r^   r_   rF   r   r   r   r   r   r   `  s    r   c                 C   s\   t | trz| d} W n
 ty   Y dS w | dur,zt| W S  ty+   Y dS w dS )z{Return the python codec name corresponding to an encoding or None if the
    string doesn't correspond to a valid encoding.r   N)rd   rS   decodeUnicodeDecodeErrorwebencodingslookupAttributeError)r   r   r   r   rv     s   
rv   )/Z
__future__r   r   r   Zsixr   Z	six.movesr   r   r   r   ior   r	   r	  Z	constantsr   r   r   r   r   r{   r   r  r   r   r  r   Zinvalid_unicode_no_surrogaterr   r   r   evalr   r   Zascii_punctuation_rer   objectrA   rm   rh   ri   rS   r   r   r   rv   r   r   r   r   <module>   sJ    

J g Ib ='