o
    n~b3                     @   s   d dl Z d dlmZ d dlmZ d dlmZ d dlmZ d dl	m
Z
 d dlmZmZmZmZmZmZmZ dd	lmZmZ dd
lmZ ddlmZmZmZ G dd dZG dd dZeeef Z ee  Z!G dd dZ"dS )    N)Counter)aliases)sha256)dumps)sub)AnyDictIteratorListOptionalTupleUnion   )NOT_PRINTABLE_PATTERNTOO_BIG_SEQUENCE)
mess_ratio)	iana_nameis_multi_byte_encodingunicode_rangec                   @   s$  e Zd Z	dGdededededddee fd	d
Zde	defddZ
de	defddZedefddZedefddZedefddZedefddZdefddZdefddZdHddZedefdd Zedee fd!d"Zedefd#d$Zedefd%d&Zedee fd'd(Zedefd)d*Zedefd+d,Zedefd-d.Zedefd/d0Zedefd1d2Zedefd3d4Z eded  fd5d6Z!edefd7d8Z"edee fd9d:Z#edee fd;d<Z$dId=d>Z%dId?d@Z&dJdBedefdCdDZ'edefdEdFZ(dS )KCharsetMatchNpayloadguessed_encodingmean_mess_ratiohas_sig_or_bom	languagesCoherenceMatchesdecoded_payloadc                 C   sF   || _ || _|| _|| _|| _d | _g | _d| _d | _d | _	|| _
d S )N        )_payload	_encoding_mean_mess_ratio
_languages_has_sig_or_bom_unicode_ranges_leavesZ_mean_coherence_ratio_output_payload_output_encoding_string)selfr   r   r   r   r   r    r)   D/usr/local/lib/python3.10/dist-packages/charset_normalizer/models.py__init__   s   	
zCharsetMatch.__init__otherreturnc                 C   s>   t |tstdt|jt| j| j|jko| j|jkS )Nz&__eq__ cannot be invoked on {} and {}.)
isinstancer   	TypeErrorformatstr	__class__encodingfingerprintr(   r,   r)   r)   r*   __eq__(   s   
zCharsetMatch.__eq__c                 C   sv   t |tstt| j|j }t| j|j }|dk r5|dkr5|dkr/| j|jkr/| j|jkS | j|jkS | j|jk S )zQ
        Implemented to make sorted available upon CharsetMatches items.
        g{Gz?g{Gz?r   )r.   r   
ValueErrorabschaos	coherencemulti_byte_usage)r(   r,   Zchaos_differenceZcoherence_differencer)   r)   r*   __lt__1   s   
zCharsetMatch.__lt__c                 C   s   dt t| t | j  S )N      ?)lenr1   rawr(   r)   r)   r*   r;   D   s   zCharsetMatch.multi_byte_usagec                 C   s   t dt tt| dS )z
        Check once again chaos in decoded text, except this time, with full content.
        Use with caution, this can be very slow.
        Notice: Will be removed in 3.0
        z=chaos_secondary_pass is deprecated and will be removed in 3.0r=   )warningswarnDeprecationWarningr   r1   r@   r)   r)   r*   chaos_secondary_passH   s
   z!CharsetMatch.chaos_secondary_passc                 C   s   t dt dS )zy
        Coherence ratio on the first non-latin language detected if ANY.
        Notice: Will be removed in 3.0
        z<coherence_non_latin is deprecated and will be removed in 3.0r   )rA   rB   rC   r@   r)   r)   r*   coherence_non_latinU   s
   z CharsetMatch.coherence_non_latinc                 C   s,   t dt ttdt|  }t| S )z_
        Word counter instance on decoded text.
        Notice: Will be removed in 3.0
        z2w_counter is deprecated and will be removed in 3.0 )	rA   rB   rC   r   r   r1   lowerr   split)r(   Zstring_printable_onlyr)   r)   r*   	w_countera   s
   zCharsetMatch.w_counterc                 C   s"   | j d u rt| j| jd| _ | j S )Nstrict)r'   r1   r   r   r@   r)   r)   r*   __str__o   s   
zCharsetMatch.__str__c                 C   s   d | j| jS )Nz<CharsetMatch '{}' bytes({})>)r0   r3   r4   r@   r)   r)   r*   __repr__u      zCharsetMatch.__repr__c                 C   s8   t |tr	|| krtd|jd |_| j| d S )Nz;Unable to add instance <{}> as a submatch of a CharsetMatch)r.   r   r7   r0   r2   r'   r$   appendr5   r)   r)   r*   add_submatchx   s   zCharsetMatch.add_submatchc                 C      | j S N)r   r@   r)   r)   r*   r3         zCharsetMatch.encodingc                 C   sD   g }t  D ]\}}| j|kr|| q| j|kr|| q|S )z
        Encoding name are known by many name, using this could help when searching for IBM855 when it's listed as CP855.
        )r   itemsr3   rN   )r(   Zalso_known_asupr)   r)   r*   encoding_aliases   s   


zCharsetMatch.encoding_aliasesc                 C   rP   rQ   r"   r@   r)   r)   r*   bom   rR   zCharsetMatch.bomc                 C   rP   rQ   rW   r@   r)   r)   r*   byte_order_mark   rR   zCharsetMatch.byte_order_markc                 C   s   dd | j D S )z
        Return the complete list of possible languages found in decoded sequence.
        Usually not really useful. Returned list may be empty even if 'language' property return something != 'Unknown'.
        c                 S   s   g | ]}|d  qS )r   r)   ).0er)   r)   r*   
<listcomp>       z*CharsetMatch.languages.<locals>.<listcomp>r!   r@   r)   r)   r*   r      s   zCharsetMatch.languagesc                 C   sp   | j s1d| jv r
dS ddlm}m} t| jr|| jn|| j}t|dks+d|v r-dS |d S | j d d S )z
        Most probable language found in decoded sequence. If none were detected or inferred, the property will return
        "Unknown".
        asciiZEnglishr   )encoding_languagesmb_encoding_languageszLatin BasedUnknown)r!   could_be_from_charsetZcharset_normalizer.cdr`   ra   r   r3   r>   )r(   r`   ra   r   r)   r)   r*   language   s   
zCharsetMatch.languagec                 C   rP   rQ   )r    r@   r)   r)   r*   r9      rR   zCharsetMatch.chaosc                 C   s   | j sdS | j d d S )Nr   r   r   r^   r@   r)   r)   r*   r:      s   zCharsetMatch.coherencec                 C      t | jd ddS Nd      )ndigits)roundr9   r@   r)   r)   r*   percent_chaos      zCharsetMatch.percent_chaosc                 C   re   rf   )rj   r:   r@   r)   r)   r*   percent_coherence   rl   zCharsetMatch.percent_coherencec                 C   rP   )z+
        Original untouched bytes.
        )r   r@   r)   r)   r*   r?      s   zCharsetMatch.rawc                 C   rP   rQ   )r$   r@   r)   r)   r*   submatch   rR   zCharsetMatch.submatchc                 C      t | jdkS Nr   )r>   r$   r@   r)   r)   r*   has_submatch   s   zCharsetMatch.has_submatchc                 C   s@   | j d ur| j S dd t| D }ttdd |D | _ | j S )Nc                 S   s   g | ]}t |qS r)   )r   )rZ   charr)   r)   r*   r\      s    z*CharsetMatch.alphabets.<locals>.<listcomp>c                 S   s   h | ]}|r|qS r)   r)   )rZ   rr)   r)   r*   	<setcomp>   r]   z)CharsetMatch.alphabets.<locals>.<setcomp>)r#   r1   sortedlist)r(   Zdetected_rangesr)   r)   r*   	alphabets   s   
zCharsetMatch.alphabetsc                 C   s   | j gdd | jD  S )z
        The complete list of encoding that output the exact SAME str result and therefore could be the originating
        encoding.
        This list does include the encoding available in property 'encoding'.
        c                 S   s   g | ]}|j qS r)   )r3   )rZ   mr)   r)   r*   r\      s    z6CharsetMatch.could_be_from_charset.<locals>.<listcomp>)r   r$   r@   r)   r)   r*   rc      s   z"CharsetMatch.could_be_from_charsetc                 C      | S z>
        Kept for BC reasons. Will be removed in 3.0.
        r)   r@   r)   r)   r*   first      zCharsetMatch.firstc                 C   ry   rz   r)   r@   r)   r)   r*   best   r|   zCharsetMatch.bestutf_8r3   c                 C   s2   | j du s
| j |kr|| _ t| |d| _| jS )z
        Method to get re-encoded bytes payload using given target encoding. Default to UTF-8.
        Any errors will be simply ignored by the encoder NOT replaced.
        Nreplace)r&   r1   encoder%   )r(   r3   r)   r)   r*   output  s   zCharsetMatch.outputc                 C   s   t |   S )zw
        Retrieve the unique SHA256 computed using the transformed (re-encoded) payload. Not the original one.
        )r   r   	hexdigestr@   r)   r)   r*   r4     s   zCharsetMatch.fingerprintrQ   )r,   r   r-   N)r-   r   )r~   ))__name__
__module____qualname__bytesr1   floatboolr   r+   objectr6   r<   propertyr;   rD   rE   r   rI   rK   rL   rO   r3   r
   rV   rX   rY   r   rd   r9   r:   rk   rm   r?   rn   rq   rw   rc   r{   r}   r   r4   r)   r)   r)   r*   r      s~    
	


r   c                   @   s   e Zd ZdZddee fddZdee fddZd	e	e
ef defd
dZde
fddZdefddZd	eddfddZded fddZded fddZdS )CharsetMatchesz
    Container with every CharsetMatch items ordered by default from most probable to the less one.
    Act like a list(iterable) but does not implements all related methods.
    Nresultsc                 C   s   |r	t || _d S g | _d S rQ   )ru   _results)r(   r   r)   r)   r*   r+     s   zCharsetMatches.__init__r-   c                 c   s    | j E d H  d S rQ   r   r@   r)   r)   r*   __iter__  s   zCharsetMatches.__iter__itemc                 C   sJ   t |tr
| j| S t |tr#t|d}| jD ]}||jv r"|  S qt)z
        Retrieve a single item either by its position or encoding name (alias may be used here).
        Raise KeyError upon invalid index or encoding not present in results.
        F)r.   intr   r1   r   rc   KeyError)r(   r   resultr)   r)   r*   __getitem__!  s   





zCharsetMatches.__getitem__c                 C   s
   t | jS rQ   r>   r   r@   r)   r)   r*   __len__/  s   
zCharsetMatches.__len__c                 C   ro   rp   r   r@   r)   r)   r*   __bool__2  s   zCharsetMatches.__bool__c                 C   s|   t |tstdt|jt|jtkr0| j	D ]}|j
|j
kr/|j|jkr/||  dS q| j	| t| j	| _	dS )z~
        Insert a single match. Will be inserted accordingly to preserve sort.
        Can be inserted as a submatch.
        z-Cannot append instance '{}' to CharsetMatchesN)r.   r   r7   r0   r1   r2   r>   r?   r   r   r4   r9   rO   rN   ru   )r(   r   matchr)   r)   r*   rN   5  s   


zCharsetMatches.appendr   c                 C   s   | j sdS | j d S )zQ
        Simply return the first match. Strict equivalent to matches[0].
        Nr   r   r@   r)   r)   r*   r}   I  s   
zCharsetMatches.bestc                 C   s   |   S )zP
        Redundant method, call the method best(). Kept for BC reasons.
        )r}   r@   r)   r)   r*   r{   Q  s   zCharsetMatches.firstrQ   )r   r   r   __doc__r
   r   r+   r	   r   r   r   r1   r   r   r   r   rN   r   r}   r{   r)   r)   r)   r*   r     s    r   c                   @   s~   e Zd Zdedee dee dee dedee deded	ed
ee defddZe	de
eef fddZdefddZdS )CliDetectionResultpathr3   rV   alternative_encodingsrd   rw   r   r9   r:   unicode_pathis_preferredc                 C   sF   || _ |
| _|| _|| _|| _|| _|| _|| _|| _|	| _	|| _
d S rQ   )r   r   r3   rV   r   rd   rw   r   r9   r:   r   )r(   r   r3   rV   r   rd   rw   r   r9   r:   r   r   r)   r)   r*   r+   ]  s   
zCliDetectionResult.__init__r-   c                 C   s2   | j | j| j| j| j| j| j| j| j| j	| j
dS )Nr   r3   rV   r   rd   rw   r   r9   r:   r   r   r   r@   r)   r)   r*   __dict__w  s   zCliDetectionResult.__dict__c                 C   s   t | jdddS )NT   )ensure_asciiindent)r   r   r@   r)   r)   r*   to_json  rM   zCliDetectionResult.to_jsonN)r   r   r   r1   r   r
   r   r   r+   r   r   r   r   r   r)   r)   r)   r*   r   \  s6    	

r   )#rA   collectionsr   Zencodings.aliasesr   hashlibr   jsonr   rer   typingr   r   r	   r
   r   r   r   Zconstantr   r   mdr   utilsr   r   r   r   r   r1   r   ZCoherenceMatchr   r   r)   r)   r)   r*   <module>   s"    $  	C