o
    n~bG                  
   @   sH  d dl mZ d dlmZmZ ddlmZmZ ddlm	Z	m
Z
mZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG dd	 d	eZG d
d deZG dd deZG dd deZG dd deZG dd deZG dd deZG dd deZ dee! dee! de"fddZ#edd	 d'd!e!d"e$d#e"de$fd$d%Z%d&S )(    )	lru_cache)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thairemove_accentunicode_rangec                   @   sP   e Zd ZdZdedefddZdeddfddZdd	d
Ze	de
fddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C      t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r   @/usr/local/lib/python3.10/dist-packages/charset_normalizer/md.pyeligible      zMessDetectorPlugin.eligibleNc                 C   r   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r   r   r   r    feed$   s   zMessDetectorPlugin.feedc                 C   r   )zB
        Permit to reset the plugin to the initial state.
        r   r   r   r   r    reset+   r"   zMessDetectorPlugin.resetc                 C   r   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r$   r   r   r    ratio1   s   zMessDetectorPlugin.ratior   N)__name__
__module____qualname____doc__strboolr!   r#   r%   propertyfloatr&   r   r   r   r    r      s    
r   c                   @   V   e Zd ZdddZdedefddZdeddfdd	Zdd
dZe	de
fddZdS ) TooManySymbolOrPunctuationPluginr   Nc                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr$   r   r   r    __init__;   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C      |  S Nisprintabler   r   r   r    r!   C      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C   sp   |  j d7  _ || jkr3|tvr3t|r|  jd7  _n| du r3t|r3t|du r3|  jd7  _|| _d S )Nr   F   )	r4   r5   r   r   r2   isdigitr   r   r3   r   r   r   r    r#   F   s   


z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r2   r4   r3   r$   r   r   r    r%   X      
z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           g333333?)r4   r2   r3   )r   Zratio_of_punctuationr   r   r    r&   ]   s   

z&TooManySymbolOrPunctuationPlugin.ratior'   r(   r)   r*   r6   r,   r-   r!   r#   r%   r.   r/   r&   r   r   r   r    r1   :   s    

r1   c                   @   r0   )TooManyAccentuatedPluginr   Nc                 C      d| _ d| _d S r>   r4   _accentuated_countr$   r   r   r    r6   j      
z!TooManyAccentuatedPlugin.__init__r   c                 C   r7   r8   )isalphar   r   r   r    r!   n   r;   z!TooManyAccentuatedPlugin.eligiblec                 C   s,   |  j d7  _ t|r|  jd7  _d S d S Nr   )r4   r   rE   r   r   r   r    r#   q   s   zTooManyAccentuatedPlugin.feedc                 C   rC   r>   rD   r$   r   r   r    r%   w   rF   zTooManyAccentuatedPlugin.resetc                 C   s*   | j dkrdS | j| j  }|dkr|S dS )Nr   r@   gffffff?rD   )r   Zratio_of_accentuationr   r   r    r&   {   s
   

zTooManyAccentuatedPlugin.ratior'   rA   r   r   r   r    rB   i   s    

rB   c                   @   r0   )UnprintablePluginr   Nc                 C   rC   r>   )_unprintable_countr4   r$   r   r   r    r6      rF   zUnprintablePlugin.__init__r   c                 C      dS NTr   r   r   r   r    r!         zUnprintablePlugin.eligiblec                 C   s@   |  du r| du r|dkr|  jd7  _|  jd7  _d S )NFr   )isspacer:   rJ   r4   r   r   r   r    r#      s   

zUnprintablePlugin.feedc                 C   s
   d| _ d S r>   )rJ   r$   r   r   r    r%      s   
zUnprintablePlugin.resetc                 C      | j dkrdS | jd | j  S )Nr   r@      )r4   rJ   r$   r   r   r    r&         
zUnprintablePlugin.ratior'   rA   r   r   r   r    rI      s    

	rI   c                   @   r0   )SuspiciousDuplicateAccentPluginr   Nc                 C      d| _ d| _d | _d S r>   _successive_countr4   _last_latin_characterr$   r   r   r    r6      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r8   )rG   r   r   r   r   r    r!      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C   st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rH   )r4   rW   r   isupperrV   r   r   r   r   r    r#      s   
z$SuspiciousDuplicateAccentPlugin.feedc                 C   rT   r>   rU   r$   r   r   r    r%      r?   z%SuspiciousDuplicateAccentPlugin.resetc                 C   rP   )Nr   r@   r<   )r4   rV   r$   r   r   r    r&      rR   z%SuspiciousDuplicateAccentPlugin.ratior'   rA   r   r   r   r    rS      s    

rS   c                   @   r0   )SuspiciousRanger   Nc                 C   rT   r>   )"_suspicious_successive_range_countr4   _last_printable_seenr$   r   r   r    r6      r?   zSuspiciousRange.__init__r   c                 C   r7   r8   r9   r   r   r   r    r!      r;   zSuspiciousRange.eligiblec                 C   sx   |  j d7  _ | st|s|tv rd | _d S | jd u r"|| _d S t| j}t|}t||r7|  jd7  _|| _d S rH   )r4   rO   r   r   r[   r    is_suspiciously_successive_rangerZ   )r   r   unicode_range_aunicode_range_br   r   r    r#      s&   


zSuspiciousRange.feedc                 C   rT   r>   )r4   rZ   r[   r$   r   r   r    r%      r?   zSuspiciousRange.resetc                 C   s.   | j dkrdS | jd | j  }|dk rdS |S )Nr   r@   r<   g?)r4   rZ   )r   Zratio_of_suspicious_range_usager   r   r    r&      s   
zSuspiciousRange.ratior'   rA   r   r   r   r    rY      s    

rY   c                   @   r0   )SuperWeirdWordPluginr   Nc                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr4   _bad_character_count_buffer_buffer_accent_countr$   r   r   r    r6      s   
zSuperWeirdWordPlugin.__init__r   c                 C   rK   rL   r   r   r   r   r    r!     rM   zSuperWeirdWordPlugin.eligiblec                 C   s  |  rJd| j|g| _t|r|  jd7  _| jdu rHt|du s't|rHt|du rHt|du rHt	|du rHt
|du rHt|du rHd| _d S | jsOd S | s[t|s[t|r| jr|  jd7  _t| j}|  j|7  _|dkr| j| dkrd| _t| jd r| jd  r|  jd7  _d| _|dkr| jr|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d	| _d S |d
vr| du rt|rd| _|  j|7  _d S d S d S d S )Nr`   r   FT   g(\?   r   >   _-<=>|~)rG   joinrg   r   rh   re   r   r   r   r   r   r   rO   r   r   ra   lenr4   rd   rX   rc   rb   rf   r=   r   )r   r   Zbuffer_lengthr   r   r    r#     sx   





	


zSuperWeirdWordPlugin.feedc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nr`   Fr   )rg   rd   re   rb   ra   r4   rf   rc   r$   r   r   r    r%   D  s   
zSuperWeirdWordPlugin.resetc                 C   s$   | j dkr| jdkrdS | j| j S )N
   r   r@   )ra   rc   rf   r4   r$   r   r   r    r&   N  s   zSuperWeirdWordPlugin.ratior'   rA   r   r   r   r    r_      s    

6
r_   c                   @   sZ   e Zd ZdZdddZdedefddZdeddfd	d
ZdddZ	e
defddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                 C   rC   r>   _wrong_stop_count_cjk_character_countr$   r   r   r    r6   \  rF   zCjkInvalidStopPlugin.__init__r   c                 C   rK   rL   r   r   r   r   r    r!   `  rM   zCjkInvalidStopPlugin.eligiblec                 C   s8   |dv r|  j d7  _ d S t|r|  jd7  _d S d S )N>      丄   丅r   )rx   r   ry   r   r   r   r    r#   c  s   zCjkInvalidStopPlugin.feedc                 C   rC   r>   rw   r$   r   r   r    r%   j  rF   zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   r@   )ry   rx   r$   r   r   r    r&   n  s   
zCjkInvalidStopPlugin.ratior'   )r(   r)   r*   r+   r6   r,   r-   r!   r#   r%   r.   r/   r&   r   r   r   r    rv   V  s    

rv   c                   @   r0   )ArchaicUpperLowerPluginr   Nc                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr4   _last_alpha_seen_current_ascii_onlyr$   r   r   r    r6   v  s   
z ArchaicUpperLowerPlugin.__init__r   c                 C   rK   rL   r   r   r   r   r    r!     rM   z ArchaicUpperLowerPlugin.eligiblec                 C   s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQt
|du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr<   )rG   r
   r   r=   r   r   r   r   r~   r4   r	   rX   islower)r   r   Zis_concernedZ	chunk_sepr   r   r    r#     sF   



zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r4   r   r   r   r   r~   r   r$   r   r   r    r%     s   
zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   r@   )r4   r   r$   r   r   r    r&     s   
zArchaicUpperLowerPlugin.ratior'   rA   r   r   r   r    r}   u  s    

*	r}   r]   r^   r   c                 C   sb  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationZForms)splitr   )r]   r^   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr   r   r    r\     sj   r\   i   )maxsize皙?Fdecoded_sequencemaximum_thresholddebugc                 C   s   dd t  D }t| d }d}|dk rd}n	|dkrd}nd	}t| d
 t|D ]2\}}|D ]}	|	|r<|	| q0|dkrG|| dksM||d kr\tdd |D }||kr\ nq*|rk|D ]	}
t|
j	|
j
 qat|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S   s   g | ]}| qS r   r   ).0Zmd_classr   r   r    
<listcomp>  s    zmess_ratio.<locals>.<listcomp>r   r@   i       i   r      
r   c                 s   s    | ]}|j V  qd S r8   )r&   )r   dtr   r   r    	<genexpr>&  s    zmess_ratio.<locals>.<genexpr>   )r   __subclasses__rt   zipranger!   r#   sumprint	__class__r&   round)r   r   r   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorr   r   r   r    
mess_ratio  s:   




r   N)r   F)&	functoolsr   typingr   r   Zconstantr   r   utilsr   r	   r
   r   r   r   r   r   r   r   r   r   r   r   r   r   r1   rB   rI   rS   rY   r_   rv   r}   r,   r-   r\   r/   r   r   r   r   r    <module>   s>    D"/%6ZL
F