o
    6d@K                  
   @   sf  d dl mZ d dlmZ d dlmZmZ ddlmZm	Z	m
Z
 ddlmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZmZ G dd dZG d	d
 d
eZG dd deZG dd deZG dd deZ G dd deZ!G dd deZ"G dd deZ#G dd deZ$edddee% dee% de&fddZ'ed d	"d)d#e%d$e(d%e&de(fd&d'Z)d(S )*    )	lru_cache)	getLogger)ListOptional   )COMMON_SAFE_ASCII_CHARACTERSTRACEUNICODE_SECONDARY_RANGE_KEYWORD)is_accentuatedis_asciiis_case_variableis_cjkis_emoticon	is_hangulis_hiraganais_katakanais_latinis_punctuationis_separator	is_symbolis_thaiis_unprintableremove_accentunicode_rangec                   @   sP   e Zd ZdZdedefddZdeddfddZdd	d
Ze	de
fddZdS )MessDetectorPluginzy
    Base abstract class used for mess detection plugins.
    All detectors MUST extend and implement given methods.
    	characterreturnc                 C      t )z@
        Determine if given character should be fed in.
        NotImplementedErrorselfr    r"   lC:\Users\jesus\OneDrive\Desktop\erpjis_fastapi\backend\jisbackend\Lib\site-packages\charset_normalizer/md.pyeligible$      zMessDetectorPlugin.eligibleNc                 C   r   )z
        The main routine to be executed upon character.
        Insert the logic in witch the text would be considered chaotic.
        r   r    r"   r"   r#   feed*   s   zMessDetectorPlugin.feedc                 C   r   )zB
        Permit to reset the plugin to the initial state.
        r   r!   r"   r"   r#   reset1   r%   zMessDetectorPlugin.resetc                 C   r   )z
        Compute the chaos ratio based on what your feed() has seen.
        Must NOT be lower than 0.; No restriction gt 0.
        r   r'   r"   r"   r#   ratio7   s   zMessDetectorPlugin.ratior   N)__name__
__module____qualname____doc__strboolr$   r&   r(   propertyfloatr)   r"   r"   r"   r#   r      s    
r   c                   @   V   e Zd ZdddZdedefddZdeddfdd	Zdd
dZe	de
fddZdS ) TooManySymbolOrPunctuationPluginr   Nc                 C   s"   d| _ d| _d| _d | _d| _d S )Nr   F)_punctuation_count_symbol_count_character_count_last_printable_charZ_frenzy_symbol_in_wordr'   r"   r"   r#   __init__A   s
   
z)TooManySymbolOrPunctuationPlugin.__init__r   c                 C      |  S Nisprintabler    r"   r"   r#   r$   I      z)TooManySymbolOrPunctuationPlugin.eligiblec                 C   sp   |  j d7  _ || jkr3|tvr3t|r|  jd7  _n| du r3t|r3t|du r3|  jd7  _|| _d S )Nr   F   )	r7   r8   r   r   r5   isdigitr   r   r6   r    r"   r"   r#   r&   L   s   

z%TooManySymbolOrPunctuationPlugin.feedc                 C   s   d| _ d| _d| _d S Nr   )r5   r7   r6   r'   r"   r"   r#   r(   ^      
z&TooManySymbolOrPunctuationPlugin.resetc                 C   s0   | j dkrdS | j| j | j  }|dkr|S dS )Nr           333333?)r7   r5   r6   )r!   Zratio_of_punctuationr"   r"   r#   r)   c   s   

z&TooManySymbolOrPunctuationPlugin.ratior*   r+   r,   r-   r9   r/   r0   r$   r&   r(   r1   r2   r)   r"   r"   r"   r#   r4   @   s    

r4   c                   @   r3   )TooManyAccentuatedPluginr   Nc                 C      d| _ d| _d S rA   r7   _accentuated_countr'   r"   r"   r#   r9   p      
z!TooManyAccentuatedPlugin.__init__r   c                 C   r:   r;   )isalphar    r"   r"   r#   r$   t   r>   z!TooManyAccentuatedPlugin.eligiblec                 C   s,   |  j d7  _ t|r|  jd7  _d S d S Nr   )r7   r
   rI   r    r"   r"   r#   r&   w   s   zTooManyAccentuatedPlugin.feedc                 C   rG   rA   rH   r'   r"   r"   r#   r(   }   rJ   zTooManyAccentuatedPlugin.resetc                 C   s4   | j dks
| j dk rdS | j| j  }|dkr|S dS )Nr      rC   gffffff?rH   )r!   Zratio_of_accentuationr"   r"   r#   r)      s   zTooManyAccentuatedPlugin.ratior*   rE   r"   r"   r"   r#   rF   o   s    

rF   c                   @   r3   )UnprintablePluginr   Nc                 C   rG   rA   )_unprintable_countr7   r'   r"   r"   r#   r9      rJ   zUnprintablePlugin.__init__r   c                 C      dS NTr"   r    r"   r"   r#   r$         zUnprintablePlugin.eligiblec                 C   s(   t |r|  jd7  _|  jd7  _d S rL   )r   rO   r7   r    r"   r"   r#   r&      s   zUnprintablePlugin.feedc                 C   s
   d| _ d S rA   )rO   r'   r"   r"   r#   r(      s   
zUnprintablePlugin.resetc                 C      | j dkrdS | jd | j  S )Nr   rC   rM   )r7   rO   r'   r"   r"   r#   r)         
zUnprintablePlugin.ratior*   rE   r"   r"   r"   r#   rN      s    

rN   c                   @   r3   )SuspiciousDuplicateAccentPluginr   Nc                 C      d| _ d| _d | _d S rA   _successive_countr7   _last_latin_characterr'   r"   r"   r#   r9      s   
z(SuspiciousDuplicateAccentPlugin.__init__r   c                 C   s   |  ot|S r;   )rK   r   r    r"   r"   r#   r$      s   z(SuspiciousDuplicateAccentPlugin.eligiblec                 C   st   |  j d7  _ | jd ur5t|r5t| jr5| r%| j r%|  jd7  _t|t| jkr5|  jd7  _|| _d S rL   )r7   rY   r
   isupperrX   r   r    r"   r"   r#   r&      s   

z$SuspiciousDuplicateAccentPlugin.feedc                 C   rV   rA   rW   r'   r"   r"   r#   r(      rB   z%SuspiciousDuplicateAccentPlugin.resetc                 C   rS   )Nr   rC   r?   )r7   rX   r'   r"   r"   r#   r)      rT   z%SuspiciousDuplicateAccentPlugin.ratior*   rE   r"   r"   r"   r#   rU      s    

rU   c                   @   r3   )SuspiciousRanger   Nc                 C   rV   rA   )"_suspicious_successive_range_countr7   _last_printable_seenr'   r"   r"   r#   r9      rB   zSuspiciousRange.__init__r   c                 C   r:   r;   r<   r    r"   r"   r#   r$      r>   zSuspiciousRange.eligiblec                 C   sx   |  j d7  _ | st|s|tv rd | _d S | jd u r"|| _d S t| j}t|}t||r7|  jd7  _|| _d S rL   )r7   isspacer   r   r]   r    is_suspiciously_successive_ranger\   )r!   r   unicode_range_aunicode_range_br"   r"   r#   r&      s    



zSuspiciousRange.feedc                 C   rV   rA   )r7   r\   r]   r'   r"   r"   r#   r(      rB   zSuspiciousRange.resetc                 C   s.   | j dkrdS | jd | j  }|dk rdS |S )Nr   rC   r?   g?)r7   r\   )r!   Zratio_of_suspicious_range_usager"   r"   r#   r)      s   
zSuspiciousRange.ratior*   rE   r"   r"   r"   r#   r[      s    

r[   c                   @   r3   )SuperWeirdWordPluginr   Nc                 C   s:   d| _ d| _d| _d| _d| _d| _d| _d| _d| _d S )Nr   F )	_word_count_bad_word_count_foreign_long_count_is_current_word_bad_foreign_long_watchr7   _bad_character_count_buffer_buffer_accent_countr'   r"   r"   r#   r9      s   
zSuperWeirdWordPlugin.__init__r   c                 C   rP   rQ   r"   r    r"   r"   r#   r$   	  rR   zSuperWeirdWordPlugin.eligiblec                 C   s  |  rH|  j|7  _t|r|  jd7  _| jdu rFt|du s%t|rFt|du rFt|du rFt|du rFt	|du rFt
|du rFd| _d S | jsMd S | sYt|sYt|r| jr|  jd7  _t| j}|  j|7  _|dkr| j| dkr}d| _t| jd r| jd  r|  jd7  _d| _|dkr| jrdd	 t| jtd
|D }d}|rt|| dkrd}|s|  jd7  _d| _| jr|  jd7  _|  jt| j7  _d| _d| _d| _d
| _d S |dvr| du rt|rd| _|  j|7  _d S d S d S d S )Nr   FT   g(\?   c                 S   s   g | ]
\}}|  r|qS r"   )rZ   ).0cir"   r"   r#   
<listcomp>/  s    z-SuperWeirdWordPlugin.feed.<locals>.<listcomp>r   rD   rc   >   |=~_-<>)rK   rj   r
   rk   rh   r   r   r   r   r   r   r^   r   r   rd   lenr7   rg   rZ   rf   ziprangere   ri   r@   r   )r!   r   Zbuffer_lengthZcamel_case_dstZprobable_camel_casedr"   r"   r#   r&     sr   



zSuperWeirdWordPlugin.feedc                 C   s4   d| _ d| _d| _d| _d| _d| _d| _d| _d S )Nrc   Fr   )rj   rg   rh   re   rd   r7   ri   rf   r'   r"   r"   r#   r(   M  s   
zSuperWeirdWordPlugin.resetc                 C   s$   | j dkr| jdkrdS | j| j S )N
   r   rC   )rd   rf   ri   r7   r'   r"   r"   r#   r)   W  s   zSuperWeirdWordPlugin.ratior*   rE   r"   r"   r"   r#   rb      s    

A
rb   c                   @   sZ   e Zd ZdZdddZdedefddZdeddfd	d
ZdddZ	e
defddZdS )CjkInvalidStopPluginu   
    GB(Chinese) based encoding often render the stop incorrectly when the content does not fit and
    can be easily detected. Searching for the overuse of '丅' and '丄'.
    r   Nc                 C   rG   rA   _wrong_stop_count_cjk_character_countr'   r"   r"   r#   r9   e  rJ   zCjkInvalidStopPlugin.__init__r   c                 C   rP   rQ   r"   r    r"   r"   r#   r$   i  rR   zCjkInvalidStopPlugin.eligiblec                 C   s8   |dv r|  j d7  _ d S t|r|  jd7  _d S d S )N>   u   丅u   丄r   )r   r   r   r    r"   r"   r#   r&   l  s   zCjkInvalidStopPlugin.feedc                 C   rG   rA   r   r'   r"   r"   r#   r(   s  rJ   zCjkInvalidStopPlugin.resetc                 C   s   | j dk rdS | j| j  S )N   rC   )r   r   r'   r"   r"   r#   r)   w  s   
zCjkInvalidStopPlugin.ratior*   )r+   r,   r-   r.   r9   r/   r0   r$   r&   r(   r1   r2   r)   r"   r"   r"   r#   r~   _  s    

r~   c                   @   r3   )ArchaicUpperLowerPluginr   Nc                 C   s.   d| _ d| _d| _d| _d| _d | _d| _d S )NFr   T)_buf_character_count_since_last_sep_successive_upper_lower_count#_successive_upper_lower_count_finalr7   _last_alpha_seen_current_ascii_onlyr'   r"   r"   r#   r9     s   
z ArchaicUpperLowerPlugin.__init__r   c                 C   rP   rQ   r"   r    r"   r"   r#   r$     rR   z ArchaicUpperLowerPlugin.eligiblec                 C   s$  |  ot|}|du }|rC| jdkrC| jdkr+| du r+| jdu r+|  j| j7  _d| _d| _d | _d| _|  j	d7  _	d| _d S | jdu rQt
|du rQd| _| jd ur| r_| j sh| r|| j r|| jdu rx|  jd7  _d| _nd| _nd| _|  j	d7  _	|  jd7  _|| _d S )NFr   @   r   Tr?   )rK   r   r   r@   r   r   r   r   r   r7   r   rZ   islower)r!   r   Zis_concernedZ	chunk_sepr"   r"   r#   r&     s@   




zArchaicUpperLowerPlugin.feedc                 C   s.   d| _ d| _d| _d| _d | _d| _d| _d S )Nr   FT)r7   r   r   r   r   r   r   r'   r"   r"   r#   r(     s   
zArchaicUpperLowerPlugin.resetc                 C   s   | j dkrdS | j| j  S )Nr   rC   )r7   r   r'   r"   r"   r#   r)     s   
zArchaicUpperLowerPlugin.ratior*   rE   r"   r"   r"   r#   r   ~  s    

*	r      )maxsizer`   ra   r   c                 C   sb  | du s|du r
dS | |krdS d| v rd|v rdS d| v s"d|v r$dS d| v s,d|v r6d| v s4d|v r6dS |  d| d}}|D ]}|tv rJqC||v rQ dS qC| dv |dv }}|s_|rid	| v sgd	|v ridS |ro|rodS d
| v swd
|v rd	| v sd	|v rdS | dks|dkrdS d	| v sd	|v s| dv r|dv rd| v sd|v rdS d| v sd|v rdS dS )za
    Determine if two Unicode range seen next to each other can be considered as suspicious.
    NTFZLatinZ	EmoticonsZ	Combining )HiraganaKatakanaCJKHangulzBasic Latin)r   r   PunctuationZForms)splitr	   )r`   ra   Zkeywords_range_aZkeywords_range_belZrange_a_jp_charsZrange_b_jp_charsr"   r"   r#   r_     sZ   r_   i   皙?Fdecoded_sequencemaximum_thresholddebugc              	   C   sR  dd t  D }t| d }d}|dk rd}n	|dkrd}nd	}t| d
 t|D ]2\}}|D ]}	|	|r<|	| q0|dkrG|| dksM||d kr\tdd |D }||kr\ nq*|rtd}
|
	t
d| d| d|  t| dkr|
	t
d| dd   |
	t
d| dd   |D ]}|
	t
|j d|j  qt|dS )zw
    Compute a mess ratio given a decoded bytes sequence. The maximum threshold does stop the computation earlier.
    c                 S   s   g | ]}| qS r"   r"   )ro   Zmd_classr"   r"   r#   rr     s    zmess_ratio.<locals>.<listcomp>r   rC   i       r   r      
r   c                 s   s    | ]}|j V  qd S r;   )r)   )ro   dtr"   r"   r#   	<genexpr>0  s    zmess_ratio.<locals>.<genexpr>Zcharset_normalizerzIMess-detector extended-analysis start. intermediary_mean_mess_ratio_calc=z mean_mess_ratio=z maximum_threshold=r   zStarting with: NzEnding with: iz:    )r   __subclasses__rz   r{   r|   r$   r&   sumr   logr   	__class__r)   round)r   r   r   Z	detectorslengthZmean_mess_ratioZ!intermediary_mean_mess_ratio_calcr   indexdetectorloggerr   r"   r"   r#   
mess_ratio  sN   


r   N)r   F)*	functoolsr   loggingr   typingr   r   Zconstantr   r   r	   utilsr
   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r   r4   rF   rN   rU   r[   rb   r~   r   r/   r0   r_   r2   r   r"   r"   r"   r#   <module>   sB    H"/%4eLF