d9!BddlmZmZddlmZddlmZGddeZy))ListUnion) CharSetProber) ProbingStateceZdZdZdZdZdfd Zdfd Zede fdZ ede fd Z de fd Z de fd Zdefd Zdefd ZdefdZdefdZdeeddfdZdeeddfdZdeeefdefdZedefdZde fdZxZS) UTF1632Proberad This class simply looks for occurrences of zero bytes, and infers whether the file is UTF16 or UTF32 (low-endian or big-endian) For instance, files looking like ( [nonzero] )+ have a good probability to be UTF32BE. Files looking like ( [nonzero] )+ may be guessed to be UTF16BE, and inversely for little-endian varieties. gGz?returnNc t|d|_dgdz|_dgdz|_t j |_gd|_d|_ d|_ d|_ d|_ d|_ d|_|jy)NrrrrrF)super__init__position zeros_at_modnonzeros_at_modr DETECTING_statequadinvalid_utf16beinvalid_utf16leinvalid_utf32beinvalid_utf32le'first_half_surrogate_pair_detected_16be'first_half_surrogate_pair_detected_16leresetself __class__s 7/usr/lib/python3/dist-packages/chardet/utf1632prober.pyrzUTF1632Prober.__init__)s~  C!G !sQw",,   $$$$7<47<4 ct|d|_dgdz|_dgdz|_t j |_d|_d|_ d|_ d|_ d|_ d|_ gd|_y)Nrr Fr)rrrrrrrrrrrrrrrrs r!rzUTF1632Prober.reset8ss   C!G !sQw",, $$$$7<47<4  r"c|jry|jry|jry|jryy)Nzutf-32bezutf-32lezutf-16bezutf-16lezutf-16)is_likely_utf32beis_likely_utf32leis_likely_utf16beis_likely_utf16lers r! charset_namezUTF1632Prober.charset_nameFsA  ! ! #  ! ! #  ! ! #  ! ! #r"cy)Nr)s r!languagezUTF1632Prober.languageSsr"c4td|jdz S)N?g@maxrr)s r!approx_32bit_charsz UTF1632Prober.approx_32bit_charsW3 +,,r"c4td|jdz S)Nr0g@r1r)s r!approx_16bit_charsz UTF1632Prober.approx_16bit_charsZr4r"cf|j}||jk\xr|jd|z |jkDxrp|jd|z |jkDxrO|jd|z |jkDxr.|jd|z |jkDxr |j SNrr)r3MIN_CHARS_FOR_DETECTIONrEXPECTED_RATIOrrr approx_charss r!r%zUTF1632Prober.is_likely_utf32be]s..0 t;;;   a < /$2E2E E )!!!$|3d6I6II )!!!$|3d6I6II )$$Q',69L9LL )(((  r"cf|j}||jk\xr|jd|z |jkDxrp|jd|z |jkDxrO|jd|z |jkDxr.|jd|z |jkDxr |j Sr8)r3r;rr<rrr=s r!r&zUTF1632Prober.is_likely_utf32legs..0 t;;;   #l 2T5H5H H )!!!$|3d6I6II )!!!$|3d6I6II )!!!$|3d6I6II )(((  r"c"|j}||jk\xro|jd|jdz|z |jkDxr>|jd|jdz|z |jkDxr |j S)Nrr:rr9)r6r;rr<rrr=s r!r'zUTF1632Prober.is_likely_utf16beq..0 t;;;  ! !! $t';';A'> >, N!! " )""1%(9(9!(<< L!!" )(((  r"c"|j}||jk\xro|jd|jdz|z |jkDxr>|jd|jdz|z |jkDxr |j S)Nrr9rr:)r6r;rr<rrr=s r!r(zUTF1632Prober.is_likely_utf16le{rAr"rc|ddk7s)|ddkDs!|ddk(r |ddk(rd|dcxkrdkr nnd|_|ddk7s)|ddkDs!|ddk(r#|ddk(rd|dcxkrdkr ny d|_y y y y ) z Validate if the quad of bytes is valid UTF-32. UTF-32 is valid in the range 0x00000000 - 0x0010FFFF excluding 0x0000D800 - 0x0000DFFF https://en.wikipedia.org/wiki/UTF-32 rrr9Tr:N)rr)rrs r!validate_utf32_charactersz'UTF1632Prober.validate_utf32_characterss GqLAw~Q1 aA$$q'2IT2I#'D GqLAw~Q1 aA$$q'2IT2I#'D 3J r"paircz|js2d|dcxkrdkr nnd|_n9d|dcxkrdkr+nn(d|_n d|dcxkrdkr nnd|_nd|_|js3d|dcxkrdkr nnd|_y d|dcxkrdkr ny d|_y y d|dcxkr dkr d|_y d|_y ) a9 Validate if the pair of bytes is valid UTF-16. UTF-16 is valid in the range 0x0000 - 0xFFFF excluding 0xD800 - 0xFFFF with an exception for surrogate pairs, which must be in the range 0xD800-0xDBFF followed by 0xDC00-0xDFFF https://en.wikipedia.org/wiki/UTF-16 rErTrFFrN)rrrr)rrHs r!validate_utf16_charactersz'UTF1632Prober.validate_utf16_characterss;;tAw&$&?C<a(D('+$tAw&$&?D<'+$;;tAw&$&?C<a(D('+$)tAw&$&?D<'(,$r"byte_strc|D]}|jdz}||j|<|dk(rW|j|j|j|jdd|j|jdd|dk(r|j|xxdz cc<n|j |xxdz cc<|xjdz c_|j S)Nr r:rr9r)rrrGrLrrstate)rrMcmod4s r!feedzUTF1632Prober.feeds A==1$DDIIdOqy..tyy9..tyy1~>..tyy1~>Av!!$'1,'$$T*a/* MMQ M zzr"c:|jtjtjhvr |jS|j dkDr!tj|_|jS|j dkDrtj|_|jS)Ng?i)rrNOT_MEFOUND_ITget_confidencerr)s r!rOzUTF1632Prober.statesz ;;<.. 0E0EF F;;     4 '&//DK {{ ]]X %'--DK{{r"c|js0|js |js|jrdSdS)Ng333333?g)r(r'r&r%r)s r!rVzUTF1632Prober.get_confidencesH&&())+))+))+   r")r N) __name__ __module__ __qualname____doc__r;r<rrpropertystrr*r.floatr3r6boolr%r&r'r(rintrGrLrbytes bytearrayrrRrOrV __classcell__)r s@r!r r s!N  ! c  #-E--E- 4  4  4  4 (d3i(D(,,d3i,D,@ U5)#34   |      r"r N)typingrr charsetproberrenumsrr r-r"r!rgs*(F MF r"