^cXtddlZddlZddlZddlZddlmZddlmZmZmZddl m Z ej r.ddl mZmZmZmZmZmZmZeeeeeefD]ZesJedd Zejd Z e jZe jZe jZe jZejZeZ d d !DZ"e#d e"DsJdZ$ej%Z&e&rej'Z(n#dej()DZ(dddZ*ej+rde*d< e(,dZ-e-s:ej./dpdZ-e*,e-e-Z-n#ej0$rdZ-YnwxYwe(,ddZ1dZ2Gdde3Z4Gdde3Z5dZ6dZ7dZ8dZ9d Z:e8Z;e9Zd"Z?d#Z@e&s{ej+rGd$d%eAZBeBZ(ej()D]B\ZCZDe6eD/d&e(e6eC/d&<CejEd'ZFej+rd(ZGnejHZGee(,d)d*d+krd,pd-ZId.ZJd/ZKd0ZLdN)getattr)errorpolicypycompat) charencode)AnyCallableListTextTypeTypeVarUnion _Tlocalstrlocalstr)boundrcng|]2}tt|dd3S)utf-8)unichrintencode).0xs 4/usr/lib/python3/dist-packages/mercurial/encoding.py r6sF     3q"::g&&   sO200c 200d 200e 200f 202a 202b 202c 202d 202e 206a 206b 206c 206d 206e 206f feffc#@K|]}|dVdS))N) startswith)ris r r#<s/ = =1<<* + + = = = = = =rcVd|vsd|vr tD]}||d}|S)uRemove codepoints ignored by HFS+ from s. >>> hfsignoreclean(u'.h‌g'.encode('utf-8')) '.hg' >>> hfsignoreclean(u'.hg'.encode('utf-8')) '.hg' rr r)_ignorereplace)scs rhfsignorecleanr)?s>!||w!|| " "A !S!!AA Hrcfi|].\}}|d|d/S)r)r)rkvs r r-VsF Aq 188G,,rsascii)s646sANSI_X3.4-1968sutf-8scp65001s HGENCODINGasciisHGENCODINGMODEsstricts ISO-8859-1c<eZdZdZdZejrfdZdZxZ S)rzdThis class allows strings that are unmodified to be round-tripped to the local encoding and backcJt||}||_|SN)bytes__new___utf8)clsulr's rr3zlocalstr.__new__ts! MM#q ! !rcftt||||_dSr1)superr__init__r4)selfr6r7 __class__s rr:zlocalstr.__init__{s, (D ! ! * *1 - - -DJJJrc*t|jSr1)hashr4)r;s r__hash__zlocalstr.__hash__sDJr) __name__ __module__ __qualname____doc__r3r TYPE_CHECKINGr:r? __classcell__r<s@rrrpsm44              rceZdZdZdS) safelocalstraDTagged string denoting it was previously an internal UTF-8 string, and can be converted back to UTF-8 losslessly >>> assert safelocalstr(b'\xc3') == b'\xc3' >>> assert b'\xc3' == safelocalstr(b'\xc3') >>> assert b'\xc3' in {safelocalstr(b'\xc3'): 0} >>> assert safelocalstr(b'\xc3') in {b'\xc3': 0} N)r@rArBrCrrrHrHsrrHct|r|S |d}tdkr|S|t td}||t tkrt |St ||S#t$r |t t}|t td}||t tkrt |cYSt |d|cYS#t$rC|dd}|t tdcYcYSwxYwwxYw#t$r-}tj tj |dd}~wwxYw)a Convert a string from internal UTF-8 to local encoding All internal strings should be UTF-8 but some repos before the implementation of locale support may contain latin1 or possibly other character sets. We attempt to decode everything strictly using UTF-8, then Latin-1, and failing that, we use UTF-8 and replace unknown characters. The localstr class is used to cache the known UTF-8 encoding of strings next to their local representation to allow lossless round-trip conversion back to UTF-8. >>> u = b'foo: \xc3\xa4' # utf-8 >>> l = tolocal(u) >>> l 'foo: ?' >>> fromlocal(l) 'foo: \xc3\xa4' >>> u2 = b'foo: \xc3\xa1' >>> d = { l: 1, tolocal(u2): 2 } >>> len(d) # no collision 2 >>> b'foo: ?' in d False >>> l1 = b'foo: \xe4' # historical latin1 fallback >>> l = tolocal(l1) >>> l 'foo: ?' >>> fromlocal(l) # magically in utf-8 'foo: \xc3\xa4' zUTF-8sUTF-8r&r!please check your locale settingshintN) isasciistrdecodeencodingr_sysstrrHrUnicodeDecodeErrorfallbackencoding LookupErrorrAbortrbytestr)r'r6rr+s rtolocalrXsF!}}  >!!A8##**I66AAHHWX..////#A&Aq>> !! > > > >HHW%56677HHWX.. ::!2!23333'??*** 1 1155555% > > >HHWi00xx 1 19======= > >    k  Q  &J     ss!B(A!B(B(( F54BE$<F5=F8?"E$!F5"F8$AF1,F5.F80F11F55F88 G/(G**G/c0t|tr|jSt|r|S |t t t t}|dS#t$rX}|td|j dz |j dz}tj d|tj|fzd}~wt $r-}tj tj|dd}~wwxYw)a Convert a string from the local character encoding to UTF-8 We attempt to decode strings using the encoding mode set by HGENCODINGMODE, which defaults to 'strict'. In this mode, unknown characters will cause an error message. Other modes include 'replace', which replaces unknown characters with a special Unicode character, and 'ignore', which drops the character. rr sdecoding near '%s': %s!NrKrL) isinstancerr4rNrOrQrP encodingmoderrRmaxstartrrUrrVrT)r'r6instsubr+s r fromlocalras!Xw!}}  HHWX&& (=(= > >xx      AtzB''$*r/9:k &#x/?/E/E)F F       k  Q  &J     s%AA>> DAC D((DDcFt|dS)z;Convert a unicode string to a byte string of local encodingr)rXr)r6s r unitolocalrcs 188G$$ % %%rcFt|dS)z;Convert a byte string of local encoding to a unicode stringr)rarOr's r unifromlocalrfs Q<<  w ' ''rcfd}|S)z^Create a proxy method that forwards __unicode__() and __str__() of Python 3 to __bytes__()c4t|Sr1)rf)obj bytesfuncs runifunczunimethod..unifuncsIIcNN+++rrI)rjrks` r unimethodrls# ,,,,, NrcP t|S#t$rYnwxYw t|tr|jd}n:|t tt t}| }||kr|S| t tS#t$r| cYSt$r-}tjtj|dd}~wwxYw)9best-effort encoding-aware case-folding of local string srrKrLN) asciilowerrRr[rr4rOrQrPr\lowerr UnicodeErrorrTrrUrrV)r'r6lur+s rrprps' !}}       a " " Cw''AA**GL,A,ABBA WWYY 77Hyy**+++ wwyy    k  Q  &J     s0 BC(&CD%/ D%8(D  D%c` t|S#t$rt|cYSwxYw)rn) asciiupperrR upperfallbackres rupperrv,sB !}}    Q s --c t|tr|jd}n:|t t t t }|}||kr|S|t t S#t$r|cYSt$r-}tj tj|dd}~wwxYw)NrrKrL)r[rr4rOrQrPr\rvrrqrTrrUrrV)r'r6uur+s rruru5s a " " Cw''AA**GL,A,ABBA WWYY 77Hyy**+++ wwyy    k  Q  &J     s$BB/&B//D D(DDc$eZdZdZdfd ZxZS)WindowsEnvironzE`os.environ` normalizes environment variables to uppercase on windowsNcbtt||Sr1)r9getrv)r;keydefaultr<s rr|zWindowsEnviron.getQs!ww{{5::w777rr1)r@rArBrCr|rErFs@rrzrzNsC W W 8 8 8 8 8 8 8 8 8 8rrzrs^[a-z]:c tj}tj|}t |}t |r'|dd|ddz}|S)Nrr)osgetcwdpathrealpath strtolocalDRIVE_REmatchrv)cwds rrrmsjikkgs##oo >>#   -ac(..""SW,C rsHGENCODINGAMBIGUOUSsnarrowswidesWFAsWFclt|ttdS)zCFind the column width of a string for display in the local encodingr&) ucolwidthrOrQrPres rcolwidthrs( QXXgh//;; < <.s/:::1CCFFeO).Q:::r)r unicodedatasumlen)drs @rrrsJ +14 8 8C :::::::;;; q66Mrct||zt|D]#}|||}t||kr|cS$td)zOUse colwidth to find a c-column substring of s starting at byte index startzsubstring not found)rangerr ValueError)r'r^r(rts rgetcolsrsb519c!ff % % eAgJ A;;!  HHH  * + ++rrFcN |tt}nz#t$rmt ||kr|cYS|t |z}|dkr|d|t |zcYS|r||| dzcYS|d||zcYSwxYwt ||kr|S|t |z}|dkr|d|t |zSt |}|r|d}t|D]\}}|t |z }||krn |d|}|r|d | tt}|r||zS||zS)uTrim string 's' to at most 'width' columns (including 'ellipsis'). If 'leftside' is True, left side of string 's' is trimmed. 'ellipsis' is always placed at trimmed side. >>> from .node import bin >>> def bprint(s): ... print(pycompat.sysstr(s)) >>> ellipsis = b'+++' >>> from . import encoding >>> encoding.encoding = b'utf-8' >>> t = b'1234567890' >>> bprint(trim(t, 12, ellipsis=ellipsis)) 1234567890 >>> bprint(trim(t, 10, ellipsis=ellipsis)) 1234567890 >>> bprint(trim(t, 8, ellipsis=ellipsis)) 12345+++ >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) +++67890 >>> bprint(trim(t, 8)) 12345678 >>> bprint(trim(t, 8, leftside=True)) 34567890 >>> bprint(trim(t, 3, ellipsis=ellipsis)) +++ >>> bprint(trim(t, 1, ellipsis=ellipsis)) + >>> u = u'あいうえお' # 2 x 5 = 10 columns >>> t = u.encode(pycompat.sysstr(encoding.encoding)) >>> bprint(trim(t, 12, ellipsis=ellipsis)) あいうえお >>> bprint(trim(t, 10, ellipsis=ellipsis)) あいうえお >>> bprint(trim(t, 8, ellipsis=ellipsis)) あい+++ >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) +++えお >>> bprint(trim(t, 5)) あい >>> bprint(trim(t, 5, leftside=True)) えお >>> bprint(trim(t, 4, ellipsis=ellipsis)) +++ >>> bprint(trim(t, 4, ellipsis=ellipsis, leftside=True)) +++ >>> t = bin(b'112233445566778899aa') # invalid byte sequence >>> bprint(trim(t, 12, ellipsis=ellipsis)) "3DUfwˆ™ª >>> bprint(trim(t, 10, ellipsis=ellipsis)) "3DUfwˆ™ª >>> bprint(trim(t, 8, ellipsis=ellipsis)) "3DU+++ >>> bprint(trim(t, 8, ellipsis=ellipsis, leftside=True)) +++fwˆ™ª >>> bprint(trim(t, 8)) "3DUfwˆ >>> bprint(trim(t, 8, leftside=True)) 3DUfwˆ™ª >>> bprint(trim(t, 3, ellipsis=ellipsis)) +++ >>> bprint(trim(t, 1, ellipsis=ellipsis)) + rN) rOrQrPrRrrlistreverse enumeratejoinr) r'widthellipsisleftsider6chars width_so_farr"r(s rtrimrsD $ HHWX&& ' ' $$$ q66U??HHH X A::3ec(mm334 4 4 4  )aj( ( ( (%y8####$||u S]]E zz/%#h--//00 GGE L%  1 ! $ %   E "1"IE  wx0011A!| x<s!'*B! 2B!?B! B! B!ceZdZdZdZdZdZdS) normcasespecsaxwhat a platform's normcase does to ASCII strings This is specified per platform, and should be consistent with what normcase on that platform actually does. lower: normcase lowercases ASCII strings upper: normcase uppercases ASCII strings other: the fallback function should always be called This should be kept in sync with normcase_spec in util.h.rrN)r@rArBrCrprvotherrIrrrrs+ A A E E EEErrct|} t||S#t$rYnwxYwtj||S)areturns a string suitable for JSON JSON is problematic for us because it doesn't support non-Unicode bytes. To deal with this, we take the following approach: - localstr/safelocalstr objects are converted back to UTF-8 - valid UTF-8/ASCII strings are passed as-is - other strings are converted to UTF-8b surrogate encoding - apply JSON-specified string escaping (escapes are doubled in these tests) >>> jsonescape(b'this is a test') 'this is a test' >>> jsonescape(b'escape characters: \0 \x0b \x7f') 'escape characters: \\u0000 \\u000b \\u007f' >>> jsonescape(b'escape characters: \b \t \n \f \r \" \\') 'escape characters: \\b \\t \\n \\f \\r \\" \\\\' >>> jsonescape(b'a weird byte: \xdd') 'a weird byte: \xed\xb3\x9d' >>> jsonescape(b'utf-8: caf\xc3\xa9') 'utf-8: caf\xc3\xa9' >>> jsonescape(b'') '' If paranoid, non-ascii and common troublesome characters are also escaped. This is suitable for web output. >>> s = b'escape characters: \0 \x0b \x7f' >>> assert jsonescape(s) == jsonescape(s, paranoid=True) >>> s = b'escape characters: \b \t \n \f \r \" \\' >>> assert jsonescape(s) == jsonescape(s, paranoid=True) >>> jsonescape(b'escape boundary: \x7e \x7f \xc2\x80', paranoid=True) 'escape boundary: ~ \\u007f \\u0080' >>> jsonescape(b'a weird byte: \xdd', paranoid=True) 'a weird byte: \\udcdd' >>> jsonescape(b'utf-8: caf\xc3\xa9', paranoid=True) 'utf-8: caf\\u00e9' >>> jsonescape(b'non-BMP: \xf0\x9d\x84\x9e', paranoid=True) 'non-BMP: \\ud834\\udd1e' >>> jsonescape(b'', paranoid=True) '\\u003cfoo@example.org\\u003e' )toutf8b_jsonescapeu8fastrcharencodepurejsonescapeu8fallback)r'paranoidu8charss r jsonescapersX\ajjG  (333       .w A AAs ! .. surrogatepass)rrrrrrrrrrrrrrctt|||dzdz }|s |||dzS||||z}|dt|S)zget the next full utf-8 character in the given string, starting at pos Raises a UnicodeError if the given location does not start a valid utf-8 character. rrr)_utf8lenordrO _utf8strict)r'posr7r(s r getutf8charrOsn QsS1W}%&&!+,A  sQw #a-AHHWk""" Hrct|tr|jSt|trt |St |r|Sd|vr. |dt|S#t$rYnwxYwtj |}d}d}t|}||kr t||}d|cxkrdkrGnnDtdt||zdt}|dz }n|t|z }nS#t$rFtdt||zdt}|dz }YnwxYw||z }||k|S) aBconvert a local, possibly-binary string into UTF-8b This is intended as a generic method to preserve data when working with schemes like JSON and XML that have no provision for arbitrary byte strings. As Mercurial often doesn't know what encoding data is in, we use so-called UTF-8b. If a string is already valid UTF-8 (or ASCII), it passes unmodified. Otherwise, unsupported bytes are mapped to UTF-16 surrogate range, uDC00-uDCFF. Principles of operation: - ASCII and UTF-8 data successfully round-trips and is understood by Unicode-oriented clients - filenames and file contents in arbitrary other encodings can have be round-tripped or recovered by clueful clients - local strings that have a cached known UTF-8 encoding (aka localstr) get sent as UTF-8 so Unicode-oriented clients get the Unicode data they want - non-lossy local strings (aka safelocalstr) get sent as UTF-8 as well - because we must preserve UTF-8 bytestring in places such as filenames, metadata can't be roundtripped without help (Note: "UTF-8b" often refers to decoding a mix of valid UTF-8 and arbitrary bytes into an internal Unicode format that can be re-encoded back into the original. Here we are exposing the internal surrogate encoding as a UTF-8 string.) rrrir)r[rr4rHrarNrOrrRrrVrrrrrr'rWrr7r(s rrrbs@!X w A| $ $|| Aa  HHWk * * *H!    D  A A C AA '' As##A!6666666666C#KK/0077MMqs1vv !   vAcF +,,33G[IIA 1HCCC  Q '' Hs&A44 BB2A6D))A E98E9ct|r|Sd|vr|Stj|}d}d}t|}||krzt ||}|t|z }d|cxkrdkr@nn=tjt |dtdz}||z }||kz|S)aWGiven a UTF-8b string, return a local, possibly-binary string. return the original binary string. This is a round-trip process for strings like filenames, but metadata that's was passed through tolocal will remain in UTF-8. >>> roundtrip = lambda x: fromutf8b(toutf8b(x)) == x >>> m = b"\xc3\xa9\x99abcd" >>> toutf8b(m) '\xc3\xa9\xed\xb2\x99abcd' >>> roundtrip(m) True >>> roundtrip(b"\xc2\xc2\x80") True >>> roundtrip(b"\xef\xbf\xbd") True >>> roundtrip(b"\xef\xef\xbf\xbd") True >>> roundtrip(b"\xf1\x80\x80\x80\x80") True rrrrrr) rNrrVrrbytechrrrOrrs r fromutf8brs0!}}a A A C AA '' 3   s1vv  a 2 2 2 2? 2 2 2 2 2 QXXg{%C%C!D!Dt!KLLA Q '' Hr)rF)F)UlocalerrerrrrrrpurerrrDtypingr r r r r rrrr importmodrNrortjsonescapeu8fastrsysstrrQchrrsplitr%allr)supports_bytes_environ_nativeenvironenvironbenvironitems_encodingrewrites iswindowsr|rPgetpreferredencodingrErrorr\rSr2rrHrXrarcrfrlr strfromlocal strmethodrprvrudictrzr+r,compilerrgetcwdbrrrrrrrrrrrrrIrrrs  /..... 98T4u 5Z888J V l + +  "  "  " / /    **/%''     s = =W = = ======    "*kGGJ$$&&G  -$,j!{{=))H =.6.0077@@LH$((8<< |HHH{{,i88       u   (5B B B J   @&&& (((       2      (I# 8 8 8 8 8T 8 8 8 !.""   ""II1.5gahhw6G6G.H.H))**++ 2:j ! ! ZF  KK& 22g=    === ,,,ccccL"3B3B3B3Bp ; ; ;   &B B B J. . . . . sAE!! E0/E0