from uc_micro.categories import Cc, Cf, P, Z from uc_micro.properties import Any SRC_ANY = Any.REGEX SRC_CC = Cc.REGEX SRC_CF = Cf.REGEX SRC_P = P.REGEX SRC_Z = Z.REGEX # \p{\Z\P\Cc\CF} (white spaces + control + format + punctuation) SRC_ZPCC = "|".join([SRC_Z, SRC_P, SRC_CC]) # \p{\Z\Cc} (white spaces + control) SRC_ZCC = "|".join([SRC_Z, SRC_CC]) # Experimental. List of chars, completely prohibited in links # because can separate it from other part of text TEXT_SEPARATORS = "[><\uff5c]" # All possible word characters (everything without punctuation, spaces & controls) # Defined via punctuation & spaces to save space # Should be something like \p{\L\N\S\M} (\w but without `_`) SRC_PSEUDO_LETTER = "(?:(?!" + TEXT_SEPARATORS + "|" + SRC_ZPCC + ")" + SRC_ANY + ")" # The same as abothe but without [0-9] # var SRC_PSEUDO_LETTER_non_d = '(?:(?![0-9]|' + SRC_ZPCC + ')' + SRC_ANY + ')' # ============================================================================= SRC_IP4 = ( "(?:(25[0-5]|2[0-4][0-9]|[01]?[0-9][0-9]?)\\.){3}(25[0-5]|" + "2[0-4][0-9]|[01]?[0-9][0-9]?)" ) # Prohibit any of "@/[]()" in user/pass to avoid wrong domain fetch. SRC_AUTH = "(?:(?:(?!" + SRC_ZCC + "|[@/\\[\\]()]).)+@)?" SRC_PORT = ( "(?::(?:6(?:[0-4]\\d{3}|5(?:[0-4]\\d{2}|5(?:[0-2]\\d|3[0-5])))|[1-5]?\\d{1,4}))?" ) # Allow anything in markdown spec, forbid quote (") at the first position # because emails enclosed in quotes are far more common SRC_EMAIL_NAME = '[\\-:&=\\+\\$,\\.a-zA-Z0-9_][\\-:&=\\+\\$,\\"\\.a-zA-Z0-9_]*' SRC_XN = "xn--[a-z0-9\\-]{1,59}" # More to read about domain names # http:#serverfault.com/questions/638260/ # Allow letters & digits (http:#test1) SRC_DOMAIN_ROOT = "(?:" + SRC_XN + "|" + SRC_PSEUDO_LETTER + "{1,63}" + ")" SRC_DOMAIN = ( "(?:" + SRC_XN + "|" + "(?:" + SRC_PSEUDO_LETTER + ")" + "|" + "(?:" + SRC_PSEUDO_LETTER + "(?:-|" + SRC_PSEUDO_LETTER + "){0,61}" + SRC_PSEUDO_LETTER + ")" + ")" ) SRC_HOST = ( "(?:" + # Don't need IP check, because digits are already allowed in normal domain names # SRC_IP4 + # '|' + "(?:(?:(?:" + SRC_DOMAIN + ")\\.)*" + SRC_DOMAIN # _root + ")" + ")" ) TPL_HOST_FUZZY = ( "(?:" + SRC_IP4 + "|" + "(?:(?:(?:" + SRC_DOMAIN + ")\\.)+(?:%TLDS%))" + ")" ) TPL_HOST_NO_IP_FUZZY = "(?:(?:(?:" + SRC_DOMAIN + ")\\.)+(?:%TLDS%))" # ============================================================================= # Rude test fuzzy links by host, for quick deny TPL_HOST_FUZZY_TEST = ( "localhost|www\\.|\\.\\d{1,3}\\.|(?:\\.(?:%TLDS%)(?:" + SRC_ZPCC + "|>|$))" ) def _re_host_terminator(opts): src_host_terminator = ( "(?=$|" + TEXT_SEPARATORS + "|" + SRC_ZPCC + ")" + "(?!" + ("-(?!--)|" if opts.get("---") else "-|") + "_|:\\d|\\.-|\\.(?!$|" + SRC_ZPCC + "))" ) return src_host_terminator def _re_src_path(opts): src_path = ( "(?:" + "[/?#]" + "(?:" + "(?!" + SRC_ZCC + "|" + TEXT_SEPARATORS + "|[()[\\]{}.,\"'?!\\-;]).|" + "\\[(?:(?!" + SRC_ZCC + "|\\]).)*\\]|" + "\\((?:(?!" + SRC_ZCC + "|[)]).)*\\)|" + "\\{(?:(?!" + SRC_ZCC + "|[}]).)*\\}|" + '\\"(?:(?!' + SRC_ZCC + '|["]).)+\\"|' + "\\'(?:(?!" + SRC_ZCC + "|[']).)+\\'|" + "\\'(?=" + SRC_PSEUDO_LETTER + "|[-])|" + "\\.{2,}[a-zA-Z0-9%/&]|" # google has many dots in "google search" links (#66, #81). # github has ... in commit range links, # ReSTRICT to # - english # - percent-encoded # - parts of file path # - params separator # until more examples found. + "\\.(?!" + SRC_ZCC + "|[.]|$)|" + ("\\-(?!--(?:[^-]|$))(?:-*)|" if opts.get("---") else "\\-+|") + ",(?!" + SRC_ZCC + "|$)|" # allow `,,,` in paths + ";(?!" + SRC_ZCC + "|$)|" # allow `,,,` in paths + "\\!+(?!" + SRC_ZCC + "|[!]|$)|" # allow `!!!` in paths, but not at the end + "\\?(?!" + SRC_ZCC + "|[?]|$)" + ")+" + "|\\/" + ")?" ) return src_path def build_re(opts): """Build regex Args: opts (dict): options Return: dict: dict of regex string """ SRC_HOST_STRICT = SRC_HOST + _re_host_terminator(opts) TPL_HOST_FUZZY_STRICT = TPL_HOST_FUZZY + _re_host_terminator(opts) SRC_HOST_PORT_STRICT = SRC_HOST + SRC_PORT + _re_host_terminator(opts) TPL_HOST_PORT_FUZZY_STRICT = TPL_HOST_FUZZY + SRC_PORT + _re_host_terminator(opts) TPL_HOST_PORT_NO_IP_FUZZY_STRICT = ( TPL_HOST_NO_IP_FUZZY + SRC_PORT + _re_host_terminator(opts) ) TPL_EMAIL_FUZZY = ( "(^|" + TEXT_SEPARATORS + '|"|\\(|' + SRC_ZCC + ")" + "(" + SRC_EMAIL_NAME + "@" + TPL_HOST_FUZZY_STRICT + ")" ) regex = { "src_Any": SRC_ANY, "src_Cc": SRC_CC, "src_Cf": SRC_CF, "src_Z": SRC_Z, "src_P": SRC_P, "src_ZPCc": SRC_ZPCC, "src_ZCc": SRC_ZCC, "src_pseudo_letter": SRC_PSEUDO_LETTER, "src_ip4": SRC_IP4, "src_auth": SRC_AUTH, "src_port": SRC_PORT, "src_host_terminator": _re_host_terminator(opts), "src_path": _re_src_path(opts), "src_email_name": SRC_EMAIL_NAME, "src_xn": SRC_XN, "src_domain_root": SRC_DOMAIN_ROOT, "src_domain": SRC_DOMAIN, "src_host": SRC_HOST, "tpl_host_fuzzy": TPL_HOST_FUZZY, "tpl_host_no_ip_fuzzy": TPL_HOST_NO_IP_FUZZY, "src_host_strict": SRC_HOST_STRICT, "tpl_host_fuzzy_strict": TPL_HOST_FUZZY_STRICT, "src_host_port_strict": SRC_HOST_PORT_STRICT, "tpl_host_port_fuzzy_strict": TPL_HOST_PORT_FUZZY_STRICT, "tpl_host_port_no_ip_fuzzy_strict": TPL_HOST_PORT_FUZZY_STRICT, # Main rules "tpl_host_fuzzy_test": TPL_HOST_FUZZY_TEST, "tpl_email_fuzzy": TPL_EMAIL_FUZZY, # Fuzzy link can't be prepended with .:/\- and non punctuation. # but can start with > (markdown blockquote) "tpl_link_fuzzy": ( "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + SRC_ZPCC + "))" + "((?![$+<=>^`|\uff5c])" + TPL_HOST_PORT_FUZZY_STRICT + _re_src_path(opts) + ")" ), # Fuzzy link can't be prepended with .:/\- and non punctuation. # but can start with > (markdown blockquote) "tpl_link_no_ip_fuzzy": ( "(^|(?![.:/\\-_@])(?:[$+<=>^`|\uff5c]|" + SRC_ZPCC + "))" + "((?![$+<=>^`|\uff5c])" + TPL_HOST_PORT_NO_IP_FUZZY_STRICT + _re_src_path(opts) + ")" ), } return regex