Lichen

lib/sre_parse.py

90:c7ddfc4525da
2016-10-08 Paul Boddie Added some support for eliminating accessor class types where the provided attributes are invoked and are unbound methods. This uses a more sophisticated method involving usage observations that incorporate invocation information, permitting classes as accessors if paths through the code support them, even if other paths require instances as accessors to invoke methods.
     1 #     2 # Secret Labs' Regular Expression Engine     3 #     4 # convert re-style regular expression to sre pattern     5 #     6 # Copyright (c) 1998-2001 by Secret Labs AB.  All rights reserved.     7 #     8 # See the sre.py file for information on usage and redistribution.     9 #    10     11 """Internal support module for sre"""    12     13 # XXX: show string offset and offending character for all errors    14     15 import sys    16     17 from sre_constants import *    18     19 SPECIAL_CHARS = ".\\[{()*+?^$|"    20 REPEAT_CHARS = "*+?{"    21     22 DIGITS = set("0123456789")    23     24 OCTDIGITS = set("01234567")    25 HEXDIGITS = set("0123456789abcdefABCDEF")    26     27 WHITESPACE = set(" \t\n\r\v\f")    28     29 ESCAPES = {    30     r"\a": (LITERAL, ord("\a")),    31     r"\b": (LITERAL, ord("\b")),    32     r"\f": (LITERAL, ord("\f")),    33     r"\n": (LITERAL, ord("\n")),    34     r"\r": (LITERAL, ord("\r")),    35     r"\t": (LITERAL, ord("\t")),    36     r"\v": (LITERAL, ord("\v")),    37     r"\\": (LITERAL, ord("\\"))    38 }    39     40 CATEGORIES = {    41     r"\A": (AT, AT_BEGINNING_STRING), # start of string    42     r"\b": (AT, AT_BOUNDARY),    43     r"\B": (AT, AT_NON_BOUNDARY),    44     r"\d": (IN, [(CATEGORY, CATEGORY_DIGIT)]),    45     r"\D": (IN, [(CATEGORY, CATEGORY_NOT_DIGIT)]),    46     r"\s": (IN, [(CATEGORY, CATEGORY_SPACE)]),    47     r"\S": (IN, [(CATEGORY, CATEGORY_NOT_SPACE)]),    48     r"\w": (IN, [(CATEGORY, CATEGORY_WORD)]),    49     r"\W": (IN, [(CATEGORY, CATEGORY_NOT_WORD)]),    50     r"\Z": (AT, AT_END_STRING), # end of string    51 }    52     53 FLAGS = {    54     # standard flags    55     "i": SRE_FLAG_IGNORECASE,    56     "L": SRE_FLAG_LOCALE,    57     "m": SRE_FLAG_MULTILINE,    58     "s": SRE_FLAG_DOTALL,    59     "x": SRE_FLAG_VERBOSE,    60     # extensions    61     "t": SRE_FLAG_TEMPLATE,    62     "u": SRE_FLAG_UNICODE,    63 }    64     65 class Pattern:    66     # master pattern object.  keeps track of global attributes    67     def __init__(self):    68         self.flags = 0    69         self.open = []    70         self.groups = 1    71         self.groupdict = {}    72         self.str = None    73     def opengroup(self, name=None):    74         gid = self.groups    75         self.groups = gid + 1    76         if name is not None:    77             ogid = self.groupdict.get(name, None)    78             if ogid is not None:    79                 raise error, ("redefinition of group name %s as group %d; "    80                               "was group %d" % (repr(name), gid,  ogid))    81             self.groupdict[name] = gid    82         self.open.append(gid)    83         return gid    84     def closegroup(self, gid):    85         self.open.remove(gid)    86     def checkgroup(self, gid):    87         return gid < self.groups and gid not in self.open    88     89 class SubPattern:    90     # a subpattern, in intermediate form    91     def __init__(self, pattern, data=None):    92         self.pattern = pattern    93         if data is None:    94             data = []    95         self.data = data    96         self.width = None    97     def dump(self, level=0):    98         nl = 1    99         seqtypes = type(()), type([])   100         for op, av in self.data:   101             print level*"  " + op,; nl = 0   102             if op == "in":   103                 # member sublanguage   104                 print; nl = 1   105                 for op, a in av:   106                     print (level+1)*"  " + op, a   107             elif op == "branch":   108                 print; nl = 1   109                 i = 0   110                 for a in av[1]:   111                     if i > 0:   112                         print level*"  " + "or"   113                     a.dump(level+1); nl = 1   114                     i = i + 1   115             elif type(av) in seqtypes:   116                 for a in av:   117                     if isinstance(a, SubPattern):   118                         if not nl: print   119                         a.dump(level+1); nl = 1   120                     else:   121                         print a, ; nl = 0   122             else:   123                 print av, ; nl = 0   124             if not nl: print   125     def __repr__(self):   126         return repr(self.data)   127     def __len__(self):   128         return len(self.data)   129     def __delitem__(self, index):   130         del self.data[index]   131     def __getitem__(self, index):   132         if isinstance(index, slice):   133             return SubPattern(self.pattern, self.data[index])   134         return self.data[index]   135     def __setitem__(self, index, code):   136         self.data[index] = code   137     def insert(self, index, code):   138         self.data.insert(index, code)   139     def append(self, code):   140         self.data.append(code)   141     def getwidth(self):   142         # determine the width (min, max) for this subpattern   143         if self.width:   144             return self.width   145         lo = hi = 0L   146         UNITCODES = (ANY, RANGE, IN, LITERAL, NOT_LITERAL, CATEGORY)   147         REPEATCODES = (MIN_REPEAT, MAX_REPEAT)   148         for op, av in self.data:   149             if op is BRANCH:   150                 i = sys.maxint   151                 j = 0   152                 for av in av[1]:   153                     l, h = av.getwidth()   154                     i = min(i, l)   155                     j = max(j, h)   156                 lo = lo + i   157                 hi = hi + j   158             elif op is CALL:   159                 i, j = av.getwidth()   160                 lo = lo + i   161                 hi = hi + j   162             elif op is SUBPATTERN:   163                 i, j = av[1].getwidth()   164                 lo = lo + i   165                 hi = hi + j   166             elif op in REPEATCODES:   167                 i, j = av[2].getwidth()   168                 lo = lo + long(i) * av[0]   169                 hi = hi + long(j) * av[1]   170             elif op in UNITCODES:   171                 lo = lo + 1   172                 hi = hi + 1   173             elif op == SUCCESS:   174                 break   175         self.width = int(min(lo, sys.maxint)), int(min(hi, sys.maxint))   176         return self.width   177    178 class Tokenizer:   179     def __init__(self, string):   180         self.string = string   181         self.index = 0   182         self.__next()   183     def __next(self):   184         if self.index >= len(self.string):   185             self.next = None   186             return   187         char = self.string[self.index]   188         if char[0] == "\\":   189             try:   190                 c = self.string[self.index + 1]   191             except IndexError:   192                 raise error, "bogus escape (end of line)"   193             char = char + c   194         self.index = self.index + len(char)   195         self.next = char   196     def match(self, char, skip=1):   197         if char == self.next:   198             if skip:   199                 self.__next()   200             return 1   201         return 0   202     def get(self):   203         this = self.next   204         self.__next()   205         return this   206     def tell(self):   207         return self.index, self.next   208     def seek(self, index):   209         self.index, self.next = index   210    211 def isident(char):   212     return "a" <= char <= "z" or "A" <= char <= "Z" or char == "_"   213    214 def isdigit(char):   215     return "0" <= char <= "9"   216    217 def isname(name):   218     # check that group name is a valid string   219     if not isident(name[0]):   220         return False   221     for char in name[1:]:   222         if not isident(char) and not isdigit(char):   223             return False   224     return True   225    226 def _class_escape(source, escape):   227     # handle escape code inside character class   228     code = ESCAPES.get(escape)   229     if code:   230         return code   231     code = CATEGORIES.get(escape)   232     if code:   233         return code   234     try:   235         c = escape[1:2]   236         if c == "x":   237             # hexadecimal escape (exactly two digits)   238             while source.next in HEXDIGITS and len(escape) < 4:   239                 escape = escape + source.get()   240             escape = escape[2:]   241             if len(escape) != 2:   242                 raise error, "bogus escape: %s" % repr("\\" + escape)   243             return LITERAL, int(escape, 16) & 0xff   244         elif c in OCTDIGITS:   245             # octal escape (up to three digits)   246             while source.next in OCTDIGITS and len(escape) < 4:   247                 escape = escape + source.get()   248             escape = escape[1:]   249             return LITERAL, int(escape, 8) & 0xff   250         elif c in DIGITS:   251             raise error, "bogus escape: %s" % repr(escape)   252         if len(escape) == 2:   253             return LITERAL, ord(escape[1])   254     except ValueError:   255         pass   256     raise error, "bogus escape: %s" % repr(escape)   257    258 def _escape(source, escape, state):   259     # handle escape code in expression   260     code = CATEGORIES.get(escape)   261     if code:   262         return code   263     code = ESCAPES.get(escape)   264     if code:   265         return code   266     try:   267         c = escape[1:2]   268         if c == "x":   269             # hexadecimal escape   270             while source.next in HEXDIGITS and len(escape) < 4:   271                 escape = escape + source.get()   272             if len(escape) != 4:   273                 raise ValueError   274             return LITERAL, int(escape[2:], 16) & 0xff   275         elif c == "0":   276             # octal escape   277             while source.next in OCTDIGITS and len(escape) < 4:   278                 escape = escape + source.get()   279             return LITERAL, int(escape[1:], 8) & 0xff   280         elif c in DIGITS:   281             # octal escape *or* decimal group reference (sigh)   282             if source.next in DIGITS:   283                 escape = escape + source.get()   284                 if (escape[1] in OCTDIGITS and escape[2] in OCTDIGITS and   285                     source.next in OCTDIGITS):   286                     # got three octal digits; this is an octal escape   287                     escape = escape + source.get()   288                     return LITERAL, int(escape[1:], 8) & 0xff   289             # not an octal escape, so this is a group reference   290             group = int(escape[1:])   291             if group < state.groups:   292                 if not state.checkgroup(group):   293                     raise error, "cannot refer to open group"   294                 return GROUPREF, group   295             raise ValueError   296         if len(escape) == 2:   297             return LITERAL, ord(escape[1])   298     except ValueError:   299         pass   300     raise error, "bogus escape: %s" % repr(escape)   301    302 def _parse_sub(source, state, nested=1):   303     # parse an alternation: a|b|c   304    305     items = []   306     itemsappend = items.append   307     sourcematch = source.match   308     while 1:   309         itemsappend(_parse(source, state))   310         if sourcematch("|"):   311             continue   312         if not nested:   313             break   314         if not source.next or sourcematch(")", 0):   315             break   316         else:   317             raise error, "pattern not properly closed"   318    319     if len(items) == 1:   320         return items[0]   321    322     subpattern = SubPattern(state)   323     subpatternappend = subpattern.append   324    325     # check if all items share a common prefix   326     while 1:   327         prefix = None   328         for item in items:   329             if not item:   330                 break   331             if prefix is None:   332                 prefix = item[0]   333             elif item[0] != prefix:   334                 break   335         else:   336             # all subitems start with a common "prefix".   337             # move it out of the branch   338             for item in items:   339                 del item[0]   340             subpatternappend(prefix)   341             continue # check next one   342         break   343    344     # check if the branch can be replaced by a character set   345     for item in items:   346         if len(item) != 1 or item[0][0] != LITERAL:   347             break   348     else:   349         # we can store this as a character set instead of a   350         # branch (the compiler may optimize this even more)   351         set = []   352         setappend = set.append   353         for item in items:   354             setappend(item[0])   355         subpatternappend((IN, set))   356         return subpattern   357    358     subpattern.append((BRANCH, (None, items)))   359     return subpattern   360    361 def _parse_sub_cond(source, state, condgroup):   362     item_yes = _parse(source, state)   363     if source.match("|"):   364         item_no = _parse(source, state)   365         if source.match("|"):   366             raise error, "conditional backref with more than two branches"   367     else:   368         item_no = None   369     if source.next and not source.match(")", 0):   370         raise error, "pattern not properly closed"   371     subpattern = SubPattern(state)   372     subpattern.append((GROUPREF_EXISTS, (condgroup, item_yes, item_no)))   373     return subpattern   374    375 _PATTERNENDERS = set("|)")   376 _ASSERTCHARS = set("=!<")   377 _LOOKBEHINDASSERTCHARS = set("=!")   378 _REPEATCODES = set([MIN_REPEAT, MAX_REPEAT])   379    380 def _parse(source, state):   381     # parse a simple pattern   382     subpattern = SubPattern(state)   383    384     # precompute constants into local variables   385     subpatternappend = subpattern.append   386     sourceget = source.get   387     sourcematch = source.match   388     _len = len   389     PATTERNENDERS = _PATTERNENDERS   390     ASSERTCHARS = _ASSERTCHARS   391     LOOKBEHINDASSERTCHARS = _LOOKBEHINDASSERTCHARS   392     REPEATCODES = _REPEATCODES   393    394     while 1:   395    396         if source.next in PATTERNENDERS:   397             break # end of subpattern   398         this = sourceget()   399         if this is None:   400             break # end of pattern   401    402         if state.flags & SRE_FLAG_VERBOSE:   403             # skip whitespace and comments   404             if this in WHITESPACE:   405                 continue   406             if this == "#":   407                 while 1:   408                     this = sourceget()   409                     if this in (None, "\n"):   410                         break   411                 continue   412    413         if this and this[0] not in SPECIAL_CHARS:   414             subpatternappend((LITERAL, ord(this)))   415    416         elif this == "[":   417             # character set   418             set = []   419             setappend = set.append   420 ##          if sourcematch(":"):   421 ##              pass # handle character classes   422             if sourcematch("^"):   423                 setappend((NEGATE, None))   424             # check remaining characters   425             start = set[:]   426             while 1:   427                 this = sourceget()   428                 if this == "]" and set != start:   429                     break   430                 elif this and this[0] == "\\":   431                     code1 = _class_escape(source, this)   432                 elif this:   433                     code1 = LITERAL, ord(this)   434                 else:   435                     raise error, "unexpected end of regular expression"   436                 if sourcematch("-"):   437                     # potential range   438                     this = sourceget()   439                     if this == "]":   440                         if code1[0] is IN:   441                             code1 = code1[1][0]   442                         setappend(code1)   443                         setappend((LITERAL, ord("-")))   444                         break   445                     elif this:   446                         if this[0] == "\\":   447                             code2 = _class_escape(source, this)   448                         else:   449                             code2 = LITERAL, ord(this)   450                         if code1[0] != LITERAL or code2[0] != LITERAL:   451                             raise error, "bad character range"   452                         lo = code1[1]   453                         hi = code2[1]   454                         if hi < lo:   455                             raise error, "bad character range"   456                         setappend((RANGE, (lo, hi)))   457                     else:   458                         raise error, "unexpected end of regular expression"   459                 else:   460                     if code1[0] is IN:   461                         code1 = code1[1][0]   462                     setappend(code1)   463    464             # XXX: <fl> should move set optimization to compiler!   465             if _len(set)==1 and set[0][0] is LITERAL:   466                 subpatternappend(set[0]) # optimization   467             elif _len(set)==2 and set[0][0] is NEGATE and set[1][0] is LITERAL:   468                 subpatternappend((NOT_LITERAL, set[1][1])) # optimization   469             else:   470                 # XXX: <fl> should add charmap optimization here   471                 subpatternappend((IN, set))   472    473         elif this and this[0] in REPEAT_CHARS:   474             # repeat previous item   475             if this == "?":   476                 min, max = 0, 1   477             elif this == "*":   478                 min, max = 0, MAXREPEAT   479    480             elif this == "+":   481                 min, max = 1, MAXREPEAT   482             elif this == "{":   483                 if source.next == "}":   484                     subpatternappend((LITERAL, ord(this)))   485                     continue   486                 here = source.tell()   487                 min, max = 0, MAXREPEAT   488                 lo = hi = ""   489                 while source.next in DIGITS:   490                     lo = lo + source.get()   491                 if sourcematch(","):   492                     while source.next in DIGITS:   493                         hi = hi + sourceget()   494                 else:   495                     hi = lo   496                 if not sourcematch("}"):   497                     subpatternappend((LITERAL, ord(this)))   498                     source.seek(here)   499                     continue   500                 if lo:   501                     min = int(lo)   502                 if hi:   503                     max = int(hi)   504                 if max < min:   505                     raise error, "bad repeat interval"   506             else:   507                 raise error, "not supported"   508             # figure out which item to repeat   509             if subpattern:   510                 item = subpattern[-1:]   511             else:   512                 item = None   513             if not item or (_len(item) == 1 and item[0][0] == AT):   514                 raise error, "nothing to repeat"   515             if item[0][0] in REPEATCODES:   516                 raise error, "multiple repeat"   517             if sourcematch("?"):   518                 subpattern[-1] = (MIN_REPEAT, (min, max, item))   519             else:   520                 subpattern[-1] = (MAX_REPEAT, (min, max, item))   521    522         elif this == ".":   523             subpatternappend((ANY, None))   524    525         elif this == "(":   526             group = 1   527             name = None   528             condgroup = None   529             if sourcematch("?"):   530                 group = 0   531                 # options   532                 if sourcematch("P"):   533                     # python extensions   534                     if sourcematch("<"):   535                         # named group: skip forward to end of name   536                         name = ""   537                         while 1:   538                             char = sourceget()   539                             if char is None:   540                                 raise error, "unterminated name"   541                             if char == ">":   542                                 break   543                             name = name + char   544                         group = 1   545                         if not isname(name):   546                             raise error, "bad character in group name"   547                     elif sourcematch("="):   548                         # named backreference   549                         name = ""   550                         while 1:   551                             char = sourceget()   552                             if char is None:   553                                 raise error, "unterminated name"   554                             if char == ")":   555                                 break   556                             name = name + char   557                         if not isname(name):   558                             raise error, "bad character in group name"   559                         gid = state.groupdict.get(name)   560                         if gid is None:   561                             raise error, "unknown group name"   562                         subpatternappend((GROUPREF, gid))   563                         continue   564                     else:   565                         char = sourceget()   566                         if char is None:   567                             raise error, "unexpected end of pattern"   568                         raise error, "unknown specifier: ?P%s" % char   569                 elif sourcematch(":"):   570                     # non-capturing group   571                     group = 2   572                 elif sourcematch("#"):   573                     # comment   574                     while 1:   575                         if source.next is None or source.next == ")":   576                             break   577                         sourceget()   578                     if not sourcematch(")"):   579                         raise error, "unbalanced parenthesis"   580                     continue   581                 elif source.next in ASSERTCHARS:   582                     # lookahead assertions   583                     char = sourceget()   584                     dir = 1   585                     if char == "<":   586                         if source.next not in LOOKBEHINDASSERTCHARS:   587                             raise error, "syntax error"   588                         dir = -1 # lookbehind   589                         char = sourceget()   590                     p = _parse_sub(source, state)   591                     if not sourcematch(")"):   592                         raise error, "unbalanced parenthesis"   593                     if char == "=":   594                         subpatternappend((ASSERT, (dir, p)))   595                     else:   596                         subpatternappend((ASSERT_NOT, (dir, p)))   597                     continue   598                 elif sourcematch("("):   599                     # conditional backreference group   600                     condname = ""   601                     while 1:   602                         char = sourceget()   603                         if char is None:   604                             raise error, "unterminated name"   605                         if char == ")":   606                             break   607                         condname = condname + char   608                     group = 2   609                     if isname(condname):   610                         condgroup = state.groupdict.get(condname)   611                         if condgroup is None:   612                             raise error, "unknown group name"   613                     else:   614                         try:   615                             condgroup = int(condname)   616                         except ValueError:   617                             raise error, "bad character in group name"   618                 else:   619                     # flags   620                     if not source.next in FLAGS:   621                         raise error, "unexpected end of pattern"   622                     while source.next in FLAGS:   623                         state.flags = state.flags | FLAGS[sourceget()]   624             if group:   625                 # parse group contents   626                 if group == 2:   627                     # anonymous group   628                     group = None   629                 else:   630                     group = state.opengroup(name)   631                 if condgroup:   632                     p = _parse_sub_cond(source, state, condgroup)   633                 else:   634                     p = _parse_sub(source, state)   635                 if not sourcematch(")"):   636                     raise error, "unbalanced parenthesis"   637                 if group is not None:   638                     state.closegroup(group)   639                 subpatternappend((SUBPATTERN, (group, p)))   640             else:   641                 while 1:   642                     char = sourceget()   643                     if char is None:   644                         raise error, "unexpected end of pattern"   645                     if char == ")":   646                         break   647                     raise error, "unknown extension"   648    649         elif this == "^":   650             subpatternappend((AT, AT_BEGINNING))   651    652         elif this == "$":   653             subpattern.append((AT, AT_END))   654    655         elif this and this[0] == "\\":   656             code = _escape(source, this, state)   657             subpatternappend(code)   658    659         else:   660             raise error, "parser error"   661    662     return subpattern   663    664 def parse(str, flags=0, pattern=None):   665     # parse 're' pattern into list of (opcode, argument) tuples   666    667     source = Tokenizer(str)   668    669     if pattern is None:   670         pattern = Pattern()   671     pattern.flags = flags   672     pattern.str = str   673    674     p = _parse_sub(source, pattern, 0)   675    676     tail = source.get()   677     if tail == ")":   678         raise error, "unbalanced parenthesis"   679     elif tail:   680         raise error, "bogus characters at end of regular expression"   681    682     if flags & SRE_FLAG_DEBUG:   683         p.dump()   684    685     if not (flags & SRE_FLAG_VERBOSE) and p.pattern.flags & SRE_FLAG_VERBOSE:   686         # the VERBOSE flag was switched on inside the pattern.  to be   687         # on the safe side, we'll parse the whole thing again...   688         return parse(str, p.pattern.flags)   689    690     return p   691    692 def parse_template(source, pattern):   693     # parse 're' replacement string into list of literals and   694     # group references   695     s = Tokenizer(source)   696     sget = s.get   697     p = []   698     a = p.append   699     def literal(literal, p=p, pappend=a):   700         if p and p[-1][0] is LITERAL:   701             p[-1] = LITERAL, p[-1][1] + literal   702         else:   703             pappend((LITERAL, literal))   704     sep = source[:0]   705     if type(sep) is type(""):   706         makechar = chr   707     else:   708         makechar = unichr   709     while 1:   710         this = sget()   711         if this is None:   712             break # end of replacement string   713         if this and this[0] == "\\":   714             # group   715             c = this[1:2]   716             if c == "g":   717                 name = ""   718                 if s.match("<"):   719                     while 1:   720                         char = sget()   721                         if char is None:   722                             raise error, "unterminated group name"   723                         if char == ">":   724                             break   725                         name = name + char   726                 if not name:   727                     raise error, "bad group name"   728                 try:   729                     index = int(name)   730                     if index < 0:   731                         raise error, "negative group number"   732                 except ValueError:   733                     if not isname(name):   734                         raise error, "bad character in group name"   735                     try:   736                         index = pattern.groupindex[name]   737                     except KeyError:   738                         raise IndexError, "unknown group name"   739                 a((MARK, index))   740             elif c == "0":   741                 if s.next in OCTDIGITS:   742                     this = this + sget()   743                     if s.next in OCTDIGITS:   744                         this = this + sget()   745                 literal(makechar(int(this[1:], 8) & 0xff))   746             elif c in DIGITS:   747                 isoctal = False   748                 if s.next in DIGITS:   749                     this = this + sget()   750                     if (c in OCTDIGITS and this[2] in OCTDIGITS and   751                         s.next in OCTDIGITS):   752                         this = this + sget()   753                         isoctal = True   754                         literal(makechar(int(this[1:], 8) & 0xff))   755                 if not isoctal:   756                     a((MARK, int(this[1:])))   757             else:   758                 try:   759                     this = makechar(ESCAPES[this][1])   760                 except KeyError:   761                     pass   762                 literal(this)   763         else:   764             literal(this)   765     # convert template to groups and literals lists   766     i = 0   767     groups = []   768     groupsappend = groups.append   769     literals = [None] * len(p)   770     for c, s in p:   771         if c is MARK:   772             groupsappend((i, s))   773             # literal[i] is already None   774         else:   775             literals[i] = s   776         i = i + 1   777     return groups, literals   778    779 def expand_template(template, match):   780     g = match.group   781     sep = match.string[:0]   782     groups, literals = template   783     literals = literals[:]   784     try:   785         for index, group in groups:   786             literals[index] = s = g(group)   787             if s is None:   788                 raise error, "unmatched group"   789     except IndexError:   790         raise error, "invalid group reference"   791     return sep.join(literals)