Lichen

pyparser/pytokenizer.py

changeset 1027:dd0745ab8b8a
Paul Boddie, 5 months ago: Reordered GCC arguments to prevent linking failures. Someone decided to change the GCC invocation or linking semantics at some point, meaning that libraries specified "too early" in the argument list no longer provide the symbols required by the program objects, whereas specifying them at the end of the argument list allows those symbols to be found and obtained.
from pyparser import automata
from pyparser.pygram import tokens
from pyparser.pytoken import python_opmap
from pyparser.error import TokenError, TokenIndentationError
from pyparser.pytokenize import tabsize, whiteSpaceDFA, \
    triple_quoted, endDFAs, single_quoted, pseudoDFA
from pyparser import consts

NAMECHARS = 'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_'
NUMCHARS = '0123456789'
ALNUMCHARS = NAMECHARS + NUMCHARS
EXTENDED_ALNUMCHARS = ALNUMCHARS + '-.'
WHITESPACES = ' \t\n\r\v\f'

def match_encoding_declaration(comment):
    """returns the declared encoding or None

    This function is a replacement for:
    >>> py_encoding = re.compile(r"coding[:=]\s*([-\w.]+)")
    >>> py_encoding.search(comment)
    """
    index = comment.find('coding')
    if index < 0:
        return None
    next_char = comment[index + 6]
    if next_char not in ':=':
        return None
    end_of_decl = comment[index + 7:]
    index = 0
    for char in end_of_decl:
        if char not in WHITESPACES:
            break
        index += 1
    else:
        return None
    encoding = ''
    for char in end_of_decl[index:]:
        if char in EXTENDED_ALNUMCHARS:
            encoding += char
        else:
            break
    if encoding != '':
        return encoding
    return None


DUMMY_DFA = automata.DFA([], [])

def generate_tokens(lines, flags):
    """
    This is a rewrite of pypy.module.parser.pytokenize.generate_tokens since
    the original function is not RPYTHON (uses yield)
    It was also slightly modified to generate Token instances instead
    of the original 5-tuples -- it's now a 4-tuple of

    * the Token instance
    * the whole line as a string
    * the line number (the real one, counting continuation lines)
    * the position on the line of the end of the token.

    Original docstring ::

        The generate_tokens() generator requires one argument, readline, which
        must be a callable object which provides the same interface as the
        readline() method of built-in file objects. Each call to the function
        should return one line of input as a string.

        The generator produces 5-tuples with these members: the token type; the
        token string; a 2-tuple (srow, scol) of ints specifying the row and
        column where the token begins in the source; a 2-tuple (erow, ecol) of
        ints specifying the row and column where the token ends in the source;
        and the line on which the token was found. The line passed is the
        logical line; continuation lines are included.
    """
    token_list = []
    lnum = parenlev = continued = 0
    namechars = NAMECHARS
    numchars = NUMCHARS
    contstr, needcont = '', 0
    contline = None
    indents = [0]
    last_comment = ''
    parenlevstart = (0, 0, "")

    # make the annotator happy
    endDFA = DUMMY_DFA
    # make the annotator happy
    line = ''
    pos = 0
    lines.append("")
    strstart = (0, 0, "")
    for line in lines:
        lnum = lnum + 1
        line = universal_newline(line)
        pos, max = 0, len(line)

        if contstr:
            if not line:
                raise TokenError(
                    "EOF while scanning triple-quoted string literal",
                    strstart[2], strstart[0], strstart[1]+1,
                    token_list, lnum-1)
            endmatch = endDFA.recognize(line)
            if endmatch >= 0:
                pos = end = endmatch
                tok = (tokens["STRING"], contstr + line[:end], strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr, needcont = '', 0
                contline = None
            elif (needcont and not line.endswith('\\\n') and
                               not line.endswith('\\\r\n')):
                tok = (tokens["ERRORTOKEN"], contstr + line, strstart[0],
                       strstart[1], line)
                token_list.append(tok)
                last_comment = ''
                contstr = ''
                contline = None
                continue
            else:
                contstr = contstr + line
                contline = contline + line
                continue

        elif parenlev == 0 and not continued:  # new statement
            if not line: break
            column = 0
            while pos < max:                   # measure leading whitespace
                if line[pos] == ' ': column = column + 1
                elif line[pos] == '\t': column = (column/tabsize + 1)*tabsize
                elif line[pos] == '\f': column = 0
                else: break
                pos = pos + 1
            if pos == max: break

            if line[pos] in '#\r\n':
                # skip comments or blank lines
                continue

            if column > indents[-1]:           # count indents or dedents
                indents.append(column)
                token_list.append((tokens["INDENT"], line[:pos], lnum, 0, line))
                last_comment = ''
            while column < indents[-1]:
                indents = indents[:-1]
                token_list.append((tokens["DEDENT"], '', lnum, pos, line))
                last_comment = ''
            if column != indents[-1]:
                err = "unindent does not match any outer indentation level"
                raise TokenIndentationError(err, line, lnum, 0, token_list)

        else:                                  # continued statement
            if not line:
                if parenlev > 0:
                    lnum1, start1, line1 = parenlevstart
                    raise TokenError("parenthesis is never closed", line1,
                                     lnum1, start1 + 1, token_list, lnum)
                raise TokenError("EOF in multi-line statement", line,
                                 lnum, 0, token_list)
            continued = 0

        while pos < max:
            pseudomatch = pseudoDFA.recognize(line, pos)
            if pseudomatch >= 0:                            # scan for tokens
                # JDR: Modified
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                end = pseudomatch

                if start == end:
                    raise TokenError("Unknown character", line,
                                     lnum, start + 1, token_list)

                pos = end
                token, initial = line[start:end], line[start]
                if initial in numchars or \
                   (initial == '.' and token != '.'):      # ordinary number
                    token_list.append((tokens["NUMBER"], token, lnum, start, line))
                    last_comment = ''
                elif initial in '\r\n':
                    if parenlev <= 0:
                        tok = (tokens["NEWLINE"], last_comment, lnum, start, line)
                        token_list.append(tok)
                    last_comment = ''
                elif initial == '#':
                    # skip comment
                    last_comment = token
                elif token in triple_quoted:
                    endDFA = endDFAs[token]
                    endmatch = endDFA.recognize(line, pos)
                    if endmatch >= 0:                     # all on one line
                        pos = endmatch
                        token = line[start:pos]
                        tok = (tokens["STRING"], token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                    else:
                        strstart = (lnum, start, line)
                        contstr = line[start:]
                        contline = line
                        break
                elif initial in single_quoted or \
                    token[:2] in single_quoted or \
                    token[:3] in single_quoted:
                    if token[-1] == '\n':                  # continued string
                        strstart = (lnum, start, line)
                        endDFA = (endDFAs[initial] or endDFAs[token[1]] or
                                   endDFAs[token[2]])
                        contstr, needcont = line[start:], 1
                        contline = line
                        break
                    else:                                  # ordinary string
                        tok = (tokens["STRING"], token, lnum, start, line)
                        token_list.append(tok)
                        last_comment = ''
                elif initial in namechars:                 # ordinary name
                    token_list.append((tokens["NAME"], token, lnum, start, line))
                    last_comment = ''
                elif initial == '\\':                      # continued stmt
                    continued = 1
                else:
                    if initial in '([{':
                        if parenlev == 0:
                            parenlevstart = (lnum, start, line)
                        parenlev = parenlev + 1
                    elif initial in ')]}':
                        parenlev = parenlev - 1
                        if parenlev < 0:
                            raise TokenError("unmatched '%s'" % initial, line,
                                             lnum, start + 1, token_list)
                    if token in python_opmap:
                        punct = python_opmap[token]
                    else:
                        punct = tokens["OP"]
                    token_list.append((punct, token, lnum, start, line))
                    last_comment = ''
            else:
                start = whiteSpaceDFA.recognize(line, pos)
                if start < 0:
                    start = pos
                if start < max and line[start] in single_quoted:
                    raise TokenError("EOL while scanning string literal",
                             line, lnum, start+1, token_list)
                tok = (tokens["ERRORTOKEN"], line[pos], lnum, pos, line)
                token_list.append(tok)
                last_comment = ''
                pos = pos + 1

    lnum -= 1
    if not (flags & consts.PyCF_DONT_IMPLY_DEDENT):
        if token_list and token_list[-1][0] != tokens["NEWLINE"]:
            tok = (tokens["NEWLINE"], '', lnum, 0, '\n')
            token_list.append(tok)
        for indent in indents[1:]:                # pop remaining indent levels
            token_list.append((tokens["DEDENT"], '', lnum, pos, line))
    tok = (tokens["NEWLINE"], '', lnum, 0, '\n')
    token_list.append(tok)

    token_list.append((tokens["ENDMARKER"], '', lnum, pos, line))
    return token_list


def universal_newline(line):
    # show annotator that indexes below are non-negative
    line_len_m2 = len(line) - 2
    if line_len_m2 >= 0 and line[-2] == '\r' and line[-1] == '\n':
        return line[:line_len_m2] + '\n'
    line_len_m1 = len(line) - 1
    if line_len_m1 >= 0 and line[-1] == '\r':
        return line[:line_len_m1] + '\n'
    return line
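
A brief usage sketch follows; it is not part of pytokenizer.py. It assumes the surrounding pyparser package is importable, as the module's own imports require. match_encoding_declaration() takes a comment string directly, and generate_tokens() takes a list of source lines plus the compile flags, returning a list of the 5-tuples (token type, token string, line number, start column, line) built in the loop above. The sample source lines and variable names are illustrative only.

# Hypothetical usage sketch; not part of the module above.
# Assumes the pyparser package (automata, pygram, pytokenize, consts) is importable.
from pyparser.pytokenizer import generate_tokens, match_encoding_declaration
from pyparser.pygram import tokens

# Encoding declarations are extracted straight from a comment string.
encoding = match_encoding_declaration("# -*- coding: utf-8 -*-")
assert encoding == "utf-8"

# generate_tokens() consumes a list of source lines; flags=0 leaves
# PyCF_DONT_IMPLY_DEDENT unset, so trailing NEWLINE/DEDENT tokens are
# appended automatically.
source_lines = [
    "x = 1\n",
    "if x:\n",
    "    y = x + 1\n",
]
token_list = generate_tokens(source_lines, 0)
first = token_list[0]    # e.g. (tokens["NAME"], "x", 1, 0, "x = 1\n")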