from pyparser import parser, pytokenizer, pygram, error
from pyparser import consts


def _normalize_encoding(encoding):
    """returns normalized name for <encoding>

    see dist/src/Parser/tokenizer.c 'get_normal_name()'
    for implementation details / reference

    NOTE: for now, parser.suite() raises a MemoryError when
    a bad encoding is used. (SF bug #979739)
    """
    if encoding is None:
        return None
    # lower() + '_' / '-' conversion
    encoding = encoding.replace('_', '-').lower()
    if encoding == 'utf-8' or encoding.startswith('utf-8-'):
        return 'utf-8'
    for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
        if (encoding == variant or
            encoding.startswith(variant + '-')):
            return 'iso-8859-1'
    return encoding


def _check_for_encoding(s):
    """Return the encoding declared in the first two lines of <s>, or None.

    Per the coding-cookie rules (PEP 263) only the first two lines may
    carry a declaration, and only if they are comments.
    """
    eol = s.find('\n')
    if eol < 0:
        return _check_line_for_encoding(s)[0]
    enc, again = _check_line_for_encoding(s[:eol])
    if enc or not again:
        return enc
    # First line was a comment without a cookie; try the second line.
    eol2 = s.find('\n', eol + 1)
    if eol2 < 0:
        return _check_line_for_encoding(s[eol + 1:])[0]
    return _check_line_for_encoding(s[eol + 1:eol2])[0]


def _check_line_for_encoding(line):
    """returns the declared encoding or None

    The second element of the returned tuple says whether the next line
    may still carry a declaration (i.e. this line was blank or a comment).
    """
    i = 0
    for i in range(len(line)):
        if line[i] == '#':
            break
        if line[i] not in ' \t\014':
            return None, False  # Not a comment, don't read the second line.
    return pytokenizer.match_encoding_declaration(line[i:]), True


class CompileInfo(object):
    """Stores information about the source being compiled.

    * filename: The filename of the source.
    * mode: The parse mode to use. ('exec', 'eval', or 'single')
    * flags: Parser and compiler flags.
    * encoding: The source encoding.
    """

    def __init__(self, filename, mode="exec", flags=0):
        self.filename = filename
        self.mode = mode
        self.encoding = None
        self.flags = flags


_targets = {
    'eval': pygram.syms["eval_input"],
    'single': pygram.syms["single_input"],
    'exec': pygram.syms["file_input"],
}


class PythonParser(parser.Parser):

    def __init__(self, grammar=pygram.python_grammar):
        parser.Parser.__init__(self, grammar)

    def parse_source(self, textsrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the
        parse tree is handled here.

        Raises error.SyntaxError (or error.IndentationError) on bad input;
        as a side effect records the detected encoding on compile_info.
        """
        # Detect source encoding.
        enc = None
        if textsrc.startswith("\xEF\xBB\xBF"):
            textsrc = textsrc[3:]
            enc = 'utf-8'
            # If an encoding is explicitly given, check that it is utf-8:
            # a BOM only makes sense together with a utf-8 coding cookie.
            decl_enc = _check_for_encoding(textsrc)
            if decl_enc and decl_enc != "utf-8":
                raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                        filename=compile_info.filename)
        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
            enc = 'utf-8'
            if _check_for_encoding(textsrc) is not None:
                raise error.SyntaxError("coding declaration in unicode string",
                                        filename=compile_info.filename)
        else:
            enc = _normalize_encoding(_check_for_encoding(textsrc))

        flags = compile_info.flags

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if textsrc and textsrc[-1] == "\n":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

        self.prepare(_targets[compile_info.mode])
        tp = 0
        try:
            try:
                # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
                # which is expected to work independently of them.  It's
                # certainly the case for all futures in Python <= 2.7.
                tokens = pytokenizer.generate_tokens(source_lines, flags)

                self.grammar = pygram.python_grammar

                for tp, value, lineno, column, line in tokens:
                    if self.add_token(tp, value, lineno, column, line):
                        break
            except error.TokenError as e:
                e.filename = compile_info.filename
                raise
            except parser.ParseError as e:
                # Catch parse errors, pretty them up and reraise them as a
                # SyntaxError.
                new_err = error.IndentationError
                if tp == pygram.tokens["INDENT"]:
                    msg = "unexpected indent"
                elif e.expected == pygram.tokens["INDENT"]:
                    msg = "expected an indented block"
                else:
                    new_err = error.SyntaxError
                    msg = "invalid syntax"
                raise new_err(msg, e.lineno, e.column, e.line,
                              compile_info.filename)
            else:
                tree = self.root
        finally:
            # Avoid hanging onto the tree.
            self.root = None
        if enc is not None:
            compile_info.encoding = enc
            # Wrap the tree in a special encoding declaration for parser
            # module compatibility.
            tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)
        return tree


def parse(filename):
    """returns the parsed contents of <filename>"""
    info = CompileInfo(filename)
    with open(filename) as f:
        return PythonParser().parse_source(f.read(), info)


def suite(text):
    """returns the parsed form of the given program <text>"""
    info = CompileInfo("<stdin>")
    return PythonParser().parse_source(text, info)


def expr(text):
    """returns the parsed form of the given expression <text>"""
    # Fixed: previously used "single" (interactive statement) mode; an
    # expression must be parsed against the 'eval' target (eval_input),
    # matching CPython's parser.expr().
    info = CompileInfo("<stdin>", "eval")
    return PythonParser().parse_source(text, info)


def st2tuple(tree, line_info=True, col_info=False):
    """returns <tree> in tuple form for the compiler package"""
    if isinstance(tree, parser.AbstractNonterminal):
        l = [tree.type]
        for i in range(0, tree.num_children()):
            # Fixed: propagate line_info/col_info to children; previously
            # the defaults were silently reused for every recursive call.
            l.append(st2tuple(tree.get_child(i), line_info, col_info))
        if isinstance(tree, parser.NonterminalEnc):
            l.append(tree.encoding)
        return tuple(l)
    elif isinstance(tree, parser.Terminal):
        l = [tree.type, tree.value]
        if line_info:
            l.append(tree.get_lineno())
        if col_info:
            l.append(tree.get_column())
        return tuple(l)
    else:
        raise TypeError(tree)