from pyparser import parser, pytokenizer, pygram, error
from pyparser import consts


def _normalize_encoding(encoding):
    """returns normalized name for <encoding>

    see dist/src/Parser/tokenizer.c 'get_normal_name()'
    for implementation details / reference

    NOTE: for now, parser.suite() raises a MemoryError when
    a bad encoding is used. (SF bug #979739)
    """
    if encoding is None:
        return None
    # lower() + '_' / '-' conversion
    encoding = encoding.replace('_', '-').lower()
    if encoding == 'utf-8' or encoding.startswith('utf-8-'):
        return 'utf-8'
    for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
        if (encoding == variant or
            encoding.startswith(variant + '-')):
            return 'iso-8859-1'
    return encoding


def _check_for_encoding(s):
    """Return the encoding declared in the first two lines of <s>, or None.

    Per the coding-cookie rules (PEP 263) only the first two lines may
    carry a declaration, and only if they are comments.
    """
    eol = s.find('\n')
    if eol < 0:
        return _check_line_for_encoding(s)[0]
    enc, again = _check_line_for_encoding(s[:eol])
    if enc or not again:
        return enc
    # First line was a comment without a cookie; try the second line.
    eol2 = s.find('\n', eol + 1)
    if eol2 < 0:
        return _check_line_for_encoding(s[eol + 1:])[0]
    return _check_line_for_encoding(s[eol + 1:eol2])[0]


def _check_line_for_encoding(line):
    """returns the declared encoding or None

    The second element of the returned tuple says whether the next line
    may still carry a declaration (i.e. this line was blank or a comment).
    """
    i = 0
    for i in range(len(line)):
        if line[i] == '#':
            break
        if line[i] not in ' \t\014':
            return None, False  # Not a comment, don't read the second line.
    return pytokenizer.match_encoding_declaration(line[i:]), True


class CompileInfo(object):
    """Stores information about the source being compiled.

    * filename: The filename of the source.
    * mode: The parse mode to use. ('exec', 'eval', or 'single')
    * flags: Parser and compiler flags.
    * encoding: The source encoding.
    """

    def __init__(self, filename, mode="exec", flags=0):
        self.filename = filename
        self.mode = mode
        self.encoding = None
        self.flags = flags


_targets = {
    'eval': pygram.syms["eval_input"],
    'single': pygram.syms["single_input"],
    'exec': pygram.syms["file_input"],
}


class PythonParser(parser.Parser):

    def __init__(self, grammar=pygram.python_grammar):
        parser.Parser.__init__(self, grammar)

    def parse_source(self, textsrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the
        parse tree is handled here.

        Raises error.SyntaxError (or error.IndentationError) on bad input;
        as a side effect records the detected encoding on compile_info.
        """
        # Detect source encoding.
        enc = None
        if textsrc.startswith("\xEF\xBB\xBF"):
            textsrc = textsrc[3:]
            enc = 'utf-8'
            # If an encoding is explicitly given, check that it is utf-8:
            # a BOM only makes sense together with a utf-8 coding cookie.
            decl_enc = _check_for_encoding(textsrc)
            if decl_enc and decl_enc != "utf-8":
                raise error.SyntaxError("UTF-8 BOM with %s coding cookie" % decl_enc,
                                        filename=compile_info.filename)
        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
            enc = 'utf-8'
            if _check_for_encoding(textsrc) is not None:
                raise error.SyntaxError("coding declaration in unicode string",
                                        filename=compile_info.filename)
        else:
            enc = _normalize_encoding(_check_for_encoding(textsrc))

        flags = compile_info.flags

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if textsrc and textsrc[-1] == "\n":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

        self.prepare(_targets[compile_info.mode])
        tp = 0
        try:
            try:
                # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
                # which is expected to work independently of them.  It's
                # certainly the case for all futures in Python <= 2.7.
                tokens = pytokenizer.generate_tokens(source_lines, flags)

                self.grammar = pygram.python_grammar

                for tp, value, lineno, column, line in tokens:
                    if self.add_token(tp, value, lineno, column, line):
                        break
            except error.TokenError as e:
                e.filename = compile_info.filename
                raise
            except parser.ParseError as e:
                # Catch parse errors, pretty them up and reraise them as a
                # SyntaxError.
                new_err = error.IndentationError
                if tp == pygram.tokens["INDENT"]:
                    msg = "unexpected indent"
                elif e.expected == pygram.tokens["INDENT"]:
                    msg = "expected an indented block"
                else:
                    new_err = error.SyntaxError
                    msg = "invalid syntax"
                raise new_err(msg, e.lineno, e.column, e.line,
                              compile_info.filename)
            else:
                tree = self.root
        finally:
            # Avoid hanging onto the tree.
            self.root = None
        if enc is not None:
            compile_info.encoding = enc
            # Wrap the tree in a special encoding declaration for parser
            # module compatibility.
            tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree, enc)
        return tree


def parse(filename):
    """returns the parsed contents of <filename>"""
    info = CompileInfo(filename)
    with open(filename) as f:
        return PythonParser().parse_source(f.read(), info)


def suite(text):
    """returns the parsed form of the given program <text>"""
    info = CompileInfo("<stdin>")
    return PythonParser().parse_source(text, info)


def expr(text):
    """returns the parsed form of the given expression <text>"""
    # Fixed: previously used "single" (interactive statement) mode; an
    # expression must be parsed against the 'eval' target (eval_input),
    # matching CPython's parser.expr().
    info = CompileInfo("<stdin>", "eval")
    return PythonParser().parse_source(text, info)


def st2tuple(tree, line_info=True, col_info=False):
    """returns <tree> in tuple form for the compiler package"""
    if isinstance(tree, parser.AbstractNonterminal):
        l = [tree.type]
        for i in range(0, tree.num_children()):
            # Fixed: propagate line_info/col_info to children; previously
            # the defaults were silently reused for every recursive call.
            l.append(st2tuple(tree.get_child(i), line_info, col_info))
        if isinstance(tree, parser.NonterminalEnc):
            l.append(tree.encoding)
        return tuple(l)
    elif isinstance(tree, parser.Terminal):
        l = [tree.type, tree.value]
        if line_info:
            l.append(tree.get_lineno())
        if col_info:
            l.append(tree.get_column())
        return tuple(l)
    else:
        raise TypeError(tree)