from pyparser import parser, pytokenizer, pygram, error
from pyparser import consts


def recode_to_utf8(bytes, encoding):
    """Decode <bytes> using <encoding> and return it re-encoded as UTF-8.

    Raises error.SyntaxError if the codec does not produce a unicode
    object.  Errors raised by the codec itself (e.g. LookupError for an
    unknown codec, UnicodeDecodeError for undecodable input) propagate to
    the caller.
    """
    text = bytes.decode(encoding)
    if not isinstance(text, unicode):
        raise error.SyntaxError("codec did not return a unicode object")
    recoded = text.encode("utf-8")
    return recoded


def _normalize_encoding(encoding):
    """returns normalized name for <encoding>

    see dist/src/Parser/tokenizer.c 'get_normal_name()'
    for implementation details / reference

    None is passed through unchanged.  UTF-8 spellings collapse to
    'utf-8', Latin-1 spellings collapse to 'iso-8859-1', and any other
    name is returned lower-cased with '_' replaced by '-'.

    NOTE: for now, parser.suite() raises a MemoryError when
          a bad encoding is used. (SF bug #979739)
    """
    if encoding is None:
        return None
    # lower() + '_' / '-' conversion
    encoding = encoding.replace('_', '-').lower()
    if encoding == 'utf-8' or encoding.startswith('utf-8-'):
        return 'utf-8'
    for variant in ['latin-1', 'iso-latin-1', 'iso-8859-1']:
        if (encoding == variant or
                encoding.startswith(variant + '-')):
            return 'iso-8859-1'
    return encoding


def _check_for_encoding(s):
    """returns the encoding declared in the first two lines of <s>, or None

    The second line is only examined when the first line is blank or a
    comment that carries no encoding declaration.
    """
    eol = s.find('\n')
    if eol < 0:
        # Single-line source: that line is the only candidate.
        return _check_line_for_encoding(s)[0]
    enc, again = _check_line_for_encoding(s[:eol])
    if enc or not again:
        return enc
    eol2 = s.find('\n', eol + 1)
    if eol2 < 0:
        return _check_line_for_encoding(s[eol + 1:])[0]
    return _check_line_for_encoding(s[eol + 1:eol2])[0]


def _check_line_for_encoding(line):
    """returns a pair (encoding, again) for a single source line

    <encoding> is whatever pytokenizer.match_encoding_declaration() reports
    for the comment part of the line, or None when the line holds code.
    <again> is False when the line contains something other than
    whitespace before any '#', meaning the following line must not be
    searched; otherwise it is True.
    """
    i = 0
    for i in range(len(line)):
        if line[i] == '#':
            break
        if line[i] not in ' \t\014':
            return None, False  # Not a comment, don't read the second line.
    return pytokenizer.match_encoding_declaration(line[i:]), True


class CompileInfo(object):
    """Stores information about the source being compiled.

    * filename: The filename of the source.
    * mode: The parse mode to use. ('exec', 'eval', or 'single')
    * flags: Parser and compiler flags.
    * encoding: The source encoding.
    """

    def __init__(self, filename, mode="exec", flags=0):
        self.filename = filename
        self.mode = mode
        # Filled in by PythonParser.parse_source() once an encoding is known.
        self.encoding = None
        self.flags = flags


# Maps a CompileInfo.mode to the grammar start symbol used for parsing.
_targets = {
    'eval': pygram.syms["eval_input"],
    'single': pygram.syms["single_input"],
    'exec': pygram.syms["file_input"],
}


class PythonParser(parser.Parser):
    """Parser that turns Python source text into a parse tree."""

    def __init__(self, grammar=pygram.python_grammar):
        parser.Parser.__init__(self, grammar)

    def parse_source(self, textsrc, compile_info):
        """Main entry point for parsing Python source.

        Everything from decoding the source to tokenizing to building the parse
        tree is handled here.

        * textsrc: the source as a byte string.
        * compile_info: a CompileInfo; its filename, mode and flags are read
          and its encoding attribute is set when an encoding is detected.

        Returns the root of the parse tree.  When an encoding was detected
        the root is wrapped in a parser.NonterminalEnc node.

        Raises error.SyntaxError for encoding problems and invalid syntax,
        and error.IndentationError for indentation problems.  An
        error.TokenError from the tokenizer is re-raised with its filename
        filled in.
        """
        # Detect source encoding.
        enc = None
        if textsrc.startswith("\xEF\xBB\xBF"):
            textsrc = textsrc[3:]
            enc = 'utf-8'
            # If an encoding is explicitly given check that it is utf-8.
            decl_enc = _check_for_encoding(textsrc)
            if decl_enc and decl_enc != "utf-8":
                raise error.SyntaxError(
                    "UTF-8 BOM with %s coding cookie" % decl_enc,
                    filename=compile_info.filename)
        elif compile_info.flags & consts.PyCF_SOURCE_IS_UTF8:
            enc = 'utf-8'
            if _check_for_encoding(textsrc) is not None:
                raise error.SyntaxError("coding declaration in unicode string",
                                        filename=compile_info.filename)
        else:
            enc = _normalize_encoding(_check_for_encoding(textsrc))
            if enc is not None and enc != 'utf-8':
                try:
                    textsrc = recode_to_utf8(textsrc, enc)
                except LookupError:
                    # if the codec is not found, LookupError is raised.
                    raise error.SyntaxError("Unknown encoding: %s" % enc,
                                            filename=compile_info.filename)
                # Transform unicode errors into SyntaxError
                except UnicodeDecodeError as e:
                    message = str(e)
                    # Report the filename like every other error raised here.
                    raise error.SyntaxError(message,
                                            filename=compile_info.filename)

        flags = compile_info.flags

        # The tokenizer is very picky about how it wants its input.
        source_lines = textsrc.splitlines(True)
        if source_lines and not source_lines[-1].endswith("\n"):
            source_lines[-1] += '\n'
        if textsrc and textsrc[-1] == "\n":
            flags &= ~consts.PyCF_DONT_IMPLY_DEDENT

        self.prepare(_targets[compile_info.mode])
        # Type of the last token fed to the parser; consulted below to tell
        # an unexpected indent apart from other parse errors.
        tp = 0
        try:
            try:
                # Note: we no longer pass the CO_FUTURE_* to the tokenizer,
                # which is expected to work independently of them.  It's
                # certainly the case for all futures in Python <= 2.7.
                tokens = pytokenizer.generate_tokens(source_lines, flags)

                # NOTE(review): this resets the grammar on every parse; if
                # parser.Parser.__init__ stores its argument as self.grammar,
                # a custom grammar given to the constructor is overridden
                # here -- confirm this is intended.
                self.grammar = pygram.python_grammar

                for tp, value, lineno, column, line in tokens:
                    if self.add_token(tp, value, lineno, column, line):
                        break
            except error.TokenError as e:
                e.filename = compile_info.filename
                raise
            except parser.ParseError as e:
                # Catch parse errors, pretty them up and reraise them as a
                # SyntaxError.
                new_err = error.IndentationError
                if tp == pygram.tokens["INDENT"]:
                    msg = "unexpected indent"
                elif e.expected == pygram.tokens["INDENT"]:
                    msg = "expected an indented block"
                else:
                    new_err = error.SyntaxError
                    msg = "invalid syntax"
                raise new_err(msg, e.lineno, e.column, e.line,
                              compile_info.filename)
            else:
                tree = self.root
        finally:
            # Avoid hanging onto the tree.
            self.root = None
        if enc is not None:
            compile_info.encoding = enc
            # Wrap the tree in a special encoding declaration for parser module
            # compatibility.
            tree = parser.NonterminalEnc(pygram.syms["encoding_decl"], tree,
                                         enc)
        return tree


def parse(filename):
    """returns the parsed contents of <filename>"""
    info = CompileInfo(filename)
    with open(filename) as f:
        return PythonParser().parse_source(f.read(), info)


def suite(text):
    """returns the parsed form of the given program <text>"""
    info = CompileInfo("<stdin>")
    return PythonParser().parse_source(text, info)


def expr(text):
    """returns the parsed form of the given expression <text>"""
    # NOTE(review): this parses in 'single' mode (single_input) rather than
    # 'eval' (eval_input), although the docstring speaks of an expression --
    # confirm which start symbol callers expect before changing it.
    info = CompileInfo("<stdin>", "single")
    return PythonParser().parse_source(text, info)


def st2tuple(tree, line_info=True, col_info=False):
    """returns <tree> in tuple form for the compiler package

    * line_info: append each terminal's line number to its tuple.
    * col_info: append each terminal's column to its tuple.

    Nonterminals become (type, child, ...), with the encoding appended for
    parser.NonterminalEnc nodes.  Terminals become (type, value[, lineno]
    [, column]).  Raises TypeError for any other kind of object.
    """
    if isinstance(tree, parser.AbstractNonterminal):
        result = [tree.type]
        for i in range(0, tree.num_children()):
            # Pass the flags down so they apply to the whole tree and not
            # just to the node st2tuple() was called on.
            result.append(st2tuple(tree.get_child(i), line_info, col_info))
        if isinstance(tree, parser.NonterminalEnc):
            result.append(tree.encoding)
        return tuple(result)
    elif isinstance(tree, parser.Terminal):
        result = [tree.type, tree.value]
        if line_info:
            result.append(tree.get_lineno())
        if col_info:
            result.append(tree.get_column())
        return tuple(result)
    else:
        raise TypeError(tree)