1 """ 2 Makes a parser from a grammar source. 3 4 Inspired by Guido van Rossum's pgen2. 5 """ 6 7 import StringIO 8 import tokenize 9 import token 10 11 from pyparser import parser 12 13 14 class PgenError(Exception): 15 16 def __init__(self, msg, location=None): 17 Exception.__init__(self, msg) 18 self.location = location 19 20 21 class NFA(object): 22 23 def __init__(self): 24 self.arcs = [] 25 26 def arc(self, to_state, label=None): 27 self.arcs.append((label, to_state)) 28 29 def find_unlabeled_states(self, into): 30 if self in into: 31 return 32 into.add(self) 33 for label, state in self.arcs: 34 if label is None: 35 state.find_unlabeled_states(into) 36 37 38 class DFA(object): 39 40 def __init__(self, nfa_set, final_state): 41 self.nfas = nfa_set 42 self.is_final = final_state in nfa_set 43 self.arcs = {} 44 45 def arc(self, next, label): 46 self.arcs[label] = next 47 48 def unify_state(self, old, new): 49 for label, state in self.arcs.iteritems(): 50 if state is old: 51 self.arcs[label] = new 52 53 def __repr__(self): 54 return "<DFA arcs=%r>" % self.arcs 55 56 def __eq__(self, other): 57 if not isinstance(other, DFA): 58 # This shouldn't really happen. 59 return NotImplemented 60 if other.is_final != self.is_final: 61 return False 62 if len(self.arcs) != len(other.arcs): 63 return False 64 for label, state in self.arcs.iteritems(): 65 try: 66 other_state = other.arcs[label] 67 except KeyError: 68 return False 69 else: 70 if other_state is not state: 71 return False 72 return True 73 74 75 def nfa_to_dfa(start, end): 76 """Convert an NFA to a DFA(s) 77 78 Each DFA is initially a set of NFA states without labels. We start with the 79 DFA for the start NFA. Then we add labeled arcs to it pointing to another 80 set of NFAs (the next state). Finally, we do the same thing to every DFA 81 that is found and return the list of states. 82 """ 83 base_nfas = set() 84 start.find_unlabeled_states(base_nfas) 85 state_stack = [DFA(base_nfas, end)] 86 for state in state_stack: 87 arcs = {} 88 for nfa in state.nfas: 89 for label, sub_nfa in nfa.arcs: 90 if label is not None: 91 sub_nfa.find_unlabeled_states(arcs.setdefault(label, set())) 92 for label, nfa_set in arcs.iteritems(): 93 for st in state_stack: 94 if st.nfas == nfa_set: 95 break 96 else: 97 st = DFA(nfa_set, end) 98 state_stack.append(st) 99 state.arc(st, label) 100 return state_stack 101 102 def simplify_dfa(dfa): 103 changed = True 104 while changed: 105 changed = False 106 for i, state in enumerate(dfa): 107 for j in xrange(i + 1, len(dfa)): 108 other_state = dfa[j] 109 if state == other_state: 110 del dfa[j] 111 for sub_state in dfa: 112 sub_state.unify_state(other_state, state) 113 changed = True 114 break 115 116 117 class ParserGenerator(object): 118 """NOT_RPYTHON""" 119 120 def __init__(self, grammar_source): 121 self.start_symbol = None 122 self.dfas = {} 123 stream = StringIO.StringIO(grammar_source) 124 self.token_stream = tokenize.generate_tokens(stream.readline) 125 self.parse() 126 self.first = {} 127 self.add_first_sets() 128 129 def build_grammar(self, grammar_cls): 130 gram = grammar_cls() 131 gram.start = self.start_symbol 132 names = self.dfas.keys() 133 names.sort() 134 names.remove(self.start_symbol) 135 names.insert(0, self.start_symbol) 136 # First, build symbol and id mappings. 137 for name in names: 138 i = 256 + len(gram.symbol_ids) 139 gram.symbol_ids[name] = i 140 gram.symbol_names[i] = name 141 # Then, iterate through again and finalize labels. 


class ParserGenerator(object):
    """NOT_RPYTHON"""

    def __init__(self, grammar_source):
        self.start_symbol = None
        self.dfas = {}
        stream = StringIO.StringIO(grammar_source)
        self.token_stream = tokenize.generate_tokens(stream.readline)
        self.parse()
        self.first = {}
        self.add_first_sets()

    def build_grammar(self, grammar_cls):
        gram = grammar_cls()
        gram.start = self.start_symbol
        names = self.dfas.keys()
        names.sort()
        names.remove(self.start_symbol)
        names.insert(0, self.start_symbol)
        # First, build symbol and id mappings.
        for name in names:
            i = 256 + len(gram.symbol_ids)
            gram.symbol_ids[name] = i
            gram.symbol_names[i] = name
        # Then, iterate through again and finalize labels.
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next_state in state.arcs.iteritems():
                    arcs.append((self.make_label(gram, label),
                                 dfa.index(next_state)))
                states.append((arcs, state.is_final))
            gram.dfas.append((states, self.make_first(gram, name)))
            assert len(gram.dfas) - 1 == gram.symbol_ids[name] - 256
        gram.start = gram.symbol_ids[self.start_symbol]
        return gram

    def make_label(self, gram, label):
        label_index = len(gram.labels)
        if label[0].isalpha():
            # Either a symbol or a token.
            if label in gram.symbol_ids:
                if label in gram.symbol_to_label:
                    return gram.symbol_to_label[label]
                else:
                    gram.labels.append(gram.symbol_ids[label])
                    gram.symbol_to_label[label] = label_index
                    return label_index
            elif label.isupper():
                token_index = gram.TOKENS[label]
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index
            else:
                # Probably a rule without a definition.
                raise PgenError("no such rule: %r" % (label,))
        else:
            # A keyword or operator.
            value = label.strip("\"'")
            if value[0].isalpha():
                if value in gram.keyword_ids:
                    return gram.keyword_ids[value]
                else:
                    gram.labels.append(gram.KEYWORD_TOKEN)
                    gram.keyword_ids[value] = label_index
                    return label_index
            else:
                try:
                    token_index = gram.OPERATOR_MAP[value]
                except KeyError:
                    raise PgenError("no such operator: %r" % (value,))
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index

    def make_first(self, gram, name):
        original_firsts = self.first[name]
        # The first set is stored as a dict with None values, so membership
        # tests are simple key lookups.
        firsts = dict()
        for label in original_firsts:
            firsts[self.make_label(gram, label)] = None
        return firsts

    def add_first_sets(self):
        for name, dfa in self.dfas.iteritems():
            if name not in self.first:
                self.get_first(name, dfa)

    def get_first(self, name, dfa):
        # None acts as a marker so that left-recursion in a rule is detected
        # instead of looping forever.
        self.first[name] = None
        state = dfa[0]
        all_labels = set()
        overlap_check = {}
        for label, sub_state in state.arcs.iteritems():
            if label in self.dfas:
                if label in self.first:
                    new_labels = self.first[label]
                    if new_labels is None:
                        raise PgenError("recursion in rule: %r" % (name,))
                else:
                    new_labels = self.get_first(label, self.dfas[label])
                all_labels.update(new_labels)
                overlap_check[label] = new_labels
            else:
                all_labels.add(label)
                overlap_check[label] = set((label,))
        inverse = {}
        for label, their_first in overlap_check.iteritems():
            for sub_label in their_first:
                if sub_label in inverse:
                    raise PgenError("ambiguous symbol with label %s"
                                    % (label,))
                inverse[sub_label] = label
        self.first[name] = all_labels
        return all_labels
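
    # A worked example for the first-set machinery above: given the
    # hypothetical grammar
    #
    #     stmt: expr NEWLINE
    #     expr: NAME | NUMBER
    #
    # get_first("expr", ...) collects {"NAME", "NUMBER"} straight from the
    # arc labels of the rule's first DFA state, and get_first("stmt", ...)
    # reuses that set because the label "expr" names another rule.  The
    # overlap_check/inverse bookkeeping rejects a grammar such as
    #
    #     stmt: expr | NAME
    #     expr: NAME NUMBER
    #
    # where NAME appears in the first sets of two different arcs, because a
    # single token of lookahead could not choose between the alternatives.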

    def expect(self, token_type, value=None):
        if token_type != self.type:
            expected = token.tok_name[token_type]
            got = token.tok_name[self.type]
            raise PgenError("expected token %s but got %s" % (expected, got),
                            self.location)
        current_value = self.value
        if value is not None:
            if value != current_value:
                msg = "expected %r but got %r" % (value, current_value)
                raise PgenError(msg, self.location)
        self.advance_token()
        return current_value

    def test_token(self, token_type, value):
        return self.type == token_type and self.value == value

    def advance_token(self):
        data = self.token_stream.next()
        # Ignore comments and non-logical newlines.
        while data[0] in (tokenize.NL, tokenize.COMMENT):
            data = self.token_stream.next()
        self.type, self.value = data[:2]
        self.location = data[2:]

    def parse(self):
        self.advance_token()
        while self.type != token.ENDMARKER:
            # Skip over blank lines between rules.
            while self.type == token.NEWLINE:
                self.advance_token()
            name, start_state, end_state = self.parse_rule()
            dfa = nfa_to_dfa(start_state, end_state)
            simplify_dfa(dfa)
            self.dfas[name] = dfa
            if self.start_symbol is None:
                self.start_symbol = name

    def parse_rule(self):
        # RULE: NAME ':' ALTERNATIVES
        name = self.expect(token.NAME)
        self.expect(token.OP, ":")
        start_state, end_state = self.parse_alternatives()
        self.expect(token.NEWLINE)
        return name, start_state, end_state

    def parse_alternatives(self):
        # ALTERNATIVES: ITEMS ('|' ITEMS)*
        first_state, end_state = self.parse_items()
        if self.test_token(token.OP, "|"):
            # Link all alternatives into an enclosing pair of states.
            enclosing_start_state = NFA()
            enclosing_end_state = NFA()
            enclosing_start_state.arc(first_state)
            end_state.arc(enclosing_end_state)
            while self.test_token(token.OP, "|"):
                self.advance_token()
                sub_start_state, sub_end_state = self.parse_items()
                enclosing_start_state.arc(sub_start_state)
                sub_end_state.arc(enclosing_end_state)
            first_state = enclosing_start_state
            end_state = enclosing_end_state
        return first_state, end_state

    def parse_items(self):
        # ITEMS: ITEM+
        first_state, end_state = self.parse_item()
        while self.type in (token.STRING, token.NAME) or \
                self.test_token(token.OP, "(") or \
                self.test_token(token.OP, "["):
            sub_first_state, new_end_state = self.parse_item()
            end_state.arc(sub_first_state)
            end_state = new_end_state
        return first_state, end_state

    def parse_item(self):
        # ITEM: '[' ALTERNATIVES ']' | ATOM ['+' | '*']
        if self.test_token(token.OP, "["):
            self.advance_token()
            start_state, end_state = self.parse_alternatives()
            self.expect(token.OP, "]")
            # Bypass the rule if this is optional.
            start_state.arc(end_state)
            return start_state, end_state
        else:
            atom_state, next_state = self.parse_atom()
            # Check for a repeater.
            if self.type == token.OP and self.value in ("+", "*"):
                next_state.arc(atom_state)
                repeat = self.value
                self.advance_token()
                if repeat == "*":
                    # Optionally repeated.
                    return atom_state, atom_state
                else:
                    # Required, so it must be matched at least once.
                    return atom_state, next_state
            else:
                return atom_state, next_state

    def parse_atom(self):
        # ATOM: '(' ALTERNATIVES ')' | NAME | STRING
        if self.test_token(token.OP, "("):
            self.advance_token()
            rule = self.parse_alternatives()
            self.expect(token.OP, ")")
            return rule
        elif self.type in (token.NAME, token.STRING):
            atom_state = NFA()
            next_state = NFA()
            atom_state.arc(next_state, self.value)
            self.advance_token()
            return atom_state, next_state
        else:
            invalid = token.tok_name[self.type]
            raise PgenError("unexpected token: %s" % (invalid,),
                            self.location)
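

# A minimal usage sketch on a tiny, hypothetical grammar.  It only exercises
# the grammar parser and the computed first sets; build_grammar() is not
# called here because it also needs a grammar class supplying the TOKENS,
# OPERATOR_MAP and KEYWORD_TOKEN tables.
if __name__ == "__main__":
    demo_grammar = (
        "file_input: stmt NEWLINE\n"
        "stmt: 'pass' | NAME\n"
    )
    gen = ParserGenerator(demo_grammar)
    print gen.start_symbol            # -> file_input
    print sorted(gen.first["stmt"])   # -> ["'pass'", 'NAME']
    print len(_example_nfa_to_dfa())  # -> 3 states after simplification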