Lichen

pyparser/metaparser.py

changeset 583:aed28d04304d (attr-strvalue-without-size)
2017-02-13, Paul Boddie:

Re-added size information to string instances as the __size__ attribute.
This fixes problems introduced when using strlen on data likely to contain
embedded nulls, which was the reason for having size information explicitly
stored in the first place.
     1 """     2 Makes a parser from a grammar source.     3      4 Inspired by Guido van Rossum's pgen2.     5 """     6      7 import StringIO     8 import tokenize     9 import token    10     11 from pyparser import parser    12     13     14 class PgenError(Exception):    15     16     def __init__(self, msg, location=None):    17         Exception.__init__(self, msg)    18         self.location = location    19     20     21 class NFA(object):    22     23     def __init__(self):    24         self.arcs = []    25     26     def arc(self, to_state, label=None):    27         self.arcs.append((label, to_state))    28     29     def find_unlabeled_states(self, into):    30         if self in into:    31             return    32         into.add(self)    33         for label, state in self.arcs:    34             if label is None:    35                 state.find_unlabeled_states(into)    36     37     38 class DFA(object):    39     40     def __init__(self, nfa_set, final_state):    41         self.nfas = nfa_set    42         self.is_final = final_state in nfa_set    43         self.arcs = {}    44     45     def arc(self, next, label):    46         self.arcs[label] = next    47     48     def unify_state(self, old, new):    49         for label, state in self.arcs.iteritems():    50             if state is old:    51                 self.arcs[label] = new    52     53     def __repr__(self):    54         return "<DFA arcs=%r>" % self.arcs    55     56     def __eq__(self, other):    57         if not isinstance(other, DFA):    58             # This shouldn't really happen.    59             return NotImplemented    60         if other.is_final != self.is_final:    61             return False    62         if len(self.arcs) != len(other.arcs):    63             return False    64         for label, state in self.arcs.iteritems():    65             try:    66                 other_state = other.arcs[label]    67             except KeyError:    68                 return False    69             else:    70                 if other_state is not state:    71                     return False    72         return True    73     74     75 def nfa_to_dfa(start, end):    76     """Convert an NFA to a DFA(s)    77     78     Each DFA is initially a set of NFA states without labels.  We start with the    79     DFA for the start NFA.  Then we add labeled arcs to it pointing to another    80     set of NFAs (the next state).  Finally, we do the same thing to every DFA    81     that is found and return the list of states.    


def nfa_to_dfa(start, end):
    """Convert an NFA into a list of DFA states.

    Each DFA state is initially a set of NFA states without labels.  We start
    with the DFA state for the start NFA.  Then we add labeled arcs to it
    pointing to other sets of NFA states (the next states).  Finally, we do
    the same thing to every DFA state that is found and return the list of
    states.
    """
    base_nfas = set()
    start.find_unlabeled_states(base_nfas)
    state_stack = [DFA(base_nfas, end)]
    for state in state_stack:
        arcs = {}
        for nfa in state.nfas:
            for label, sub_nfa in nfa.arcs:
                if label is not None:
                    sub_nfa.find_unlabeled_states(arcs.setdefault(label, set()))
        for label, nfa_set in arcs.iteritems():
            for st in state_stack:
                if st.nfas == nfa_set:
                    break
            else:
                st = DFA(nfa_set, end)
                state_stack.append(st)
            state.arc(st, label)
    return state_stack


def simplify_dfa(dfa):
    changed = True
    while changed:
        changed = False
        for i, state in enumerate(dfa):
            for j in xrange(i + 1, len(dfa)):
                other_state = dfa[j]
                if state == other_state:
                    del dfa[j]
                    for sub_state in dfa:
                        sub_state.unify_state(other_state, state)
                    changed = True
                    break
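

# Illustration (not part of the original module): converting a tiny invented
# automaton.  The single "NAME" label is an assumption for the example; real
# automata are built by ParserGenerator.parse_rule below.

def _demo_nfa_to_dfa():
    start, end = NFA(), NFA()
    start.arc(end, "NAME")            # start --NAME--> end
    states = nfa_to_dfa(start, end)
    simplify_dfa(states)
    return states                     # states[0] has one "NAME" arc to a final state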


class ParserGenerator(object):
    """NOT_RPYTHON"""

    def __init__(self, grammar_source):
        self.start_symbol = None
        self.dfas = {}
        stream = StringIO.StringIO(grammar_source)
        self.token_stream = tokenize.generate_tokens(stream.readline)
        self.parse()
        self.first = {}
        self.add_first_sets()

    def build_grammar(self, grammar_cls):
        gram = grammar_cls()
        gram.start = self.start_symbol
        names = self.dfas.keys()
        names.sort()
        names.remove(self.start_symbol)
        names.insert(0, self.start_symbol)
        # First, build symbol and id mappings.
        for name in names:
            i = 256 + len(gram.symbol_ids)
            gram.symbol_ids[name] = i
            gram.symbol_names[i] = name
        # Then, iterate through again and finalize labels.
        for name in names:
            dfa = self.dfas[name]
            states = []
            for state in dfa:
                arcs = []
                for label, next in state.arcs.iteritems():
                    arcs.append((self.make_label(gram, label), dfa.index(next)))
                states.append((arcs, state.is_final))
            gram.dfas.append((states, self.make_first(gram, name)))
            assert len(gram.dfas) - 1 == gram.symbol_ids[name] - 256
        gram.start = gram.symbol_ids[self.start_symbol]
        return gram

    def make_label(self, gram, label):
        label_index = len(gram.labels)
        if label[0].isalpha():
            # Either a symbol or a token.
            if label in gram.symbol_ids:
                if label in gram.symbol_to_label:
                    return gram.symbol_to_label[label]
                else:
                    gram.labels.append(gram.symbol_ids[label])
                    gram.symbol_to_label[label] = label_index
                    return label_index
            elif label.isupper():
                token_index = gram.TOKENS[label]
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index
            else:
                # Probably a rule without a definition.
                raise PgenError("no such rule: %r" % (label,))
        else:
            # A keyword or operator.
            value = label.strip("\"'")
            if value[0].isalpha():
                if value in gram.keyword_ids:
                    return gram.keyword_ids[value]
                else:
                    gram.labels.append(gram.KEYWORD_TOKEN)
                    gram.keyword_ids[value] = label_index
                    return label_index
            else:
                try:
                    token_index = gram.OPERATOR_MAP[value]
                except KeyError:
                    raise PgenError("no such operator: %r" % (value,))
                if token_index in gram.token_ids:
                    return gram.token_ids[token_index]
                else:
                    gram.labels.append(token_index)
                    gram.token_ids[token_index] = label_index
                    return label_index

    def make_first(self, gram, name):
        original_firsts = self.first[name]
        firsts = dict()
        for label in original_firsts:
            firsts[self.make_label(gram, label)] = None
        return firsts

    def add_first_sets(self):
        for name, dfa in self.dfas.iteritems():
            if name not in self.first:
                self.get_first(name, dfa)
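
    # Illustration (added comments, not in the original): for invented rules
    #
    #     a: b | NAME
    #     b: '(' a ')'
    #
    # get_first("b", ...) yields {"'('"} and get_first("a", ...) yields
    # {"'('", "NAME"}: the labels reachable from the initial DFA state are
    # unioned, recursing through nonterminal labels, and any overlap between
    # alternatives is rejected as ambiguous.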

    def get_first(self, name, dfa):
        self.first[name] = None
        state = dfa[0]
        all_labels = set()
        overlap_check = {}
        for label, sub_state in state.arcs.iteritems():
            if label in self.dfas:
                if label in self.first:
                    new_labels = self.first[label]
                    if new_labels is None:
                        raise PgenError("recursion in rule: %r" % (name,))
                else:
                    new_labels = self.get_first(label, self.dfas[label])
                all_labels.update(new_labels)
                overlap_check[label] = new_labels
            else:
                all_labels.add(label)
                overlap_check[label] = set((label,))
        inverse = {}
        for label, their_first in overlap_check.iteritems():
            for sub_label in their_first:
                if sub_label in inverse:
                    raise PgenError("ambiguous symbol with label %s"
                                    % (label,))
                inverse[sub_label] = label
        self.first[name] = all_labels
        return all_labels

    def expect(self, token_type, value=None):
        if token_type != self.type:
            expected = token.tok_name[token_type]
            got = token.tok_name[self.type]
            raise PgenError("expected token %s but got %s" % (expected, got),
                            self.location)
        current_value = self.value
        if value is not None:
            if value != current_value:
                msg = "expected %r but got %r" % (value, current_value)
                raise PgenError(msg, self.location)
        self.advance_token()
        return current_value

    def test_token(self, token_type, value):
        if self.type == token_type and self.value == value:
            return True
        return False

    def advance_token(self):
        data = self.token_stream.next()
        # Ignore comments and non-logical newlines.
        while data[0] in (tokenize.NL, tokenize.COMMENT):
            data = self.token_stream.next()
        self.type, self.value = data[:2]
        self.location = data[2:]

    def parse(self):
        self.advance_token()
        while self.type != token.ENDMARKER:
            # Skip over whitespace.
            while self.type == token.NEWLINE:
                self.advance_token()
            name, start_state, end_state = self.parse_rule()
            dfa = nfa_to_dfa(start_state, end_state)
            simplify_dfa(dfa)
            self.dfas[name] = dfa
            if self.start_symbol is None:
                self.start_symbol = name

    def parse_rule(self):
        # RULE: NAME ':' ALTERNATIVES
        name = self.expect(token.NAME)
        self.expect(token.OP, ":")
        start_state, end_state = self.parse_alternatives()
        self.expect(token.NEWLINE)
        return name, start_state, end_state
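
    # Illustration (added comments, not in the original): for an invented
    # rule such as
    #
    #     foo: NAME ('+' NAME)*
    #
    # parse_rule reads "foo" and ":", then parse_alternatives delegates to
    # parse_items, which chains the NFA fragment for NAME onto the fragment
    # that parse_item builds for the parenthesized, starred group.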

    def parse_alternatives(self):
        # ALTERNATIVES: ITEMS ('|' ITEMS)*
        first_state, end_state = self.parse_items()
        if self.test_token(token.OP, "|"):
            # Link all alternatives into an enclosing set of states.
            enclosing_start_state = NFA()
            enclosing_end_state = NFA()
            enclosing_start_state.arc(first_state)
            end_state.arc(enclosing_end_state)
            while self.test_token(token.OP, "|"):
                self.advance_token()
                sub_start_state, sub_end_state = self.parse_items()
                enclosing_start_state.arc(sub_start_state)
                sub_end_state.arc(enclosing_end_state)
            first_state = enclosing_start_state
            end_state = enclosing_end_state
        return first_state, end_state

    def parse_items(self):
        # ITEMS: ITEM+
        first_state, end_state = self.parse_item()
        while self.type in (token.STRING, token.NAME) or \
                           self.test_token(token.OP, "(") or \
                           self.test_token(token.OP, "["):
            sub_first_state, new_end_state = self.parse_item()
            end_state.arc(sub_first_state)
            end_state = new_end_state
        return first_state, end_state

    def parse_item(self):
        # ITEM: '[' ALTERNATIVES ']' | ATOM ['+' | '*']
        if self.test_token(token.OP, "["):
            self.advance_token()
            start_state, end_state = self.parse_alternatives()
            self.expect(token.OP, "]")
            # Bypass the rule if this is optional.
            start_state.arc(end_state)
            return start_state, end_state
        else:
            atom_state, next_state = self.parse_atom()
            # Check for a repeater.
            if self.type == token.OP and self.value in ("+", "*"):
                next_state.arc(atom_state)
                repeat = self.value
                self.advance_token()
                if repeat == "*":
                    # Optionally repeated.
                    return atom_state, atom_state
                else:
                    # Required.
                    return atom_state, next_state
            else:
                return atom_state, next_state

    def parse_atom(self):
        # ATOM: '(' ALTERNATIVES ')' | NAME | STRING
        if self.test_token(token.OP, "("):
            self.advance_token()
            rule = self.parse_alternatives()
            self.expect(token.OP, ")")
            return rule
        elif self.type in (token.NAME, token.STRING):
            atom_state = NFA()
            next_state = NFA()
            atom_state.arc(next_state, self.value)
            self.advance_token()
            return atom_state, next_state
        else:
            invalid = token.tok_name[self.type]
            raise PgenError("unexpected token: %s" % (invalid,),
                            self.location)
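

# Hedged usage sketch (not part of the original module): compiling a tiny
# grammar.  The one-rule grammar text is invented, and grammar_cls is assumed
# to be a pyparser.parser.Grammar-like class supplying the TOKENS,
# OPERATOR_MAP and KEYWORD_TOKEN attributes that make_label requires.

def _demo_parser_generator(grammar_cls):
    source = "eval_input: NAME NEWLINE\n"
    pgen = ParserGenerator(source)
    return pgen.build_grammar(grammar_cls)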