1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/pyparser/parser.py Sun Jan 08 20:20:39 2017 +0100
1.3 @@ -0,0 +1,287 @@
1.4 +"""
1.5 +A CPython inspired RPython parser.
1.6 +"""
1.7 +
1.8 +
1.9 +class Grammar(object):
1.10 + """
1.11 + Base Grammar object.
1.12 +
1.13 + Pass this to ParserGenerator.build_grammar to fill it with useful values for
1.14 + the Parser.
1.15 + """
1.16 +
1.17 + def __init__(self):
1.18 + self.symbol_ids = {}
1.19 + self.symbol_names = {}
1.20 + self.symbol_to_label = {}
1.21 + self.keyword_ids = {}
1.22 + self.dfas = []
1.23 + self.labels = [0]
1.24 + self.token_ids = {}
1.25 + self.start = -1
1.26 +
1.27 + def shared_copy(self):
1.28 + new = self.__class__()
1.29 + new.symbol_ids = self.symbol_ids
1.30 + new.symbols_names = self.symbol_names
1.31 + new.keyword_ids = self.keyword_ids
1.32 + new.dfas = self.dfas
1.33 + new.labels = self.labels
1.34 + new.token_ids = self.token_ids
1.35 + return new
1.36 +
1.37 + def _freeze_(self):
1.38 + # Remove some attributes not used in parsing.
1.39 + try:
1.40 + del self.symbol_to_label
1.41 + del self.symbol_names
1.42 + del self.symbol_ids
1.43 + except AttributeError:
1.44 + pass
1.45 + return True
1.46 +
1.47 +
1.48 +class Node(object):
1.49 +
1.50 + __slots__ = ("type", )
1.51 +
1.52 + def __init__(self, type):
1.53 + self.type = type
1.54 +
1.55 + def __eq__(self, other):
1.56 + raise NotImplementedError("abstract base class")
1.57 +
1.58 + def __ne__(self, other):
1.59 + return not self == other
1.60 +
1.61 + def get_value(self):
1.62 + return None
1.63 +
1.64 + def get_child(self, i):
1.65 + raise NotImplementedError("abstract base class")
1.66 +
1.67 + def num_children(self):
1.68 + return 0
1.69 +
1.70 + def append_child(self, child):
1.71 + raise NotImplementedError("abstract base class")
1.72 +
1.73 + def get_lineno(self):
1.74 + raise NotImplementedError("abstract base class")
1.75 +
1.76 + def get_column(self):
1.77 + raise NotImplementedError("abstract base class")
1.78 +
1.79 +
1.80 +class Terminal(Node):
1.81 + __slots__ = ("value", "lineno", "column")
1.82 + def __init__(self, type, value, lineno, column):
1.83 + Node.__init__(self, type)
1.84 + self.value = value
1.85 + self.lineno = lineno
1.86 + self.column = column
1.87 +
1.88 + def __repr__(self):
1.89 + return "Terminal(type=%s, value=%r)" % (self.type, self.value)
1.90 +
1.91 + def __eq__(self, other):
1.92 + # For tests.
1.93 + return (type(self) == type(other) and
1.94 + self.type == other.type and
1.95 + self.value == other.value)
1.96 +
1.97 + def get_value(self):
1.98 + return self.value
1.99 +
1.100 + def get_lineno(self):
1.101 + return self.lineno
1.102 +
1.103 + def get_column(self):
1.104 + return self.column
1.105 +
1.106 +
1.107 +class AbstractNonterminal(Node):
1.108 + __slots__ = ()
1.109 +
1.110 + def get_lineno(self):
1.111 + return self.get_child(0).get_lineno()
1.112 +
1.113 + def get_column(self):
1.114 + return self.get_child(0).get_column()
1.115 +
1.116 + def __eq__(self, other):
1.117 + # For tests.
1.118 + # grumble, annoying
1.119 + if not isinstance(other, AbstractNonterminal):
1.120 + return False
1.121 + if self.type != other.type:
1.122 + return False
1.123 + if self.num_children() != other.num_children():
1.124 + return False
1.125 + for i in range(self.num_children()):
1.126 + if self.get_child(i) != other.get_child(i):
1.127 + return False
1.128 + return True
1.129 +
1.130 +
1.131 +class Nonterminal(AbstractNonterminal):
1.132 + __slots__ = ("_children", )
1.133 + def __init__(self, type, children):
1.134 + Node.__init__(self, type)
1.135 + self._children = children
1.136 +
1.137 + def __repr__(self):
1.138 + return "Nonterminal(type=%s, children=%r)" % (self.type, self._children)
1.139 +
1.140 + def get_child(self, i):
1.141 + return self._children[i]
1.142 +
1.143 + def num_children(self):
1.144 + return len(self._children)
1.145 +
1.146 + def append_child(self, child):
1.147 + self._children.append(child)
1.148 +
1.149 +
1.150 +class Nonterminal1(AbstractNonterminal):
1.151 + __slots__ = ("_child", )
1.152 + def __init__(self, type, child):
1.153 + Node.__init__(self, type)
1.154 + self._child = child
1.155 +
1.156 + def __repr__(self):
1.157 + return "Nonterminal(type=%s, children=[%r])" % (self.type, self._child)
1.158 +
1.159 + def get_child(self, i):
1.160 + assert i == 0 or i == -1
1.161 + return self._child
1.162 +
1.163 + def num_children(self):
1.164 + return 1
1.165 +
1.166 + def append_child(self, child):
1.167 + assert 0, "should be unreachable"
1.168 +
1.169 +
1.170 +
1.171 +class ParseError(Exception):
1.172 +
1.173 + def __init__(self, msg, token_type, value, lineno, column, line,
1.174 + expected=-1):
1.175 + self.msg = msg
1.176 + self.token_type = token_type
1.177 + self.value = value
1.178 + self.lineno = lineno
1.179 + self.column = column
1.180 + self.line = line
1.181 + self.expected = expected
1.182 +
1.183 + def __str__(self):
1.184 + return "ParserError(%s, %r)" % (self.token_type, self.value)
1.185 +
1.186 +
1.187 +class Parser(object):
1.188 +
1.189 + def __init__(self, grammar):
1.190 + self.grammar = grammar
1.191 + self.root = None
1.192 + self.stack = None
1.193 +
1.194 + def prepare(self, start=-1):
1.195 + """Setup the parser for parsing.
1.196 +
1.197 + Takes the starting symbol as an argument.
1.198 + """
1.199 + if start == -1:
1.200 + start = self.grammar.start
1.201 + self.root = None
1.202 + current_node = Nonterminal(start, [])
1.203 + self.stack = []
1.204 + self.stack.append((self.grammar.dfas[start - 256], 0, current_node))
1.205 +
1.206 + def add_token(self, token_type, value, lineno, column, line):
1.207 + label_index = self.classify(token_type, value, lineno, column, line)
1.208 + sym_id = 0 # for the annotator
1.209 + while True:
1.210 + dfa, state_index, node = self.stack[-1]
1.211 + states, first = dfa
1.212 + arcs, is_accepting = states[state_index]
1.213 + for i, next_state in arcs:
1.214 + sym_id = self.grammar.labels[i]
1.215 + if label_index == i:
1.216 + # We matched a non-terminal.
1.217 + self.shift(next_state, token_type, value, lineno, column)
1.218 + state = states[next_state]
1.219 + # While the only possible action is to accept, pop nodes off
1.220 + # the stack.
1.221 + while state[1] and not state[0]:
1.222 + self.pop()
1.223 + if not self.stack:
1.224 + # Parsing is done.
1.225 + return True
1.226 + dfa, state_index, node = self.stack[-1]
1.227 + state = dfa[0][state_index]
1.228 + return False
1.229 + elif sym_id >= 256:
1.230 + sub_node_dfa = self.grammar.dfas[sym_id - 256]
1.231 + # Check if this token can start a child node.
1.232 + if label_index in sub_node_dfa[1]:
1.233 + self.push(sub_node_dfa, next_state, sym_id, lineno,
1.234 + column)
1.235 + break
1.236 + else:
1.237 + # We failed to find any arcs to another state, so unless this
1.238 + # state is accepting, it's invalid input.
1.239 + if is_accepting:
1.240 + self.pop()
1.241 + if not self.stack:
1.242 + raise ParseError("too much input", token_type, value,
1.243 + lineno, column, line)
1.244 + else:
1.245 + # If only one possible input would satisfy, attach it to the
1.246 + # error.
1.247 + if len(arcs) == 1:
1.248 + expected = sym_id
1.249 + else:
1.250 + expected = -1
1.251 + raise ParseError("bad input", token_type, value, lineno,
1.252 + column, line, expected)
1.253 +
1.254 + def classify(self, token_type, value, lineno, column, line):
1.255 + """Find the label for a token."""
1.256 + if token_type == self.grammar.KEYWORD_TOKEN:
1.257 + label_index = self.grammar.keyword_ids.get(value, -1)
1.258 + if label_index != -1:
1.259 + return label_index
1.260 + label_index = self.grammar.token_ids.get(token_type, -1)
1.261 + if label_index == -1:
1.262 + raise ParseError("invalid token", token_type, value, lineno, column,
1.263 + line)
1.264 + return label_index
1.265 +
1.266 + def shift(self, next_state, token_type, value, lineno, column):
1.267 + """Shift a non-terminal and prepare for the next state."""
1.268 + dfa, state, node = self.stack[-1]
1.269 + new_node = Terminal(token_type, value, lineno, column)
1.270 + node.append_child(new_node)
1.271 + self.stack[-1] = (dfa, next_state, node)
1.272 +
1.273 + def push(self, next_dfa, next_state, node_type, lineno, column):
1.274 + """Push a terminal and adjust the current state."""
1.275 + dfa, state, node = self.stack[-1]
1.276 + new_node = Nonterminal(node_type, [])
1.277 + self.stack[-1] = (dfa, next_state, node)
1.278 + self.stack.append((next_dfa, 0, new_node))
1.279 +
1.280 + def pop(self):
1.281 + """Pop an entry off the stack and make its node a child of the last."""
1.282 + dfa, state, node = self.stack.pop()
1.283 + if self.stack:
1.284 + # we are now done with node, so we can store it more efficiently if
1.285 + # it has just one child
1.286 + if node.num_children() == 1:
1.287 + node = Nonterminal1(node.type, node.get_child(0))
1.288 + self.stack[-1][2].append_child(node)
1.289 + else:
1.290 + self.root = node