1.1 --- a/moinformat/__init__.py Fri May 05 22:38:31 2017 +0200
1.2 +++ b/moinformat/__init__.py Fri May 12 00:51:20 2017 +0200
1.3 @@ -19,91 +19,13 @@
1.4 this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 -from moinformat.parsing import ParserBase, TokenStream, get_patterns, \
1.8 - init_formats, new_block
1.9 +from moinformat.parsing import ParserBase, get_patterns, get_subset, new_block
1.10 from moinformat.serialisers import serialise
1.11 from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
1.12 Larger, ListItem, Monospace, Region, Rule, Smaller, \
1.13 Subscript, Superscript, Table, TableAttr, \
1.14 TableAttrs, TableCell, TableRow, Text, Underline
1.15
1.16 -# Regular expressions.
1.17 -
1.18 -syntax = {
1.19 - # Page regions:
1.20 - "regionstart" : r"((^\s*)([{]{3,}))", # {{{...
1.21 - "regionend" : r"^\s*([}]{3,})", # }}}...
1.22 - "header" : r"#!(.*?)\n", # #! char-excl-nl
1.23 -
1.24 - # Region contents:
1.25 - # Line-oriented patterns:
1.26 - # blank line
1.27 - "break" : r"^(\s*?)\n",
1.28 - # ws... expecting text ::
1.29 - "defterm" : r"^(\s+)(?=.+?::)",
1.30 - # ws... expecting :: ws...
1.31 - "defterm_empty" : r"^(\s+)(?=::\s+)",
1.32 - # [ws...] =... ws... expecting headingend
1.33 - "heading" : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
1.34 - # ws... list-item [ws...]
1.35 - "listitem" : r"^(\s+)(\*)(\s*)",
1.36 - # ws... number-item ws...
1.37 - "listitem_num" : r"^(\s+)(\d+\.)(\s+)",
1.38 - # ws... alpha-item ws...
1.39 - "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
1.40 - # ws... roman-item ws...
1.41 - "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
1.42 - # ws... dot-item [ws...]
1.43 - "listitem_dot" : r"^(\s+)(\.)(\s*)",
1.44 - # ||
1.45 - "tablerow" : r"^\|\|",
1.46 -
1.47 - # Region contents:
1.48 - # Inline patterns:
1.49 - "fontstyle" : r"('{2,6})",
1.50 - "larger" : r"~\+",
1.51 - "monospace" : r"`",
1.52 - "rule" : r"(-----*)", # ----...
1.53 - "smaller" : r"~-",
1.54 - "sub" : r",,",
1.55 - "super" : r"\^",
1.56 - "underline" : r"__",
1.57 -
1.58 - # Inline contents:
1.59 - "largerend" : r"\+~",
1.60 - "monospaceend" : r"`",
1.61 - "smallerend" : r"-~",
1.62 - "subend" : r",,",
1.63 - "superend" : r"\^",
1.64 - "underlineend" : r"__",
1.65 -
1.66 - # Heading contents:
1.67 - "headingend" : r"(\s+)(=+)(\s*\n)", # ws... =... [ws...] nl
1.68 -
1.69 - # List contents:
1.70 - "deftermend" : r"::(\s*?\n)",
1.71 - "deftermsep" : r"::(\s+)",
1.72 - "listitemend" : r"^", # next line
1.73 -
1.74 - # Table contents:
1.75 - "tableattrs" : r"<",
1.76 - "tablecell" : r"\|\|",
1.77 - "tableend" : r"(\s*?)^", # [ws...] next line
1.78 -
1.79 - # Table attributes:
1.80 - "tableattrsend" : r">",
1.81 - "halign" : r"([(:)])",
1.82 - "valign" : r"([v^])",
1.83 - "colour" : r"(\#[0-9A-F]{6})",
1.84 - "colspan" : r"-(\d+)",
1.85 - "rowspan" : r"\|(\d+)",
1.86 - "width" : r"(\d+%)",
1.87 - "attrname" : r"((?![-\d])[-\w]+)", # not-dash-or-digit dash-or-word-char...
1.88 - "attrvalue" : r"""=(?P<x>['"])(.*?)(?P=x)""",
1.89 - }
1.90 -
1.91 -
1.92 -
1.93 class Parser(ParserBase):
1.94
1.95 "A wiki region parser."
1.96 @@ -115,145 +37,101 @@
1.97 names to parser objects.
1.98 """
1.99
1.100 - default_formats = {"wiki" : self}
1.101 + # Introduce this class as the default parser for the wiki format.
1.102 +
1.103 + default_formats = {"wiki" : Parser}
1.104 if formats:
1.105 default_formats.update(formats)
1.106
1.107 ParserBase.__init__(self, default_formats)
1.108
1.109 - # Pattern details.
1.110 -
1.111 - patterns = get_patterns(syntax)
1.112 -
1.113 - table_pattern_names = [
1.114 - "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
1.115 - "valign", "width"
1.116 - ]
1.117 -
1.118 - inline_pattern_names = [
1.119 - "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
1.120 - ]
1.121 -
1.122 - region_pattern_names = inline_pattern_names + [
1.123 - "break", "heading", "defterm", "defterm_empty", "listitem",
1.124 - "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
1.125 - "regionstart", "regionend", "rule", "tablerow",
1.126 - ]
1.127 -
1.128 - table_region_pattern_names = inline_pattern_names + [
1.129 - "tableattrs", "tablecell", "tableend"
1.130 - ]
1.131 -
1.132 - def inline_patterns_for(self, name):
1.133 - names = self.inline_pattern_names[:]
1.134 - names[names.index(name)] = "%send" % name
1.135 - return names
1.136 -
1.137 # Principal parser methods.
1.138
1.139 - def get_items(self, s, pos=0):
1.140 -
1.141 - "Return a sequence of token items for 's' and 'pos'."
1.142 -
1.143 - return TokenStream(s, self.patterns, pos)
1.144 -
1.145 def parse(self, s):
1.146
1.147 """
1.148 Parse page text 's'. Pages consist of regions delimited by markers.
1.149 """
1.150
1.151 - items = self.get_items(s)
1.152 - region = Region([])
1.153 + self.items = self.get_items(s)
1.154 + self.region = Region([])
1.155
1.156 # Parse page header.
1.157
1.158 - self.parse_region_header(items, region)
1.159 + self.parse_region_header(self.region)
1.160
1.161 - # Handle pages directly with this parser.
1.162 + # Handle pages directly with this parser. Pages do not need to use an
1.163 + # explicit format indicator.
1.164 +
1.165 + if not self.region.type:
1.166 + self.parse_region_content(self.items, self.region)
1.167 +
1.168 # Otherwise, test the type and find an appropriate parser.
1.169
1.170 - if not region.type:
1.171 - self.parse_region_content(items, region)
1.172 else:
1.173 - self.parse_region_type(items, region)
1.174 -
1.175 - return region
1.176 -
1.177 - def parse_region_content(self, items, region):
1.178 -
1.179 - "Parse the data provided by 'items' to populate a wiki 'region'."
1.180 + self.parse_region_type(self.region)
1.181
1.182 - # Obtain a suitable token stream.
1.183 -
1.184 - items = self.replace_items(items)
1.185 -
1.186 - # Define a block to hold text and start parsing.
1.187 + return self.region
1.188
1.189 - new_block(region)
1.190 - self.parse_region_details(items, region, self.region_pattern_names)
1.191
1.192 - # Update the previous token stream.
1.193 -
1.194 - self.update_items(items)
1.195
1.196 # Parser methods supporting different page features.
1.197
1.198 - def parse_attrname(self, items, attrs):
1.199 + def parse_attrname(self, attrs):
1.200
1.201 "Handle an attribute name within 'attrs'."
1.202
1.203 - name = items.read_match()
1.204 + name = self.read_match()
1.205 attr = TableAttr(name)
1.206
1.207 - preceding = items.read_until(["attrvalue"], False)
1.208 + preceding = self.read_until(["attrvalue"], False)
1.209 if preceding == "":
1.210 - attr.quote = items.read_match(1)
1.211 - attr.value = items.read_match(2)
1.212 + attr.quote = self.read_match(1)
1.213 + attr.value = self.read_match(2)
1.214
1.215 attrs.append(attr)
1.216
1.217 - def parse_break(self, items, region):
1.218 + def parse_break(self, region):
1.219
1.220 "Handle a paragraph break within 'region'."
1.221
1.222 region.add(Break())
1.223 new_block(region)
1.224
1.225 - def parse_defitem(self, items, region, extra=""):
1.226 + def parse_defitem(self, region, extra=""):
1.227
1.228 "Handle a definition item within 'region'."
1.229
1.230 - pad = items.read_match(1)
1.231 + pad = self.read_match(1)
1.232 item = DefItem([], pad, extra)
1.233 - self.parse_region_details(items, item, ["listitemend"])
1.234 + self.parse_region_details(item, ["listitemend"])
1.235 region.add(item)
1.236 new_block(region)
1.237
1.238 - def parse_defterm(self, items, region):
1.239 + def parse_defterm(self, region):
1.240
1.241 "Handle a definition term within 'region'."
1.242
1.243 - pad = items.read_match(1)
1.244 + pad = self.read_match(1)
1.245 term = DefTerm([], pad)
1.246 - self.parse_region_details(items, term, ["deftermend", "deftermsep"])
1.247 + self.parse_region_details(term, ["deftermend", "deftermsep"])
1.248 region.add(term)
1.249 - if items.matching == "deftermsep":
1.250 - self.parse_defitem(items, region)
1.251 + if self.read_matching() == "deftermsep":
1.252 + self.parse_defitem(region)
1.253
1.254 - def parse_defterm_empty(self, items, region):
1.255 + def parse_defterm_empty(self, region):
1.256
1.257 "Handle an empty definition term within 'region'."
1.258
1.259 - extra = items.read_match(1)
1.260 - self.parse_region_details(items, region, ["deftermsep"])
1.261 - self.parse_defitem(items, region, extra)
1.262 + extra = self.read_match(1)
1.263 + self.parse_region_details(region, ["deftermsep"])
1.264 + self.parse_defitem(region, extra)
1.265
1.266 - def parse_fontstyle(self, items, region):
1.267 + def parse_fontstyle(self, region):
1.268
1.269 "Handle emphasis and strong styles."
1.270
1.271 - n = len(items.read_match(1))
1.272 + n = len(self.read_match(1))
1.273
1.274 # Handle endings.
1.275
1.276 @@ -271,7 +149,7 @@
1.277
1.278 if not active:
1.279 if n:
1.280 - items.rewind(n)
1.281 + self.items.rewind(n)
1.282 raise StopIteration
1.283
1.284 elif not n:
1.285 @@ -285,87 +163,87 @@
1.286
1.287 span = FontStyle([], emphasis, strong)
1.288 if not double:
1.289 - self.parse_region_details(items, span, self.inline_pattern_names)
1.290 + self.parse_region_details(span, self.inline_pattern_names)
1.291 region.append_inline(span)
1.292
1.293 - def parse_halign(self, items, attrs):
1.294 + def parse_halign(self, attrs):
1.295
1.296 "Handle horizontal alignment within 'attrs'."
1.297
1.298 - value = items.read_match()
1.299 + value = self.read_match()
1.300 attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True)
1.301 attrs.append(attr)
1.302
1.303 - def parse_heading(self, items, region):
1.304 + def parse_heading(self, region):
1.305
1.306 "Handle a heading."
1.307
1.308 - start_extra = items.read_match(1)
1.309 - level = len(items.read_match(2))
1.310 - start_pad = items.read_match(3)
1.311 + start_extra = self.read_match(1)
1.312 + level = len(self.read_match(2))
1.313 + start_pad = self.read_match(3)
1.314 heading = Heading([], level, start_extra, start_pad)
1.315 - self.parse_region_details(items, heading, ["headingend"] + self.inline_pattern_names)
1.316 + self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
1.317 region.add(heading)
1.318 new_block(region)
1.319
1.320 - def parse_heading_end(self, items, heading):
1.321 + def parse_heading_end(self, heading):
1.322
1.323 "Handle the end of a heading."
1.324
1.325 - level = len(items.read_match(2))
1.326 + level = len(self.read_match(2))
1.327 if heading.level == level:
1.328 - heading.end_pad = items.read_match(1)
1.329 - heading.end_extra = items.read_match(3)
1.330 + heading.end_pad = self.read_match(1)
1.331 + heading.end_extra = self.read_match(3)
1.332 raise StopIteration
1.333
1.334 - def parse_listitem(self, items, region):
1.335 + def parse_listitem(self, region):
1.336
1.337 "Handle a list item marker within 'region'."
1.338
1.339 - indent = len(items.read_match(1))
1.340 - marker = items.read_match(2)
1.341 - space = items.read_match(3)
1.342 + indent = len(self.read_match(1))
1.343 + marker = self.read_match(2)
1.344 + space = self.read_match(3)
1.345 item = ListItem([], indent, marker, space)
1.346 - self.parse_region_details(items, item, ["listitemend"])
1.347 + self.parse_region_details(item, self.listitem_pattern_names)
1.348 region.add(item)
1.349 new_block(region)
1.350
1.351 - def parse_rule(self, items, region):
1.352 + def parse_rule(self, region):
1.353
1.354 "Handle a horizontal rule within 'region'."
1.355
1.356 - length = len(items.read_match(1))
1.357 + length = len(self.read_match(1))
1.358 rule = Rule(length)
1.359 region.add(rule)
1.360 new_block(region)
1.361
1.362 - def parse_section(self, items, region):
1.363 + def parse_section(self, region):
1.364
1.365 "Handle the start of a new section within 'region'."
1.366
1.367 # Parse the section and start a new block after the section.
1.368
1.369 - indent = len(items.read_match(2))
1.370 - level = len(items.read_match(3))
1.371 - region.add(self.parse_region(items, level, indent))
1.372 + indent = len(self.read_match(2))
1.373 + level = len(self.read_match(3))
1.374 + region.add(self.parse_region(level, indent))
1.375 new_block(region)
1.376
1.377 - def parse_section_end(self, items, region):
1.378 + def parse_section_end(self, region):
1.379
1.380 "Handle the end of a new section within 'region'."
1.381
1.382 - feature = items.read_match()
1.383 + feature = self.read_match()
1.384 if region.have_end(feature):
1.385 raise StopIteration
1.386 else:
1.387 region.append_inline(Text(feature))
1.388
1.389 - def parse_table_attrs(self, items, cell):
1.390 + def parse_table_attrs(self, cell):
1.391
1.392 "Handle the start of table attributes within 'cell'."
1.393
1.394 attrs = TableAttrs([])
1.395 - self.parse_region_details(items, attrs, self.table_pattern_names)
1.396 + self.parse_region_details(attrs, self.table_pattern_names)
1.397
1.398 # Test the validity of the attributes.
1.399
1.400 @@ -396,7 +274,7 @@
1.401
1.402 cell.append_inline(Text(serialise(attrs)))
1.403
1.404 - def parse_table_row(self, items, region):
1.405 + def parse_table_row(self, region):
1.406
1.407 "Handle the start of a table row within 'region'."
1.408
1.409 @@ -414,12 +292,12 @@
1.410
1.411 while True:
1.412 cell = TableCell([])
1.413 - self.parse_region_details(items, cell, self.table_region_pattern_names)
1.414 + self.parse_region_details(cell, self.table_region_pattern_names)
1.415
1.416 # Handle the end of the row.
1.417
1.418 - if items.matching == "tableend":
1.419 - trailing = items.read_match()
1.420 + if self.read_matching() == "tableend":
1.421 + trailing = self.read_match()
1.422
1.423 # If the cell was started but not finished, convert the row into text.
1.424
1.425 @@ -453,11 +331,11 @@
1.426
1.427 new_block(region)
1.428
1.429 - def parse_valign(self, items, attrs):
1.430 + def parse_valign(self, attrs):
1.431
1.432 "Handle vertical alignment within 'attrs'."
1.433
1.434 - value = items.read_match()
1.435 + value = self.read_match()
1.436 attr = TableAttr("valign", value == "^" and "top" or "bottom", True)
1.437 attrs.append(attr)
1.438
1.439 @@ -465,54 +343,162 @@
1.440
1.441 # Inline formatting handlers.
1.442
1.443 - def parse_inline(self, items, region, cls, pattern_name):
1.444 + def parse_inline(self, region, cls, pattern_name):
1.445
1.446 "Handle an inline region."
1.447
1.448 span = cls([])
1.449 - self.parse_region_details(items, span, self.inline_patterns_for(pattern_name))
1.450 + self.parse_region_details(span, self.inline_patterns_for(pattern_name))
1.451 region.append_inline(span)
1.452
1.453 - def parse_larger(self, items, region):
1.454 - self.parse_inline(items, region, Larger, "larger")
1.455 + def parse_larger(self, region):
1.456 + self.parse_inline(region, Larger, "larger")
1.457
1.458 - def parse_monospace(self, items, region):
1.459 - self.parse_inline(items, region, Monospace, "monospace")
1.460 + def parse_monospace(self, region):
1.461 + self.parse_inline(region, Monospace, "monospace")
1.462
1.463 - def parse_smaller(self, items, region):
1.464 - self.parse_inline(items, region, Smaller, "smaller")
1.465 + def parse_smaller(self, region):
1.466 + self.parse_inline(region, Smaller, "smaller")
1.467
1.468 - def parse_sub(self, items, region):
1.469 - self.parse_inline(items, region, Subscript, "sub")
1.470 + def parse_sub(self, region):
1.471 + self.parse_inline(region, Subscript, "sub")
1.472
1.473 - def parse_super(self, items, region):
1.474 - self.parse_inline(items, region, Superscript, "super")
1.475 + def parse_super(self, region):
1.476 + self.parse_inline(region, Superscript, "super")
1.477
1.478 - def parse_underline(self, items, region):
1.479 - self.parse_inline(items, region, Underline, "underline")
1.480 + def parse_underline(self, region):
1.481 + self.parse_inline(region, Underline, "underline")
1.482
1.483
1.484
1.485 # Table attribute handlers.
1.486
1.487 - def parse_table_attr(self, items, attrs, pattern_name):
1.488 + def parse_table_attr(self, attrs, pattern_name):
1.489
1.490 "Handle a table attribute."
1.491
1.492 - value = items.read_match()
1.493 - attrs.append(TableAttr(pattern_name, value, True))
1.494 + attrs.append(TableAttr(pattern_name, self.read_match(), True))
1.495 +
1.496 + def parse_colour(self, cell):
1.497 + self.parse_table_attr(cell, "colour")
1.498 +
1.499 + def parse_colspan(self, cell):
1.500 + self.parse_table_attr(cell, "colspan")
1.501 +
1.502 + def parse_rowspan(self, cell):
1.503 + self.parse_table_attr(cell, "rowspan")
1.504 +
1.505 + def parse_width(self, cell):
1.506 + self.parse_table_attr(cell, "width")
1.507 +
1.508 +
1.509 +
1.510 + # Regular expressions.
1.511 +
1.512 + syntax = {
1.513 + # Page regions:
1.514 + "regionstart" : r"((^\N*)([{]{3,}))", # {{{...
1.515 + "regionend" : r"^\N*([}]{3,})", # }}}...
1.516 + "header" : r"#!(.*?)\n", # #! char-excl-nl
1.517
1.518 - def parse_colour(self, items, cell):
1.519 - self.parse_table_attr(items, cell, "colour")
1.520 + # Region contents:
1.521 + # Line-oriented patterns:
1.522 + # blank line
1.523 + "break" : r"^(\s*?)\n",
1.524 + # ws... expecting text ::
1.525 + "defterm" : r"^(\N+)(?=.+?::)",
1.526 + # ws... expecting :: ws...
1.527 + "defterm_empty" : r"^(\N+)(?=::\s+)",
1.528 + # [ws...] =... ws... expecting headingend
1.529 + "heading" : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
1.530 + # ws... list-item [ws...]
1.531 + "listitem" : r"^(\N+)(\*)(\s*)",
1.532 + # ws... number-item ws...
1.533 + "listitem_num" : r"^(\N+)(\d+\.)(\s+)",
1.534 + # ws... alpha-item ws...
1.535 + "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
1.536 + # ws... roman-item ws...
1.537 + "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
1.538 + # ws... dot-item [ws...]
1.539 + "listitem_dot" : r"^(\N+)(\.)(\s*)",
1.540 + # ||
1.541 + "tablerow" : r"^\|\|",
1.542 +
1.543 + # Region contents:
1.544 + # Inline patterns:
1.545 + "fontstyle" : r"('{2,6})",
1.546 + "larger" : r"~\+",
1.547 + "monospace" : r"`",
1.548 + "rule" : r"(-----*)", # ----...
1.549 + "smaller" : r"~-",
1.550 + "sub" : r",,",
1.551 + "super" : r"\^",
1.552 + "underline" : r"__",
1.553
1.554 - def parse_colspan(self, items, cell):
1.555 - self.parse_table_attr(items, cell, "colspan")
1.556 + # Inline contents:
1.557 + "largerend" : r"\+~",
1.558 + "monospaceend" : r"`",
1.559 + "smallerend" : r"-~",
1.560 + "subend" : r",,",
1.561 + "superend" : r"\^",
1.562 + "underlineend" : r"__",
1.563 +
1.564 + # Heading contents:
1.565 + "headingend" : r"(\N+)(=+)(\N*$)", # ws... =... [ws...] nl
1.566 +
1.567 + # List contents:
1.568 + "deftermend" : r"::(\s*?\n)",
1.569 + "deftermsep" : r"::(\s+)",
1.570 + "listitemend" : r"^", # next line
1.571 +
1.572 + # Table contents:
1.573 + "tableattrs" : r"<",
1.574 + "tablecell" : r"\|\|",
1.575 + "tableend" : r"(\s*?)^", # [ws...] next line
1.576
1.577 - def parse_rowspan(self, items, cell):
1.578 - self.parse_table_attr(items, cell, "rowspan")
1.579 + # Table attributes:
1.580 + "tableattrsend" : r">",
1.581 + "halign" : r"([(:)])",
1.582 + "valign" : r"([v^])",
1.583 + "colour" : r"(\#[0-9A-F]{6})",
1.584 + "colspan" : r"-(\d+)",
1.585 + "rowspan" : r"\|(\d+)",
1.586 + "width" : r"(\d+%)",
1.587 + "attrname" : r"((?![-\d])[-\w]+)", # not-dash-or-digit dash-or-word-char...
1.588 + "attrvalue" : r"""=(?P<x>['"])(.*?)(?P=x)""",
1.589 + }
1.590 +
1.591 + patterns = get_patterns(syntax)
1.592 +
1.593 +
1.594 +
1.595 + # Pattern details.
1.596
1.597 - def parse_width(self, items, cell):
1.598 - self.parse_table_attr(items, cell, "width")
1.599 + table_pattern_names = [
1.600 + "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
1.601 + "valign", "width"
1.602 + ]
1.603 +
1.604 + inline_pattern_names = [
1.605 + "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
1.606 + ]
1.607 +
1.608 + listitem_pattern_names = inline_pattern_names + ["listitemend"]
1.609 +
1.610 + region_pattern_names = inline_pattern_names + [
1.611 + "break", "heading", "defterm", "defterm_empty", "listitem",
1.612 + "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
1.613 + "regionstart", "regionend", "rule", "tablerow",
1.614 + ]
1.615 +
1.616 + table_region_pattern_names = inline_pattern_names + [
1.617 + "tableattrs", "tablecell", "tableend"
1.618 + ]
1.619 +
1.620 + def inline_patterns_for(self, name):
1.621 + names = self.inline_pattern_names[:]
1.622 + names[names.index(name)] = "%send" % name
1.623 + return names
1.624
1.625
1.626
1.627 @@ -570,6 +556,6 @@
1.628 # Top-level functions.
1.629
1.630 def parse(s, formats=None):
1.631 - return Parser(init_formats(formats)).parse(s)
1.632 + return Parser(formats).parse(s)
1.633
1.634 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/moinformat/parsing.py Fri May 05 22:38:31 2017 +0200
2.2 +++ b/moinformat/parsing.py Fri May 12 00:51:20 2017 +0200
2.3 @@ -40,20 +40,14 @@
2.4 patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
2.5 return patterns
2.6
2.7 -def combine_patterns(patterns, syntax):
2.8 +def get_subset(d, keys):
2.9
2.10 - "Combine 'patterns' with those defined by the given 'syntax' mapping."
2.11 -
2.12 - return combine_dicts([patterns, get_patterns(syntax)])
2.13 + "Return a subset of 'd' having the given 'keys'."
2.14
2.15 -def combine_dicts(dicts):
2.16 -
2.17 - "Combine the given 'dicts'."
2.18 -
2.19 - combined = {}
2.20 - for d in dicts:
2.21 - combined.update(d)
2.22 - return combined
2.23 + subset = {}
2.24 + for key in keys:
2.25 + subset[key] = d[key]
2.26 + return subset
2.27
2.28
2.29
2.30 @@ -63,9 +57,8 @@
2.31
2.32 "A stream of tokens taken from a string."
2.33
2.34 - def __init__(self, s, patterns, pos=0):
2.35 + def __init__(self, s, pos=0):
2.36 self.s = s
2.37 - self.patterns = patterns
2.38 self.pos = pos
2.39 self.match = None
2.40 self.matching = None
2.41 @@ -76,12 +69,12 @@
2.42
2.43 self.pos -= min(length, self.pos)
2.44
2.45 - def read_until(self, pattern_names, remaining=True):
2.46 + def read_until(self, patterns, remaining=True):
2.47
2.48 """
2.49 - Find the first match for the given 'pattern_names'. Return the text
2.50 - preceding any match, the remaining text if no match was found, or None
2.51 - if no match was found and 'remaining' is given as a false value.
2.52 + Find the first match for the given 'patterns'. Return the text preceding
2.53 + any match, the remaining text if no match was found, or None if no match
2.54 + was found and 'remaining' is given as a false value.
2.55 """
2.56
2.57 first = None
2.58 @@ -89,8 +82,8 @@
2.59
2.60 # Find the first matching pattern.
2.61
2.62 - for pattern_name in pattern_names:
2.63 - match = self.patterns[pattern_name].search(self.s, self.pos)
2.64 + for pattern_name, pattern in patterns.items():
2.65 + match = pattern.search(self.s, self.pos)
2.66 if match:
2.67 start, end = match.span()
2.68 if self.matching is None or start < first:
2.69 @@ -143,6 +136,8 @@
2.70
2.71 "Common parsing methods."
2.72
2.73 + region_pattern_names = None
2.74 +
2.75 def __init__(self, formats=None):
2.76
2.77 """
2.78 @@ -151,26 +146,66 @@
2.79 """
2.80
2.81 self.formats = formats
2.82 - self.replaced_items = None
2.83 +
2.84 + def get_parser(self, format_type):
2.85 +
2.86 + """
2.87 + Return a parser for 'format_type' or None if no suitable parser is found.
2.88 + """
2.89 +
2.90 + if not self.formats:
2.91 + return None
2.92 +
2.93 + cls = self.formats.get(format_type)
2.94 + if cls:
2.95 + return cls(self.formats)
2.96 + else:
2.97 + return None
2.98 +
2.99 + def get_patterns(self, pattern_names):
2.100 +
2.101 + "Return a mapping of the given 'pattern_names' to patterns."
2.102 +
2.103 + return get_subset(self.patterns, pattern_names)
2.104
2.105 def get_items(self, s, pos=0):
2.106
2.107 "Return a sequence of token items for 's' and 'pos'."
2.108
2.109 - raise NotImplementedError
2.110 + return TokenStream(s, pos)
2.111 +
2.112 + def set_region(self, items, region):
2.113 +
2.114 + "Set the 'items' used to populate the given 'region'."
2.115
2.116 - def replace_items(self, items):
2.117 + self.items = items
2.118 + self.region = region
2.119 +
2.120 + def read_until(self, pattern_names, remaining=True):
2.121
2.122 - "Replace the given 'items' with a sequence employing the same state."
2.123 + """
2.124 + Read the next portion of input, matching using 'pattern_names'. Return
2.125 + the text preceding any match, the remaining text if no match was found,
2.126 + or None if no match was found and 'remaining' is given as a false value.
2.127 + """
2.128
2.129 - self.replaced_items = items
2.130 - return self.get_items(items.s, items.pos)
2.131 + return self.items.read_until(self.get_patterns(pattern_names), remaining)
2.132 +
2.133 + def read_match(self, group=1):
2.134 +
2.135 + """
2.136 + Return the group of the matching pattern with the given 'group' number.
2.137 + """
2.138
2.139 - def update_items(self, items):
2.140 + return self.items.read_match(group)
2.141 +
2.142 + def read_matching(self):
2.143
2.144 - "Update the state of the replaced items with that of 'items'."
2.145 + "Return the name of the matching pattern."
2.146
2.147 - self.replaced_items.pos = items.pos
2.148 + return self.items.matching
2.149 +
2.150 + # Parser methods invoked from other objects.
2.151
2.152 def parse(self, s):
2.153
2.154 @@ -178,92 +213,104 @@
2.155 Parse page text 's'. Pages consist of regions delimited by markers.
2.156 """
2.157
2.158 - return self.parse_region(self.get_items(s))
2.159 + self.items = self.get_items(s)
2.160 + self.region = self.parse_region()
2.161 + return self.region
2.162 +
2.163 + def parse_region_content(self, items, region):
2.164 +
2.165 + "Parse the data provided by 'items' to populate a 'region'."
2.166 +
2.167 + self.set_region(items, region)
2.168
2.169 - def parse_region(self, items, level=0, indent=0):
2.170 + # Define a block to hold text and start parsing.
2.171 +
2.172 + new_block(region)
2.173 +
2.174 + if self.region_pattern_names:
2.175 + self.parse_region_details(region, self.region_pattern_names)
2.176 +
2.177 + # Top-level parser handler methods.
2.178 +
2.179 + def parse_region(self, level=0, indent=0):
2.180
2.181 """
2.182 - Parse the data provided by 'items' to populate a region with the given
2.183 - 'level' at the given 'indent'.
2.184 + Parse the data to populate a region with the given 'level' at the given
2.185 + 'indent'.
2.186 """
2.187
2.188 region = Region([], level, indent)
2.189
2.190 # Parse section headers, then parse according to region type.
2.191
2.192 - self.parse_region_header(items, region)
2.193 - self.parse_region_type(items, region)
2.194 + self.parse_region_header(region)
2.195 + self.parse_region_type(region)
2.196
2.197 return region
2.198
2.199 - def parse_region_type(self, items, region):
2.200 + def parse_region_type(self, region):
2.201
2.202 """
2.203 - Given data provided by 'items', use configured parsers to parse the
2.204 - 'region' based on its type.
2.205 + Use configured parsers to parse 'region' based on its type.
2.206 """
2.207
2.208 # Find an appropriate parser given the type.
2.209
2.210 - if self.formats.has_key(region.type):
2.211 - self.formats[region.type].parse_region_content(items, region)
2.212 + parser = self.get_parser(region.type)
2.213 +
2.214 + if parser:
2.215 + parser.parse_region_content(self.items, region)
2.216
2.217 # Otherwise, treat the section as opaque.
2.218
2.219 else:
2.220 - self.parse_region_opaque(items, region)
2.221 + self.parse_region_opaque(region)
2.222
2.223 - def parse_region_header(self, items, region):
2.224 + def parse_region_header(self, region):
2.225
2.226 """
2.227 - Parse the region header from the 'items', setting it for the given 'region'.
2.228 + Parse the region header, setting it on the 'region' object.
2.229 """
2.230
2.231 - if items.read_until(["header"], False) == "": # None means no header
2.232 - region.type = items.read_match()
2.233 + if self.read_until(["header"], False) == "": # None means no header
2.234 + region.type = self.read_match()
2.235
2.236 - def parse_region_opaque(self, items, region):
2.237 + def parse_region_opaque(self, region):
2.238
2.239 - "Parse the data provided by 'items' to populate an opaque 'region'."
2.240 + "Parse the data to populate an opaque 'region'."
2.241
2.242 region.transparent = False
2.243 - self.parse_region_details(items, region, ["regionend"])
2.244 -
2.245 - def parse_region_content(self, items, region):
2.246 -
2.247 - "Parse the data provided by 'items' to populate the given 'region'."
2.248 -
2.249 - pass
2.250 + self.parse_region_details(region, ["regionend"])
2.251
2.252 # Parsing utilities.
2.253
2.254 - def parse_region_details(self, items, region, pattern_names):
2.255 + def parse_region_details(self, region, pattern_names):
2.256
2.257 - "Parse 'items' within 'region' searching using 'pattern_names'."
2.258 + "Search 'region' using the 'pattern_names'."
2.259
2.260 try:
2.261 while True:
2.262
2.263 # Obtain text before any marker or the end of the input.
2.264
2.265 - preceding = items.read_until(pattern_names)
2.266 + preceding = self.read_until(pattern_names)
2.267 if preceding:
2.268 region.append_inline(Text(preceding))
2.269
2.270 # End of input.
2.271
2.272 - if not items.matching:
2.273 + if not self.read_matching():
2.274 break
2.275
2.276 # Obtain any feature.
2.277
2.278 - feature = items.read_match()
2.279 - handler = self.handlers.get(items.matching)
2.280 + feature = self.read_match()
2.281 + handler = self.handlers.get(self.read_matching())
2.282
2.283 # Handle each feature or add text to the region.
2.284
2.285 if handler:
2.286 - handler(self, items, region)
2.287 + handler(self, region)
2.288 else:
2.289 region.append_inline(Text(feature))
2.290
2.291 @@ -272,26 +319,10 @@
2.292
2.293 region.normalise()
2.294
2.295 - def end_region(self, items, region):
2.296 + def end_region(self, region):
2.297
2.298 "End the parsing of 'region', breaking out of the parsing loop."
2.299
2.300 raise StopIteration
2.301
2.302 -
2.303 -# Format mapping initialisation.
2.304 -
2.305 -def init_formats(formats):
2.306 -
2.307 - """
2.308 - Convert the given 'formats' mapping from a name-to-class mapping to a
2.309 - name-to-instance mapping with each parser instance employing the format
2.310 - mapping itself. Return the converted mapping.
2.311 - """
2.312 -
2.313 - d = {}
2.314 - for name, cls in formats.items():
2.315 - d[name] = cls(d)
2.316 - return d
2.317 -
2.318 # vim: tabstop=4 expandtab shiftwidth=4