1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/moinformat/parsers/moin.py Wed Dec 13 00:50:09 2017 +0100
1.3 @@ -0,0 +1,556 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Moin wiki format parser.
1.8 +
1.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +"""
1.24 +
1.25 +from moinformat.parsers.common import ParserBase, get_patterns, get_subset, new_block
1.26 +from moinformat.serialisers import serialise
1.27 +from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
1.28 + Larger, ListItem, Monospace, Region, Rule, Smaller, \
1.29 + Subscript, Superscript, Table, TableAttr, \
1.30 + TableAttrs, TableCell, TableRow, Text, Underline
1.31 +
class MoinParser(ParserBase):

    """
    A wiki region parser.

    Handler methods are registered by pattern name in 'handlers' at the end of
    the class and are invoked with the node currently being populated. Several
    handlers raise StopIteration to indicate that the current node is complete
    (presumably caught by the region parsing loop in ParserBase; confirm there).
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. Entries in 'formats' override the defaults.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([])

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        """
        Handle an attribute name within 'attrs', recording any quoted value
        that immediately follows the name.
        """

        name = self.read_match()
        attr = TableAttr(name)

        # Only accept a value where no other text separates it from the name.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.read_match(1)
            attr.value = self.read_match(2)

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        region.add(Break())
        new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed
        on to the item (it is supplied for items following an empty term).
        """

        pad = self.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_defterm(self, region):

        """
        Handle a definition term within 'region', continuing with a definition
        item where the term ended with a separator on the same line.
        """

        pad = self.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        region.add(term)

        if self.read_matching() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.read_match(1)
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles, these being indicated by a run of
        between two and six quote characters. Two quotes toggle emphasis, three
        toggle strong, and longer runs combine the two.
        """

        n = len(self.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Close each active style that the run can account for, deducting
            # the quotes used from the run.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # With the span no longer active, return any unused quotes to the
            # input and finish the span.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # Otherwise, stop here if the run has been used completely.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)

        # Doubled markers (four or six quotes) yield a span without content.

        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.read_match()

        # The pattern admits "(", ":" and ")", with ":" indicating centring.

        alignment = {"(" : "left", ")" : "right"}.get(value, "center")
        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.read_match(1)
        level = len(self.read_match(2))
        start_pad = self.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        region.add(heading)
        new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading. Only an ending marker having the same
        level as 'heading' completes it; other markers are ignored here.
        """

        level = len(self.read_match(2))
        if heading.level == level:
            heading.end_pad = self.read_match(1)
            heading.end_extra = self.read_match(3)
            raise StopIteration

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.read_match(1))
        marker = self.read_match(2)
        space = self.read_match(3)
        item = ListItem([], indent, marker, space)
        self.parse_region_details(item, self.listitem_pattern_names)
        region.add(item)
        new_block(region)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.read_match(1))
        rule = Rule(length)
        region.add(rule)
        new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.read_match(2))
        level = len(self.read_match(3))
        region.add(self.parse_region(level, indent))
        new_block(region)

    def parse_section_end(self, region):

        """
        Handle the end of a section within 'region'. Markers not ending the
        region are retained as text.
        """

        feature = self.read_match()
        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        """
        Handle the start of table attributes within 'cell'. Valid attributes
        are stored on the cell; invalid ones are added to it as plain text.
        """

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        """
        Handle the start of a table row within 'region'. A complete row is
        added to the table directly preceding the current block, if there is
        one, or to a new table otherwise. An incomplete row is added to the
        region as text.
        """

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.read_matching() == "tableend":
                trailing = self.read_match()

                # If the cell was started but not finished, or if there are
                # no complete cells at all, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    new_block(region)
                    return

                # The final cell is necessarily empty at this point, so there
                # is nothing more to append to the row.

                row.trailing = trailing
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)

        # Test for the absence of a new table explicitly instead of relying on
        # the truth value of a container node.

        if new_table is not None:
            region.add(new_table)

        new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.read_match()

        # The pattern admits only "^" and "v".

        alignment = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", alignment, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a node of the given
        'cls' whose content ends at the ending pattern for 'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        "Handle monospaced text within 'region'."

        self.parse_inline(region, Monospace, "monospace")

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Table attribute handlers.

    # NOTE: The specific handlers below name their parameter 'cell' but merely
    # NOTE: pass it on to parse_table_attr, which appends to it. They are only
    # NOTE: listed in table_pattern_names, which parse_table_attrs uses when
    # NOTE: populating an attribute collection.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, adding to 'attrs' a concise attribute named
        'pattern_name' and having the matched text as its value.
        """

        attrs.append(TableAttr(pattern_name, self.read_match(), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # NOTE: \N is not a standard regular expression escape. It is presumably
    # NOTE: expanded by get_patterns, and the comments below treat it as
    # NOTE: whitespace ("ws"); confirm against moinformat.parsers.common.

    syntax = {
        # Page regions:
        "regionstart"   : r"((^\N*)([{]{3,}))",     # {{{...
        "regionend"     : r"^\N*([}]{3,})",         # }}}...
        "header"        : r"#!(.*?)\n",             # #! char-excl-nl

        # Region contents:
        # Line-oriented patterns:
                                                    # blank line
        "break"         : r"^(\s*?)\n",
                                                    # ws... expecting text ::
        "defterm"       : r"^(\N+)(?=.+?::)",
                                                    # ws... expecting :: ws...
        "defterm_empty" : r"^(\N+)(?=::\s+)",
                                                    # [ws...] =... ws... expecting headingend
        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
                                                    # ws... list-item [ws...]
        "listitem"      : r"^(\N+)(\*)(\s*)",
                                                    # ws... number-item ws...
        "listitem_num"  : r"^(\N+)(\d+\.)(\s+)",
                                                    # ws... alpha-item ws...
        "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
                                                    # ws... roman-item ws...
        "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
                                                    # ws... dot-item [ws...]
        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
                                                    # ||
        "tablerow"      : r"^\|\|",

        # Region contents:
        # Inline patterns:
        "fontstyle"     : r"('{2,6})",
        "larger"        : r"~\+",
        "monospace"     : r"`",
        "rule"          : r"(-----*)",              # ----...
        "smaller"       : r"~-",
        "sub"           : r",,",
        "super"         : r"\^",
        "underline"     : r"__",

        # Inline contents:
        "largerend"     : r"\+~",
        "monospaceend"  : r"`",
        "smallerend"    : r"-~",
        "subend"        : r",,",
        "superend"      : r"\^",
        "underlineend"  : r"__",

        # Heading contents:
        "headingend"    : r"(\N+)(=+)(\N*$)",       # ws... =... [ws...] nl

        # List contents:
        "deftermend"    : r"::(\s*?\n)",
        "deftermsep"    : r"::(\s+)",
        "listitemend"   : r"^",                     # next line

        # Table contents:
        "tableattrs"    : r"<",
        "tablecell"     : r"\|\|",
        "tableend"      : r"(\s*?)^",               # [ws...] next line

        # Table attributes:
        "tableattrsend" : r">",
        "halign"        : r"([(:)])",
        "valign"        : r"([v^])",
        "colour"        : r"(\#[0-9A-F]{6})",
        "colspan"       : r"-(\d+)",
        "rowspan"       : r"\|(\d+)",
        "width"         : r"(\d+%)",
        "attrname"      : r"((?![-\d])[-\w]+)",     # not-dash-or-digit dash-or-word-char...
        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
        }

    patterns = get_patterns(syntax)



    # Pattern details.

    # Patterns recognised within table attribute collections.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    # Patterns introducing inline formatting.

    inline_pattern_names = [
        "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
        ]

    # Patterns recognised within list items.

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    # Patterns recognised directly within regions.

    region_pattern_names = inline_pattern_names + [
        "break", "heading", "defterm", "defterm_empty", "listitem",
        "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    # Patterns recognised within table cells.

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with 'name' replaced by the
        name of its ending pattern. A ValueError is raised if 'name' is not an
        inline pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # The handlers are stored as plain functions taken from the class
    # namespace. The None entry supplies the handler used where there is no
    # pattern name (presumably when no pattern matched; confirm in ParserBase).

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }
1.556 +
# Make the parser class available under the generic module-level name 'parser'.
parser = MoinParser
1.558 +
1.559 +# vim: tabstop=4 expandtab shiftwidth=4