#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Document transformations.

from moinformat.macros import get_macro

# Parser functionality and pattern definition.

from moinformat.parsers.common import ParserBase, get_patterns, \
                                      excl, expect, group, optional, recur, \
                                      repeat

# Serialisation.

from moinformat.serialisers import serialise

# Document tree nodes.

from moinformat.tree.moin import Break, DefItem, DefTerm, FontStyle, Heading, \
                                 Larger, Link, List, ListItem, Macro, \
                                 Monospace, Region, Rule, Smaller, \
                                 Strikethrough, Subscript, Superscript, Table, \
                                 TableAttr, TableAttrs, TableCell, TableRow, \
                                 Text, Underline

join = "".join

class MoinParser(ParserBase):

    """
    A wiki region parser.

    The 'syntax' mapping defines the regular expression fragments for each
    named pattern, the '*_pattern_names' lists define which patterns are
    active inside particular markup features, and the 'handlers' mapping
    associates each pattern name with the method invoked when it matches.
    """

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats, root)

        # Record certain node occurrences for later evaluation.

        self.macros = []

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region describing the page.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Macro evaluation.

    def evaluate_macros(self):

        "Evaluate the macro nodes in the document."

        for node in self.macros:

            # Obtain a class for the named macro.

            macro_cls = get_macro(node.name)
            if not macro_cls:
                continue

            # Instantiate the class and evaluate the macro.

            macro = macro_cls(node, self.region)
            macro.evaluate()



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.match_group("name")
        attr = TableAttr(name)

        # Accept a value only where nothing was read before the value pattern.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed on
        to the item node.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator on the same line introduces the definition item.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        "Handle emphasis and strong styles."

        # The number of quote characters selects the styles involved.

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Leave the span when it is no longer active, handing back any
            # unused quote characters.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        "Handle the end of a heading."

        # Only a terminator of the same level as the heading ends it.

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item'. Return the list node."

        list_node = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(list_node, self.list_pattern_names, True)
        return list_node

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_attr_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_row_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")
        alignment = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", alignment, True))



    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which the pattern with the
        given 'name' is replaced by its corresponding ending pattern.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region, creating a span of the given 'cls' within
        'region' that is terminated by the ending form of 'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        "Handle monospaced text within 'region'."

        # Only the ending pattern is active inside the span.

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_link(self, region):

        "Handle a link within 'region'."

        target = self.match_group("target")
        text = self.match_group("text")
        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_macro(self, region):

        "Handle a macro within 'region', recording it for later evaluation."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist)
        region.append_inline(macro)

        # Record the macro for later processing.

        self.root.macros.append(macro)



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        "Handle a table attribute."

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # All fragments containing backslashes are raw strings so that the
    # backslashes reach the pattern compiler unchanged.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # ::
                                                                            # ws...

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws...

        "listitemend"   : r"^",                                             # next line

        # Table contents:

        "tableattrs"    : r"<",                                             # <
        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_attr_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "macro", "monospace", "regionstart",
        "smaller", "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    table_row_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    # The region pattern names are specifically used by the common parser
    # functionality.

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]



    # Pattern handlers.

    end_region = ParserBase.end_region
    parse_section_end = ParserBase.parse_region_end

    handlers = {
        None                    : end_region,
        "attrname"              : parse_attrname,
        "break"                 : parse_break,
        "colour"                : parse_colour,
        "colspan"               : parse_colspan,
        "defterm"               : parse_defterm,
        "defterm_empty"         : parse_defterm_empty,
        "deftermend"            : end_region,
        "deftermsep"            : end_region,
        "fontstyle"             : parse_fontstyle,
        "halign"                : parse_halign,
        "heading"               : parse_heading,
        "headingend"            : parse_heading_end,
        "larger"                : parse_larger,
        "largerend"             : end_region,
        "link"                  : parse_link,
        "macro"                 : parse_macro,
        "listitemend"           : end_region,
        "listitem"              : parse_listitem,
        "listitem_alpha"        : parse_listitem,
        "listitem_dot"          : parse_listitem,
        "listitem_num"          : parse_listitem,
        "listitem_roman"        : parse_listitem,
        "monospace"             : parse_monospace,
        "monospaceend"          : end_region,
        "regionstart"           : parse_section,
        "regionend"             : parse_section_end,
        "rowspan"               : parse_rowspan,
        "rule"                  : parse_rule,
        "smaller"               : parse_smaller,
        "smallerend"            : end_region,
        "strike"                : parse_strike,
        "strikeend"             : end_region,
        "sub"                   : parse_sub,
        "subend"                : end_region,
        "super"                 : parse_super,
        "superend"              : end_region,
        "tableattrs"            : parse_table_attrs,
        "tableattrsend"         : end_region,
        "tablerow"              : parse_table_row,
        "tablecell"             : end_region,
        "tableend"              : end_region,
        "underline"             : parse_underline,
        "underlineend"          : end_region,
        "valign"                : parse_valign,
        "width"                 : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4