#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Document transformations.

from moinformat.macros import get_macro

# Parser functionality and pattern definition.

from moinformat.parsers.common import ParserBase, get_patterns, choice, \
                                      excl, expect, group, optional, recur, \
                                      repeat

# Serialisation.

from moinformat.serialisers import serialise

# Document tree nodes.

from moinformat.tree.moin import Anchor, Break, DefItem, DefTerm, FontStyle, \
                                 Heading, Larger, LineBreak, Link, List, \
                                 ListItem, Macro, Monospace, Region, Rule, \
                                 Smaller, Strikethrough, Subscript, \
                                 Superscript, Table, TableAttr, TableAttrs, \
                                 TableCell, TableRow, Text, Underline

join = "".join

class MoinParser(ParserBase):

    "A wiki region parser."

    format = "moin"

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        # Introduce this class as the default parser for the wiki format.
        # Entries in any supplied 'formats' mapping override these defaults.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats, root)

        # Record certain node occurrences for later evaluation.

        self.macros = []

        # Record headings for identifier disambiguation.

        self.headings = []

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        # Assign heading identifiers.

        self.identify_headings()

        return self.region



    # Macro evaluation.

    def evaluate_macros(self):

        "Evaluate the macro nodes in the document."

        for node in self.macros:

            # Obtain a class for the named macro, ignoring unknown macros.

            macro_cls = get_macro(node.name)
            if not macro_cls:
                continue

            # Instantiate the class and evaluate the macro.

            macro = macro_cls(node, self.region)
            macro.evaluate()

    # Heading disambiguation.

    def identify_headings(self):

        """
        Assign identifiers to headings based on their textual content. The
        first heading having a given text uses the text itself; later headings
        with the same text acquire a "-N" suffix, with N counting from 1.
        """

        # Map heading text to the number of repeats seen so far.

        counts = {}

        for heading in self.headings:
            text = heading.text_content()

            # Use a membership test instead of dict.has_key, which is not
            # available in Python 3.

            if text not in counts:
                counts[text] = 0
                heading.identifier = text
            else:
                counts[text] += 1
                heading.identifier = "%s-%d" % (text, counts[text])



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.match_group("name")
        attr = TableAttr(name)

        # Only accept a value that immediately follows the name: any text
        # found before the value pattern leaves the attribute without a value.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed
        to the created item along with the padding from the current match.
        """

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, self.listitem_pattern_names)
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator introduces a definition item on the same line.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

        # Add padding from the separator to the term, there being no item.

        else:
            term.extra = self.match_group("pad")

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles within 'region'. The number of quote
        characters matched determines which styles are closed or opened.
        """

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Consume the quotes accounting for each closed style.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Where the span is no longer active, return any unused quotes to
            # the input and terminate the span.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # Where all quotes were consumed, nothing new is opened.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Doubled markers (four or six quotes) produce an empty span.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

        # Record the heading for later processing.

        self.root.headings.append(heading)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading. Only an ending marker having the same
        level as 'heading' terminates it.
        """

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item', and return it."

        # Avoid shadowing the built-in list type with the local name.

        new_list = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(new_list, self.list_pattern_names, True)
        return new_list

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_attr_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        # Continue an existing table only if nothing has been added to the
        # block following it. Otherwise, start a new table.

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_row_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Otherwise, the final cell is empty and is discarded, with
                # only the trailing text being retained by the row. (The
                # original code tested for a non-empty cell again here, but
                # that case is always handled by the branch above.)

                row.trailing = trailing
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table is not None:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")
        alignment = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", alignment, True))



    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with the pattern having the
        given 'name' replaced by the corresponding ending pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a span of the given
        'cls' that is terminated by the ending pattern for 'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        """
        Handle monospace text within 'region'. Only the ending marker is
        recognised inside the span, with no other inline patterns being tested.
        """

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle strikethrough text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_anchor(self, region):

        "Handle an anchor within 'region'."

        target = self.match_group("target")
        anchor = Anchor(target)
        region.append_inline(anchor)

    def parse_linebreak(self, region):

        "Handle a line break within 'region'."

        region.append_inline(LineBreak())

    def parse_link(self, region):

        "Handle a link within 'region', employing any link text as content."

        target = self.match_group("target")
        text = self.match_group("text")
        content = [Text(text)] if text else []
        region.append_inline(Link(content, target))

    def parse_macro(self, region):

        "Handle a macro within 'region'."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist, region.append_point())
        region.append_inline(macro)

        # Record the macro for later processing.

        self.root.macros.append(macro)



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, adding an attribute named 'pattern_name' to
        'attrs' with the matched value.
        """

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    # NOTE: The 'cell' parameter of the following handlers is passed directly
    # NOTE: to parse_table_attr as the attribute collection.

    def parse_colour(self, cell):

        "Handle a colour attribute."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # Patterns containing backslash sequences are written as raw strings so
    # that the sequences reach the pattern compiler unchanged and without
    # invalid escape warnings. \N and \Q are not standard regular expression
    # syntax and are presumably expanded by get_patterns.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # :: ws...

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "anchor"        : join((r"\(\(",                                    # ((
                                group("target", ".*?"),                     # target
                                r"\)\)")),                                  # ))

        "linebreak"     : r"\\\\",                                          # \\

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws...

        # The decimal-marker lookahead accepts one or more digits, matching
        # the marker in listitem_num, so that items numbered 10 and above
        # also end the preceding item.

        "listitemend"   : join((r"^",                                       # next line
                           choice((expect(r"[^\s]"),                        # without indent
                                   expect(r"\Z"),                           # end of string
                                   expect(r"\N+\*"),                        # or with ws... list-marker
                                   expect(r"\N+\d+\."),                     # or with ws... decimal-marker
                                   expect(r"\N+[aA]\."),                    # or with ws... alpha-marker
                                   expect(r"\N+[iI]\."),                    # or with ws... roman-marker
                                   expect(r"\N+\."),                        # or with ws... dot-marker
                                   expect(r"\N+.+?::\s"),                   # or with ws... text :: ws (next defterm)
                                   expect(r"\N+::\s"))))),                  # or with ws... :: ws (next defitem)

        # Table contents:

        "tableattrs"    : join(("<",                                        # lt
                                excl("<"))),                                # not-lt

        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_attr_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "anchor", "fontstyle", "larger", "linebreak", "link", "macro",
        "monospace", "regionstart", "smaller", "strike", "sub", "super",
        "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    table_row_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    # The region pattern names are specifically used by the common parser
    # functionality.

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]



    # Pattern handlers.

    end_region = ParserBase.end_region
    parse_section_end = ParserBase.parse_region_end

    # Map pattern names to the handlers invoked when the patterns match.

    handlers = {
        None                : end_region,
        "anchor"            : parse_anchor,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "linebreak"         : parse_linebreak,
        "link"              : parse_link,
        "macro"             : parse_macro,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4