#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

# Document transformations.

from moinformat.macros import get_macro

# Parser functionality and pattern definition.

from moinformat.parsers.common import ParserBase, get_patterns, \
                                      excl, expect, group, optional, recur, \
                                      repeat

# Serialisation.

from moinformat.serialisers import serialise

# Document tree nodes.

from moinformat.tree.moin import Break, DefItem, DefTerm, FontStyle, Heading, \
                                 Larger, Link, List, ListItem, Macro, \
                                 Monospace, Region, Rule, Smaller, \
                                 Strikethrough, Subscript, Superscript, Table, \
                                 TableAttr, TableAttrs, TableCell, TableRow, \
                                 Text, Underline

join = "".join

class MoinParser(ParserBase):

    "A wiki region parser."

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats, root)

        # Record certain node occurrences for later evaluation.

        self.macros = []

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Macro evaluation.

    def evaluate_macros(self):

        "Evaluate the macro nodes in the document."

        for node in self.macros:

            # Obtain a class for the named macro, skipping unknown macros.

            macro_cls = get_macro(node.name)
            if not macro_cls:
                continue

            # Instantiate the class and evaluate the macro.

            macro = macro_cls(node, self.region)
            macro.evaluate()



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.match_group("name")
        attr = TableAttr(name)

        # Only accept a value if it immediately follows the attribute name.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.match_group("quote")
            attr.value = self.match_group("value")

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        "Handle a definition item within 'region'."

        pad = self.match_group("pad")
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.match_group("pad")
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator on the same line introduces the definition item.

        if self.matching_pattern() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.match_group("pad")
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles within 'region'. The number of quote
        characters matched selects the styles being opened or closed. Raise
        StopIteration when the enclosing style span has been fully closed.
        """

        n = len(self.match_group("style"))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Return any unconsumed quotes to the input when the span ends.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.match_group("value")

        if value == "(":
            align = "left"
        elif value == ")":
            align = "right"
        else:
            align = "center"

        attrs.append(TableAttr("halign", align, True))

    def parse_heading(self, region):

        "Handle a heading."

        start_extra = self.match_group("extra")
        level = len(self.match_group("level"))
        start_pad = self.match_group("pad")
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading. Raise StopIteration only if the level of
        the ending marker matches that of 'heading'.
        """

        level = len(self.match_group("level"))
        if heading.level == level:
            heading.end_pad = self.match_group("pad")
            heading.end_extra = self.match_group("extra")
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item'. Return the new list node."

        # Avoid shadowing the built-in list type with the local name.

        list_node = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(list_node, self.list_pattern_names, True)
        return list_node

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.match_group("indent"))
        marker = self.match_group("marker")
        num = self.match_group("num")
        space = self.match_group("pad")

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = not new_list and num is not None and last.num != num and same_indent

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.match_group("rule"))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.match_group("indent"))
        level = len(self.match_group("level"))

        section = self.parse_region(level, indent, "inline")

        # If the section is inline, treat it like any other inline element.

        if section.type == "inline":
            region.append_inline(section)

        # Otherwise, add it as a new block element.

        else:
            self.add_node(region, section)
            if region.allow_blocks:
                self.new_block(region)

    def parse_section_end(self, region):

        """
        Handle the end of a new section within 'region'. Raise StopIteration if
        the marker ends the region; otherwise, retain the marker as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        region.extra = self.match_group("extra")

        if region.have_end(level):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            # Add the attributes as a node, also recording their presence.

            cell.append(attrs)
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.matching_pattern() == "tableend":
                trailing = self.match_group("extra")

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell) + trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.match_group("value")
        align = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", align, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a span of the given
        'cls' terminated by the ending pattern for 'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        "Handle monospaced text within 'region', ended only by its own marker."

        span = Monospace([])
        self.parse_region_details(span, ["monospaceend"])
        region.append_inline(span)

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Complete inline pattern handlers.

    def parse_link(self, region):

        "Handle a link within 'region'."

        target = self.match_group("target")
        text = self.match_group("text")

        # Absent or empty link text is passed through unchanged.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_macro(self, region):

        "Handle a macro within 'region', recording it for later evaluation."

        name = self.match_group("name")
        args = self.match_group("args")

        # Obtain the raw arguments. Moin usually leaves it to the macro to
        # interpret the individual arguments.

        arglist = args.split(",") if args else []
        macro = Macro(name, arglist)
        region.append_inline(macro)

        # Record the macro for later processing.

        self.root.macros.append(macro)



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        "Handle a table attribute."

        attrs.append(TableAttr(pattern_name, self.match_group("value"), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # All fragments containing backslashes are raw strings so that the
    # backslashes reach the pattern compiler unchanged.

    syntax = {
        # Page regions:

        "regionstart"   : join((group("indent", r"\N*"),                    # ws... (optional)
                                group("level", repeat("[{]", 3)))),         # {{{...

        "regionend"     : join((r"\N*",                                     # ws... (optional)
                                group("feature", join((
                                    group("level", repeat("[}]", 3)),       # }}}...
                                    group("extra", r"\n"),
                                    "?"))))),                               # nl (optional)

        "header"        : join(("#!",                                       # #!
                                group("args", ".*?"), "\n")),               # text-excl-nl

        # Region contents:

        # Line-oriented patterns support features which require their own
        # separate lines.

        "break"         : r"^(\s*?)\n",                                     # blank line

        "defterm"       : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(".+?::"))),                          # text ::

        "defterm_empty" : join(("^",
                                group("pad", r"\N+"),                       # ws...
                                expect(r"::\s+"))),                         # ::
                                                                            # ws... (optional)

        "heading"       : join(("^",
                                group("extra", r"\N*"),                     # ws... (optional)
                                group("level", "=+"),                       # =...
                                group("pad", r"\s+"),                       # ws...
                                expect(join((r".*?\N+",                     # text
                                             recur("level"),                # =...
                                             r"\N*$"))))),                  # ws... (optional)

        "listitem"      : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\*"),                     # list-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "listitem_num"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\d+\."),                  # decimal-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_alpha": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[aA]\."),                 # alpha-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_roman": join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"[iI]\."),                 # roman-marker
                                optional(join(("#", group("num", r"\d+")))), # # num (optional)
                                group("pad", r"\s+"))),                     # ws...

        "listitem_dot"  : join(("^",
                                group("indent", r"\N+"),                    # ws...
                                group("marker", r"\."),                     # dot-marker
                                group("pad", r"\s*"))),                     # ws... (optional)

        "tablerow"      : r"^\|\|",                                         # ||

        # Region contents:

        # Inline patterns are for markup features that appear within blocks.
        # The patterns below start inline spans that can contain other markup
        # features.

        "fontstyle"     : group("style", repeat("'", 2, 6)),                # ''...
        "larger"        : r"~\+",                                           # ~+
        "monospace"     : r"`",                                             # `
        "rule"          : group("rule", "-----*"),                          # ----...
        "smaller"       : r"~-",                                            # ~-
        "strike"        : r"--\(",                                          # --(
        "sub"           : r",,",                                            # ,,
        "super"         : r"\^",                                            # ^
        "underline"     : r"__",                                            # __

        # Complete inline patterns are for markup features that do not support
        # arbitrary content within them:

        "link"          : join((r"\[\[",                                    # [[
                                group("target", ".*?"),                     # target
                                optional(join((r"\|", group("text", ".*?")))), # | text (optional)
                                "]]")),                                     # ]]

        "macro"         : join(("<<",                                       # <<
                                group("name", r"\w+?"),                     # digit-letter...
                                optional(join((r"\(",                       # ( (optional)
                                               group("args", ".*?"),        # not-)...
                                               r"\)"))),                    # ) (optional)
                                ">>")),                                     # >>

        # Ending patterns for inline features:

        "largerend"     : r"\+~",                                           # +~
        "monospaceend"  : r"`",                                             # `
        "smallerend"    : r"-~",                                            # -~
        "strikeend"     : r"\)--",                                          # )--
        "subend"        : r",,",                                            # ,,
        "superend"      : r"\^",                                            # ^
        "underlineend"  : r"__",                                            # __

        # Heading contents:

        "headingend"    : join((group("pad", r"\N+"),                       # ws...
                                group("level", "=+"),                       # =...
                                group("extra", r"\N*\n"))),                 # ws (optional) nl

        # List contents:

        "deftermend"    : join(("::", group("pad", r"\s*?\n"))),            # ::
                                                                            # ws... (optional)
                                                                            # nl

        "deftermsep"    : join(("::", group("pad", r"\s+"))),               # ::
                                                                            # ws... (optional)

        "listitemend"   : r"^",                                             # next line

        # Table contents:

        "tableattrs"    : r"<",                                             # <
        "tablecell"     : r"\|\|",                                          # ||

        "tableend"      : join((group("extra", r"\s*?"),                    # ws... (optional)
                                "^")),                                      # next line

        # Table attributes:

        "tableattrsend" : r">",                                             # >
        "halign"        : group("value", "[(:)]"),                          # halign-marker
        "valign"        : group("value", "[v^]"),                           # valign-marker
        "colour"        : group("value", join((r"\#",                       # #
                                               repeat("[0-9A-F]", 6, 6)))), # nnnnnn

        "colspan"       : join(("-",                                        # -
                                group("value", r"\d+"))),                   # n...

        "rowspan"       : join((r"\|",                                      # |
                                group("value", r"\d+"))),                   # n...

        "width"         : group("value", r"\d+%"),                          # n... %

        "attrname"      : join((excl(r"[-\d]"),                             # not-dash-or-digit
                                group("name", r"[-\w]+"))),                 # dash-digit-letter...

        "attrvalue"     : join(("=", group("quote", r"\Q"),                 # quote
                                group("value", ".*?"),                      # non-quote... (optional)
                                recur("quote"))),                           # quote
        }

    patterns = get_patterns(syntax)



    # Patterns available within certain markup features.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "macro", "monospace", "regionstart",
        "smaller", "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_without_table_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionend", "rule",
        ]

    region_pattern_names = region_without_table_pattern_names + ["tablerow"]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which the starting pattern
        having the given 'name' is replaced by its corresponding ending pattern.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "macro"             : parse_macro,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4