#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, get_subset
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, Link, List, ListItem, Monospace, Region, \
                            Rule, Smaller, Strikethrough, Subscript, \
                            Superscript, Table, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline

class MoinParser(ParserBase):

    """
    A wiki region parser.

    The class combines a table of regular expressions ('syntax'), lists of
    pattern names that are active in each parsing context, and a 'handlers'
    mapping from pattern names to the methods that build document tree nodes.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.

        Entries in 'formats' override the defaults, which register this class
        for both the "wiki" and "moin" region types.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.

        Return the top-level Region object, which is also retained as
        'self.region'.
        """

        self.items = self.get_items(s)
        self.region = Region([], type="moin")

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        """
        Handle an attribute name within 'attrs'.

        A TableAttr is created for the name. A quote character and value are
        recorded only if an "attrvalue" pattern follows the name with no
        intervening text.
        """

        name = self.read_match()
        attr = TableAttr(name)

        # NOTE(review): the meaning of the second read_until argument is
        # defined by ParserBase and is not visible here.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.read_match(1)
            attr.value = self.read_match(2)

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region', starting a new block."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'.

        The 'extra' text, if given, is stored on the item together with the
        padding read from the current match. The item's content extends until
        the "listitemend" pattern.
        """

        pad = self.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        """
        Handle a definition term within 'region'.

        If the term was ended by a separator ("deftermsep") rather than the end
        of the line, the definition item following it is parsed immediately.
        """

        pad = self.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)
        if self.read_matching() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        """
        Handle an empty definition term within 'region'.

        The leading text of the match is passed on to the definition item as
        its 'extra' text.
        """

        extra = self.read_match(1)
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles.

        The number of quote characters matched selects the styles: 2, 4 or 5
        involve emphasis, whereas 3, 5 or 6 involve strong. Within an existing
        FontStyle region, the quotes first close any matching active styles.
        StopIteration is raised when no style remains active in that region.
        """

        n = len(self.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # With no styles left active, hand back any unused quotes and end
            # the region.
            # NOTE(review): StopIteration is presumably caught by the region
            # parsing loop in ParserBase - confirm.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # With all quotes consumed, there is no new style to open.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Four or six quotes produce a span without any parsed content.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        """
        Handle horizontal alignment within 'attrs'.

        The symbol "(" yields "left", ")" yields "right", and anything else
        yields "center".
        """

        value = self.read_match()

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attr = TableAttr("halign", alignment, True)
        attrs.append(attr)

    def parse_heading(self, region):

        """
        Handle a heading within 'region'.

        The heading level is the number of "=" characters in the opening
        marker. Heading content may employ inline patterns and is terminated by
        "headingend".
        """

        start_extra = self.read_match(1)
        level = len(self.read_match(2))
        start_pad = self.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading.

        Only a closing marker having the same level as 'heading' ends it, in
        which case the trailing details are recorded and StopIteration is
        raised. Other markers cause no action here.
        """

        level = len(self.read_match(2))
        if heading.level == level:
            heading.end_pad = self.read_match(1)
            heading.end_extra = self.read_match(3)
            raise StopIteration

    def parse_list(self, item):

        """
        Create a list, starting with 'item'.

        The list takes its indent, marker and number from 'item' and is then
        populated using the list item patterns. The new List is returned.
        """

        # Avoid shadowing the built-in 'list' name.

        list_node = List([item], item.indent, item.marker, item.num)
        self.parse_region_details(list_node, self.list_pattern_names, True)
        return list_node

    def parse_listitem(self, region):

        """
        Handle a list item marker within 'region'.

        Depending on the preceding node, the item starts a new list, starts a
        nested list within the preceding node, or joins the current list.
        """

        # The final group always holds the spacing. Patterns supporting an
        # explicit number provide it as the third of four groups.

        final = len(self.match_groups())

        indent = len(self.read_match(1))
        marker = self.read_match(2)
        space = self.read_match(final)

        if final > 3:
            num = self.read_match(3)
        else:
            num = None

        last = region.node(-1)

        new_list = not isinstance(last, (List, ListItem))
        same_indent = not new_list and indent == last.indent
        new_marker = not new_list and last.marker != marker and same_indent
        new_num = (
            not new_list and num is not None and last.num != num
            and same_indent
            )

        # If the marker or number changes at the same indent, or if the indent
        # is smaller, queue the item and end the list.

        # Note that Moin format does not seek to support item renumbering,
        # instead starting new lists on number changes.

        if not new_list and (new_marker or new_num or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space, num)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item, adding a trailing
        # block for new elements.

        if new_list:
            item = self.parse_list(item)
            self.add_node(region, item)
            self.new_block(region)

        # Add a nested list to the last item.

        elif indent > last.indent:
            item = self.parse_list(item)
            self.add_node(last, item)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        """
        Handle a horizontal rule within 'region'.

        The number of characters matched is recorded as the rule's length.
        """

        length = len(self.read_match(1))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        """
        Handle the start of a new section within 'region'.

        The section's indent and level are the lengths of the leading
        whitespace and of the opening brace marker respectively.
        """

        # Parse the section and start a new block after the section.

        indent = len(self.read_match(2))
        level = len(self.read_match(3))
        self.add_node(region, self.parse_region(level, indent, "inline"))
        self.new_block(region)

    def parse_section_end(self, region):

        """
        Handle the end of a new section within 'region'.

        If the matched text is accepted by 'region' as its end marker,
        StopIteration is raised. Otherwise, the text is added to the region as
        ordinary text.
        """

        feature = self.read_match()
        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        """
        Handle the start of table attributes within 'cell'.

        The attributes are parsed into a TableAttrs collection. If they are
        valid, the collection is stored as 'cell.attrs'. Otherwise, the
        collection is serialised and added to the cell as text.
        """

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        """
        Handle the start of a table row within 'region'.

        The row is added to the table preceding the current block if that block
        is empty. Otherwise, a new table is created and added to the region.
        A row lacking any completed cells, or having an unfinished final cell,
        is converted to text instead.
        """

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.read_matching() == "tableend":
                trailing = self.read_match()

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                # NOTE(review): this point is only reached via the branch above
                # where the cell was found to be empty, so this test appears
                # never to succeed - confirm before removing.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        """
        Handle vertical alignment within 'attrs'.

        The symbol "^" yields "top" and anything else yields "bottom".
        """

        value = self.read_match()
        alignment = "top" if value == "^" else "bottom"
        attr = TableAttr("valign", alignment, True)
        attrs.append(attr)



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region.

        A span of the given 'cls' is created and populated using the inline
        patterns, with 'pattern_name' replaced by its corresponding end
        pattern, and is then added to 'region'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_link(self, region):

        """
        Handle a link within 'region'.

        The first group of the match provides the target and the second group
        any link text.
        """

        target = self.read_match(1)
        text = self.read_match(2)

        # Absent or empty text is passed through in place of a node list.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_monospace(self, region):

        "Handle monospaced text within 'region'."

        self.parse_inline(region, Monospace, "monospace")

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute.

        An attribute named 'pattern_name' is added to 'attrs' with the matched
        text as its value.
        """

        attrs.append(TableAttr(pattern_name, self.read_match(), True))

    # NOTE(review): the 'cell' parameter of the following handlers is passed to
    # parse_table_attr as its 'attrs' argument. It is presumably the attribute
    # collection being populated by parse_table_attrs - confirm.

    def parse_colour(self, cell):

        "Handle a colour attribute."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    syntax = {
        # Page regions:
        "regionstart"   : r"((\N*)([{]{3,}))",          # [line-start ws] {{{...
        "regionend"     : r"(?:\N*)([}]{3,})",          # [line-start ws] }}}...
        "header"        : r"#!(.*?)\n",                 # #! char-excl-nl

        # Region contents:
        # Line-oriented patterns:
                                                        # blank line
        "break"         : r"^(\s*?)\n",
                                                        # ws... expecting text ::
        "defterm"       : r"^(\N+)(?=.+?::)",
                                                        # ws... expecting :: ws...
        "defterm_empty" : r"^(\N+)(?=::\s+)",
                                                        # [ws...] =... ws... expecting headingend
        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
                                                        # ws... list-item [ws...]
        "listitem"      : r"^(\N+)(\*)(\s*)",
                                                        # ws... number-item ws... [# number]
        "listitem_num"  : r"^(\N+)(\d+\.)(?:#(\d+))?(\s+)",
                                                        # ws... alpha-item ws... [# number]
        "listitem_alpha": r"^(\N+)([aA]\.)(?:#(\d+))?(\s+)",
                                                        # ws... roman-item ws... [# number]
        "listitem_roman": r"^(\N+)([iI]\.)(?:#(\d+))?(\s+)",
                                                        # ws... dot-item [ws...]
        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
                                                        # ||
        "tablerow"      : r"^\|\|",

        # Region contents:
        # Inline patterns:
        "fontstyle"     : r"('{2,6})",
        "larger"        : r"~\+",
        "monospace"     : r"`",
        "rule"          : r"(-----*)",                  # ----...
        "smaller"       : r"~-",
        "strike"        : r"--\(",                      # --(
        "sub"           : r",,",
        "super"         : r"\^",
        "underline"     : r"__",

        # Complete inline patterns:
        "link"          : r"\[\[(.*?)(?:\|(.*?))?]]",   # [[target]] or [[target|text]]

        # Inline contents:
        "largerend"     : r"\+~",
        "monospaceend"  : r"`",
        "smallerend"    : r"-~",
        "strikeend"     : r"\)--",                      # )--
        "subend"        : r",,",
        "superend"      : r"\^",
        "underlineend"  : r"__",

        # Heading contents:
        "headingend"    : r"(\N+)(=+)(\N*$)",           # ws... =... [ws...] nl

        # List contents:
        "deftermend"    : r"::(\s*?\n)",
        "deftermsep"    : r"::(\s+)",
        "listitemend"   : r"^",                         # next line

        # Table contents:
        "tableattrs"    : r"<",
        "tablecell"     : r"\|\|",
        "tableend"      : r"(\s*?)^",                   # [ws...] next line

        # Table attributes:
        "tableattrsend" : r">",
        "halign"        : r"([(:)])",
        "valign"        : r"([v^])",
        "colour"        : r"(\#[0-9A-F]{6})",
        "colspan"       : r"-(\d+)",
        "rowspan"       : r"\|(\d+)",
        "width"         : r"(\d+%)",
        "attrname"      : r"((?![-\d])[-\w]+)",         # not-dash-or-digit dash-or-word-char...
        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
        }

    # NOTE(review): the \N notation above is not expanded in this file and is
    # presumably translated by get_patterns - confirm in parsers.common.

    patterns = get_patterns(syntax)



    # Pattern details.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "monospace", "regionstart", "smaller",
        "strike", "sub", "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    # NOTE(review): "regionstart" is already provided by inline_pattern_names
    # and therefore appears twice in the following list - confirm whether the
    # duplicate is needed.

    region_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with 'name' replaced by the
        name of its corresponding end pattern.

        A ValueError is raised if 'name' is not an inline pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # The handlers are referenced as plain functions from the class namespace,
    # so this mapping must follow the method definitions above.

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4