#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, get_subset
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, Link, List, ListItem, Monospace, Region, \
                            Rule, Smaller, Strikethrough, Subscript, \
                            Superscript, Table, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline

class MoinParser(ParserBase):

    """
    A wiki region parser.

    Each entry in 'syntax' names a pattern, the '*_pattern_names' lists group
    those names by the context in which they are tried, and 'handlers' maps
    each pattern name to the method invoked when that pattern is matched.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        # Introduce this class as the default parser for the wiki format.
        # Entries in 'formats' take precedence over these defaults.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([])

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.read_match()
        attr = TableAttr(name)

        # Only record a quote and value when the value pattern is found with
        # no intervening text after the name.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.read_match(1)
            attr.value = self.read_match(2)

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region', recording any 'extra' text
        on the item.
        """

        pad = self.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator (as opposed to a term ending) introduces an item.

        if self.read_matching() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.read_match(1)
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles within 'region', using the number of
        quote characters matched to decide which styles are closed or opened.
        """

        n = len(self.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Deduct the quotes accounted for by each closed style.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Where the span is no longer active, hand back any remaining
            # quotes and end the span.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # Where the span remains active and no quotes remain, there is
            # nothing more to do.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        # Doubled markers produce a span without any parsed content.

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.read_match()

        # "(" indicates left, ")" indicates right, anything else is centred.

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.read_match(1)
        level = len(self.read_match(2))
        start_pad = self.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading, ending 'heading' only if the closing
        marker has the same level as the heading itself.
        """

        level = len(self.read_match(2))
        if heading.level == level:
            heading.end_pad = self.read_match(1)
            heading.end_extra = self.read_match(3)
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item', and return the populated list."

        # Avoid shadowing the built-in list type with the local name.

        new_list = List([item], item.indent, item.marker)
        self.parse_region_details(new_list, self.list_pattern_names, True)
        return new_list

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.read_match(1))
        marker = self.read_match(2)
        space = self.read_match(3)

        last = region.node(-1)
        new_list = not isinstance(last, (List, ListItem))

        # If the marker is different or the indent is smaller, queue the item
        # and end the list.

        if not new_list and (last.marker != marker or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item or if the indent is
        # greater.

        if new_list or indent > last.indent:
            item = self.parse_list(item)

            # Add a new or completed nested list.

            self.add_node(region, item)

            if new_list:
                self.new_block(region)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.read_match(1))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.read_match(2))
        level = len(self.read_match(3))
        self.add_node(region, self.parse_region(level, indent))
        self.new_block(region)

    def parse_section_end(self, region):

        "Handle the end of a new section within 'region'."

        feature = self.read_match()

        # A marker not ending this region is retained as plain text.

        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.read_matching() == "tableend":
                trailing = self.read_match()

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                # NOTE(review): given the early return above, the cell appears
                # to be always empty here, making this test look unreachable:
                # confirm before removing.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.read_match()

        # "^" indicates top, anything else indicates bottom.

        alignment = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", alignment, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a span of the given
        'cls' whose content ends at the ending pattern for 'pattern_name'.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_link(self, region):

        "Handle a link within 'region', having a target and optional text."

        target = self.read_match(1)
        text = self.read_match(2)

        # Without any link text, the value of 'text' itself is passed on
        # instead of a list of nodes.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_monospace(self, region):

        "Handle monospaced text within 'region'."

        self.parse_inline(region, Monospace, "monospace")

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_strike(self, region):

        "Handle struck-through text within 'region'."

        self.parse_inline(region, Strikethrough, "strike")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Table attribute handlers.

    # NOTE(review): the handlers below name their parameter 'cell' but pass it
    # on as the 'attrs' collection to be appended to; presumably they receive
    # the table attributes being parsed - confirm against the dispatching code.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute, appending to 'attrs' an attribute named
        'pattern_name' having the matched text as its value.
        """

        attrs.append(TableAttr(pattern_name, self.read_match(), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    syntax = {
        # Page regions:
        "regionstart"   : r"((^\N*)([{]{3,}))",     # {{{...
        "regionend"     : r"^\N*([}]{3,})",         # }}}...
        "header"        : r"#!(.*?)\n",             # #! char-excl-nl

        # Region contents:
        # Line-oriented patterns:
        # blank line
        "break"         : r"^(\s*?)\n",
        # ws... expecting text ::
        "defterm"       : r"^(\N+)(?=.+?::)",
        # ws... expecting :: ws...
        "defterm_empty" : r"^(\N+)(?=::\s+)",
        # [ws...] =... ws... expecting headingend
        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
        # ws... list-item [ws...]
        "listitem"      : r"^(\N+)(\*)(\s*)",
        # ws... number-item ws...
        "listitem_num"  : r"^(\N+)(\d+\.)(\s+)",
        # ws... alpha-item ws...
        "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
        # ws... roman-item ws...
        "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
        # ws... dot-item [ws...]
        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
        # ||
        "tablerow"      : r"^\|\|",

        # Region contents:
        # Inline patterns:
        "fontstyle"     : r"('{2,6})",
        "larger"        : r"~\+",
        "monospace"     : r"`",
        "rule"          : r"(-----*)",              # ----...
        "smaller"       : r"~-",
        "strike"        : r"--\(",                  # --(
        "sub"           : r",,",
        "super"         : r"\^",
        "underline"     : r"__",

        # Complete inline patterns:
        "link"          : r"\[\[(.*?)(?:\|(.*?))?]]",   # [[target]] or [[target|text]]

        # Inline contents:
        "largerend"     : r"\+~",
        "monospaceend"  : r"`",
        "smallerend"    : r"-~",
        "strikeend"     : r"\)--",                  # )--
        "subend"        : r",,",
        "superend"      : r"\^",
        "underlineend"  : r"__",

        # Heading contents:
        "headingend"    : r"(\N+)(=+)(\N*$)",       # ws... =... [ws...] nl

        # List contents:
        "deftermend"    : r"::(\s*?\n)",
        "deftermsep"    : r"::(\s+)",
        "listitemend"   : r"^",                     # next line

        # Table contents:
        "tableattrs"    : r"<",
        "tablecell"     : r"\|\|",
        "tableend"      : r"(\s*?)^",               # [ws...] next line

        # Table attributes:
        "tableattrsend" : r">",
        "halign"        : r"([(:)])",
        "valign"        : r"([v^])",
        "colour"        : r"(\#[0-9A-F]{6})",
        "colspan"       : r"-(\d+)",
        "rowspan"       : r"\|(\d+)",
        "width"         : r"(\d+%)",
        "attrname"      : r"((?![-\d])[-\w]+)",     # not-dash-or-digit dash-or-word-char...
        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
        }

    patterns = get_patterns(syntax)



    # Pattern details.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "monospace", "smaller", "strike", "sub",
        "super", "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which the given 'name' is
        replaced by the name of its corresponding ending pattern.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "strike"            : parse_strike,
        "strikeend"         : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4