#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, get_subset
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, Link, List, ListItem, Monospace, Region, \
                            Rule, Smaller, Subscript, Superscript, Table, \
                            TableAttr, TableAttrs, TableCell, TableRow, Text, \
                            Underline

class MoinParser(ParserBase):

    """
    A wiki region parser.

    Each entry in 'syntax' names a pattern, and 'handlers' maps each pattern
    name to the method invoked when that pattern is matched. The '*_names'
    lists select which patterns are active while parsing a given kind of node.

    NOTE(review): several handlers raise StopIteration to terminate the
    enclosing region; presumably this is caught by the parsing loop in
    ParserBase (not visible here) -- confirm before changing the exception.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([])

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.read_match()
        attr = TableAttr(name)

        # Only record a quote and value where the value pattern follows the
        # name with no intervening text.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.read_match(1)
            attr.value = self.read_match(2)

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region', recording any 'extra' text
        on the created item.
        """

        pad = self.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)

        # A separator (as opposed to a line-ending terminator) introduces a
        # definition item on the same line.

        if self.read_matching() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.read_match(1)
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles. The number of quote characters
        matched selects the styles: 2 for emphasis, 3 for strong, 5 for both,
        with 4 and 6 treated as doubled (immediately closed) forms.
        """

        n = len(self.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Consume the quotes needed to close each open style.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Where the span is no longer active, give back unused quotes and
            # end the span.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.read_match()

        if value == "(":
            align = "left"
        elif value == ")":
            align = "right"
        else:
            align = "center"

        attrs.append(TableAttr("halign", align, True))

    def parse_heading(self, region):

        "Handle a heading."

        start_extra = self.read_match(1)
        level = len(self.read_match(2))
        start_pad = self.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading. Only a marker whose level matches that of
        'heading' terminates the heading.
        """

        level = len(self.read_match(2))
        if heading.level == level:
            heading.end_pad = self.read_match(1)
            heading.end_extra = self.read_match(3)
            raise StopIteration

    def parse_list(self, item):

        "Create a list, starting with 'item'. Return the populated list."

        # Avoid shadowing the built-in 'list' type with the local name.

        list_node = List([item], item.indent, item.marker)
        self.parse_region_details(list_node, self.list_pattern_names, True)
        return list_node

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.read_match(1))
        marker = self.read_match(2)
        space = self.read_match(3)

        last = region.node(-1)
        new_list = not isinstance(last, (List, ListItem))

        # If the marker is different or the indent is smaller, queue the item
        # and end the list.

        if not new_list and (last.marker != marker or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item or if the indent is
        # greater.

        if new_list or indent > last.indent:
            item = self.parse_list(item)

            # Add a new or completed nested list.

            self.add_node(region, item)

            if new_list:
                self.new_block(region)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.read_match(1))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.read_match(2))
        level = len(self.read_match(3))
        self.add_node(region, self.parse_region(level, indent))
        self.new_block(region)

    def parse_section_end(self, region):

        """
        Handle the end of a new section within 'region'. A marker not accepted
        by 'region' as its end is retained as plain text.
        """

        feature = self.read_match()
        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.read_matching() == "tableend":
                trailing = self.read_match()

                # If the cell was started but not finished, convert the row
                # into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                    if not cell.empty():
                        row.append(cell)
                    break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.read_match()
        align = "top" if value == "^" else "bottom"
        attrs.append(TableAttr("valign", align, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region, creating a 'cls' node within 'region' whose
        content is parsed until the end pattern for 'pattern_name' is found.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_link(self, region):

        "Handle a link, with a target and optional text, within 'region'."

        target = self.read_match(1)
        text = self.read_match(2)

        # Without any link text, the false 'text' value itself is passed on as
        # the link's content.

        link = Link(text and [Text(text)], target)
        region.append_inline(link)

    def parse_monospace(self, region):

        "Handle monospace text within 'region'."

        self.parse_inline(region, Monospace, "monospace")

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Table attribute handlers.

    # NOTE(review): the wrappers below name their parameter 'cell', but it is
    # passed on as the 'attrs' collection of parse_table_attr; the names are
    # retained for interface compatibility.

    def parse_table_attr(self, attrs, pattern_name):

        "Handle a table attribute named by 'pattern_name' within 'attrs'."

        attrs.append(TableAttr(pattern_name, self.read_match(), True))

    def parse_colour(self, cell):

        "Handle a colour attribute."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # NOTE(review): '\N' is used below where the comments indicate non-newline
    # whitespace; presumably get_patterns translates it before compilation --
    # confirm against moinformat.parsers.common.

    syntax = {
        # Page regions:
        "regionstart"   : r"((^\N*)([{]{3,}))",                         # {{{...
        "regionend"     : r"^\N*([}]{3,})",                             # }}}...
        "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

        # Region contents:
        # Line-oriented patterns:
                                                                        # blank line
        "break"         : r"^(\s*?)\n",
                                                                        # ws... expecting text ::
        "defterm"       : r"^(\N+)(?=.+?::)",
                                                                        # ws... expecting :: ws...
        "defterm_empty" : r"^(\N+)(?=::\s+)",
                                                                        # [ws...] =... ws... expecting headingend
        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
                                                                        # ws... list-item [ws...]
        "listitem"      : r"^(\N+)(\*)(\s*)",
                                                                        # ws... number-item ws...
        "listitem_num"  : r"^(\N+)(\d+\.)(\s+)",
                                                                        # ws... alpha-item ws...
        "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
                                                                        # ws... roman-item ws...
        "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
                                                                        # ws... dot-item [ws...]
        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
                                                                        # ||
        "tablerow"      : r"^\|\|",

        # Region contents:
        # Inline patterns:
        "fontstyle"     : r"('{2,6})",
        "larger"        : r"~\+",
        "monospace"     : r"`",
        "rule"          : r"(-----*)",                                  # ----...
        "smaller"       : r"~-",
        "sub"           : r",,",
        "super"         : r"\^",
        "underline"     : r"__",

        # Complete inline patterns:
        "link"          : r"\[\[(.*?)(?:\|(.*?))?]]",                   # [[target]] or [[target|text]]

        # Inline contents:
        "largerend"     : r"\+~",
        "monospaceend"  : r"`",
        "smallerend"    : r"-~",
        "subend"        : r",,",
        "superend"      : r"\^",
        "underlineend"  : r"__",

        # Heading contents:
        "headingend"    : r"(\N+)(=+)(\N*$)",                           # ws... =... [ws...] nl

        # List contents:
        "deftermend"    : r"::(\s*?\n)",
        "deftermsep"    : r"::(\s+)",
        "listitemend"   : r"^",                                         # next line

        # Table contents:
        "tableattrs"    : r"<",
        "tablecell"     : r"\|\|",
        "tableend"      : r"(\s*?)^",                                   # [ws...] next line

        # Table attributes:
        "tableattrsend" : r">",
        "halign"        : r"([(:)])",
        "valign"        : r"([v^])",
        "colour"        : r"(\#[0-9A-F]{6})",
        "colspan"       : r"-(\d+)",
        "rowspan"       : r"\|(\d+)",
        "width"         : r"(\d+%)",
        "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...
        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
        }

    patterns = get_patterns(syntax)



    # Pattern details.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    inline_pattern_names = [
        "fontstyle", "larger", "link", "monospace", "smaller", "sub", "super",
        "underline",
        ]

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    region_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which the opening pattern
        'name' is replaced by its corresponding end pattern. The given 'name'
        must be present in the inline pattern names.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "link"              : parse_link,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4