#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, get_subset
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, List, ListItem, Monospace, Region, Rule, \
                            Smaller, Subscript, Superscript, Table, TableAttr, \
                            TableAttrs, TableCell, TableRow, Text, Underline

class MoinParser(ParserBase):

    """
    A wiki region parser.

    Each handler below is registered in the 'handlers' table against the name
    of a pattern from the 'syntax' table. A handler receives the node being
    populated and reads the details of the current match using the methods
    inherited from ParserBase (read_match, read_matching, read_until and so
    on, none of which are defined in this module).

    Several handlers raise StopIteration to signal that the node being
    populated is complete.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.

        Entries in 'formats' override the defaults for the "wiki" and "moin"
        region types.
        """

        # Introduce this class as the default parser for the wiki format.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.

        Return the top-level region, which is also retained as 'self.region'.
        """

        self.items = self.get_items(s)
        self.region = Region([])

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        """
        Handle an attribute name within 'attrs'.

        A TableAttr is created for the matched name and appended to 'attrs'.
        Where an "attrvalue" pattern follows the name immediately, the quote
        character and the quoted value are recorded on the attribute.
        """

        name = self.read_match()
        attr = TableAttr(name)

        # An empty string here means that nothing separated the name from the
        # value match, whose groups are the quote character and the value.
        # NOTE(review): the meaning of any other read_until result depends on
        # ParserBase, which is not visible here.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.read_match(1)
            attr.value = self.read_match(2)

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        self.add_node(region, Break())
        self.new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'.

        Any 'extra' text is passed on to the DefItem together with the padding
        from the current match. The item is populated until "listitemend" is
        encountered.
        """

        pad = self.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        self.add_node(region, item)
        self.new_block(region)

    def parse_defterm(self, region):

        """
        Handle a definition term within 'region'.

        Where the term is ended by a separator ("deftermsep") rather than by
        the end of the line ("deftermend"), a definition item is parsed
        straight afterwards.
        """

        pad = self.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        self.add_node(region, term)
        if self.read_matching() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        """
        Handle an empty definition term within 'region'.

        The leading text matched before the separator is handed to the
        definition item as its 'extra' text.
        """

        extra = self.read_match(1)
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles.

        The number of quote characters matched selects the styles: 2 for
        emphasis, 3 for strong, 5 for both. Counts of 4 and 6 are treated as
        "double" markers, producing a span without any parsed content.

        Within an existing FontStyle 'region', the marker first closes any
        matching active styles. StopIteration is raised if the region is no
        longer active afterwards, with any surplus quotes being rewound.
        """

        n = len(self.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Each style closed consumes its share of the quote characters.
            # NOTE(review): the close methods appear to return whether the
            # region remains active; confirm against FontStyle.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Return any unused quotes to the input before ending the region.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # Nothing remains to start a new style.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)
        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        """
        Handle horizontal alignment within 'attrs'.

        The symbol "(" selects "left", ")" selects "right", and anything else
        matched by the pattern selects "center".
        """

        value = self.read_match()

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attr = TableAttr("halign", alignment, True)
        attrs.append(attr)

    def parse_heading(self, region):

        """
        Handle a heading.

        The heading level is the number of marker characters matched. The
        heading is populated with inline content until a heading end handler
        completes it.
        """

        start_extra = self.read_match(1)
        level = len(self.read_match(2))
        start_pad = self.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        self.add_node(region, heading)
        self.new_block(region)

    def parse_heading_end(self, heading):

        """
        Handle the end of a heading.

        Only an end marker with the same number of characters as the heading
        level completes the heading, in which case the trailing padding and
        extra text are recorded and StopIteration is raised.
        """

        level = len(self.read_match(2))
        if heading.level == level:
            heading.end_pad = self.read_match(1)
            heading.end_extra = self.read_match(3)
            raise StopIteration

    def parse_list(self, item):

        """
        Create a list, starting with 'item'.

        The list takes its indent and marker from 'item' and is populated
        using the list item patterns. The list is returned.
        """

        # Avoid shadowing the built-in list type with the local name.

        new_list = List([item], item.indent, item.marker)
        self.parse_region_details(new_list, self.list_pattern_names, True)
        return new_list

    def parse_listitem(self, region):

        """
        Handle a list item marker within 'region'.

        Depending on the preceding node, the item either starts a new list,
        starts a nested list (greater indent), joins the current list, or ends
        the current list (different marker or smaller indent).
        """

        indent = len(self.read_match(1))
        marker = self.read_match(2)
        space = self.read_match(3)

        last = region.node(-1)
        new_list = not isinstance(last, (List, ListItem))

        # If the marker is different or the indent is smaller, queue the item
        # and end the list.

        if not new_list and (last.marker != marker or indent < last.indent):
            self.queue_match()
            self.end_region(region)

        # Obtain a list item and populate it.

        item = ListItem([], indent, marker, space)
        self.parse_region_details(item, self.listitem_pattern_names)

        # Start a new list if not preceded by a list item or if the indent is
        # greater.

        if new_list or indent > last.indent:
            item = self.parse_list(item)

            # Add a new or completed nested list.

            self.add_node(region, item)

            if new_list:
                self.new_block(region)

        # Add the item to the current list.

        else:
            self.add_node(region, item)

    def parse_rule(self, region):

        """
        Handle a horizontal rule within 'region'.

        The rule records the number of characters matched.
        """

        length = len(self.read_match(1))
        rule = Rule(length)
        self.add_node(region, rule)
        self.new_block(region)

    def parse_section(self, region):

        """
        Handle the start of a new section within 'region'.

        The lengths of the matched indent and of the opening marker are passed
        to parse_region as the indent and level of the section.
        """

        # Parse the section and start a new block after the section.

        indent = len(self.read_match(2))
        level = len(self.read_match(3))
        self.add_node(region, self.parse_region(level, indent))
        self.new_block(region)

    def parse_section_end(self, region):

        """
        Handle the end of a new section within 'region'.

        StopIteration is raised if 'region' accepts the matched text as its
        end marker. Otherwise, the matched text is kept as plain text.
        """

        feature = self.read_match()
        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        """
        Handle the start of table attributes within 'cell'.

        The attributes are parsed and then validated. A valid collection is
        stored as 'cell.attrs'. An invalid one is serialised and added to the
        cell as plain text instead.
        """

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        """
        Handle the start of a table row within 'region'.

        Cells are parsed until the end of the row. The row joins the table
        found just before the current block, if that block is empty, or else a
        new table which is then added to 'region'. A row that cannot be
        completed is written back to 'region' as plain text.
        """

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.read_matching() == "tableend":
                trailing = self.read_match()

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    self.new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                # NOTE(review): this point is only reached when the cell was
                # reported as empty above, so this guard looks redundant.
                # Retained unchanged pending confirmation.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            self.add_node(region, new_table)

        self.new_block(region)

    def parse_valign(self, attrs):

        """
        Handle vertical alignment within 'attrs'.

        The symbol "^" selects "top" and anything else matched by the pattern
        selects "bottom".
        """

        value = self.read_match()
        attr = TableAttr("valign", "top" if value == "^" else "bottom", True)
        attrs.append(attr)



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region.

        A node of the given 'cls' is populated using the inline patterns, with
        the pattern having the given 'pattern_name' replaced by its ending
        counterpart, and is then added to 'region' as inline content.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        "Handle monospaced text within 'region'."

        self.parse_inline(region, Monospace, "monospace")

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Table attribute handlers.

    def parse_table_attr(self, attrs, pattern_name):

        """
        Handle a table attribute.

        The attribute is named after 'pattern_name', takes the matched text as
        its value, and is appended to 'attrs'.
        """

        attrs.append(TableAttr(pattern_name, self.read_match(), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, appending it to 'cell'."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    # NOTE(review): \N is not standard regular expression syntax in this
    # position. It is presumably translated by get_patterns, with the comments
    # below suggesting that it denotes non-newline whitespace. Confirm against
    # moinformat.parsers.common.

    syntax = {
        # Page regions:
        "regionstart"   : r"((^\N*)([{]{3,}))",     # {{{...
        "regionend"     : r"^\N*([}]{3,})",         # }}}...
        "header"        : r"#!(.*?)\n",             # #! char-excl-nl

        # Region contents:
        # Line-oriented patterns:
        # blank line
        "break"         : r"^(\s*?)\n",
        # ws... expecting text ::
        "defterm"       : r"^(\N+)(?=.+?::)",
        # ws... expecting :: ws...
        "defterm_empty" : r"^(\N+)(?=::\s+)",
        # [ws...] =... ws... expecting headingend
        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
        # ws... list-item [ws...]
        "listitem"      : r"^(\N+)(\*)(\s*)",
        # ws... number-item ws...
        "listitem_num"  : r"^(\N+)(\d+\.)(\s+)",
        # ws... alpha-item ws...
        "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
        # ws... roman-item ws...
        "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
        # ws... dot-item [ws...]
        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
        # ||
        "tablerow"      : r"^\|\|",

        # Region contents:
        # Inline patterns:
        "fontstyle"     : r"('{2,6})",
        "larger"        : r"~\+",
        "monospace"     : r"`",
        "rule"          : r"(-----*)",              # ----...
        "smaller"       : r"~-",
        "sub"           : r",,",
        "super"         : r"\^",
        "underline"     : r"__",

        # Inline contents:
        "largerend"     : r"\+~",
        "monospaceend"  : r"`",
        "smallerend"    : r"-~",
        "subend"        : r",,",
        "superend"      : r"\^",
        "underlineend"  : r"__",

        # Heading contents:
        "headingend"    : r"(\N+)(=+)(\N*$)",       # ws... =... [ws...] nl

        # List contents:
        "deftermend"    : r"::(\s*?\n)",
        "deftermsep"    : r"::(\s+)",
        "listitemend"   : r"^",                     # next line

        # Table contents:
        "tableattrs"    : r"<",
        "tablecell"     : r"\|\|",
        "tableend"      : r"(\s*?)^",               # [ws...] next line

        # Table attributes:
        "tableattrsend" : r">",
        "halign"        : r"([(:)])",
        "valign"        : r"([v^])",
        "colour"        : r"(\#[0-9A-F]{6})",
        "colspan"       : r"-(\d+)",
        "rowspan"       : r"\|(\d+)",
        "width"         : r"(\d+%)",
        "attrname"      : r"((?![-\d])[-\w]+)",     # not-dash-or-digit dash-or-word-char...
        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
        }

    patterns = get_patterns(syntax)



    # Pattern details.

    # Patterns recognised within table attribute lists.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    # Patterns starting inline formatting spans.

    inline_pattern_names = [
        "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
        ]

    # Patterns introducing the different kinds of list item.

    list_pattern_names = [
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        ]

    # Patterns recognised within a list item.

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    # Patterns recognised within general region content.

    region_pattern_names = inline_pattern_names + list_pattern_names + [
        "break", "heading", "defterm", "defterm_empty",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    # Patterns recognised within a table cell.

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which the pattern with
        the given 'name' is replaced by its ending counterpart, this being the
        name with "end" appended.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # Mapping from pattern names to the handlers defined above. The None entry
    # is presumably used when no pattern matches (verify against ParserBase).

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4