#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline
import re

# Regular expressions.
# Each entry maps a pattern name to its expression. The same names are used to
# look up handlers, so a name identifies both what to search for and what to do
# when it is found.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...
    "regionend"     : r"^\s*([}]{3,})",                             # }}}...
    "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:

    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
    # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",                                  # ----...
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:
    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                                         # next line

    # Table contents:
    "tableattrs"    : r"<",
    "tablecell"     : r"\|\|",
    "tableend"      : r"(\s*?)^",                                   # [ws...] next line

    # Table attributes:
    "tableattrsend" : r">",
    "halign"        : r"([(:)])",
    "valign"        : r"([v^])",
    "colour"        : r"(\#[0-9A-F]{6})",
    "colspan"       : r"-(\d+)",
    "rowspan"       : r"\|(\d+)",
    "width"         : r"(\d+%)",
    "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...
    "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
    }

# Define pattern details.

table_pattern_names = ["attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend", "valign", "width"]

inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names in which the opening pattern
    'name' is replaced by its closing counterpart (the name with "end"
    appended), for use when parsing the contents of that kind of inline span.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names

# Define patterns for the regular expressions.
# MULTILINE makes "^" match at the start of every line, which the line-oriented
# patterns and the "next line" end markers rely upon.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream for the string 's'."

        self.s = s

        # The position from which the next search starts.

        self.pos = 0

        # The match object and pattern name from the most recent search.

        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length'."

        # Never move before the start of the string.

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        When a match is found, the position is left unchanged so that
        read_match can consume the match. When the remaining text is returned,
        the position is moved to the end of the string since that text has then
        been consumed.
        """

        first = None
        self.matching = None

        # Forget any previous match so that a failed search cannot leave a
        # stale match behind for read_match.

        self.match = None

        # Find the first matching pattern. Where several patterns match at the
        # same position, the earliest in 'pattern_names' is retained.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]

        if not remaining:
            return None

        # Consume the rest of the input. Without this, an enclosing parsing loop
        # would read, and thus duplicate, the same trailing text.

        text = self.s[self.pos:]
        self.pos = len(self.s)
        return text

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group. None is
        returned, with the position moved to the end of the string, if there is
        no current match.
        """

        if self.match:
            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # Only a header found immediately at the current position is accepted: the
    # preceding text must be empty, whereas None indicates no header at all.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        "tablerow",
        ])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Only the end of the region is of interest: everything else is text.

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is appended to 'region' and each feature is
    passed to its handler. Parsing stops at the end of the input or when a
    handler raises StopIteration to signal the end of 'region', after which
    the region is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    # Raised by handlers to indicate the end of this region.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    # Caught by parse_region_details for the region being populated.

    raise StopIteration

def parse_attrname(items, attrs):

    "Handle an attribute name within 'attrs'."

    name = items.read_match()
    attr = TableAttr(name)

    # Accept a value only if it immediately follows the name.

    preceding = items.read_until(["attrvalue"], False)
    if preceding == "":
        attr.quote = items.read_match(1)
        attr.value = items.read_match(2)

    attrs.append(attr)

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region'. The 'extra' text is passed to
    the item together with the padding from the current match.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    "Handle a definition term within 'region'."

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)

    # A separator, as opposed to the end of the line, introduces an item.

    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # Consume the separator that the pattern's lookahead found after the
    # leading whitespace.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles. The number of quote characters matched
    selects the styles: two for emphasis, three for strong, five for both.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        # Close whichever open styles the marker can close, deducting the
        # quotes used in doing so.

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # With the span no longer active, return any unused quotes to the
        # stream and end the span.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        # Otherwise, stop here if all quotes have been used.

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    # Four or six quotes produce a span that is added without parsing any
    # content into it.

    span = FontStyle([], emphasis, strong)
    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_halign(items, attrs):

    "Handle horizontal alignment within 'attrs'."

    value = items.read_match()

    if value == "(":
        alignment = "left"
    elif value == ")":
        alignment = "right"
    else:
        alignment = "center"

    attrs.append(TableAttr("halign", alignment, True))

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    "Handle the end of a heading."

    # NOTE(review): where the levels differ, nothing is recorded for the
    # matched text - confirm that this cannot lose content.

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    "Handle the end of a new section within 'region'."

    # An end marker not belonging to this region is retained as text.

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

def parse_table_attrs(items, cell):

    "Handle the start of table attributes within 'cell'."

    attrs = TableAttrs([])
    parse_region_details(items, attrs, table_pattern_names)
    cell.attrs = attrs

def parse_table_row(items, region):

    """
    Handle the start of a table row within 'region'.

    Cells are collected until the end of the line or the end of the input. A
    row having no completed cells or ending with an unfinished cell is
    converted back into text.
    """

    row = TableRow([])

    while True:
        cell = TableCell([])
        parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"])

        # The input may end without a newline to terminate the row. This must
        # end the row since no further separators can ever be found.

        at_end = items.matching is None

        # Handle the end of the row.

        if at_end or items.matching == "tableend":
            if at_end:
                trailing = ""
            else:
                trailing = items.read_match()

            # If the cell was started but not finished, convert the row into text.

            if not row.nodes or not cell.empty():
                region.append_inline(Text("||"))

                # Convert all cells.

                for node in row.nodes:
                    region.append_inline_many(node.nodes)
                    region.append_inline(Text("||"))

                region.append_inline_many(cell.nodes)
                region.append_inline(Text(trailing))

                new_block(region)
                return

            # Otherwise, the final cell is empty and is discarded.

            row.trailing = trailing
            break

        # A cell separator has been found.

        row.append(cell)

    region.add(row)
    new_block(region)

def parse_valign(items, attrs):

    "Handle vertical alignment within 'attrs'."

    value = items.read_match()

    if value == "^":
        alignment = "top"
    else:
        alignment = "bottom"

    attrs.append(TableAttr("valign", alignment, True))

# Inline formatting handlers.

def parse_inline(items, region, cls, pattern_name):

    """
    Handle an inline region, creating a span of the given 'cls', parsing
    'items' into it until the end marker corresponding to 'pattern_name' is
    found, and appending the span to 'region'.
    """

    span = cls([])
    parse_region_details(items, span, inline_patterns_for(pattern_name))
    region.append_inline(span)

# Each handler is a proper function, not a lambda, so that it has its own name
# in tracebacks and can carry documentation.

def parse_larger(items, region):

    "Handle larger text within 'region'."

    return parse_inline(items, region, Larger, "larger")

def parse_monospace(items, region):

    "Handle monospaced text within 'region'."

    return parse_inline(items, region, Monospace, "monospace")

def parse_smaller(items, region):

    "Handle smaller text within 'region'."

    return parse_inline(items, region, Smaller, "smaller")

def parse_sub(items, region):

    "Handle subscript text within 'region'."

    return parse_inline(items, region, Subscript, "sub")

def parse_super(items, region):

    "Handle superscript text within 'region'."

    return parse_inline(items, region, Superscript, "super")

def parse_underline(items, region):

    "Handle underlined text within 'region'."

    return parse_inline(items, region, Underline, "underline")

# Table attribute handlers.

def parse_table_attr(items, attrs, pattern_name):

    """
    Handle a table attribute, appending to 'attrs' an attribute named after
    'pattern_name' with the matched text as its value.
    """

    value = items.read_match()
    attrs.append(TableAttr(pattern_name, value, True))

def parse_colour(items, cell):

    "Handle a colour attribute."

    return parse_table_attr(items, cell, "colour")

def parse_colspan(items, cell):

    "Handle a column span attribute."

    return parse_table_attr(items, cell, "colspan")

def parse_rowspan(items, cell):

    "Handle a row span attribute."

    return parse_table_attr(items, cell, "rowspan")

def parse_width(items, cell):

    "Handle a width attribute."

    return parse_table_attr(items, cell, "width")

# Pattern handlers.

# The dispatch table mapping each pattern name to the function invoked when
# that pattern is matched. Entries are grouped by the kind of feature handled.

handlers = {
    None                : end_region,

    # Regions.
    "regionstart"       : parse_section,
    "regionend"         : parse_section_end,

    # Block-level features.
    "break"             : parse_break,
    "heading"           : parse_heading,
    "headingend"        : parse_heading_end,
    "rule"              : parse_rule,

    # Definition lists.
    "defterm"           : parse_defterm,
    "defterm_empty"     : parse_defterm_empty,
    "deftermend"        : end_region,
    "deftermsep"        : end_region,

    # Lists.
    "listitem"          : parse_listitem,
    "listitem_alpha"    : parse_listitem,
    "listitem_dot"      : parse_listitem,
    "listitem_num"      : parse_listitem,
    "listitem_roman"    : parse_listitem,
    "listitemend"       : end_region,

    # Inline formatting.
    "fontstyle"         : parse_fontstyle,
    "larger"            : parse_larger,
    "largerend"         : end_region,
    "monospace"         : parse_monospace,
    "monospaceend"      : end_region,
    "smaller"           : parse_smaller,
    "smallerend"        : end_region,
    "sub"               : parse_sub,
    "subend"            : end_region,
    "super"             : parse_super,
    "superend"          : end_region,
    "underline"         : parse_underline,
    "underlineend"      : end_region,

    # Tables.
    "tablerow"          : parse_table_row,
    "tablecell"         : end_region,
    "tableend"          : end_region,
    "tableattrs"        : parse_table_attrs,
    "tableattrsend"     : end_region,

    # Table attributes.
    "attrname"          : parse_attrname,
    "colour"            : parse_colour,
    "colspan"           : parse_colspan,
    "halign"            : parse_halign,
    "rowspan"           : parse_rowspan,
    "valign"            : parse_valign,
    "width"             : parse_width,
    }

def new_block(region):

    "Add a new, empty block to 'region' to receive subsequent content."

    region.add(Block([]))



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4