#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.serialisers import serialise
from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, TableAttr, TableAttrs, \
                            TableCell, TableRow, Text, Underline
import re

# Regular expressions.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                 # {{{...
    "regionend"     : r"^\s*([}]{3,})",                     # }}}...
    "header"        : r"#!(.*?)\n",                         # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
    # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",                          # ----...
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:
    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                  # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                                 # next line

    # Table contents:
    "tableattrs"    : r"<",
    "tablecell"     : r"\|\|",
    "tableend"      : r"(\s*?)^",                           # [ws...] next line

    # Table attributes:
    "tableattrsend" : r">",
    "halign"        : r"([(:)])",
    "valign"        : r"([v^])",
    "colour"        : r"(\#[0-9A-F]{6})",
    "colspan"       : r"-(\d+)",
    "rowspan"       : r"\|(\d+)",
    "width"         : r"(\d+%)",
    "attrname"      : r"((?![-\d])[-\w]+)",                 # not-dash-or-digit dash-or-word-char...
    "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
    }

# Define pattern details.

# Patterns searched for inside a table attribute list ("<...>").

table_pattern_names = [
    "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
    "valign", "width",
    ]

# Patterns that open an inline span.

inline_pattern_names = [
    "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
    ]

def inline_patterns_for(name):

    """
    Return a copy of the inline pattern names in which the opening pattern
    'name' is replaced by its closing counterpart (the same name with an "end"
    suffix), this being the collection to search for inside such a span.
    """

    names = inline_pattern_names[:]
    names[names.index(name)] = "%send" % name
    return names

# Define patterns for the regular expressions.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The 'pos' attribute is the current offset in the string 's', 'matching' is
    the name of the pattern found by the most recent search (or None), and
    'match' is the corresponding match object (or None).
    """

    def __init__(self, s):
        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where several patterns match at the same position, the one appearing
        earliest in 'pattern_names' is chosen. Where the remaining text is
        returned, it is also consumed, so that a later search cannot yield the
        same text again.
        """

        first = None
        self.matching = None

        # Forget any earlier match so that read_match cannot act upon a match
        # that does not belong to this search.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]

        if not remaining:
            return None

        # Consume the remaining text. Without this, a nested parsing operation
        # reaching the end of the input would leave the position untouched and
        # the enclosing operation would add the same text a second time.

        preceding = self.s[self.pos:]
        self.pos = len(self.s)
        return preceding

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'. If
        there is no current match, the position is moved to the end of the
        string and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the region representing the whole page.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only recognised if it starts at the current position.
    """

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, inline_pattern_names + [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        "tablerow",
        ])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is added to 'region' as inline text. Each
    feature is passed to the handler registered for its pattern name, or added
    as text if there is no handler. Parsing stops at the end of the input or
    when a handler raises StopIteration, after which 'region' is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region' by raising StopIteration."

    raise StopIteration

def parse_attrname(items, attrs):

    """
    Handle an attribute name within 'attrs', also reading a quoted value if one
    immediately follows the name.
    """

    name = items.read_match()
    attr = TableAttr(name)

    preceding = items.read_until(["attrvalue"], False)
    if preceding == "":
        attr.quote = items.read_match(1)
        attr.value = items.read_match(2)

    attrs.append(attr)

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    "Handle a definition item within 'region'."

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_defterm(items, region):

    "Handle a definition term within 'region'."

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.add(term)

    # A separator, unlike a term ending, is followed by the definition itself.

    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)
    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles, the marker being a run of two to six
    quote characters which may close styles of the current 'region', open a new
    style, or both.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # Where the span is no longer active, any unused marker characters are
        # put back for the enclosing region to handle.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    # Markers of four or six characters produce a span without content.

    span = FontStyle([], emphasis, strong)
    if not double:
        parse_region_details(items, span, inline_pattern_names)
    region.append_inline(span)

def parse_halign(items, attrs):

    "Handle horizontal alignment within 'attrs'."

    value = items.read_match()

    # "(" selects left, ")" selects right, anything else (":") selects center.

    alignment = {"(" : "left", ")" : "right"}.get(value, "center")
    attrs.append(TableAttr("halign", alignment, True))

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"] + inline_pattern_names)
    region.add(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading, this only being recognised where the closing
    marker has the same level as the 'heading'.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.add(item)
    new_block(region)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.add(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.add(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. An end marker not accepted
    by the region is added to it as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

def parse_table_attrs(items, cell):

    "Handle the start of table attributes within 'cell'."

    attrs = TableAttrs([])
    parse_region_details(items, attrs, table_pattern_names)
    cell.attrs = attrs

def parse_table_row(items, region):

    """
    Handle the start of a table row within 'region'.

    Cells are collected until the row ends. A row that is not properly
    terminated is added to 'region' as text instead of as a table row.
    """

    row = TableRow([])

    while True:
        cell = TableCell([])
        parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"])

        # Handle the end of the row. The end of the input (no match at all) is
        # treated in the same way: otherwise no further cell separator could
        # ever be found and this loop would never terminate.

        if items.matching in (None, "tableend"):
            if items.matching == "tableend":
                trailing = items.read_match()
            else:
                trailing = ""

            # If the cell was started but not finished, convert the row into text.

            if not row.nodes or not cell.empty():
                for node in row.nodes:
                    region.append_inline(Text(serialise(node)))
                region.append_inline(Text(serialise(cell)))
                region.append_inline(Text(trailing))

                new_block(region)
                return

            # Append the final cell, if not empty.

            else:
                row.trailing = trailing

                if not cell.empty():
                    row.append(cell)
                break

        # A cell separator has been found.

        row.append(cell)

    region.add(row)
    new_block(region)

def parse_valign(items, attrs):

    "Handle vertical alignment within 'attrs'."

    value = items.read_match()

    # "^" selects top, anything else ("v") selects bottom.

    if value == "^":
        alignment = "top"
    else:
        alignment = "bottom"

    attrs.append(TableAttr("valign", alignment, True))

# Inline formatting handlers.

def parse_inline(items, region, cls, pattern_name):

    """
    Handle an inline region, creating a 'cls' instance populated up to the
    closing counterpart of 'pattern_name' and adding it to 'region'.
    """

    span = cls([])
    parse_region_details(items, span, inline_patterns_for(pattern_name))
    region.append_inline(span)

parse_larger = lambda items, region: parse_inline(items, region, Larger, "larger")
parse_monospace = lambda items, region: parse_inline(items, region, Monospace, "monospace")
parse_smaller = lambda items, region: parse_inline(items, region, Smaller, "smaller")
parse_sub = lambda items, region: parse_inline(items, region, Subscript, "sub")
parse_super = lambda items, region: parse_inline(items, region, Superscript, "super")
parse_underline = lambda items, region: parse_inline(items, region, Underline, "underline")

# Table attribute handlers.

def parse_table_attr(items, attrs, pattern_name):

    "Handle a table attribute, recording the matched value under 'pattern_name'."

    value = items.read_match()
    attrs.append(TableAttr(pattern_name, value, True))

parse_colour = lambda items, cell: parse_table_attr(items, cell, "colour")
parse_colspan = lambda items, cell: parse_table_attr(items, cell, "colspan")
parse_rowspan = lambda items, cell: parse_table_attr(items, cell, "rowspan")
parse_width = lambda items, cell: parse_table_attr(items, cell, "width")

# Pattern handlers.

# Dispatch table mapping each pattern name to the function handling it. Each
# handler is called with the token stream and the node being populated.

handlers = {
    None: end_region,
    "attrname": parse_attrname,
    "break": parse_break,
    "colour": parse_colour,
    "colspan": parse_colspan,
    "defterm": parse_defterm,
    "defterm_empty": parse_defterm_empty,
    "deftermend": end_region,
    "deftermsep": end_region,
    "fontstyle": parse_fontstyle,
    "halign": parse_halign,
    "heading": parse_heading,
    "headingend": parse_heading_end,
    "larger": parse_larger,
    "largerend": end_region,
    "listitemend": end_region,
    "listitem": parse_listitem,
    "listitem_alpha": parse_listitem,
    "listitem_dot": parse_listitem,
    "listitem_num": parse_listitem,
    "listitem_roman": parse_listitem,
    "monospace": parse_monospace,
    "monospaceend": end_region,
    "regionstart": parse_section,
    "regionend": parse_section_end,
    "rowspan": parse_rowspan,
    "rule": parse_rule,
    "smaller": parse_smaller,
    "smallerend": end_region,
    "sub": parse_sub,
    "subend": end_region,
    "super": parse_super,
    "superend": end_region,
    "tableattrs": parse_table_attrs,
    "tableattrsend": end_region,
    "tablerow": parse_table_row,
    "tablecell": end_region,
    "tableend": end_region,
    "underline": parse_underline,
    "underlineend": end_region,
    "valign": parse_valign,
    "width": parse_width,
    }

def new_block(region):

    "Add a new, empty block to 'region'."

    region.add(Block([]))



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4