#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsing import ParserBase, TokenStream, get_patterns, \
                               init_formats, new_block
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, Table, TableAttr, \
                            TableAttrs, TableCell, TableRow, Text, Underline

# Regular expressions.
#
# Each entry maps a pattern name to a regular expression. The names are used
# both in the pattern name lists and in the handler table of the Parser class.
# The group numbering of several patterns is relied upon by the handlers (see
# the read_match calls), so groups must not be added or removed casually.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...
    "regionend"     : r"^\s*([}]{3,})",                             # }}}...
    "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",
    # ||
    "tablerow"      : r"^\|\|",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "larger"        : r"~\+",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",                                  # ----...
    "smaller"       : r"~-",
    "sub"           : r",,",
    "super"         : r"\^",
    "underline"     : r"__",

    # Inline contents:
    "largerend"     : r"\+~",
    "monospaceend"  : r"`",
    "smallerend"    : r"-~",
    "subend"        : r",,",
    "superend"      : r"\^",
    "underlineend"  : r"__",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                                         # next line

    # Table contents:
    "tableattrs"    : r"<",
    "tablecell"     : r"\|\|",
    "tableend"      : r"(\s*?)^",                                   # [ws...] next line

    # Table attributes:
    "tableattrsend" : r">",
    "halign"        : r"([(:)])",
    "valign"        : r"([v^])",
    "colour"        : r"(\#[0-9A-F]{6})",
    "colspan"       : r"-(\d+)",
    "rowspan"       : r"\|(\d+)",
    "width"         : r"(\d+%)",
    "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...
    "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
    }



class Parser(ParserBase):

    "A wiki region parser."

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        # This parser handles "wiki" regions itself unless 'formats' overrides
        # that entry.

        default_formats = {"wiki" : self}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Pattern details.

    # Patterns obtained from the syntax definitions above.

    patterns = get_patterns(syntax)

    # Patterns tried inside table attribute sections (between < and >).

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    # Patterns introducing inline formatting spans.

    inline_pattern_names = [
        "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
        ]

    # Patterns tried within the content of a wiki region.

    region_pattern_names = inline_pattern_names + [
        "break", "heading", "defterm", "defterm_empty", "listitem",
        "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    # Patterns tried within the content of a table cell.

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names in which 'name' is replaced
        by its corresponding end pattern name (the name with "end" appended).
        A ValueError is raised by the list lookup if 'name' is not an inline
        pattern name.
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names

    # Principal parser methods.

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, self.patterns, pos)

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        items = self.get_items(s)
        region = Region([])

        # Parse page header.

        self.parse_region_header(items, region)

        # Handle pages directly with this parser.
        # Otherwise, test the type and find an appropriate parser.

        if not region.type:
            self.parse_region_content(items, region)
        else:
            self.parse_region_type(items, region)

        return region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a wiki 'region'."

        # Obtain a suitable token stream.

        items = self.replace_items(items)

        # Define a block to hold text and start parsing.

        new_block(region)
        self.parse_region_details(items, region, self.region_pattern_names)

        # Update the previous token stream.

        self.update_items(items)

    # Parser methods supporting different page features.
    #
    # NOTE(review): several handlers raise StopIteration to finish the node
    # currently being populated; this is presumably caught by
    # parse_region_details in ParserBase - confirm against that class.

    def parse_attrname(self, items, attrs):

        "Handle an attribute name within 'attrs'."

        name = items.read_match()
        attr = TableAttr(name)

        # Only accept a value if it immediately follows the name, with no
        # intervening text before the "attrvalue" match.

        preceding = items.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = items.read_match(1)
            attr.value = items.read_match(2)

        attrs.append(attr)

    def parse_break(self, items, region):

        "Handle a paragraph break within 'region'."

        region.add(Break())
        new_block(region)

    def parse_defitem(self, items, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed
        on to the created item together with the padding read from the current
        match.
        """

        pad = items.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(items, item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_defterm(self, items, region):

        "Handle a definition term within 'region'."

        pad = items.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(items, term, ["deftermend", "deftermsep"])
        region.add(term)

        # A separator (as opposed to a term ending at the end of the line)
        # means that a definition item follows on the same line.

        if items.matching == "deftermsep":
            self.parse_defitem(items, region)

    def parse_defterm_empty(self, items, region):

        "Handle an empty definition term within 'region'."

        extra = items.read_match(1)
        self.parse_region_details(items, region, ["deftermsep"])
        self.parse_defitem(items, region, extra)

    def parse_fontstyle(self, items, region):

        """
        Handle emphasis and strong styles. The number of quote characters
        matched selects the styles: 2 for emphasis, 3 for strong, 5 for both,
        with 4 and 6 being treated as "double" markers producing an empty
        emphasis or strong span respectively.
        """

        n = len(items.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Close any matching styles, accounting for the quotes consumed.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # Where the span is no longer active, give back any remaining
            # quotes to the stream and finish the span.

            if not active:
                if n:
                    items.rewind(n)
                raise StopIteration

            # Where all quotes were consumed, continue with the span.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)

        # Double markers open and close a style at once, leaving no content.

        if not double:
            self.parse_region_details(items, span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, items, attrs):

        """
        Handle horizontal alignment within 'attrs', mapping "(" to "left", ")"
        to "right" and anything else matched (":") to "center".
        """

        value = items.read_match()

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        # NOTE(review): the final True argument is presumably the "concise"
        # flag tested in parse_table_attrs - confirm against TableAttr.

        attr = TableAttr("halign", alignment, True)
        attrs.append(attr)

    def parse_heading(self, items, region):

        "Handle a heading."

        start_extra = items.read_match(1)
        level = len(items.read_match(2))
        start_pad = items.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(items, heading, ["headingend"] + self.inline_pattern_names)
        region.add(heading)
        new_block(region)

    def parse_heading_end(self, items, heading):

        """
        Handle the end of a heading. The heading is only finished if the
        number of "=" characters matches the level of 'heading'.
        """

        level = len(items.read_match(2))
        if heading.level == level:
            heading.end_pad = items.read_match(1)
            heading.end_extra = items.read_match(3)
            raise StopIteration

    def parse_listitem(self, items, region):

        "Handle a list item marker within 'region'."

        indent = len(items.read_match(1))
        marker = items.read_match(2)
        space = items.read_match(3)
        item = ListItem([], indent, marker, space)
        self.parse_region_details(items, item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_rule(self, items, region):

        "Handle a horizontal rule within 'region'."

        length = len(items.read_match(1))
        rule = Rule(length)
        region.add(rule)
        new_block(region)

    def parse_section(self, items, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(items.read_match(2))
        level = len(items.read_match(3))
        region.add(self.parse_region(items, level, indent))
        new_block(region)

    def parse_section_end(self, items, region):

        """
        Handle the end of a new section within 'region'. Where the matched
        marker does not end 'region', it is retained as text.
        """

        feature = items.read_match()
        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, items, cell):

        """
        Handle the start of table attributes within 'cell'. Valid attributes
        are stored on the cell; invalid attribute sections are added to the
        cell as text.
        """

        attrs = TableAttrs([])
        self.parse_region_details(items, attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, items, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        # A row continues an existing table only if nothing has been added to
        # the block following that table.

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        # NOTE(review): the loop only terminates upon a "tableend" match. If
        # the token stream can end without producing one (for example, input
        # lacking a final newline), this may not terminate - confirm against
        # TokenStream and ParserBase.parse_region_details.

        while True:
            cell = TableCell([])
            self.parse_region_details(items, cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if items.matching == "tableend":
                trailing = items.read_match()

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                # NOTE(review): the test above has already established that
                # the cell is empty when this point is reached, so this append
                # appears never to occur.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)
        if new_table:
            region.add(new_table)

        new_block(region)

    def parse_valign(self, items, attrs):

        """
        Handle vertical alignment within 'attrs', mapping "^" to "top" and
        anything else matched ("v") to "bottom".
        """

        value = items.read_match()
        alignment = "top" if value == "^" else "bottom"
        attr = TableAttr("valign", alignment, True)
        attrs.append(attr)



    # Inline formatting handlers.

    def parse_inline(self, items, region, cls, pattern_name):

        """
        Handle an inline region, creating a span using 'cls', populating it
        until the end pattern for 'pattern_name' is found, and adding it to
        'region'.
        """

        span = cls([])
        self.parse_region_details(items, span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, items, region):

        "Handle larger text within 'region'."

        self.parse_inline(items, region, Larger, "larger")

    def parse_monospace(self, items, region):

        "Handle monospaced text within 'region'."

        self.parse_inline(items, region, Monospace, "monospace")

    def parse_smaller(self, items, region):

        "Handle smaller text within 'region'."

        self.parse_inline(items, region, Smaller, "smaller")

    def parse_sub(self, items, region):

        "Handle subscript text within 'region'."

        self.parse_inline(items, region, Subscript, "sub")

    def parse_super(self, items, region):

        "Handle superscript text within 'region'."

        self.parse_inline(items, region, Superscript, "super")

    def parse_underline(self, items, region):

        "Handle underlined text within 'region'."

        self.parse_inline(items, region, Underline, "underline")



    # Table attribute handlers.

    def parse_table_attr(self, items, attrs, pattern_name):

        """
        Handle a table attribute, adding to 'attrs' an attribute named
        'pattern_name' having the matched text as its value.
        """

        value = items.read_match()
        attrs.append(TableAttr(pattern_name, value, True))

    def parse_colour(self, items, cell):

        "Handle a colour attribute, adding it to the collection given as 'cell'."

        self.parse_table_attr(items, cell, "colour")

    def parse_colspan(self, items, cell):

        "Handle a column span attribute, adding it to the collection given as 'cell'."

        self.parse_table_attr(items, cell, "colspan")

    def parse_rowspan(self, items, cell):

        "Handle a row span attribute, adding it to the collection given as 'cell'."

        self.parse_table_attr(items, cell, "rowspan")

    def parse_width(self, items, cell):

        "Handle a width attribute, adding it to the collection given as 'cell'."

        self.parse_table_attr(items, cell, "width")



    # Pattern handlers.
    #
    # The table maps pattern names to the functions handling them. End
    # patterns having no details of their own to record use end_region.
    # NOTE(review): the None entry presumably applies when no pattern matched
    # (such as at the end of input) - confirm against ParserBase.

    end_region = ParserBase.end_region

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }



# Top-level functions.

def parse(s, formats=None):

    """
    Parse the page text 's' using a new parser initialised with any given
    'formats', returning the result of the parser's parse method.
    """

    return Parser(init_formats(formats)).parse(s)

# vim: tabstop=4 expandtab shiftwidth=4