#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.parsers.common import ParserBase, get_patterns, get_subset, new_block
from moinformat.serialisers import serialise
from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
                            Larger, ListItem, Monospace, Region, Rule, Smaller, \
                            Subscript, Superscript, Table, TableAttr, \
                            TableAttrs, TableCell, TableRow, Text, Underline

class MoinParser(ParserBase):

    """
    A wiki region parser.

    Each 'parse_*' method is a handler registered in the 'handlers' mapping
    against a pattern name from the 'syntax' mapping. Handlers obtain the
    groups of the current match using 'read_match' and populate document tree
    nodes. Handlers raise StopIteration to indicate that the node currently
    being populated has ended.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        # Introduce this class as the default parser for the wiki format.
        # Any supplied formats override these defaults.

        default_formats = {"wiki" : MoinParser, "moin" : MoinParser}
        if formats:
            default_formats.update(formats)

        ParserBase.__init__(self, default_formats)

    # Principal parser methods.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region populated from the text.
        """

        self.items = self.get_items(s)
        self.region = Region([])

        # Parse page header.

        self.parse_region_header(self.region)

        # Handle pages directly with this parser. Pages do not need to use an
        # explicit format indicator.

        if not self.region.type:
            self.parse_region_content(self.items, self.region)

        # Otherwise, test the type and find an appropriate parser.

        else:
            self.parse_region_type(self.region)

        return self.region



    # Parser methods supporting different page features.

    def parse_attrname(self, attrs):

        "Handle an attribute name within 'attrs'."

        name = self.read_match()
        attr = TableAttr(name)

        # Only accept a value if it immediately follows the name, this being
        # indicated by there being no text before the "attrvalue" match.

        preceding = self.read_until(["attrvalue"], False)
        if preceding == "":
            attr.quote = self.read_match(1)
            attr.value = self.read_match(2)

        attrs.append(attr)

    def parse_break(self, region):

        "Handle a paragraph break within 'region'."

        region.add(Break())
        new_block(region)

    def parse_defitem(self, region, extra=""):

        """
        Handle a definition item within 'region'. Any 'extra' text is passed on
        to the item along with the padding from the current match.
        """

        pad = self.read_match(1)
        item = DefItem([], pad, extra)
        self.parse_region_details(item, ["listitemend"])
        region.add(item)
        new_block(region)

    def parse_defterm(self, region):

        "Handle a definition term within 'region'."

        pad = self.read_match(1)
        term = DefTerm([], pad)
        self.parse_region_details(term, ["deftermend", "deftermsep"])
        region.add(term)

        # A separator (as opposed to a term ending) introduces a definition
        # item on the same line.

        if self.read_matching() == "deftermsep":
            self.parse_defitem(region)

    def parse_defterm_empty(self, region):

        "Handle an empty definition term within 'region'."

        extra = self.read_match(1)
        self.parse_region_details(region, ["deftermsep"])
        self.parse_defitem(region, extra)

    def parse_fontstyle(self, region):

        """
        Handle emphasis and strong styles within 'region', the number of quote
        characters matched indicating the styles being opened or closed.
        """

        n = len(self.read_match(1))

        # Handle endings.

        if isinstance(region, FontStyle):
            emphasis = n in (2, 4, 5)
            strong = n in (3, 5, 6)
            active = True

            # Each closed style consumes its share of the quote characters.

            if region.emphasis and emphasis:
                active = region.close_emphasis()
                n -= 2
            if region.strong and strong:
                active = region.close_strong()
                n -= 3

            # With no styles remaining active, end the span, returning any
            # unused quote characters to the input.

            if not active:
                if n:
                    self.items.rewind(n)
                raise StopIteration

            # With styles still active and no characters remaining, there are
            # no new styles to introduce.

            elif not n:
                return

        # Handle new styles.

        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        double = n in (4, 6)

        span = FontStyle([], emphasis, strong)

        # Doubled markers (four or six characters) produce a span that is added
        # without parsing any content into it.

        if not double:
            self.parse_region_details(span, self.inline_pattern_names)
        region.append_inline(span)

    def parse_halign(self, attrs):

        "Handle horizontal alignment within 'attrs'."

        value = self.read_match()

        # Map the alignment symbol to its name: "(" is left, ")" is right, and
        # anything else matched by the pattern is centred.

        if value == "(":
            alignment = "left"
        elif value == ")":
            alignment = "right"
        else:
            alignment = "center"

        attrs.append(TableAttr("halign", alignment, True))

    def parse_heading(self, region):

        "Handle a heading within 'region'."

        start_extra = self.read_match(1)
        level = len(self.read_match(2))
        start_pad = self.read_match(3)
        heading = Heading([], level, start_extra, start_pad)
        self.parse_region_details(heading, ["headingend"] + self.inline_pattern_names)
        region.add(heading)
        new_block(region)

    def parse_heading_end(self, heading):

        "Handle the end of a heading, this being 'heading'."

        # Only a marker of the same level as the opening marker ends the
        # heading.
        # NOTE(review): when the levels differ, nothing is recorded here;
        # whether the matched text is preserved elsewhere is not visible in
        # this file - confirm.

        level = len(self.read_match(2))
        if heading.level == level:
            heading.end_pad = self.read_match(1)
            heading.end_extra = self.read_match(3)
            raise StopIteration

    def parse_listitem(self, region):

        "Handle a list item marker within 'region'."

        indent = len(self.read_match(1))
        marker = self.read_match(2)
        space = self.read_match(3)
        item = ListItem([], indent, marker, space)
        self.parse_region_details(item, self.listitem_pattern_names)
        region.add(item)
        new_block(region)

    def parse_rule(self, region):

        "Handle a horizontal rule within 'region'."

        length = len(self.read_match(1))
        rule = Rule(length)
        region.add(rule)
        new_block(region)

    def parse_section(self, region):

        "Handle the start of a new section within 'region'."

        # Parse the section and start a new block after the section.

        indent = len(self.read_match(2))
        level = len(self.read_match(3))
        region.add(self.parse_region(level, indent))
        new_block(region)

    def parse_section_end(self, region):

        "Handle the end of a new section within 'region'."

        feature = self.read_match()

        # A marker not ending this region is retained as ordinary text.

        if region.have_end(feature):
            raise StopIteration
        else:
            region.append_inline(Text(feature))

    def parse_table_attrs(self, cell):

        "Handle the start of table attributes within 'cell'."

        attrs = TableAttrs([])
        self.parse_region_details(attrs, self.table_pattern_names)

        # Test the validity of the attributes.

        last = None

        for node in attrs.nodes:

            # Text separator nodes must be whitespace.

            if isinstance(node, Text):
                if node.s.strip():
                    break

            # Named attributes must be preceded by space if not the first.

            elif last and not node.concise and not isinstance(last, Text):
                break

            last = node

        # All nodes were valid: preserve the collection.

        else:
            cell.attrs = attrs
            return

        # Invalid nodes were found: serialise the attributes as text.

        cell.append_inline(Text(serialise(attrs)))

    def parse_table_row(self, region):

        "Handle the start of a table row within 'region'."

        # Identify any active table.

        table = region.node(-2)
        block = region.node(-1)

        # Continue an existing table only if it is followed by an empty block.
        # Otherwise, start a new table, remembering it for later addition.

        if not (isinstance(table, Table) and block.empty()):
            new_table = table = Table([])
        else:
            new_table = None

        row = TableRow([])

        while True:
            cell = TableCell([])
            self.parse_region_details(cell, self.table_region_pattern_names)

            # Handle the end of the row.

            if self.read_matching() == "tableend":
                trailing = self.read_match()

                # If the cell was started but not finished, convert the row into text.

                if not row.nodes or not cell.empty():
                    for node in row.nodes:
                        region.append_inline(Text(serialise(node)))
                    region.append_inline(Text(serialise(cell)))
                    region.append_inline(Text(trailing))

                    new_block(region)
                    return

                # Append the final cell, if not empty.

                else:
                    row.trailing = trailing

                # NOTE(review): the branch above returns whenever the cell is
                # not empty, so this test appears never to succeed - confirm
                # before removing.

                if not cell.empty():
                    row.append(cell)
                break

            # A cell separator has been found.

            row.append(cell)

        # Add the row to the table and any new table to the region.

        table.add(row)

        # Test for the presence of a new table explicitly instead of relying on
        # the truth value of the table object.

        if new_table is not None:
            region.add(new_table)

        new_block(region)

    def parse_valign(self, attrs):

        "Handle vertical alignment within 'attrs'."

        value = self.read_match()

        # "^" indicates the top; the only other symbol matched by the pattern
        # ("v") indicates the bottom.

        if value == "^":
            alignment = "top"
        else:
            alignment = "bottom"

        attrs.append(TableAttr("valign", alignment, True))



    # Inline formatting handlers.

    def parse_inline(self, region, cls, pattern_name):

        """
        Handle an inline region within 'region', creating a span using 'cls'
        and parsing content until the end pattern for 'pattern_name' is found.
        """

        span = cls([])
        self.parse_region_details(span, self.inline_patterns_for(pattern_name))
        region.append_inline(span)

    def parse_larger(self, region):

        "Handle larger text within 'region'."

        self.parse_inline(region, Larger, "larger")

    def parse_monospace(self, region):

        "Handle monospace text within 'region'."

        self.parse_inline(region, Monospace, "monospace")

    def parse_smaller(self, region):

        "Handle smaller text within 'region'."

        self.parse_inline(region, Smaller, "smaller")

    def parse_sub(self, region):

        "Handle subscript text within 'region'."

        self.parse_inline(region, Subscript, "sub")

    def parse_super(self, region):

        "Handle superscript text within 'region'."

        self.parse_inline(region, Superscript, "super")

    def parse_underline(self, region):

        "Handle underlined text within 'region'."

        self.parse_inline(region, Underline, "underline")



    # Table attribute handlers.

    # NOTE(review): the handlers below name their parameter 'cell' but pass it
    # on as the attribute collection; since these patterns are parsed by
    # parse_table_attrs with a TableAttrs object as the region, the parameter
    # is presumably that collection and not a table cell - confirm.

    def parse_table_attr(self, attrs, pattern_name):

        "Handle a table attribute within 'attrs' having the given 'pattern_name'."

        attrs.append(TableAttr(pattern_name, self.read_match(), True))

    def parse_colour(self, cell):

        "Handle a colour attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colour")

    def parse_colspan(self, cell):

        "Handle a column span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "colspan")

    def parse_rowspan(self, cell):

        "Handle a row span attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "rowspan")

    def parse_width(self, cell):

        "Handle a width attribute, adding it to 'cell'."

        self.parse_table_attr(cell, "width")



    # Regular expressions.

    syntax = {
        # Page regions:
        "regionstart"   : r"((^\N*)([{]{3,}))",                         # {{{...
        "regionend"     : r"^\N*([}]{3,})",                             # }}}...
        "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

        # Region contents:
        # Line-oriented patterns:
                                                                        # blank line
        "break"         : r"^(\s*?)\n",
                                                                        # ws... expecting text ::
        "defterm"       : r"^(\N+)(?=.+?::)",
                                                                        # ws... expecting :: ws...
        "defterm_empty" : r"^(\N+)(?=::\s+)",
                                                                        # [ws...] =... ws... expecting headingend
        "heading"       : r"^(\N*)(?P<x>=+)(\s+)(?=.*?\N+(?P=x)\N*$)",
                                                                        # ws... list-item [ws...]
        "listitem"      : r"^(\N+)(\*)(\s*)",
                                                                        # ws... number-item ws...
        "listitem_num"  : r"^(\N+)(\d+\.)(\s+)",
                                                                        # ws... alpha-item ws...
        "listitem_alpha": r"^(\N+)([aA]\.)(\s+)",
                                                                        # ws... roman-item ws...
        "listitem_roman": r"^(\N+)([iI]\.)(\s+)",
                                                                        # ws... dot-item [ws...]
        "listitem_dot"  : r"^(\N+)(\.)(\s*)",
                                                                        # ||
        "tablerow"      : r"^\|\|",

        # Region contents:
        # Inline patterns:
        "fontstyle"     : r"('{2,6})",
        "larger"        : r"~\+",
        "monospace"     : r"`",
        "rule"          : r"(-----*)",                                  # ----...
        "smaller"       : r"~-",
        "sub"           : r",,",
        "super"         : r"\^",
        "underline"     : r"__",

        # Inline contents:
        "largerend"     : r"\+~",
        "monospaceend"  : r"`",
        "smallerend"    : r"-~",
        "subend"        : r",,",
        "superend"      : r"\^",
        "underlineend"  : r"__",

        # Heading contents:
        "headingend"    : r"(\N+)(=+)(\N*$)",                           # ws... =... [ws...] nl

        # List contents:
        "deftermend"    : r"::(\s*?\n)",
        "deftermsep"    : r"::(\s+)",
        "listitemend"   : r"^",                                         # next line

        # Table contents:
        "tableattrs"    : r"<",
        "tablecell"     : r"\|\|",
        "tableend"      : r"(\s*?)^",                                   # [ws...] next line

        # Table attributes:
        "tableattrsend" : r">",
        "halign"        : r"([(:)])",
        "valign"        : r"([v^])",
        "colour"        : r"(\#[0-9A-F]{6})",
        "colspan"       : r"-(\d+)",
        "rowspan"       : r"\|(\d+)",
        "width"         : r"(\d+%)",
        "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...
        "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",
        }

    patterns = get_patterns(syntax)



    # Pattern details.

    # Patterns recognised inside table attribute declarations.

    table_pattern_names = [
        "attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend",
        "valign", "width"
        ]

    # Patterns introducing inline formatting.

    inline_pattern_names = [
        "fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline",
        ]

    # Patterns recognised inside list items.

    listitem_pattern_names = inline_pattern_names + ["listitemend"]

    # Patterns recognised at the region level.

    region_pattern_names = inline_pattern_names + [
        "break", "heading", "defterm", "defterm_empty", "listitem",
        "listitem_alpha", "listitem_dot", "listitem_num", "listitem_roman",
        "regionstart", "regionend", "rule", "tablerow",
        ]

    # Patterns recognised inside table cells.

    table_region_pattern_names = inline_pattern_names + [
        "tableattrs", "tablecell", "tableend"
        ]

    def inline_patterns_for(self, name):

        """
        Return a copy of the inline pattern names with 'name' replaced by the
        name of its corresponding end pattern (the name with an "end" suffix).
        """

        names = self.inline_pattern_names[:]
        names[names.index(name)] = "%send" % name
        return names



    # Pattern handlers.

    end_region = ParserBase.end_region

    # Mapping from pattern names to the handlers invoked for their matches.

    handlers = {
        None                : end_region,
        "attrname"          : parse_attrname,
        "break"             : parse_break,
        "colour"            : parse_colour,
        "colspan"           : parse_colspan,
        "defterm"           : parse_defterm,
        "defterm_empty"     : parse_defterm_empty,
        "deftermend"        : end_region,
        "deftermsep"        : end_region,
        "fontstyle"         : parse_fontstyle,
        "halign"            : parse_halign,
        "heading"           : parse_heading,
        "headingend"        : parse_heading_end,
        "larger"            : parse_larger,
        "largerend"         : end_region,
        "listitemend"       : end_region,
        "listitem"          : parse_listitem,
        "listitem_alpha"    : parse_listitem,
        "listitem_dot"      : parse_listitem,
        "listitem_num"      : parse_listitem,
        "listitem_roman"    : parse_listitem,
        "monospace"         : parse_monospace,
        "monospaceend"      : end_region,
        "regionstart"       : parse_section,
        "regionend"         : parse_section_end,
        "rowspan"           : parse_rowspan,
        "rule"              : parse_rule,
        "smaller"           : parse_smaller,
        "smallerend"        : end_region,
        "sub"               : parse_sub,
        "subend"            : end_region,
        "super"             : parse_super,
        "superend"          : end_region,
        "tableattrs"        : parse_table_attrs,
        "tableattrsend"     : end_region,
        "tablerow"          : parse_table_row,
        "tablecell"         : end_region,
        "tableend"          : end_region,
        "underline"         : parse_underline,
        "underlineend"      : end_region,
        "valign"            : parse_valign,
        "width"             : parse_width,
        }

parser = MoinParser

# vim: tabstop=4 expandtab shiftwidth=4