1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 23 -------- 24 25 The basic procedure is as follows: 26 27 1. Wiki pages are first split up into regions. 28 2. Then, within these regions, the text is split into blocks. 29 1. First, lists are identified. 30 2. Additionally, other block-like elements are identified. 31 3. Each block is then split into regions. 32 """ 33 34 from common import * 35 import re 36 import sys 37 import codecs 38 import operator 39 40 # Section extraction. 41 42 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}|^(?P<rowstart>[|]{1,2})|(?P<rowend>[|]{1,2})(\n|$)" 43 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE) 44 45 def get_regions(s): 46 47 """ 48 Return a list of regions from 's'. Each region is specified using a tuple of 49 the form (type, text). 50 """ 51 52 last = 0 53 regions = [""] 54 depth = 0 55 had_row = False 56 57 for match in sections_regexp.finditer(s): 58 start, end = match.span() 59 is_start = match.group("options") or match.group("rowstart") 60 is_section = is_section_marker(match.group("type")) 61 is_row = match.group("rowstart") or match.group("rowend") 62 63 # The start of a region is either indicated by a marker with options or 64 # by a marker where no region is currently active. 65 66 if is_start or not depth: 67 68 # Where no region is active, add the text since the last match as a 69 # "null" region. 70 71 if not depth: 72 regions[-1] += s[last:start] 73 74 # A new region is maintained as a string. 75 76 if is_section: 77 regions.append(s[start:end]) 78 79 # A new row may either continue a table region or start a new 80 # table region. 81 82 elif is_row: 83 if (last != start or not had_row): 84 regions.append(s[start:end]) 85 else: 86 regions[-2] += regions[-1] + s[start:end] 87 regions.pop() 88 89 # Certain markers may be standalone macros. 90 91 else: 92 regions[-1] += s[start:end] 93 94 # Where a region is active, add the text since the last match as 95 # well as the text in this match to the region. 96 97 else: 98 regions[-1] += s[last:end] 99 100 if is_section or is_row: 101 depth += 1 102 103 # The end of a region is indicated by a marker with no options. 104 105 else: 106 # Where no region is active, the text since the last match plus the 107 # marker are added to the current "null" region. 108 109 if not depth: 110 111 # Add to the string portion of the "null" region. 112 113 regions[-1] += s[last:end] 114 115 # Where a region is active, the end marker and preceding text is 116 # either incorporated into the current region if more than one 117 # region is active, or the preceding text is incorporated into the 118 # current region and the details of the region are then obtained. 119 120 else: 121 if depth > 1 or (not is_section and not is_row): 122 regions[-1] += s[last:end] 123 124 # Terminate the active region, interpreting its contents. 125 126 else: 127 regions[-1] += s[last:end] 128 regions.append("") 129 130 if is_section or is_row: 131 depth -= 1 132 133 had_row = is_row 134 last = end 135 136 # Where a region is still active, terminate it. 137 138 regions[-1] += s[last:] 139 140 return [get_section_details(s) for s in regions if s] 141 142 def is_section_marker(sectiontype): 143 return sectiontypes.has_key(sectiontype) or sectiontype == "color" 144 145 # Section inspection. 146 147 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}" 148 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE) 149 150 def get_section_details(s): 151 152 "Return the details of a section 's' in the form (type, text)." 153 154 match = section_regexp.match(s) 155 if match: 156 return (match.group("sectiontype"), match.group("options")), match.group("section") 157 else: 158 return None, s 159 160 # Heading, table and list extraction. 161 162 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)" 163 table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+" 164 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$" 165 166 blockelement_regexp = re.compile( 167 "(" + list_regexp_str + ")" 168 "|" 169 "(" + table_regexp_str + ")" 170 "|" 171 "(" + blocktext_regexp_str + ")", 172 re.MULTILINE 173 ) 174 175 def get_block_elements(s): 176 177 """ 178 Extract headings, tables and lists from the given string 's'. 179 """ 180 181 last = 0 182 blocks = [] 183 for match in blockelement_regexp.finditer(s): 184 start, end = match.span() 185 matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type") 186 blocks.append((None, s[last:start])) 187 blocks.append((matchtype, match.group("text") or s[start:end])) 188 last = end 189 blocks.append((None, s[last:])) 190 return blocks 191 192 # Block extraction. 193 194 block_regexp_str = r"^(?:\s*\n)+" 195 block_regexp = re.compile(block_regexp_str, re.MULTILINE) 196 197 def get_basic_blocks(s): 198 199 """ 200 Return blocks from the given string 's' by splitting the text on blank lines 201 and eliminating those lines. 202 """ 203 204 return [b for b in block_regexp.split(s) if b.strip()] 205 206 # Block inspection. 207 208 def get_blocks(s): 209 210 """ 211 Return blocks from the given string 's', inspecting the basic blocks and 212 generating additional block-level text where appropriate. 213 """ 214 215 blocks = [] 216 217 for blocktype, blocktext in get_block_elements(s): 218 219 # Collect heading, list and table blocks. 220 221 if blocktype is not None: 222 blocks.append((blocktype, blocktext)) 223 224 # Attempt to find new subblocks in other regions. 225 226 else: 227 for block in get_basic_blocks(blocktext): 228 blocks.append((None, block)) 229 230 return blocks 231 232 # List item inspection. 233 234 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$" 235 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE) 236 237 def get_list_items(text): 238 239 "Return a list of (marker, text) tuples for the given list 'text'." 240 241 items = [] 242 243 for match in listitem_regexp.finditer(text): 244 items.append((match.group("marker"), match.group("text"))) 245 246 return items 247 248 # Content inspection. 249 250 monospace_regexp_str = r"{{(?P<monotext>.*?)}}" 251 link_regexp_str = r"[[](?P<linktext>.*?)]" 252 image_regexp_str = r"!(?P<imagetext>\w.*?)!" 253 macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}" 254 255 # Word-dependent patterns. 256 # Here, the unbracketed markers must test for the absence of surrounding word 257 # characters. 258 259 italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})" 260 bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})" 261 del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})" 262 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})" 263 sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})" 264 265 content_regexp_str = ( 266 "(" + monospace_regexp_str + ")" 267 "|" 268 "(" + link_regexp_str + ")" 269 "|" 270 "(" + image_regexp_str + ")" 271 "|" 272 "(" + macro_regexp_str + ")" 273 "|" 274 "(" + italic_regexp_str + ")" 275 "|" 276 "(" + bold_regexp_str + ")" 277 "|" 278 "(" + del_regexp_str + ")" 279 "|" 280 "(" + underline_regexp_str + ")" 281 "|" 282 "(" + sub_regexp_str + ")" 283 ) 284 285 # Table row inspection. 286 287 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})" 288 289 table_content_regexp_str = ( 290 content_regexp_str + 291 "|" 292 "(" + cellsep_regexp_str + ")" 293 ) 294 295 content_regexp = re.compile(content_regexp_str) 296 table_content_regexp = re.compile(table_content_regexp_str) 297 298 def get_table_rows(text): 299 300 "Return a list of (cellsep, columns) tuples for the given table 'text'." 301 302 rows = [] 303 304 for row in text.split("|\n"): 305 if not row: 306 break 307 308 row += "|" 309 cellsep = None 310 columns = [""] 311 last = 0 312 for match in table_content_regexp.finditer(row): 313 start, end = match.span() 314 columns[-1] += row[last:start] 315 316 if match.group("celltype"): 317 if cellsep is None: 318 cellsep = match.group("celltype") 319 columns.append("") 320 else: 321 columns[-1] += match.group() 322 323 last = end 324 325 columns[-1] += row[last:] 326 327 if cellsep: 328 rows.append((cellsep, columns[1:-1])) 329 330 return rows 331 332 # Notation conversion. 333 334 notation_mapping = [ 335 (r"\!", "!"), 336 (r"\-", "-"), 337 (r"\\""\n", "<<BR>>"), 338 (r"\\ ", "<<BR>>"), 339 (r"\~", "~"), 340 ] 341 342 preformatted_notation_mapping = [ 343 (r"\!", "!"), 344 (r"\-", "-"), 345 (r"\\""\n", "\n"), 346 (r"\\ ", "\n"), 347 (r"\~", "~"), 348 ] 349 350 # Translation helpers. 351 352 markers = { 353 "*" : "*", 354 "#" : "1.", 355 "-" : "*", 356 } 357 358 cellseps = { 359 "|" : "\n|| ", 360 "||" : "\n|| ", 361 } 362 363 cellextra = { 364 "|" : "", 365 "||" : "'''", 366 } 367 368 sectiontypes = { 369 "code" : "", 370 "noformat" : "", 371 "quote" : "", 372 "info" : "#!wiki important", 373 "note" : "#!wiki caution", 374 "tip" : "#!wiki tip", 375 "warning" : "#!wiki warning", 376 } 377 378 preformatted_sectiontypes = (None, "noformat") 379 380 macroargs = { 381 "color" : "col", 382 } 383 384 macrotypes = { 385 "anchor" : "<<Anchor(%(args)s)>>", 386 "color" : "<<Color2(%(content)s, %(args)s)>>", 387 } 388 389 class ConfluenceParser: 390 391 "A parser for Confluence markup." 392 393 def __init__(self): 394 self.max_level = self.level = 0 395 self.in_heading = False 396 self.held_anchors = [] 397 self.macro = None 398 self.sections = [] 399 400 def translate_marker(self, marker): 401 402 "Translate the given 'marker' to a suitable Moin representation." 403 404 return " " * len(marker) + markers[marker[-1]] 405 406 def translate_cellsep(self, cellsep): 407 408 "Translate the given 'cellsep' to a suitable Moin representation." 409 410 return cellseps[cellsep] 411 412 def translate_cell(self, cellsep, text): 413 414 "Using 'cellsep', translate the cell 'text'." 415 416 return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep] 417 418 def translate_content_match(self, match): 419 420 "Translate the content described by the given 'match', returning a string." 421 422 if match.group("monotext"): 423 self.enter_section(); self.leave_section() 424 return "{{{%s}}}" % match.group("monotext") 425 426 elif match.group("linktext"): 427 parts = match.group("linktext").split("|") 428 429 # NOTE: Proper detection of external links required. 430 431 if len(parts) == 1: 432 label, target, title = None, parts[0], None 433 elif len(parts) == 2: 434 (label, target), title = parts, None 435 else: 436 label, target, title = parts 437 438 target = target.strip() 439 440 # Look for namespace links and rewrite them. 441 442 if target.find(":") != -1: 443 prefix = "" 444 space, rest = target.split(":", 1) 445 if space not in URL_SCHEMES: 446 rest = get_page_title(rest) 447 target = "%s/%s" % (space, rest) 448 449 # Detect anchors. 450 451 elif target.startswith("#"): 452 prefix = "" 453 454 # Detect attachments. 455 456 elif target.startswith("^"): 457 prefix = "attachment:" 458 459 # Link to other pages within a space. 460 461 else: 462 prefix = "../" 463 464 # Make the link tidier by making a target if none was given. 465 466 if not label: 467 label = target 468 469 target = get_page_title(target) 470 471 if not label and not title: 472 return "[[%s%s]]" % (prefix, target) 473 elif not title: 474 return "[[%s%s|%s]]" % (prefix, target, label) 475 else: 476 return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title) 477 478 elif match.group("imagetext"): 479 parts = match.group("imagetext").split("|") 480 481 # NOTE: Proper detection of external links required. 482 483 if parts[0].startswith("http"): 484 prefix = "" 485 else: 486 prefix = "attachment:" 487 488 # NOTE: Proper options conversion required. 489 490 if len(parts) == 1: 491 return "{{%s%s}}" % (prefix, parts[0]) 492 else: 493 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1]) 494 495 elif match.group("macro"): 496 macro_name = match.group("macro") 497 if macrotypes.has_key(macro_name): 498 argname = macroargs.get(macro_name) 499 result = macrotypes[macro_name] % { 500 "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options")) 501 } 502 if not self.forbids_macros(): 503 return result 504 if macro_name == "anchor": 505 self.held_anchors.append(result) 506 return "" 507 508 elif match.group("italictext"): 509 return "''%s''" % self.translate_content(match.group("italictext")) 510 511 elif match.group("boldtext"): 512 return "'''%s'''" % self.translate_content(match.group("boldtext")) 513 514 elif match.group("deltext"): 515 return "--(%s)--" % self.translate_content(match.group("deltext")) 516 517 elif match.group("underlinetext"): 518 return "__%s__" % self.translate_content(match.group("underlinetext")) 519 520 elif match.group("subtext"): 521 return ",,%s,," % self.translate_content(match.group("subtext")) 522 523 else: 524 return self.translate_text(match.group()) 525 526 def translate_text(self, s, preformatted=False): 527 528 "Translate the plain text string 's', converting notation." 529 530 for before, after in preformatted and preformatted_notation_mapping or notation_mapping: 531 s = s.replace(before, after) 532 return s 533 534 def translate_content(self, text): 535 536 """ 537 Return a translation of the given 'text'. If the optional 'sectiontype' is 538 specified, the translation may be modified to a form appropriate to the 539 section being translated. 540 """ 541 542 parts = [] 543 preformatted = self.is_preformatted() 544 545 last = 0 546 for match in content_regexp.finditer(text): 547 start, end = match.span() 548 parts.append(self.translate_text(text[last:start], preformatted)) 549 550 # Handle unformatted sections. 551 552 if self.sections and self.sections[-1] in ("code", "noformat"): 553 parts.append(match.group()) 554 else: 555 parts.append(self.translate_content_match(match)) 556 557 last = end 558 559 parts.append(self.translate_text(text[last:], preformatted)) 560 return "".join(parts) 561 562 def is_preformatted(self): 563 return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False) 564 565 def translate_block(self, blocktype, blocktext): 566 567 "Translate the block with the given 'blocktype' and 'blocktext'." 568 569 if blocktype in headings: 570 self.in_heading = True 571 self.held_anchors = [] 572 573 parts = [] 574 575 # Translate headings and blockquotes. 576 577 if blocktypes.has_key(blocktype): 578 text = self.parse_text(blocktext) 579 for anchor in self.held_anchors: 580 parts.append(anchor) 581 parts.append(blocktypes[blocktype] % text) 582 583 # Translate list items. 584 585 elif blocktype == "list": 586 for listmarker, listitem in get_list_items(blocktext): 587 parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem))) 588 589 # Translate table items. 590 591 elif blocktype == "table": 592 593 # Enter the table. 594 595 self.enter_section() 596 597 table_parts = [] 598 first = True 599 600 for cellsep, columns in get_table_rows(blocktext): 601 if not first: 602 table_parts.append("==") 603 else: 604 first = False 605 moinsep = self.translate_cellsep(cellsep) 606 table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns])) 607 608 # Nest the section appropriately. 609 610 opening, closing = self.nest_section() 611 612 parts.append("%s#!table" % opening) 613 parts += table_parts 614 parts.append(closing) 615 616 # Leave the table. 617 618 self.leave_section() 619 620 # Handle anonymous blocks. 621 622 else: 623 parts.append(self.parse_text(blocktext)) 624 625 if blocktype in headings: 626 self.in_heading = False 627 628 return "\n".join(parts) 629 630 def translate_section(self, sectiontype, options, text): 631 632 """ 633 Translate the section with the given 'sectiontype', 'options' and 634 'text'. 635 """ 636 637 parts = [] 638 639 # Enter the section. 640 641 self.enter_section(sectiontype) 642 643 # Sections can contain other sections. 644 645 section_content = self.parse_text(text.strip()) 646 647 # Nest the section appropriately. 648 649 opening, closing = self.nest_section() 650 mointype = sectiontypes.get(sectiontype) 651 652 parts.append("%s%s\n" % (opening, mointype or "")) 653 if options: 654 parts.append("## %s\n" % options) 655 parts.append(section_content) 656 parts.append("\n%s\n" % closing) 657 658 # Leave the section. 659 660 self.leave_section() 661 662 return parts 663 664 def enter_section(self, sectiontype=None): 665 self.level += 1 666 self.max_level = max(self.level, self.max_level) 667 self.sections.append(sectiontype) 668 669 def leave_section(self): 670 self.level -= 1 671 if not self.level: 672 self.max_level = 0 673 self.sections.pop() 674 675 def nest_section(self): 676 level = 3 + self.max_level - self.level 677 opening = "{" * level 678 closing = "}" * level 679 return opening, closing 680 681 # General parsing. 682 683 def parse_text(self, s, top=False): 684 685 "Parse the content in the string 's', returning the translation." 686 687 parts = [] 688 689 # Control spacing between blocks and other blocks or sections. 690 691 preceded_by_block = False 692 693 for type, text in get_regions(s): 694 695 # Handle list, heading, blockquote or anonymous blocks. 696 697 if type is None: 698 699 # Where the region is the same as the provided text, return 700 # immediately. This is the base case of the recursive parsing 701 # process. 702 703 if text == s and not top: 704 return self.translate_content(text) 705 706 # Otherwise, obtain and translate the blocks. 707 708 if preceded_by_block: 709 parts.append("\n") 710 711 first = True 712 for blocktype, blocktext in get_blocks(text): 713 if not first: 714 parts.append("\n") 715 else: 716 first = False 717 parts.append("%s" % self.translate_block(blocktype, blocktext)) 718 719 if not first: 720 preceded_by_block = True 721 722 # Handle sections. 723 724 else: 725 sectiontype, options = type 726 727 # Direct translations of sections. 728 729 if sectiontypes.has_key(sectiontype): 730 if preceded_by_block: 731 parts.append("\n") 732 733 parts += self.translate_section(sectiontype, options, text) 734 preceded_by_block = True 735 736 # Translations of macros acting as sections. 737 738 elif macrotypes.has_key(sectiontype): 739 740 # Prevent the production of macros in places they would 741 # produce illegal Moin syntax. 742 743 if not self.forbids_macros(): 744 self.macro = sectiontype 745 argname = macroargs.get(sectiontype) 746 parts.append(macrotypes[sectiontype] % { 747 "content" : quote_macro_argument(self.parse_text(text)), 748 "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options) 749 }) 750 self.macro = None 751 752 # Include the contents of section-based macros where the 753 # macros themselves are not allowed. 754 755 else: 756 parts.append(self.translate_content(text)) 757 758 preceded_by_block = False 759 760 # Unrecognised sections. 761 762 else: 763 parts += self.translate_section(sectiontype, None, text) 764 preceded_by_block = False 765 766 return "".join(parts) 767 768 def forbids_macros(self): 769 return self.in_heading or self.macro 770 771 def parse(s, out): 772 773 "Parse the content in the string 's', writing a translation to 'out'." 774 775 parser = ConfluenceParser() 776 out.write(parser.parse_text(s, top=True)) 777 778 if __name__ == "__main__": 779 s = codecs.getreader("utf-8")(sys.stdin).read() 780 out = codecs.getwriter("utf-8")(sys.stdout) 781 parse(s, out) 782 783 # vim: tabstop=4 expandtab shiftwidth=4