#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# A section marker such as {code} or {code:language=java}, not preceded by a
# literal "{" (which would indicate an escaped or doubled brace).

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text).
    """

    last = 0
    regions = [""]
    depth = 0

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options")
        is_section = is_section_marker(match.group("type"))

        # The start of a region is either indicated by a marker with options
        # or by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as
            # a "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section:
                depth += 1

        # The end of a region is indicated by a marker with no options.

        else:
            # Where no region is active, the text since the last match plus
            # the marker are added to the current "null" region.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or not is_section:
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

            if is_section:
                depth -= 1

        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    # Use a distinct name in the comprehension to avoid shadowing the
    # parameter 's'.

    return [get_section_details(region) for region in regions if region]

def is_section_marker(sectiontype):

    "Return whether 'sectiontype' names a recognised section or colour marker."

    # "in" replaces dict.has_key, which was removed in Python 3 and is
    # equivalent under Python 2.

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.
# A complete section: opening marker with optional options, body, and a
# closing marker whose type must match the opening one.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)

    # Unrecognised input is returned untyped.

    if not found:
        return None, s

    header = (found.group("sectiontype"), found.group("options"))
    return header, found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Any of the above block-level constructs, matched line-by-line.

blockelement_regexp = re.compile(
    "(%s)|(%s)|(%s)" % (list_regexp_str, table_regexp_str, blocktext_regexp_str),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where untyped gaps between the recognised
    elements employ a type of None.
    """

    blocks = []
    pos = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the match according to which alternative captured.

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Record the preceding untyped text, then the element itself. Where no
        # explicit text group was captured, the whole match is used.

        blocks.append((None, s[pos:begin]))
        blocks.append((kind, found.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

# One or more blank (or whitespace-only) lines.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines.
    """

    return [piece for piece in block_regexp.split(s) if piece.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend((None, piece) for piece in get_basic_blocks(blocktext))

    return blocks

# List item inspection.

# An item marker (possibly indented) followed by the item text.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline content constructs combined as alternatives, each in its own
# group.

content_regexp_str = "|".join(
    "(%s)" % pattern for pattern in (
        monospace_regexp_str,
        link_regexp_str,
        image_regexp_str,
        macro_regexp_str,
        italic_regexp_str,
        bold_regexp_str,
        del_regexp_str,
        underline_regexp_str,
        sub_regexp_str,
        ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content plus cell separators, for scanning table rows.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed by the split.

        row += "|"
        cellsep = None
        columns = [""]
        pos = 0

        for found in table_content_regexp.finditer(row):
            begin, finish = found.span()
            columns[-1] += row[pos:begin]

            # A separator starts a new column; the first one seen determines
            # the row's cell type. Other inline content is retained verbatim
            # so that it can be translated later.

            if found.group("celltype"):
                if cellsep is None:
                    cellsep = found.group("celltype")
                columns.append("")
            else:
                columns[-1] += found.group()

            pos = finish

        columns[-1] += row[pos:]

        # Discard the empty leading and trailing column fragments.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Notation conversion.
# Mappings from escaped Confluence notation to its Moin equivalent. Note that
# r"\\" "\n" relies on literal concatenation: a backslash followed by a
# newline.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

# In preformatted sections, forced breaks become plain newlines instead.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

preformatted_sectiontypes = (None, "noformat")

macroargs = {
    "color" : "col",
    }

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # Section nesting state used to choose Moin brace depths.

        self.max_level = self.level = 0

        # Macros are forbidden in headings and inside other macros.

        self.in_heading = False
        self.held_anchors = []
        self.macro = None

        # Stack of active section types (None for anonymous sections).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indentation depth is conveyed by the marker's length.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):
            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

                # Make the link tidier by making a target if none was given.

                if not label:
                    label = target

                target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            # "in" replaces dict.has_key, which was removed in Python 3.

            if macro_name in macrotypes:
                argname = macroargs.get(macro_name)
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))
                    }
                if not self.forbids_macros():
                    return result

                # Anchors in headings are held and emitted before the heading.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Unrecognised or suppressed macros produce no output.

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        "Translate the plain text string 's', converting notation."

        for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', converting any inline
        formatting, links, images and macros found in it.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections: retain the matched text verbatim.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any active section suppresses formatting."

        # any(...) replaces reduce(operator.or_, ...): equivalent, clearer,
        # and reduce is no longer a builtin in Python 3.

        return any(x in preformatted_sectiontypes for x in self.sections)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.
        # "in" replaces dict.has_key, which was removed in Python 3.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of output fragments.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record entry into a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record departure from the current section."

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return (opening, closing) brace strings for the current section,
        using deeper braces for outer sections so that nesting is legal.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        # 'regiontype' avoids shadowing the builtin 'type'.

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.
                # "in" replaces dict.has_key, which was removed in Python 3.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether the current context forbids macro production."

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":

    # NOTE(review): codecs-based stream wrapping is the Python 2 idiom;
    # confirm the target interpreter before porting to io.TextIOWrapper.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4