#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# Matches a section marker such as {code}, {code:title=X} or a standalone
# macro such as {anchor:name}. A preceding "{" (as in the {{...}} monospace
# notation) suppresses the match. The "options" group is only present when a
# ":" followed by option text appears inside the braces.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text): sections produce ((sectiontype, options), text)
    tuples via get_section_details, whereas plain "null" regions produce
    (None, text) tuples.
    """

    last = 0            # end position of the previous marker match in 's'
    regions = [""]      # the current (last) region is accumulated as a string
    depth = 0           # nesting level of currently open sections

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # A marker with options (e.g. {code:title=X}) can only open a region;
        # a bare marker opens one where no region is active and closes one
        # otherwise.

        is_start = match.group("options")
        is_section = is_section_marker(match.group("type"))

        # The start of a region is either indicated by a marker with options
        # or by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as
            # a "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section:
                depth += 1

        # The end of a region is indicated by a marker with no options.

        else:
            # Where no region is active, the text since the last match plus
            # the marker are added to the current "null" region.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or not is_section:
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

                if is_section:
                    depth -= 1

        last = end

    # Append any trailing text to the final region (which may be an
    # unterminated section).

    regions[-1] += s[last:]

    return [get_section_details(s) for s in regions if s]

def is_section_marker(sectiontype):

    "Return whether 'sectiontype' names a known section (or colour) marker."

    # NOTE: "has_key" (Python 2 only) replaced with the "in" operator.

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.
# Matches a complete section: an opening marker (with optional ":"-separated
# options), the section body, and a matching closing marker.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)

    # Non-sections are passed through untyped.

    if not found:
        return None, s

    details = (found.group("sectiontype"), found.group("options"))
    return details, found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Any of the above block-level constructs, each in its own group.

blockelement_regexp = re.compile(
    "|".join("(%s)" % pattern for pattern in (
        list_regexp_str,
        table_regexp_str,
        blocktext_regexp_str,
        )),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where type is None for ordinary text.
    """

    blocks = []
    pos = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the match by whichever named group participated.

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Emit the untyped text before the match, then the match itself.
        # Headings/blockquotes supply a "text" group; lists and tables use
        # the whole matched span.

        blocks.append((None, s[pos:begin]))
        blocks.append((kind, found.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

# One or more blank (or whitespace-only) lines separating basic blocks.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines.
    """

    candidates = block_regexp.split(s)
    return [candidate for candidate in candidates if candidate.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    collected = []

    for kind, text in get_block_elements(s):

        # Heading, list and table blocks are collected directly; other
        # regions are split into subblocks on blank lines.

        if kind is not None:
            collected.append((kind, text))
        else:
            collected.extend((None, chunk) for chunk in get_basic_blocks(text))

    return collected

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline content patterns combined, each in its own group.

content_regexp_str = "|".join("(%s)" % pattern for pattern in (
    monospace_regexp_str,
    link_regexp_str,
    image_regexp_str,
    macro_regexp_str,
    italic_regexp_str,
    bold_regexp_str,
    del_regexp_str,
    underline_regexp_str,
    sub_regexp_str,
    ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content plus cell separators, so that markup containing "|" inside
# formatting is not mistaken for a column boundary.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("|\n"):
        if not line:
            break

        # Restore the trailing separator removed by the split above.

        line += "|"

        separator = None    # first separator seen defines the row's cell type
        cells = [""]
        pos = 0

        for found in table_content_regexp.finditer(line):
            begin, finish = found.span()
            cells[-1] += line[pos:begin]

            # A separator closes the current cell; any other inline match is
            # kept verbatim inside the cell.

            if found.group("celltype"):
                if separator is None:
                    separator = found.group("celltype")
                cells.append("")
            else:
                cells[-1] += found.group()

            pos = finish

        cells[-1] += line[pos:]

        # The first and last entries are the text outside the outer
        # separators and are discarded.

        if separator:
            rows.append((separator, cells[1:-1]))

    return rows

# Notation conversion.
# Mappings of escaped Confluence notation to Moin notation. The "\\" escape
# (followed by a newline or space) denotes a forced line break.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

# In preformatted sections, forced line breaks become literal newlines.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Confluence list marker characters mapped to Moin list markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Confluence cell separators mapped to Moin table-cell separators.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Extra markup wrapped around cell text: header cells ("||") become bold.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Confluence section types mapped to Moin parser/section headers.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose content must not be reformatted. None covers anonymous
# sections such as tables.

preformatted_sectiontypes = (None, "noformat")

# Named arguments used when converting macro options.

macroargs = {
    "color" : "col",
    }

# Templates for translating Confluence macros to Moin macros.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # Section nesting state, used to compute Moin brace nesting.

        self.max_level = self.level = 0

        # Context flags controlling where macros may be produced.

        self.in_heading = False
        self.held_anchors = []
        self.macro = None

        # Stack of currently-open section types (None for anonymous ones).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given list 'marker' to a suitable Moin representation."

        # Indentation depth follows the marker length; the final character
        # determines the marker type.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        """
        Translate the inline content described by the given 'match', returning
        a string. Exactly one of the named groups of content_regexp will have
        participated in the match.
        """

        # Monospaced text.

        if match.group("monotext"):

            # Record a section level so that enclosing regions nest properly.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        # Links of the form [target], [label|target] or [label|target|title].

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.
            # URL_SCHEMES is provided by the common module (star-imported at
            # the top of this file).

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a label if none was given.

            if not label:
                label = target

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        # Images of the form !image! or !image|options!.

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        # Inline macros of the form {name:options}.

        elif match.group("macro"):
            macro_name = match.group("macro")

            if macro_name in macrotypes:
                argname = macroargs.get(macro_name)
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))
                    }

                # Macros are forbidden in headings and inside other macros;
                # anchors are held back and emitted with the heading instead.

                if not self.forbids_macros():
                    return result
                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Suppress forbidden and unrecognised macros, always returning a
            # string so that callers can join the results.

            return ""

        # Word formatting.

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Anything else is passed through with notation converted.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set, forced line breaks become literal newlines.
        """

        mapping = preformatted_notation_mapping if preformatted else notation_mapping
        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the inline markup in the given 'text'. Content
        in "code" and "noformat" sections is passed through untranslated.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any open section suppresses forced line breaks."

        # NOTE: reduce(operator.or_, ...) replaced with the equivalent any().

        return any(x in preformatted_sectiontypes for x in self.sections)

    def translate_block(self, blocktype, blocktext):

        """
        Translate the block with the given 'blocktype' and 'blocktext'.
        'headings' and 'blocktypes' are provided by the common module.
        """

        # While in a heading, macros are forbidden; anchors encountered are
        # held and emitted before the heading itself.

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Separate rows with the Moin row separator.

                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of output strings.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))

        # Options cannot be translated directly; emit them as a comment.

        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record entry into a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record departure from the current section."

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return (opening, closing) Moin brace strings for the current section,
        using deeper braces for outer sections so that nesting is legal.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. Where
        'top' is set, the top-level recursion base case is disabled.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros are currently forbidden (headings, nested macros)."

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4