#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs

# URL schemes whose targets are passed through unchanged instead of being
# rewritten as wiki namespace links.

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text), where plain text regions have a type of None.
    """

    regions = []
    pos = 0

    # Alternate between plain text and marked-up sections, preserving the
    # order of appearance.

    for m in sections_regexp.finditer(s):
        begin, finish = m.span()
        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Any remaining text becomes a final plain region.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    m = section_regexp.match(s)

    # Unrecognised sections are returned as plain text.

    if m is None:
        return None, s

    return (m.group("sectiontype"), m.group("options")), m.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where plain text spans have a type of None.
    """

    elements = []
    pos = 0

    for m in blockelement_regexp.finditer(s):
        begin, finish = m.span()

        # Classify the element according to which alternative captured it.

        if m.group("listtype"):
            kind = "list"
        elif m.group("celltype"):
            kind = "table"
        else:
            kind = m.group("type")

        elements.append((None, s[pos:begin]))

        # Headings and blockquotes provide their own text group; lists and
        # tables use the whole matched span.

        elements.append((kind, m.group("text") or s[begin:finish]))
        pos = finish

    elements.append((None, s[pos:]))
    return elements

# Block extraction.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    items = []

    for match in listitem_regexp.finditer(text):
        items.append((match.group("marker"), match.group("text")))

    return items

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    Monospaced text, links and images are rewritten in MoinMoin syntax; any
    other match is returned unchanged.
    """

    # Monospaced text: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links take the form [label|target|title], label and title optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.
        # NOTE(review): the "^" is kept in the target here - confirm whether
        # MoinMoin expects it to be stripped.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Use the short form where neither label nor title were given. This
        # test must precede the label fallback below: previously the label was
        # copied from the target first, making this branch unreachable and
        # emitting redundant [[target|target]] links.

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make the link tidier by using the target if no label was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images take the form !filename-or-URL|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Cell separators and other matches pass through unchanged.

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.
    Lines without any cell separator are ignored, and the first separator seen
    on a line determines the row's separator type.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # Start a new column at each separator; other matched content is
            # accumulated in the current column for later translation.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Headings and blockquotes: Confluence block type -> MoinMoin format string.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# List markers: last Confluence marker character -> MoinMoin item marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # Indentation reflects nesting depth (marker length); the final character
    # selects the Moin list style.

    return " " * len(marker) + markers[marker[-1]]

# Cell separators: Confluence separator -> MoinMoin separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra cell decoration: "||" header cells are emboldened in MoinMoin.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

# Section types: Confluence section name -> MoinMoin processor/wiki style.
# Empty values produce plain {{{...}}} regions without a processor line.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag            MoinMoin syntax
    "strong"             : "'''%s'''",
    "em"                 : "''%s''",
    "u"                  : "__%s__",
    "del"                : "--(%s)--",
    "sup"                : "^%s^",
    "sub"                : ",,%s,,",
    "code"               : "`%s`",
    "pre"                : "{{{%s}}}",
    "blockquote"         : " %s",
    "small"              : "~-%s-~",
    "big"                : "~+%s+~",
    "p"                  : "\n%s\n",
    "ol"                 : "\n%s",
    "ul"                 : "\n%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link"            : "[[%s%s|%s]]",
    }

# Heading and blockquote tags also appear in the XML dialect and are emitted
# on their own lines.

for tag, translation in blocktypes.items():
    tags[tag] = "\n%s\n" % translation

simple_tags = {
    # XHTML tag      MoinMoin syntax
    "br"           : "<<BR>>",
    }

list_tags = {
    # XHTML list tag MoinMoin list item syntax
    "ol"           : "1. %s\n",
    "ul"           : "* %s\n",
    }

# Tags whose text is indented to the current list nesting level.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element Attribute providing the target
    "ri:page"          : "ri:content-title",
    "ri:attachment"    : "ri:filename",
    }

macro_rich_text_styles = {
    # Confluence style  MoinMoin admonition style
    "note"            : "caution",
    "warning"         : "warning",
    "info"            : "important",
    "tip"             : "tip",
    }

# Whitespace normalisation within and at the end of text nodes.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        # 'out' receives the MoinMoin translation via its write method.

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        # Track list nesting depth and entry into preformatted elements
        # before delegating to the base class.

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        # Delegate first, then unwind the nesting/preformatted counters.

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        # Whitespace is only normalised outside preformatted elements.

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        # Convert named HTML entities to characters, appending them to the
        # current element's collected text. Unknown entities are dropped.

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the text collected for the element 'name', appending the
        result to the parent element's text nodes or, at the top level,
        writing it to the output stream.

        NOTE(review): self.text, self.elements and self.attributes appear to
        be stacks maintained by xmlread.Parser - confirm against that class.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            else:
                prefix = "../"

            # Use the target itself as the label if no text was collected.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        # True if any preformatted element ("pre", "ac:plain-text-body") is
        # currently open.

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        # Choose the whitespace replacement appropriate to element 'name',
        # with 'end' selecting the replacement for trailing whitespace.

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        # Collapse whitespace runs in 'text' for the element 'name'.

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        # Collapse trailing whitespace in 'text' for the element 'name'.

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)

def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML document so that it parses as a single
    # well-formed tree rooted at "body".

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
 PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4