1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 

# Translation tables mapping XHTML/Confluence elements to MoinMoin syntax.
# Each value in 'tags' is a format string which receives the converted
# content of the element (links and images receive several values).

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Add the block type translations provided by the common module. Using
# update (instead of an explicit loop) avoids leaving stray loop variables
# behind as module-level names.

tags.update(blocktypes)

# Elements without content that are replaced by fixed text.

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# The equivalent replacements used inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "\n",
    }

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Classifications of elements used to choose the output structure.

formatted_tags = ["ac:rich-text-body"]
layout_tags = ["ac:layout", "ac:layout-section", "ac:layout-cell"]
preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
table_tags = ["ac:task-list", "table"]
table_cell_tags = ["ac:task-body", "ac:task-status", "td", "th"]
table_row_tags = ["ac:task", "tr"]

hierarchical_tags = formatted_tags + preformatted_tags + layout_tags + table_tags
indented_tags = ["li", "p"] + hierarchical_tags

# Build explicit lists from the dictionaries so that the concatenation does
# not depend on keys() returning a list.

block_tags = indented_tags + list(blocktypes) + list(list_tags)
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "attachments"           : [("page", "pagename")],
    "color"                 : [("color", "col")],
    }

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "attachments"           : "<<AttachList(%(args)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "recently-updated"      : "<<RecentChanges>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Runs of whitespace are replaced when normalising text outside preformatted
# regions.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Converted MoinMoin markup is accumulated as elements are completed and
    is written to the output object when an element without an enclosing
    text collection is handled.

    NOTE(review): the 'text', 'elements' and 'attributes' stacks used below
    are presumably maintained by the Parser base class, which presumably also
    calls handleElement when an element ends - confirm against xmlread.
    """

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser, writing converted text to 'out' (an object
        providing a write method). If 'is_comment_page' is set to a true
        value, an additional relative prefix is inserted into page link
        targets.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the element with
        the given 'name' and 'attrs' before and after delegating to the base
        class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in hierarchical_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held for a previous heading do not apply to a new one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Restore the indentation, nesting and macro state upon leaving the
        element with the given 'name'.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in hierarchical_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        # List indentation is only discarded after the base class has dealt
        # with the element.

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Forget the maximum depth once the outermost region has been left.

        if name in hierarchical_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Handle the character 'content', normalising whitespace unless the
        content appears within a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Entities without a known code point are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name in table_tags:
            self.table_rows = 0
        elif name in table_row_tags:
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are recorded using the name found in the
        # attributes of the enclosing element.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences are not marked up again.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in hierarchical_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name not in table_tags and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            # Tables employ specially-marked sections.

            elif name in table_tags:
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            # Layout tags may be nested and their markers are placed on separate
            # lines in the output. They also employ specially-marked sections.

            elif name in layout_tags:
                section_name = name.split(":", 1)[-1]
                conversion = "%s#!%s\n%%s\n%s" % (opening, section_name, closing)

            else:
                # Preformatted sections containing newlines must contain an
                # initial newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""

            # No target may have been gathered (for example, where only an
            # anchor is given): use an empty target instead of letting None
            # be formatted as the text "None".

            target = self.target or ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, ("#%s" % anchor) if anchor else "", label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Without an href attribute there is nothing to link to, so emit
            # only the label instead of a link to a target called "None".

            if self.target is None:
                text = self.label or ""
            else:
                text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            macro = self.macros[-1]
            macro_parameters = self.macro_parameters[-1]
            conversion = macrotypes.get(macro)

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(macro_parameters)
                argnames = macroargs.get(macro)

                # Convert Confluence arguments to Moin arguments. Unlike the
                # wiki markup parser, multiple arguments are supported.

                if argnames:
                    all_args = []
                    for confargname, moinargname in argnames:
                        argvalue = macro_parameters.get(confargname)
                        if argvalue:
                            all_args.append(quote_macro_argument("%s=%s" % (moinargname, argvalue)))
                    parameters["args"] = ", ".join(all_args)

                # Obtain the Moin macro with parameters substituted.

                text = conversion % parameters

                # Anchors inside headings and links are held back and emitted
                # ahead of the enclosing heading.

                if macro == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

            # Warn about macros that are not converted.

            elif macro not in macro_rich_text_styles:
                print >>sys.stderr, "No conversion possible for macro", macro
                print >>sys.stderr, "Macro has arguments", macro_parameters
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows. The first cell in a row and
        # the first row in a table are not preceded by a separator.

        if name in table_cell_tags:
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name in table_row_tags:
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the current position is inside any preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any of the currently open elements is a heading or a
        link, these being places where macros are not emitted directly.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly inside the
        element with the given 'name': nothing for purely structural elements
        and lists, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out, is_comment_page=False):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before being parsed. If
    'is_comment_page' is set to a true value, it is passed on to the parser
    in order to adjust page link targets.
    """

    # NOTE: CDATA sections appear to have erroneous endings.
    # Any "]] >" sequence is therefore rewritten as "]]>" before parsing.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # The parser is given the document as a stream of UTF-8 encoded bytes.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out, is_comment_page)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 encoded content from standard input, writing the UTF-8
    # encoded translation to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4