1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 

# Translation tables consulted by the parser when converting elements. The
# values in 'tags' are format strings filled in with the converted content of
# the element (links and images take several values).

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type translations. Using update avoids leaving loop
# variables behind as module globals.

tags.update(blocktypes)

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

simple_preformatted_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "\n",
    }

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:layout", "ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# The key collections are converted explicitly to lists so that the
# concatenation does not depend on keys() returning a list.

block_tags = indented_tags + list(blocktypes.keys()) + list(list_tags.keys())
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),

    # External resources, such as images referenced by URL, provide the
    # complete target in a single attribute and need no prefix.

    "ri:url"                : ("ri:value",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "attachments"           : [("page", "pagename")],
    "color"                 : [("color", "col")],
    }

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "attachments"           : "<<AttachList(%(args)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "recently-updated"      : "<<RecentChanges>>",
    "toc"                   : "<<TableOfContents>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Elements are converted to MoinMoin markup as they are completed, with the
    converted text of each element being added to the text of its parent, and
    with text at the top level of the document being written to the output
    stream.

    NOTE(review): the 'text', 'elements' and 'attributes' stacks used here are
    not assigned in this class and are presumably maintained by the Parser
    base class - confirm against xmlread.
    """

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser to write converted text to 'out'. Where
        'is_comment_page' is set to a true value, an additional parent prefix
        is added to page link targets.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        "Update nesting state for the element 'name' having 'attrs'."

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back for a previous heading do not apply to a new one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        "Restore nesting state upon leaving the element 'name'."

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Having left the outermost region, forget the nesting depth.

            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the character data 'content', normalising whitespace unless
        inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity 'name' to the current text.
        Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information. Parameters appearing outside any macro
        # have nowhere to be stored and are discarded instead of causing an
        # error.

        elif name == "ac:parameter":
            if self.macro_parameters:
                self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing macro.

        elif name == "ac:default-parameter":
            if self.macro_parameters and len(self.attributes) > 1:
                self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences are not marked up again.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string, so any percent
                # characters in a title must be escaped to survive the later
                # substitution of the region's text.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information. Without any target details, an
        # empty target is used instead of the text "None".

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            target = self.target or u""
            anchor = self.attributes[-1].get("ac:anchor") or ""
            if anchor:
                fragment = "#%s" % anchor
            else:
                fragment = ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        # Conventional links without a href attribute are not hyperlinks, and
        # so only their label text is retained.

        elif name == "a":
            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])

                # Convert Confluence arguments to Moin arguments. Unlike the
                # wiki markup parser, multiple arguments are supported.

                if argnames:
                    all_args = []
                    for confargname, moinargname in argnames:
                        argvalue = self.macro_parameters[-1].get(confargname)
                        if argvalue:
                            all_args.append(quote_macro_argument("%s=%s" % (moinargname, argvalue)))
                    parameters["args"] = ", ".join(all_args)

                # Obtain the Moin macro with parameters substituted. A macro
                # lacking a parameter needed by the conversion is reported and
                # its content retained, so that one malformed macro does not
                # abort the conversion of the whole page.

                try:
                    converted = conversion % parameters
                except KeyError:
                    converted = None
                    print >>sys.stderr, "Missing parameter for macro", self.macros[-1]
                    print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                    print >>sys.stderr

                if converted is not None:
                    text = converted

                    # Anchors cannot appear where macros are forbidden and are
                    # held for insertion before the enclosing heading.

                    if self.macros[-1] == "anchor" and self.forbids_macros():
                        self.held_anchors.append(text)
                        text = ""

            # Warn about macros that are not converted.

            elif self.macros[-1] not in macro_rich_text_styles:
                print >>sys.stderr, "No conversion possible for macro", self.macros[-1]
                print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows. Separators are only added before
        # columns and rows following the first.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the current position is inside a preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any open element is a heading or a conventional link,
        these being the places where anchor macros are held back.
        """

        return any((tag in headings or tag == "a") for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly within
        the element 'name': nothing for purely structural elements, a single
        space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Return 'text' from the element 'name' with whitespace runs replaced."

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out, is_comment_page=False):

    """
    Parse the content in the string 's', writing a translation to 'out'. The
    'is_comment_page' flag is passed on to the parser.

    The content is wrapped in an XHTML document before parsing, and the
    document is encoded as UTF-8.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out, is_comment_page)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 content from standard input to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4