#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

# Format strings applied to the collected text of each element. Link and image
# entries take several values (see handleElement).

tags = {
    # XHTML tag                 MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Block type translations are provided by the common module (star import).

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                    : "<<BR>>",
    }

simple_preformatted_tags = {
    # XHTML tag                 MoinMoin syntax
    "br"                    : "\n",
    }

list_tags = {
    # XHTML list tag            MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + blocktypes.keys() + list_tags.keys()
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element        Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element        MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style          MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

macroargs = {
    # Confluence macro          Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro          MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser, writing converted top-level text to 'out', which
        must provide a 'write' method.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the opening of the
        element with the given 'name' and 'attrs', delegating the remaining
        work to the Parser base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors are only held for the heading currently being processed.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Undo the state changes made by startElement for the element with the
        given 'name'. The base class is invoked while the nesting, level and
        macro state of the element are still in place.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the given character 'content', collapsing whitespace unless the
        parser is inside a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Append the character for the HTML entity with the given 'name' to the
        current text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        A KeyError is raised if a macro listed in 'macroargs' lacks its
        Confluence argument, or if a macro template in 'macrotypes' refers to a
        parameter that was not supplied.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing element.

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string that is applied to
                # the element text below, so any literal percent sign (from a
                # macro title, for example) must be escaped.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")

            # Substitute empty strings for missing details so that the literal
            # text "None" never appears in the output.

            target = anchor or self.target or u""
            label = self.label or text.strip() or self.target or u""
            text = conversion % (prefix, target, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Without an href there is nothing to link to, so only the label is
            # retained instead of producing a link to "None".

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or u""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[confargname]))
                text = conversion % parameters

                # Anchors are held back where macros are forbidden and are
                # emitted with the enclosing heading, if any.

                if self.macro == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether a heading or 'a' element is currently open."

        return any((tag in headings or tag == "a") for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing each run of whitespace found directly inside
        the element with the given 'name': nothing for structural elements, a
        single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4