1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki XML/XHTML syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 """ 23 24 try: 25 from cStringIO import StringIO 26 except ImportError: 27 from StringIO import StringIO 28 29 from MoinMoin import wikiutil 30 from common import * 31 from xmlread import Parser 32 import re 33 import sys 34 import operator 35 import htmlentitydefs 36 import codecs 37 38 # XML dialect syntax parsing. 

# Conversion templates: each XHTML/Confluence element name maps to a MoinMoin
# format string which receives the element's converted content.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s%s|%s]]",
    "ac:image"              : "{{%s%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
}

# Merge in the block-level translations. blocktypes is not defined in this
# section; presumably it is provided by the "common" module.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Empty elements emitted as fixed text outside preformatted regions.

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
}

# Empty elements emitted as fixed text inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "\n",
}

# Templates applied to "li" elements, selected by the enclosing list element.

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
}

# Element groupings consulted by the parser.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + blocktypes.keys() + list_tags.keys()
span_override_tags = ["ac:link"]

# Attributes supplying the components of a link target for each kind of
# Confluence resource element.

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
}

# Path components inserted ahead of a target for each attribute that
# contributes to it.

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:content-title"          : "..",
    "ri:space-key"              : "..",
}

# Attribute or element names whose values may serve as a link label.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
}

# Admonition styles used when a macro's rich text body is converted.

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
}

# For macros taking an argument: the Confluence parameter name and the MoinMoin
# argument name it is emitted under.

macroargs = {
    # Confluence macro      Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
}

# Templates for macros which are converted directly to MoinMoin macros.

macrotypes = {
    # Confluence macro      MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
}

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    # NOTE: self.text, self.elements and self.attributes are used as stacks
    # NOTE: throughout this class but are not defined here; presumably they
    # NOTE: are maintained by the Parser base class.

    def __init__(self, out):

        "Initialise the parser, writing the translated document to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the opening of the
        element with the given 'name' and 'attrs', delegating the actual
        element handling to the Parser class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back for a previous heading are discarded.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Undo the state changes made when the element with the given 'name' was
        opened, delegating the actual element handling to the Parser class.
        """

        in_region = name in preformatted_tags or name in formatted_tags

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if in_region:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Leaving the outermost region resets the recorded maximum depth.

        if in_region:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        the content appears within a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Entities without a known code point are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # A default parameter is stored under the name taken from the
        # enclosing element's attributes.

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences add no further markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""
            fragment = ("#%s" % anchor) if anchor else ""
            label = self.label or text.strip() or self.target
            text = conversion % (prefix, self.target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames

                    # NOTE: A KeyError is raised here if the macro did not
                    # NOTE: supply the Confluence argument, and below if the
                    # NOTE: template refers to a parameter not supplied.

                    parameters["args"] = quote_macro_argument(
                        "%s=%s" % (moinargname, self.macro_parameters[confargname]))
                text = conversion % parameters

                # Anchors appearing where macros are forbidden are held back
                # and emitted ahead of an enclosing heading.

                if self.macro == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # Retain any pending block state across non-block elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether a heading or 'a' element is amongst the open elements."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs in text directly within the
        element with the given 'name': nothing for structural and list
        elements, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace substituted by the replacement
        appropriate for the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # The content is wrapped in a minimal XHTML document before being parsed.

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
    PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
    "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # The parser is given the document as UTF-8 encoded bytes.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Translate UTF-8 content from standard input to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4