#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag         MoinMoin syntax
    "strong"            : "'''%s'''",
    "em"                : "''%s''",
    "u"                 : "__%s__",
    "del"               : "--(%s)--",
    "sup"               : "^%s^",
    "sub"               : ",,%s,,",
    "code"              : "`%s`",
    "tbody"             : "%s",
    "tr"                : "%s",
    "th"                : "'''%s'''",
    "td"                : "%s",
    "blockquote"        : " %s",
    "small"             : "~-%s-~",
    "big"               : "~+%s+~",
    "p"                 : "%s",
    "ol"                : "%s",
    "ul"                : "%s",
    "ac:link"           : "[[%s%s|%s]]",
    "ac:image"          : "{{%s%s|%s}}",
    "a"                 : "[[%s|%s]]",
    }

# Block type translations (provided by the common module) extend the table.

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag         MoinMoin syntax
    "br"                : "<<BR>>",
    }

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + blocktypes.keys() + list_tags.keys()
span_override_tags = ["ac:link"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser, writing any translated output to 'out', which
        only needs to provide a 'write' method.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation and nesting state for the element with the
        given 'name' before delegating to the base class with 'name' and
        'attrs'.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # NOTE(review): this relies on the base class having recorded the
        # NOTE(review): element's attributes as the last entry in
        # NOTE(review): self.attributes - confirm against xmlread.Parser.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Delegate to the base class for the element with the given 'name' and
        then undo the state changes made by startElement.
        """

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Outside all nested sections, the recorded depth is discarded.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Pass the given 'content' to the base class, normalising whitespace
        first unless the content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entity names are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string applied to the
                # text below, so any percent signs (in a user-supplied title,
                # for example) must be escaped to be reproduced literally.

                details = details.replace("%", "%%")

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text.strip() or self.target)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or self.have_block and name not in span_override_tags:
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that new line separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        """
        Return a true value if any preformatted element is currently open,
        according to the nesting counts held in self.states.
        """

        return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly within
        the element with the given 'name': nothing for structural and list
        elements, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced according to the
        element with the given 'name' containing it.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before being parsed. Since the
    document is built as a Unicode string and encoded as UTF-8, 's' should be
    a Unicode string (or a plain string containing only ASCII characters).
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode standard input explicitly: interpolating undecoded bytes into the
    # Unicode document template would otherwise fail on non-ASCII content.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4