#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from MoinMoin import wikiutil
from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

# Format strings applied to the collected text of a completed element.
# Link-like elements take more than one value (see handleElement).

tags = {
    # XHTML tag     MoinMoin syntax
    "strong"        : "'''%s'''",
    "em"            : "''%s''",
    "u"             : "__%s__",
    "del"           : "--(%s)--",
    "sup"           : "^%s^",
    "sub"           : ",,%s,,",
    "code"          : "`%s`",
    "tbody"         : "%s",
    "tr"            : "%s",
    "th"            : "'''%s'''",
    "td"            : "%s",
    "blockquote"    : " %s",
    "small"         : "~-%s-~",
    "big"           : "~+%s+~",
    "p"             : "%s",
    "ol"            : "%s",
    "ul"            : "%s",
    "ac:link"       : "[[%s%s|%s]]",
    "ac:image"      : "{{%s%s|%s}}",
    "a"             : "[[%s|%s]]",
    }

# Merge in the block type translations provided by the common module.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Elements translated to fixed text when they have no usable content.

simple_tags = {
    # XHTML tag     MoinMoin syntax
    "br"            : "<<BR>>",
    }

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }

# Elements whose text is indented according to the current list depth.

indented_tags = ["li", "p"]

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

link_target_prefixes = {
    # Attribute with details    Prefix ensuring correct relative link
    "ri:space-key"              : "..",
    "ri:content-title"          : "..",
    }

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        """
        Initialise the parser, arranging for translated text to be written to
        the 'out' object (anything providing a 'write' method).
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.
        # NOTE(review): these are plain counters rather than stacks, so a table
        # NOTE(review): nested inside a cell would appear to reset the counts
        # NOTE(review): of the enclosing table - confirm whether this occurs.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Record the opening of the element with the given 'name' and 'attrs',
        updating list indentation, nesting counts and section depth before
        passing the event on to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.
        # The attribute stack is not initialised in this class and is thus
        # expected to be maintained by the base class.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Record the closing of the element with the given 'name', passing the
        event on to the base class first and then undoing the state changes
        made by startElement.
        """

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Forget the maximum depth once outside all formatted sections.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Handle the character data in 'content', normalising whitespace unless
        inside a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Entities without a known code point are
        ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = "/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags.
        # A count above one indicates the same kind of element enclosing this
        # one, in which case the markup is not repeated.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.
            # Outer sections get more braces than the sections they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The doubled percent sign leaves a placeholder for the text.

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text or self.target)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows.
        # Separators are only placed before cells and rows following the first.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this text from any text already recorded by the parent.

            if "".join(self.text[-2]):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly inside the
        element with the given 'name': nothing for structural and list
        elements, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the enclosing element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Where 's' is a byte string, it is decoded as UTF-8 before being placed in
    the Unicode document template, since the implicit ASCII decoding otherwise
    performed would fail for any non-ASCII content.
    """

    if isinstance(s, bytes):
        s = s.decode("utf-8")

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # The parser is given the document as UTF-8 encoded bytes.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Translate standard input, writing UTF-8 encoded output.

    s = sys.stdin.read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4