#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "pre"                   : "{{{%s}}}",
    "table"                 : "{{{#!table\n%s\n}}}",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:plain-text-body"    : "{{{%s}}}",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    }

# Add the block type translations provided by the common module.

for tag, translation in blocktypes.items():
    tags[tag] = translation

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element    Attribute providing the target
    "ri:page"               : "ri:content-title",
    "ri:attachment"         : "ri:filename",
    "ri:user"               : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing converted text to the stream 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information. The parameters collected for enclosing macros
        # are kept on a stack while a nested macro is being processed so that
        # each macro only ever sees its own parameters.

        self.macro = None
        self.macro_parameters = {}
        self.macro_parameter_stack = []

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the list nesting level, preformatted region counters and macro
        parameter scope for the element 'name' before passing the event to
        the base parser.
        """

        # Start a fresh parameter collection for each macro, remembering the
        # parameters gathered so far for any enclosing macro.

        if name == "ac:macro":
            self.macro_parameter_stack.append(self.macro_parameters)
            self.macro_parameters = {}

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        """
        Pass the end of element 'name' to the base parser and then undo the
        state changes made when the element was started.
        """

        Parser.endElement(self, name)
        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

    def characters(self, content):

        """
        Record the character data in 'content', normalising whitespace unless
        the data appears inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the named HTML entity 'name' to the current
        text. Names not known to htmlentitydefs are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the collected text of the element 'name' to MoinMoin syntax,
        appending the result to the parent element's text or, where there is
        no parent text collection, writing it to the output stream.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.
        # NOTE(review): a link or image without a recognised target element
        # NOTE(review): leaves the target as None, which is then formatted as
        # NOTE(review): the text "None"; confirm whether this is intended.

        if name in ("ac:link", "ac:image"):
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and self.macro in macro_rich_text_styles:
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Discard the state of every finished macro, not only the styled ones,
        # so that its parameters cannot leak into a later macro, and restore
        # the parameters of any enclosing macro.

        if name == "ac:macro":
            self.macro = None
            if self.macro_parameter_stack:
                self.macro_parameters = self.macro_parameter_stack.pop()
            else:
                self.macro_parameters = {}

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate the text from any existing sibling content.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the parser is inside any preformatted region."

        return any(self.states.values())

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly inside
        the element 'name': nothing for purely structural elements and a
        single space for everything else.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced according to the
        enclosing element 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode the input explicitly: the content is interpolated into a Unicode
    # template by parse, and an undecoded byte string would be subjected to an
    # implicit ASCII decoding that fails for any non-ASCII character.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4