#!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from common import *
from xmlread import Parser
import codecs
import re
import sys
import operator
import htmlentitydefs

# XML dialect syntax parsing.

# Format strings applied to the collected text of each recognised element.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "pre"                   : "{{{%s}}}",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:plain-text-body"    : "{{{%s}}}",
    "ac:link"               : "[[%s%s|%s]]",
    }

# NOTE: blocktypes is not defined in this module; presumably it is provided by
# NOTE: the star import from common.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Elements replaced by fixed text regardless of their content.

simple_tags = {
    # XHTML tag             MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Format strings applied to "li" elements according to the enclosing list.

list_tags = {
    # XHTML list tag        MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Elements whose text is prefixed with one space per open list.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element    Attribute providing the target
    "ri:page"               : "ri:content-title",
    "ri:attachment"         : "ri:filename",
    "ri:user"               : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing converted text to the stream 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Parameters of enclosing macros, saved while a nested macro is open
        # so that each macro only ever sees its own parameters.

        self._macro_parameters_stack = []

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the list depth, preformatted state and macro parameter scope for
        the opening of element 'name' before passing the event to the base
        parser.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        elif name == "ac:macro":

            # Start each macro with an empty parameter collection so that
            # parameters from an earlier or enclosing macro cannot leak in.

            self._macro_parameters_stack.append(self.macro_parameters)
            self.macro_parameters = {}

        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        """
        Pass the closing of element 'name' to the base parser and then undo the
        state changes made when the element was opened.
        """

        # The base parser is invoked first so that the element is converted
        # while the indentation and states still describe its context.
        # NOTE: This assumes Parser.endElement is what calls handleElement.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1
        elif name == "ac:macro":

            # Restore the parameters of any enclosing macro.

            if self._macro_parameters_stack:
                self.macro_parameters = self._macro_parameters_stack.pop()
            else:
                self.macro_parameters = {}

    def characters(self, content):

        """
        Pass 'content' to the base parser, normalising its whitespace unless
        a preformatted element is currently open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity 'name' to the current element's
        text. Entities not known to htmlentitydefs are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the text collected for the completed element 'name' to MoinMoin
        syntax, adding the result to the parent element's text nodes or, where
        there is no parent, writing it to the output stream.
        """

        # NOTE: self.text, self.elements and self.attributes are used as stacks
        # NOTE: whose last item describes this element; presumably they are
        # NOTE: maintained by the Parser base class.

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name == "ac:link":

            # Links without a recognised target element retain only their
            # label text instead of producing a link to "None".

            if self.target is not None:
                if self.target_type == "ri:attachment":
                    prefix = "attachment:"
                elif self.target_type == "ri:user":
                    prefix = ""
                else:
                    prefix = "../"

                text = conversion % (prefix, self.target, text or self.target)

            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro":
            if self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)
                text = "{{{#!wiki %s\n\n%s}}}" % (details, text)

            # Discard the macro details whether or not the macro was styled,
            # so that they cannot affect any later macro.

            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this element from any text already collected for the
            # parent.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags and parent == "li":
                    nodes.append("\n")

            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states.values())

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing each run of whitespace found directly inside
        the element 'name': nothing for the document and list containers, a
        single space otherwise.
        """

        if name in ("html", "body") or name in list_tags:
            return ""
        return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced by the replacement
        appropriate for the element 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content may be given as a Unicode string or as a UTF-8 encoded byte
    string. Unicode text is written to 'out'.
    """

    # Decode byte strings explicitly: interpolating them into the Unicode
    # template below would otherwise decode them as ASCII and fail on any
    # other character. (In Python 2, str is the byte string type.)

    if isinstance(s, str):
        s = s.decode("utf-8")

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Encode the output explicitly since the standard output stream cannot be
    # relied upon to accept non-ASCII Unicode text when redirected.

    s = sys.stdin.read()
    parse(s, codecs.getwriter("utf-8")(sys.stdout))

# vim: tabstop=4 expandtab shiftwidth=4