paul@35 | 1 | #!/usr/bin/env python |
paul@35 | 2 | |
paul@35 | 3 | """ |
paul@35 | 4 | Confluence Wiki XML/XHTML syntax parsing. |
paul@35 | 5 | |
paul@35 | 6 | Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> |
paul@35 | 7 | |
paul@35 | 8 | This software is free software; you can redistribute it and/or |
paul@35 | 9 | modify it under the terms of the GNU General Public License as |
paul@35 | 10 | published by the Free Software Foundation; either version 2 of |
paul@35 | 11 | the License, or (at your option) any later version. |
paul@35 | 12 | |
paul@35 | 13 | This software is distributed in the hope that it will be useful, |
paul@35 | 14 | but WITHOUT ANY WARRANTY; without even the implied warranty of |
paul@35 | 15 | MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the |
paul@35 | 16 | GNU General Public License for more details. |
paul@35 | 17 | |
paul@35 | 18 | You should have received a copy of the GNU General Public |
paul@35 | 19 | License along with this library; see the file LICENCE.txt |
paul@35 | 20 | If not, write to the Free Software Foundation, Inc., |
paul@35 | 21 | 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA |
paul@35 | 22 | """ |
paul@35 | 23 | |
paul@35 | 24 | try: |
paul@35 | 25 | from cStringIO import StringIO |
paul@35 | 26 | except ImportError: |
paul@35 | 27 | from StringIO import StringIO |
paul@35 | 28 | |
paul@35 | 29 | from common import * |
paul@35 | 30 | from xmlread import Parser |
paul@35 | 31 | import re |
paul@35 | 32 | import sys |
paul@35 | 33 | import operator |
paul@35 | 34 | import htmlentitydefs |
paul@41 | 35 | import codecs |
paul@35 | 36 | |
paul@35 | 37 | # XML dialect syntax parsing. |
paul@35 | 38 | |
# Conversion templates for elements whose content is wrapped in MoinMoin
# markup. Each template is applied to the element's text with the % operator;
# the link and image templates take (prefix, target, label).

tags = {
    # Inline formatting.

    "strong": "'''%s'''",
    "em": "''%s''",
    "u": "__%s__",
    "del": "--(%s)--",
    "sup": "^%s^",
    "sub": ",,%s,,",
    "code": "`%s`",
    "small": "~-%s-~",
    "big": "~+%s+~",

    # Preformatted regions.

    "pre": "{{{%s}}}",
    "ac:plain-text-body": "{{{%s}}}",

    # Tables: rows and cells are separated later, when each element ends.

    "table": "{{{#!table\n%s\n}}}",
    "tbody": "%s",
    "tr": "%s",
    "th": "'''%s'''",
    "td": "%s",

    # Block-level containers.

    "blockquote": " %s",
    "p": "%s",
    "ol": "%s",
    "ul": "%s",

    # Confluence links and images.

    "ac:link": "[[%s%s|%s]]",
    "ac:image": "{{%s%s|%s}}",
}

# Merge in the block type translations provided by the common module,
# letting them replace any entry defined above for the same tag.

tags.update(blocktypes)
paul@35 | 67 | |
# Elements replaced outright by fixed MoinMoin markup, regardless of content.

simple_tags = {
    "br": "<<BR>>",
}

# List containers mapped to the template applied to each of their items.

list_tags = {
    "ol": "1. %s",
    "ul": "* %s",
}

# Elements whose text is indented according to the list nesting depth.

indented_tags = ["li", "p"]

# Confluence resource identifier elements mapped to the attribute that holds
# the target of the enclosing link or image.

link_target_tags = {
    "ri:page": "ri:content-title",
    "ri:attachment": "ri:filename",
    "ri:user": "ri:username",
}

# Confluence admonition macro names mapped to MoinMoin admonition styles.

macro_rich_text_styles = {
    "note": "caution",
    "warning": "warning",
    "info": "important",
    "tip": "tip",
}

# Runs of whitespace, collapsed when text is not preformatted.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35 | 98 | |
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Each element is converted to MoinMoin markup when it ends: its collected
    text is translated and appended to the text of its parent element, or
    written to 'out' when there is no parent. The 'elements', 'attributes'
    and 'text' stacks used here are inherited from Parser (presumably
    maintained by its startElement/endElement/characters methods - confirm
    against xmlread).
    """

    def __init__(self, out):

        "Initialise the parser, sending converted output to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information. Parameters are scoped per macro: the enclosing
        # macro's parameters are kept on a stack while a nested macro is open.

        self.macro = None
        self.macro_parameters = {}
        self.macro_parameter_stack = []

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        "Update nesting state for element 'name' before recording it."

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        elif name == "ac:macro":

            # Open a fresh parameter scope so that parameters from earlier or
            # enclosing macros cannot be mistaken for this macro's own.

            self.macro_parameter_stack.append(self.macro_parameters)
            self.macro_parameters = {}

        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        "Complete element 'name' and then restore the nesting state."

        # The state is only restored after the element has been completed,
        # since handleElement reads the indentation and macro parameters.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1
        elif name == "ac:macro":

            # Return to the parameters of the enclosing macro, if any.

            if self.macro_parameter_stack:
                self.macro_parameters = self.macro_parameter_stack.pop()
            else:
                self.macro_parameters = {}

    def characters(self, content):

        "Record 'content', collapsing whitespace unless it is preformatted."

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity 'name' to the current text.
        Names not defined by htmlentitydefs are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the collected text of the completed element 'name' to
        MoinMoin markup and pass it on to the parent element or the output.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements: the item syntax depends on the parent list.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information for the enclosing link or image.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name in ("ac:link", "ac:image"):

            # Without a recognised target, keep the plain label instead of
            # formatting None into the link syntax.

            if self.target is not None:
                if self.target_type == "ri:attachment":
                    prefix = "attachment:"
                elif self.target_type == "ri:user":
                    prefix = ""
                else:
                    prefix = "../"

                text = conversion % (prefix, self.target, text or self.target)

            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro":
            if self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)
                text = "{{{#!wiki %s\n\n%s}}}" % (details, text)

            # Always discard the macro details, whether or not the macro was
            # styled, so that nothing leaks into a later macro.

            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows: every cell after the first in a
        # row, and every row after the first in a table, gets a separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this element from any preceding sibling content.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags or name in list_tags:
                    nodes.append("\n")

            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states.values())

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs directly inside element
        'name': nothing inside purely structural elements, a single space
        elsewhere.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        "Return 'text' with whitespace runs collapsed for element 'name'."

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35 | 280 | |
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in a complete XHTML document, encoded as UTF-8
    and passed to a ConfluenceXMLParser which writes to 'out'.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    fragment = s.replace("]] >", "]]>")

    # The template lines stay unindented so that the XML declaration is the
    # very first thing in the document.

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % fragment

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
paul@35 | 304 | |
if __name__ == "__main__":

    # Convert Confluence content read from standard input, writing MoinMoin
    # markup to standard output, with both streams treated as UTF-8.

    # The input is decoded explicitly because parse interpolates it into a
    # Unicode template: a raw byte string would be implicitly decoded as
    # ASCII and fail on any non-ASCII character.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)
paul@35 | 309 | |
paul@35 | 310 | # vim: tabstop=4 expandtab shiftwidth=4 |