ConfluenceConverter (file xmlparser.py at e6f73b0dfd83)

     #!/usr/bin/env python

"""
Confluence Wiki XML/XHTML syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from common import *
from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs
import codecs

# XML dialect syntax parsing.

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "pre"                   : "{{{%s}}}",
    "table"                 : "{{{#!table\n%s\n}}}",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:plain-text-body"    : "{{{%s}}}",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    }

# Merge in the block type translations provided by the common module,
# overriding any entries above that share a tag name.

tags.update(blocktypes)

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Elements whose converted text is indented to the current list depth.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element      Attribute providing the target
    "ri:page"               : "ri:content-title",
    "ri:attachment"         : "ri:filename",
    "ri:user"               : "ri:username",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        "Initialise the parser, writing converted text to the stream 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.
        # Each macro collects its parameters in its own dictionary; the
        # dictionaries of enclosing macros wait on the stack meanwhile.

        self.macro = None
        self.macro_parameters = {}
        self.macro_parameters_stack = []

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Track list depth, preformatted nesting and macro parameter scope for
        the element 'name' before handing it to the base parser.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1

        # Open a fresh parameter scope so that parameters cannot leak between
        # sibling macros or between a macro and the macros nested inside it.

        if name == "ac:macro":
            self.macro_parameters_stack.append(self.macro_parameters)
            self.macro_parameters = {}

        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        """
        Let the base parser finish the element 'name' and then undo the list
        depth or preformatted nesting recorded when it was opened.
        """

        Parser.endElement(self, name)
        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

    def characters(self, content):

        """
        Pass 'content' to the base parser, collapsing whitespace first unless
        a preformatted region is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity 'name' to the current text,
        ignoring entities that are not in the HTML entity table.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the text collected for the element 'name' to MoinMoin syntax
        and either add it to the parent element's text nodes or, where there
        is no parent, write it to the output stream.

        NOTE(review): this relies on self.elements, self.attributes and
        self.text being stacks whose last entry describes the current
        element; they are presumably maintained by Parser - confirm there.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.
        # NOTE(review): a link without a recognised ri:* child leaves
        # NOTE(review): self.target as None, which is formatted as "None".

        if name in ("ac:link", "ac:image"):
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and self.macro in macro_rich_text_styles:
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Close the macro's parameter scope whether or not the macro was
        # styled, restoring the parameters of any enclosing macro.

        if name == "ac:macro":
            self.macro = None
            if self.macro_parameters_stack:
                self.macro_parameters = self.macro_parameters_stack.pop()
            else:
                self.macro_parameters = {}

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this text from any text the parent already holds.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted region is currently open."

        return any(self.states.values())

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly inside
        the element 'name': nothing for structural and list elements, a single
        space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the enclosing element 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document before parsing and is encoded
    as UTF-8 for the parser, so 's' should be a Unicode string (or a plain
    ASCII string).
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Decode the input explicitly: interpolating undecoded bytes into the
    # Unicode document template fails for any non-ASCII content.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4