# ConfluenceConverter (file xmlparser.py at f9771c857a29)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from common import *    30 from xmlread import Parser    31 import re    32 import sys    33 import operator    34 import htmlentitydefs    35     36 # XML dialect syntax parsing.    
37     38 tags = {    39     # XHTML tag               MoinMoin syntax    40     "strong"                : "'''%s'''",    41     "em"                    : "''%s''",    42     "u"                     : "__%s__",    43     "del"                   : "--(%s)--",    44     "sup"                   : "^%s^",    45     "sub"                   : ",,%s,,",    46     "code"                  : "`%s`",    47     "pre"                   : "{{{%s}}}",    48     "blockquote"            : " %s",    49     "small"                 : "~-%s-~",    50     "big"                   : "~+%s+~",    51     "p"                     : "%s",    52     "ol"                    : "%s",    53     "ul"                    : "%s",    54     "ac:plain-text-body"    : "{{{%s}}}",    55     "ac:link"               : "[[%s%s|%s]]",    56     }    57     58 for tag, translation in blocktypes.items():    59     tags[tag] = translation    60     61 simple_tags = {    62     # XHTML tag               MoinMoin syntax    63     "br"                    : "<<BR>>",    64     }    65     66 list_tags = {    67     # XHTML list tag          MoinMoin list item syntax    68     "ol"                    : "1. 
%s",    69     "ul"                    : "* %s",    70     }    71     72 indented_tags = ["li", "p"]    73     74 link_target_tags = {    75     # Confluence element      Attribute providing the target    76     "ri:page"               : "ri:content-title",    77     "ri:attachment"         : "ri:filename",    78     "ri:user"               : "ri:username",    79     }    80     81 macro_rich_text_styles = {    82     # Confluence style        MoinMoin admonition style    83     "note"                  : "caution",    84     "warning"               : "warning",    85     "info"                  : "important",    86     "tip"                   : "tip",    87     }    88     89 normalise_regexp_str = r"\s+"    90 normalise_regexp = re.compile(normalise_regexp_str)    91     92 class ConfluenceXMLParser(Parser):    93     94     "Handle content from Confluence 4 page revisions."    95     96     def __init__(self, out):    97         Parser.__init__(self)    98         self.out = out    99    100         # Link target information.   101    102         self.target = None   103         self.target_type = None   104    105         # Macro information.   106    107         self.macro = None   108         self.macro_parameters = {}   109    110         # Indentation and preformatted states.   111    112         self.indent = 0   113         self.states = {}   114         for name in ("pre", "ac:plain-text-body"):   115             self.states[name] = 0   116    117     # ContentHandler-related methods.   
118    119     def startElement(self, name, attrs):   120         if list_tags.has_key(name):   121             self.indent += 1   122         elif self.states.has_key(name):   123             self.states[name] += 1   124         Parser.startElement(self, name, attrs)   125    126     def endElement(self, name):   127         Parser.endElement(self, name)   128         if list_tags.has_key(name):   129             self.indent -= 1   130         elif self.states.has_key(name):   131             self.states[name] -= 1   132    133     def characters(self, content):   134         if not self.is_preformatted():   135             content = self.normalise(content, self.elements[-1])   136         Parser.characters(self, content)   137    138     def skippedEntity(self, name):   139         ch = htmlentitydefs.name2codepoint.get(name)   140         if ch:   141             self.text[-1].append(unichr(ch))   142    143     # Parser-related methods.   144    145     def handleElement(self, name):   146         text = "".join(self.text[-1])   147         conversion = None   148    149         # Handle list elements.   150    151         if name == "li" and len(self.elements) > 1:   152             list_tag = self.elements[-2]   153             conversion = list_tags.get(list_tag)   154    155         # Remember link target information.   156    157         elif link_target_tags.has_key(name):   158             self.target = self.attributes[-1].get(link_target_tags[name])   159             self.target_type = name   160             text = ""   161    162         # Remember macro information.   163    164         elif name == "ac:parameter":   165             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   166             text = ""   167    168         elif name == "ac:macro":   169             self.macro = self.attributes[-1].get("ac:name")   170    171         # Handle the common case.   
172    173         else:   174             conversion = tags.get(name)   175    176         # Attempt to convert the text.   177    178         # Links require target information.   179         # NOTE: User links should support the intended user namespace prefix.   180    181         if name == "ac:link":   182             if self.target_type == "ri:attachment":   183                 prefix = "attachment:"   184             elif self.target_type == "ri:user":   185                 prefix = ""   186             else:   187                 prefix = "../"   188    189             text = conversion % (prefix, self.target, text or self.target)   190             self.target = self.target_type = None   191    192         # Macro name information is used to style rich text body regions.   193    194         elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):   195             details = macro_rich_text_styles[self.macro]   196             title = self.macro_parameters.get("title")   197             if title:   198                 details = "%s\n\n%s" % (details, title)   199             text = "{{{#!wiki %s\n\n%s}}}" % (details, text)   200             self.macro = None   201             self.macro_parameters = {}   202    203         # Handle the common case.   204    205         elif text and conversion:   206             text = conversion % text   207         elif simple_tags.has_key(name):   208             text = simple_tags[name]   209    210         # Normalise leading whitespace and indent the text if appropriate.   211    212         if name in indented_tags:   213             text = " " * self.indent + text.lstrip()   214    215         # Add the converted text to the end of the parent element's text nodes.   
216    217         if len(self.text) > 1:   218             nodes = self.text[-2]   219             if "".join(self.text[-2]):   220                 parent = self.elements[-2]   221                 if parent == "body":   222                     nodes.append("\n\n")   223                 elif list_tags.has_key(parent):   224                     nodes.append("\n")   225                 elif list_tags.has_key(name) and parent == "li":   226                     nodes.append("\n")   227             nodes.append(text)   228    229         # Otherwise, emit the text.   230    231         else:   232             self.out.write(text)   233    234     def is_preformatted(self):   235         return reduce(operator.or_, self.states.values(), False)   236    237     # Whitespace normalisation.   238    239     def get_replacement(self, name):   240         if name in ("html", "body") or list_tags.has_key(name):   241             return ""   242         else:   243             return " "   244    245     def normalise(self, text, name):   246         return normalise_regexp.sub(self.get_replacement(name), text)   247    248 def parse(s, out):   249    250     "Parse the content in the string 's', writing a translation to 'out'."   251    252     # NOTE: CDATA sections appear to have erroneous endings.   253    254     s = u"""\   255 <?xml version="1.0"?>   256 <!DOCTYPE html    257      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   258      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   259 <html xmlns="http://www.w3.org/1999/xhtml">   260 <body>   261 %s   262 </body>   263 </html>""" % s.replace("]] >", "]]>")   264    265     f = StringIO(s.encode("utf-8"))   266     try:   267         parser = ConfluenceXMLParser(out)   268         parser.parse(f)   269     finally:   270         f.close()   271    272 if __name__ == "__main__":   273     s = sys.stdin.read()   274     parse(s, sys.stdout)   275    276 # vim: tabstop=4 expandtab shiftwidth=4