ConfluenceConverter (file xmlparser.py at a2449a212f99)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s|%s]]",    60     "ac:image"              : "{{%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 list_tags = {    73     # XHTML list tag          MoinMoin list item syntax    74     "ol"                    : "1. %s",    75     "ul"                    : "* %s",    76     }    77     78 indented_tags = ["li", "p"]    79     80 preformatted_tags = ["pre", "ac:plain-text-body"]    81 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    82 formatted_tags    = ["ac:rich-text-body", "table"]    83     84 link_target_tags = {    85     # Confluence element      Attributes providing the target    86     "ri:page"               : ("ri:space-key", "ri:content-title"),    87     "ri:attachment"         : ("ri:filename",),    88     "ri:user"               : ("ri:username",),    89     }    90     91 link_target_prefixes = {    92     # Attribute with details  Prefix ensuring correct relative link    93     "ri:space-key"          : "..",    94     "ri:content-title"      : "..",    95     }    96     97 link_label_attributes = "ri:content-title", "ac:link-body"    98     99 # NOTE: User links should support the intended user namespace prefix.   100    101 link_target_types = {   102     # Confluence element      MoinMoin link prefix   103     "ri:attachment"         : "attachment:",   104     "ri:user"               : "",   105     "ac:link-body"          : "#",   106     }   107    108 macro_rich_text_styles = {   109     # Confluence style        MoinMoin admonition style   110     "note"                  : "caution",   111     "warning"               : "warning",   112     "info"                  : "important",   113     "tip"                   : "tip",   114     }   115    116 normalise_regexp_str = r"\s+"   117 normalise_regexp = re.compile(normalise_regexp_str)   118    119 class ConfluenceXMLParser(Parser):   120    121     "Handle content from Confluence 4 page revisions."   122    123     def __init__(self, out):   124         Parser.__init__(self)   125         self.out = out   126    127         # Link target and label information.   128    129         self.target = None   130         self.target_type = None   131         self.label = None   132    133         # Macro information.   134    135         self.macro = None   136         self.macro_parameters = {}   137    138         # Indentation and element nesting states.   139    140         self.indent = 0   141         self.states = {}   142         self.max_level = self.level = 0   143    144         for name in preformatted_tags + single_level_tags:   145             self.states[name] = 0   146    147         # Table states.   148    149         self.table_rows = 0   150         self.table_columns = 0   151    152     # ContentHandler-related methods.   153    154     def startElement(self, name, attrs):   155    156         # Track indentation for lists.   157    158         if list_tags.has_key(name):   159             self.indent += 1   160    161         # Track element nesting.   162    163         elif self.states.has_key(name):   164             self.states[name] += 1   165    166         # Track cumulative element nesting in order to produce appropriate depth   167         # indicators in the formatted output.   168    169         if name in preformatted_tags or name in formatted_tags:   170             self.level += 1   171             self.max_level = max(self.level, self.max_level)   172    173         Parser.startElement(self, name, attrs)   174    175         # Remember macro information for use within the element.   176    177         if name == "ac:macro":   178             self.macro = self.attributes[-1].get("ac:name")   179    180     def endElement(self, name):   181         Parser.endElement(self, name)   182    183         if list_tags.has_key(name):   184             self.indent -= 1   185         elif self.states.has_key(name):   186             self.states[name] -= 1   187         if name in preformatted_tags or name in formatted_tags:   188             self.level -= 1   189             if not self.level:   190                 self.max_level = 0   191    192     def characters(self, content):   193         if not self.is_preformatted():   194             content = self.normalise(content, self.elements[-1])   195         Parser.characters(self, content)   196    197     def skippedEntity(self, name):   198         ch = htmlentitydefs.name2codepoint.get(name)   199         if ch:   200             self.text[-1].append(unichr(ch))   201    202     # Parser-related methods.   203    204     def handleElement(self, name):   205    206         """   207         Handle the completion of the element with the given 'name'. Any content   208         will either be recorded for later use (by an enclosing element, for   209         example) or emitted in some form.   210         """   211    212         text = "".join(self.text[-1])   213    214         # Handle state.   215    216         if name == "table":   217             self.table_rows = 0   218         elif name == "tr":   219             self.table_columns = 0   220    221         # Find conversions.   222    223         conversion = None   224    225         # Handle list elements.   226    227         if name == "li" and len(self.elements) > 1:   228             list_tag = self.elements[-2]   229             conversion = list_tags.get(list_tag)   230    231         # Remember link target information.   232    233         elif link_target_tags.has_key(name):   234             target_details = []   235    236             # Get target details from the element's attributes.   237    238             for attrname in link_target_tags[name]:   239                 attrvalue = self.attributes[-1].get(attrname)   240                 if attrvalue:   241                     target_details.append(attrvalue)   242                     prefix = link_target_prefixes.get(attrname)   243                     if prefix:   244                         target_details.insert(0, prefix)   245                     if attrname in link_label_attributes and not self.label:   246                         self.label = attrvalue   247    248             # Make a link based on the details.   249    250             self.target = "/".join(target_details)   251             self.target_type = name   252             text = ""   253    254         # For anchor links, just use the raw text and let Moin do the formatting.   255    256         elif name == "ac:link-body":   257             if not self.target_type:   258                 self.target_type = name   259             self.label = text   260             text = ""   261    262         # For conventional links, remember the href attribute as the target.   263    264         elif name == "a":   265             self.target = self.attributes[-1].get("href")   266             self.label = text   267             text = ""   268    269         # Discard macro state.   270    271         elif name == "ac:macro":   272             self.macro = None   273             self.macro_parameters = {}   274    275         # Remember macro information.   276    277         elif name in ("ac:parameter", "ac:default-parameter"):   278             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   279             text = ""   280    281         # Handle single-level tags.   282    283         elif name in single_level_tags and self.states[name] > 1:   284             conversion = "%s"   285    286         # Handle preformatted sections.   287    288         elif name in preformatted_tags or name in formatted_tags:   289    290             # Nest the section appropriately.   291    292             level = 3 + self.max_level - self.level   293             opening = "{" * level   294             closing = "}" * level   295    296             # Macro name information is used to style rich text body regions.   297    298             if name != "table" and self.macro and macro_rich_text_styles.has_key(self.macro):   299                 details = macro_rich_text_styles[self.macro]   300                 title = self.macro_parameters.get("title")   301                 if title:   302                     details = "%s\n\n%s" % (details, title)   303    304                 conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)   305    306             elif name == "table":   307                 conversion = "%s#!table\n%%s\n%s" % (opening, closing)   308    309             else:   310                 conversion = "%s%%s%s" % (opening, closing)   311    312         # Handle the common case and simpler special cases.   313    314         if not conversion:   315             conversion = tags.get(name)   316    317         # Attempt to convert the text.   318    319         # Links require target information.   320    321         if name in ("ac:link", "ac:image"):   322             prefix = link_target_types.get(self.target_type, "")   323             anchor = self.attributes[-1].get("ac:anchor")   324             text = conversion % (prefix, anchor or self.target, self.label or text or self.target)   325             self.target = self.target_type = self.label = None   326    327         elif name == "a":   328             text = conversion % (self.target, self.label)   329             self.target = self.target_type = self.label = None   330    331         # Handle the common case.   332    333         elif text and conversion:   334             text = conversion % text   335         elif simple_tags.has_key(name):   336             text = simple_tags[name]   337    338         # Postprocess table columns and rows.   339    340         if name in ("th", "td"):   341             if self.table_columns:   342                 text = "\n|| %s" % text   343             self.table_columns += 1   344         elif name == "tr":   345             if self.table_rows:   346                 text = "\n==\n%s" % text   347             self.table_rows += 1   348    349         # Normalise leading whitespace and indent the text if appropriate.   350    351         if name in indented_tags:   352             text = " " * self.indent + text.lstrip()   353    354         # Add the converted text to the end of the parent element's text nodes.   355    356         if len(self.text) > 1:   357             nodes = self.text[-2]   358             if "".join(self.text[-2]):   359                 parent = self.elements[-2]   360                 if parent == "body":   361                     nodes.append("\n\n")   362                 elif list_tags.has_key(parent):   363                     nodes.append("\n")   364                 elif list_tags.has_key(name):   365                     nodes.append("\n")   366             nodes.append(text)   367    368         # Otherwise, emit the text.   369    370         else:   371             self.out.write(text)   372    373     def is_preformatted(self):   374         return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)   375    376     # Whitespace normalisation.   377    378     def get_replacement(self, name):   379         if name in ("html", "body", "table", "tbody", "tr") or list_tags.has_key(name):   380             return ""   381         else:   382             return " "   383    384     def normalise(self, text, name):   385         return normalise_regexp.sub(self.get_replacement(name), text)   386    387 def parse(s, out):   388    389     "Parse the content in the string 's', writing a translation to 'out'."   390    391     # NOTE: CDATA sections appear to have erroneous endings.   392    393     s = u"""\   394 <?xml version="1.0"?>   395 <!DOCTYPE html    396      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   397      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   398 <html xmlns="http://www.w3.org/1999/xhtml">   399 <body>   400 %s   401 </body>   402 </html>""" % s.replace("]] >", "]]>")   403    404     f = StringIO(s.encode("utf-8"))   405     try:   406         parser = ConfluenceXMLParser(out)   407         parser.parse(f)   408     finally:   409         f.close()   410    411 if __name__ == "__main__":   412     s = sys.stdin.read()   413     out = codecs.getwriter("utf-8")(sys.stdout)   414     parse(s, out)   415    416 # vim: tabstop=4 expandtab shiftwidth=4