ConfluenceConverter (file xmlparser.py at 5864968f5999)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    

# Conversion tables used by the parser class defined later in this module.
# Names used here but not defined in this file (such as blocktypes) are
# presumably provided by the star import from the common module.

# Format strings applied to the text content of completed elements. Most take
# a single value; "ac:link" and "ac:image" take (prefix, target, label) and
# "a" takes (target, label).

tags = {
    # XHTML tag               MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    "a"                     : "[[%s|%s]]",
    }

# Merge in the block type translations, overriding any entry above that uses
# the same tag name.

for tag, translation in blocktypes.items():
    tags[tag] = translation

# Replacement text for elements handled without any text content when outside
# preformatted regions.

simple_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "<<BR>>",
    }

# Replacement text for the same elements when inside preformatted regions.

simple_preformatted_tags = {
    # XHTML tag               MoinMoin syntax
    "br"                    : "\n",
    }

# Format strings applied to list items according to the enclosing list element.

list_tags = {
    # XHTML list tag          MoinMoin list item syntax
    "ol"                    : "1. %s",
    "ul"                    : "* %s",
    }

# Element classifications consulted by the parser:
#
# preformatted_tags   content is kept verbatim (no whitespace normalisation)
# single_level_tags   inline markup applied only once when nested within itself
# formatted_tags      content is wrapped in a nested "{{{...}}}" region

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags    = ["ac:rich-text-body", "table"]

# indented_tags       output is prefixed with the current list indentation
# block_tags          output starts on a new line when text precedes it
# span_override_tags  inline elements exempt from inherited block separation

indented_tags = ["li", "p"] + preformatted_tags + formatted_tags
block_tags = indented_tags + blocktypes.keys() + list_tags.keys()
span_override_tags = ["ac:link"]

# Attributes are read in the order given and joined with "/" to form the link
# target.

link_target_tags = {
    # Confluence element      Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

# Each listed attribute that has a value causes its prefix to be inserted at
# the start of the target path.

link_target_prefixes = {
    # Attribute with details  Prefix ensuring correct relative link
    "ri:space-key"          : "..",
    "ri:content-title"      : "..",
    }

# Target attributes whose values may also serve as the link label when no
# label has been recorded yet.

link_label_attributes = "ri:content-title", "ac:link-body"

# NOTE: User links should support the intended user namespace prefix.

# Prefixes placed before a link target according to the kind of element that
# supplied the target.

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

# Runs of whitespace are collapsed outside preformatted regions.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    The element, attribute and text stacks used throughout (self.elements,
    self.attributes, self.text) are not defined in this class and are
    presumably maintained by the Parser base class, which is also assumed to
    call handleElement as each element is completed.
    """

    def __init__(self, out):

        """
        Initialise the parser with the stream 'out', to which the converted
        text is written.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the element with
        the given 'name' and 'attrs' before and after delegating to the base
        class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back from a previous heading do not apply to a new one.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made in startElement.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        # Leaving the outermost region resets the recorded maximum depth.

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the character data in 'content', collapsing whitespace unless
        inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unknown entity names are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name of the enclosing element
        # (the second-to-last entry on the attribute stack).

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences pass their text through
        # unchanged so that the markup is only applied once.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: the deeper the regions found
            # inside this one, the more braces this one needs.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            target = anchor or self.target
            label = self.label or text.strip() or self.target
            if label is None:
                label = target

            # Without any recorded target, emit the label alone instead of
            # formatting None into the markup as the text "None".

            if target is None:
                text = label or ""
            else:
                text = conversion % (prefix, target, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # An element without an href attribute is not a link: keep only
            # its text instead of producing "[[None|...]]".

            if self.target is None:
                text = self.label or ""
            else:
                text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)

                # NOTE(review): a macro lacking the parameter named by its
                # macroargs entry or by its format string raises KeyError below.

                if argnames:
                    confargname, moinargname = argnames
                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[confargname]))
                text = conversion % parameters

                # Anchors inside headings or links are held back and emitted
                # before the enclosing heading instead.

                if self.macro == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows: every cell but the first in a row
        # and every row but the first in a table gets a leading separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        """
        Return whether any currently open element is a heading or an "a"
        element.
        """

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace directly inside the
        element with the given 'name': nothing for purely structural elements
        and lists, a single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # The content is wrapped in a complete XHTML document before parsing.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 text from standard input to UTF-8 text on standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4