ConfluenceConverter (file xmlparser.py at 589001c9539e)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    
39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s%s|%s]]",    60     "ac:image"              : "{{%s%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 simple_preformatted_tags = {    73     # XHTML tag               MoinMoin syntax    74     "br"                    : "\n",    75     }    76     77 list_tags = {    78     # XHTML list tag          MoinMoin list item syntax    79     "ol"                    : "1. 
%s",    80     "ul"                    : "* %s",    81     }    82     83 preformatted_tags = ["pre", "ac:plain-text-body"]    84 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    85 formatted_tags    = ["ac:rich-text-body", "table"]    86     87 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    88 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    89 span_override_tags = ["ac:link"]    90     91 link_target_tags = {    92     # Confluence element      Attributes providing the target    93     "ri:page"               : ("ri:space-key", "ri:content-title"),    94     "ri:attachment"         : ("ri:filename",),    95     "ri:user"               : ("ri:username",),    96     }    97     98 link_target_prefixes = {    99     # Attribute with details  Prefix ensuring correct relative link   100     "ri:space-key"          : "..",   101     "ri:content-title"      : "..",   102     }   103    104 link_label_attributes = "ri:content-title", "ac:link-body"   105    106 # NOTE: User links should support the intended user namespace prefix.   
link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "color"                 : ("color", "col"),
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    }

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Element events are translated into MoinMoin markup. The text produced for
    each completed element is appended to the text collected for its parent,
    and the text of the outermost element is written to the 'out' stream.

    NOTE(review): the 'elements', 'attributes' and 'text' stacks used here are
    presumably maintained by the Parser base class in xmlread, which is not
    visible in this file; confirm against that module.
    """

    def __init__(self, out):

        "Initialise the parser with 'out' as the stream receiving the result."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the element with
        the given 'name' and 'attrs' before and after delegating to the base
        class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        # Anchors held back from an earlier heading do not carry over.

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made when the element was started.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Outside all regions, the depth calculation starts afresh.

            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

    def characters(self, content):

        """
        Record the given character 'content', collapsing whitespace unless the
        content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Entities not known to htmlentitydefs are
        ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.

        Link and anchor elements lacking target information are rendered
        without inserting the text "None" into the output.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored using the name found in the attributes
        # of the enclosing element.

        elif name == "ac:default-parameter":
            self.macro_parameters[self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string applied to the
                # text below, so literal percent signs must be escaped.

                details = details.replace("%", "%%")
                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")

            # Links consisting only of an anchor have no target element, so
            # an empty target is used instead of formatting None.

            target = self.target or u""
            anchor = self.attributes[-1].get("ac:anchor") or ""
            fragment = ("#%s" % anchor) if anchor else ""
            label = self.label or text.strip() or target
            text = conversion % (prefix, target, fragment, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Without a href attribute there is nothing to link to, so only
            # the label is retained.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or u""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name == "ac:macro":
            conversion = macrotypes.get(self.macro)
            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters)
                argnames = macroargs.get(self.macro)
                if argnames:
                    confargname, moinargname = argnames

                    # NOTE(review): a macro lacking the expected parameter
                    # raises KeyError here and in the substitution below.

                    parameters["args"] = quote_macro_argument("%s=%s" % (moinargname, self.macro_parameters[confargname]))
                text = conversion % parameters

                # Anchors cannot be emitted where macros are forbidden, so
                # they are held back for the enclosing heading.

                if self.macro == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]

        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            # Existing block state is kept for non-block elements.

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether any open element is a heading or a conventional link."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing whitespace found directly inside the element
        with the given 'name': nothing for structural elements, a single space
        otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace substituted according to the
        element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in an XHTML document and encoded as UTF-8 before
    being passed to the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":

    # Convert UTF-8 content from standard input to standard output.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4