# ConfluenceConverter (file xmlparser.py at 1d93ae7fe8ed)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    
39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s%s|%s]]",    60     "ac:image"              : "{{%s%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 simple_preformatted_tags = {    73     # XHTML tag               MoinMoin syntax    74     "br"                    : "\n",    75     }    76     77 list_tags = {    78     # XHTML list tag          MoinMoin list item syntax    79     "ol"                    : "1. 
%s",    80     "ul"                    : "* %s",    81     }    82     83 preformatted_tags = ["pre", "ac:plain-text-body"]    84 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    85 formatted_tags    = ["ac:layout", "ac:rich-text-body", "table"]    86     87 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    88 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    89 span_override_tags = ["ac:link"]    90     91 link_target_tags = {    92     # Confluence element      Attributes providing the target    93     "ri:page"               : ("ri:space-key", "ri:content-title"),    94     "ri:attachment"         : ("ri:filename",),    95     "ri:user"               : ("ri:username",),    96     }    97     98 link_target_prefixes = {    99     # Attribute with details  Prefix ensuring correct relative link   100     "ri:space-key"          : "..",   101     "ri:content-title"      : "..",   102     }   103    104 link_label_attributes = "ri:content-title", "ac:link-body"   105    106 # NOTE: User links should support the intended user namespace prefix.   

link_target_types = {
    # Confluence element      MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    }

macro_rich_text_styles = {
    # Confluence style        MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    "excerpt"               : "",
    }

macroargs = {
    # Confluence macro        Confluence and MoinMoin macro arguments
    "attachments"           : [("page", "pagename")],
    "color"                 : [("color", "col")],
    }

macrotypes = {
    # Confluence macro        MoinMoin syntax
    "anchor"                : "<<Anchor(%(anchor)s)>>",
    "attachments"           : "<<AttachList(%(args)s)>>",
    "color"                 : "<<Color2(%(content)s, %(args)s)>>",
    "recently-updated"      : "<<RecentChanges>>",
    "toc"                   : "<<TableOfContents>>",
    }

# Runs of whitespace are collapsed outside preformatted sections.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    The 'text', 'elements' and 'attributes' stacks used throughout are
    presumably maintained by the Parser base class (not visible here), with
    the last entry of each describing the current element.
    """

    def __init__(self, out, is_comment_page=False):

        """
        Initialise the parser to write converted text to 'out'. Where
        'is_comment_page' is set to a true value, any prefix inserted before
        link target details is inserted twice.
        """

        Parser.__init__(self)
        self.out = out
        self.is_comment_page = is_comment_page

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macros = []
        self.macro_parameters = []
        self.held_anchors = []

        # Indentation and element nesting states.

        self.indents = [0]
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation, nesting and macro state for the element with
        the given 'name' and 'attrs', delegating the remaining work to the
        Parser base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indents.append(self.indents[-1] + 1)

        # Track element nesting.

        if name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate depth
        # indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

            # Reset indentation within regions.

            self.indents.append(0)

        if name in headings:
            self.held_anchors = []

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.append(self.attributes[-1].get("ac:name"))
            self.macro_parameters.append({})

    def endElement(self, name):

        """
        Complete the element with the given 'name', undoing the state changes
        made when the element was started.
        """

        # Reset the indent for any preformatted/formatted region so that it may
        # itself be indented.

        if name in preformatted_tags or name in formatted_tags:
            self.indents.pop()

        Parser.endElement(self, name)

        if name in list_tags:
            self.indents.pop()

        if name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1
            if not self.level:
                self.max_level = 0

        # Discard macro state.

        if name in ("ac:macro", "ac:structured-macro"):
            self.macros.pop()
            self.macro_parameters.pop()

    def characters(self, content):

        """
        Record the given text 'content', normalising whitespace unless inside
        a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:

                    # Obtain a link label.

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

                    # Validate any page title.

                    if attrname == "ri:content-title":
                        attrvalue = get_page_title(attrvalue)
                    target_details.append(attrvalue)

                    # Insert any prefix required for the link.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                        if self.is_comment_page:
                            target_details.insert(0, prefix)

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.
        # Set an empty default target, overwriting it if enclosing elements
        # specify target details.

        elif name in ("ac:link-body", "ac:plain-text-link-body"):
            self.target = self.target or ""
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[-1][self.attributes[-1].get("ac:name")] = text
            text = ""

        # Default parameters are stored under the name given by the enclosing
        # element's attributes.

        elif name == "ac:default-parameter":
            self.macro_parameters[-1][self.attributes[-2].get("ac:name")] = text
            text = ""

        # Handle single-level tags.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macros and self.macros[-1] in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macros[-1]]
                title = self.macro_parameters[-1].get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                # Preformatted sections containing newlines must contain an initial
                # newline.

                if "\n" in text and not text.startswith("\n"):
                    opening += "\n"

                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)



        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor") or ""

            # Where no target was recorded, use an empty target (as is done for
            # link bodies) instead of formatting None into the markup.

            target = self.target or ""
            label = self.label or text.strip() or target or anchor
            anchor_suffix = ("#%s" % anchor) if anchor else ""
            text = conversion % (prefix, target, anchor_suffix, label)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Without a href there is nothing to link to, so only the label is
            # emitted instead of a link to None.

            if self.target:
                text = conversion % (self.target, self.label or self.target)
            else:
                text = self.label or ""
            self.target = self.target_type = self.label = None

        # Macros require various kinds of information.
        # Some macros affect the formatting of their contents, whereas other
        # simpler macros are handled here.

        elif name in ("ac:macro", "ac:structured-macro"):
            conversion = macrotypes.get(self.macros[-1])

            # Produce the converted macro.

            if conversion:
                parameters = {"content" : text}
                parameters.update(self.macro_parameters[-1])
                argnames = macroargs.get(self.macros[-1])

                # Convert Confluence arguments to Moin arguments. Unlike the
                # wiki markup parser, multiple arguments are supported.

                if argnames:
                    all_args = []
                    for confargname, moinargname in argnames:
                        argvalue = self.macro_parameters[-1].get(confargname)
                        if argvalue:
                            all_args.append(quote_macro_argument("%s=%s" % (moinargname, argvalue)))
                    parameters["args"] = ", ".join(all_args)

                # Obtain the Moin macro with parameters substituted.

                text = conversion % parameters
                if self.macros[-1] == "anchor" and self.forbids_macros():
                    self.held_anchors.append(text)
                    text = ""

            # Warn about macros that are not converted.

            elif self.macros[-1] not in macro_rich_text_styles:
                print >>sys.stderr, "No conversion possible for macro", self.macros[-1]
                print >>sys.stderr, "Macro has arguments", self.macro_parameters[-1]
                print >>sys.stderr

        # Handle the common cases for parameterised and unparameterised
        # substitutions.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags and not self.is_preformatted():
            text = simple_tags[name]
        elif name in simple_preformatted_tags and self.is_preformatted():
            text = simple_preformatted_tags[name]



        # Postprocess table columns and rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Postprocess held anchor tags in headings.

        elif name in headings and self.held_anchors:
            text = "%s\n%s" % ("".join(self.held_anchors), text)



        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indents[-1] + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that newline separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    def forbids_macros(self):

        "Return whether any open element is a heading or a conventional link."

        return any(tag in headings or tag == "a" for tag in self.elements)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing runs of whitespace found directly within the
        element with the given 'name': nothing for structural elements, a
        single space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with whitespace runs replaced as appropriate
        for the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)

def parse(s, out, is_comment_page=False):

    """
    Parse the content in the string 's', writing a translation to 'out'. The
    content is wrapped in an XHTML document and encoded as UTF-8 before being
    parsed. The 'is_comment_page' flag is passed on to the parser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html 
     PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
     "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out, is_comment_page)
        parser.parse(f)
    finally:
        f.close()

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4