ConfluenceConverter (file xmlparser.py at b7133a21ad01)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki XML/XHTML syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 try:    25     from cStringIO import StringIO    26 except ImportError:    27     from StringIO import StringIO    28     29 from MoinMoin import wikiutil    30 from common import *    31 from xmlread import Parser    32 import re    33 import sys    34 import operator    35 import htmlentitydefs    36 import codecs    37     38 # XML dialect syntax parsing.    39     40 tags = {    41     # XHTML tag               MoinMoin syntax    42     "strong"                : "'''%s'''",    43     "em"                    : "''%s''",    44     "u"                     : "__%s__",    45     "del"                   : "--(%s)--",    46     "sup"                   : "^%s^",    47     "sub"                   : ",,%s,,",    48     "code"                  : "`%s`",    49     "tbody"                 : "%s",    50     "tr"                    : "%s",    51     "th"                    : "'''%s'''",    52     "td"                    : "%s",    53     "blockquote"            : " %s",    54     "small"                 : "~-%s-~",    55     "big"                   : "~+%s+~",    56     "p"                     : "%s",    57     "ol"                    : "%s",    58     "ul"                    : "%s",    59     "ac:link"               : "[[%s%s|%s]]",    60     "ac:image"              : "{{%s%s|%s}}",    61     "a"                     : "[[%s|%s]]",    62     }    63     64 for tag, translation in blocktypes.items():    65     tags[tag] = translation    66     67 simple_tags = {    68     # XHTML tag               MoinMoin syntax    69     "br"                    : "<<BR>>",    70     }    71     72 list_tags = {    73     # XHTML list tag          MoinMoin list item syntax    74     "ol"                    : "1. %s",    75     "ul"                    : "* %s",    76     }    77     78 preformatted_tags = ["pre", "ac:plain-text-body"]    79 single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]    80 formatted_tags    = ["ac:rich-text-body", "table"]    81     82 indented_tags = ["li", "p"] + preformatted_tags + formatted_tags    83 block_tags = indented_tags + blocktypes.keys() + list_tags.keys()    84 span_override_tags = ["ac:link"]    85     86 link_target_tags = {    87     # Confluence element      Attributes providing the target    88     "ri:page"               : ("ri:space-key", "ri:content-title"),    89     "ri:attachment"         : ("ri:filename",),    90     "ri:user"               : ("ri:username",),    91     }    92     93 link_target_prefixes = {    94     # Attribute with details  Prefix ensuring correct relative link    95     "ri:space-key"          : "..",    96     "ri:content-title"      : "..",    97     }    98     99 link_label_attributes = "ri:content-title", "ac:link-body"   100    101 # NOTE: User links should support the intended user namespace prefix.   102    103 link_target_types = {   104     # Confluence element      MoinMoin link prefix   105     "ri:attachment"         : "attachment:",   106     "ri:user"               : "",   107     "ac:link-body"          : "#",   108     }   109    110 macro_rich_text_styles = {   111     # Confluence style        MoinMoin admonition style   112     "note"                  : "caution",   113     "warning"               : "warning",   114     "info"                  : "important",   115     "tip"                   : "tip",   116     }   117    118 normalise_regexp_str = r"\s+"   119 normalise_regexp = re.compile(normalise_regexp_str)   120    121 class ConfluenceXMLParser(Parser):   122    123     "Handle content from Confluence 4 page revisions."   124    125     def __init__(self, out):   126         Parser.__init__(self)   127         self.out = out   128    129         # Link target and label information.   130    131         self.target = None   132         self.target_type = None   133         self.label = None   134    135         # Macro information.   136    137         self.macro = None   138         self.macro_parameters = {}   139    140         # Indentation and element nesting states.   141    142         self.indent = 0   143         self.states = {}   144         self.max_level = self.level = 0   145    146         for name in preformatted_tags + single_level_tags:   147             self.states[name] = 0   148    149         # Table states.   150    151         self.table_rows = 0   152         self.table_columns = 0   153    154         # Block states.   155    156         self.have_block = False   157    158     # ContentHandler-related methods.   159    160     def startElement(self, name, attrs):   161    162         # Track indentation for lists.   163    164         if list_tags.has_key(name):   165             self.indent += 1   166    167         # Track element nesting.   168    169         elif self.states.has_key(name):   170             self.states[name] += 1   171    172         # Track cumulative element nesting in order to produce appropriate depth   173         # indicators in the formatted output.   174    175         if name in preformatted_tags or name in formatted_tags:   176             self.level += 1   177             self.max_level = max(self.level, self.max_level)   178    179         Parser.startElement(self, name, attrs)   180    181         # Remember macro information for use within the element.   182    183         if name == "ac:macro":   184             self.macro = self.attributes[-1].get("ac:name")   185    186     def endElement(self, name):   187         Parser.endElement(self, name)   188    189         if list_tags.has_key(name):   190             self.indent -= 1   191         elif self.states.has_key(name):   192             self.states[name] -= 1   193         if name in preformatted_tags or name in formatted_tags:   194             self.level -= 1   195             if not self.level:   196                 self.max_level = 0   197    198     def characters(self, content):   199         if not self.is_preformatted():   200             content = self.normalise(content, self.elements[-1])   201         Parser.characters(self, content)   202    203     def skippedEntity(self, name):   204         ch = htmlentitydefs.name2codepoint.get(name)   205         if ch:   206             self.text[-1].append(unichr(ch))   207    208     # Parser-related methods.   209    210     def handleElement(self, name):   211    212         """   213         Handle the completion of the element with the given 'name'. Any content   214         will either be recorded for later use (by an enclosing element, for   215         example) or emitted in some form.   216         """   217    218         text = u"".join(self.text[-1])   219    220         # Handle state.   221    222         if name == "table":   223             self.table_rows = 0   224         elif name == "tr":   225             self.table_columns = 0   226    227         # Find conversions.   228    229         conversion = None   230    231         # Handle list elements.   232    233         if name == "li" and len(self.elements) > 1:   234             list_tag = self.elements[-2]   235             conversion = list_tags.get(list_tag)   236    237         # Remember link target information.   238    239         elif link_target_tags.has_key(name):   240             target_details = []   241    242             # Get target details from the element's attributes.   243    244             for attrname in link_target_tags[name]:   245                 attrvalue = self.attributes[-1].get(attrname)   246                 if attrvalue:   247                     target_details.append(attrvalue)   248                     prefix = link_target_prefixes.get(attrname)   249                     if prefix:   250                         target_details.insert(0, prefix)   251                     if attrname in link_label_attributes and not self.label:   252                         self.label = attrvalue   253    254             # Make a link based on the details.   255    256             self.target = u"/".join(target_details)   257             self.target_type = name   258             text = ""   259    260         # For anchor links, just use the raw text and let Moin do the formatting.   261    262         elif name == "ac:link-body":   263             if not self.target_type:   264                 self.target_type = name   265             self.label = text.strip()   266             text = ""   267    268         # For conventional links, remember the href attribute as the target.   269    270         elif name == "a":   271             self.target = self.attributes[-1].get("href")   272             self.label = text.strip()   273             text = ""   274    275         # Discard macro state.   276    277         elif name == "ac:macro":   278             self.macro = None   279             self.macro_parameters = {}   280    281         # Remember macro information.   282    283         elif name in ("ac:parameter", "ac:default-parameter"):   284             self.macro_parameters[self.attributes[-1].get("ac:name")] = text   285             text = ""   286    287         # Handle single-level tags.   288    289         elif name in single_level_tags and self.states[name] > 1:   290             conversion = "%s"   291    292         # Handle preformatted sections.   293    294         elif name in preformatted_tags or name in formatted_tags:   295    296             # Nest the section appropriately.   297    298             level = 3 + self.max_level - self.level   299             opening = "{" * level   300             closing = "}" * level   301    302             # Macro name information is used to style rich text body regions.   303    304             if name != "table" and self.macro and macro_rich_text_styles.has_key(self.macro):   305                 details = macro_rich_text_styles[self.macro]   306                 title = self.macro_parameters.get("title")   307                 if title:   308                     details = "%s\n\n%s" % (details, title)   309    310                 conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)   311    312             elif name == "table":   313                 conversion = "%s#!table\n%%s\n%s" % (opening, closing)   314    315             else:   316                 conversion = "%s%%s%s" % (opening, closing)   317    318         # Handle the common case and simpler special cases.   319    320         if not conversion:   321             conversion = tags.get(name)   322    323    324    325         # Attempt to convert the text.   326    327         # Links require target information.   328    329         if name in ("ac:link", "ac:image"):   330             prefix = link_target_types.get(self.target_type, "")   331             anchor = self.attributes[-1].get("ac:anchor")   332             text = conversion % (prefix, anchor or self.target, self.label or text.strip() or self.target)   333             self.target = self.target_type = self.label = None   334    335         elif name == "a":   336             text = conversion % (self.target, self.label or self.target)   337             self.target = self.target_type = self.label = None   338    339         # Handle the common case.   340    341         elif text and conversion:   342             text = conversion % text   343         elif simple_tags.has_key(name):   344             text = simple_tags[name]   345    346         # Postprocess table columns and rows.   347    348         if name in ("th", "td"):   349             if self.table_columns:   350                 text = "\n|| %s" % text   351             self.table_columns += 1   352         elif name == "tr":   353             if self.table_rows:   354                 text = "\n==\n%s" % text   355             self.table_rows += 1   356    357         # Normalise leading whitespace and indent the text if appropriate.   358    359         if name in indented_tags:   360             text = " " * self.indent + text.lstrip()   361    362         # Add the converted text to the end of the parent element's text nodes.   363    364         if len(self.text) > 1:   365             nodes = self.text[-2]   366             parent = self.elements[-2]   367    368             # Where preceding text exists, add any blank line separators.   369    370             if u"".join(nodes):   371    372                 # All top-level elements are separated with blank lines.   373    374                 if parent == "body":   375                     nodes.append("\n")   376    377                 # Block elements always cause a new line to be started.   378    379                 if name in block_tags or self.have_block and name not in span_override_tags:   380                     nodes.append("\n")   381    382                 self.have_block = False   383    384             # Lists inside lists require separation.   385    386             elif list_tags.has_key(name) and parent == "li":   387                 nodes.append("\n")   388    389             # Without preceding text, save any block node state for non-block   390             # elements so that new line separators can be added at another   391             # level.   392    393             elif name in block_tags and parent not in block_tags:   394                 self.have_block = True   395    396             elif name not in block_tags and self.have_block and name not in span_override_tags:   397                 self.have_block = True   398    399             else:   400                 self.have_block = False   401    402             nodes.append(text)   403    404         # Otherwise, emit the text (at the top level of the document).   405    406         else:   407             self.out.write(text)   408    409     def is_preformatted(self):   410         return reduce(operator.or_, [self.states[tag] for tag in preformatted_tags], False)   411    412     # Whitespace normalisation.   413    414     def get_replacement(self, name):   415         if name in ("html", "body", "table", "tbody", "tr") or list_tags.has_key(name):   416             return ""   417         else:   418             return " "   419    420     def normalise(self, text, name):   421         return normalise_regexp.sub(self.get_replacement(name), text)   422    423 def parse(s, out):   424    425     "Parse the content in the string 's', writing a translation to 'out'."   426    427     # NOTE: CDATA sections appear to have erroneous endings.   428    429     s = u"""\   430 <?xml version="1.0"?>   431 <!DOCTYPE html    432      PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"   433      "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">   434 <html xmlns="http://www.w3.org/1999/xhtml">   435 <body>   436 %s   437 </body>   438 </html>""" % s.replace("]] >", "]]>")   439    440     f = StringIO(s.encode("utf-8"))   441     try:   442         parser = ConfluenceXMLParser(out)   443         parser.parse(f)   444     finally:   445         f.close()   446    447 if __name__ == "__main__":   448     s = sys.stdin.read()   449     out = codecs.getwriter("utf-8")(sys.stdout)   450     parse(s, out)   451    452 # vim: tabstop=4 expandtab shiftwidth=4