#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

try:
    from cStringIO import StringIO
except ImportError:
    from StringIO import StringIO

from xmlread import Parser
import re
import sys
import operator
import htmlentitydefs

# URL schemes whose targets are passed through unchanged instead of being
# rewritten as wiki namespace links.

URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text), where plain text regions have a type of None.
    """

    regions = []
    pos = 0

    # Alternate between plain text and marked-up sections, preserving the
    # order of appearance.

    for m in sections_regexp.finditer(s):
        begin, finish = m.span()
        regions.append((None, s[pos:begin]))
        regions.append(get_section_details(s[begin:finish]))
        pos = finish

    # Any remaining text becomes a final plain region.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    m = section_regexp.match(s)

    # Unrecognised sections are returned as plain text.

    if m is None:
        return None, s

    return (m.group("sectiontype"), m.group("options")), m.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where plain text spans have a type of None.
    """

    elements = []
    pos = 0

    for m in blockelement_regexp.finditer(s):
        begin, finish = m.span()

        # Classify the element according to which alternative captured it.

        if m.group("listtype"):
            kind = "list"
        elif m.group("celltype"):
            kind = "table"
        else:
            kind = m.group("type")

        elements.append((None, s[pos:begin]))

        # Headings and blockquotes provide their own text group; lists and
        # tables use the whole matched span.

        elements.append((kind, m.group("text") or s[begin:finish]))
        pos = finish

    elements.append((None, s[pos:]))
    return elements

# Block extraction.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                blocks.append((None, block))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    items = []

    for match in listitem_regexp.finditer(text):
        items.append((match.group("marker"), match.group("text")))

    return items

# Table row inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    Monospaced text, links and images are rewritten in MoinMoin syntax; any
    other match is returned unchanged.
    """

    # Monospaced text: {{text}} becomes {{{text}}}.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links take the form [label|target|title], label and title optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.
        # NOTE(review): the "^" is kept in the target here - confirm whether
        # MoinMoin expects it to be stripped.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Use the short form where neither label nor title were given. This
        # test must precede the label fallback below: previously the label was
        # copied from the target first, making this branch unreachable and
        # emitting redundant [[target|target]] links.

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make the link tidier by using the target if no label was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images take the form !filename-or-URL|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Cell separators and other matches pass through unchanged.

    else:
        return match.group()

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.
    Lines without any cell separator are ignored, and the first separator seen
    on a line determines the row's separator type.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0
        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # Start a new column at each separator; other matched content is
            # accumulated in the current column for later translation.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first and after the last separator.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        # Handle unformatted sections.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)

# Translation helpers.
# Headings and blockquotes: Confluence block type -> MoinMoin format string.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# List markers: last Confluence marker character -> MoinMoin item marker.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # Indentation reflects nesting depth (marker length); the final character
    # selects the Moin list style.

    return " " * len(marker) + markers[marker[-1]]

# Cell separators: Confluence separator -> MoinMoin separator.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Extra cell decoration: "||" header cells are emboldened in MoinMoin.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]

# Section types: Confluence section name -> MoinMoin processor/wiki style.
# Empty values produce plain {{{...}}} regions without a processor line.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }

# XML dialect syntax parsing.

tags = {
    # XHTML tag            MoinMoin syntax
    "strong"             : "'''%s'''",
    "em"                 : "''%s''",
    "u"                  : "__%s__",
    "del"                : "--(%s)--",
    "sup"                : "^%s^",
    "sub"                : ",,%s,,",
    "code"               : "`%s`",
    "pre"                : "{{{%s}}}",
    "blockquote"         : " %s",
    "small"              : "~-%s-~",
    "big"                : "~+%s+~",
    "p"                  : "\n%s\n",
    "ol"                 : "\n%s",
    "ul"                 : "\n%s",
    "ac:plain-text-body" : "{{{%s}}}",
    "ac:link"            : "[[%s%s|%s]]",
    }

# Heading and blockquote tags also appear in the XML dialect and are emitted
# on their own lines.

for tag, translation in blocktypes.items():
    tags[tag] = "\n%s\n" % translation

simple_tags = {
    # XHTML tag      MoinMoin syntax
    "br"           : "<<BR>>",
    }

list_tags = {
    # XHTML list tag MoinMoin list item syntax
    "ol"           : "1. %s\n",
    "ul"           : "* %s\n",
    }

# Tags whose text is indented to the current list nesting level.

indented_tags = ["li", "p"]

link_target_tags = {
    # Confluence element Attribute providing the target
    "ri:page"          : "ri:content-title",
    "ri:attachment"    : "ri:filename",
    }

macro_rich_text_styles = {
    # Confluence style  MoinMoin admonition style
    "note"            : "caution",
    "warning"         : "warning",
    "info"            : "important",
    "tip"             : "tip",
    }

# Whitespace normalisation within and at the end of text nodes.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)

normalise_end_regexp_str = r"\s\s+$"
normalise_end_regexp = re.compile(normalise_end_regexp_str)

class ConfluenceXMLParser(Parser):

    "Handle content from Confluence 4 page revisions."

    def __init__(self, out):

        # 'out' receives the MoinMoin translation via its write method.

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        # Track list nesting depth and entry into preformatted elements
        # before delegating to the base class.

        if list_tags.has_key(name):
            self.indent += 1
        elif self.states.has_key(name):
            self.states[name] += 1
        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        # Delegate first, then unwind the nesting/preformatted counters.

        Parser.endElement(self, name)
        if list_tags.has_key(name):
            self.indent -= 1
        elif self.states.has_key(name):
            self.states[name] -= 1

    def characters(self, content):

        # Whitespace is only normalised outside preformatted elements.

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        # Convert named HTML entities to characters, appending them to the
        # current element's collected text. Unknown entities are dropped.

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the text collected for the element 'name', appending the
        result to the parent element's text nodes or, at the top level,
        writing it to the output stream.

        NOTE(review): self.text, self.elements and self.attributes appear to
        be stacks maintained by xmlread.Parser - confirm against that class.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif link_target_tags.has_key(name):
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            else:
                prefix = "../"

            # Use the target itself as the label if no text was collected.

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro" and macro_rich_text_styles.has_key(self.macro):
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)
            text = "{{{#!wiki %s\n\n%s}}}" % (details, text)
            self.macro = None
            self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif simple_tags.has_key(name):
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            preceding = "".join(self.text[-2])

            if not self.is_preformatted():
                preceding = self.normalise_end(preceding, self.elements[-2])

            self.text[-2] = [preceding]
            self.text[-2].append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        # True if any preformatted element ("pre", "ac:plain-text-body") is
        # currently open.

        return reduce(operator.or_, self.states.values(), False)

    def get_replacement(self, name, end=False):

        # Choose the whitespace replacement appropriate to element 'name',
        # with 'end' selecting the replacement for trailing whitespace.

        if list_tags.has_key(name):
            if end:
                return "\n"
            else:
                return ""
        elif name == "body":
            return "\n\n"
        else:
            return " "

    def normalise(self, text, name):

        # Collapse whitespace runs in 'text' for the element 'name'.

        return normalise_regexp.sub(self.get_replacement(name), text)

    def normalise_end(self, text, name):

        # Collapse trailing whitespace in 'text' for the element 'name'.

        return normalise_end_regexp.sub(self.get_replacement(name, True), text)

def xmlparse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    # NOTE: CDATA sections appear to have erroneous endings.

    # Wrap the fragment in an XHTML document so that it parses as a single
    # well-formed tree rooted at "body".

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
 PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
 "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()

# General parsing.

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    for type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktypes.has_key(blocktype):
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

            print >>out

        # Handle sections.

        else:
            sectiontype, options = type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options
            else:
                print >>out, "{{{",
            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)

# vim: tabstop=4 expandtab shiftwidth=4