#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# A section marker such as {code} or {code:language=java}, not preceded by a
# literal "{" (which would indicate an escaped or doubled brace).

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text).
    """

    last = 0
    regions = [""]
    depth = 0

    for match in sections_regexp.finditer(s):
        start, end = match.span()
        is_start = match.group("options")
        is_section = is_section_marker(match.group("type"))

        # The start of a region is either indicated by a marker with options
        # or by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as
            # a "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section:
                depth += 1

        # The end of a region is indicated by a marker with no options.

        else:
            # Where no region is active, the text since the last match plus
            # the marker are added to the current "null" region.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or not is_section:
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

            if is_section:
                depth -= 1

        last = end

    # Where a region is still active, terminate it.

    regions[-1] += s[last:]

    # Use a distinct name in the comprehension to avoid shadowing the
    # parameter 's'.

    return [get_section_details(region) for region in regions if region]

def is_section_marker(sectiontype):

    "Return whether 'sectiontype' names a recognised section or colour marker."

    # "in" replaces dict.has_key, which was removed in Python 3 and is
    # equivalent under Python 2.

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.
# A complete section: opening marker with optional options, body, and a
# closing marker whose type must match the opening one.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)

    # Unrecognised input is returned untyped.

    if not found:
        return None, s

    header = (found.group("sectiontype"), found.group("options"))
    return header, found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Any of the above block-level constructs, matched line-by-line.

blockelement_regexp = re.compile(
    "(%s)|(%s)|(%s)" % (list_regexp_str, table_regexp_str, blocktext_regexp_str),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where untyped gaps between the recognised
    elements employ a type of None.
    """

    blocks = []
    pos = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the match according to which alternative captured.

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Record the preceding untyped text, then the element itself. Where no
        # explicit text group was captured, the whole match is used.

        blocks.append((None, s[pos:begin]))
        blocks.append((kind, found.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

# One or more blank (or whitespace-only) lines.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines.
    """

    return [piece for piece in block_regexp.split(s) if piece.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend((None, piece) for piece in get_basic_blocks(blocktext))

    return blocks

# List item inspection.

# An item marker (possibly indented) followed by the item text.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline content constructs combined as alternatives, each in its own
# group.

content_regexp_str = "|".join(
    "(%s)" % pattern for pattern in (
        monospace_regexp_str,
        link_regexp_str,
        image_regexp_str,
        macro_regexp_str,
        italic_regexp_str,
        bold_regexp_str,
        del_regexp_str,
        underline_regexp_str,
        sub_regexp_str,
        ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content plus cell separators, for scanning table rows.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed by the split.

        row += "|"
        cellsep = None
        columns = [""]
        pos = 0

        for found in table_content_regexp.finditer(row):
            begin, finish = found.span()
            columns[-1] += row[pos:begin]

            # A separator starts a new column; the first one seen determines
            # the row's cell type. Other inline content is retained verbatim
            # so that it can be translated later.

            if found.group("celltype"):
                if cellsep is None:
                    cellsep = found.group("celltype")
                columns.append("")
            else:
                columns[-1] += found.group()

            pos = finish

        columns[-1] += row[pos:]

        # Discard the empty leading and trailing column fragments.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Notation conversion.
# Mappings from escaped Confluence notation to its Moin equivalent. Note that
# r"\\" "\n" relies on literal concatenation: a backslash followed by a
# newline.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

# In preformatted sections, forced breaks become plain newlines instead.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

preformatted_sectiontypes = (None, "noformat")

macroargs = {
    "color" : "col",
    }

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # Section nesting state used to choose Moin brace depths.

        self.max_level = self.level = 0

        # Macros are forbidden in headings and inside other macros.

        self.in_heading = False
        self.held_anchors = []
        self.macro = None

        # Stack of active section types (None for anonymous sections).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given 'marker' to a suitable Moin representation."

        # Indentation depth is conveyed by the marker's length.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        "Translate the content described by the given 'match', returning a string."

        if match.group("monotext"):
            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    rest = get_page_title(rest)
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

                # Make the link tidier by making a target if none was given.

                if not label:
                    label = target

                target = get_page_title(target)

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        elif match.group("macro"):
            macro_name = match.group("macro")

            # "in" replaces dict.has_key, which was removed in Python 3.

            if macro_name in macrotypes:
                argname = macroargs.get(macro_name)
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))
                    }
                if not self.forbids_macros():
                    return result

                # Anchors in headings are held and emitted before the heading.

                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Unrecognised or suppressed macros produce no output.

            return ""

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        "Translate the plain text string 's', converting notation."

        for before, after in preformatted and preformatted_notation_mapping or notation_mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the given 'text', converting any inline
        formatting, links, images and macros found in it.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections: retain the matched text verbatim.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any active section suppresses formatting."

        # any(...) replaces reduce(operator.or_, ...): equivalent, clearer,
        # and reduce is no longer a builtin in Python 3.

        return any(x in preformatted_sectiontypes for x in self.sections)

    def translate_block(self, blocktype, blocktext):

        "Translate the block with the given 'blocktype' and 'blocktext'."

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.
        # "in" replaces dict.has_key, which was removed in Python 3.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):
                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of output fragments.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))
        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record entry into a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record departure from the current section."

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return (opening, closing) brace strings for the current section,
        using deeper braces for outer sections so that nesting is legal.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        "Parse the content in the string 's', returning the translation."

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        # 'regiontype' avoids shadowing the builtin 'type'.

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.
                # "in" replaces dict.has_key, which was removed in Python 3.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether the current context forbids macro production."

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":

    # NOTE(review): codecs-based stream wrapping is the Python 2 idiom;
    # confirm the target interpreter before porting to io.TextIOWrapper.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4