#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then split into regions.
"""

from common import *
import re
import sys
import codecs
import operator

# Section extraction.

# Matches a section marker such as {code}, {code:title=X} or a standalone
# macro such as {anchor:name}. A preceding "{" (as in the {{...}} monospace
# notation) suppresses the match. The "options" group is only present when a
# ":" followed by option text appears inside the braces.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text): sections produce ((sectiontype, options), text)
    tuples via get_section_details, whereas plain "null" regions produce
    (None, text) tuples.
    """

    last = 0            # end position of the previous marker match in 's'
    regions = [""]      # the current (last) region is accumulated as a string
    depth = 0           # nesting level of currently open sections

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # A marker with options (e.g. {code:title=X}) can only open a region;
        # a bare marker opens one where no region is active and closes one
        # otherwise.

        is_start = match.group("options")
        is_section = is_section_marker(match.group("type"))

        # The start of a region is either indicated by a marker with options
        # or by a marker where no region is currently active.

        if is_start or not depth:

            # Where no region is active, add the text since the last match as
            # a "null" region.

            if not depth:
                regions[-1] += s[last:start]

                # A new region is maintained as a string.

                if is_section:
                    regions.append(s[start:end])

                # Certain markers may be standalone macros.

                else:
                    regions[-1] += s[start:end]

            # Where a region is active, add the text since the last match as
            # well as the text in this match to the region.

            else:
                regions[-1] += s[last:end]

            if is_section:
                depth += 1

        # The end of a region is indicated by a marker with no options.

        else:
            # Where no region is active, the text since the last match plus
            # the marker are added to the current "null" region.

            if not depth:

                # Add to the string portion of the "null" region.

                regions[-1] += s[last:end]

            # Where a region is active, the end marker and preceding text is
            # either incorporated into the current region if more than one
            # region is active, or the preceding text is incorporated into the
            # current region and the details of the region are then obtained.

            else:
                if depth > 1 or not is_section:
                    regions[-1] += s[last:end]

                # Terminate the active region, interpreting its contents.

                else:
                    regions[-1] += s[last:end]
                    regions.append("")

                if is_section:
                    depth -= 1

        last = end

    # Append any trailing text to the final region (which may be an
    # unterminated section).

    regions[-1] += s[last:]

    return [get_section_details(s) for s in regions if s]

def is_section_marker(sectiontype):

    "Return whether 'sectiontype' names a known section (or colour) marker."

    # NOTE: "has_key" (Python 2 only) replaced with the "in" operator.

    return sectiontype in sectiontypes or sectiontype == "color"

# Section inspection.
# Matches a complete section: an opening marker (with optional ":"-separated
# options), the section body, and a matching closing marker.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)

    # Non-sections are passed through untyped.

    if not found:
        return None, s

    details = (found.group("sectiontype"), found.group("options"))
    return details, found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

# Any of the above block-level constructs, each in its own group.

blockelement_regexp = re.compile(
    "|".join("(%s)" % pattern for pattern in (
        list_regexp_str,
        table_regexp_str,
        blocktext_regexp_str,
        )),
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where type is None for ordinary text.
    """

    blocks = []
    pos = 0

    for found in blockelement_regexp.finditer(s):
        begin, finish = found.span()

        # Classify the match by whichever named group participated.

        if found.group("listtype"):
            kind = "list"
        elif found.group("celltype"):
            kind = "table"
        else:
            kind = found.group("type")

        # Emit the untyped text before the match, then the match itself.
        # Headings/blockquotes supply a "text" group; lists and tables use
        # the whole matched span.

        blocks.append((None, s[pos:begin]))
        blocks.append((kind, found.group("text") or s[begin:finish]))
        pos = finish

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

# One or more blank (or whitespace-only) lines separating basic blocks.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines.
    """

    candidates = block_regexp.split(s)
    return [candidate for candidate in candidates if candidate.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    collected = []

    for kind, text in get_block_elements(s):

        # Heading, list and table blocks are collected directly; other
        # regions are split into subblocks on blank lines.

        if kind is not None:
            collected.append((kind, text))
        else:
            collected.extend((None, chunk) for chunk in get_basic_blocks(text))

    return collected

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"
macro_regexp_str = r"{(?P<macro>.*?):(?P<options>.*?)}"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

# All inline content patterns combined, each in its own group.

content_regexp_str = "|".join("(%s)" % pattern for pattern in (
    monospace_regexp_str,
    link_regexp_str,
    image_regexp_str,
    macro_regexp_str,
    italic_regexp_str,
    bold_regexp_str,
    del_regexp_str,
    underline_regexp_str,
    sub_regexp_str,
    ))

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline content plus cell separators, so that markup containing "|" inside
# formatting is not mistaken for a column boundary.

table_content_regexp_str = "%s|(%s)" % (content_regexp_str, cellsep_regexp_str)

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for line in text.split("|\n"):
        if not line:
            break

        # Restore the trailing separator removed by the split above.

        line += "|"

        separator = None    # first separator seen defines the row's cell type
        cells = [""]
        pos = 0

        for found in table_content_regexp.finditer(line):
            begin, finish = found.span()
            cells[-1] += line[pos:begin]

            # A separator closes the current cell; any other inline match is
            # kept verbatim inside the cell.

            if found.group("celltype"):
                if separator is None:
                    separator = found.group("celltype")
                cells.append("")
            else:
                cells[-1] += found.group()

            pos = finish

        cells[-1] += line[pos:]

        # The first and last entries are the text outside the outer
        # separators and are discarded.

        if separator:
            rows.append((separator, cells[1:-1]))

    return rows

# Notation conversion.
# Mappings of escaped Confluence notation to Moin notation. The "\\" escape
# (followed by a newline or space) denotes a forced line break.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),
    (r"\\ ", "<<BR>>"),
    (r"\~", "~"),
    ]

# In preformatted sections, forced line breaks become literal newlines.

preformatted_notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "\n"),
    (r"\\ ", "\n"),
    (r"\~", "~"),
    ]

# Translation helpers.

# Confluence list marker characters mapped to Moin list markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

# Confluence cell separators mapped to Moin table-cell separators.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

# Extra markup wrapped around cell text: header cells ("||") become bold.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

# Confluence section types mapped to Moin parser/section headers.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important",
    "note" : "#!wiki caution",
    "tip" : "#!wiki tip",
    "warning" : "#!wiki warning",
    }

# Section types whose content must not be reformatted. None covers anonymous
# sections such as tables.

preformatted_sectiontypes = (None, "noformat")

# Named arguments used when converting macro options.

macroargs = {
    "color" : "col",
    }

# Templates for translating Confluence macros to Moin macros.

macrotypes = {
    "anchor" : "<<Anchor(%(args)s)>>",
    "color" : "<<Color2(%(content)s, %(args)s)>>",
    }

class ConfluenceParser:

    "A parser for Confluence markup."

    def __init__(self):

        # Section nesting state, used to compute Moin brace nesting.

        self.max_level = self.level = 0

        # Context flags controlling where macros may be produced.

        self.in_heading = False
        self.held_anchors = []
        self.macro = None

        # Stack of currently-open section types (None for anonymous ones).

        self.sections = []

    def translate_marker(self, marker):

        "Translate the given list 'marker' to a suitable Moin representation."

        # Indentation depth follows the marker length; the final character
        # determines the marker type.

        return " " * len(marker) + markers[marker[-1]]

    def translate_cellsep(self, cellsep):

        "Translate the given 'cellsep' to a suitable Moin representation."

        return cellseps[cellsep]

    def translate_cell(self, cellsep, text):

        "Using 'cellsep', translate the cell 'text'."

        return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]

    def translate_content_match(self, match):

        """
        Translate the inline content described by the given 'match', returning
        a string. Exactly one of the named groups of content_regexp will have
        participated in the match.
        """

        # Monospaced text.

        if match.group("monotext"):

            # Record a section level so that enclosing regions nest properly.

            self.enter_section(); self.leave_section()
            return "{{{%s}}}" % match.group("monotext")

        # Links of the form [target], [label|target] or [label|target|title].

        elif match.group("linktext"):
            parts = match.group("linktext").split("|")

            # NOTE: Proper detection of external links required.

            if len(parts) == 1:
                label, target, title = None, parts[0], None
            elif len(parts) == 2:
                (label, target), title = parts, None
            else:
                label, target, title = parts

            target = target.strip()

            # Look for namespace links and rewrite them.
            # URL_SCHEMES is provided by the common module (star-imported at
            # the top of this file).

            if target.find(":") != -1:
                prefix = ""
                space, rest = target.split(":", 1)
                if space not in URL_SCHEMES:
                    target = "%s/%s" % (space, rest)

            # Detect anchors.

            elif target.startswith("#"):
                prefix = ""

            # Detect attachments.

            elif target.startswith("^"):
                prefix = "attachment:"

            # Link to other pages within a space.

            else:
                prefix = "../"

            # Make the link tidier by making a label if none was given.

            if not label:
                label = target

            if not label and not title:
                return "[[%s%s]]" % (prefix, target)
            elif not title:
                return "[[%s%s|%s]]" % (prefix, target, label)
            else:
                return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

        # Images of the form !image! or !image|options!.

        elif match.group("imagetext"):
            parts = match.group("imagetext").split("|")

            # NOTE: Proper detection of external links required.

            if parts[0].startswith("http"):
                prefix = ""
            else:
                prefix = "attachment:"

            # NOTE: Proper options conversion required.

            if len(parts) == 1:
                return "{{%s%s}}" % (prefix, parts[0])
            else:
                return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

        # Inline macros of the form {name:options}.

        elif match.group("macro"):
            macro_name = match.group("macro")

            if macro_name in macrotypes:
                argname = macroargs.get(macro_name)
                result = macrotypes[macro_name] % {
                    "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))
                    }

                # Macros are forbidden in headings and inside other macros;
                # anchors are held back and emitted with the heading instead.

                if not self.forbids_macros():
                    return result
                if macro_name == "anchor":
                    self.held_anchors.append(result)

            # Suppress forbidden and unrecognised macros, always returning a
            # string so that callers can join the results.

            return ""

        # Word formatting.

        elif match.group("italictext"):
            return "''%s''" % self.translate_content(match.group("italictext"))

        elif match.group("boldtext"):
            return "'''%s'''" % self.translate_content(match.group("boldtext"))

        elif match.group("deltext"):
            return "--(%s)--" % self.translate_content(match.group("deltext"))

        elif match.group("underlinetext"):
            return "__%s__" % self.translate_content(match.group("underlinetext"))

        elif match.group("subtext"):
            return ",,%s,," % self.translate_content(match.group("subtext"))

        # Anything else is passed through with notation converted.

        else:
            return self.translate_text(match.group())

    def translate_text(self, s, preformatted=False):

        """
        Translate the plain text string 's', converting notation. Where
        'preformatted' is set, forced line breaks become literal newlines.
        """

        mapping = preformatted_notation_mapping if preformatted else notation_mapping
        for before, after in mapping:
            s = s.replace(before, after)
        return s

    def translate_content(self, text):

        """
        Return a translation of the inline markup in the given 'text'. Content
        in "code" and "noformat" sections is passed through untranslated.
        """

        parts = []
        preformatted = self.is_preformatted()

        last = 0
        for match in content_regexp.finditer(text):
            start, end = match.span()
            parts.append(self.translate_text(text[last:start], preformatted))

            # Handle unformatted sections.

            if self.sections and self.sections[-1] in ("code", "noformat"):
                parts.append(match.group())
            else:
                parts.append(self.translate_content_match(match))

            last = end

        parts.append(self.translate_text(text[last:], preformatted))
        return "".join(parts)

    def is_preformatted(self):

        "Return whether any open section suppresses forced line breaks."

        # NOTE: reduce(operator.or_, ...) replaced with the equivalent any().

        return any(x in preformatted_sectiontypes for x in self.sections)

    def translate_block(self, blocktype, blocktext):

        """
        Translate the block with the given 'blocktype' and 'blocktext'.
        'headings' and 'blocktypes' are provided by the common module.
        """

        # While in a heading, macros are forbidden; anchors encountered are
        # held and emitted before the heading itself.

        if blocktype in headings:
            self.in_heading = True
            self.held_anchors = []

        parts = []

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            text = self.parse_text(blocktext)
            for anchor in self.held_anchors:
                parts.append(anchor)
            parts.append(blocktypes[blocktype] % text)

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))

        # Translate table items.

        elif blocktype == "table":

            # Enter the table.

            self.enter_section()

            table_parts = []
            first = True

            for cellsep, columns in get_table_rows(blocktext):

                # Separate rows with the Moin row separator.

                if not first:
                    table_parts.append("==")
                else:
                    first = False
                moinsep = self.translate_cellsep(cellsep)
                table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))

            # Nest the section appropriately.

            opening, closing = self.nest_section()

            parts.append("%s#!table" % opening)
            parts += table_parts
            parts.append(closing)

            # Leave the table.

            self.leave_section()

        # Handle anonymous blocks.

        else:
            parts.append(self.parse_text(blocktext))

        if blocktype in headings:
            self.in_heading = False

        return "\n".join(parts)

    def translate_section(self, sectiontype, options, text):

        """
        Translate the section with the given 'sectiontype', 'options' and
        'text', returning a list of output strings.
        """

        parts = []

        # Enter the section.

        self.enter_section(sectiontype)

        # Sections can contain other sections.

        section_content = self.parse_text(text.strip())

        # Nest the section appropriately.

        opening, closing = self.nest_section()
        mointype = sectiontypes.get(sectiontype)

        parts.append("%s%s\n" % (opening, mointype or ""))

        # Options cannot be translated directly; emit them as a comment.

        if options:
            parts.append("## %s\n" % options)
        parts.append(section_content)
        parts.append("\n%s\n" % closing)

        # Leave the section.

        self.leave_section()

        return parts

    def enter_section(self, sectiontype=None):

        "Record entry into a section of the given 'sectiontype'."

        self.level += 1
        self.max_level = max(self.level, self.max_level)
        self.sections.append(sectiontype)

    def leave_section(self):

        "Record departure from the current section."

        self.level -= 1
        if not self.level:
            self.max_level = 0
        self.sections.pop()

    def nest_section(self):

        """
        Return (opening, closing) Moin brace strings for the current section,
        using deeper braces for outer sections so that nesting is legal.
        """

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level
        return opening, closing

    # General parsing.

    def parse_text(self, s, top=False):

        """
        Parse the content in the string 's', returning the translation. Where
        'top' is set, the top-level recursion base case is disabled.
        """

        parts = []

        # Control spacing between blocks and other blocks or sections.

        preceded_by_block = False

        for regiontype, text in get_regions(s):

            # Handle list, heading, blockquote or anonymous blocks.

            if regiontype is None:

                # Where the region is the same as the provided text, return
                # immediately. This is the base case of the recursive parsing
                # process.

                if text == s and not top:
                    return self.translate_content(text)

                # Otherwise, obtain and translate the blocks.

                if preceded_by_block:
                    parts.append("\n")

                first = True
                for blocktype, blocktext in get_blocks(text):
                    if not first:
                        parts.append("\n")
                    else:
                        first = False
                    parts.append("%s" % self.translate_block(blocktype, blocktext))

                if not first:
                    preceded_by_block = True

            # Handle sections.

            else:
                sectiontype, options = regiontype

                # Direct translations of sections.

                if sectiontype in sectiontypes:
                    if preceded_by_block:
                        parts.append("\n")

                    parts += self.translate_section(sectiontype, options, text)
                    preceded_by_block = True

                # Translations of macros acting as sections.

                elif sectiontype in macrotypes:

                    # Prevent the production of macros in places they would
                    # produce illegal Moin syntax.

                    if not self.forbids_macros():
                        self.macro = sectiontype
                        argname = macroargs.get(sectiontype)
                        parts.append(macrotypes[sectiontype] % {
                            "content" : quote_macro_argument(self.parse_text(text)),
                            "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + options)
                            })
                        self.macro = None

                    # Include the contents of section-based macros where the
                    # macros themselves are not allowed.

                    else:
                        parts.append(self.translate_content(text))

                    preceded_by_block = False

                # Unrecognised sections.

                else:
                    parts += self.translate_section(sectiontype, None, text)
                    preceded_by_block = False

        return "".join(parts)

    def forbids_macros(self):

        "Return whether macros are currently forbidden (headings, nested macros)."

        return self.in_heading or self.macro

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    parser = ConfluenceParser()
    out.write(parser.parse_text(s, top=True))

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4