ConfluenceConverter (file wikiparser.py at 5909d1da7230)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then split into regions.    32 """    33     34 from common import *    35 import re    36 import sys    37 import codecs    38 import operator    39     40 # Section extraction.    41     42 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(?P<options>:[^}\n]+)?}|^(?P<rowstart>[|]{1,2})|(?P<rowend>[|]{1,2})(\n|$)"    43 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    44     45 def get_regions(s):    46     47     """    48     Return a list of regions from 's'. Each region is specified using a tuple of    49     the form (type, text).    50     """    51     52     last = 0    53     regions = [""]    54     depth = 0    55     had_row = False    56     57     for match in sections_regexp.finditer(s):    58         start, end = match.span()    59         is_start = match.group("options") or match.group("rowstart")    60         is_section = is_section_marker(match.group("type"))    61         is_row = match.group("rowstart") or match.group("rowend")    62     63         # The start of a region is either indicated by a marker with options or    64         # by a marker where no region is currently active.    65     66         if is_start or not depth:    67     68             # Where no region is active, add the text since the last match as a    69             # "null" region.    70     71             if not depth:    72                 regions[-1] += s[last:start]    73     74                 # A new region is maintained as a string.    75     76                 if is_section:    77                     regions.append(s[start:end])    78     79                 # A new row may either continue a table region or start a new    80                 # table region.    81     82                 elif is_row:    83                     if (last != start or not had_row):    84                         regions.append(s[start:end])    85                     else:    86                         regions[-2] += regions[-1] + s[start:end]    87                         regions.pop()    88     89                 # Certain markers may be standalone macros.    90     91                 else:    92                     regions[-1] += s[start:end]    93     94             # Where a region is active, add the text since the last match as    95             # well as the text in this match to the region.    96     97             else:    98                 regions[-1] += s[last:end]    99    100             if is_section or is_row:   101                 depth += 1   102    103         # The end of a region is indicated by a marker with no options.   104    105         else:   106             # Where no region is active, the text since the last match plus the   107             # marker are added to the current "null" region.   108    109             if not depth:   110    111                 # Add to the string portion of the "null" region.   112    113                 regions[-1] += s[last:end]   114    115             # Where a region is active, the end marker and preceding text is   116             # either incorporated into the current region if more than one   117             # region is active, or the preceding text is incorporated into the   118             # current region and the details of the region are then obtained.   119    120             else:   121                 if depth > 1 or (not is_section and not is_row):   122                     regions[-1] += s[last:end]   123    124                 # Terminate the active region, interpreting its contents.   125    126                 else:   127                     regions[-1] += s[last:end]   128                     regions.append("")   129    130                 if is_section or is_row:   131                     depth -= 1   132    133         had_row = is_row   134         last = end   135    136     # Where a region is still active, terminate it.   137    138     regions[-1] += s[last:]   139    140     return [get_section_details(s) for s in regions if s]   141    142 def is_section_marker(sectiontype):   143     return sectiontypes.has_key(sectiontype) or sectiontype == "color"   144    145 # Section inspection.   146    147 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"   148 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)   149    150 def get_section_details(s):   151    152     "Return the details of a section 's' in the form (type, text)."   153    154     match = section_regexp.match(s)   155     if match:   156         return (match.group("sectiontype"), match.group("options")), match.group("section")   157     else:   158         return None, s   159    160 # Heading, table and list extraction.   161    162 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"   163 table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"   164 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"   165    166 blockelement_regexp = re.compile(   167     "(" + list_regexp_str + ")"   168     "|"   169     "(" + table_regexp_str + ")"   170     "|"   171     "(" + blocktext_regexp_str + ")",   172     re.MULTILINE   173     )   174    175 def get_block_elements(s):   176    177     """   178     Extract headings, tables and lists from the given string 's'.   179     """   180    181     last = 0   182     blocks = []   183     for match in blockelement_regexp.finditer(s):   184         start, end = match.span()   185         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")   186         blocks.append((None, s[last:start]))   187         blocks.append((matchtype, match.group("text") or s[start:end]))   188         last = end   189     blocks.append((None, s[last:]))   190     return blocks   191    192 # Block extraction.   193    194 block_regexp_str = r"^(?:\s*\n)+"   195 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   196    197 def get_basic_blocks(s):   198    199     """   200     Return blocks from the given string 's' by splitting the text on blank lines   201     and eliminating those lines.   202     """   203    204     return [b for b in block_regexp.split(s) if b.strip()]   205    206 # Block inspection.   207    208 def get_blocks(s):   209    210     """   211     Return blocks from the given string 's', inspecting the basic blocks and   212     generating additional block-level text where appropriate.   213     """   214    215     blocks = []   216    217     for blocktype, blocktext in get_block_elements(s):   218    219         # Collect heading, list and table blocks.   220    221         if blocktype is not None:   222             blocks.append((blocktype, blocktext))   223    224         # Attempt to find new subblocks in other regions.   225    226         else:   227             for block in get_basic_blocks(blocktext):   228                 blocks.append((None, block))   229    230     return blocks   231    232 # List item inspection.   233    234 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"   235 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   236    237 def get_list_items(text):   238    239     "Return a list of (marker, text) tuples for the given list 'text'."   240    241     items = []   242    243     for match in listitem_regexp.finditer(text):   244         items.append((match.group("marker"), match.group("text")))   245    246     return items   247    248 # Content inspection.   249    250 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"   251 link_regexp_str      = r"[[](?P<linktext>.*?)]"   252 image_regexp_str     = r"!(?P<imagetext>\w.*?)!"   253 macro_regexp_str     = r"{(?P<macro>.*?):(?P<options>.*?)}"   254    255 # Word-dependent patterns.   256 # Here, the unbracketed markers must test for the absence of surrounding word   257 # characters.   258    259 italic_regexp_str    = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"   260 bold_regexp_str      = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"   261 del_regexp_str       = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"   262 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"   263 sub_regexp_str       = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"   264    265 content_regexp_str = (   266     "(" + monospace_regexp_str + ")"   267     "|"   268     "(" + link_regexp_str + ")"   269     "|"   270     "(" + image_regexp_str + ")"   271     "|"   272     "(" + macro_regexp_str + ")"   273     "|"   274     "(" + italic_regexp_str + ")"   275     "|"   276     "(" + bold_regexp_str + ")"   277     "|"   278     "(" + del_regexp_str + ")"   279     "|"   280     "(" + underline_regexp_str + ")"   281     "|"   282     "(" + sub_regexp_str + ")"   283     )   284    285 # Table row inspection.   286    287 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"   288    289 table_content_regexp_str = (   290     content_regexp_str +   291     "|"   292     "(" + cellsep_regexp_str + ")"   293     )   294    295 content_regexp = re.compile(content_regexp_str)   296 table_content_regexp = re.compile(table_content_regexp_str)   297    298 def get_table_rows(text):   299    300     "Return a list of (cellsep, columns) tuples for the given table 'text'."   301    302     rows = []   303    304     for row in text.split("|\n"):   305         if not row:   306             break   307    308         row += "|"   309         cellsep = None   310         columns = [""]   311         last = 0   312         for match in table_content_regexp.finditer(row):   313             start, end = match.span()   314             columns[-1] += row[last:start]   315    316             if match.group("celltype"):   317                 if cellsep is None:   318                     cellsep = match.group("celltype")   319                 columns.append("")   320             else:   321                 columns[-1] += match.group()   322    323             last = end   324    325         columns[-1] += row[last:]   326    327         if cellsep:   328             rows.append((cellsep, columns[1:-1]))   329    330     return rows   331    332 # Notation conversion.   333    334 notation_mapping = [   335     (r"\!", "!"),   336     (r"\-", "-"),   337     (r"\\""\n", "<<BR>>"),   338     (r"\\ ", "<<BR>>"),   339     (r"\~", "~"),   340     ]   341    342 preformatted_notation_mapping = [   343     (r"\!", "!"),   344     (r"\-", "-"),   345     (r"\\""\n", "\n"),   346     (r"\\ ", "\n"),   347     (r"\~", "~"),   348     ]   349    350 # Translation helpers.   351    352 markers = {   353     "*" : "*",   354     "#" : "1.",   355     "-" : "*",   356     }   357    358 cellseps = {   359     "|" : "\n|| ",   360     "||" : "\n|| ",   361     }   362    363 cellextra = {   364     "|" : "",   365     "||" : "'''",   366     }   367    368 sectiontypes = {   369     "code"      : "",   370     "noformat"  : "",   371     "quote"     : "",   372     "info"      : "#!wiki important",   373     "note"      : "#!wiki caution",   374     "tip"       : "#!wiki tip",   375     "warning"   : "#!wiki warning",   376     }   377    378 preformatted_sectiontypes = (None, "noformat")   379    380 macroargs = {   381     "color"     : "col",   382     }   383    384 macrotypes = {   385     "anchor"    : "<<Anchor(%(args)s)>>",   386     "color"     : "<<Color2(%(content)s, %(args)s)>>",   387     }   388    389 class ConfluenceParser:   390    391     "A parser for Confluence markup."   392    393     def __init__(self):   394         self.max_level = self.level = 0   395         self.in_heading = False   396         self.held_anchors = []   397         self.macro = None   398         self.sections = []   399    400     def translate_marker(self, marker):   401    402         "Translate the given 'marker' to a suitable Moin representation."   403    404         return " " * len(marker) + markers[marker[-1]]   405    406     def translate_cellsep(self, cellsep):   407    408         "Translate the given 'cellsep' to a suitable Moin representation."   409    410         return cellseps[cellsep]   411    412     def translate_cell(self, cellsep, text):   413    414         "Using 'cellsep', translate the cell 'text'."   415    416         return cellextra[cellsep] + self.parse_text(text).strip() + cellextra[cellsep]   417    418     def translate_content_match(self, match):   419    420         "Translate the content described by the given 'match', returning a string."   421    422         if match.group("monotext"):   423             self.enter_section(); self.leave_section()   424             return "{{{%s}}}" % match.group("monotext")   425    426         elif match.group("linktext"):   427             parts = match.group("linktext").split("|")   428    429             # NOTE: Proper detection of external links required.   430    431             if len(parts) == 1:   432                 label, target, title = None, parts[0], None   433             elif len(parts) == 2:   434                 (label, target), title = parts, None   435             else:   436                 label, target, title = parts   437    438             target = target.strip()   439    440             # Look for namespace links and rewrite them.   441    442             if target.find(":") != -1:   443                 prefix = ""   444                 space, rest = target.split(":", 1)   445                 if space not in URL_SCHEMES:   446                     rest = get_page_title(rest)   447                     target = "%s/%s" % (space, rest)   448    449             # Detect anchors.   450    451             elif target.startswith("#"):   452                 prefix = ""   453    454             # Detect attachments.   455    456             elif target.startswith("^"):   457                 prefix = "attachment:"   458    459             # Link to other pages within a space.   460    461             else:   462                 prefix = "../"   463    464                 # Make the link tidier by making a target if none was given.   465    466                 if not label:   467                     label = target   468    469                 target = get_page_title(target)   470    471             if not label and not title:   472                 return "[[%s%s]]" % (prefix, target)   473             elif not title:   474                 return "[[%s%s|%s]]" % (prefix, target, label)   475             else:   476                 return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)   477    478         elif match.group("imagetext"):   479             parts = match.group("imagetext").split("|")   480    481             # NOTE: Proper detection of external links required.   482    483             if parts[0].startswith("http"):   484                 prefix = ""   485             else:   486                 prefix = "attachment:"   487    488             # NOTE: Proper options conversion required.   489    490             if len(parts) == 1:   491                 return "{{%s%s}}" % (prefix, parts[0])   492             else:   493                 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])   494    495         elif match.group("macro"):   496             macro_name = match.group("macro")   497             if macrotypes.has_key(macro_name):   498                 argname = macroargs.get(macro_name)   499                 result = macrotypes[macro_name] % {   500                     "args" : quote_macro_argument((argname and ("%s=" % argname) or "") + match.group("options"))   501                     }   502                 if not self.forbids_macros():   503                     return result   504                 if macro_name == "anchor":   505                     self.held_anchors.append(result)   506             return ""   507    508         elif match.group("italictext"):   509             return "''%s''" % self.translate_content(match.group("italictext"))   510    511         elif match.group("boldtext"):   512             return "'''%s'''" % self.translate_content(match.group("boldtext"))   513    514         elif match.group("deltext"):   515             return "--(%s)--" % self.translate_content(match.group("deltext"))   516    517         elif match.group("underlinetext"):   518             return "__%s__" % self.translate_content(match.group("underlinetext"))   519    520         elif match.group("subtext"):   521             return ",,%s,," % self.translate_content(match.group("subtext"))   522    523         else:   524             return self.translate_text(match.group())   525    526     def translate_text(self, s, preformatted=False):   527    528         "Translate the plain text string 's', converting notation."   529    530         for before, after in preformatted and preformatted_notation_mapping or notation_mapping:   531             s = s.replace(before, after)   532         return s   533    534     def translate_content(self, text):   535    536         """   537         Return a translation of the given 'text'. If the optional 'sectiontype' is   538         specified, the translation may be modified to a form appropriate to the   539         section being translated.   540         """   541    542         parts = []   543         preformatted = self.is_preformatted()   544    545         last = 0   546         for match in content_regexp.finditer(text):   547             start, end = match.span()   548             parts.append(self.translate_text(text[last:start], preformatted))   549    550             # Handle unformatted sections.   551    552             if self.sections and self.sections[-1] in ("code", "noformat"):   553                 parts.append(match.group())   554             else:   555                 parts.append(self.translate_content_match(match))   556    557             last = end   558    559         parts.append(self.translate_text(text[last:], preformatted))   560         return "".join(parts)   561    562     def is_preformatted(self):   563         return reduce(operator.or_, [x in preformatted_sectiontypes for x in self.sections], False)   564    565     def translate_block(self, blocktype, blocktext):   566    567         "Translate the block with the given 'blocktype' and 'blocktext'."   568    569         if blocktype in headings:   570             self.in_heading = True   571             self.held_anchors = []   572    573         parts = []   574    575         # Translate headings and blockquotes.   576    577         if blocktypes.has_key(blocktype):   578             text = self.parse_text(blocktext)   579             for anchor in self.held_anchors:   580                 parts.append(anchor)   581             parts.append(blocktypes[blocktype] % text)   582    583         # Translate list items.   584    585         elif blocktype == "list":   586             for listmarker, listitem in get_list_items(blocktext):   587                 parts.append("%s %s" % (self.translate_marker(listmarker), self.parse_text(listitem)))   588    589         # Translate table items.   590    591         elif blocktype == "table":   592    593             # Enter the table.   594    595             self.enter_section()   596    597             table_parts = []   598             first = True   599    600             for cellsep, columns in get_table_rows(blocktext):   601                 if not first:   602                     table_parts.append("==")   603                 else:   604                     first = False   605                 moinsep = self.translate_cellsep(cellsep)   606                 table_parts.append(moinsep.join([self.translate_cell(cellsep, column) for column in columns]))   607    608             # Nest the section appropriately.   609    610             opening, closing = self.nest_section()   611    612             parts.append("%s#!table" % opening)   613             parts += table_parts   614             parts.append(closing)   615    616             # Leave the table.   617    618             self.leave_section()   619    620         # Handle anonymous blocks.   621    622         else:   623             parts.append(self.parse_text(blocktext))   624    625         if blocktype in headings:   626             self.in_heading = False   627    628         return "\n".join(parts)   629    630     def translate_section(self, sectiontype, options, text):   631    632         """   633         Translate the section with the given 'sectiontype', 'options' and   634         'text'.   635         """   636    637         parts = []   638    639         # Enter the section.   640    641         self.enter_section(sectiontype)   642    643         # Sections can contain other sections.   644    645         section_content = self.parse_text(text.strip())   646    647         # Nest the section appropriately.   648    649         opening, closing = self.nest_section()   650         mointype = sectiontypes.get(sectiontype)   651    652         parts.append("%s%s\n" % (opening, mointype or ""))   653         if options:   654             parts.append("## %s\n" % options)   655         parts.append(section_content)   656         parts.append("\n%s\n" % closing)   657    658         # Leave the section.   659    660         self.leave_section()   661    662         return parts   663    664     def enter_section(self, sectiontype=None):   665         self.level += 1   666         self.max_level = max(self.level, self.max_level)   667         self.sections.append(sectiontype)   668    669     def leave_section(self):   670         self.level -= 1   671         if not self.level:   672             self.max_level = 0   673         self.sections.pop()   674    675     def nest_section(self):   676         level = 3 + self.max_level - self.level   677         opening = "{" * level   678         closing = "}" * level   679         return opening, closing   680    681     # General parsing.   682    683     def parse_text(self, s, top=False):   684    685         "Parse the content in the string 's', returning the translation."   686    687         parts = []   688    689         # Control spacing between blocks and other blocks or sections.   690    691         preceded_by_block = False   692    693         for type, text in get_regions(s):   694    695             # Handle list, heading, blockquote or anonymous blocks.   696    697             if type is None:   698    699                 # Where the region is the same as the provided text, return   700                 # immediately. This is the base case of the recursive parsing   701                 # process.   702    703                 if text == s and not top:   704                     return self.translate_content(text)   705    706                 # Otherwise, obtain and translate the blocks.   707    708                 if preceded_by_block:   709                     parts.append("\n")   710    711                 first = True   712                 for blocktype, blocktext in get_blocks(text):   713                     if not first:   714                         parts.append("\n")   715                     else:   716                         first = False   717                     parts.append("%s" % self.translate_block(blocktype, blocktext))   718    719                 if not first:   720                     preceded_by_block = True   721    722             # Handle sections.   723    724             else:   725                 sectiontype, options = type   726    727                 # Direct translations of sections.   728    729                 if sectiontypes.has_key(sectiontype):   730                     if preceded_by_block:   731                         parts.append("\n")   732    733                     parts += self.translate_section(sectiontype, options, text)   734                     preceded_by_block = True   735    736                 # Translations of macros acting as sections.   737    738                 elif macrotypes.has_key(sectiontype):   739    740                     # Prevent the production of macros in places they would   741                     # produce illegal Moin syntax.   742    743                     if not self.forbids_macros():   744                         self.macro = sectiontype   745                         argname = macroargs.get(sectiontype)   746                         parts.append(macrotypes[sectiontype] % {   747                             "content"   : quote_macro_argument(self.parse_text(text)),   748                             "args"      : quote_macro_argument((argname and ("%s=" % argname) or "") + options)   749                             })   750                         self.macro = None   751    752                     # Include the contents of section-based macros where the   753                     # macros themselves are not allowed.   754    755                     else:   756                         parts.append(self.translate_content(text))   757    758                     preceded_by_block = False   759    760                 # Unrecognised sections.   761    762                 else:   763                     parts += self.translate_section(sectiontype, None, text)   764                     preceded_by_block = False   765    766         return "".join(parts)   767    768     def forbids_macros(self):   769         return self.in_heading or self.macro   770    771 def parse(s, out):   772    773     "Parse the content in the string 's', writing a translation to 'out'."   774    775     parser = ConfluenceParser()   776     out.write(parser.parse_text(s, top=True))   777    778 if __name__ == "__main__":   779     s = codecs.getreader("utf-8")(sys.stdin).read()   780     out = codecs.getwriter("utf-8")(sys.stdout)   781     parse(s, out)   782    783 # vim: tabstop=4 expandtab shiftwidth=4