#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA

--------

The basic procedure is as follows:

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

from common import *
import re
import sys
import codecs

# Section extraction.

# A section opens with {type} or {type:options} (where the brace is not part
# of a doubled brace) and runs to the matching closing {type} marker.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text): plain text between sections is given a type of None,
    whereas sections are described using get_section_details.
    """

    regions = []
    pos = 0

    # Alternate between the plain text preceding each section and the section
    # itself.

    for match in sections_regexp.finditer(s):
        regions.append((None, s[pos:match.start()]))
        regions.append(get_section_details(s[match.start():match.end()]))
        pos = match.end()

    # Any trailing text also forms a plain region.

    regions.append((None, s[pos:]))
    return regions

# Section inspection.
# A single section: {type} or {type:options}, the section body, and the
# closing {type} marker.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    "Return the details of a section 's' in the form (type, text)."

    found = section_regexp.match(s)
    if not found:
        return None, s
    return (found.group("sectiontype"), found.group("options")), found.group("section")

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples where intervening plain regions have a type of
    None.
    """

    blocks = []
    pos = 0

    for match in blockelement_regexp.finditer(s):

        # Classify the match according to which alternative captured.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[pos:match.start()]))
        blocks.append((matchtype, match.group("text") or match.group()))
        pos = match.end()

    blocks.append((None, s[pos:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.
    """

    return [block for block in block_regexp.split(s) if block.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Headings, lists and tables pass through unchanged; anonymous regions
        # are split further on blank lines.

        if blocktype is None:
            blocks.extend([(None, basic) for basic in get_basic_blocks(blocktext)])
        else:
            blocks.append((blocktype, blocktext))

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(found.group("marker"), found.group("text"))
            for found in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.
# Emphasis markers appear either unbracketed (in which case they must not be
# adjacent to word characters) or in the explicit brace-quoted form such as
# {*}.

italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})"
bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})"
del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})"
underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})"
sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + italic_regexp_str + ")"
    "|"
    "(" + bold_regexp_str + ")"
    "|"
    "(" + del_regexp_str + ")"
    "|"
    "(" + underline_regexp_str + ")"
    "|"
    "(" + sub_regexp_str + ")"
    )

# Table row inspection.

cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

# Notation conversion: simple textual replacements applied to plain text.

notation_mapping = [
    (r"\!", "!"),
    (r"\-", "-"),
    (r"\\" "\n", "<<BR>>"),     # forced line break at the end of a line
    (r"\\ ", "<<BR>>"),         # forced line break within a line
    (r"\~", "~"),
    ]

# Translation helpers.

# Mapping from Confluence list marker characters to Moin markers.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    "Translate the given 'marker' to a suitable Moin representation."

    # Indentation depth follows the length of the Confluence marker run.

    return " " * len(marker) + markers[marker[-1]]

# Cell separator and decoration details by Confluence separator type: "||"
# introduces a header cell which is emboldened in the Moin output.

cellseps = {
    "|" : "\n|| ",
    "||" : "\n|| ",
    }

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    "Translate the given 'cellsep' to a suitable Moin representation."

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    "Using 'cellsep', translate the cell 'text'."

    return cellextra[cellsep] + parse_text(text).strip() + cellextra[cellsep]

def translate_content_match(match):

    "Translate the content described by the given 'match', returning a string."

    # Monospaced text.

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    # Links of the form [label|target|title], label and title being optional.

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:
            label, target, title = parts

        target = target.strip()

        # Look for namespace links and rewrite them.

        if target.find(":") != -1:
            prefix = ""
            space, rest = target.split(":", 1)
            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments.

        elif target.startswith("^"):
            prefix = "attachment:"

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Emit the tidy form where neither label nor title was given. This
        # test must precede the substitution of the target as the label,
        # since otherwise the tidy form would never be produced.

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)

        # Make a label from the target if none was given.

        if not label:
            label = target

        if not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    # Images of the form !target|options!.

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    # Emphasis, with the enclosed text translated recursively.

    elif match.group("italictext"):
        return "''%s''" % translate_content(match.group("italictext"))

    elif match.group("boldtext"):
        return "'''%s'''" % translate_content(match.group("boldtext"))

    elif match.group("deltext"):
        return "--(%s)--" % translate_content(match.group("deltext"))

    elif match.group("underlinetext"):
        return "__%s__" % translate_content(match.group("underlinetext"))

    elif match.group("subtext"):
        return ",,%s,," % translate_content(match.group("subtext"))

    # Plain text only needs notation conversion.

    else:
        return translate_text(match.group())

def translate_text(s):

    "Translate the plain text string 's', converting notation."

    for before, after in notation_mapping:
        s = s.replace(before, after)
    return s

def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.
    """

    parts = []

    last = 0
    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(translate_text(text[last:start]))

        # Handle unformatted sections: markup is passed through untranslated.

        if sectiontype in ("code", "noformat"):
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(translate_text(text[last:]))
    return "".join(parts)

def translate_block(blocktype, blocktext):

    "Translate the block with the given 'blocktype' and 'blocktext'."

    parts = []

    # Translate headings and blockquotes.
    # (has_key is replaced with the in operator for Python 3 compatibility.)

    if blocktype in blocktypes:
        parts.append(blocktypes[blocktype] % blocktext)

    # Translate list items.

    elif blocktype == "list":
        for listmarker, listitem in get_list_items(blocktext):
            parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem)))

    # Translate table items.

    elif blocktype == "table":
        parts.append("{{{#!table")
        first = True
        for cellsep, columns in get_table_rows(blocktext):
            if not first:
                parts.append("==")
            else:
                first = False
            moinsep = translate_cellsep(cellsep)
            parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns]))
        parts.append("}}}")

    # Handle anonymous blocks.

    else:
        parts.append(translate_content(blocktext))

    return "\n".join(parts)

def get_table_rows(text):

    "Return a list of (cellsep, columns) tuples for the given table 'text'."

    rows = []

    for row in text.split("|\n"):
        if not row:
            break

        # Restore the separator removed by the split above.

        row += "|"
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(row):
            start, end = match.span()
            columns[-1] += row[last:start]

            # Cell separators start new cells; other content is retained
            # verbatim for later translation.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")
            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += row[last:]

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows

# Section types mapped to Moin parser/format headers. Empty values indicate
# plain verbatim regions.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "#!wiki important\n",
    "note" : "#!wiki caution\n",
    "tip" : "#!wiki tip\n",
    "warning" : "#!wiki warning\n",
    }

# Macros which employ section-like syntax.

macrotypes = {
    "anchor" : "<<Anchor(%s)>>",
    "color" : "<<Color(%s)>>",
    }

# General parsing.

def parse_text(s):

    "Parse the content in the string 's', returning the translation."

    parts = []

    # Control spacing between blocks and other blocks or sections.

    preceded_by_block = False

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            if preceded_by_block:
                parts.append("\n")

            first = True
            for blocktype, blocktext in get_blocks(text):
                if not first:
                    parts.append("\n")
                else:
                    first = False
                parts.append("%s" % translate_block(blocktype, blocktext))

            if not first:
                preceded_by_block = True

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.

            if sectiontype in sectiontypes:
                if preceded_by_block:
                    parts.append("\n")
                mointype = sectiontypes[sectiontype]

                parts.append("{{{%s" % (mointype or ""))
                text = text.strip()

                # Sections containing newlines must have a separate header line.

                if options or text.find("\n") != -1:
                    parts.append("\n")

                if options:
                    parts.append("## %s\n" % options)
                parts.append(translate_content(text, sectiontype))
                parts.append("%s}}}\n" % (mointype and "\n" or ""))

                preceded_by_block = True

            # Translations of macros (which can look like sections).

            elif sectiontype in macrotypes:
                parts.append(macrotypes[sectiontype] % translate_content(text, sectiontype))
                preceded_by_block = False

            # Unrecognised sections.

            else:
                parts.append("{{{")

                # Sections containing newlines must have a separate header line.

                if text.find("\n") != -1 and not text.startswith("\n"):
                    parts.append("\n")

                parts.append(translate_content(text, sectiontype))
                parts.append("}}}")
                preceded_by_block = False

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4