ConfluenceConverter (file parser.py at 66605f1f9bfa)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 import re    35     36 URL_SCHEMES = ("http", "https", "ftp", "mailto")    37     38 # Section extraction.    39     40 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"    41 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    42     43 def get_regions(s):    44     45     """    46     Return a list of regions from 's'. Each region is specified using a tuple of    47     the form (type, text).    48     """    49     50     last = 0    51     regions = []    52     for match in sections_regexp.finditer(s):    53         start, end = match.span()    54         regions.append((None, s[last:start]))    55         regions.append(get_section_details(s[start:end]))    56         last = end    57     regions.append((None, s[last:]))    58     return regions    59     60 # Section inspection.    61     62 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"    63 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    64     65 def get_section_details(s):    66     67     "Return the details of a section 's' in the form (type, text)."    68     69     match = section_regexp.match(s)    70     if match:    71         return (match.group("sectiontype"), match.group("options")), match.group("section")    72     else:    73         return None, s    74     75 # Heading, table and list extraction.    76     77 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"    78 table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"    79 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    80     81 blockelement_regexp = re.compile(    82     "(" + list_regexp_str + ")"    83     "|"    84     "(" + table_regexp_str + ")"    85     "|"    86     "(" + blocktext_regexp_str + ")",    87     re.MULTILINE    88     )    89     90 def get_block_elements(s):    91     92     """    93     Extract headings, tables and lists from the given string 's'.    94     """    95     96     last = 0    97     blocks = []    98     for match in blockelement_regexp.finditer(s):    99         start, end = match.span()   100         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")   101         blocks.append((None, s[last:start]))   102         blocks.append((matchtype, match.group("text") or s[start:end]))   103         last = end   104     blocks.append((None, s[last:]))   105     return blocks   106    107 # Block extraction.   108    109 block_regexp_str = r"^(?:\s*\n)+"   110 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   111    112 def get_basic_blocks(s):   113    114     """   115     Return blocks from the given string 's' by splitting the text on blank lines   116     and eliminating those lines.   117     """   118    119     return [b for b in block_regexp.split(s) if b.strip()]   120    121 # Block inspection.   122    123 def get_blocks(s):   124    125     """   126     Return blocks from the given string 's', inspecting the basic blocks and   127     generating additional block-level text where appropriate.   128     """   129    130     blocks = []   131    132     for blocktype, blocktext in get_block_elements(s):   133    134         # Collect heading, list and table blocks.   135    136         if blocktype is not None:   137             blocks.append((blocktype, blocktext))   138    139         # Attempt to find new subblocks in other regions.   140    141         else:   142             for block in get_basic_blocks(blocktext):   143                 blocks.append((None, block))   144    145     return blocks   146    147 # List item inspection.   148    149 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"   150 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   151    152 def get_list_items(text):   153    154     "Return a list of (marker, text) tuples for the given list 'text'."   155    156     items = []   157    158     for match in listitem_regexp.finditer(text):   159         items.append((match.group("marker"), match.group("text")))   160    161     return items   162    163 # Table row inspection.   164    165 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"   166 link_regexp_str = r"[[](?P<linktext>.*?)]"   167 image_regexp_str = r"!(?P<imagetext>.*?)!"   168 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"   169    170 content_regexp_str = (   171     "(" + monospace_regexp_str + ")"   172     "|"   173     "(" + link_regexp_str + ")"   174     "|"   175     "(" + image_regexp_str + ")"   176     )   177    178 table_content_regexp_str = (   179     content_regexp_str +   180     "|"   181     "(" + cellsep_regexp_str + ")"   182     )   183    184 content_regexp = re.compile(content_regexp_str)   185 table_content_regexp = re.compile(table_content_regexp_str)   186    187 def translate_content_match(match):   188    189     "Translate the content described by the given 'match', returning a string."   190    191     if match.group("monotext"):   192         return "{{{%s}}}" % match.group("monotext")   193    194     elif match.group("linktext"):   195         parts = match.group("linktext").split("|")   196    197         # NOTE: Proper detection of external links required.   198    199         if len(parts) == 1:   200             label, target = None, parts[0]   201         elif len(parts) == 2:   202             label, target = parts   203         else:   204             label, target, title = parts   205    206         target = target.strip()   207    208         if target.find(":") != -1:   209             prefix = ""   210             space, rest = target.split(":", 1)   211             if space not in URL_SCHEMES:   212                 target = "%s/%s" % (space, rest)   213         elif target.startswith("#"):   214             prefix = ""   215         elif target.startswith("^"):   216             prefix = "attachment:"   217         else:   218             prefix = "../"   219    220         if len(parts) == 1:   221             return "[[%s%s]]" % (prefix, target)   222         elif len(parts) == 2:   223             return "[[%s%s|%s]]" % (prefix, target, label)   224         else:   225             return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)   226    227     elif match.group("imagetext"):   228         parts = match.group("imagetext").split("|")   229    230         # NOTE: Proper detection of external links required.   231    232         if parts[0].startswith("http"):   233             prefix = ""   234         else:   235             prefix = "attachment:"   236    237         # NOTE: Proper options conversion required.   238    239         if len(parts) == 1:   240             return "{{%s%s}}" % (prefix, parts[0])   241         else:   242             return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])   243    244     else:   245         return match.group()   246    247 def get_table_rows(text):   248    249     "Return a list of (cellsep, columns) tuples for the given table 'text'."   250    251     rows = []   252    253     for line in text.split("\n"):   254         cellsep = None   255         columns = [""]   256         last = 0   257         for match in table_content_regexp.finditer(line):   258             start, end = match.span()   259             columns[-1] += line[last:start]   260    261             if match.group("celltype"):   262                 if cellsep is None:   263                     cellsep = match.group("celltype")   264                 columns.append("")   265             else:   266                 columns[-1] += match.group()   267    268             last = end   269    270         columns[-1] += line[last:]   271    272         if cellsep:   273             rows.append((cellsep, columns[1:-1]))   274    275     return rows   276    277 def translate_content(text, sectiontype=None):   278    279     """   280     Return a translation of the given 'text'. If the optional 'sectiontype' is   281     specified, the translation may be modified to a form appropriate to the   282     section being translated.   283     """   284    285     parts = []   286    287     last = 0   288     for match in content_regexp.finditer(text):   289         start, end = match.span()   290         parts.append(text[last:start])   291    292         # Handle unformatted sections.   293    294         if sectiontype in ("code", "noformat"):   295             parts.append(match.group())   296         else:   297             parts.append(translate_content_match(match))   298    299         last = end   300    301     parts.append(text[last:])   302     return "".join(parts)   303    304 # Translation helpers.   305    306 blocktypes = {   307     "h1" : "= %s =",   308     "h2" : "== %s ==",   309     "h3" : "=== %s ===",   310     "h4" : "==== %s ====",   311     "h5" : "===== %s =====",   312     "h6" : "====== %s ======",   313     "bq" : "{{{%s}}}",   314     }   315    316 markers = {   317     "*" : "*",   318     "#" : "1.",   319     "-" : "*",   320     }   321    322 def translate_marker(marker):   323    324     "Translate the given 'marker' to a suitable Moin representation."   325    326     return " " * len(marker) + markers[marker[-1]]   327    328 cellseps = {   329     "|" : "||",   330     "||" : "||",   331     }   332    333 cellextra = {   334     "|" : "",   335     "||" : "'''",   336     }   337    338 def translate_cellsep(cellsep):   339    340     "Translate the given 'cellsep' to a suitable Moin representation."   341    342     return cellseps[cellsep]   343    344 def translate_cell(cellsep, text):   345    346     "Using 'cellsep', translate the cell 'text'."   347    348     return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]   349    350 sectiontypes = {   351     "code" : "",   352     "noformat" : "",   353     "quote" : "",   354     "info" : "wiki important",   355     "note" : "wiki caution",   356     "tip" : "wiki tip",   357     "warning" : "wiki warning",   358     }   359    360 # General parsing.   361    362 def parse(s, out):   363    364     "Parse the content in the string 's', writing a translation to 'out'."   365    366     for type, text in get_regions(s):   367    368         # Handle list, heading, blockquote or anonymous blocks.   369    370         if type is None:   371             for blocktype, blocktext in get_blocks(text):   372    373                 # Translate headings and blockquotes.   374    375                 if blocktypes.has_key(blocktype):   376                     print >>out, blocktypes[blocktype] % blocktext   377    378                 # Translate list items.   379    380                 elif blocktype == "list":   381                     for listmarker, listitem in get_list_items(blocktext):   382                         print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))   383    384                 # Translate table items.   385    386                 elif blocktype == "table":   387                     for cellsep, columns in get_table_rows(blocktext):   388                         moinsep = translate_cellsep(cellsep)   389                         print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep   390    391                 # Handle anonymous blocks.   392    393                 else:   394                     print >>out, translate_content(blocktext.rstrip())   395    396                 print >>out   397    398         # Handle sections.   399    400         else:   401             sectiontype, options = type   402    403             # Direct translations of sections.   404    405             mointype = sectiontypes.get(sectiontype)   406             if mointype:   407                 print >>out, "{{{#!%s" % mointype   408                 if options:   409                     print >>out, "##", options   410             else:   411                 print >>out, "{{{",   412             print >>out, translate_content(text, sectiontype),   413             print >>out, "}}}"   414             print >>out   415    416 if __name__ == "__main__":   417     import sys   418    419     s = sys.stdin.read()   420     parse(s, sys.stdout)   421    422 # vim: tabstop=4 expandtab shiftwidth=4