ConfluenceConverter (file parser.py at 0a5ff722fee3)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 import re    35     36 # Section extraction.    37     38 sections_regexp_str = r"(?<!{){(?P<type>[^{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"    39 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    40     41 def get_regions(s):    42     43     """    44     Return a list of regions from 's'. Each region is specified using a tuple of    45     the form (type, text).    46     """    47     48     last = 0    49     regions = []    50     for match in sections_regexp.finditer(s):    51         start, end = match.span()    52         regions.append((None, s[last:start]))    53         regions.append(get_section_details(s[start:end]))    54         last = end    55     regions.append((None, s[last:]))    56     return regions    57     58 # Section inspection.    59     60 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"    61 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    62     63 def get_section_details(s):    64     65     "Return the details of a section 's' in the form (type, text)."    66     67     match = section_regexp.match(s)    68     if match:    69         return (match.group("sectiontype"), match.group("options")), match.group("section")    70     else:    71         return None, s    72     73 # Heading, table and list extraction.    74     75 list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*"    76 table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"    77 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    78     79 blockelement_regexp = re.compile(    80     "(" + list_regexp_str + ")"    81     "|"    82     "(" + table_regexp_str + ")"    83     "|"    84     "(" + blocktext_regexp_str + ")",    85     re.MULTILINE    86     )    87     88 def get_block_elements(s):    89     90     """    91     Extract headings, tables and lists from the given string 's'.    92     """    93     94     last = 0    95     blocks = []    96     for match in blockelement_regexp.finditer(s):    97         start, end = match.span()    98         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")    99         blocks.append((None, s[last:start]))   100         blocks.append((matchtype, match.group("text") or s[start:end]))   101         last = end   102     blocks.append((None, s[last:]))   103     return blocks   104    105 # Block extraction.   106    107 block_regexp_str = r"^(?:\s*\n)+"   108 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   109    110 def get_basic_blocks(s):   111    112     """   113     Return blocks from the given string 's' by splitting the text on blank lines   114     and eliminating those lines.   115     """   116    117     return [b for b in block_regexp.split(s) if b.strip()]   118    119 # Block inspection.   120    121 def get_blocks(s):   122    123     """   124     Return blocks from the given string 's', inspecting the basic blocks and   125     generating additional block-level text where appropriate.   126     """   127    128     blocks = []   129    130     for blocktype, blocktext in get_block_elements(s):   131    132         # Collect heading, list and table blocks.   133    134         if blocktype is not None:   135             blocks.append((blocktype, blocktext))   136    137         # Attempt to find new subblocks in other regions.   138    139         else:   140             for block in get_basic_blocks(blocktext):   141                 blocks.append((None, block))   142    143     return blocks   144    145 # List item inspection.   146    147 listitem_regexp_str = r"^(?P<marker>[*#-])+\s*(?P<text>.*)$"   148 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   149    150 def get_list_items(text):   151    152     "Return a list of (marker, text) tuples for the given list 'text'."   153    154     items = []   155    156     for match in listitem_regexp.finditer(text):   157         items.append((match.group("marker"), match.group("text")))   158    159     return items   160    161 # Table row inspection.   162    163 link_regexp_str = r"[[](?P<linktext>.*?)]"   164 image_regexp_str = r"!(?P<imagetext>.*?)!"   165 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"   166    167 content_regexp_str = (   168     "(" + link_regexp_str + ")"   169     "|"   170     "(" + image_regexp_str + ")"   171     )   172    173 table_content_regexp_str = (   174     content_regexp_str +   175     "|"   176     "(" + cellsep_regexp_str + ")"   177     )   178    179 content_regexp = re.compile(content_regexp_str)   180 table_content_regexp = re.compile(table_content_regexp_str)   181    182 def translate_content_match(match):   183    184     "Translate the content described by the given 'match', returning a string."   185    186     if match.group("linktext"):   187         parts = match.group("linktext").split("|")   188    189         # NOTE: Proper detection of external links required.   190    191         if len(parts) > 1 and parts[1].startswith("http"):   192             prefix = ""   193         elif parts[0].startswith("#"):   194             prefix = ""   195         elif parts[0].startswith("^"):   196             prefix = "attachment:"   197         else:   198             prefix = "../"   199    200         if len(parts) == 1:   201             return "[[%s%s]]" % (prefix, parts[0])   202         elif len(parts) == 2:   203             return "[[%s%s|%s]]" % (prefix, parts[1], parts[0])   204         else:   205             return "[[%s%s|%s|title=%s]]" % (prefix, parts[1], parts[0], parts[2])   206    207     elif match.group("imagetext"):   208         parts = match.group("imagetext").split("|")   209    210         # NOTE: Proper detection of external links required.   211    212         if parts[0].startswith("http"):   213             prefix = ""   214         else:   215             prefix = "attachment:"   216    217         # NOTE: Proper options conversion required.   218    219         if len(parts) == 1:   220             return "{{%s%s}}" % (prefix, parts[0])   221         else:   222             return "{{%s%s|%s}}" % (prefix, parts[1], parts[0])   223    224     else:   225         return match.group()   226    227 def get_table_rows(text):   228    229     "Return a list of (cellsep, columns) tuples for the given table 'text'."   230    231     rows = []   232    233     for line in text.split("\n"):   234         cellsep = None   235         columns = [""]   236         last = 0   237         for match in table_content_regexp.finditer(line):   238             start, end = match.span()   239             columns[-1] += line[last:start]   240    241             if match.group("celltype"):   242                 if cellsep is None:   243                     cellsep = match.group("celltype")   244                 columns.append("")   245             else:   246                 columns[-1] += match.group()   247    248             last = end   249    250         columns[-1] += line[last:]   251    252         if cellsep:   253             rows.append((cellsep, columns[1:-1]))   254    255     return rows   256    257 def translate_content(text):   258    259     "Return a translation of the given 'text'."   260    261     parts = []   262    263     last = 0   264     for match in content_regexp.finditer(text):   265         start, end = match.span()   266         parts.append(text[last:start])   267         parts.append(translate_content_match(match))   268         last = end   269    270     parts.append(text[last:])   271     return "".join(parts)   272    273 # Translation helpers.   274    275 blocktypes = {   276     "h1" : "= %s =",   277     "h2" : "== %s ==",   278     "h3" : "=== %s ===",   279     "h4" : "==== %s ====",   280     "h5" : "===== %s =====",   281     "h6" : "====== %s ======",   282     "bq" : "{{{%s}}}",   283     }   284    285 markers = {   286     "*" : "*",   287     "#" : "1.",   288     "-" : "*",   289     }   290    291 def translate_marker(marker):   292    293     "Translate the given 'marker' to a suitable Moin representation."   294    295     return " " * len(marker) + markers[marker[-1]]   296    297 cellseps = {   298     "|" : "||",   299     "||" : "||",   300     }   301    302 cellextra = {   303     "|" : "",   304     "||" : "'''",   305     }   306    307 def translate_cellsep(cellsep):   308    309     "Translate the given 'cellsep' to a suitable Moin representation."   310    311     return cellseps[cellsep]   312    313 def translate_cell(cellsep, text):   314    315     "Using 'cellsep', translate the cell 'text'."   316    317     return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]   318    319 sectiontypes = {   320     "code" : "",   321     "noformat" : "",   322     "quote" : "",   323     "info" : "wiki important",   324     "note" : "wiki caution",   325     "tip" : "wiki tip",   326     "warning" : "wiki warning",   327     }   328    329 # General parsing.   330    331 def parse(s, out):   332    333     "Parse the content in the string 's', writing a translation to 'out'."   334    335     for type, text in get_regions(s):   336    337         # Handle list, heading, blockquote or anonymous blocks.   338    339         if type is None:   340             for blocktype, blocktext in get_blocks(text):   341    342                 # Translate headings and blockquotes.   343    344                 if blocktypes.has_key(blocktype):   345                     print >>out, blocktypes[blocktype] % blocktext   346    347                 # Translate list items.   348    349                 elif blocktype == "list":   350                     for listmarker, listitem in get_list_items(blocktext):   351                         print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))   352    353                 # Translate table items.   354    355                 elif blocktype == "table":   356                     for cellsep, columns in get_table_rows(blocktext):   357                         moinsep = translate_cellsep(cellsep)   358                         print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep   359    360                 # Handle anonymous blocks.   361    362                 else:   363                     print >>out, translate_content(blocktext.rstrip())   364    365                 print >>out   366    367         # Handle sections.   368    369         else:   370             sectiontype, options = type   371    372             # Direct translations of sections.   373    374             mointype = sectiontypes.get(sectiontype)   375             if mointype:   376                 print >>out, "{{{#!%s" % mointype   377                 if options:   378                     print >>out, "##", options   379             else:   380                 print >>out, "{{{",   381             print >>out, translate_content(text),   382             print >>out, "}}}"   383             print >>out   384    385 if __name__ == "__main__":   386     import sys   387    388     s = sys.stdin.read()   389     parse(s, sys.stdout)   390    391 # vim: tabstop=4 expandtab shiftwidth=4