ConfluenceConverter (file parser.py at 0e41fd332cf5)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 import re    35     36 URL_SCHEMES = ("http", "https", "ftp", "mailto")    37     38 # Section extraction.    39     40 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"    41 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    42     43 def get_regions(s):    44     45     """    46     Return a list of regions from 's'. Each region is specified using a tuple of    47     the form (type, text).    48     """    49     50     last = 0    51     regions = []    52     for match in sections_regexp.finditer(s):    53         start, end = match.span()    54         regions.append((None, s[last:start]))    55         regions.append(get_section_details(s[start:end]))    56         last = end    57     regions.append((None, s[last:]))    58     return regions    59     60 # Section inspection.    61     62 section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"    63 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    64     65 def get_section_details(s):    66     67     "Return the details of a section 's' in the form (type, text)."    68     69     match = section_regexp.match(s)    70     if match:    71         return (match.group("sectiontype"), match.group("options")), match.group("section")    72     else:    73         return None, s    74     75 # Heading, table and list extraction.    76     77 list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"    78 table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"    79 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    80     81 blockelement_regexp = re.compile(    82     "(" + list_regexp_str + ")"    83     "|"    84     "(" + table_regexp_str + ")"    85     "|"    86     "(" + blocktext_regexp_str + ")",    87     re.MULTILINE    88     )    89     90 def get_block_elements(s):    91     92     """    93     Extract headings, tables and lists from the given string 's'.    94     """    95     96     last = 0    97     blocks = []    98     for match in blockelement_regexp.finditer(s):    99         start, end = match.span()   100         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")   101         blocks.append((None, s[last:start]))   102         blocks.append((matchtype, match.group("text") or s[start:end]))   103         last = end   104     blocks.append((None, s[last:]))   105     return blocks   106    107 # Block extraction.   108    109 block_regexp_str = r"^(?:\s*\n)+"   110 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   111    112 def get_basic_blocks(s):   113    114     """   115     Return blocks from the given string 's' by splitting the text on blank lines   116     and eliminating those lines.   117     """   118    119     return [b for b in block_regexp.split(s) if b.strip()]   120    121 # Block inspection.   122    123 def get_blocks(s):   124    125     """   126     Return blocks from the given string 's', inspecting the basic blocks and   127     generating additional block-level text where appropriate.   128     """   129    130     blocks = []   131    132     for blocktype, blocktext in get_block_elements(s):   133    134         # Collect heading, list and table blocks.   135    136         if blocktype is not None:   137             blocks.append((blocktype, blocktext))   138    139         # Attempt to find new subblocks in other regions.   140    141         else:   142             for block in get_basic_blocks(blocktext):   143                 blocks.append((None, block))   144    145     return blocks   146    147 # List item inspection.   148    149 listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"   150 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   151    152 def get_list_items(text):   153    154     "Return a list of (marker, text) tuples for the given list 'text'."   155    156     items = []   157    158     for match in listitem_regexp.finditer(text):   159         items.append((match.group("marker"), match.group("text")))   160    161     return items   162    163 # Table row inspection.   164    165 monospace_regexp_str = r"{{(?P<monotext>.*?)}}"   166 link_regexp_str = r"[[](?P<linktext>.*?)]"   167 image_regexp_str = r"!(?P<imagetext>.*?)!"   168 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"   169    170 content_regexp_str = (   171     "(" + monospace_regexp_str + ")"   172     "|"   173     "(" + link_regexp_str + ")"   174     "|"   175     "(" + image_regexp_str + ")"   176     )   177    178 table_content_regexp_str = (   179     content_regexp_str +   180     "|"   181     "(" + cellsep_regexp_str + ")"   182     )   183    184 content_regexp = re.compile(content_regexp_str)   185 table_content_regexp = re.compile(table_content_regexp_str)   186    187 def translate_content_match(match):   188    189     "Translate the content described by the given 'match', returning a string."   190    191     if match.group("monotext"):   192         return "{{{%s}}}" % match.group("monotext")   193    194     elif match.group("linktext"):   195         parts = match.group("linktext").split("|")   196    197         # NOTE: Proper detection of external links required.   198    199         if len(parts) == 1:   200             label, target, title = None, parts[0], None   201         elif len(parts) == 2:   202             (label, target), title = parts, None   203         else:   204             label, target, title = parts   205    206         target = target.strip()   207    208         # Look for namespace links and rewrite them.   209    210         if target.find(":") != -1:   211             prefix = ""   212             space, rest = target.split(":", 1)   213             if space not in URL_SCHEMES:   214                 target = "%s/%s" % (space, rest)   215    216         # Detect anchors.   217    218         elif target.startswith("#"):   219             prefix = ""   220    221         # Detect attachments.   222    223         elif target.startswith("^"):   224             prefix = "attachment:"   225    226         # Link to other pages within a space.   227    228         else:   229             prefix = "../"   230    231             # Make the link tidier by making a target if none was given.   232    233             if not label:   234                 label = target   235    236         if not label and not title:   237             return "[[%s%s]]" % (prefix, target)   238         elif not title:   239             return "[[%s%s|%s]]" % (prefix, target, label)   240         else:   241             return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)   242    243     elif match.group("imagetext"):   244         parts = match.group("imagetext").split("|")   245    246         # NOTE: Proper detection of external links required.   247    248         if parts[0].startswith("http"):   249             prefix = ""   250         else:   251             prefix = "attachment:"   252    253         # NOTE: Proper options conversion required.   254    255         if len(parts) == 1:   256             return "{{%s%s}}" % (prefix, parts[0])   257         else:   258             return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])   259    260     else:   261         return match.group()   262    263 def get_table_rows(text):   264    265     "Return a list of (cellsep, columns) tuples for the given table 'text'."   266    267     rows = []   268    269     for line in text.split("\n"):   270         cellsep = None   271         columns = [""]   272         last = 0   273         for match in table_content_regexp.finditer(line):   274             start, end = match.span()   275             columns[-1] += line[last:start]   276    277             if match.group("celltype"):   278                 if cellsep is None:   279                     cellsep = match.group("celltype")   280                 columns.append("")   281             else:   282                 columns[-1] += match.group()   283    284             last = end   285    286         columns[-1] += line[last:]   287    288         if cellsep:   289             rows.append((cellsep, columns[1:-1]))   290    291     return rows   292    293 def translate_content(text, sectiontype=None):   294    295     """   296     Return a translation of the given 'text'. If the optional 'sectiontype' is   297     specified, the translation may be modified to a form appropriate to the   298     section being translated.   299     """   300    301     parts = []   302    303     last = 0   304     for match in content_regexp.finditer(text):   305         start, end = match.span()   306         parts.append(text[last:start])   307    308         # Handle unformatted sections.   309    310         if sectiontype in ("code", "noformat"):   311             parts.append(match.group())   312         else:   313             parts.append(translate_content_match(match))   314    315         last = end   316    317     parts.append(text[last:])   318     return "".join(parts)   319    320 # Translation helpers.   321    322 blocktypes = {   323     "h1" : "= %s =",   324     "h2" : "== %s ==",   325     "h3" : "=== %s ===",   326     "h4" : "==== %s ====",   327     "h5" : "===== %s =====",   328     "h6" : "====== %s ======",   329     "bq" : "{{{%s}}}",   330     }   331    332 markers = {   333     "*" : "*",   334     "#" : "1.",   335     "-" : "*",   336     }   337    338 def translate_marker(marker):   339    340     "Translate the given 'marker' to a suitable Moin representation."   341    342     return " " * len(marker) + markers[marker[-1]]   343    344 cellseps = {   345     "|" : "||",   346     "||" : "||",   347     }   348    349 cellextra = {   350     "|" : "",   351     "||" : "'''",   352     }   353    354 def translate_cellsep(cellsep):   355    356     "Translate the given 'cellsep' to a suitable Moin representation."   357    358     return cellseps[cellsep]   359    360 def translate_cell(cellsep, text):   361    362     "Using 'cellsep', translate the cell 'text'."   363    364     return cellextra[cellsep] + translate_content(text) + cellextra[cellsep]   365    366 sectiontypes = {   367     "code" : "",   368     "noformat" : "",   369     "quote" : "",   370     "info" : "wiki important",   371     "note" : "wiki caution",   372     "tip" : "wiki tip",   373     "warning" : "wiki warning",   374     }   375    376 # General parsing.   377    378 def parse(s, out):   379    380     "Parse the content in the string 's', writing a translation to 'out'."   381    382     for type, text in get_regions(s):   383    384         # Handle list, heading, blockquote or anonymous blocks.   385    386         if type is None:   387             for blocktype, blocktext in get_blocks(text):   388    389                 # Translate headings and blockquotes.   390    391                 if blocktypes.has_key(blocktype):   392                     print >>out, blocktypes[blocktype] % blocktext   393    394                 # Translate list items.   395    396                 elif blocktype == "list":   397                     for listmarker, listitem in get_list_items(blocktext):   398                         print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))   399    400                 # Translate table items.   401    402                 elif blocktype == "table":   403                     for cellsep, columns in get_table_rows(blocktext):   404                         moinsep = translate_cellsep(cellsep)   405                         print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep   406    407                 # Handle anonymous blocks.   408    409                 else:   410                     print >>out, translate_content(blocktext.rstrip())   411    412                 print >>out   413    414         # Handle sections.   415    416         else:   417             sectiontype, options = type   418    419             # Direct translations of sections.   420    421             mointype = sectiontypes.get(sectiontype)   422             if mointype:   423                 print >>out, "{{{#!%s" % mointype   424                 if options:   425                     print >>out, "##", options   426             else:   427                 print >>out, "{{{",   428             print >>out, translate_content(text, sectiontype),   429             print >>out, "}}}"   430             print >>out   431    432 if __name__ == "__main__":   433     import sys   434    435     s = sys.stdin.read()   436     parse(s, sys.stdout)   437    438 # vim: tabstop=4 expandtab shiftwidth=4