ConfluenceConverter (file parser.py at f4b8774961a0)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 import re    35     36 # Section extraction.    37     38 sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"    39 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    40     41 def get_regions(s):    42     43     """    44     Return a list of regions from 's'. Each region is specified using a tuple of    45     the form (type, text).    46     """    47     48     last = 0    49     regions = []    50     for match in sections_regexp.finditer(s):    51         start, end = match.span()    52         regions.append((None, s[last:start]))    53         regions.append(get_section_details(s[start:end]))    54         last = end    55     regions.append((None, s[last:]))    56     return regions    57     58 # Section inspection.    59     60 section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"    61 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    62     63 def get_section_details(s):    64     65     "Return the details of a section 's' in the form (type, text)."    66     67     match = section_regexp.match(s)    68     if match:    69         return match.group("sectiontype"), match.group("section")    70     else:    71         return None, s    72     73 # Heading, table and list extraction.    74     75 list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*"    76 table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"    77 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    78     79 blockelement_regexp = re.compile(    80     "(" + list_regexp_str + ")"    81     "|"    82     "(" + table_regexp_str + ")"    83     "|"    84     "(" + blocktext_regexp_str + ")",    85     re.MULTILINE    86     )    87     88 def get_block_elements(s):    89     90     """    91     Extract headings, tables and lists from the given string 's'.    92     """    93     94     last = 0    95     blocks = []    96     for match in blockelement_regexp.finditer(s):    97         start, end = match.span()    98         matchtype = match.group("listtype") and "list" or match.group("celltype") and "table" or match.group("type")    99         blocks.append((None, s[last:start]))   100         blocks.append((matchtype, match.group("text") or s[start:end]))   101         last = end   102     blocks.append((None, s[last:]))   103     return blocks   104    105 # Block extraction.   106    107 block_regexp_str = r"^(?:\s*\n)+"   108 block_regexp = re.compile(block_regexp_str, re.MULTILINE)   109    110 def get_basic_blocks(s):   111    112     """   113     Return blocks from the given string 's' by splitting the text on blank lines   114     and eliminating those lines.   115     """   116    117     return [b for b in block_regexp.split(s) if b.strip()]   118    119 # Block inspection.   120    121 def get_blocks(s):   122    123     """   124     Return blocks from the given string 's', inspecting the basic blocks and   125     generating additional block-level text where appropriate.   126     """   127    128     blocks = []   129    130     for blocktype, blocktext in get_block_elements(s):   131    132         # Collect heading, list and table blocks.   133    134         if blocktype is not None:   135             blocks.append((blocktype, blocktext))   136    137         # Attempt to find new subblocks in other regions.   138    139         else:   140             for block in get_basic_blocks(blocktext):   141                 blocks.append((None, block))   142    143     return blocks   144    145 # List item inspection.   146    147 listitem_regexp_str = r"^(?P<marker>[*#-])+\s*(?P<text>.*)$"   148 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   149    150 def get_list_items(text):   151    152     "Return a list of (marker, text) tuples for the given list 'text'."   153    154     items = []   155    156     for match in listitem_regexp.finditer(text):   157         items.append((match.group("marker"), match.group("text")))   158    159     return items   160    161 # Table row inspection.   162    163 link_regexp_str = r"[[](?P<linktext>.*?)]"   164 image_regexp_str = r"!(?P<imagetext>.*?)!"   165 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"   166 content_regexp = re.compile(   167     "(" + link_regexp_str + ")"   168     "|"   169     "(" + image_regexp_str + ")"   170     "|"   171     "(" + cellsep_regexp_str + ")"   172     )   173    174 def get_table_rows(text):   175    176     "Return a list of (cellsep, columns) tuples for the given table 'text'."   177    178     rows = []   179    180     for line in text.split("\n"):   181         cellsep = None   182         columns = [""]   183         last = 0   184         for match in content_regexp.finditer(line):   185             start, end = match.span()   186             columns[-1] += line[last:start]   187    188             if match.group("celltype"):   189                 if cellsep is None:   190                     cellsep = match.group("celltype")   191                 columns.append("")   192             else:   193                 columns[-1] += line[start:end]   194    195             last = end   196    197         columns[-1] += line[last:]   198    199         if cellsep:   200             rows.append((cellsep, columns[1:-1]))   201    202     return rows   203    204 # General parsing and translation.   205    206 blocktypes = {   207     "h1" : "= %s =",   208     "h2" : "== %s ==",   209     "h3" : "=== %s ===",   210     "h4" : "==== %s ====",   211     "h5" : "===== %s =====",   212     "h6" : "====== %s ======",   213     "bq" : "{{{%s}}}",   214     }   215    216 markers = {   217     "*" : "*",   218     "#" : "1.",   219     "-" : "*",   220     }   221    222 def translate_marker(marker):   223    224     "Translate the given 'marker' to a suitable Moin representation."   225    226     return " " * len(marker) + markers[marker[-1]]   227    228 cellseps = {   229     "|" : "||",   230     "||" : "||",   231     }   232    233 cellextra = {   234     "|" : "",   235     "||" : "'''",   236     }   237    238 def translate_cellsep(cellsep):   239    240     "Translate the given 'cellsep' to a suitable Moin representation."   241    242     return cellseps[cellsep]   243    244 def translate_cell(cellsep, text):   245    246     "Using 'cellsep', translate the cell 'text'."   247    248     return cellextra[cellsep] + text + cellextra[cellsep]   249    250 def parse(s, out):   251    252     "Parse the content in the string 's', writing a translation to 'out'."   253    254     for type, text in get_regions(s):   255    256         # Handle list, heading, blockquote or anonymous blocks.   257    258         if type is None:   259             for blocktype, blocktext in get_blocks(text):   260    261                 # Translate headings and blockquotes.   262    263                 if blocktypes.has_key(blocktype):   264                     print >>out, blocktypes[blocktype] % blocktext   265    266                 # Translate list items.   267    268                 elif blocktype == "list":   269                     for listmarker, listitem in get_list_items(blocktext):   270                         print >>out, "%s %s" % (translate_marker(listmarker), listitem)   271    272                 # Translate table items.   273    274                 elif blocktype == "table":   275                     for cellsep, columns in get_table_rows(blocktext):   276                         moinsep = translate_cellsep(cellsep)   277                         print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep   278    279                 # Handle anonymous blocks.   280    281                 else:   282                     print >>out, blocktext.rstrip()   283    284                 print >>out   285    286         # Handle sections.   287    288         else:   289             print >>out, "{{{",   290             print >>out, text,   291             print >>out, "}}}"   292             print >>out   293    294 if __name__ == "__main__":   295     import sys   296    297     s = sys.stdin.read()   298     parse(s, sys.stdout)   299    300 # vim: tabstop=4 expandtab shiftwidth=4