1 #!/usr/bin/env python 2 3 """ 4 Confluence Wiki syntax parsing. 5 6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk> 7 8 This software is free software; you can redistribute it and/or 9 modify it under the terms of the GNU General Public License as 10 published by the Free Software Foundation; either version 2 of 11 the License, or (at your option) any later version. 12 13 This software is distributed in the hope that it will be useful, 14 but WITHOUT ANY WARRANTY; without even the implied warranty of 15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the 16 GNU General Public License for more details. 17 18 You should have received a copy of the GNU General Public 19 License along with this library; see the file LICENCE.txt 20 If not, write to the Free Software Foundation, Inc., 21 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA 22 23 -------- 24 25 The basic procedure is as follows: 26 27 1. Wiki pages are first split up into regions. 28 2. Then, within these regions, the text is split into blocks. 29 1. First, lists are identified. 30 2. Additionally, other block-like elements are identified. 31 3. Each block is then parsed. 32 """ 33 34 from common import * 35 import re 36 import sys 37 import codecs 38 39 # Section extraction. 40 41 sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}" 42 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE) 43 44 def get_regions(s): 45 46 """ 47 Return a list of regions from 's'. Each region is specified using a tuple of 48 the form (type, text). 49 """ 50 51 last = 0 52 regions = [] 53 for match in sections_regexp.finditer(s): 54 start, end = match.span() 55 regions.append((None, s[last:start])) 56 regions.append(get_section_details(s[start:end])) 57 last = end 58 regions.append((None, s[last:])) 59 return regions 60 61 # Section inspection. 
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text). For a
    recognised section, the type is a (sectiontype, options) tuple, with
    options being None where no ":options" suffix was given; otherwise the
    type is None and the text is returned unchanged.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s

# Heading, table and list extraction.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*\s+.*(\n\s*(?P=listtype).*?)*(?:\n|$)"
table_regexp_str = r"^((?P<celltype>[|]{1,2})((.|\n(?!\n))+?(?P=celltype))+(\n|$))+"
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's', returning a
    list of (type, text) tuples. Regions between recognised elements are
    returned with a type of None.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Classify the element according to which alternative participated in
        # the match. (An explicit chain replaces the fragile "and/or"
        # conditional idiom.)

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        # Plain text preceding the element.

        blocks.append((None, s[last:start]))

        # Headings and blockquotes provide explicit text via the "text" group;
        # lists and tables contribute the entire matched region.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines (and any whitespace-only fragments).
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Headings, lists and tables are used directly.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Other regions are divided into basic blocks.

        else:
            blocks += [(None, subblock) for subblock in get_basic_blocks(blocktext)]

    return blocks

# List item inspection.

listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s+(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [(match.group("marker"), match.group("text"))
            for match in listitem_regexp.finditer(text)]

# Content inspection.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"[[](?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>\w.*?)!"

# Word-dependent patterns.
# Here, the unbracketed markers must test for the absence of surrounding word
# characters.
173 174 italic_regexp_str = r"(?:(?<!\w)_|\{_\})(?P<italictext>.*?)(?:_(?!\w)|\{_\})" 175 bold_regexp_str = r"(?:(?<!\w)\*|\{\*\})(?P<boldtext>.*?)(?:\*(?!\w)|\{\*\})" 176 del_regexp_str = r"(?:(?<!\w)-|\{-\})(?P<deltext>.*?)(?:-(?!\w)|\{-\})" 177 underline_regexp_str = r"(?:(?<!\w)\+|\{\+\})(?P<underlinetext>.*?)(?:\+(?!\w)|\{\+\})" 178 sub_regexp_str = r"(?:(?<!\w)~|\{~\})(?P<subtext>.*?)(?:~(?!\w)|\{~\})" 179 180 content_regexp_str = ( 181 "(" + monospace_regexp_str + ")" 182 "|" 183 "(" + link_regexp_str + ")" 184 "|" 185 "(" + image_regexp_str + ")" 186 "|" 187 "(" + italic_regexp_str + ")" 188 "|" 189 "(" + bold_regexp_str + ")" 190 "|" 191 "(" + del_regexp_str + ")" 192 "|" 193 "(" + underline_regexp_str + ")" 194 "|" 195 "(" + sub_regexp_str + ")" 196 ) 197 198 # Table row inspection. 199 200 cellsep_regexp_str = r"(?P<celltype>[|]{1,2})" 201 202 table_content_regexp_str = ( 203 content_regexp_str + 204 "|" 205 "(" + cellsep_regexp_str + ")" 206 ) 207 208 content_regexp = re.compile(content_regexp_str) 209 table_content_regexp = re.compile(table_content_regexp_str) 210 211 # Notation conversion. 212 213 notation_mapping = [ 214 (r"\!", "!"), 215 (r"\-", "-"), 216 (r"\\""\n", "<<BR>>"), 217 (r"\\ ", "<<BR>>"), 218 (r"\~", "~"), 219 ] 220 221 preformatted_notation_mapping = [ 222 (r"\!", "!"), 223 (r"\-", "-"), 224 (r"\\""\n", "\n"), 225 (r"\\ ", "\n"), 226 (r"\~", "~"), 227 ] 228 229 # Translation helpers. 230 231 markers = { 232 "*" : "*", 233 "#" : "1.", 234 "-" : "*", 235 } 236 237 def translate_marker(marker): 238 239 "Translate the given 'marker' to a suitable Moin representation." 240 241 return " " * len(marker) + markers[marker[-1]] 242 243 cellseps = { 244 "|" : "\n|| ", 245 "||" : "\n|| ", 246 } 247 248 cellextra = { 249 "|" : "", 250 "||" : "'''", 251 } 252 253 def translate_cellsep(cellsep): 254 255 "Translate the given 'cellsep' to a suitable Moin representation." 
256 257 return cellseps[cellsep] 258 259 def translate_cell(cellsep, text): 260 261 "Using 'cellsep', translate the cell 'text'." 262 263 return cellextra[cellsep] + parse_text(text).strip() + cellextra[cellsep] 264 265 def translate_content_match(match): 266 267 "Translate the content described by the given 'match', returning a string." 268 269 if match.group("monotext"): 270 return "{{{%s}}}" % match.group("monotext") 271 272 elif match.group("linktext"): 273 parts = match.group("linktext").split("|") 274 275 # NOTE: Proper detection of external links required. 276 277 if len(parts) == 1: 278 label, target, title = None, parts[0], None 279 elif len(parts) == 2: 280 (label, target), title = parts, None 281 else: 282 label, target, title = parts 283 284 target = target.strip() 285 286 # Look for namespace links and rewrite them. 287 288 if target.find(":") != -1: 289 prefix = "" 290 space, rest = target.split(":", 1) 291 if space not in URL_SCHEMES: 292 target = "%s/%s" % (space, rest) 293 294 # Detect anchors. 295 296 elif target.startswith("#"): 297 prefix = "" 298 299 # Detect attachments. 300 301 elif target.startswith("^"): 302 prefix = "attachment:" 303 304 # Link to other pages within a space. 305 306 else: 307 prefix = "../" 308 309 # Make the link tidier by making a target if none was given. 310 311 if not label: 312 label = target 313 314 if not label and not title: 315 return "[[%s%s]]" % (prefix, target) 316 elif not title: 317 return "[[%s%s|%s]]" % (prefix, target, label) 318 else: 319 return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title) 320 321 elif match.group("imagetext"): 322 parts = match.group("imagetext").split("|") 323 324 # NOTE: Proper detection of external links required. 325 326 if parts[0].startswith("http"): 327 prefix = "" 328 else: 329 prefix = "attachment:" 330 331 # NOTE: Proper options conversion required. 
332 333 if len(parts) == 1: 334 return "{{%s%s}}" % (prefix, parts[0]) 335 else: 336 return "{{%s%s|%s}}" % (prefix, parts[0], parts[1]) 337 338 elif match.group("italictext"): 339 return "''%s''" % translate_content(match.group("italictext")) 340 341 elif match.group("boldtext"): 342 return "'''%s'''" % translate_content(match.group("boldtext")) 343 344 elif match.group("deltext"): 345 return "--(%s)--" % translate_content(match.group("deltext")) 346 347 elif match.group("underlinetext"): 348 return "__%s__" % translate_content(match.group("underlinetext")) 349 350 elif match.group("subtext"): 351 return ",,%s,," % translate_content(match.group("subtext")) 352 353 else: 354 return translate_text(match.group()) 355 356 def translate_text(s, preformatted=False): 357 358 "Translate the plain text string 's', converting notation." 359 360 for before, after in preformatted and preformatted_notation_mapping or notation_mapping: 361 s = s.replace(before, after) 362 return s 363 364 def translate_content(text, sectiontype=None): 365 366 """ 367 Return a translation of the given 'text'. If the optional 'sectiontype' is 368 specified, the translation may be modified to a form appropriate to the 369 section being translated. 370 """ 371 372 parts = [] 373 preformatted = sectiontype in preformatted_sectiontypes 374 375 last = 0 376 for match in content_regexp.finditer(text): 377 start, end = match.span() 378 parts.append(translate_text(text[last:start], preformatted)) 379 380 # Handle unformatted sections. 381 382 if sectiontype in ("code", "noformat"): 383 parts.append(match.group()) 384 else: 385 parts.append(translate_content_match(match)) 386 387 last = end 388 389 parts.append(translate_text(text[last:], preformatted)) 390 return "".join(parts) 391 392 def translate_block(blocktype, blocktext): 393 394 "Translate the block with the given 'blocktype' and 'blocktext'." 395 396 parts = [] 397 398 # Translate headings and blockquotes. 
399 400 if blocktypes.has_key(blocktype): 401 parts.append(blocktypes[blocktype] % blocktext) 402 403 # Translate list items. 404 405 elif blocktype == "list": 406 for listmarker, listitem in get_list_items(blocktext): 407 parts.append("%s %s" % (translate_marker(listmarker), translate_content(listitem))) 408 409 # Translate table items. 410 411 elif blocktype == "table": 412 parts.append("{{{#!table") 413 first = True 414 for cellsep, columns in get_table_rows(blocktext): 415 if not first: 416 parts.append("==") 417 else: 418 first = False 419 moinsep = translate_cellsep(cellsep) 420 parts.append(moinsep.join([translate_cell(cellsep, column) for column in columns])) 421 parts.append("}}}") 422 423 # Handle anonymous blocks. 424 425 else: 426 parts.append(translate_content(blocktext)) 427 428 return "\n".join(parts) 429 430 def get_table_rows(text): 431 432 "Return a list of (cellsep, columns) tuples for the given table 'text'." 433 434 rows = [] 435 436 for row in text.split("|\n"): 437 if not row: 438 break 439 440 row += "|" 441 cellsep = None 442 columns = [""] 443 last = 0 444 for match in table_content_regexp.finditer(row): 445 start, end = match.span() 446 columns[-1] += row[last:start] 447 448 if match.group("celltype"): 449 if cellsep is None: 450 cellsep = match.group("celltype") 451 columns.append("") 452 else: 453 columns[-1] += match.group() 454 455 last = end 456 457 columns[-1] += row[last:] 458 459 if cellsep: 460 rows.append((cellsep, columns[1:-1])) 461 462 return rows 463 464 sectiontypes = { 465 "code" : "", 466 "noformat" : "", 467 "quote" : "", 468 "info" : "#!wiki important\n", 469 "note" : "#!wiki caution\n", 470 "tip" : "#!wiki tip\n", 471 "warning" : "#!wiki warning\n", 472 } 473 474 preformatted_sectiontypes = (None, "noformat") 475 476 macrotypes = { 477 "anchor" : "<<Anchor(%s)>>", 478 "color" : "<<Color(%s)>>", 479 } 480 481 # General parsing. 
def parse_text(s):

    """
    Parse the content in the string 's', returning the translation. Spacing is
    tracked so that consecutive blocks and sections are separated by blank
    lines where required.
    """

    parts = []

    # Control spacing between blocks and other blocks or sections.

    preceded_by_block = False

    # "regiontype" avoids shadowing the builtin "type".

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            if preceded_by_block:
                parts.append("\n")

            first = True
            for blocktype, blocktext in get_blocks(text):
                if not first:
                    parts.append("\n")
                else:
                    first = False
                parts.append("%s" % translate_block(blocktype, blocktext))

            # Only non-empty regions affect subsequent spacing.

            if not first:
                preceded_by_block = True

        # Handle sections.

        else:
            sectiontype, options = regiontype

            # Direct translations of sections.
            # (Membership tests replace dict.has_key, removed in Python 3.)

            if sectiontype in sectiontypes:
                if preceded_by_block:
                    parts.append("\n")
                mointype = sectiontypes[sectiontype]

                parts.append("{{{%s" % (mointype or ""))
                text = text.strip()

                # Sections containing newlines must have a separate header line.

                if options or "\n" in text:
                    parts.append("\n")

                if options:
                    parts.append("## %s\n" % options)
                parts.append(translate_content(text, sectiontype))
                parts.append("%s}}}\n" % ("\n" if mointype else ""))

                preceded_by_block = True

            # Translations of macros (which can look like sections).

            elif sectiontype in macrotypes:
                parts.append(macrotypes[sectiontype] % translate_content(text, sectiontype))
                preceded_by_block = False

            # Unrecognised sections.

            else:
                parts.append("{{{")

                # Sections containing newlines must have a separate header line.

                if "\n" in text and not text.startswith("\n"):
                    parts.append("\n")

                parts.append(translate_content(text, sectiontype))
                parts.append("}}}")
                preceded_by_block = False

    return "".join(parts)

def parse(s, out):

    "Parse the content in the string 's', writing a translation to 'out'."

    out.write(parse_text(s))

if __name__ == "__main__":
    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)

# vim: tabstop=4 expandtab shiftwidth=4