ConfluenceConverter (file parser.py at cdbfc82274f8)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22     23 --------    24     25 The basic procedure is as follows:    26     27  1. Wiki pages are first split up into regions.    28  2. Then, within these regions, the text is split into blocks.    29     1. First, lists are identified.    30     2. Additionally, other block-like elements are identified.    31  3. Each block is then parsed.    32 """    33     34 import re    35     36 # Section extraction.    37     38 sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"    39 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    40     41 def get_regions(s):    42     43     """    44     Return a list of regions from 's'. Each region is specified using a tuple of    45     the form (type, text).    
46     """    47     48     last = 0    49     regions = []    50     for match in sections_regexp.finditer(s):    51         start, end = match.span()    52         regions.append((None, s[last:start]))    53         regions.append(get_section_details(s[start:end]))    54         last = end    55     regions.append((None, s[last:]))    56     return regions    57     58 # Section inspection.    59     60 section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"    61 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    62     63 def get_section_details(s):    64     65     "Return the details of a section 's' in the form (type, text)."    66     67     match = section_regexp.match(s)    68     if match:    69         return match.group("sectiontype"), match.group("section")    70     else:    71         return None, s    72     73 # List extraction.    74     75 list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*"    76 list_regexp = re.compile(list_regexp_str, re.MULTILINE)    77     78 def get_lists(s):    79     80     """    81     Extract lists from the given string 's'.    82     """    83     84     last = 0    85     blocks = []    86     for match in list_regexp.finditer(s):    87         start, end = match.span()    88         blocks.append((None, s[last:start]))    89         blocks.append(("list", s[start:end]))    90         last = end    91     blocks.append((None, s[last:]))    92     return blocks    93     94 # Block extraction.    95     96 block_regexp_str = r"^(?:\s*\n)+"    97 block_regexp = re.compile(block_regexp_str, re.MULTILINE)    98     99 def get_basic_blocks(s):   100    101     """   102     Return blocks from the given string 's' by splitting the text on blank lines   103     and eliminating those lines.   104     """   105    106     return [b for b in block_regexp.split(s) if b.strip()]   107    108 # Block inspection.   
# A line of the form "h<digit>. text" or "bq. text" is a block-level element
# in its own right; the type is the marker before the full stop.

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples in document order. The type is
    "list" for list text, the matched marker (such as "h1" or "bq") for lines
    recognised by the block text pattern, and None for all other text.
    Whitespace-only fragments of other text are omitted.
    """

    blocks = []

    for blocktype, blocktext in get_lists(s):

        # Collect list blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))
            continue

        # Attempt to find new subblocks in other regions.

        for block in get_basic_blocks(blocktext):
            last = 0

            for match in blocktext_regexp.finditer(block):
                start, end = match.span()

                # Add preceding non-block text.

                preceding = block[last:start]
                if preceding.strip():
                    blocks.append((None, preceding))

                # Add the subblock.

                blocks.append((match.group("type"), match.group("text")))
                last = end

            # Add trailing non-block text.

            trailing = block[last:]
            if trailing.strip():
                blocks.append((None, trailing))

    return blocks

# A list item line: one or more marker characters, optional whitespace, then
# the item text.

listitem_regexp_str = r"^([*#-])+\s*(.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()

    # Writing to sys.stdout directly, rather than using print, keeps this
    # script valid under both Python 2 and Python 3 with identical output.

    write = sys.stdout.write

    for regiontype, text in get_regions(s):
        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):
                write("Block type: %s\n" % (blocktype,))
                write("%s\n" % (blocktext,))
                write("\n")
        else:
            write("Region type: %s\n" % (regiontype,))
            write("%s\n" % (text,))
            write("\n")

        write("%s\n" % ("-" * 60))

# vim: tabstop=4 expandtab shiftwidth=4