ConfluenceConverter (file parser.py at 192a2a30aff4)

     1 #!/usr/bin/env python     2      3 """     4 Confluence Wiki syntax parsing.     5      6  1. Wiki pages are first split up into regions.     7  2. Then, within these regions, the text is split into blocks.     8     1. First, lists are identified.     9     2. Additionally, other block-like elements are identified.    10  3. Each block is then parsed.    11 """    12     13 import re    14     15 # Section extraction.    16     17 sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"    18 sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)    19     20 def get_regions(s):    21     22     """    23     Return a list of regions from 's'. Each region is specified using a tuple of    24     the form (type, text).    25     """    26     27     last = 0    28     regions = []    29     for match in sections_regexp.finditer(s):    30         start, end = match.span()    31         regions.append((None, s[last:start]))    32         regions.append(get_section_details(s[start:end]))    33         last = end    34     regions.append((None, s[last:]))    35     return regions    36     37 # Section inspection.    38     39 section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"    40 section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)    41     42 def get_section_details(s):    43     44     "Return the details of a section 's' in the form (type, text)."    45     46     match = section_regexp.match(s)    47     if match:    48         return match.group("sectiontype"), match.group("section")    49     else:    50         return None, s    51     52 # List extraction.    53     54 list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*\n((?P=listtype).*(?:\n|$))*"    55 list_regexp = re.compile(list_regexp_str, re.MULTILINE)    56     57 def get_lists(s):    58     59     """    60     Extract lists from the given string 's'.    61     """    62     63     last = 0    64     blocks = []    65     for match in list_regexp.finditer(s):    66         start, end = match.span()    67         blocks.append((None, s[last:start]))    68         blocks.append(("list", s[start:end]))    69         last = end    70     blocks.append((None, s[last:]))    71     return blocks    72     73 # Block extraction.    74     75 block_regexp_str = r"^(?:\s*\n)+"    76 block_regexp = re.compile(block_regexp_str, re.MULTILINE)    77     78 def get_basic_blocks(s):    79     80     """    81     Return blocks from the given string 's' by splitting the text on blank lines    82     and eliminating those lines.    83     """    84     85     return [b for b in block_regexp.split(s) if b.strip()]    86     87 # Block inspection.    88     89 blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"    90 blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)    91     92 def get_blocks(s):    93     94     """    95     Return blocks from the given string 's', inspecting the basic blocks and    96     generating additional block-level text where appropriate.    97     """    98     99     blocks = []   100    101     for blocktype, blocktext in get_lists(s):   102    103         # Collect list blocks.   104    105         if blocktype is not None:   106             blocks.append((blocktype, blocktext))   107    108         # Attempt to find new subblocks in other regions.   109    110         else:   111             for block in get_basic_blocks(blocktext):   112                 last = 0   113                 for match in blocktext_regexp.finditer(block):   114                     start, end = match.span()   115    116                     # Add preceding non-block text.   117    118                     preceding = block[last:start]   119                     if preceding.strip():   120                         blocks.append((None, preceding))   121    122                     # Add the subblock.   123    124                     blocks.append((match.group("type"), match.group("text")))   125                     last = end   126    127                 # Add trailing non-block text.   128    129                 trailing = block[last:]   130                 if trailing.strip():   131                     blocks.append((None, trailing))   132    133     return blocks   134    135 listitem_regexp_str = r"^([*#-])+\s*(.*)$"   136 listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)   137    138 if __name__ == "__main__":   139     import sys   140    141     s = sys.stdin.read()   142    143     for type, text in get_regions(s):   144         if type is None:   145             for blocktype, blocktext in get_blocks(text):   146                 print "Block type:", blocktype   147                 print blocktext   148                 print   149         else:   150             print "Region type:", type   151             print text   152             print   153    154         print "-" * 60   155    156 # vim: tabstop=4 expandtab shiftwidth=4