#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""

import re

# Section extraction.

# A section is "{type}...{type}": an opening tag, the shortest possible body
# (which may span lines), and a closing tag repeating the same type text. The
# lookbehind rejects an opening tag that is itself preceded by "{".
sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text outside any section is reported with a type of None. Such a region is
    emitted before every section and once after the last one, even when its
    text is empty, so the regions always cover 's' in order.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Plain text between the previous section (or the start) and this one.
        regions.append((None, s[last:start]))

        # The section itself, reduced to (type, inner text).
        regions.append(get_section_details(s[start:end]))
        last = end

    # Whatever follows the final section.
    regions.append((None, s[last:]))
    return regions

# Section inspection.

# Applied to a single, already isolated section: the body is matched greedily
# so that it extends to the last closing tag within the given text.
section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    If 's' does not start with a "{type}...{type}" section, (None, s) is
    returned instead.
    """

    match = section_regexp.match(s)
    if match:
        return match.group("sectiontype"), match.group("section")
    else:
        return None, s

# List extraction.

# A list is a run of consecutive lines that all start with the same marker
# character ("*", "#" or "-") as the first line of the run. Every line,
# including the first, may be terminated either by a newline or by the end of
# the text, so that a list on an unterminated final line is still recognised.
list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
list_regexp = re.compile(list_regexp_str, re.MULTILINE)

def get_lists(s):

    """
    Extract lists from the given string 's'.

    Return a list of (type, text) tuples covering 's' in order, where the type
    is "list" for list text and None for the text before, between and after
    lists. The None entries are emitted even when their text is empty.
    """

    last = 0
    blocks = []
    for match in list_regexp.finditer(s):
        start, end = match.span()

        # Non-list text preceding this list.
        blocks.append((None, s[last:start]))
        blocks.append(("list", s[start:end]))
        last = end

    # Non-list text following the final list.
    blocks.append((None, s[last:]))
    return blocks

# Block extraction.
# One or more consecutive lines containing only whitespace.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.

    Pieces consisting only of whitespace are dropped from the result.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

# A line of the form "hN. text" (heading, N being a single digit) or
# "bq. text" (block quote).
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples. Lists found by get_lists are
    passed through with their type unchanged. Other text is split on blank
    lines; within each piece, heading and block quote lines become blocks
    whose type is the marker ("h1", "bq", ...) and whose text is the rest of
    the line, and any remaining non-blank text becomes a block of type None.
    """

    blocks = []

    for blocktype, blocktext in get_lists(s):

        # Collect list blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))
            continue

        # Attempt to find new subblocks in other regions.

        for block in get_basic_blocks(blocktext):
            last = 0
            for match in blocktext_regexp.finditer(block):
                start, end = match.span()

                # Add preceding non-block text.

                preceding = block[last:start]
                if preceding.strip():
                    blocks.append((None, preceding))

                # Add the subblock.

                blocks.append((match.group("type"), match.group("text")))
                last = end

            # Add trailing non-block text.

            trailing = block[last:]
            if trailing.strip():
                blocks.append((None, trailing))

    return blocks

# One or more list marker characters, optional whitespace, then the item text.
listitem_regexp_str = r"^([*#-])+\s*(.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def main():

    """
    Read wiki text from standard input and print the regions and blocks found
    in it, followed by a separator line.
    """

    import sys

    s = sys.stdin.read()

    # Each print call takes a single argument so that the output is the same
    # whether print is the Python 2 statement or the Python 3 function.

    for regiontype, text in get_regions(s):
        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):
                print("Block type: %s" % (blocktype,))
                print(blocktext)
                print("")
        else:
            print("Region type: %s" % (regiontype,))
            print(text)
            print("")

    print("-" * 60)

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4