#!/usr/bin/env python

"""
Confluence Wiki syntax parsing.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA

--------

The basic procedure is as follows:

 1. Wiki pages are first split up into regions.
 2. Then, within these regions, the text is split into blocks.
    1. First, lists are identified.
    2. Additionally, other block-like elements are identified.
 3. Each block is then parsed.
"""

import re

# Section extraction.

# A section is "{type}...{type}" where the opening brace is not itself
# preceded by another "{". The body is matched lazily so that the nearest
# closing marker of the same type terminates the section.
sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text found outside any "{type}...{type}" section is reported with a type
    of None. Such a region is always emitted before each section and once
    more after the last section, even when its text is the empty string, so
    that concatenating the pieces preserves the original ordering.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Plain text between the previous section (or the start) and this one.

        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end

    # Whatever remains after the final section (possibly the whole string).

    regions.append((None, s[last:]))
    return regions

# Section inspection.

section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    The type is the text between the opening braces and the text is the
    content between the opening and closing markers. If 's' does not start
    with a section of the form "{type}...{type}", (None, s) is returned.
    """

    match = section_regexp.match(s)
    if match:
        return match.group("sectiontype"), match.group("section")
    else:
        return None, s

# List extraction.

# A list starts on a line beginning with one of the marker characters and
# continues over following lines beginning with that same character. Every
# line, including the first, may be terminated either by a newline or by the
# end of the text, so that a single-line list ending the input is recognised.
list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
list_regexp = re.compile(list_regexp_str, re.MULTILINE)

def get_lists(s):

    """
    Extract lists from the given string 's'.

    Return a list of (type, text) tuples in document order, where list text
    has the type "list" and all other text has the type None. Non-list text
    is always emitted before each list and after the last one, even when it
    is the empty string.
    """

    last = 0
    blocks = []
    for match in list_regexp.finditer(s):
        start, end = match.span()
        blocks.append((None, s[last:start]))
        blocks.append(("list", s[start:end]))
        last = end
    blocks.append((None, s[last:]))
    return blocks

# Block extraction.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.

    Pieces consisting only of whitespace are discarded; the remaining pieces
    are returned unmodified, including any trailing newline.
    """

    return [b for b in block_regexp.split(s) if b.strip()]

# Block inspection.

# Headings ("h1." to "h9.") and block quotes ("bq.") occupying a line.
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples in document order. The type
    is "list" for list text, the matched prefix (such as "h1" or "bq") for
    heading and quote lines, with the text being the remainder of that line,
    and None for any other non-blank text.
    """

    blocks = []

    for blocktype, blocktext in get_lists(s):

        # Collect list blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                last = 0
                for match in blocktext_regexp.finditer(block):
                    start, end = match.span()

                    # Add preceding non-block text.

                    preceding = block[last:start]
                    if preceding.strip():
                        blocks.append((None, preceding))

                    # Add the subblock.

                    blocks.append((match.group("type"), match.group("text")))
                    last = end

                # Add trailing non-block text.

                trailing = block[last:]
                if trailing.strip():
                    blocks.append((None, trailing))

    return blocks

# NOTE(review): not used within this module; presumably intended for parsing
# the individual items of a "list" block - confirm against callers.
listitem_regexp_str = r"^([*#-])+\s*(.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()

    # Single-argument print calls are used throughout so that the output is
    # identical whether print is a statement (Python 2) or a function.

    for region_type, text in get_regions(s):
        if region_type is None:
            for blocktype, blocktext in get_blocks(text):
                print("Block type: %s" % (blocktype,))
                print(blocktext)
                print("")
        else:
            print("Region type: %s" % (region_type,))
            print(text)
            print("")

        print("-" * 60)

# vim: tabstop=4 expandtab shiftwidth=4