#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, Heading, ListItem, Region, Rule, Text
import re

# Regular expressions.
#
# Each entry maps a pattern name to a regular expression source string. The
# names are used both to select patterns when scanning (see the lists passed to
# parse_region_details) and to select the handler for a match (see 'handlers').
# All expressions are compiled with re.MULTILINE, so "^" anchors at the start
# of any line and not merely at the start of the text.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                 # {{{...
    "regionend"     : r"^\s*([}]{3,})",                     # }}}...
    "header"        : r"#!(.*?)\n",                         # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
                      # blank line
    "break"         : r"^(\s*?)\n",
                      # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
                      # indent list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
                      # indent number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
                      # indent alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
                      # indent roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
                      # indent dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",

    # Region contents:
    # Inline patterns:
    "rule"          : r"(-----*)",                          # ----...

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                  # ws... =... [ws...] nl

    # List contents:
    "listitemend"   : r"^",                                 # next line
    }

# Define patterns for the regular expressions.
#
# The compiled form of every entry in 'syntax', keyed by the same names.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the text 's', the current position 'pos' within it, the
    most recent successful match object 'match' and the name of the pattern
    that produced it, 'matching'. Both 'match' and 'matching' are None when the
    most recent search found nothing.
    """

    def __init__(self, s):

        "Initialise the stream at the start of the string 's'."

        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The search starts at the current position and does not move it: the
        position is only updated by read_match. Where several patterns match
        at the same earliest position, the one appearing first in
        'pattern_names' is chosen.
        """

        first = None

        # Forget any previous result. The match object must be reset together
        # with the pattern name: otherwise a later read_match would consume a
        # stale match from an earlier search instead of reporting no match.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()

                # Only a strictly earlier start replaces the current choice,
                # so the order of 'pattern_names' decides ties.

                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        The position is always moved to the end of the entire match, whichever
        group is requested, so this method can be called repeatedly to obtain
        different groups of the same match. An empty string is returned if the
        pattern does not define the requested group. Without a current match,
        the position is moved to the end of the text and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.

    Return the top-level region produced by parse_region for a new token
    stream over 's'.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'.

    Any region header is read first. The body is then parsed as wiki content if
    the region reports itself as transparent, or as opaque content otherwise.
    Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.

    A header is only recognised if it starts at the current position, this
    being indicated by there being no text before the match. In that case, the
    header is consumed and its content (group 1 of the "header" pattern) is
    stored as the region type.
    """

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    """
    Parse the data provided by 'items' to populate a wiki 'region'.

    A new block is started and all line-oriented and inline wiki patterns are
    then searched for. The order of the names matters: it decides which pattern
    is chosen when several match at the same position.
    """

    new_block(region)
    parse_region_details(items, region, [
        "break", "heading", "listitem", "listitem_num", "listitem_alpha",
        "listitem_roman", "listitem_dot", "regionstart", "regionend", "rule"])

def parse_region_opaque(items, region):

    """
    Parse the data provided by 'items' to populate an opaque 'region'.

    Only region end markers are searched for, everything else being text.
    """

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found before each match is appended to 'region', and each match is
    then passed to the handler registered for its pattern name in 'handlers'.
    Parsing ends when no pattern matches the remaining input or when a handler
    raises StopIteration to indicate the end of 'region'. Handlers that parse
    nested structures call this function again, and each call catches the
    StopIteration raised for its own structure. Finally, 'region' is asked to
    normalise itself.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature. This also moves the stream position past the
            # match, so handlers may obtain further groups using read_match.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    """
    End the parsing of 'region'.

    This is registered in 'handlers' for the absence of a match. Note that
    parse_region_details leaves its loop before consulting the handlers when
    nothing matched, so it does not appear to reach this function itself.
    """

    raise StopIteration

def parse_break(items, region):

    """
    Handle a paragraph break within 'region'.

    A break node is added to 'region', followed by a new block.
    """

    region.add(Break())
    new_block(region)

def parse_heading(items, region):

    """
    Handle a heading.

    The groups of the "heading" pattern provide any leading whitespace, the
    marker whose length gives the heading level, and the padding after the
    marker. The heading content is parsed up to the heading end and the
    heading is then appended to 'region', followed by a new block.
    """

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"])
    region.append(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading.

    If the end marker has the same length as the level of 'heading', the
    padding before the marker and the text after it are recorded on 'heading'
    and StopIteration is raised to end the parsing of the heading. Otherwise,
    the marker does not belong to this heading and is retained as text.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # The matched text has already been consumed from the stream, so it must
    # be added to the heading or it would be lost. Group 0 is used because the
    # groups of the "headingend" pattern each cover only part of the match.

    heading.append_text(Text(items.read_match(0)))

def parse_listitem(items, region):

    """
    Handle a list item marker within 'region'.

    The groups of the matching list item pattern provide the indentation, whose
    length gives the item indent, the marker, and the space after the marker.
    The item content is parsed up to the start of the next line and the item is
    then appended to 'region', followed by a new block.
    """

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_listitem_end(items, item):

    """
    Handle the end of a list.

    StopIteration is raised unconditionally to end the parsing of 'item'.
    """

    raise StopIteration

def parse_rule(items, region):

    """
    Handle a horizontal rule within 'region'.

    A rule having the length of the matched text is appended to 'region',
    followed by a new block.
    """

    length = len(items.read_match(1))
    rule = Rule(length)
    region.append(rule)
    new_block(region)

def parse_section(items, region):

    """
    Handle the start of a new section within 'region'.

    The groups of the "regionstart" pattern provide the leading whitespace,
    whose length gives the section indent, and the opening marker, whose
    length gives the section level.
    """

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'.

    If 'region' accepts the matched marker as its end, StopIteration is raised
    to end the parsing of 'region'. Otherwise, the marker is retained as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.
#
# Each handler is called with the token stream and the node being populated,
# the match for the named pattern having already been consumed.

handlers = {
    None : end_region,
    "break" : parse_break,
    "heading" : parse_heading,
    "headingend" : parse_heading_end,
    "listitemend" : parse_listitem_end,
    "listitem" : parse_listitem,
    "listitem_alpha" : parse_listitem,
    "listitem_dot" : parse_listitem,
    "listitem_num" : parse_listitem,
    "listitem_roman" : parse_listitem,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    "rule" : parse_rule,
    }

def new_block(region):

    """
    Start a new block in 'region'.

    An empty block is added to 'region' to receive subsequent text.
    """

    block = Block([])
    region.add(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4