#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching (negative lookahead) pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a (positive lookahead) pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern wrapping 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is left open (rendered as an empty bound), whereas an explicit
    limit of 0 is rendered as 0.
    """

    # Test against None explicitly: relying on truthiness would silently drop
    # a limit of 0 and turn, for example, {0,0} into the unbounded {,}.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, and replace \Q with a pattern matching a quote character.

    Return a mapping from the names in 'syntax' to compiled patterns.
    """

    # NOTE: The docstring above is a raw string because a bare backslash-N is
    # NOTE: a malformed escape sequence in ordinary Python 3 string literals.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    subset = {}
    for key in keys:
        subset[key] = d[key]
    return subset



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's', starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start position of the current match as recorded by read_until.

        self.start = None

        # NOTE(review): match_start is reset by read_until but never read in
        # NOTE(review): this module; it is retained in case other code uses it.

        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to read_until yields it again
        instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.
        """

        # Reinstate any queued match, retaining the recorded pattern name and
        # start position from when the match was originally found.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one whose match starts
            # earliest at or after the current position.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start, end = match.span()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the groups of the current match, updating the position in the
        stream. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # NOTE(review): the argument of the match object's groups
                # NOTE(review): method is a default value for groups that did
                # NOTE(review): not participate, not a selection of groups.
                # NOTE(review): Confirm whether a selection was intended.

                return self.match.groups(groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here read 'patterns' (a mapping of pattern names to compiled
    patterns) and 'handlers' (a mapping of pattern names to callables accepting
    the parser and a region) from the instance. Neither is defined in this
    class, so they are presumably supplied by subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass on 'remaining' so that callers can distinguish the absence of a
        # match (None) from a match found immediately (an empty string).

        return self.items.read_until(self.get_patterns(pattern_names),
                                     remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return the groups in the match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Handle potentially inline regions.

        if region.type == "inline":
            self.parse_region_inline(region)
            return

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # A header is only recognised if it occurs immediately at the current
        # position, this being indicated by an empty string of preceding text.

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    def parse_region_inline(self, region):

        "Parse the data to populate an inline 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        # Handlers may raise StopIteration (see end_region) to leave the loop.

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, preferring a group named "feature" and
                # otherwise using the first group.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4