#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]
dotall = r"(.|\n)"

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching (negative lookahead) pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a (positive lookahead) pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the quantifier, leaving that end unbounded.
    """

    # Test against None explicitly so that a limit of zero is not discarded:
    # omitting a zero maximum would make the repetition unbounded.

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_pattern(s):

    "Return a compiled regular expression for the given pattern 's'."

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, \Q with a pattern matching a quote character, and \E with a
    pattern matching any character including newlines.

    Return a mapping from each name in 'syntax' to a compiled pattern.
    """

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised for
    any key not present in 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # Start offset of the current match: 'start' is the attribute used by
        # the search logic, 'match_start' is kept in step with it.

        self.start = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.
        """

        # Reuse a queued match, retaining the recorded pattern name and start.

        if self.queued:
            self.match = self.queued
            self.queued = None

        else:
            self.start = self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one starting earliest, with
            # ties resolved in favour of the pattern encountered first.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = self.match_start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:
                # Select the requested groups: match.groups(x) would instead
                # treat 'x' as the default value for non-participating groups.

                return tuple(self.match.group(g) for g in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here read 'self.patterns' (a mapping of pattern names to
    compiled patterns) and 'self.handlers' (a mapping of pattern names to
    handler callables), neither of which is defined in this class; presumably
    they are supplied by subclasses.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Forward 'remaining' so that callers can distinguish "no match" (None)
        # from "match with no preceding text" (an empty string).

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return all groups of the current match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type, marking the region as
        # opaque and falling back to the "moin" parser if none is found.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # An empty string means a header appears immediately; None means that
        # there is no header at all.

        if self.read_until(["header"], False) == "":
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, falling back to group 1 of the match.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers raise StopIteration to signal the end of the region.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the end marker
        does not close the current region, add it to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4