#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"         # whitespace excluding newline
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a negative lookahead pattern, matching only where 's' does not."

    return "(?!%s)" % s

def expect(s):

    "Return a lookahead pattern expecting 's' without consuming it."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is left open in the resulting quantifier, whereas a limit of
    zero is written out explicitly (so that a 'max' of zero is not mistaken for
    an unbounded repetition).
    """

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_pattern(s):

    "Return a compiled regular expression for the given pattern 's'."

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping,
    returning a mapping from each name to a compiled pattern. In each pattern,
    the following shorthand notation is expanded before compilation:

    \N  whitespace excluding newlines
    \Q  a single or double quote character
    \E  any character including newline
    \P  any character, or a newline not followed by a blank line
    """

    # Substitutions are applied in this order to each pattern string.

    shorthand = (
        (r"\N", ws_excl_nl),
        (r"\Q", quotes),
        (r"\E", dotall),
        (r"\P", dotparagraph),
        )

    patterns = {}
    for name, value in syntax.items():
        for notation, replacement in shorthand:
            value = value.replace(notation, replacement)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # Start position of the current match, set by read_until.

        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to read_until delivers it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.
        """

        if self.queued:

            # Deliver the queued match again. The pattern name and start
            # position recorded when it was first found are left untouched.

            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one whose match starts
            # earliest at or after the current position. Where several start
            # at the same place, the first encountered is retained.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)

            # The pattern does not define the requested group.

            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding group values is
        returned. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()

            # Select the requested groups individually: the argument accepted
            # by the match object's groups method is a default value, not a
            # selection.

            else:
                return tuple(self.match.group(g) for g in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The 'patterns' and 'handlers' attributes used by these methods are not
    defined here and are presumably provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares this parser's metadata and parsers, and it is
        given this parser's root (or this parser itself if it has no root).
        """

        if not self.parsers:
            return None

        cls = self.parsers.get(format_type)
        if not cls:
            return None

        return cls(self.metadata, self.parsers, self.root or self)

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the stream: callers passing a false
        # value rely on None to distinguish "no match" from an immediate match.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name, updating the position in the input.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return all groups in the match, updating the position in the input."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        # Without a specific parser, treat the region as opaque and employ the
        # "moin" parser.
        # NOTE(review): if no "moin" parser is configured, the call below fails
        # on None - confirm that one is always registered.

        if not parser:
            region.transparent = False
            parser = self.get_parser("moin")

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # Only a header appearing immediately (with no preceding text) applies.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With no immediate directive (or none at all), stop.

            if preceding != "":
                break

            # With an immediately-appearing directive, handle its details.

            handler = self.handlers.get(self.matching_pattern())
            if not handler:
                break

            handler(self, region)

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Handlers end the search by raising StopIteration, which is absorbed
        here. The region is normalised upon finishing.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region accepts the end marker, parsing of the region is ended by raising
        StopIteration; otherwise, the marker is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4