#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Fragments substituted for the \N, \Q and \E placeholders by get_patterns.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]
dotall = r"(.|\n)"

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. A limit of zero is emitted explicitly.
    """

    # Test against None explicitly: a zero limit is false but must not be
    # treated as an absent limit, particularly for 'max' where an omitted value
    # means "unbounded".

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from pattern names to the rewritten pattern strings.
    """

    # NOTE: the docstring above is a raw string because it mentions \N, which
    # NOTE: is otherwise interpreted as a malformed named character escape.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.
        # The null group is only set when this pattern's alternative matches,
        # which is how TokenStream.read_until identifies the matching pattern.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'.
    A KeyError is raised if a key is absent from 'd'.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to read_until yields it again
        instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, retaining the pattern name and span recorded
        # when it was first found.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

            # NOTE(review): when nothing matches, self.match retains any
            # NOTE(review): earlier match; callers are expected to test the
            # NOTE(review): matching pattern before requesting match details.

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping from group names, with the pattern name prefix removed, to
        matched values.
        """

        prefix = "%s_" % self.matching

        d = {}
        for key, value in self.match.groupdict().items():
            if key.startswith(prefix):
                d[key[len(prefix):]] = value
        return d

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without any match, None is
        returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. All groups are returned as a mapping; specific
        'groups' are returned as a list of values in the order requested, with
        None for absent groups. Without any match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    This class reads 'patterns' and 'handlers' attributes that it does not
    define itself: 'patterns' is passed to get_expression and 'handlers' is
    consulted for callables taking the parser and a region. Presumably these
    are provided by subclasses.
    """

    # Names of patterns used to parse transparent regions. With no names
    # defined, such regions are not searched.

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        # New parsers share the formats and refer to the root parser, which is
        # this parser if it has no root of its own.

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining this parser's patterns having the
        given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a token stream for 's' starting at 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Propagate 'remaining' so that callers can distinguish the absence of
        # a match (None) from a match found immediately (an empty string).

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups of the matching pattern if
        unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        # Without a parser for the type, mark the region as not transparent
        # and employ the parser registered for "moin" instead.

        if not parser:
            region.transparent = False

        # NOTE(review): this fails if no "moin" parser is configured in the
        # NOTE(review): formats, since get_parser then returns None.

        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # Only a header found immediately, with no preceding text, applies.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing stops at the end of the input, when a handler raises
        StopIteration, or, in 'strict' mode, upon encountering text or a
        feature without a handler. The region is normalised afterwards.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. Any "extra" group
        from the match is recorded on the current region. If the current region
        reports having an end of the matched level, StopIteration is raised to
        leave the parsing loop. Otherwise, the matched feature is added to
        'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4