#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018, 2019 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is omitted from the pattern, leaving that bound open.
    A limit of zero is written out explicitly instead of being treated as
    absent.
    """

    # Test against None explicitly: zero is a valid limit but is also false,
    # and must not be confused with an omitted limit.

    lower = "" if min is None else min
    upper = "" if max is None else max

    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a new mapping from each pattern name to its expanded pattern string,
    each having an empty "group_<name>" group appended to identify the pattern
    when it matches.
    """

    # The docstring above is a raw string because it mentions backslash
    # sequences that are not valid string escapes.

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order of 'keys'
    and compiled with the UNICODE and MULTILINE flags. A key missing from 'd'
    causes a KeyError.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details. The start and end attributes are only set once a
        # pattern has matched in read_until.

        self.match = None
        self.queued = None
        self.groups = {}

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until provides it
        again instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not changed here: it is updated when the match
        is consumed using match_group or match_groups.
        """

        # Reuse any queued match, retaining the existing pattern name and span
        # details.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.
            # NOTE(review): when the search fails, self.match still refers to
            # any earlier match; only self.matching is reset.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern, returning
        a mapping from group names, with the pattern name prefix removed, to
        the matched values.
        """

        prefix = "%s_" % self.matching
        length = len(prefix)

        return dict([(key[length:], value)
                     for key, value in self.match.groupdict().items()
                     if key.startswith(prefix)])

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Without any match, None is
        returned.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. With 'groups' given, a list of the values of
        those groups is returned; otherwise, the mapping of all groups is
        returned. Without any match, an empty list is returned.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    Methods of this class refer to 'patterns' and 'handlers' attributes that are
    not defined here and which must therefore be provided elsewhere, presumably
    by subclasses.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares this parser's metadata and parsers, and is given
        this parser's root, or this parser itself if it has no root.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining this parser's patterns having the
        given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass on 'remaining' so that callers requesting None for the absence
        # of a match can distinguish that from a match at the current position.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the values of the given 'groups' in the match as a list, or all
        groups of the match if 'groups' is omitted or None.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region produced.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type, making the region opaque
        # and falling back to the "moin" parser if there is none.
        # NOTE(review): without a "moin" parser, the call below fails with an
        # AttributeError.

        parser = self.get_parser(region.type)

        if not parser:
            region.transparent = False
            parser = self.get_parser("moin")

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # Only a header at the current position yields an empty string.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        try:
            while True:
                preceding = self.read_until(["directive"], False)

                # With an immediately-appearing directive, handle its details.

                if preceding == "":
                    handler = self.handlers.get(self.matching_pattern())
                    if handler:
                        handler(self, region)
                    else:
                        break

                # Otherwise, with no immediate directive (or none at all), stop.

                else:
                    break

        # Handle a premature end of region.

        except StopIteration:
            pass

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region does not end with the matched marker, the marker is added to
        'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4