#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                  # ['"]
dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    """
    Return a pattern for the group having the given 'name' and employing the
    pattern string 's'.
    """

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. A limit of zero is emitted explicitly as "0" and is not confused with
    an omitted limit.
    """

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace...

    \E with a pattern for matching all characters including newlines
    \N with a pattern for matching whitespace excluding newlines
    \P with a pattern for matching all characters within a paragraph
    \Q with a pattern for matching quotation marks

    Group names are also qualified with a pattern name prefix.

    Return a mapping from pattern names to the expanded pattern strings, each
    ending with an empty group named "group_<name>" identifying the pattern.
    """

    patterns = {}

    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        value = value.replace(r"\P", dotparagraph)

        # Add the name to group names as a prefix.

        value = value.replace("(?P<", "(?P<%s_" % name)
        value = value.replace("(?P=", "(?P=%s_" % name)

        # Record the updated expression and add an identifying null group.

        patterns[name] = "%s(?P<group_%s>)" % (value, name)

    return patterns

def get_expression(d, keys):

    """
    Return a compiled expression combining patterns in 'd' having the given
    'keys'. The patterns are combined as alternatives in the order given by
    'keys'. A KeyError is raised if a key is absent from 'd'.
    """

    subset = [d[key] for key in keys]
    return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.groups = {}

        # Span of the most recent match, set by read_until.

        self.start = None
        self.end = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until reuses it
        instead of searching the string again.
        """

        self.queued = self.match

    def read_until(self, expression, remaining=True):

        """
        Find the first match for the given 'expression'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        # Reuse any queued match, retaining the recorded pattern name and span.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.matching = None

            # Find the first matching pattern.

            match = expression.search(self.s, self.pos)

            if match:
                for name, value in match.groupdict().items():

                    # Use a group with a non-null value to identify the
                    # matching pattern.

                    if name.startswith("group_") and value is not None:
                        self.matching = name[len("group_"):]
                        self.start, self.end = match.span()
                        self.match = match
                        break

        # Return the remaining text, if appropriate.

        if self.matching is None:
            self.groups = {}
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            self.groups = self.filter_groups()
            return self.s[self.pos:self.start]

    def filter_groups(self):

        """
        Filter groups from the current match for the matching pattern. Return a
        mapping from group names, with the pattern name prefix removed, to the
        matched values.
        """

        prefix = "%s_" % self.matching

        return dict([
            (key[len(prefix):], value)
            for key, value in self.match.groupdict().items()
            if key.startswith(prefix)
            ])

    def match_group(self, group=None):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Otherwise, the entire match is returned. Return None without a match.
        """

        self.update_pos()

        if self.match:
            if group is None:
                return self.s[self.start:self.end]
            else:
                return self.groups.get(group)
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a list of values, or the mapping of all
        groups if unspecified, updating the position in the stream. Return an
        empty list without a match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.groups
            else:
                return [self.groups.get(group) for group in groups]
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without a match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods. The methods here read 'patterns' and 'handlers'
    attributes that are not defined in this class and so must be supplied
    elsewhere (presumably by subclasses).
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_expression(self, pattern_names):

        """
        Return a compiled expression combining the patterns of this parser
        having the given 'pattern_names'.
        """

        return get_expression(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the stream: callers passing a false
        # value depend on None to distinguish the absence of a match from a
        # match occurring immediately at the current position.

        return self.items.read_until(self.get_expression(pattern_names),
                                     remaining)

    def match_group(self, group=None):

        """
        Return the group of the matching pattern with the given 'group'
        identifier. If 'group' is omitted or None, return the entire match.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the values of the given 'groups' in the match as a list, or the
        mapping of all groups in the match if 'groups' is omitted or None.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser exists for the type, the region is marked as not transparent and
        the "moin" parser is used.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object.
        """

        # Only a header appearing immediately is of interest: an empty string
        # indicates a match with no preceding text.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Handlers end the parsing of the region by raising StopIteration, which
        is caught here. The region is normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. Where the current
        region accepts the end marker level, raise StopIteration to end the
        region. Otherwise, add the marker to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4