#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'                      # ['"]
dotall = r"(.|\n)"                      # behave similarly to dot with DOTALL option
dotparagraph = r"(.|\n(?!\r?\n))"       # match everything within paragraphs

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching (negative lookahead) pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a (lookahead) pattern expecting 's' without consuming it."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits.

    A limit given as None is left open and omitted from the quantifier, whereas
    a limit of zero is written out explicitly.
    """

    # Test against None explicitly: zero is a meaningful limit but is false,
    # and so must not be confused with an absent limit.

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_pattern(s):

    """
    Return a compiled regular expression for the given pattern 's', employing
    the UNICODE and MULTILINE options.
    """

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping,
    returning a mapping from each name to a compiled regular expression.

    In each pattern string, the following notations are replaced:

    \N      whitespace excluding newlines
    \Q      a single or double quote character
    \E      any character including newline (similar to dot with DOTALL)
    \P      any character within a paragraph
    """

    # The docstring above is a raw string because \N is otherwise interpreted
    # as a malformed named character escape in Unicode string literals.

    # Replacements are applied in this order for every pattern.

    notations = (
        (r"\N", ws_excl_nl),
        (r"\Q", quotes),
        (r"\E", dotall),
        (r"\P", dotparagraph),
        )

    patterns = {}
    for name, value in syntax.items():
        for notation, replacement in notations:
            value = value.replace(notation, replacement)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None           # the most recent match object
        self.queued = None          # a match to be delivered again
        self.match_start = None     # the start of the most recent match

        # Pattern name details.

        self.matching = None        # the name of the matching pattern

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next read_until call delivers it again
        instead of searching the string.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', this being a mapping from
        pattern names to compiled patterns. Return the text preceding any match,
        the remaining text if no match was found, or None if no match was found
        and 'remaining' is given as a false value.

        If a match was queued, it is delivered without consulting 'patterns',
        and the pattern name and match start of the previous search are
        retained.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = None
            self.matching = None

            # NOTE(review): self.match is not cleared here, so a failed search
            # leaves the previous match in place; callers are presumably
            # expected to test 'matching' before using the match - confirm.

            # Find the first matching pattern: the one starting earliest in the
            # string, with earlier patterns in the mapping winning any ties.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.match_start:
                        self.match_start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.match_start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding matched text is
        returned. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # Select the requested groups individually: the argument of the
                # match object's groups method is a default value for groups
                # not participating in the match, not a selection.

                return tuple(self.match.group(g) for g in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here consult 'patterns' (a mapping from pattern names to
    compiled patterns) and 'handlers' (a mapping from pattern names to handler
    callables) attributes that are not defined by this class and are presumably
    provided by subclasses.
    """

    region_pattern_names = None

    def __init__(self, metadata, parsers=None, root=None):

        """
        Initialise the parser with the given 'metadata' and optional 'parsers'.
        An optional 'root' indicates the document-level parser.
        """

        self.metadata = metadata
        self.parsers = parsers
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser obtained shares the metadata and parsers of this parser and
        has the document-level parser (or this parser, if it has no root) as its
        root.
        """

        cls = self.parsers and self.parsers.get(format_type)
        if cls:
            return cls(self.metadata, self.parsers, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass 'remaining' on to the stream so that callers can distinguish the
        # absence of a match (None) from an immediate match (empty string).

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name, updating the position in the input.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        """
        Return all the groups in the match, updating the position in the input.
        """

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type. Without one, the region is
        # made opaque and handed to the "moin" parser.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
        parser = parser or self.get_parser("moin")

        # Only parse directives if the region is transparent.

        if region.transparent:
            self.parse_region_directives(region)

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The first
        space-separated word of the header arguments becomes the region type.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")

            # Tolerate the absence of arguments in the header.

            region.type = (region.args or "").split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, a handler signals the
        end of the region by raising StopIteration, or, in the 'strict' case,
        text is found that no handler consumes. The 'region' is normalised
        afterwards.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising StopIteration (see end_region).

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the end marker
        does not end the current region, it is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4