#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]
dotall = r"(.|\n)"

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is left out of the pattern, leaving that bound open.
    """

    # Test against None explicitly so that a limit of zero is retained instead
    # of being discarded as a false value.

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_pattern(s):

    "Return a compiled regular expression for the given pattern 's'."

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, \Q with a pattern matching a quote character, and \E with a
    pattern matching any character including newlines.

    Return a mapping from the names in 'syntax' to compiled patterns.
    """

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details. The 'start' attribute is an alias of 'match_start'
        # retained for code that refers to the shorter name.

        self.match = None
        self.queued = None
        self.match_start = None
        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to 'read_until' yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', this being a mapping from
        pattern names to compiled patterns. Return the text preceding any match,
        the remaining text if no match was found, or None if no match was found
        and 'remaining' is given as a false value.

        The name of the matching pattern is recorded in the 'matching'
        attribute, which is None if nothing matched.
        """

        # Reuse any queued match together with the details recorded for it.

        if self.queued:
            self.match = self.queued
            self.queued = None

        # Otherwise, search afresh. Note that the previous 'match' object is
        # retained if nothing is found: only 'matching' indicates success.

        else:
            self.match_start = self.start = None
            self.matching = None

            # Find the first matching pattern. The comparison is strict, so
            # where patterns match at the same position, the one encountered
            # first in the mapping is chosen.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.match_start:
                        self.match_start = self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.match_start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding matched text
        is returned. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()

            # Select each requested group. The groups method of a match object
            # cannot be used here since its argument is a default value for
            # non-participating groups, not a selection of groups.

            else:
                return tuple(self.match.group(group) for group in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods of this class refer to 'patterns' and 'handlers' attributes
    that are not defined here. NOTE(review): presumably these are provided by
    subclasses, 'patterns' mapping pattern names to compiled patterns and
    'handlers' mapping pattern names to callables invoked with the parser and a
    region - confirm against the concrete parsers.
    """

    # Names of the patterns used when parsing transparent regions.

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser returned is a new instance sharing the formats of this parser
        and having the root of this parser, or this parser itself, as its root.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass 'remaining' on so that callers can distinguish the absence of a
        # match from a match occurring immediately.

        return self.items.read_until(self.get_patterns(pattern_names),
                                     remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return all the groups in the match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Without a
        parser for the region's type, the region is marked as not transparent
        and the parser registered as "moin" is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # NOTE(review): without a "moin" entry in the formats, no parser is
        # obtained here and the call below fails with an AttributeError.

        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. Only a
        header appearing immediately at the current position is recognised.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler raises
        StopIteration or, in strict mode, until text or an unhandled feature is
        encountered. The region is normalised afterwards.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the end marker
        does not end the current region, it is added to 'node' as text.
        Otherwise, StopIteration is raised to break out of the parsing loop.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4