#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Character class matching whitespace other than newline; substituted for \N
# in syntax definitions by get_patterns.

ws_excl_nl = r"[ \f\r\t\v]"

# Character class matching either quote character; substituted for \Q in
# syntax definitions by get_patterns.

quotes = "['" '"]' # ['"]

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern for 's' (a negative lookahead)."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a positive lookahead)."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern for 's' using a non-capturing group."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. A limit of zero is emitted explicitly so that a 'max' of zero is not
    mistaken for an absent limit.
    """

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_pattern(s):

    "Return a compiled regular expression for the given pattern 's'."

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    # The docstring is raw because \N in a plain string literal is a malformed
    # named character escape (a syntax error) under Python 3.

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, and replace \Q with a pattern matching either quote character.
    Return a mapping from the names in 'syntax' to compiled patterns.
    """

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised for any
    key not present in 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # Start position of the current match, used by read_until.

        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', a mapping from pattern
        names to compiled patterns. Return the text preceding any match, the
        remaining text if no match was found, or None if no match was found and
        'remaining' is given as a false value.

        If a match has been queued, it is reinstated in preference to searching,
        with the pattern name and start position from the earlier search being
        retained.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            # Discard details of any earlier match so that a failed search
            # cannot leave a stale match to be consumed by update_pos.

            self.match = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern, this being the one starting
            # earliest in the string.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = self.match_start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)

            # Unknown group names and numbers both raise IndexError.

            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return all groups of the current match, updating the position in the
        stream. An empty list is returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:
                # NOTE(review): the argument to groups() is the default value
                # substituted for unmatched groups, not a selection of groups;
                # confirm whether a selection was intended here.

                return self.match.groups(groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here refer to 'patterns' and 'handlers' attributes that are not
    defined in this class; presumably subclasses supply them as mappings from
    pattern names to compiled patterns and to handler callables respectively.
    """

    # Names of the patterns used when parsing transparent regions.

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must reach the token stream: callers passing a
        # false value test for None to detect the absence of a match.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return the groups in the current match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region produced for the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type. Regions of unsupported
        # types are made opaque.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # Fall back on the "moin" parser or, where no such parser has been
        # configured, on this parser instead of failing on a None parser.

        parser = parser or self.get_parser("moin") or self
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        type is taken from the first space-separated word of the header
        arguments.
        """

        # Only a header appearing immediately is of interest.

        if self.read_until(["header"], False) == "": # None means no header
            region.args = self.match_group("args")
            region.type = region.args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until text is
        encountered in strict mode, or until a handler raises StopIteration.
        The region is normalised upon leaving the parsing loop.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the end marker
        belongs to the current region, parsing of the region is ended by raising
        StopIteration. Otherwise, the marker is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4