#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Character class for whitespace excluding newlines (substituted for \N).

ws_excl_nl = r"[ \f\r\t\v]"

# Character class for either kind of quote (substituted for \Q).

quotes = "['" '"]'                  # ['"]

def excl(s):

    "Return a non-matching (negative lookahead) pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (a lookahead that consumes nothing)."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that end of the range
    open. A limit of zero is emitted explicitly, so that a 'max' of zero is not
    mistaken for an absent (unbounded) limit.
    """

    lower = "" if min is None else min
    upper = "" if max is None else max
    return "%s{%s,%s}" % (s, lower, upper)

def get_pattern(s):

    """
    Return a compiled regular expression for the given pattern 's', compiled
    with the UNICODE and MULTILINE flags.
    """

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, and replace \Q with a pattern for matching quote characters.

    Return a mapping from the names in 'syntax' to compiled patterns.
    """

    # NOTE: The docstring above is a raw string because \N in a plain string
    # NOTE: literal is a malformed escape sequence in Python 3.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', starting at 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start of the current match is recorded in 'start'. The
        # 'match_start' attribute is retained for compatibility.

        self.start = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', this being a mapping from
        pattern names to compiled patterns. Return the text preceding any match,
        the remaining text if no match was found, or None if no match was found
        and 'remaining' is given as a false value.

        If a match has been queued, it is reinstated as the current match
        without any searching being done, and the details of the matching
        pattern are left unchanged.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.start = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one whose match starts
            # earliest in the string from the current position.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups' as a tuple, or all groups if unspecified,
        updating the position in the stream. An empty list is returned if there
        is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # Select the requested groups individually: the argument of the
                # match object's groups method is a default value for groups
                # not participating in the match, not a selection of groups.

                return tuple(self.match.group(g) for g in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or, without any match, to the end of the string.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here read 'patterns' (a mapping from pattern names to compiled
    patterns) and 'handlers' (a mapping from pattern names to handler functions
    called with the parser and a region) from the parser object. Neither is
    defined in this class, so they are presumably provided by subclasses.
    """

    # Names of patterns used when parsing transparent regions.

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares the formats of this parser and has as its root the
        root of this parser or, if this parser has no root, this parser itself.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be passed on for None to be obtainable.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return all the groups in the match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region obtained from the text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is found for the type, the region is marked as not transparent
        and the parser for the "moin" format is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False
            parser = self.get_parser("moin")

        # NOTE: Without a configured "moin" parser, no parser is available here
        # NOTE: and the following call fails.

        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. A header is
        only recognised if it occurs at the current position in the input.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing ends at the end of the input or when a handler raises
        StopIteration, after which the region is normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, also advancing past the match.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the current
        region has an end of the matched level, parsing of the region is ended
        by raising StopIteration. Otherwise, the matched feature is added to
        'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4