#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Whitespace excluding newlines, substituted for \N in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

# Single or double quote characters, substituted for \Q in syntax definitions.

quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching pattern (negative lookahead) for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (positive lookahead, consuming nothing)."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the pattern, leaving that bound open. A limit
    of zero is retained explicitly so that, for example, a 'max' of zero is not
    mistaken for an open upper bound.
    """

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, and replace \Q with a pattern matching a quote character.

    Return a mapping from the names in 'syntax' to compiled pattern objects.
    """

    # NOTE: the docstring above is a raw string because a bare \N in a normal
    # NOTE: string literal is rejected as a malformed escape by Python 3.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    subset = {}
    for key in keys:
        subset[key] = d[key]
    return subset



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start position of the current match. Both names are maintained
        # since 'match_start' was initialised here but 'start' was the name
        # actually used when searching.

        self.match_start = None
        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next call to 'read_until' yields it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', this being a mapping from
        pattern names to compiled patterns. Return the text preceding any match,
        the remaining text if no match was found, or None if no match was found
        and 'remaining' is given as a false value.

        If a match has been queued, it is reinstated and no search is performed.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.match_start = self.start = None
            self.matching = None

            # NOTE(review): self.match is not cleared here, so it retains any
            # NOTE(review): earlier match when nothing matches; callers should
            # NOTE(review): test 'matching' before using the match details.

            # Find the first matching pattern: the one starting earliest in the
            # string, with earlier patterns in the mapping winning any ties.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.match_start = self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding matched text
        is returned. An empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # Select each requested group individually: the argument of the
                # match object's groups method is only a default value for
                # groups not participating in the match.

                return tuple(self.match.group(g) for g in groups)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods. The 'patterns' and 'handlers' attributes used by
    these methods are not defined in this class and are presumably provided by
    subclasses.
    """

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser created shares this parser's formats and has the root parser
        (or this parser, if it has no root) as its root.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be passed on for a missing match to be
        # reported as None instead of as the remaining text.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name, updating the position in the input.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern, or None if nothing matched."

        return self.items.matching

    def match_groups(self):

        "Return all groups in the match, updating the position in the input."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region produced.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions, looking only for the region end.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Otherwise, define a block to hold text and look for all the patterns
        # applicable to regions handled by this parser.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline, this being indicated by
        # leading text that is multiline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is available for the type, the region is made opaque (not
        transparent) and the "moin" parser is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # NOTE(review): if no "moin" parser is configured in the formats,
        # NOTE(review): the fallback is None and the call below will fail.

        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. A header is
        only recognised where it occurs immediately at the current position.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the end of the input is reached or until a
        handler raises StopIteration to signal the end of the region.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, falling back on the first group of the
                # match if there is no "feature" group or if it is empty.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers end the region by raising StopIteration.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the end marker
        does not end the current region, it is added to 'node' as text;
        otherwise, StopIteration is raised to break out of the parsing loop.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4