#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]' # ['"]

def excl(s):

    "Return a non-matching pattern for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's'."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional pattern."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. Either
    limit may be omitted (given as None) to leave that end of the range open.
    """

    # Test explicitly against None so that a limit of zero is preserved: an
    # "x and y or z" expression would turn a 'max' of 0 into an open bound.

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, and replace \Q with a pattern matching a single or double quote.

    Return a mapping from the names in 'syntax' to compiled pattern objects.
    """

    # NOTE: This docstring is a raw string because \N in a normal string
    # NOTE: literal is a malformed character name escape in Python 3.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's', reading from position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start position of the current match as used by read_until.

        self.start = None

        # Retained for compatibility: reset by read_until but not otherwise
        # used in this module.

        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the last match so that the next read_until call presents it again
        instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', these being a mapping
        from pattern names to compiled patterns. Return the text preceding any
        match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The name of the matching pattern is recorded in the 'matching' attribute
        (None if nothing matched) and the match object in 'match'.
        """

        # Reuse any queued match, leaving the other match details unchanged.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            self.start = None
            self.match_start = None
            self.matching = None

            # Find the first matching pattern: the one starting earliest in the
            # string, with the first pattern tested winning any tie.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group, and None is
        returned if there is no match at all.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group numbers or names, and a tuple of the corresponding group values
        is returned, with IndexError being raised for any unknown group. An
        empty list is returned if there is no match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # The argument of the match object's groups method is a default
                # value for unmatched groups, not a selection of groups, so
                # each requested group is obtained individually.

                return tuple([self.match.group(g) for g in groups])
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            self.pos = self.match.end()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here read the 'patterns' and 'handlers' attributes, which are
    not defined in this class and presumably need to be provided by subclasses:
    'patterns' maps pattern names to compiled patterns, and 'handlers' maps
    pattern names to callables accepting the parser and a region.
    """

    # Names of the patterns used to parse transparent regions.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be forwarded for a false value to have any
        # effect on the result.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self, groups=None):

        """
        Return the given 'groups' of the current match, or all groups if
        'groups' is unspecified.
        """

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the new region.
        """

        region = Region([], level, indent, type)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is configured for the type, the region is marked as not being
        transparent and the "moin" parser is used instead.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # Fall back to this parser if no "moin" parser is configured, such as
        # when no formats were given, instead of failing on a None parser.

        parser = parser or self.get_parser("moin") or self
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The header
        is only recognised if it occurs at the current position in the input.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.match_group("args")

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing stops at the end of the input or when a handler ends the region
        by raising StopIteration (see end_region). The region is normalised
        afterwards.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, this also advancing past the match.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        # Caught by parse_region_details.

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4