#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree import Block, Region, Text
import re

# Pattern management.

# Character class matching any whitespace character except a newline. It is
# substituted for the \N placeholder in syntax definitions.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a mapping from the names in 'syntax' to compiled regular expression
    objects, each compiled with the UNICODE and MULTILINE flags.
    """

    # NOTE: The docstring above is a raw string because \N in an ordinary
    # NOTE: string literal is a malformed named-character escape in Python 3.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position ('pos') in the string 's', the
    name of the most recently matching pattern ('matching') and the
    corresponding match object ('match').
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos
        self.match = None
        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', never moving before the start of the
        string.
        """

        self.pos -= min(length, self.pos)

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', these being a mapping
        from pattern names to compiled patterns. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The stream position is not changed by this method: use read_match to
        consume the match.
        """

        first = None
        self.matching = None

        # Find the first matching pattern, this being the one whose match
        # starts earliest in the string. Where matches start at the same
        # position, the pattern encountered first is retained.

        # NOTE: Only 'matching' is reset above. Where nothing matches, 'match'
        # NOTE: continues to refer to any match found by an earlier call.

        for pattern_name, pattern in patterns.items():
            match = pattern.search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match does not define 'group'. If
        there is no match at all, the position is moved to the end of the string
        and None is returned.
        """

        if self.match:

            # Continue reading from the end of the match.

            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    This class refers to two attributes that it does not itself define:
    'patterns', a mapping from pattern names to compiled patterns (see
    get_patterns), and 'handlers', a mapping from pattern names to callables
    invoked with the parser and the current region (see parse_region_details).
    """

    # Names of the patterns used to parse region content. Where this is left as
    # None, parse_region_content only starts a block in the region.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

        # Regions queued for addition, indexed by the node they must follow.

        self.queued = defaultdict(list)

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser returned is a new instance sharing this parser's formats.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be forwarded for the None result described
        # above to be obtainable: parse_region_header depends upon it.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def read_match(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.read_match(group)

    def read_matching(self):

        "Return the name of the matching pattern."

        return self.items.matching

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent'. Return the region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        type is only set if a header occurs immediately at the current position.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.read_match()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted, until a handler ends the
        region (see end_region), or, if 'strict' is set, until text or an
        unhandled feature is encountered. The region is normalised upon
        completion.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.read_matching():
                    break

                # Obtain any feature.

                feature = self.read_match()
                handler = self.handlers.get(self.read_matching())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using end_region.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        """
        Add to 'region' the given 'node', followed by any regions queued for
        addition after 'node'.
        """

        region.add(node)
        self.unqueue_region(region, node)

    def append_node(self, region, node):

        """
        Append to 'region' the given 'node', followed by any regions queued for
        addition after 'node'.
        """

        region.append(node)
        self.unqueue_region(region, node)

    def end_region(self, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration

    def queue_region(self, region, current):

        "Queue 'region' for appending after the 'current' region is ended."

        self.queued[current].append(region)

    def unqueue_region(self, region, ended):

        "Unqueue any queued region, adding it to 'region' after 'ended'."

        # Use get to avoid creating an entry in the defaultdict.

        nodes = self.queued.get(ended)

        # Add the most recently queued nodes first, also adding anything queued
        # after each of them.

        while nodes:
            node = nodes.pop()
            region.add(node)
            self.unqueue_region(region, node)

        # Membership testing replaces has_key, which Python 3 does not provide.

        if ended in self.queued:
            del self.queued[ended]

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

# vim: tabstop=4 expandtab shiftwidth=4