#!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import re

from moinformat.tree import Block, Region, Text

# Pattern management.

# Character class matching any whitespace character except the newline.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new mapping from the names in 'syntax' to compiled regular
    expression objects. Each expression is compiled with the UNICODE and
    MULTILINE flags.
    """

    # NOTE: The docstring above is a raw string because a bare \N sequence in
    # NOTE: an ordinary string literal is a syntax error in Python 3.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A KeyError is raised if any
    of the 'keys' is absent from 'd'.
    """

    return {key: d[key] for key in keys}



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position ('pos') in the string 's', the
    most recent match object ('match') and the name of the pattern that
    produced it ('matching').
    """

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's' starting at 'pos'."

        self.s = s
        self.pos = pos
        self.match = None
        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length'. The position is never moved before
        the start of the string.
        """

        self.pos -= min(length, self.pos)

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', these being a mapping
        from pattern names to compiled patterns. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The stream position is not changed by this method: it is only advanced
        by a subsequent call to read_match.
        """

        first = None
        self.matching = None

        # Discard any match from an earlier search so that read_match cannot
        # return stale data (and move the position backwards) when nothing
        # matches this time.

        self.match = None

        # Find the first matching pattern, this being the one whose match
        # starts earliest in the string.

        for pattern_name, pattern in patterns.items():
            match = pattern.search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        # Without a match, either provide the rest of the text or nothing.

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None

        return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        If the pattern does not define the indicated group, an empty string is
        returned. If there is no current match, the position is moved to the
        end of the string and None is returned.
        """

        if self.match:

            # Continue reading from the end of the matched text.

            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Utility functions.

def new_block(region):

    "Start a new block in 'region'."

    region.add(Block([]))



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods of this class read the 'patterns' and 'handlers' attributes,
    neither of which is defined here: 'patterns' is used as a mapping from
    pattern names to compiled patterns, and 'handlers' as a mapping from pattern
    names to callables invoked with the parser and the current region. These
    are presumably supplied by subclasses.
    """

    # Names of the patterns used when parsing region content. With no names
    # defined, parse_region_content only adds an empty block to the region.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any parser found is instantiated with this parser's formats.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be passed on: callers such as
        # parse_region_header rely on None to detect the absence of a match.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def read_match(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.read_match(group)

    def read_matching(self):

        """
        Return the name of the matching pattern, or None if the last search
        found no match.
        """

        return self.items.matching

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region produced for the text.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent'. Return the populated region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The type is
        only set if a header is found at the current position in the input.
        """

        # An empty string indicates a header with no text preceding it.

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.read_match()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names):

        """
        Search 'region' using the 'pattern_names'. Text found between features
        is added to the region, and each feature is passed to the handler
        registered for its pattern name or, without any handler, is added to
        the region as text. Parsing stops at the end of the input or when a
        handler raises StopIteration, as done by end_region.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    region.append_inline(Text(preceding))

                # End of input.

                if not self.read_matching():
                    break

                # Obtain any feature.

                feature = self.read_match()
                handler = self.handlers.get(self.read_matching())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                else:
                    region.append_inline(Text(feature))

        # Handlers terminate the loop using end_region.

        except StopIteration:
            pass

        region.normalise()

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration, this being caught by parse_region_details.
        """

        raise StopIteration

# vim: tabstop=4 expandtab shiftwidth=4