#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Heading, ListItem, Region, Rule, Text
import re

# Regular expressions.
# Each entry maps a pattern name to its source; the same names are used as keys
# in the handler table consulted by parse_region_details.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...
    "regionend"     : r"^\s*([}]{3,})",                             # }}}...
    "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
    "break"         : r"^(\s*?)\n",                                 # blank line
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)", # [ws...] =... ws... expecting headingend
    "listitem"      : r"^((\s+)([*]|\d+[.]))",                      # indent (list-item or number-item)

    # Region contents:
    # Inline patterns:
    "rule"          : r"(-----*)",                                  # ----...

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl

    # List contents:
    "listitemend"   : r"^",                                         # next line
    }

# Define patterns for the regular expressions.
# MULTILINE makes "^" match after every newline, which the line-oriented
# patterns above rely upon.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream over the string 's', starting at position 0."

        self.s = s
        self.pos = 0

        # The most recent match object and the name of the pattern producing it.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not changed by this method: 'read_match' must be
        called to consume the match. Where several patterns match at the same
        position, the one appearing first in 'pattern_names' is chosen.
        """

        first = None

        # Discard any previous result so that a failed search cannot leave a
        # stale match behind for read_match to consume again.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data. Group 0 provides the
        entire matched text.

        An empty string is returned if the pattern does not define 'group'.
        Without a current match, the position is moved to the end of the input
        and None is returned.
        """

        if self.match is not None:

            # Continue reading after the end of the entire match, regardless of
            # the group requested.

            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region produced for the page.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.
    # The region itself decides whether its content is treated as wiki markup
    # or as opaque text in which only a region end is recognised.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only accepted if it appears immediately at the current position.
    """

    # An empty preceding text means the header starts right here. None means no
    # header at all, and any other text means a header appears only later.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, ["break", "heading", "listitem", "regionstart", "regionend", "rule"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is added to 'region', and each feature is
    passed to the handler registered for its pattern name. Parsing stops at the
    end of the input or when a handler raises StopIteration, after which the
    region is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.
            # This also moves the stream position past the matched text.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    # Handlers raise StopIteration to indicate the end of the current node.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    new_block(region)

def parse_heading(items, region):

    """
    Handle a heading, parsing its content up to the heading end and adding it to
    'region', with a new block then being started.
    """

    # Groups: leading whitespace, the "=" characters, following whitespace.

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"])
    region.append(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading. Only a marker with the same level as 'heading'
    ends the heading, causing StopIteration to be raised. Any other marker is
    retained as heading text.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # The stream has already moved past the marker, so the entire matched text
    # must be kept here or it would be lost from the document.

    else:
        heading.append_text(Text(items.read_match(0)))

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    item = ListItem([])
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_listitem_end(items, item):

    "Handle the end of a list."

    raise StopIteration

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.append(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.
    # Groups: the leading whitespace, then the "{" characters.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. StopIteration is raised if
    'region' accepts the marker as its end. Otherwise, the marker is retained
    as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration

    # Keep the entire matched text, not just the "}" characters in group 1,
    # since the match also covers any whitespace preceding the marker.

    else:
        region.append_text(Text(items.read_match(0)))

# Pattern handlers.
# Dispatch table consulted by parse_region_details: each key is a pattern name
# from the syntax table and each value is the function handling that feature.
# Handlers are called with the token stream and the node being populated.

handlers = {
    None: end_region,
    "break": parse_break,
    "heading": parse_heading,
    "headingend": parse_heading_end,
    "listitemend": parse_listitem_end,
    "listitem": parse_listitem,
    "regionstart": parse_section,
    "regionend": parse_section_end,
    "rule": parse_rule,
    }

def new_block(region):

    "Add a new, empty block to the end of 'region'."

    region.append(Block([]))



# Top-level functions.
# The module's parsing entry point is the page parser.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4