#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, DefItem, DefTerm, Heading, ListItem, Region, Rule, Text
import re

# Regular expressions.
#
# By convention, group 1 of each pattern holds the pertinent data for the
# feature, since TokenStream.read_match returns group 1 by default.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                     # {{{...
    "regionend"     : r"^\s*([}]{3,})",                         # }}}...
    "header"        : r"#!(.*?)\n",                             # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",

    # Region contents:
    # Inline patterns:
    "rule"          : r"(-----*)",                              # ----...

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                      # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                                     # next line
    }

# Define patterns for the regular expressions.
# MULTILINE is required so that "^" matches at the start of every line and not
# only at the start of the page text.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the text 's', the current position 'pos', the most
    recent match object 'match' and the name of the pattern that produced it,
    'matching'. Both 'match' and 'matching' are None when the most recent
    search found nothing.
    """

    def __init__(self, s):
        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed: a subsequent call to
        'read_match' consumes the text up to the end of the match. Where
        several patterns match at the same position, the one appearing
        earliest in 'pattern_names' is chosen.
        """

        first = None
        self.matching = None

        # Discard any match left over from an earlier search. Without this, a
        # failed search would leave the previous match object in place, and
        # read_match would then return stale text and move the position back
        # to the end of that old match.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'. If
        there is no current match, the position is moved to the end of the
        text and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region obtained from the text.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # A header is only accepted when it starts at the current position, this
    # being indicated by an empty string of preceding text.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, [
        "break", "heading",
        "defterm", "defterm_empty",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend", "rule"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Only the end of the region is searched for within opaque regions.

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is added to 'region', and each feature is
    passed to its handler. Parsing stops at the end of the input or when a
    handler raises StopIteration, after which 'region' is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    # Handlers signal the end of the current node using StopIteration.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region'. Any 'extra' text is passed to
    the new item together with the padding taken from the current match.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_defterm(items, region):

    "Handle a definition term within 'region'."

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.append(term)

    # A separator indicates that a definition item follows on the same line.

    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # Consume the separator, which the pattern for empty terms expects to
    # follow immediately, before handling the definition item.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

parse_defterm_end = end_region
parse_defterm_sep = end_region

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"])
    region.append(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading.

    Where the level of the end marker matches that of 'heading', the padding
    and extra text are recorded and StopIteration is raised to end the heading.
    Otherwise, the marker is retained as part of the heading text.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # Not the end of this heading: keep the complete marker (group 0) as text
    # instead of discarding it, as is done for unmatched region end markers.

    heading.append_text(Text(items.read_match(0)))

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

parse_listitem_end = end_region

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.append(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.
    # Group 2 is the leading whitespace and group 3 the opening brackets.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. Where the marker does not
    end 'region', it is added to the region as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.
# Each handler is called with the token stream and the node being populated.

handlers = {
    None : end_region,
    "break" : parse_break,
    "defterm" : parse_defterm,
    "defterm_empty" : parse_defterm_empty,
    "deftermend" : parse_defterm_end,
    "deftermsep" : parse_defterm_sep,
    "heading" : parse_heading,
    "headingend" : parse_heading_end,
    "listitemend" : parse_listitem_end,
    "listitem" : parse_listitem,
    "listitem_alpha" : parse_listitem,
    "listitem_dot" : parse_listitem,
    "listitem_num" : parse_listitem,
    "listitem_roman" : parse_listitem,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    "rule" : parse_rule,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.add(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4