#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
                            ListItem, Monospace, Region, Rule, Text
import re

# Regular expressions.
#
# Each entry maps a pattern name to the source of a regular expression. The
# names are used both to select the patterns searched for in a given context
# and to look up the function handling a match in the 'handlers' mapping
# defined later in this module.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...
    "regionend"     : r"^\s*([}]{3,})",                             # }}}...
    "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
    #                   blank line
    "break"         : r"^(\s*?)\n",
    #                   ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    #                   ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    #                   [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    #                   ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    #                   ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    #                   ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    #                   ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    #                   ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",

    # Region contents:
    # Inline patterns:
    "fontstyle"     : r"('{2,6})",
    "monospace"     : r"`",
    "rule"          : r"(-----*)",                                  # ----...

    # Inline contents:
    "monospaceend"  : r"`",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                                         # next line
    }

# Define patterns for the regular expressions.
# MULTILINE is needed so that "^" anchors at the start of every line.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position in the string ('pos'), the name
    of the most recently matched pattern ('matching') and the corresponding
    match object ('match').
    """

    def __init__(self, s):
        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where several patterns match at the same position, the pattern listed
        first in 'pattern_names' is chosen.

        Where the remaining text is returned, it is consumed: the position is
        moved to the end of the string so that enclosing parsing activities
        do not obtain the same text again.
        """

        first = None
        self.matching = None

        # Discard any match from a previous search so that it cannot be
        # mistaken for the result of this one.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if not remaining:
                return None

            # Consume the rest of the input.

            preceding = self.s[self.pos:]
            self.pos = len(self.s)
            return preceding
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match does not define 'group'. If
        there is no current match, the position is moved to the end of the
        string and None is returned.
        """

        if self.match:
            _start, self.pos = self.match.span()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.
#
# Handlers signal the end of the construct being parsed by raising
# StopIteration, which is caught in parse_region_details.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.
    # NOTE(review): is_transparent is defined by Region; presumably it
    # depends on the type set from any header - confirm in moinformat.tree.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only recognised immediately at the current position.
    """

    # An empty string means that the header starts at the current position.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, [
        "break", "heading",
        "defterm", "defterm_empty",
        "fontstyle",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "monospace",
        "regionstart", "regionend",
        "rule",
        ])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Only the end of the region is of interest: everything else is text.

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found between features is added to 'region' as inline text. Parsing
    stops at the end of the input or when a handler raises StopIteration, after
    which 'region' is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region', using the current match to
    obtain the padding before the item and recording any 'extra' text.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)

    # The item extends to the start of the next line.

    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_defterm(items, region):

    "Handle a definition term within 'region'."

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.append(term)

    # A separator followed by more text introduces a definition item on the
    # same line as the term.

    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # Advance past the separator, which the pattern guarantees is present.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_fontstyle(items, region):

    """
    Handle emphasis and strong styles. The number of quote characters in the
    marker selects the styles: 2, 4 or 5 involve emphasis; 3, 5 or 6 involve
    strong.
    """

    n = len(items.read_match(1))

    # Handle endings.

    if isinstance(region, FontStyle):
        emphasis = n in (2, 4, 5)
        strong = n in (3, 5, 6)
        active = True

        # Close each style that the marker can end, reducing the number of
        # quotes left over for any new style.
        # NOTE(review): close_emphasis and close_strong presumably return
        # whether the span still has an open style - confirm in
        # moinformat.tree.

        if region.emphasis and emphasis:
            active = region.close_emphasis()
            n -= 2
        if region.strong and strong:
            active = region.close_strong()
            n -= 3

        # With the span finished, give back any unused quotes so that the
        # enclosing region can interpret them.

        if not active:
            if n:
                items.rewind(n)
            raise StopIteration

        elif not n:
            return

    # Handle new styles.

    emphasis = n in (2, 4, 5)
    strong = n in (3, 5, 6)
    double = n in (4, 6)

    # Markers of 4 or 6 quotes yield a span without parsing any content for it.

    span = FontStyle([], emphasis, strong)
    if not double:
        parse_region_details(items, span, ["fontstyle", "monospace"])
    region.append_inline(span)

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"])
    region.append(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading. Only an end marker of the same level as the
    'heading' terminates it.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)

    # The item extends to the start of the next line.

    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_monospace(items, region):

    "Handle monospace."

    span = Monospace([])
    parse_region_details(items, span, ["fontstyle", "monospaceend"])
    region.append_inline(span)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.append(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.
    # The indent is the width of the leading whitespace and the level is the
    # number of opening braces.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. Where the marker does not
    end the region, it is retained as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

# Pattern handlers.
# Each handler is called with the token stream and the current region.

handlers = {
    None                : end_region,
    "break"             : parse_break,
    "defterm"           : parse_defterm,
    "defterm_empty"     : parse_defterm_empty,
    "deftermend"        : end_region,
    "deftermsep"        : end_region,
    "fontstyle"         : parse_fontstyle,
    "heading"           : parse_heading,
    "headingend"        : parse_heading_end,
    "listitemend"       : end_region,
    "listitem"          : parse_listitem,
    "listitem_alpha"    : parse_listitem,
    "listitem_dot"      : parse_listitem,
    "listitem_num"      : parse_listitem,
    "listitem_roman"    : parse_listitem,
    "monospace"         : parse_monospace,
    "monospaceend"      : end_region,
    "regionstart"       : parse_section,
    "regionend"         : parse_section_end,
    "rule"              : parse_rule,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.add(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4