#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

import re

from moinformat.tree import Block, Break, DefItem, DefTerm, Emphasis, Heading, \
                            ListItem, Region, Rule, Strong, Text

# Regular expressions.

syntax = {
    # Page regions:
    "regionstart"   : r"((^\s*)([{]{3,}))",     # {{{...
    "regionend"     : r"^\s*([}]{3,})",         # }}}...
    "header"        : r"#!(.*?)\n",             # #! char-excl-nl

    # Region contents:
    # Line-oriented patterns:
    # blank line
    "break"         : r"^(\s*?)\n",
    # ws... expecting text ::
    "defterm"       : r"^(\s+)(?=.+?::)",
    # ws... expecting :: ws...
    "defterm_empty" : r"^(\s+)(?=::\s+)",
    # [ws...] =... ws... expecting headingend
    "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",
    # ws... list-item [ws...]
    "listitem"      : r"^(\s+)(\*)(\s*)",
    # ws... number-item ws...
    "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",
    # ws... alpha-item ws...
    "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",
    # ws... roman-item ws...
    "listitem_roman": r"^(\s+)([iI]\.)(\s+)",
    # ws... dot-item [ws...]
    "listitem_dot"  : r"^(\s+)(\.)(\s*)",

    # Region contents:
    # Inline patterns:
    "em"            : r"''(?!')",               # '' expecting not '
    "rule"          : r"(-----*)",              # ----...
    "strong"        : r"'''",                   # '''

    # Inline contents:
    "emend"         : r"''(?!')|''(?='')",
    "strongend"     : r"'''",

    # Heading contents:
    "headingend"    : r"(\s+)(=+)(\s*\n)",      # ws... =... [ws...] nl

    # List contents:
    "deftermend"    : r"::(\s*?\n)",
    "deftermsep"    : r"::(\s+)",
    "listitemend"   : r"^",                     # next line
    }

# Define patterns for the regular expressions.

patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position ('pos'), the most recent match
    object ('match') and the name of the pattern that produced it
    ('matching').
    """

    def __init__(self, s):

        "Initialise the stream with the string 's', positioned at its start."

        self.s = s
        self.pos = 0
        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        Where several patterns match at the same position, the one listed
        first in 'pattern_names' is chosen. Where the remaining text is
        returned, the stream position is moved to the end of the input so that
        the same text cannot be obtained again by an enclosing parsing
        operation.
        """

        first = None

        # Forget any previous result so that a failed search cannot leave a
        # stale match behind for read_match.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]

        if not remaining:
            return None

        # Consume the rest of the input.

        text = self.s[self.pos:]
        self.pos = len(self.s)
        return text

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define the
        requested group. Without a current match, the position is moved to the
        end of the input and None is returned.
        """

        if not self.match:
            self.pos = len(self.s)
            return None

        self.pos = self.match.end()
        try:
            return self.match.group(group)
        except IndexError:
            return ""



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only recognised when it occurs immediately at the current
    position in the stream.
    """

    # An empty preceding text means the header starts at the current position;
    # None means no header was found at all.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, [
        "break", "heading",
        "defterm", "defterm_empty",
        "em",
        "listitem", "listitem_alpha", "listitem_dot", "listitem_num",
        "listitem_roman",
        "regionstart", "regionend",
        "rule",
        "strong"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text found before each matching pattern is added to 'region' as inline
    text, and the handler registered for the pattern is then invoked. Parsing
    stops at the end of the input or when a handler raises StopIteration,
    after which 'region' is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_inline(Text(preceding))

            # End of input.

            if not items.matching:
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_inline(Text(feature))

    # Handlers signal the end of the current node using StopIteration.

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    region.add(Break())
    new_block(region)

def parse_defitem(items, region, extra=""):

    """
    Handle a definition item within 'region'. The padding is taken from group
    1 of the current match, with 'extra' being passed on to the new item.
    """

    pad = items.read_match(1)
    item = DefItem([], pad, extra)
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_defterm(items, region):

    """
    Handle a definition term within 'region', continuing with a definition
    item if the term was ended by a separator on the same line.
    """

    pad = items.read_match(1)
    term = DefTerm([], pad)
    parse_region_details(items, term, ["deftermend", "deftermsep"])
    region.append(term)
    if items.matching == "deftermsep":
        parse_defitem(items, region)

def parse_defterm_empty(items, region):

    "Handle an empty definition term within 'region'."

    extra = items.read_match(1)

    # The pattern for an empty term guarantees that a separator follows.

    parse_region_details(items, region, ["deftermsep"])
    parse_defitem(items, region, extra)

def parse_em(items, region):

    "Handle emphasis."

    span = Emphasis([])
    parse_region_details(items, span, ["emend", "strong"])
    region.append_inline(span)

def parse_heading(items, region):

    "Handle a heading."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)
    parse_region_details(items, heading, ["headingend"])
    region.append(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle the end of a heading. Only a marker having the same level as the
    'heading' ends it; any other marker is kept as part of the heading text.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # Retain the whole matched text instead of silently discarding it.

    heading.append_inline(Text(items.read_match(0)))

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)
    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.append(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle the end of a new section within 'region'. A marker not accepted by
    the region as its end is added to the region as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_inline(Text(feature))

def parse_strong(items, region):

    "Handle emboldened text."

    span = Strong([])
    parse_region_details(items, span, ["em", "strongend"])
    region.append_inline(span)

# Pattern handlers.

handlers = {
    None                : end_region,
    "break"             : parse_break,
    "defterm"           : parse_defterm,
    "defterm_empty"     : parse_defterm_empty,
    "deftermend"        : end_region,
    "deftermsep"        : end_region,
    "em"                : parse_em,
    "emend"             : end_region,
    "heading"           : parse_heading,
    "headingend"        : parse_heading_end,
    "listitemend"       : end_region,
    "listitem"          : parse_listitem,
    "listitem_alpha"    : parse_listitem,
    "listitem_dot"      : parse_listitem,
    "listitem_num"      : parse_listitem,
    "listitem_roman"    : parse_listitem,
    "regionstart"       : parse_section,
    "regionend"         : parse_section_end,
    "rule"              : parse_rule,
    "strong"            : parse_strong,
    "strongend"         : end_region,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.add(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4