1 #!/usr/bin/env python 2 3 """ 4 Moin wiki format parser. 5 6 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT 14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS 15 FOR A PARTICULAR PURPOSE. See the GNU General Public License for more 16 details. 17 18 You should have received a copy of the GNU General Public License along with 19 this program. If not, see <http://www.gnu.org/licenses/>. 20 """ 21 22 from moinformat.tree import Block, Heading, ListItem, Region, Rule, Text 23 import re 24 25 # Regular expressions. 26 27 syntax = { 28 # Page regions: 29 "regionstart" : r"((^\s*)([{]{3,}))", # {{{... 30 "regionend" : r"^\s*([}]{3,})", # }}}... 31 "header" : r"#!(.*?)\n", # #! char-excl-nl 32 33 # Region contents: 34 # Line-oriented patterns: 35 # blank line 36 "break" : r"^(\s*?)\n", 37 # [ws...] =... ws... expecting headingend 38 "heading" : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)", 39 # indent (list-item or number-item or alpha-item or roman-item or dot-item) 40 "listitem" : r"^(\s+)(\*)(\s*)", 41 "listitem_num" : r"^(\s+)(\d+\.)(\s+)", 42 "listitem_alpha": r"^(\s+)([aA]\.)(\s+)", 43 "listitem_roman": r"^(\s+)([iI]\.)(\s+)", 44 "listitem_dot" : r"^(\s+)(\.)(\s*)", 45 46 # Region contents: 47 # Inline patterns: 48 "rule" : r"(-----*)", # ----... 49 50 # Heading contents: 51 "headingend" : r"(\s+)(=+)(\s*\n)", # ws... =... [ws...] nl 52 53 # List contents: 54 "listitemend" : r"^", # next line 55 } 56 57 # Define patterns for the regular expressions. 
patterns = {}
for name, value in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream over the string 's', starting at position 0."

        self.s = s
        self.pos = 0

        # The most recent successful match and the name of its pattern. Both
        # are None when the most recent search found nothing.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not changed by this method: use 'read_match' to
        consume a match (or, where nothing matched, the rest of the input).
        """

        first = None

        # Forget any earlier result so that a failed search cannot leave a
        # stale match behind for 'read_match' to use.

        self.matching = None
        self.match = None

        # Find the first matching pattern. Where several patterns match at the
        # same position, the earliest in 'pattern_names' is chosen.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is not None:
            return self.s[self.pos:first]

        if remaining:
            return self.s[self.pos:]

        return None

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group. If there is
        no current match, the position is moved to the end of the input and
        None is returned.
        """

        if self.match:
            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level region produced by 'parse_region'.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0, indent=0):

    """
    Parse the data provided by 'items' to populate a region with the given
    'level' at the given 'indent'. Return the populated region.
    """

    region = Region([], level, indent)

    # Parse section headers.

    parse_region_header(items, region)

    # Parse section body, choosing the parsing mode according to the region.

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    A header is only recognised if it occurs immediately at the current
    position in the stream.
    """

    # An empty string means a header was found with nothing preceding it,
    # whereas None means no header was found at all.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    new_block(region)
    parse_region_details(items, region, [
        "break", "heading", "listitem", "listitem_num", "listitem_alpha",
        "listitem_roman", "listitem_dot", "regionstart", "regionend", "rule"])

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    parse_region_details(items, region, ["regionend"])

def parse_region_details(items, region, pattern_names):

    """
    Parse 'items' within 'region' searching using 'pattern_names'.

    Text between features is appended to 'region' and each feature is passed
    to its handler. Parsing stops when a handler raises StopIteration or when
    the input is exhausted, after which the region is normalised.
    """

    try:
        while True:

            # Obtain text before any marker or the end of the input.

            preceding = items.read_until(pattern_names)
            if preceding:
                region.append_text(Text(preceding))

            # End of input: consume the remaining text just appended so that
            # an enclosing parse does not scan and append it a second time.

            if items.matching is None:
                items.read_match()
                break

            # Obtain any feature.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Handle each feature or add text to the region.

            if handler:
                handler(items, region)
            else:
                region.append_text(Text(feature))

    except StopIteration:
        pass

    region.normalise()

def end_region(items, region):

    "End the parsing of 'region'."

    raise StopIteration

def parse_break(items, region):

    "Handle a paragraph break within 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.last()
    if isinstance(block, Block):
        block.final = False
    new_block(region)

def parse_heading(items, region):

    "Handle a heading start marker within 'region'."

    start_extra = items.read_match(1)
    level = len(items.read_match(2))
    start_pad = items.read_match(3)
    heading = Heading([], level, start_extra, start_pad)

    # Populate the heading up to its end marker, then resume with a new block.

    parse_region_details(items, heading, ["headingend"])
    region.append(heading)
    new_block(region)

def parse_heading_end(items, heading):

    """
    Handle a potential end of the given 'heading'. The heading only ends if
    the marker has the same level as the heading. Otherwise, the marker is
    retained as part of the heading text.
    """

    level = len(items.read_match(2))
    if heading.level == level:
        heading.end_pad = items.read_match(1)
        heading.end_extra = items.read_match(3)
        raise StopIteration

    # Not the end of this heading: keep the whole matched text, which would
    # otherwise be discarded since it has already been consumed.

    heading.append_text(Text(items.read_match(0)))

def parse_listitem(items, region):

    "Handle a list item marker within 'region'."

    indent = len(items.read_match(1))
    marker = items.read_match(2)
    space = items.read_match(3)
    item = ListItem([], indent, marker, space)

    # Populate the item up to the start of the next line.

    parse_region_details(items, item, ["listitemend"])
    region.append(item)
    new_block(region)

def parse_listitem_end(items, item):

    "Handle the end of a list item."

    raise StopIteration

def parse_rule(items, region):

    "Handle a horizontal rule within 'region'."

    length = len(items.read_match(1))
    rule = Rule(length)
    region.append(rule)
    new_block(region)

def parse_section(items, region):

    "Handle the start of a new section within 'region'."

    # Parse the section and start a new block after the section. The indent
    # and level are the lengths of the leading whitespace and of the run of
    # braces in the start marker.

    indent = len(items.read_match(2))
    level = len(items.read_match(3))
    region.append(parse_region(items, level, indent))
    new_block(region)

def parse_section_end(items, region):

    """
    Handle a potential end of a section within 'region'. Where 'region' does
    not accept the marker as its end, the marker is added to it as text.
    """

    feature = items.read_match()
    if region.have_end(feature):
        raise StopIteration
    else:
        region.append_text(Text(feature))

# Pattern handlers.

handlers = {
    None : end_region,
    "break" : parse_break,
    "heading" : parse_heading,
    "headingend" : parse_heading_end,
    "listitemend" : parse_listitem_end,
    "listitem" : parse_listitem,
    "listitem_alpha" : parse_listitem,
    "listitem_dot" : parse_listitem,
    "listitem_num" : parse_listitem,
    "listitem_roman" : parse_listitem,
    "regionstart" : parse_section,
    "regionend" : parse_section_end,
    "rule" : parse_rule,
    }

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.append(block)



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4