#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

# cgi.escape was removed in Python 3.8 and the cgi module itself in Python
# 3.13, so prefer html.escape, keeping cgi.escape as a fallback for very old
# interpreters. (html.escape additionally escapes quotes by default, which
# only makes the HTML output safer.)

try:
    from html import escape
except ImportError:
    from cgi import escape

import re

# Regular expressions: name -> (pattern, extra flags).

syntax = {
    # Page regions:
    "regionstart" : (r"^\s*([{]{3,})", re.MULTILINE | re.DOTALL), # {{{...
    "regionend" : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),   # }}}...
    "header" : (r"#!(.*?)\n", 0),                                 # #! char-excl-nl

    # Region contents:
    "break" : (r"^(\s*?)\n", re.MULTILINE),                       # blank line
    }

# Compile the patterns for the regular expressions once, at import time.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        "Initialise the container with the given list of 'nodes'."

        self.nodes = nodes

    def append(self, node):

        "Append 'node' to the container's nodes."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        # Subclasses are expected to provide a real implementation.

        pass

class Region(Container):

    "A region of the page."

    # Types of region whose contents are treated as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', the marker 'level' (the
        number of braces in the enclosing markers, zero for the top-level
        page), and the region 'type' (from any #! header, None if absent).
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_end(self, s):

        "Return whether 's' is the closing marker for this region."

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        """
        Return whether the region contents are wiki markup, this being the
        case for the top-level page and for regions of a transparent type.
        """

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):

        "Return a string showing the region and its nodes, using 'indent'."

        l = ["%sRegion: level=%d type=%s" % (indent, self.level, self.type)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise the region and its nodes using the 'out' serialiser."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. The 'final' flag is unset
        by the parser on blocks that were terminated by a paragraph break.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):

        "Return a string showing the block and its nodes, using 'indent'."

        l = ["%sBlock: final=%s" % (indent, self.final)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise the block and its nodes using the 'out' serialiser."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):

        "Initialise the node with the text 's'."

        self.s = s

    def merge(self, text):

        "Append the contents of 'text' to this node."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):

        "Return a string showing the text, using 'indent'."

        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):

        "Serialise the text using the 'out' serialiser."

        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        "Initialise with 'out', a callable invoked with each output string."

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page back to Moin markup."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)    # marker
        if type and level:
            out("#!%s\n" % type)    # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)    # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # Non-final blocks were followed by a paragraph break in the input,
        # so restore the blank line.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level)    # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True))    # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream with the string 's'."

        self.s = s
        self.pos = 0

        # Details of the most recent match, if any.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None

        # Discard any previous match so that a failed search does not leave a
        # stale match behind for read_match to reuse.

        self.match = None
        self.matching = None

        # Find the pattern matching earliest in the text.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        """
        Return the matched text, updating the position in the stream. Where
        no match was found, return None and move the position to the end of
        the stream.
        """

        if self.match:
            _start, self.pos = self.match.span()
            s = self.match.group(1)
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level Region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0):

    """
    Parse the data provided by 'items' to populate a region at the given
    'level'.
    """

    region = Region([], level)

    # Parse section headers.

    parse_region_header(items, region)

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # An empty preceding string means the header occurs immediately.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Process exposed text and sections.

    block = new_block(region)

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["break", "regionstart", "regionend"])
        if preceding:
            block.append(Text(preceding))

        # Obtain any feature.

        feature = items.read_match()

        # End of input.

        if not items.matching:
            break

        # Start a section if an appropriate marker is given.

        if items.matching == "regionstart":
            block = parse_region_within_wiki_region(items, region)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif items.matching == "regionend" and region.have_end(feature):
            break

        # Start a new block if a paragraph break is found.

        elif items.matching == "break":
            block = parse_block(items, region)

        # Add any inappropriate marker to the text.

        else:
            block.append(Text(feature))

    region.normalise()

def parse_region_within_wiki_region(items, region):

    """
    Parse a subregion of the wiki 'region' from 'items', adding it to the
    region and returning a new block for any subsequent text.
    """

    # Parse the section and start a new block after the section.

    feature = items.read_match()
    region.append(parse_region(items, len(feature)))
    return new_block(region)

def parse_block(items, region):

    "Upon a paragraph break, start and return a new block in 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    return new_block(region)

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Process exposed text and the section end.

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["regionend"])
        if preceding:
            region.append(Text(preceding))

        # Obtain any marker.

        marker = items.read_match()

        # End of input.

        if not marker:
            break

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        if region.have_end(marker):
            break

        # Add any inappropriate marker to the text.

        else:
            region.append(Text(marker))

    region.normalise()

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.append(block)
    return block



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Serialise the document 'doc' using the given 'serialiser' class, returning
    the serialised text.
    """

    l = []
    doc.to_string(serialiser(l.append))
    return "".join(l)

# vim: tabstop=4 expandtab shiftwidth=4