#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

try:
    # cgi.escape was deprecated in Python 3.2 and removed in Python 3.8 (the
    # whole cgi module was removed in 3.13). html.escape is the replacement;
    # its quote parameter defaults to True, which is harmless in the HTML
    # text and attribute contexts where escape is used below.
    from html import escape
except ImportError:
    from cgi import escape

import re

# Regular expressions.

syntax = {
    # Page regions:
    "marker" : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...

    # Region contents:
    "header" : (r"#!(.*?)\n", 0),                                    # #! char-excl-nl
    "break"  : (r"^\s*?\n", re.MULTILINE),                           # blank line
    }

# Define patterns for the regular expressions.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):
        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of this container."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        # Subclasses are expected to provide a real implementation.

        pass

class Region(Container):

    "A region of the page."

    # Region types whose contents are themselves parsed as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', marker 'level' (the
        number of brace characters opening the region, 0 for the top-level
        page) and region 'type' (the #! header value, if any).
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_start(self, s):

        "Return whether 's' opens a subregion within this region."

        return self.is_transparent() and s.startswith("{")

    def have_end(self, s):

        "Return whether 's' is the end marker corresponding to this region."

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        """
        Return whether this region's contents are wiki markup: either the
        top-level page (level 0) or a region of a transparent type.
        """

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):
        l = ["%sRegion: level=%d type=%s" % (indent, self.level, self.type)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise this region and its nodes using the 'out' serialiser."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. 'final' is set to a false
        value when the block was terminated by a paragraph break rather than
        by the end of its region.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):
        l = ["%sBlock: final=%s" % (indent, self.final)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise this block and its nodes using the 'out' serialiser."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):
        self.s = s

    def merge(self, text):

        "Absorb the given 'text' node into this one."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):
        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):
        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        # 'out' is a callable accepting a string fragment, such as list.append.

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)    # marker
        if type and level:
            out("#!%s\n" % type) # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)    # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # Restore the blank line that terminated a non-final block.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level) # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True)) # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s
        self.pos = 0

        # The most recent match object and the name of the pattern producing
        # it, maintained by read_until and consumed by read_match.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        "Return the matched text, updating the position in the stream."

        if self.match:
            _start, self.pos = self.match.span()

            # Return the first captured group where the matched pattern
            # defines one (marker, header). The break pattern captures
            # nothing, so the whole match is returned instead: reading
            # group(1) unconditionally would raise IndexError there.

            if self.match.re.groups:
                s = self.match.group(1)
            else:
                s = self.match.group()
            self.match = None
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the resulting top-level Region.
    """

    items = TokenStream(s)

    # Define a region for the page and parse it.

    region = Region([])
    parse_region(items, region)
    return region

def parse_region(items, region):

    "Parse the data provided by 'items' to populate 'region'."

    # Parse section headers.

    parse_region_header(items, region)

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Process exposed text and sections.

    block = Block([])
    region.append(block)

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["break", "marker"])
        if preceding:
            block.append(Text(preceding))

        # Obtain any feature.

        feature = items.read_match()

        # End of input.

        if not items.matching:
            break

        # Start a section if an appropriate marker is given.

        if region.have_start(feature):

            # Define the section and parse it.

            _region = Region([], len(feature))
            region.append(_region)
            parse_region(items, _region)

            # Start a new block after the section.

            block = Block([])
            region.append(block)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif region.have_end(feature):
            break

        # Start a new block if a paragraph break is found.

        elif items.matching == "break":
            block.final = False
            block = Block([])
            region.append(block)

        # Add any inappropriate marker to the text.

        else:
            block.append(Text(feature))

    region.normalise()

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Process exposed text and sections.

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["marker"])
        if preceding:
            region.append(Text(preceding))

        # Obtain any marker.

        marker = items.read_match()

        # End of input.

        if not marker:
            break

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        if region.have_end(marker):
            break

        # Add any inappropriate marker to the text.

        else:
            region.append(Text(marker))

    region.normalise()

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # An empty string as the preceding text means the header occurs at the
    # very start of the region; None or non-empty text means no header.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Serialise the given 'doc' using the given 'serialiser' class, returning
    the serialised text.
    """

    l = []
    doc.to_string(serialiser(l.append))
    return "".join(l)

# vim: tabstop=4 expandtab shiftwidth=4