1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/moinformat.py Wed Apr 26 13:57:10 2017 +0200
1.3 @@ -0,0 +1,279 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Moin wiki format parser.
1.8 +
1.9 +Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +"""
1.24 +
1.25 +from cgi import escape
1.26 +import re
1.27 +
1.28 +# Regular expressions.
1.29 +
# Each entry maps a pattern name to a (regular expression, flags) pair.
# NOTE: re.DOTALL has no effect on "markers" since that expression contains no
# "." wildcard; it is retained so that the compiled flags are unchanged.

syntax = {
    # Page regions:
    "markers"       : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...

    # Region contents:
    "header"        : (r"\A#!(.*?)$", re.MULTILINE),                        # #! char-excl-nl
    "region text"   : (r"(^\s*$)", re.MULTILINE),                           # blank line
    }

# Define patterns for the regular expressions. Every expression is compiled
# once, with re.UNICODE added to its own flags. A comprehension is used so that
# no loop temporaries are left behind in the module namespace.

patterns = {
    name: re.compile(value, re.UNICODE | flags)
    for name, (value, flags) in syntax.items()
    }
1.44 +
1.45 +
1.46 +
1.47 +# Document nodes.
1.48 +
class Container:

    "A document node holding a collection of child nodes."

    def __init__(self, nodes):

        """
        Initialise the container with the given 'nodes'. The collection is
        stored as given, not copied.
        """

        self.nodes = nodes

    def append(self, node):

        "Add 'node' after the existing child nodes."

        children = self.nodes
        children.append(node)
1.58 +
class Region(Container):

    "A region of the page, optionally delimited by markers of a given level."

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given child 'nodes', the marker 'level'
        (the number of brace characters in its markers, zero meaning that the
        region has no markers) and the optional region 'type'.
        """

        Container.__init__(self, nodes)
        self.type = type
        self.level = level

    def have_end(self, s):

        """
        Return a true value if the marker text 's' consists of closing braces
        whose number equals this region's level. A region of level zero never
        has an end marker.
        """

        if not self.level:
            return self.level

        return s.startswith("}") and self.level == len(s)

    def __repr__(self):
        details = (self.nodes, self.level, self.type)
        return "Region(%r, %r, %r)" % details

    def to_string(self, out):

        """
        Serialise the region using the serialiser 'out', wrapping the output
        of the child nodes in region start and end notifications.
        """

        out.start_region(self.level, self.type)

        for child in self.nodes:
            child.to_string(out)

        out.end_region(self.level, self.type)
1.79 +
class Block(Container):

    "A block of text in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given child 'nodes'. The 'final' flag is
        passed on to the serialiser when the block is started and ended.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % (self.nodes,)

    def to_string(self, out):

        """
        Serialise the block using the serialiser 'out', wrapping the output of
        the child nodes in block start and end notifications.
        """

        out.start_block(self.final)

        for child in self.nodes:
            child.to_string(out)

        out.end_block(self.final)
1.96 +
class Text:

    "A leaf node holding a piece of text."

    def __init__(self, s):

        "Initialise the node with the text 's'."

        self.s = s

    def __repr__(self):
        return "Text(%r)" % (self.s,)

    def to_string(self, out):

        "Serialise the text by passing it to the serialiser 'out'."

        out.text(self.s)
1.109 +
1.110 +
1.111 +
1.112 +# Serialisation.
1.113 +
class Serialiser:

    "Common support for serialisers: holds the callable receiving output."

    def __init__(self, out):

        """
        Initialise the serialiser with 'out', a callable invoked by subclasses
        with each piece of serialised output.
        """

        self.out = out
1.120 +
class MoinSerialiser(Serialiser):

    "Serialisation of the page as Moin wiki markup."

    def start_region(self, level, type):

        """
        Emit the opening marker for a region of the given 'level', if any,
        followed by the header for the given 'type', if any.
        """

        if level:
            self.out("{" * level)       # marker
        if type:
            self.out("#!%s" % type)     # header

    def end_region(self, level, type):

        "Emit the closing marker for a region of the given 'level', if any."

        if level:
            self.out("}" * level)       # marker

    def start_block(self, final):

        "Emit nothing: blocks have no opening syntax in this format."

        pass

    def end_block(self, final):

        "Emit a newline after any block that is not 'final'."

        if final:
            return

        self.out("\n")

    def text(self, s):

        "Emit the text 's' unchanged."

        self.out(s)
1.146 +
class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):

        """
        Emit an opening span element for a region, giving it a class for the
        marker 'level', if any, and a class for the region 'type', if any.
        """

        classes = []

        if level:
            classes.append("level-%d" % level) # marker

        # NOTE: Encode type details for CSS.

        if type:

            # The class attribute below is delimited by single quotes, but
            # quote-aware escaping is only relied upon for double quotes, so
            # apostrophes are encoded explicitly to stop a region type taken
            # from page text from terminating the attribute.

            safe_type = escape(type, True).replace("'", "&#39;")
            classes.append("type-%s" % safe_type) # header

        self.out("<span class='%s'>" % " ".join(classes))

    def end_region(self, level, type):

        "Emit the closing span element for a region."

        self.out("</span>")

    def start_block(self, final):

        "Emit the opening paragraph element for a block."

        self.out("<p>")

    def end_block(self, final):

        "Emit the closing paragraph element for a block."

        self.out("</p>")

    def text(self, s):

        "Emit the text 's' with markup characters escaped."

        self.out(escape(s))
1.175 +
1.176 +
1.177 +
1.178 +# Parser functions.
1.179 +
def parse_page(s):

    """
    Parse the page text 's', returning a top-level region holding the document
    nodes. The text is split at region markers, with the resulting sequence of
    text and marker strings being consumed to populate the region.
    """

    page = Region([])
    parts = patterns["markers"].split(s)
    parse_region(iter(parts), page)
    return page
1.190 +
def parse_region(items, region):

    """
    Parse the data provided by 'items' to populate 'region'.

    The 'items' are the strings obtained by splitting page text at region
    markers: text and marker strings in document order. They are consumed from
    a single iterator shared with the recursive calls made for nested regions,
    so that when a nested region ends, parsing of the enclosing region resumes
    with the item following the end marker.

    Parsing of the region stops when its own end marker is found or when the
    items are exhausted.
    """

    # Obtain an iterator. For an existing iterator, this yields the iterator
    # itself, preserving the position shared with any enclosing call.

    items = iter(items)
    first = True

    # Process exposed text and sections.

    for match_text in items:

        # Parse section headers, only permitted in the first item.

        if first:
            first = False

            # Empty leading text occurs when the page or region starts directly
            # with a marker: it holds neither a header nor any text, so skip it
            # and continue with the marker instead of abandoning the region.

            if not match_text:
                continue

            match_text = parse_region_header(match_text, region)

        # Start a section if an appropriate marker is given.

        if match_text.startswith("{"):
            _region = Region([], len(match_text))
            region.append(_region)
            parse_region(items, _region)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif region.have_end(match_text):
            return

        # Otherwise, parse text in the region.

        else:
            parse_region_text(match_text, region)

def parse_region_header(s, region):

    """
    Parse the text 's', extracting any region header and setting it for the
    given 'region'. Return the remaining text.

    A header is only recognised at the very start of 's', consisting of "#!"
    followed by the rest of the first line, this being recorded as the region
    type. Where no header is present, including where 's' is empty, the region
    is left unchanged and 's' is returned as it was given.
    """

    match = patterns["header"].match(s)

    if match is None:
        return s

    region.type = match.group(1)
    return s[match.end():]
1.248 +
def parse_region_text(s, region):

    """
    Parse the text 's' as part of 'region', adding the blocks found to the
    region.

    The text is split at blank lines. Text between blank lines is added to the
    current block; each piece consisting only of whitespace ends the current
    block, marking it as not final, and starts a new block. Every block is
    added to the region exactly once, in document order, with the last block
    remaining final.
    """

    block = Block([])
    region.append(block)

    for match_text in patterns["region text"].split(s):

        # A separator closes the current block and opens the next one. The new
        # block is registered with the region immediately so that any text
        # following the last separator is not lost.

        if not match_text.strip():
            block.final = False
            block = Block([])
            region.append(block)

        # Other text belongs to the current block.

        else:
            block.append(Text(match_text))
1.270 +
# Top-level parsing function.

parse = parse_page

# Top-level serialising functions.

def serialise(doc, serialiser=MoinSerialiser):

    """
    Return a string serialising the document node 'doc'. The 'serialiser' is a
    class instantiated with a callable collecting output, the Moin format
    serialiser being used by default.
    """

    fragments = []
    writer = serialiser(fragments.append)
    doc.to_string(writer)
    return "".join(fragments)
1.281 +
1.282 +# vim: tabstop=4 expandtab shiftwidth=4