1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/moinformat/__init__.py Sat Apr 29 17:47:03 2017 +0200
1.3 @@ -0,0 +1,277 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Moin wiki format parser.
1.8 +
1.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU General Public License along with
1.22 +this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +"""
1.24 +
1.25 +from moinformat.tree import Region, Block, ListItem, Text
1.26 +import re
1.27 +
# Regular expressions.
#
# Each entry maps a pattern name to a (regular expression, flags) pair. Group 1
# of an expression, where present, holds the text reported for a match.

syntax = {
    # Page regions:
    "regionstart"   : (r"((^\s*)([{]{3,}))", re.MULTILINE | re.DOTALL),    # {{{...
    "regionend"     : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),        # }}}...
    "header"        : (r"#!(.*?)\n", 0),                                   # #! char-excl-nl

    # Region contents:
    "break"         : (r"^(\s*?)\n", re.MULTILINE),                        # blank line
    "listitem"      : (r"^((\s+)([*]|\d+[.]))", re.MULTILINE),             # indent (list-item or number-item)

    # List contents:
    "listitemend"   : (r"^", re.MULTILINE),                                # next line
    }

# Compile every expression once, always adding Unicode matching to the flags
# given for the individual pattern.

patterns = {
    name: re.compile(expression, re.UNICODE | flags)
    for name, (expression, flags) in syntax.items()
    }
1.49 +
1.50 +
1.51 +
1.52 +# Tokenising functions.
1.53 +
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream to read from the start of the string 's'."

        self.s = s
        self.pos = 0            # offset at which the next search begins
        self.match = None       # match object from the latest successful search
        self.matching = None    # name of the pattern producing 'match'

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The stream position is not advanced by this method: 'read_match' must
        be used to consume a match. Where several patterns match at the same
        position, the one appearing earliest in 'pattern_names' is chosen.
        """

        first = None

        # Forget any previous result so that a failed search cannot leave a
        # stale match behind for 'read_match' to consume.

        self.matching = None
        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match is None:
                continue

            # Only a strictly earlier match replaces the current candidate.

            start = match.start()
            if self.matching is None or start < first:
                first = start
                self.matching = pattern_name
                self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern does not define 'group'.
        Without a current match, the position is moved to the end of the
        string and None is returned.
        """

        if self.match:
            self.pos = self.match.end()
            try:
                return self.match.group(group)

            # Patterns without the requested group yield no text.

            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None
1.112 +
1.113 +
1.114 +
1.115 +# Parser functions.
1.116 +
def parse_page(s):

    """
    Parse the page text 's', returning the region produced by parsing a token
    stream over the whole text.
    """

    stream = TokenStream(s)
    return parse_region(stream)
1.124 +
def parse_region(items, level=0, indent=0):

    """
    Create a region having the given 'level' and 'indent', populate it using
    the data provided by 'items', and return it.
    """

    region = Region([], level, indent)

    # Any header is read first since it sets the region type.

    parse_region_header(items, region)

    # Regions reporting themselves as transparent are parsed as wiki text;
    # all others are parsed as opaque content.

    parse_body = region.is_transparent() and parse_region_wiki or parse_region_opaque
    parse_body(items, region)

    return region
1.146 +
def parse_region_header(items, region):

    """
    Read any header found at the current position in 'items', recording its
    details as the type of the given 'region'.
    """

    # An empty string indicates a header starting exactly at the current
    # position. None indicates no header at all, and any other text indicates
    # a header appearing only later in the input.

    preceding = items.read_until(["header"], False)

    if preceding == "":
        region.type = items.read_match()
1.155 +
def parse_region_wiki(items, region):

    "Populate the wiki 'region' using the data provided by 'items'."

    # Wiki regions recognise paragraph breaks, list items and nested regions
    # as well as their own end marker.

    wiki_patterns = ["break", "listitem", "regionstart", "regionend"]

    # Start an initial block before reading any content.

    new_block(region)
    parse_region_details(items, region, wiki_patterns)
1.162 +
def parse_region_opaque(items, region):

    "Populate the opaque 'region' using the data provided by 'items'."

    # Only end-of-region markers are searched for within opaque regions.

    opaque_patterns = ["regionend"]
    parse_region_details(items, region, opaque_patterns)
1.168 +
def parse_region_details(items, region, pattern_names):

    """
    Populate 'region' from 'items', repeatedly searching for the features
    identified by 'pattern_names' until the input is exhausted or a handler
    signals the end of the region by raising StopIteration. The region is
    normalised before returning.
    """

    try:
        while True:

            # Plain text found before the next feature, or the rest of the
            # input, is added to the region.

            text = items.read_until(pattern_names)
            if text:
                region.append_text(Text(text))

            # Without a matching feature, the input has been exhausted.

            if not items.matching:
                break

            # Consume the feature and find out how to deal with it.

            feature = items.read_match()
            handler = handlers.get(items.matching)

            # Features without handlers are retained as text.

            if not handler:
                region.append_text(Text(feature))
                continue

            handler(items, region)

    # Handlers raise this exception to finish the region.

    except StopIteration:
        pass

    region.normalise()
1.203 +
def end_region(items, region):

    """
    Finish parsing 'region' by raising StopIteration. Neither 'items' nor
    'region' is inspected.
    """

    raise StopIteration
1.209 +
def parse_break(items, region):

    """
    Deal with a paragraph break in 'region' by starting a new block. 'items'
    is not used.
    """

    # The most recent node is no longer the final one in its sequence.

    region.nodes[-1].final = False

    new_block(region)
1.219 +
def parse_listitem_end(items, region):

    """
    Finish the current list item by raising StopIteration. Neither 'items'
    nor 'region' is inspected.
    """

    raise StopIteration
1.225 +
def parse_listitem(items, region):

    """
    Deal with a list item marker in 'region', reading the item content from
    'items' and adding the resulting list item to the region.
    """

    # Populate the item up to its end marker before adding it to the region.

    list_item = ListItem([])
    parse_region_details(items, list_item, ["listitemend"])
    region.append(list_item)

    # Content following the item belongs in a new block.

    new_block(region)
1.234 +
def parse_section(items, region):

    """
    Deal with a region start marker in 'region', parsing the nested region
    from 'items' and adding it to the given region.
    """

    # Groups 2 and 3 of the start marker hold the text before the braces and
    # the braces themselves: their lengths give the indent and the level.

    leading = items.read_match(2)
    braces = items.read_match(3)
    nested = parse_region(items, len(braces), len(leading))

    # Add the nested region, starting a new block for any following content.

    region.append(nested)
    new_block(region)
1.245 +
def parse_section_end(items, region):

    """
    Deal with a region end marker read from 'items'. StopIteration is raised
    if 'region' accepts the marker as its end; otherwise, the marker is added
    to the region as text.
    """

    marker = items.read_match()

    # Markers not ending this region are retained as ordinary text.

    if not region.have_end(marker):
        region.append_text(Text(marker))
        return

    raise StopIteration
1.255 +
# Pattern handlers.
#
# Each handler is selected using the name of the pattern most recently matched
# in the token stream and is called with the stream and the current region.

handlers = {
    # Region boundaries:
    "regionstart"   : parse_section,
    "regionend"     : parse_section_end,

    # Region contents:
    "break"         : parse_break,
    "listitem"      : parse_listitem,
    "listitemend"   : parse_listitem_end,

    # No matching pattern:
    None            : end_region,
    }
1.266 +
def new_block(region):

    "Add a new, empty block to the end of 'region'."

    region.append(Block([]))
1.273 +
1.274 +
1.275 +
# Top-level functions: 'parse' is provided as another name for 'parse_page'.

parse = parse_page
1.279 +
1.280 +# vim: tabstop=4 expandtab shiftwidth=4