1.1 --- a/moinformat/__init__.py Thu May 04 21:41:13 2017 +0200
1.2 +++ b/moinformat/__init__.py Thu May 04 22:39:00 2017 +0200
1.3 @@ -19,11 +19,13 @@
1.4 this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from moinformat.parsing import ParserBase, TokenStream, new_block
1.8 from moinformat.serialisers import serialise
1.9 -from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
1.10 +from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
1.11 Larger, ListItem, Monospace, Region, Rule, Smaller, \
1.12 Subscript, Superscript, TableAttr, TableAttrs, \
1.13 TableCell, TableRow, Text, Underline
1.14 +
1.15 import re
1.16
1.17 # Regular expressions.
1.18 @@ -120,141 +122,60 @@
1.19
1.20
1.21
1.22 -# Tokenising functions.
1.23 -
1.24 -class TokenStream:
1.25 -
1.26 - "A stream of tokens taken from a string."
1.27 +class Parser(ParserBase):
1.28
1.29 - def __init__(self, s):
1.30 - self.s = s
1.31 - self.pos = 0
1.32 - self.match = None
1.33 - self.matching = None
1.34 + "A wiki region parser."
1.35
1.36 - def rewind(self, length):
1.37 -
1.38 - "Rewind in the string by 'length'."
1.39 -
1.40 - self.pos -= min(length, self.pos)
1.41 -
1.42 - def read_until(self, pattern_names, remaining=True):
1.43 + def __init__(self, formats=None):
1.44
1.45 """
1.46 - Find the first match for the given 'pattern_names'. Return the text
1.47 - preceding any match, the remaining text if no match was found, or None
1.48 - if no match was found and 'remaining' is given as a false value.
1.49 + Initialise the parser with any given 'formats' mapping from region type
1.50 + names to parser objects.
1.51 """
1.52
1.53 - first = None
1.54 - self.matching = None
1.55 -
1.56 - # Find the first matching pattern.
1.57 -
1.58 - for pattern_name in pattern_names:
1.59 - match = patterns[pattern_name].search(self.s, self.pos)
1.60 - if match:
1.61 - start, end = match.span()
1.62 - if self.matching is None or start < first:
1.63 - first = start
1.64 - self.matching = pattern_name
1.65 - self.match = match
1.66 + formats = {"wiki" : self}
1.67 + if formats:
1.68 + formats.update(formats)
1.69
1.70 - if self.matching is None:
1.71 - if remaining:
1.72 - return self.s[self.pos:]
1.73 - else:
1.74 - return None
1.75 - else:
1.76 - return self.s[self.pos:first]
1.77 -
1.78 - def read_match(self, group=1):
1.79 -
1.80 - """
1.81 - Return the matched text, updating the position in the stream. If 'group'
1.82 - is specified, the indicated group in a match will be returned.
1.83 - Typically, group 1 should contain all pertinent data, but groups defined
1.84 - within group 1 can provide sections of the data.
1.85 - """
1.86 + ParserBase.__init__(self, formats)
1.87
1.88 - if self.match:
1.89 - _start, self.pos = self.match.span()
1.90 - try:
1.91 - return self.match.group(group)
1.92 - except IndexError:
1.93 - return ""
1.94 - else:
1.95 - self.pos = len(self.s)
1.96 - return None
1.97 -
1.98 -
1.99 -
1.100 -# Utility functions.
1.101 -
1.102 -def new_block(region):
1.103 + def get_items(self, s):
1.104
1.105 - "Start a new block in 'region'."
1.106 -
1.107 - block = Block([])
1.108 - region.add(block)
1.109 -
1.110 -
1.111 + "Return a sequence of token items for 's'."
1.112
1.113 -# Parser abstraction.
1.114 -
1.115 -class Parser:
1.116 -
1.117 - "An extensible parser."
1.118 -
1.119 - def __init__(self, formats=None):
1.120 - self.formats = formats
1.121 + return TokenStream(s, patterns)
1.122
1.123 # Principal parser methods.
1.124
1.125 - def parse_page(self, s):
1.126 + def parse(self, s):
1.127
1.128 """
1.129 Parse page text 's'. Pages consist of regions delimited by markers.
1.130 """
1.131
1.132 - return self.parse_region(TokenStream(s))
1.133 -
1.134 - def parse_region(self, items, level=0, indent=0):
1.135 + items = self.get_items(s)
1.136 + region = Region([])
1.137
1.138 - """
1.139 - Parse the data provided by 'items' to populate a region with the given
1.140 - 'level' at the given 'indent'.
1.141 - """
1.142 -
1.143 - region = Region([], level, indent)
1.144 -
1.145 - # Parse section headers.
1.146 + # Parse page header.
1.147
1.148 self.parse_region_header(items, region)
1.149
1.150 - # Parse section body.
1.151 + # Handle pages directly with this parser.
1.152 + # Otherwise, test the type and find an appropriate parser.
1.153
1.154 - if region.is_transparent():
1.155 - self.parse_region_wiki(items, region)
1.156 + if not region.type:
1.157 + self.parse_region_content(items, region)
1.158 else:
1.159 - self.parse_region_opaque(items, region)
1.160 + self.parse_region_type(items, region)
1.161
1.162 return region
1.163
1.164 - def parse_region_header(self, items, region):
1.165 -
1.166 - """
1.167 - Parse the region header from the 'items', setting it for the given 'region'.
1.168 - """
1.169 -
1.170 - if items.read_until(["header"], False) == "": # None means no header
1.171 - region.type = items.read_match()
1.172 -
1.173 - def parse_region_wiki(self, items, region):
1.174 + def parse_region_content(self, items, region):
1.175
1.176 "Parse the data provided by 'items' to populate a wiki 'region'."
1.177
1.178 new_block(region)
1.179 +
1.180 self.parse_region_details(items, region, inline_pattern_names + [
1.181 "break", "heading",
1.182 "defterm", "defterm_empty",
1.183 @@ -265,12 +186,6 @@
1.184 "tablerow",
1.185 ])
1.186
1.187 - def parse_region_opaque(self, items, region):
1.188 -
1.189 - "Parse the data provided by 'items' to populate an opaque 'region'."
1.190 -
1.191 - self.parse_region_details(items, region, ["regionend"])
1.192 -
1.193 # Parser methods supporting different page features.
1.194
1.195 def parse_attrname(self, items, attrs):
1.196 @@ -575,52 +490,9 @@
1.197
1.198
1.199
1.200 - # Parsing utilities.
1.201 -
1.202 - def parse_region_details(self, items, region, pattern_names):
1.203 -
1.204 - "Parse 'items' within 'region' searching using 'pattern_names'."
1.205 -
1.206 - try:
1.207 - while True:
1.208 -
1.209 - # Obtain text before any marker or the end of the input.
1.210 -
1.211 - preceding = items.read_until(pattern_names)
1.212 - if preceding:
1.213 - region.append_inline(Text(preceding))
1.214 -
1.215 - # End of input.
1.216 -
1.217 - if not items.matching:
1.218 - break
1.219 -
1.220 - # Obtain any feature.
1.221 + # Pattern handlers.
1.222
1.223 - feature = items.read_match()
1.224 - handler = self.handlers.get(items.matching)
1.225 -
1.226 - # Handle each feature or add text to the region.
1.227 -
1.228 - if handler:
1.229 - handler(self, items, region)
1.230 - else:
1.231 - region.append_inline(Text(feature))
1.232 -
1.233 - except StopIteration:
1.234 - pass
1.235 -
1.236 - region.normalise()
1.237 -
1.238 - def end_region(self, items, region):
1.239 -
1.240 - "End the parsing of 'region', breaking out of the parsing loop."
1.241 -
1.242 - raise StopIteration
1.243 -
1.244 -
1.245 -
1.246 - # Pattern handlers.
1.247 + end_region = ParserBase.end_region
1.248
1.249 handlers = {
1.250 None : end_region,
1.251 @@ -672,6 +544,6 @@
1.252 # Top-level functions.
1.253
def parse(s, formats=None):

    "Parse page text 's' with a new parser configured using any 'formats'."

    parser = Parser(formats)
    return parser.parse(s)
1.257
1.258 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/moinformat/parsing.py Thu May 04 22:39:00 2017 +0200
2.3 @@ -0,0 +1,231 @@
2.4 +#!/usr/bin/env python
2.5 +
2.6 +"""
2.7 +Moin wiki parsing functionality.
2.8 +
2.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
2.10 +
2.11 +This program is free software; you can redistribute it and/or modify it under
2.12 +the terms of the GNU General Public License as published by the Free Software
2.13 +Foundation; either version 3 of the License, or (at your option) any later
2.14 +version.
2.15 +
2.16 +This program is distributed in the hope that it will be useful, but WITHOUT
2.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
2.18 +FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
2.19 +details.
2.20 +
2.21 +You should have received a copy of the GNU General Public License along with
2.22 +this program. If not, see <http://www.gnu.org/licenses/>.
2.23 +"""
2.24 +
2.25 +from moinformat.tree import Block, Region, Text
2.26 +
2.27 +# Tokenising functions.
2.28 +
class TokenStream:

    """
    A stream of tokens taken from a string.

    The stream records the current position in 'pos', the name of the pattern
    found by the most recent search in 'matching', and the corresponding match
    object in 'match'.
    """

    def __init__(self, s, patterns):

        """
        Initialise the stream for the string 's', using 'patterns' to map
        pattern names to objects providing a search method accepting a string
        and a start position (such as compiled regular expressions).
        """

        self.s = s
        self.patterns = patterns
        self.pos = 0
        self.match = None
        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.

        The position in the stream is not changed: use read_match to consume
        the match. Where several patterns match at the same position, the one
        appearing earliest in 'pattern_names' is chosen.
        """

        first = None
        self.matching = None

        # Discard details of any earlier search so that an unsuccessful search
        # cannot leave a stale match behind for read_match to consume.

        self.match = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = self.patterns[pattern_name].search(self.s, self.pos)
            if match:
                start = match.start()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the match has no such group. If the most
        recent search found nothing, the position is moved to the end of the
        string and None is returned.
        """

        if self.match:
            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None
2.94 +
2.95 +
2.96 +
2.97 +# Utility functions.
2.98 +
def new_block(region):

    "Add a new, empty block to 'region'."

    block = Block([])
    region.add(block)
2.104 +
2.105 +
2.106 +
2.107 +# Parser abstractions.
2.108 +
class ParserBase:

    """
    Common parsing methods.

    Subclasses are expected to implement get_items and parse_region_content,
    and to provide a 'handlers' mapping from pattern names to callables
    accepting the parser, the token items and the region being populated. No
    such mapping is defined by this class itself.
    """

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects.
        """

        self.formats = formats

    def get_items(self, s):

        "Return a sequence of token items for 's'."

        raise NotImplementedError

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        """

        return self.parse_region(self.get_items(s))

    def parse_region(self, items, level=0, indent=0):

        """
        Parse the data provided by 'items' to populate a region with the given
        'level' at the given 'indent'. Return the populated region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(items, region)
        self.parse_region_type(items, region)

        return region

    def parse_region_type(self, items, region):

        """
        Given data provided by 'items', use configured parsers to parse the
        'region' based on its type. Regions whose type has no configured parser
        are treated as opaque, as are all regions if no formats were given.
        """

        formats = self.formats

        # Find an appropriate parser given the type. The membership test is
        # used instead of has_key, which is not available on all dictionaries,
        # and an absent mapping is tolerated.

        if formats is not None and region.type in formats:
            formats[region.type].parse_region_content(items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(items, region)

    def parse_region_header(self, items, region):

        """
        Parse the region header from the 'items', setting it for the given
        'region'. A header is only recognised if no text precedes it.
        """

        if items.read_until(["header"], False) == "": # None means no header
            region.type = items.read_match()

    def parse_region_opaque(self, items, region):

        "Parse the data provided by 'items' to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(items, region, ["regionend"])

    def parse_region_content(self, items, region):

        """
        Parse the data provided by 'items' to populate the given 'region'. This
        implementation does nothing.
        """

        pass

    # Parsing utilities.

    def parse_region_details(self, items, region, pattern_names):

        """
        Parse 'items' within 'region' searching using 'pattern_names'. Parsing
        continues until the input is exhausted or a handler raises
        StopIteration, after which the region is normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = items.read_until(pattern_names)
                if preceding:
                    region.append_inline(Text(preceding))

                # End of input.

                if not items.matching:
                    break

                # Obtain any feature.

                feature = items.read_match()
                handler = self.handlers.get(items.matching)

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, items, region)
                else:
                    region.append_inline(Text(feature))

        except StopIteration:
            pass

        region.normalise()

    def end_region(self, items, region):

        "End the parsing of 'region', breaking out of the parsing loop."

        raise StopIteration
2.233 +
2.234 +# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/moinformat/tree.py Thu May 04 21:41:13 2017 +0200
3.2 +++ b/moinformat/tree.py Thu May 04 22:39:00 2017 +0200
3.3 @@ -97,13 +97,12 @@
3.4
3.5 "A region of the page."
3.6
3.7 - transparent_region_types = ["wiki"]
3.8 -
3.9 - def __init__(self, nodes, level=0, indent=0, type=None):
3.10 + def __init__(self, nodes, level=0, indent=0, type=None, transparent=True):
3.11 Container.__init__(self, nodes)
3.12 self.level = level
3.13 self.indent = indent
3.14 self.type = type
3.15 + self.transparent = transparent
3.16
3.17 def add(self, node):
3.18 last = self.node(-1)
3.19 @@ -113,7 +112,7 @@
3.20 self.append(node)
3.21
3.22 def append_inline(self, node):
3.23 - if self.is_transparent():
3.24 + if self.transparent:
3.25 self.nodes[-1].append(node)
3.26 else:
3.27 self.append(node)
3.28 @@ -121,9 +120,6 @@
3.29 def have_end(self, s):
3.30 return self.level and s.startswith("}") and self.level == len(s)
3.31
3.32 - def is_transparent(self):
3.33 - return not self.level or self.type in self.transparent_region_types
3.34 -
3.35 def __repr__(self):
3.36 return "Region(%r, %r, %r, %r)" % (self.nodes, self.level, self.indent, self.type)
3.37