Moved common parsing functionality into a separate module. Eliminated "transparent" region decisions in the Region class, deciding region transparency in parser classes instead.

     1.1 --- a/moinformat/__init__.py	Thu May 04 21:41:13 2017 +0200
     1.2 +++ b/moinformat/__init__.py	Thu May 04 22:39:00 2017 +0200
     1.3 @@ -19,11 +19,13 @@
     1.4  this program.  If not, see <http://www.gnu.org/licenses/>.
     1.5  """
     1.6  
     1.7 +from moinformat.parsing import ParserBase, TokenStream, new_block
     1.8  from moinformat.serialisers import serialise
     1.9 -from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \
    1.10 +from moinformat.tree import Break, DefItem, DefTerm, FontStyle, Heading, \
    1.11                              Larger, ListItem, Monospace, Region, Rule, Smaller, \
    1.12                              Subscript, Superscript, TableAttr, TableAttrs, \
    1.13                              TableCell, TableRow, Text, Underline
    1.14 +
    1.15  import re
    1.16  
    1.17  # Regular expressions.
    1.18 @@ -120,141 +122,60 @@
    1.19  
    1.20  
    1.21  
    1.22 -# Tokenising functions.
    1.23 -
    1.24 -class TokenStream:
    1.25 -
    1.26 -    "A stream of tokens taken from a string."
    1.27 +class Parser(ParserBase):
    1.28  
    1.29 -    def __init__(self, s):
    1.30 -        self.s = s
    1.31 -        self.pos = 0
    1.32 -        self.match = None
    1.33 -        self.matching = None
    1.34 +    "A wiki region parser."
    1.35  
    1.36 -    def rewind(self, length):
    1.37 -
    1.38 -        "Rewind in the string by 'length'."
    1.39 -
    1.40 -        self.pos -= min(length, self.pos)
    1.41 -
    1.42 -    def read_until(self, pattern_names, remaining=True):
    1.43 +    def __init__(self, formats=None):
    1.44  
    1.45          """
    1.46 -        Find the first match for the given 'pattern_names'. Return the text
    1.47 -        preceding any match, the remaining text if no match was found, or None
    1.48 -        if no match was found and 'remaining' is given as a false value.
    1.49 +        Initialise the parser with any given 'formats' mapping from region type
    1.50 +        names to parser objects. The "wiki" type defaults to this parser itself.
    1.51          """
    1.52  
    1.53 -        first = None
    1.54 -        self.matching = None
    1.55 -
    1.56 -        # Find the first matching pattern.
    1.57 -
    1.58 -        for pattern_name in pattern_names:
    1.59 -            match = patterns[pattern_name].search(self.s, self.pos)
    1.60 -            if match:
    1.61 -                start, end = match.span()
    1.62 -                if self.matching is None or start < first:
    1.63 -                    first = start
    1.64 -                    self.matching = pattern_name
    1.65 -                    self.match = match
    1.66 +        parsers = {"wiki" : self} # default entry; given formats may override it
    1.67 +        if formats:
    1.68 +            parsers.update(formats)
    1.69  
    1.70 -        if self.matching is None:
    1.71 -            if remaining:
    1.72 -                return self.s[self.pos:]
    1.73 -            else:
    1.74 -                return None
    1.75 -        else:
    1.76 -            return self.s[self.pos:first]
    1.77 -
    1.78 -    def read_match(self, group=1):
    1.79 -
    1.80 -        """
    1.81 -        Return the matched text, updating the position in the stream. If 'group'
    1.82 -        is specified, the indicated group in a match will be returned.
    1.83 -        Typically, group 1 should contain all pertinent data, but groups defined
    1.84 -        within group 1 can provide sections of the data.
    1.85 -        """
    1.86 +        ParserBase.__init__(self, parsers)
    1.87  
    1.88 -        if self.match:
    1.89 -            _start, self.pos = self.match.span()
    1.90 -            try:
    1.91 -                return self.match.group(group)
    1.92 -            except IndexError:
    1.93 -                return ""
    1.94 -        else:
    1.95 -            self.pos = len(self.s)
    1.96 -            return None
    1.97 -
    1.98 -
    1.99 -
   1.100 -# Utility functions.
   1.101 -
   1.102 -def new_block(region):
   1.103 +    def get_items(self, s):
   1.104  
   1.105 -    "Start a new block in 'region'."
   1.106 -
   1.107 -    block = Block([])
   1.108 -    region.add(block)
   1.109 -
   1.110 -
   1.111 +        "Return a sequence of token items for 's'."
   1.112  
   1.113 -# Parser abstraction.
   1.114 -
   1.115 -class Parser:
   1.116 -
   1.117 -    "An extensible parser."
   1.118 -
   1.119 -    def __init__(self, formats=None):
   1.120 -        self.formats = formats
   1.121 +        return TokenStream(s, patterns)
   1.122  
   1.123      # Principal parser methods.
   1.124  
   1.125 -    def parse_page(self, s):
   1.126 +    def parse(self, s):
   1.127  
   1.128          """
   1.129          Parse page text 's'. Pages consist of regions delimited by markers.
   1.130          """
   1.131  
   1.132 -        return self.parse_region(TokenStream(s))
   1.133 -
   1.134 -    def parse_region(self, items, level=0, indent=0):
   1.135 +        items = self.get_items(s)
   1.136 +        region = Region([])
   1.137  
   1.138 -        """
   1.139 -        Parse the data provided by 'items' to populate a region with the given
   1.140 -        'level' at the given 'indent'.
   1.141 -        """
   1.142 -
   1.143 -        region = Region([], level, indent)
   1.144 -
   1.145 -        # Parse section headers.
   1.146 +        # Parse page header.
   1.147  
   1.148          self.parse_region_header(items, region)
   1.149  
   1.150 -        # Parse section body.
   1.151 +        # Handle pages directly with this parser.
   1.152 +        # Otherwise, test the type and find an appropriate parser.
   1.153  
   1.154 -        if region.is_transparent():
   1.155 -            self.parse_region_wiki(items, region)
   1.156 +        if not region.type:
   1.157 +            self.parse_region_content(items, region)
   1.158          else:
   1.159 -            self.parse_region_opaque(items, region)
   1.160 +            self.parse_region_type(items, region)
   1.161  
   1.162          return region
   1.163  
   1.164 -    def parse_region_header(self, items, region):
   1.165 -
   1.166 -        """
   1.167 -        Parse the region header from the 'items', setting it for the given 'region'.
   1.168 -        """
   1.169 -
   1.170 -        if items.read_until(["header"], False) == "": # None means no header
   1.171 -            region.type = items.read_match()
   1.172 -
   1.173 -    def parse_region_wiki(self, items, region):
   1.174 +    def parse_region_content(self, items, region):
   1.175  
   1.176          "Parse the data provided by 'items' to populate a wiki 'region'."
   1.177  
   1.178          new_block(region)
   1.179 +
   1.180          self.parse_region_details(items, region, inline_pattern_names + [
   1.181              "break", "heading",
   1.182              "defterm", "defterm_empty",
   1.183 @@ -265,12 +186,6 @@
   1.184              "tablerow",
   1.185              ])
   1.186  
   1.187 -    def parse_region_opaque(self, items, region):
   1.188 -
   1.189 -        "Parse the data provided by 'items' to populate an opaque 'region'."
   1.190 -
   1.191 -        self.parse_region_details(items, region, ["regionend"])
   1.192 -
   1.193      # Parser methods supporting different page features.
   1.194  
   1.195      def parse_attrname(self, items, attrs):
   1.196 @@ -575,52 +490,9 @@
   1.197  
   1.198  
   1.199  
   1.200 -    # Parsing utilities.
   1.201 -
   1.202 -    def parse_region_details(self, items, region, pattern_names):
   1.203 -
   1.204 -        "Parse 'items' within 'region' searching using 'pattern_names'."
   1.205 -
   1.206 -        try:
   1.207 -            while True:
   1.208 -
   1.209 -                # Obtain text before any marker or the end of the input.
   1.210 -
   1.211 -                preceding = items.read_until(pattern_names)
   1.212 -                if preceding:
   1.213 -                    region.append_inline(Text(preceding))
   1.214 -
   1.215 -                # End of input.
   1.216 -
   1.217 -                if not items.matching:
   1.218 -                    break
   1.219 -
   1.220 -                # Obtain any feature.
   1.221 +    # Pattern handlers.
   1.222  
   1.223 -                feature = items.read_match()
   1.224 -                handler = self.handlers.get(items.matching)
   1.225 -
   1.226 -                # Handle each feature or add text to the region.
   1.227 -
   1.228 -                if handler:
   1.229 -                    handler(self, items, region)
   1.230 -                else:
   1.231 -                    region.append_inline(Text(feature))
   1.232 -
   1.233 -        except StopIteration:
   1.234 -            pass
   1.235 -
   1.236 -        region.normalise()
   1.237 -
   1.238 -    def end_region(self, items, region):
   1.239 -
   1.240 -        "End the parsing of 'region', breaking out of the parsing loop."
   1.241 -
   1.242 -        raise StopIteration
   1.243 -
   1.244 -
   1.245 -
   1.246 -    # Pattern handlers.
   1.247 +    end_region = ParserBase.end_region
   1.248  
   1.249      handlers = {
   1.250          None : end_region,
   1.251 @@ -672,6 +544,6 @@
   1.252  # Top-level functions.
   1.253  
   1.254  def parse(s, formats=None):
   1.255 -    return Parser(formats).parse_page(s)
   1.256 +    return Parser(formats).parse(s)
   1.257  
   1.258  # vim: tabstop=4 expandtab shiftwidth=4

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/moinformat/parsing.py	Thu May 04 22:39:00 2017 +0200
     2.3 @@ -0,0 +1,231 @@
     2.4 +#!/usr/bin/env python
     2.5 +
     2.6 +"""
     2.7 +Moin wiki parsing functionality.
     2.8 +
     2.9 +Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>
    2.10 +
    2.11 +This program is free software; you can redistribute it and/or modify it under
    2.12 +the terms of the GNU General Public License as published by the Free Software
    2.13 +Foundation; either version 3 of the License, or (at your option) any later
    2.14 +version.
    2.15 +
    2.16 +This program is distributed in the hope that it will be useful, but WITHOUT
    2.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    2.18 +FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
    2.19 +details.
    2.20 +
    2.21 +You should have received a copy of the GNU General Public License along with
    2.22 +this program.  If not, see <http://www.gnu.org/licenses/>.
    2.23 +"""
    2.24 +
    2.25 +from moinformat.tree import Block, Region, Text
    2.26 +
    2.27 +# Tokenising functions.
    2.28 +
    2.29 +class TokenStream:
    2.30 +
    2.31 +    "A stream of tokens taken from a string."
    2.32 +
    2.33 +    def __init__(self, s, patterns):
    2.34 +        self.s = s
    2.35 +        self.patterns = patterns
    2.36 +        self.pos = 0
    2.37 +        self.match = None
    2.38 +        self.matching = None
    2.39 +
    2.40 +    def rewind(self, length):
    2.41 +
    2.42 +        "Rewind in the string by 'length'."
    2.43 +
    2.44 +        self.pos -= min(length, self.pos)
    2.45 +
    2.46 +    def read_until(self, pattern_names, remaining=True):
    2.47 +
    2.48 +        """
    2.49 +        Find the first match for the given 'pattern_names'. Return the text
    2.50 +        preceding any match, the remaining text if no match was found, or None
    2.51 +        if no match was found and 'remaining' is given as a false value.
    2.52 +        """
    2.53 +
    2.54 +        first = None
    2.55 +        self.matching = None
    2.56 +
    2.57 +        # Find the first matching pattern.
    2.58 +
    2.59 +        for pattern_name in pattern_names:
    2.60 +            match = self.patterns[pattern_name].search(self.s, self.pos)
    2.61 +            if match:
    2.62 +                start, end = match.span()
    2.63 +                if self.matching is None or start < first:
    2.64 +                    first = start
    2.65 +                    self.matching = pattern_name
    2.66 +                    self.match = match
    2.67 +
    2.68 +        if self.matching is None:
    2.69 +            if remaining:
    2.70 +                return self.s[self.pos:]
    2.71 +            else:
    2.72 +                return None
    2.73 +        else:
    2.74 +            return self.s[self.pos:first]
    2.75 +
    2.76 +    def read_match(self, group=1):
    2.77 +
    2.78 +        """
    2.79 +        Return the matched text, updating the position in the stream. If 'group'
    2.80 +        is specified, the indicated group in a match will be returned.
    2.81 +        Typically, group 1 should contain all pertinent data, but groups defined
    2.82 +        within group 1 can provide sections of the data.
    2.83 +        """
    2.84 +
    2.85 +        if self.match:
    2.86 +            _start, self.pos = self.match.span()
    2.87 +            try:
    2.88 +                return self.match.group(group)
    2.89 +            except IndexError:
    2.90 +                return ""
    2.91 +        else:
    2.92 +            self.pos = len(self.s)
    2.93 +            return None
    2.94 +
    2.95 +
    2.96 +
    2.97 +# Utility functions.
    2.98 +
    2.99 +def new_block(region):
   2.100 +
   2.101 +    "Start a new block in 'region'."
   2.102 +
   2.103 +    region.add(Block([]))
   2.104 +
   2.105 +
   2.106 +
   2.107 +# Parser abstractions.
   2.108 +
   2.109 +class ParserBase:
   2.110 +
   2.111 +    "Common parsing methods."
   2.112 +
   2.113 +    def __init__(self, formats=None):
   2.114 +
   2.115 +        """
   2.116 +        Initialise the parser with any given 'formats' mapping from region type
   2.117 +        names to parser objects.
   2.118 +        """
   2.119 +
   2.120 +        self.formats = formats
   2.121 +
   2.122 +    def get_items(self, s):
   2.123 +
   2.124 +        "Return a sequence of token items for 's'."
   2.125 +
   2.126 +        raise NotImplementedError
   2.127 +
   2.128 +    def parse(self, s):
   2.129 +
   2.130 +        """
   2.131 +        Parse page text 's'. Pages consist of regions delimited by markers.
   2.132 +        """
   2.133 +
   2.134 +        return self.parse_region(self.get_items(s))
   2.135 +
   2.136 +    def parse_region(self, items, level=0, indent=0):
   2.137 +
   2.138 +        """
   2.139 +        Parse the data provided by 'items' to populate a region with the given
   2.140 +        'level' at the given 'indent'.
   2.141 +        """
   2.142 +
   2.143 +        region = Region([], level, indent)
   2.144 +
   2.145 +        # Parse section headers, then parse according to region type.
   2.146 +
   2.147 +        self.parse_region_header(items, region)
   2.148 +        self.parse_region_type(items, region)
   2.149 +
   2.150 +        return region
   2.151 +
   2.152 +    def parse_region_type(self, items, region):
   2.153 +
   2.154 +        """
   2.155 +        Given data provided by 'items', use configured parsers to parse the
   2.156 +        'region' based on its type. Unrecognised types are treated as opaque.
   2.157 +        """
   2.158 +
   2.159 +        # Find an appropriate parser given the type (formats may be None).
   2.160 +
   2.161 +        if self.formats and region.type in self.formats:
   2.162 +            self.formats[region.type].parse_region_content(items, region)
   2.163 +
   2.164 +        # Otherwise, treat the section as opaque.
   2.165 +
   2.166 +        else:
   2.167 +            self.parse_region_opaque(items, region)
   2.168 +
   2.169 +    def parse_region_header(self, items, region):
   2.170 +
   2.171 +        """
   2.172 +        Parse the region header from the 'items', setting it for the given 'region'.
   2.173 +        """
   2.174 +
   2.175 +        if items.read_until(["header"], False) == "": # None means no header
   2.176 +            region.type = items.read_match()
   2.177 +
   2.178 +    def parse_region_opaque(self, items, region):
   2.179 +
   2.180 +        "Parse the data provided by 'items' to populate an opaque 'region'."
   2.181 +
   2.182 +        region.transparent = False
   2.183 +        self.parse_region_details(items, region, ["regionend"])
   2.184 +
   2.185 +    def parse_region_content(self, items, region):
   2.186 +
   2.187 +        "Parse the data provided by 'items' to populate the given 'region'."
   2.188 +
   2.189 +        pass
   2.190 +
   2.191 +    # Parsing utilities.
   2.192 +
   2.193 +    def parse_region_details(self, items, region, pattern_names):
   2.194 +
   2.195 +        "Parse 'items' within 'region' searching using 'pattern_names'."
   2.196 +
   2.197 +        try:
   2.198 +            while True:
   2.199 +
   2.200 +                # Obtain text before any marker or the end of the input.
   2.201 +
   2.202 +                preceding = items.read_until(pattern_names)
   2.203 +                if preceding:
   2.204 +                    region.append_inline(Text(preceding))
   2.205 +
   2.206 +                # End of input.
   2.207 +
   2.208 +                if not items.matching:
   2.209 +                    break
   2.210 +
   2.211 +                # Obtain any feature.
   2.212 +
   2.213 +                feature = items.read_match()
   2.214 +                handler = self.handlers.get(items.matching)
   2.215 +
   2.216 +                # Handle each feature or add text to the region.
   2.217 +
   2.218 +                if handler:
   2.219 +                    handler(self, items, region)
   2.220 +                else:
   2.221 +                    region.append_inline(Text(feature))
   2.222 +
   2.223 +        except StopIteration:
   2.224 +            pass
   2.225 +
   2.226 +        region.normalise()
   2.227 +
   2.228 +    def end_region(self, items, region):
   2.229 +
   2.230 +        "End the parsing of 'region', breaking out of the parsing loop."
   2.231 +
   2.232 +        raise StopIteration
   2.233 +
   2.234 +# vim: tabstop=4 expandtab shiftwidth=4

     3.1 --- a/moinformat/tree.py	Thu May 04 21:41:13 2017 +0200
     3.2 +++ b/moinformat/tree.py	Thu May 04 22:39:00 2017 +0200
     3.3 @@ -97,13 +97,12 @@
     3.4  
     3.5      "A region of the page."
     3.6  
     3.7 -    transparent_region_types = ["wiki"]
     3.8 -
     3.9 -    def __init__(self, nodes, level=0, indent=0, type=None):
    3.10 +    def __init__(self, nodes, level=0, indent=0, type=None, transparent=True):
    3.11          Container.__init__(self, nodes)
    3.12          self.level = level
    3.13          self.indent = indent
    3.14          self.type = type
    3.15 +        self.transparent = transparent
    3.16  
    3.17      def add(self, node):
    3.18          last = self.node(-1)
    3.19 @@ -113,7 +112,7 @@
    3.20              self.append(node)
    3.21  
    3.22      def append_inline(self, node):
    3.23 -        if self.is_transparent():
    3.24 +        if self.transparent:
    3.25              self.nodes[-1].append(node)
    3.26          else:
    3.27              self.append(node)
    3.28 @@ -121,9 +120,6 @@
    3.29      def have_end(self, s):
    3.30          return self.level and s.startswith("}") and self.level == len(s)
    3.31  
    3.32 -    def is_transparent(self):
    3.33 -        return not self.level or self.type in self.transparent_region_types
    3.34 -
    3.35      def __repr__(self):
    3.36          return "Region(%r, %r, %r, %r)" % (self.nodes, self.level, self.indent, self.type)
    3.37
2017-05-04	Paul Boddie	raw files shortlog changelog graph	Moved common parsing functionality into a separate module. Eliminated "transparent" region decisions in the Region class, deciding region transparency in parser classes instead.
			moinformat/__init__.py (file) moinformat/parsing.py (file) moinformat/tree.py (file)