Switched to using a token stream in order to support different parsing modes as different kinds of regions are encountered.

     1.1 --- a/moinformat.py	Wed Apr 26 17:32:08 2017 +0200
     1.2 +++ b/moinformat.py	Thu Apr 27 18:13:53 2017 +0200
     1.3 @@ -26,11 +26,11 @@
     1.4  
     1.5  syntax = {
     1.6      # Page regions:
     1.7 -    "markers"       : (r"^\s*([{]{3,}|[}]{3,})",    re.MULTILINE | re.DOTALL),  # {{{... or }}}...
     1.8 +    "marker"        : (r"^\s*([{]{3,}|[}]{3,})",    re.MULTILINE | re.DOTALL),  # {{{... or }}}...
     1.9  
    1.10      # Region contents:
    1.11 -    "header"        : (r"\A#!(.*?)\n",              0),                         # #! char-excl-nl
    1.12 -    "region text"   : (r"(^\s*$)",                  re.MULTILINE),              # blank line
    1.13 +    "header"        : (r"#!(.*?)\n",                0),                         # #! char-excl-nl
    1.14 +    "break"         : (r"^\s*?\n",                  re.MULTILINE),              # blank line
    1.15      }
    1.16  
    1.17  # Define patterns for the regular expressions.
    1.18 @@ -95,23 +95,6 @@
    1.19          self.level = level
    1.20          self.type = type
    1.21  
    1.22 -    def expand(self):
    1.23 -
    1.24 -        """
    1.25 -        Expand text nodes by parsing them as region text, if the region is
    1.26 -        understandable to the standard parser.
    1.27 -        """
    1.28 -
    1.29 -        if self.is_transparent():
    1.30 -            nodes = self.nodes
    1.31 -            self.nodes = []
    1.32 -
    1.33 -            for node in nodes:
    1.34 -                if isinstance(node, Text):
    1.35 -                    parse_region_text(node.s, self)
    1.36 -                else:
    1.37 -                    self.append(node)
    1.38 -
    1.39      def have_start(self, s):
    1.40          return self.is_transparent() and s.startswith("{")
    1.41  
    1.42 @@ -231,6 +214,63 @@
    1.43  
    1.44  
    1.45  
    1.46 +# Tokenising functions.
    1.47 +
    1.48 +class TokenStream:
    1.49 +
    1.50 +    "A stream of tokens taken from a string."
    1.51 +
    1.52 +    def __init__(self, s):
    1.53 +        self.s = s
    1.54 +        self.pos = 0
    1.55 +        self.match = None
    1.56 +        self.matching = None
    1.57 +
    1.58 +    def read_until(self, pattern_names, remaining=True):
    1.59 +
    1.60 +        """
    1.61 +        Find the first match for the given 'pattern_names'. Return the text
    1.62 +        preceding any match, the remaining text if no match was found, or None
    1.63 +        if no match was found and 'remaining' is given as a false value.
    1.64 +        """
    1.65 +
    1.66 +        first = None
    1.67 +        self.matching = None
    1.68 +
    1.69 +        # Find the first matching pattern.
    1.70 +
    1.71 +        for pattern_name in pattern_names:
    1.72 +            match = patterns[pattern_name].search(self.s, self.pos)
    1.73 +            if match:
    1.74 +                start, end = match.span()
    1.75 +                if self.matching is None or start < first:
    1.76 +                    first = start
    1.77 +                    self.matching = pattern_name
    1.78 +                    self.match = match
    1.79 +
    1.80 +        if self.matching is None:
    1.81 +            if remaining:
    1.82 +                return self.s[self.pos:]
    1.83 +            else:
    1.84 +                return None
    1.85 +        else:
    1.86 +            return self.s[self.pos:first]
    1.87 +
    1.88 +    def read_match(self):
    1.89 +
    1.90 +        "Return the matched text, updating the position in the stream."
    1.91 +
    1.92 +        if self.match:
    1.93 +            _start, self.pos = self.match.span()
    1.94 +            s = self.match.group(1)
    1.95 +            self.match = None
    1.96 +            return s
    1.97 +        else:
    1.98 +            self.pos = len(self.s)
    1.99 +            return None
   1.100 +
   1.101 +
   1.102 +
   1.103  # Parser functions.
   1.104  
   1.105  def parse_page(s):
   1.106 @@ -239,9 +279,7 @@
   1.107      Parse page text 's'. Pages consist of regions delimited by markers.
   1.108      """
   1.109  
   1.110 -    # Define tokens for interpretation by the parser.
   1.111 -
   1.112 -    items = iter(patterns["markers"].split(s))
   1.113 +    items = TokenStream(s)
   1.114  
   1.115      # Define a region for the page and parse it.
   1.116  
   1.117 @@ -253,94 +291,122 @@
   1.118  
   1.119      "Parse the data provided by 'items' to populate 'region'."
   1.120  
   1.121 -    nodes = region.nodes
   1.122 -    first = True
   1.123 +    # Parse section headers.
   1.124 +
   1.125 +    parse_region_header(items, region)
   1.126 +
   1.127 +    if region.is_transparent():
   1.128 +        parse_region_wiki(items, region)
   1.129 +    else:
   1.130 +        parse_region_opaque(items, region)
   1.131 +
   1.132 +def parse_region_wiki(items, region):
   1.133 +
   1.134 +    "Parse the data provided by 'items' to populate a wiki 'region'."
   1.135  
   1.136      # Process exposed text and sections.
   1.137  
   1.138 -    try:
   1.139 -        try:
   1.140 -            while True:
   1.141 -
   1.142 -                # Parse section headers.
   1.143 +    block = Block([])
   1.144 +    region.append(block)
   1.145  
   1.146 -                if first:
   1.147 -                    match_text = parse_region_header(items.next(), region)
   1.148 -                    first = False
   1.149 -                else:
   1.150 -                    match_text = items.next()
   1.151 +    while True:
   1.152  
   1.153 -                # Start a section if an appropriate marker is given.
   1.154 -
   1.155 -                if region.have_start(match_text):
   1.156 +        # Obtain text before any marker or the end of the input.
   1.157  
   1.158 -                    # Define the section and parse it.
   1.159 -
   1.160 -                    _region = Region([], len(match_text))
   1.161 -                    region.append(_region)
   1.162 -                    parse_region(items, _region)
   1.163 +        match_text = items.read_until(["break", "marker"])
   1.164 +        if match_text:
   1.165 +            block.append(Text(match_text))
   1.166  
   1.167 -                # Interpret the given marker, closing the current section if the
   1.168 -                # given marker is the corresponding end marker for the current
   1.169 -                # section.
   1.170 +        # Obtain any feature.
   1.171  
   1.172 -                elif region.have_end(match_text):
   1.173 -                    return
   1.174 -
   1.175 -                # Otherwise, parse text in the region.
   1.176 -
   1.177 -                else:
   1.178 -                    region.append(Text(match_text))
   1.179 +        feature = items.read_match()
   1.180  
   1.181          # End of input.
   1.182  
   1.183 -        except StopIteration:
   1.184 -            pass
   1.185 +        if not items.matching:
   1.186 +            break
   1.187 +
   1.188 +        # Start a section if an appropriate marker is given.
   1.189 +
   1.190 +        if region.have_start(feature):
   1.191 +
   1.192 +            # Define the section and parse it.
   1.193 +
   1.194 +            _region = Region([], len(feature))
   1.195 +            region.append(_region)
   1.196 +            parse_region(items, _region)
   1.197 +
   1.198 +            # Start a new block after the section.
   1.199 +
   1.200 +            block = Block([])
   1.201 +            region.append(block)
   1.202  
   1.203 -    finally:
   1.204 -        region.normalise()
   1.205 +        # Interpret the given marker, closing the current section if the
   1.206 +        # given marker is the corresponding end marker for the current
   1.207 +        # section.
   1.208 +
   1.209 +        elif region.have_end(feature):
   1.210 +            break
   1.211 +
   1.212 +        # Start a new block if a paragraph break is found.
   1.213 +
   1.214 +        elif items.matching == "break":
   1.215 +            block.final = False
   1.216 +            block = Block([])
   1.217 +            region.append(block)
   1.218 +
   1.219 +        # Add any inappropriate marker to the text.
   1.220 +
   1.221 +        else:
   1.222 +            block.append(Text(feature))
   1.223 +
   1.224 +    region.normalise()
   1.225  
   1.226 -        # Parse region contents, if possible.
   1.227 +def parse_region_opaque(items, region):
   1.228 +
   1.229 +    "Parse the data provided by 'items' to populate an opaque 'region'."
   1.230 +
   1.231 +    # Process exposed text and sections.
   1.232 +
   1.233 +    while True:
   1.234 +
   1.235 +        # Obtain text before any marker or the end of the input.
   1.236 +
   1.237 +        match_text = items.read_until(["marker"])
   1.238 +        if match_text:
   1.239 +            region.append(Text(match_text))
   1.240 +
   1.241 +        # Obtain any marker.
   1.242 +
   1.243 +        marker = items.read_match()
   1.244  
   1.245 -        region.expand()
   1.246 +        # End of input.
   1.247 +
   1.248 +        if not marker:
   1.249 +            break
   1.250 +
   1.251 +        # Interpret the given marker, closing the current section if the
   1.252 +        # given marker is the corresponding end marker for the current
   1.253 +        # section.
   1.254  
   1.255 -def parse_region_header(s, region):
   1.256 +        if region.have_end(marker):
   1.257 +            break
   1.258 +
   1.259 +        # Add any inappropriate marker to the text.
   1.260 +
   1.261 +        else:
   1.262 +            region.append(Text(marker))
   1.263 +
   1.264 +    region.normalise()
   1.265 +
   1.266 +def parse_region_header(items, region):
   1.267  
   1.268      """
   1.269 -    Parse the text 's', extracting any region header and setting it for the
   1.270 -    given 'region'. Return the remaining text.
   1.271 +    Parse the region header from the 'items', setting it for the given 'region'.
   1.272      """
   1.273  
   1.274 -    items = iter(patterns["header"].split(s))
   1.275 -    pre_header = items.next()
   1.276 -
   1.277 -    if not pre_header:
   1.278 -        region.type = items.next()
   1.279 -        return items.next()
   1.280 -    else:
   1.281 -        return pre_header
   1.282 -
   1.283 -def parse_region_text(s, region):
   1.284 -
   1.285 -    "Parse the text 's' as part of 'region'."
   1.286 -
   1.287 -    items = iter(patterns["region text"].split(s))
   1.288 -    block = Block([])
   1.289 -    region.append(block)
   1.290 -
   1.291 -    try:
   1.292 -        while True:
   1.293 -            match_text = items.next()
   1.294 -
   1.295 -            if not match_text.strip():
   1.296 -                region.append(block)
   1.297 -                block.final = False
   1.298 -                block = Block([])
   1.299 -            else:
   1.300 -                block.append(Text(match_text))
   1.301 -
   1.302 -    except StopIteration:
   1.303 -        pass
   1.304 +    if items.read_until(["header"], False) == "": # None means no header
   1.305 +        region.type = items.read_match()
   1.306  
   1.307  
   1.308  

     2.1 --- a/tests/test_parser.py	Wed Apr 26 17:32:08 2017 +0200
     2.2 +++ b/tests/test_parser.py	Thu Apr 27 18:13:53 2017 +0200
     2.3 @@ -51,11 +51,13 @@
     2.4  
     2.5  for s, n in zip([s0, s1, s2, s3], [ns0, ns1, ns2, ns3]):
     2.6      print n == s
     2.7 -    print
     2.8 +    print "----"
     2.9      print n
    2.10      print "----"
    2.11  
    2.12  for d in [d0, d1, d2, d3]:
    2.13 +    print
    2.14 +    print "----"
    2.15      print serialise(d, HTMLSerialiser)
    2.16      print "----"
    2.17
Changeset: 2017-04-27, Paul Boddie: Switched to using a token stream in order to support different parsing modes as different kinds of regions are encountered.
Files changed: moinformat.py, tests/test_parser.py