1.1 --- a/moinformat.py Wed Apr 26 17:32:08 2017 +0200
1.2 +++ b/moinformat.py Thu Apr 27 18:13:53 2017 +0200
1.3 @@ -26,11 +26,11 @@
1.4
1.5 syntax = {
1.6 # Page regions:
1.7 - "markers" : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...
1.8 + "marker" : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...
1.9
1.10 # Region contents:
1.11 - "header" : (r"\A#!(.*?)\n", 0), # #! char-excl-nl
1.12 - "region text" : (r"(^\s*$)", re.MULTILINE), # blank line
1.13 + "header" : (r"#!(.*?)\n", 0), # #! char-excl-nl
1.14 + "break" : (r"^\s*?\n", re.MULTILINE), # blank line
1.15 }
1.16
1.17 # Define patterns for the regular expressions.
1.18 @@ -95,23 +95,6 @@
1.19 self.level = level
1.20 self.type = type
1.21
1.22 - def expand(self):
1.23 -
1.24 - """
1.25 - Expand text nodes by parsing them as region text, if the region is
1.26 - understandable to the standard parser.
1.27 - """
1.28 -
1.29 - if self.is_transparent():
1.30 - nodes = self.nodes
1.31 - self.nodes = []
1.32 -
1.33 - for node in nodes:
1.34 - if isinstance(node, Text):
1.35 - parse_region_text(node.s, self)
1.36 - else:
1.37 - self.append(node)
1.38 -
1.39 def have_start(self, s):
1.40 return self.is_transparent() and s.startswith("{")
1.41
1.42 @@ -231,6 +214,63 @@
1.43
1.44
1.45
1.46 +# Tokenising functions.
1.47 +
1.48 +class TokenStream:
1.49 +
1.50 +    "A stream of tokens taken from a string, located using named patterns."
1.51 +
1.52 +    def __init__(self, s):
1.53 +        self.s = s              # the text being tokenised
1.54 +        self.pos = 0            # offset at which the next search starts
1.55 +        self.match = None       # pending match, consumed by read_match
1.56 +        self.matching = None    # name of the pattern giving the last match
1.57 +
1.58 +    def read_until(self, pattern_names, remaining=True):
1.59 +
1.60 +        """
1.61 +        Find the earliest match for the given 'pattern_names'. Return the text
1.62 +        preceding any match, the remaining text if no match was found, or None
1.63 +        if no match was found and 'remaining' is given as a false value.
1.64 +        """
1.65 +
1.66 +        # Forget any earlier result so that a failed search cannot leave a
1.67 +        # stale match behind for read_match to consume.
1.68 +
1.69 +        first = None
1.70 +        self.match = self.matching = None
1.71 +
1.72 +        # Keep the earliest match; on a tie the earlier-listed pattern wins.
1.73 +
1.74 +        for pattern_name in pattern_names:
1.75 +            match = patterns[pattern_name].search(self.s, self.pos)
1.76 +            if match and (first is None or match.start() < first):
1.77 +                first = match.start()
1.78 +                self.matching = pattern_name
1.79 +                self.match = match
1.80 +
1.81 +        if self.match is not None:
1.82 +            return self.s[self.pos:first]
1.83 +        elif remaining:
1.84 +            return self.s[self.pos:]
1.85 +        else:
1.86 +            return None
1.87 +
1.88 +    def read_match(self):
1.89 +
1.90 +        "Return the matched text, updating the position in the stream."
1.91 +
1.92 +        if not self.match:
1.93 +            self.pos = len(self.s)
1.94 +            return None
1.95 +
1.96 +        # Patterns lacking a capturing group yield their whole match.
1.97 +        match, self.match = self.match, None
1.98 +        self.pos = match.end()
1.99 +        return match.group(1 if match.re.groups else 0)
1.100 +
1.101 +
1.102 +
1.103 # Parser functions.
1.104
1.105 def parse_page(s):
1.106 @@ -239,9 +279,7 @@
1.107 Parse page text 's'. Pages consist of regions delimited by markers.
1.108 """
1.109
1.110 - # Define tokens for interpretation by the parser.
1.111 -
1.112 - items = iter(patterns["markers"].split(s))
1.113 + items = TokenStream(s)
1.114
1.115 # Define a region for the page and parse it.
1.116
1.117 @@ -253,94 +291,122 @@
1.118
1.119 "Parse the data provided by 'items' to populate 'region'."
1.120
1.121 - nodes = region.nodes
1.122 - first = True
1.123 + # Parse section headers.
1.124 +
1.125 + parse_region_header(items, region)
1.126 +
1.127 + if region.is_transparent():
1.128 + parse_region_wiki(items, region)
1.129 + else:
1.130 + parse_region_opaque(items, region)
1.131 +
1.132 +def parse_region_wiki(items, region):
1.133 +
1.134 + "Parse the data provided by 'items' to populate a wiki 'region'."
1.135
1.136 # Process exposed text and sections.
1.137
1.138 - try:
1.139 - try:
1.140 - while True:
1.141 -
1.142 - # Parse section headers.
1.143 + block = Block([])
1.144 + region.append(block)
1.145
1.146 - if first:
1.147 - match_text = parse_region_header(items.next(), region)
1.148 - first = False
1.149 - else:
1.150 - match_text = items.next()
1.151 + while True:
1.152
1.153 - # Start a section if an appropriate marker is given.
1.154 -
1.155 - if region.have_start(match_text):
1.156 + # Obtain text before any break, marker or the end of the input.
1.157
1.158 - # Define the section and parse it.
1.159 -
1.160 - _region = Region([], len(match_text))
1.161 - region.append(_region)
1.162 - parse_region(items, _region)
1.163 + match_text = items.read_until(["break", "marker"])
1.164 + if match_text:
1.165 + block.append(Text(match_text))
1.166
1.167 - # Interpret the given marker, closing the current section if the
1.168 - # given marker is the corresponding end marker for the current
1.169 - # section.
1.170 + # Obtain any feature.
1.171
1.172 - elif region.have_end(match_text):
1.173 - return
1.174 -
1.175 - # Otherwise, parse text in the region.
1.176 -
1.177 - else:
1.178 - region.append(Text(match_text))
1.179 + feature = items.read_match()
1.180
1.181 # End of input.
1.182
1.183 - except StopIteration:
1.184 - pass
1.185 + if not items.matching:
1.186 + break
1.187 +
1.188 + # Start a section if an appropriate marker is given.
1.189 +
1.190 + if region.have_start(feature):
1.191 +
1.192 + # Define the section and parse it.
1.193 +
1.194 + _region = Region([], len(feature))
1.195 + region.append(_region)
1.196 + parse_region(items, _region)
1.197 +
1.198 + # Start a new block after the section.
1.199 +
1.200 + block = Block([])
1.201 + region.append(block)
1.202
1.203 - finally:
1.204 - region.normalise()
1.205 + # Interpret the given marker, closing the current section if the
1.206 + # given marker is the corresponding end marker for the current
1.207 + # section.
1.208 +
1.209 + elif region.have_end(feature):
1.210 + break
1.211 +
1.212 + # Start a new block if a paragraph break is found.
1.213 +
1.214 + elif items.matching == "break":
1.215 + block.final = False
1.216 + block = Block([])
1.217 + region.append(block)
1.218 +
1.219 + # Add any inappropriate marker to the text.
1.220 +
1.221 + else:
1.222 + block.append(Text(feature))
1.223 +
1.224 + region.normalise()
1.225
1.226 - # Parse region contents, if possible.
1.227 +def parse_region_opaque(items, region):
1.228 +
1.229 + "Parse the data provided by 'items' to populate an opaque 'region'."
1.230 +
1.231 + # Process exposed text until the end marker or the end of the input.
1.232 +
1.233 + while True:
1.234 +
1.235 + # Obtain text before any marker or the end of the input.
1.236 +
1.237 + match_text = items.read_until(["marker"])
1.238 + if match_text:
1.239 + region.append(Text(match_text))
1.240 +
1.241 + # Obtain any marker.
1.242 +
1.243 + marker = items.read_match()
1.244
1.245 - region.expand()
1.246 + # End of input.
1.247 +
1.248 + if not marker:
1.249 + break
1.250 +
1.251 + # Interpret the given marker, closing the current section if the
1.252 + # given marker is the corresponding end marker for the current
1.253 + # section.
1.254
1.255 -def parse_region_header(s, region):
1.256 + if region.have_end(marker):
1.257 + break
1.258 +
1.259 + # Add any inappropriate marker to the text.
1.260 +
1.261 + else:
1.262 + region.append(Text(marker))
1.263 +
1.264 + region.normalise()
1.265 +
1.266 +def parse_region_header(items, region):
1.267
1.268 """
1.269 - Parse the text 's', extracting any region header and setting it for the
1.270 - given 'region'. Return the remaining text.
1.271 + Parse the region header from the 'items', setting it for the given 'region'.
1.272 """
1.273
1.274 - items = iter(patterns["header"].split(s))
1.275 - pre_header = items.next()
1.276 -
1.277 - if not pre_header:
1.278 - region.type = items.next()
1.279 - return items.next()
1.280 - else:
1.281 - return pre_header
1.282 -
1.283 -def parse_region_text(s, region):
1.284 -
1.285 - "Parse the text 's' as part of 'region'."
1.286 -
1.287 - items = iter(patterns["region text"].split(s))
1.288 - block = Block([])
1.289 - region.append(block)
1.290 -
1.291 - try:
1.292 - while True:
1.293 - match_text = items.next()
1.294 -
1.295 - if not match_text.strip():
1.296 - region.append(block)
1.297 - block.final = False
1.298 - block = Block([])
1.299 - else:
1.300 - block.append(Text(match_text))
1.301 -
1.302 - except StopIteration:
1.303 - pass
1.304 + if items.read_until(["header"], False) == "": # None means no header
1.305 + region.type = items.read_match()
1.306
1.307
1.308
2.1 --- a/tests/test_parser.py Wed Apr 26 17:32:08 2017 +0200
2.2 +++ b/tests/test_parser.py Thu Apr 27 18:13:53 2017 +0200
2.3 @@ -51,11 +51,13 @@
2.4
2.5 for s, n in zip([s0, s1, s2, s3], [ns0, ns1, ns2, ns3]):
2.6 print n == s
2.7 - print
2.8 + print "----"
2.9 print n
2.10 print "----"
2.11
2.12 for d in [d0, d1, d2, d3]:
2.13 + print
2.14 + print "----"
2.15 print serialise(d, HTMLSerialiser)
2.16 print "----"
2.17