1.1 --- a/moinformat/parsers/common.py Fri Jul 12 22:06:58 2019 +0200
1.2 +++ b/moinformat/parsers/common.py Fri Jul 12 22:07:17 2019 +0200
1.3 @@ -50,7 +50,10 @@
1.4
1.5 def group(name, s):
1.6
1.7 - "Return a pattern group having 'name' and the pattern string 's'."
1.8 + """
1.9 + Return a pattern for the group having the given 'name' and employing the
1.10 + pattern string 's'.
1.11 + """
1.12
1.13 return "(?P<%s>%s)" % (name, s)
1.14
1.15 @@ -73,37 +76,52 @@
1.16 return "%s{%s,%s}" % (s, min is not None and min or "",
1.17 max is not None and max or "")
1.18
1.19 -def get_pattern(s):
1.20 -
1.21 - "Return a compiled regular expression for the given pattern 's'."
1.22 -
1.23 - return re.compile(s, re.UNICODE | re.MULTILINE)
1.24 -
1.25 def get_patterns(syntax):
1.26
1.27 """
1.28 Define patterns for the regular expressions in the 'syntax' mapping. In each
1.29 - pattern, replace \N with a pattern for matching whitespace excluding
1.30 - newlines.
1.31 + pattern, replace...
1.32 +
1.33 + \E with a pattern for matching all characters including newlines
1.34 + \N with a pattern for matching whitespace excluding newlines
1.35 + \P with a pattern for matching all characters within a paragraph
1.36 + \Q with a pattern for matching quotation marks
1.37 +
1.38 + Group names are also qualified with a pattern name prefix.
1.39 """
1.40
1.41 patterns = {}
1.42 +
1.43 for name, value in syntax.items():
1.44 value = value.replace(r"\N", ws_excl_nl)
1.45 value = value.replace(r"\Q", quotes)
1.46 value = value.replace(r"\E", dotall)
1.47 value = value.replace(r"\P", dotparagraph)
1.48 - patterns[name] = get_pattern(value)
1.49 +
1.50 + # Add the name to group names as a prefix.
1.51 +
1.52 + value = value.replace("(?P<", "(?P<%s_" % name)
1.53 + value = value.replace("(?P=", "(?P=%s_" % name)
1.54 +
1.55 + # Record the updated expression and add an identifying null group.
1.56 +
1.57 + patterns[name] = "%s(?P<group_%s>)" % (value, name)
1.58 +
1.59 return patterns
1.60
1.61 -def get_subset(d, keys):
1.62 +def get_expression(d, keys):
1.63
1.64 - "Return a subset of 'd' having the given 'keys'."
1.65 + """
1.66 + Return a compiled expression combining patterns in 'd' having the given
1.67 + 'keys'.
1.68 + """
1.69
1.70 - subset = {}
1.71 + subset = []
1.72 +
1.73 for key in keys:
1.74 - subset[key] = d[key]
1.75 - return subset
1.76 + subset.append(d[key])
1.77 +
1.78 + return re.compile("|".join(subset), re.UNICODE | re.MULTILINE)
1.79
1.80
1.81
1.82 @@ -121,7 +139,7 @@
1.83
1.84 self.match = None
1.85 self.queued = None
1.86 - self.match_start = None
1.87 + self.groups = {}
1.88
1.89 # Pattern name details.
1.90
1.91 @@ -139,56 +157,75 @@
1.92
1.93 self.queued = self.match
1.94
1.95 - def read_until(self, patterns, remaining=True):
1.96 + def read_until(self, expression, remaining=True):
1.97
1.98 """
1.99 - Find the first match for the given 'patterns'. Return the text preceding
1.100 - any match, the remaining text if no match was found, or None if no match
1.101 - was found and 'remaining' is given as a false value.
1.102 + Find the first match for the given 'expression'. Return the text
1.103 + preceding any match, the remaining text if no match was found, or None
1.104 + if no match was found and 'remaining' is given as a false value.
1.105 """
1.106
1.107 if self.queued:
1.108 self.match = self.queued
1.109 self.queued = None
1.110 else:
1.111 - self.match_start = None
1.112 self.matching = None
1.113
1.114 # Find the first matching pattern.
1.115
1.116 - for pattern_name, pattern in patterns.items():
1.117 - match = pattern.search(self.s, self.pos)
1.118 - if match:
1.119 - start, end = match.span()
1.120 - if self.matching is None or start < self.start:
1.121 - self.start = start
1.122 - self.matching = pattern_name
1.123 + match = expression.search(self.s, self.pos)
1.124 +
1.125 + if match:
1.126 + for name, value in match.groupdict().items():
1.127 +
1.128 + # Use a group with a non-null value to identify the
1.129 + # matching pattern.
1.130 +
1.131 + if name.startswith("group_") and value is not None:
1.132 + self.matching = name[len("group_"):]
1.133 + self.start, self.end = match.span()
1.134 self.match = match
1.135 + break
1.136 +
1.137 + # Return the remaining text, if appropriate.
1.138
1.139 if self.matching is None:
1.140 + self.groups = {}
1.141 if remaining:
1.142 return self.s[self.pos:]
1.143 else:
1.144 return None
1.145 else:
1.146 + self.groups = self.filter_groups()
1.147 return self.s[self.pos:self.start]
1.148
1.149 - def match_group(self, group=1):
1.150 + def filter_groups(self):
1.151 +
1.152 + "Return the matching pattern's groups keyed by their unprefixed names."
1.153 +
1.154 + prefix = "%s_" % self.matching
1.155 +
1.156 + d = {}
1.157 + for key, value in self.match.groupdict().items():
1.158 + if key.startswith(prefix):
1.159 + d[key[len(prefix):]] = value
1.160 + return d
1.161 +
1.162 + def match_group(self, group=None):
1.163
1.164 """
1.165 Return the matched text, updating the position in the stream. If 'group'
1.166 is specified, the indicated group in a match will be returned.
1.167 - Typically, group 1 should contain all pertinent data, but groups defined
1.168 - within group 1 can provide sections of the data.
1.169 + Otherwise, the entire match is returned.
1.170 """
1.171
1.172 self.update_pos()
1.173
1.174 if self.match:
1.175 - try:
1.176 - return self.match.group(group)
1.177 - except IndexError:
1.178 - return ""
1.179 + if group is None:
1.180 + return self.s[self.start:self.end]
1.181 + else:
1.182 + return self.groups.get(group)
1.183 else:
1.184 return None
1.185
1.186 @@ -200,9 +237,12 @@
1.187
1.188 if self.match:
1.189 if groups is None:
1.190 - return self.match.groups()
1.191 + return self.groups
1.192 else:
1.193 - return self.match.groups(groups)
1.194 + l = []
1.195 + for group in groups:
1.196 + l.append(self.groups.get(group))
1.197 + return l
1.198 else:
1.199 return []
1.200
1.201 @@ -248,11 +288,11 @@
1.202 else:
1.203 return None
1.204
1.205 - def get_patterns(self, pattern_names):
1.206 + def get_expression(self, pattern_names):
1.207
1.208 - "Return a mapping of the given 'pattern_names' to patterns."
1.208 + "Return a compiled expression combining the given 'pattern_names'."
1.209
1.210 - return get_subset(self.patterns, pattern_names)
1.211 + return get_expression(self.patterns, pattern_names)
1.212
1.213 def get_items(self, s, pos=0):
1.214
1.215 @@ -275,12 +315,13 @@
1.216 or None if no match was found and 'remaining' is given as a false value.
1.217 """
1.218
1.219 - return self.items.read_until(self.get_patterns(pattern_names))
1.220 + return self.items.read_until(self.get_expression(pattern_names))
1.221
1.222 - def match_group(self, group=1):
1.223 + def match_group(self, group=None):
1.224
1.225 """
1.226 - Return the group of the matching pattern with the given 'group' number.
1.227 + Return the group of the matching pattern with the given 'group'
1.228 + identifier. If 'group' is omitted or None, return the entire match.
1.229 """
1.230
1.231 return self.items.match_group(group)