# HG changeset patch # User Paul Boddie # Date 1562962037 -7200 # Node ID 90003650d826a4ea57f0ea9a42ba8b52ef7bdc5b # Parent 2b8cbd82ee13ce4f6f8487224fcc30d175b57f2a# Parent ee5d6cf035d9a9cf080f85cb3daa2c5f1110c696 Merged the single-regexp-searching branch at last. diff -r 2b8cbd82ee13 -r 90003650d826 moinformat/parsers/common.py --- a/moinformat/parsers/common.py Fri Jul 12 22:06:58 2019 +0200 +++ b/moinformat/parsers/common.py Fri Jul 12 22:07:17 2019 +0200 @@ -50,7 +50,10 @@ def group(name, s): - "Return a pattern group having 'name' and the pattern string 's'." + """ + Return a pattern for the group having the given 'name' and employing the + pattern string 's'. + """ return "(?P<%s>%s)" % (name, s) @@ -73,37 +76,52 @@ return "%s{%s,%s}" % (s, min is not None and min or "", max is not None and max or "") -def get_pattern(s): - - "Return a compiled regular expression for the given pattern 's'." - - return re.compile(s, re.UNICODE | re.MULTILINE) - def get_patterns(syntax): """ Define patterns for the regular expressions in the 'syntax' mapping. In each - pattern, replace \N with a pattern for matching whitespace excluding - newlines. + pattern, replace... + + \E with a pattern for matching all characters including newlines + \N with a pattern for matching whitespace excluding newlines + \P with a pattern for matching all characters within a paragraph + \Q with a pattern for matching quotation marks + + Group names are also qualified with a pattern name prefix. """ patterns = {} + for name, value in syntax.items(): value = value.replace(r"\N", ws_excl_nl) value = value.replace(r"\Q", quotes) value = value.replace(r"\E", dotall) value = value.replace(r"\P", dotparagraph) - patterns[name] = get_pattern(value) + + # Add the name to group names as a prefix. + + value = value.replace("(?P<", "(?P<%s_" % name) + value = value.replace("(?P=", "(?P=%s_" % name) + + # Record the updated expression and add an identifying null group. 
+ + patterns[name] = "%s(?P<group_%s>)" % (value, name) + return patterns -def get_subset(d, keys): +def get_expression(d, keys): - "Return a subset of 'd' having the given 'keys'." + """ + Return a compiled expression combining patterns in 'd' having the given + 'keys'. + """ - subset = {} + subset = [] + for key in keys: - subset[key] = d[key] - return subset + subset.append(d[key]) + + return re.compile("|".join(subset), re.UNICODE | re.MULTILINE) @@ -121,7 +139,7 @@ self.match = None self.queued = None - self.match_start = None + self.groups = {} # Pattern name details. @@ -139,56 +157,75 @@ self.queued = self.match - def read_until(self, patterns, remaining=True): + def read_until(self, expression, remaining=True): """ - Find the first match for the given 'patterns'. Return the text preceding - any match, the remaining text if no match was found, or None if no match - was found and 'remaining' is given as a false value. + Find the first match for the given 'expression'. Return the text + preceding any match, the remaining text if no match was found, or None + if no match was found and 'remaining' is given as a false value. """ if self.queued: self.match = self.queued self.queued = None else: - self.match_start = None self.matching = None # Find the first matching pattern. - for pattern_name, pattern in patterns.items(): - match = pattern.search(self.s, self.pos) - if match: - start, end = match.span() - if self.matching is None or start < self.start: - self.start = start - self.matching = pattern_name + match = expression.search(self.s, self.pos) + + if match: + for name, value in match.groupdict().items(): + + # Use a group with a non-null value to identify the + # matching pattern. + + if name.startswith("group_") and value is not None: + self.matching = name[len("group_"):] + self.start, self.end = match.span() self.match = match + break + + # Return the remaining text, if appropriate. 
if self.matching is None: + self.groups = {} if remaining: return self.s[self.pos:] else: return None else: + self.groups = self.filter_groups() return self.s[self.pos:self.start] - def match_group(self, group=1): + def filter_groups(self): + + "Filter groups from the current match for the matching pattern." + + prefix = "%s_" % self.matching + + d = {} + for key, value in self.match.groupdict().items(): + if key.startswith(prefix): + d[key[len(prefix):]] = value + return d + + def match_group(self, group=None): """ Return the matched text, updating the position in the stream. If 'group' is specified, the indicated group in a match will be returned. - Typically, group 1 should contain all pertinent data, but groups defined - within group 1 can provide sections of the data. + Otherwise, the entire match is returned. """ self.update_pos() if self.match: - try: - return self.match.group(group) - except IndexError: - return "" + if group is None: + return self.s[self.start:self.end] + else: + return self.groups.get(group) else: return None @@ -200,9 +237,12 @@ if self.match: if groups is None: - return self.match.groups() + return self.groups else: - return self.match.groups(groups) + l = [] + for group in groups: + l.append(self.groups.get(group)) + return l else: return [] @@ -248,11 +288,11 @@ else: return None - def get_patterns(self, pattern_names): + def get_expression(self, pattern_names): "Return a mapping of the given 'pattern_names' to patterns." - return get_subset(self.patterns, pattern_names) + return get_expression(self.patterns, pattern_names) def get_items(self, s, pos=0): @@ -275,12 +315,13 @@ or None if no match was found and 'remaining' is given as a false value. """ - return self.items.read_until(self.get_patterns(pattern_names)) + return self.items.read_until(self.get_expression(pattern_names)) - def match_group(self, group=1): + def match_group(self, group=None): """ - Return the group of the matching pattern with the given 'group' number. 
+ Return the group of the matching pattern with the given 'group' + identifier. If 'group' is omitted or None, return the entire match. """ return self.items.match_group(group)