MoinLight (file moinformat/parsers/common.py at 602ab3fbb29c)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from collections import defaultdict    23 from moinformat.tree import Block, Region, Text    24 import re    25     26 # Pattern management.    27     28 ws_excl_nl = r"[ \f\r\t\v]"    29 quotes = "['" '"]'              # ['"]    30     31 def excl(s):    32     33     "Return a non-matching pattern for 's'."    34     35     return "(?!%s)" % s    36     37 def expect(s):    38     39     "Return a pattern expecting 's'."    40     41     return "(?=%s)" % s    42     43 def group(name, s):    44     45     "Return a pattern group having 'name' and the pattern string 's'."    46     47     return "(?P<%s>%s)" % (name, s)    48     49 def optional(s):    50     51     "Return an optional pattern."    52     53     return "(?:%s)?" % s    54     55 def recur(name):    56     57     "Return a test for a recurrence of group 'name'."    58     59     return "(?P=%s)" % name    60     61 def repeat(s, min=None, max=None):    62     63     "Return a pattern matching 's' for the given 'min' and 'max' limits."    64     65     return "%s{%s,%s}" % (s, min is not None and min or "",    66                              max is not None and max or "")    67     68 def get_patterns(syntax):    69     70     """    71     Define patterns for the regular expressions in the 'syntax' mapping. In each    72     pattern, replace \N with a pattern for matching whitespace excluding    73     newlines.    74     """    75     76     patterns = {}    77     for name, value in syntax.items():    78         value = value.replace(r"\N", ws_excl_nl)    79         value = value.replace(r"\Q", quotes)    80         patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)    81     return patterns    82     83 def get_subset(d, keys):    84     85     "Return a subset of 'd' having the given 'keys'."    86     87     subset = {}    88     for key in keys:    89         subset[key] = d[key]    90     return subset    91     92     93     94 # Tokenising functions.    95     96 class TokenStream:    97     98     "A stream of tokens taken from a string."    99    100     def __init__(self, s, pos=0):   101         self.s = s   102         self.pos = pos   103    104         # Match details.   105    106         self.match = None   107         self.queued = None   108         self.match_start = None   109    110         # Pattern name details.   111    112         self.matching = None   113    114     def rewind(self, length):   115    116         "Rewind in the string by 'length'."   117    118         self.pos -= min(length, self.pos)   119    120     def queue_match(self):   121    122         "Rewind in the string to the start of the last match."   123    124         self.queued = self.match   125    126     def read_until(self, patterns, remaining=True):   127    128         """   129         Find the first match for the given 'patterns'. Return the text preceding   130         any match, the remaining text if no match was found, or None if no match   131         was found and 'remaining' is given as a false value.   132         """   133    134         if self.queued:   135             self.match = self.queued   136             self.queued = None   137         else:   138             self.match_start = None   139             self.matching = None   140    141             # Find the first matching pattern.   142    143             for pattern_name, pattern in patterns.items():   144                 match = pattern.search(self.s, self.pos)   145                 if match:   146                     start, end = match.span()   147                     if self.matching is None or start < self.start:   148                         self.start = start   149                         self.matching = pattern_name   150                         self.match = match   151    152         if self.matching is None:   153             if remaining:   154                 return self.s[self.pos:]   155             else:   156                 return None   157         else:   158             return self.s[self.pos:self.start]   159    160     def match_group(self, group=1):   161    162         """   163         Return the matched text, updating the position in the stream. If 'group'   164         is specified, the indicated group in a match will be returned.   165         Typically, group 1 should contain all pertinent data, but groups defined   166         within group 1 can provide sections of the data.   167         """   168    169         self.update_pos()   170    171         if self.match:   172             try:   173                 return self.match.group(group)   174             except IndexError:   175                 return ""   176         else:   177             return None   178    179     def match_groups(self, groups=None):   180    181         "Return the match 'groups', or all groups if unspecified."   182    183         self.update_pos()   184    185         if self.match:   186             if groups is None:   187                 return self.match.groups()   188             else:   189                 return self.match.groups(groups)   190         else:   191             return []   192    193     def update_pos(self):   194    195         "Update the position in the stream."   196    197         if self.match:   198             _start, self.pos = self.match.span()   199         else:   200             self.pos = len(self.s)   201    202    203    204 # Parser abstractions.   205    206 class ParserBase:   207    208     "Common parsing methods."   209    210     region_pattern_names = None   211    212     def __init__(self, formats=None):   213    214         """   215         Initialise the parser with any given 'formats' mapping from region type   216         names to parser objects.   217         """   218    219         self.formats = formats   220    221     def get_parser(self, format_type):   222    223         """   224         Return a parser for 'format_type' or None if no suitable parser is found.   225         """   226    227         if not self.formats:   228             return None   229    230         cls = self.formats.get(format_type)   231         if cls:   232             return cls(self.formats)   233         else:   234             return None   235    236     def get_patterns(self, pattern_names):   237    238         "Return a mapping of the given 'pattern_names' to patterns."   239    240         return get_subset(self.patterns, pattern_names)   241    242     def get_items(self, s, pos=0):   243    244         "Return a sequence of token items for 's' and 'pos'."   245    246         return TokenStream(s, pos)   247    248     def set_region(self, items, region):   249    250         "Set the 'items' used to populate the given 'region'."   251    252         self.items = items   253         self.region = region   254    255     def read_until(self, pattern_names, remaining=True):   256    257         """   258         Read the next portion of input, matching using 'pattern_names'. Return   259         the text preceding any match, the remaining text if no match was found,   260         or None if no match was found and 'remaining' is given as a false value.   261         """   262    263         return self.items.read_until(self.get_patterns(pattern_names))   264    265     def match_group(self, group=1):   266    267         """   268         Return the group of the matching pattern with the given 'group' number.   269         """   270    271         return self.items.match_group(group)   272    273     def matching_pattern(self):   274    275         "Return the name of the matching pattern."   276    277         return self.items.matching   278    279     def match_groups(self):   280    281         "Return the number of groups in the match."   282    283         return self.items.match_groups()   284    285     # Parser methods invoked from other objects.   286    287     def parse(self, s):   288    289         """   290         Parse page text 's'. Pages consist of regions delimited by markers.   291         """   292    293         self.items = self.get_items(s)   294         self.region = self.parse_region()   295         return self.region   296    297     def parse_region_content(self, items, region):   298    299         "Parse the data provided by 'items' to populate a 'region'."   300    301         self.set_region(items, region)   302    303         # Define a block to hold text and start parsing.   304    305         self.new_block(region)   306    307         if self.region_pattern_names:   308             self.parse_region_details(region, self.region_pattern_names)   309    310     # Top-level parser handler methods.   311    312     def parse_region(self, level=0, indent=0, type=None):   313    314         """   315         Parse the data to populate a region with the given 'level' at the given   316         'indent' having the given initial 'type'.   317         """   318    319         region = Region([], level, indent, type)   320    321         # Parse section headers, then parse according to region type.   322    323         self.parse_region_header(region)   324         self.parse_region_type(region)   325    326         return region   327    328     def parse_region_type(self, region):   329    330         """   331         Use configured parsers to parse 'region' based on its type.   332         """   333    334         # Handle potentially inline regions.   335    336         if region.type == "inline":   337             self.parse_region_inline(region)   338             return   339    340         # Find an appropriate parser given the type.   341    342         parser = self.get_parser(region.type)   343    344         if parser:   345             parser.parse_region_content(self.items, region)   346    347         # Otherwise, treat the section as opaque.   348    349         else:   350             self.parse_region_opaque(region)   351    352     def parse_region_header(self, region):   353    354         """   355         Parse the region header, setting it on the 'region' object.   356         """   357    358         if self.read_until(["header"], False) == "": # None means no header   359             region.type = self.match_group("args")   360    361     def parse_region_opaque(self, region):   362    363         "Parse the data to populate an opaque 'region'."   364    365         region.transparent = False   366         self.parse_region_details(region, ["regionend"])   367    368     def parse_region_inline(self, region):   369    370         "Parse the data to populate an inline 'region'."   371    372         region.transparent = False   373         self.parse_region_details(region, ["regionend"])   374    375         # Reset the type if the region was not inline.   376    377         if region.type == "inline":   378             first = region.nodes and region.nodes[0]   379             if first and isinstance(first, Text) and first.multiline():   380                 region.type = None   381    382     # Parsing utilities.   383    384     def parse_region_details(self, region, pattern_names, strict=False):   385    386         """   387         Search 'region' using the 'pattern_names'. If 'strict' is set to a true   388         value, forbid the accumulation of additional textual padding.   389         """   390    391         try:   392             while True:   393    394                 # Obtain text before any marker or the end of the input.   395    396                 preceding = self.read_until(pattern_names)   397                 if preceding:   398                     if not strict:   399                         region.append_inline(Text(preceding))   400                     else:   401                         break   402    403                 # End of input.   404    405                 if not self.matching_pattern():   406                     break   407    408                 # Obtain any feature.   409    410                 feature = self.match_group("feature") or self.match_group()   411                 handler = self.handlers.get(self.matching_pattern())   412    413                 # Handle each feature or add text to the region.   414    415                 if handler:   416                     handler(self, region)   417                 elif not strict:   418                     region.append_inline(Text(feature))   419                 else:   420                     break   421    422         except StopIteration:   423             pass   424    425         region.normalise()   426    427     def add_node(self, region, node):   428    429         "Add to 'region' the given 'node'."   430    431         region.add(node)   432    433     def append_node(self, region, node):   434    435         "Append to 'region' the given 'node'."   436    437         region.append(node)   438    439     def end_region(self, region):   440    441         "End the parsing of 'region', breaking out of the parsing loop."   442    443         raise StopIteration   444    445     def queue_match(self):   446    447         "Queue the current match."   448    449         self.items.queue_match()   450    451     def new_block(self, region):   452    453         "Start a new block in 'region'."   454    455         self.add_node(region, Block([]))   456    457 # vim: tabstop=4 expandtab shiftwidth=4