MoinLight (file moinformat/parsing.py at 743a15ed73aa)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from moinformat.tree import Block, Region, Text    23 import re    24     25 # Pattern management.    26     27 ws_excl_nl = r"[ \f\r\t\v]"    28     29 def get_patterns(syntax):    30     31     """    32     Define patterns for the regular expressions in the 'syntax' mapping. In each    33     pattern, replace \N with a pattern for matching whitespace excluding    34     newlines.    35     """    36     37     patterns = {}    38     for name, value in syntax.items():    39         value = value.replace(r"\N", ws_excl_nl)    40         patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)    41     return patterns    42     43 def get_subset(d, keys):    44     45     "Return a subset of 'd' having the given 'keys'."    46     47     subset = {}    48     for key in keys:    49         subset[key] = d[key]    50     return subset    51     52     53     54 # Tokenising functions.    55     56 class TokenStream:    57     58     "A stream of tokens taken from a string."    59     60     def __init__(self, s, pos=0):    61         self.s = s    62         self.pos = pos    63         self.match = None    64         self.matching = None    65     66     def rewind(self, length):    67     68         "Rewind in the string by 'length'."    69     70         self.pos -= min(length, self.pos)    71     72     def read_until(self, patterns, remaining=True):    73     74         """    75         Find the first match for the given 'patterns'. Return the text preceding    76         any match, the remaining text if no match was found, or None if no match    77         was found and 'remaining' is given as a false value.    78         """    79     80         first = None    81         self.matching = None    82     83         # Find the first matching pattern.    84     85         for pattern_name, pattern in patterns.items():    86             match = pattern.search(self.s, self.pos)    87             if match:    88                 start, end = match.span()    89                 if self.matching is None or start < first:    90                     first = start    91                     self.matching = pattern_name    92                     self.match = match    93     94         if self.matching is None:    95             if remaining:    96                 return self.s[self.pos:]    97             else:    98                 return None    99         else:   100             return self.s[self.pos:first]   101    102     def read_match(self, group=1):   103    104         """   105         Return the matched text, updating the position in the stream. If 'group'   106         is specified, the indicated group in a match will be returned.   107         Typically, group 1 should contain all pertinent data, but groups defined   108         within group 1 can provide sections of the data.   109         """   110    111         if self.match:   112             _start, self.pos = self.match.span()   113             try:   114                 return self.match.group(group)   115             except IndexError:   116                 return ""   117         else:   118             self.pos = len(self.s)   119             return None   120    121    122    123 # Utility functions.   124    125 def new_block(region):   126    127     "Start a new block in 'region'."   128    129     region.add(Block([]))   130    131    132    133 # Parser abstractions.   134    135 class ParserBase:   136    137     "Common parsing methods."   138    139     region_pattern_names = None   140    141     def __init__(self, formats=None):   142    143         """   144         Initialise the parser with any given 'formats' mapping from region type   145         names to parser objects.   146         """   147    148         self.formats = formats   149    150     def get_parser(self, format_type):   151    152         """   153         Return a parser for 'format_type' or None if no suitable parser is found.   154         """   155    156         if not self.formats:   157             return None   158    159         cls = self.formats.get(format_type)   160         if cls:   161             return cls(self.formats)   162         else:   163             return None   164    165     def get_patterns(self, pattern_names):   166    167         "Return a mapping of the given 'pattern_names' to patterns."   168    169         return get_subset(self.patterns, pattern_names)   170    171     def get_items(self, s, pos=0):   172    173         "Return a sequence of token items for 's' and 'pos'."   174    175         return TokenStream(s, pos)   176    177     def set_region(self, items, region):   178    179         "Set the 'items' used to populate the given 'region'."   180    181         self.items = items   182         self.region = region   183    184     def read_until(self, pattern_names, remaining=True):   185    186         """   187         Read the next portion of input, matching using 'pattern_names'. Return   188         the text preceding any match, the remaining text if no match was found,   189         or None if no match was found and 'remaining' is given as a false value.   190         """   191    192         return self.items.read_until(self.get_patterns(pattern_names))   193    194     def read_match(self, group=1):   195    196         """   197         Return the group of the matching pattern with the given 'group' number.   198         """   199    200         return self.items.read_match(group)   201    202     def read_matching(self):   203    204         "Return the name of the matching pattern."   205    206         return self.items.matching   207    208     # Parser methods invoked from other objects.   209    210     def parse(self, s):   211    212         """   213         Parse page text 's'. Pages consist of regions delimited by markers.   214         """   215    216         self.items = self.get_items(s)   217         self.region = self.parse_region()   218         return self.region   219    220     def parse_region_content(self, items, region):   221    222         "Parse the data provided by 'items' to populate a 'region'."   223    224         self.set_region(items, region)   225    226         # Define a block to hold text and start parsing.   227    228         new_block(region)   229    230         if self.region_pattern_names:   231             self.parse_region_details(region, self.region_pattern_names)   232    233     # Top-level parser handler methods.   234    235     def parse_region(self, level=0, indent=0):   236    237         """   238         Parse the data to populate a region with the given 'level' at the given   239         'indent'.   240         """   241    242         region = Region([], level, indent)   243    244         # Parse section headers, then parse according to region type.   245    246         self.parse_region_header(region)   247         self.parse_region_type(region)   248    249         return region   250    251     def parse_region_type(self, region):   252    253         """   254         Use configured parsers to parse 'region' based on its type.   255         """   256    257         # Find an appropriate parser given the type.   258    259         parser = self.get_parser(region.type)   260    261         if parser:   262             parser.parse_region_content(self.items, region)   263    264         # Otherwise, treat the section as opaque.   265    266         else:   267             self.parse_region_opaque(region)   268    269     def parse_region_header(self, region):   270    271         """   272         Parse the region header, setting it on the 'region' object.   273         """   274    275         if self.read_until(["header"], False) == "": # None means no header   276             region.type = self.read_match()   277    278     def parse_region_opaque(self, region):   279    280         "Parse the data to populate an opaque 'region'."   281    282         region.transparent = False   283         self.parse_region_details(region, ["regionend"])   284    285     # Parsing utilities.   286    287     def parse_region_details(self, region, pattern_names):   288    289         "Search 'region' using the 'pattern_names'."   290    291         try:   292             while True:   293    294                 # Obtain text before any marker or the end of the input.   295    296                 preceding = self.read_until(pattern_names)   297                 if preceding:   298                     region.append_inline(Text(preceding))   299    300                 # End of input.   301    302                 if not self.read_matching():   303                     break   304    305                 # Obtain any feature.   306    307                 feature = self.read_match()   308                 handler = self.handlers.get(self.read_matching())   309    310                 # Handle each feature or add text to the region.   311    312                 if handler:   313                     handler(self, region)   314                 else:   315                     region.append_inline(Text(feature))   316    317         except StopIteration:   318             pass   319    320         region.normalise()   321    322     def end_region(self, region):   323    324         "End the parsing of 'region', breaking out of the parsing loop."   325    326         raise StopIteration   327    328 # vim: tabstop=4 expandtab shiftwidth=4