MoinLight (file moinformat/parsers/common.py at 518c6bf3b8ca)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from collections import defaultdict    23 from moinformat.tree import Block, Region, Text    24 import re    25     26 # Pattern management.    27     28 ws_excl_nl = r"[ \f\r\t\v]"    29     30 def get_patterns(syntax):    31     32     """    33     Define patterns for the regular expressions in the 'syntax' mapping. In each    34     pattern, replace \N with a pattern for matching whitespace excluding    35     newlines.    36     """    37     38     patterns = {}    39     for name, value in syntax.items():    40         value = value.replace(r"\N", ws_excl_nl)    41         patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)    42     return patterns    43     44 def get_subset(d, keys):    45     46     "Return a subset of 'd' having the given 'keys'."    47     48     subset = {}    49     for key in keys:    50         subset[key] = d[key]    51     return subset    52     53     54     55 # Tokenising functions.    56     57 class TokenStream:    58     59     "A stream of tokens taken from a string."    60     61     def __init__(self, s, pos=0):    62         self.s = s    63         self.pos = pos    64         self.match = None    65         self.matching = None    66     67     def rewind(self, length):    68     69         "Rewind in the string by 'length'."    70     71         self.pos -= min(length, self.pos)    72     73     def read_until(self, patterns, remaining=True):    74     75         """    76         Find the first match for the given 'patterns'. Return the text preceding    77         any match, the remaining text if no match was found, or None if no match    78         was found and 'remaining' is given as a false value.    79         """    80     81         first = None    82         self.matching = None    83     84         # Find the first matching pattern.    85     86         for pattern_name, pattern in patterns.items():    87             match = pattern.search(self.s, self.pos)    88             if match:    89                 start, end = match.span()    90                 if self.matching is None or start < first:    91                     first = start    92                     self.matching = pattern_name    93                     self.match = match    94     95         if self.matching is None:    96             if remaining:    97                 return self.s[self.pos:]    98             else:    99                 return None   100         else:   101             return self.s[self.pos:first]   102    103     def read_match(self, group=1):   104    105         """   106         Return the matched text, updating the position in the stream. If 'group'   107         is specified, the indicated group in a match will be returned.   108         Typically, group 1 should contain all pertinent data, but groups defined   109         within group 1 can provide sections of the data.   110         """   111    112         if self.match:   113             _start, self.pos = self.match.span()   114             try:   115                 return self.match.group(group)   116             except IndexError:   117                 return ""   118         else:   119             self.pos = len(self.s)   120             return None   121    122    123    124 # Parser abstractions.   125    126 class ParserBase:   127    128     "Common parsing methods."   129    130     region_pattern_names = None   131    132     def __init__(self, formats=None):   133    134         """   135         Initialise the parser with any given 'formats' mapping from region type   136         names to parser objects.   137         """   138    139         self.formats = formats   140         self.queued = defaultdict(list)   141    142     def get_parser(self, format_type):   143    144         """   145         Return a parser for 'format_type' or None if no suitable parser is found.   146         """   147    148         if not self.formats:   149             return None   150    151         cls = self.formats.get(format_type)   152         if cls:   153             return cls(self.formats)   154         else:   155             return None   156    157     def get_patterns(self, pattern_names):   158    159         "Return a mapping of the given 'pattern_names' to patterns."   160    161         return get_subset(self.patterns, pattern_names)   162    163     def get_items(self, s, pos=0):   164    165         "Return a sequence of token items for 's' and 'pos'."   166    167         return TokenStream(s, pos)   168    169     def set_region(self, items, region):   170    171         "Set the 'items' used to populate the given 'region'."   172    173         self.items = items   174         self.region = region   175    176     def read_until(self, pattern_names, remaining=True):   177    178         """   179         Read the next portion of input, matching using 'pattern_names'. Return   180         the text preceding any match, the remaining text if no match was found,   181         or None if no match was found and 'remaining' is given as a false value.   182         """   183    184         return self.items.read_until(self.get_patterns(pattern_names))   185    186     def read_match(self, group=1):   187    188         """   189         Return the group of the matching pattern with the given 'group' number.   190         """   191    192         return self.items.read_match(group)   193    194     def read_matching(self):   195    196         "Return the name of the matching pattern."   197    198         return self.items.matching   199    200     # Parser methods invoked from other objects.   201    202     def parse(self, s):   203    204         """   205         Parse page text 's'. Pages consist of regions delimited by markers.   206         """   207    208         self.items = self.get_items(s)   209         self.region = self.parse_region()   210         return self.region   211    212     def parse_region_content(self, items, region):   213    214         "Parse the data provided by 'items' to populate a 'region'."   215    216         self.set_region(items, region)   217    218         # Define a block to hold text and start parsing.   219    220         self.new_block(region)   221    222         if self.region_pattern_names:   223             self.parse_region_details(region, self.region_pattern_names)   224    225     # Top-level parser handler methods.   226    227     def parse_region(self, level=0, indent=0):   228    229         """   230         Parse the data to populate a region with the given 'level' at the given   231         'indent'.   232         """   233    234         region = Region([], level, indent)   235    236         # Parse section headers, then parse according to region type.   237    238         self.parse_region_header(region)   239         self.parse_region_type(region)   240    241         return region   242    243     def parse_region_type(self, region):   244    245         """   246         Use configured parsers to parse 'region' based on its type.   247         """   248    249         # Find an appropriate parser given the type.   250    251         parser = self.get_parser(region.type)   252    253         if parser:   254             parser.parse_region_content(self.items, region)   255    256         # Otherwise, treat the section as opaque.   257    258         else:   259             self.parse_region_opaque(region)   260    261     def parse_region_header(self, region):   262    263         """   264         Parse the region header, setting it on the 'region' object.   265         """   266    267         if self.read_until(["header"], False) == "": # None means no header   268             region.type = self.read_match()   269    270     def parse_region_opaque(self, region):   271    272         "Parse the data to populate an opaque 'region'."   273    274         region.transparent = False   275         self.parse_region_details(region, ["regionend"])   276    277     # Parsing utilities.   278    279     def parse_region_details(self, region, pattern_names, strict=False):   280    281         """   282         Search 'region' using the 'pattern_names'. If 'strict' is set to a true   283         value, forbid the accumulation of additional textual padding.   284         """   285    286         try:   287             while True:   288    289                 # Obtain text before any marker or the end of the input.   290    291                 preceding = self.read_until(pattern_names)   292                 if preceding:   293                     if not strict:   294                         region.append_inline(Text(preceding))   295                     else:   296                         break   297    298                 # End of input.   299    300                 if not self.read_matching():   301                     break   302    303                 # Obtain any feature.   304    305                 feature = self.read_match()   306                 handler = self.handlers.get(self.read_matching())   307    308                 # Handle each feature or add text to the region.   309    310                 if handler:   311                     handler(self, region)   312                 elif not strict:   313                     region.append_inline(Text(feature))   314                 else:   315                     break   316    317         except StopIteration:   318             pass   319    320         region.normalise()   321    322     def add_node(self, region, node):   323    324         "Add to 'region' the given 'node'."   325    326         region.add(node)   327         self.unqueue_region(region, node)   328    329     def append_node(self, region, node):   330    331         "Append to 'region' the given 'node'."   332    333         region.append(node)   334         self.unqueue_region(region, node)   335    336     def end_region(self, region):   337    338         "End the parsing of 'region', breaking out of the parsing loop."   339    340         raise StopIteration   341    342     def queue_region(self, region, current):   343    344         "Queue 'region' for appending after the 'current' region is ended."   345    346         self.queued[current].append(region)   347    348     def unqueue_region(self, region, ended):   349    350         "Unqueue any queued region, adding it to 'region' after 'ended'."   351    352         nodes = self.queued.get(ended)   353    354         while nodes:   355             node = nodes.pop()   356             region.add(node)   357             self.unqueue_region(region, node)   358    359         if self.queued.has_key(ended):   360             del self.queued[ended]   361    362     def new_block(self, region):   363    364         "Start a new block in 'region'."   365    366         self.add_node(region, Block([]))   367    368 # vim: tabstop=4 expandtab shiftwidth=4