MoinLight (file moinformat/parsers/common.py at d9f2d31daebd)

     #!/usr/bin/env python

"""
Moin wiki parsing functionality.

Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from collections import defaultdict
from moinformat.tree.moin import Block, Region, Text
import re

# Pattern management.

# Substitutions applied by get_patterns: \N, \Q and \E respectively.

ws_excl_nl = r"[ \f\r\t\v]"
quotes = "['" '"]'              # ['"]
dotall = r"(.|\n)"

def choice(l):

    "Return a pattern matching a choice of patterns in 'l'."

    return "(%s)" % "|".join(l)

def excl(s):

    "Return a non-matching pattern (negative lookahead) for 's'."

    return "(?!%s)" % s

def expect(s):

    "Return a pattern expecting 's' (positive lookahead, consuming nothing)."

    return "(?=%s)" % s

def group(name, s):

    "Return a pattern group having 'name' and the pattern string 's'."

    return "(?P<%s>%s)" % (name, s)

def optional(s):

    "Return an optional, non-capturing pattern for 's'."

    return "(?:%s)?" % s

def recur(name):

    "Return a test for a recurrence of group 'name'."

    return "(?P=%s)" % name

def repeat(s, min=None, max=None):

    """
    Return a pattern matching 's' for the given 'min' and 'max' limits. A limit
    given as None is omitted from the quantifier, leaving that bound open.
    """

    # Test explicitly for None: the "x and x or ''" idiom also discards a limit
    # of zero, which turned a maximum of 0 into an unbounded repetition.

    return "%s{%s,%s}" % (s, "" if min is None else min,
                             "" if max is None else max)

def get_pattern(s):

    "Return a compiled regular expression for the given pattern 's'."

    return re.compile(s, re.UNICODE | re.MULTILINE)

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping,
    returning a mapping from the same names to compiled patterns. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines, \Q with a pattern matching a quote character, and \E with a
    pattern matching any character including a newline.
    """

    # The docstring above is a raw string because \N is otherwise an invalid
    # escape sequence in Python 3 string literals.

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        value = value.replace(r"\Q", quotes)
        value = value.replace(r"\E", dotall)
        patterns[name] = get_pattern(value)
    return patterns

def get_subset(d, keys):

    """
    Return a subset of 'd' having the given 'keys'. A key absent from 'd'
    causes a KeyError.
    """

    subset = {}
    for key in keys:
        subset[key] = d[key]
    return subset



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for string 's' starting at position 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None

        # The start of the current match. Both names are maintained: 'start' is
        # the attribute used by read_until, 'match_start' the one originally
        # initialised here.

        self.start = None
        self.match_start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        "Rewind in the string by 'length', stopping at the start of the string."

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until yields it
        again instead of performing a new search.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns', this being a mapping from
        pattern names to compiled patterns. Return the text preceding any match,
        the remaining text if no match was found, or None if no match was found
        and 'remaining' is given as a false value.

        The stream position is not changed by this method: it is updated when
        the match is inspected using match_group or match_groups.
        """

        # Reinstate any queued match. The pattern name and start position still
        # describe it since they are only reset by a new search.

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:

            # Discard the previous match so that a failed search cannot leave a
            # stale match to be consulted by match_group or update_pos.

            self.match = None
            self.start = self.match_start = None
            self.matching = None

            # Find the first matching pattern.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.start:
                        self.start = self.match_start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.start]

    def match_group(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        An empty string is returned if the pattern has no such group, and None
        is returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            return None

    def match_groups(self, groups=None):

        """
        Return the match 'groups', or all groups if unspecified, updating the
        position in the stream. Where 'groups' is given, it is a sequence of
        group names or numbers, and a tuple of the corresponding values is
        returned, with an empty string for any group not defined by the
        pattern. An empty list is returned if there is no current match.
        """

        self.update_pos()

        if self.match:
            if groups is None:
                return self.match.groups()
            else:

                # Select the requested groups. Passing 'groups' to the match
                # object's groups method would instead make it the default value
                # for non-participating groups.

                values = []
                for group in groups:
                    try:
                        values.append(self.match.group(group))
                    except IndexError:
                        values.append("")
                return tuple(values)
        else:
            return []

    def update_pos(self):

        """
        Update the position in the stream, moving to the end of the current
        match or to the end of the string if there is no match.
        """

        if self.match:
            _start, self.pos = self.match.span()
        else:
            self.pos = len(self.s)



# Parser abstractions.

class ParserBase:

    """
    Common parsing methods.

    The methods here consult 'patterns' and 'handlers' attributes that are not
    defined in this class and which are presumably provided by subclasses:
    'patterns' maps pattern names to compiled patterns, and 'handlers' maps
    pattern names to callables invoked with the parser and a node.
    """

    region_pattern_names = None

    def __init__(self, formats=None, root=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. An optional 'root' indicates the document-level
        parser.
        """

        self.formats = formats
        self.root = root

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        Any new parser shares the formats of this parser and refers to the same
        root parser, this parser acting as the root if none was given.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats, self.root or self)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # Pass 'remaining' on: callers giving a false value rely on None to
        # distinguish the absence of a match from an immediate match.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def match_group(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number
        or name.
        """

        return self.items.match_group(group)

    def matching_pattern(self):

        "Return the name of the matching pattern, or None if nothing matched."

        return self.items.matching

    def match_groups(self, groups=None):

        "Return the match 'groups', or all groups in the match if unspecified."

        return self.items.match_groups(groups)

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the region describing the page.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Parse inline and opaque regions.

        if not region.transparent:
            pattern_names = ["regionend"]

        # Define a block to hold text.

        else:
            self.new_block(region)
            pattern_names = self.region_pattern_names

        # Start parsing.

        if pattern_names:
            self.parse_region_details(region, pattern_names)

        # Reset the type if the region was not inline.

        if region.type == "inline":
            first = region.nodes and region.nodes[0]
            if first and isinstance(first, Text) and first.multiline():
                region.type = None

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0, type=None):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent' having the given initial 'type'. Return the region.
        """

        region = Region([], level, indent, type)

        # Parse section headers and directives, then parse according to region
        # type.

        self.parse_region_header(region)
        self.parse_region_directives(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type. Where no
        parser is configured for the type, the region is marked as not being
        transparent and the "moin" parser is used to populate it.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)
        if not parser:
            region.transparent = False

        # NOTE: A "moin" entry is required in the formats for this fallback.

        parser = parser or self.get_parser("moin")
        parser.parse_region_content(self.items, region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. The region
        type is taken to be the first space-separated word of the header
        arguments.
        """

        if self.read_until(["header"], False) == "": # None means no header
            args = self.match_group("args")
            region.args = args

            # The arguments are None if the group did not participate in the
            # match, in which case the type is left unchanged.

            if args is not None:
                region.type = args.split(" ", 1)[0]

    def parse_region_directives(self, region):

        """
        Parse any directives immediately after the region header, adding them to
        the 'region' object.
        """

        while True:
            preceding = self.read_until(["directive"], False)

            # With an immediately-appearing directive, handle its details.

            if preceding == "":
                handler = self.handlers.get(self.matching_pattern())
                if handler:
                    handler(self, region)
                else:
                    break

            # Otherwise, with no immediate directive (or none at all), stop.

            else:
                break

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the input is exhausted or until a handler ends
        the region by raising StopIteration. The region is then normalised.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.matching_pattern():
                    break

                # Obtain any feature, consuming the match in the process.

                feature = self.match_group("feature") or self.match_group()
                handler = self.handlers.get(self.matching_pattern())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Handlers signal the end of the region using this exception.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop in
        parse_region_details by raising StopIteration.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match so that it is yielded by the next read."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))

    # Common handler methods.

    def parse_region_end(self, node):

        """
        Handle the end of a region occurring within 'node'. If the matched
        marker ends the current region, StopIteration is raised to leave the
        parsing loop. Otherwise, the marker is added to 'node' as text.
        """

        level = self.match_group("level")
        feature = self.match_group("feature")
        self.region.extra = self.match_group("extra")

        if self.region.have_end(level):
            raise StopIteration
        else:
            node.append_inline(Text(feature))

# vim: tabstop=4 expandtab shiftwidth=4