MoinLight (file moinformat/parsers/common.py at 47af441b48bf)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from collections import defaultdict    23 from moinformat.tree.moin import Block, Region, Text    24 import re    25     26 # Pattern management.    27     28 ws_excl_nl = r"[ \f\r\t\v]"    29 quotes = "['" '"]'                  # ['"]    30 dotall = r"(.|\n)"                  # behave similarly to dot with DOTALL option    31 dotparagraph = r"(.|\n(?!\r?\n))"   # match everything within paragraphs    32     33 def choice(l):    34     35     "Return a pattern matching a choice of patterns in 'l'."    36     37     return "(%s)" % "|".join(l)    38     39 def excl(s):    40     41     "Return a non-matching pattern for 's'."    42     43     return "(?!%s)" % s    44     45 def expect(s):    46     47     "Return a pattern expecting 's'."    48     49     return "(?=%s)" % s    50     51 def group(name, s):    52     53     "Return a pattern group having 'name' and the pattern string 's'."    54     55     return "(?P<%s>%s)" % (name, s)    56     57 def optional(s):    58     59     "Return an optional pattern."    60     61     return "(?:%s)?" % s    62     63 def recur(name):    64     65     "Return a test for a recurrence of group 'name'."    66     67     return "(?P=%s)" % name    68     69 def repeat(s, min=None, max=None):    70     71     "Return a pattern matching 's' for the given 'min' and 'max' limits."    72     73     return "%s{%s,%s}" % (s, min is not None and min or "",    74                              max is not None and max or "")    75     76 def get_pattern(s):    77     78     "Return a compiled regular expression for the given pattern 's'."    79     80     return re.compile(s, re.UNICODE | re.MULTILINE)    81     82 def get_patterns(syntax):    83     84     """    85     Define patterns for the regular expressions in the 'syntax' mapping. In each    86     pattern, replace \N with a pattern for matching whitespace excluding    87     newlines.    88     """    89     90     patterns = {}    91     for name, value in syntax.items():    92         value = value.replace(r"\N", ws_excl_nl)    93         value = value.replace(r"\Q", quotes)    94         value = value.replace(r"\E", dotall)    95         value = value.replace(r"\P", dotparagraph)    96         patterns[name] = get_pattern(value)    97     return patterns    98     99 def get_subset(d, keys):   100    101     "Return a subset of 'd' having the given 'keys'."   102    103     subset = {}   104     for key in keys:   105         subset[key] = d[key]   106     return subset   107    108    109    110 # Tokenising functions.   111    112 class TokenStream:   113    114     "A stream of tokens taken from a string."   115    116     def __init__(self, s, pos=0):   117         self.s = s   118         self.pos = pos   119    120         # Match details.   121    122         self.match = None   123         self.queued = None   124         self.match_start = None   125    126         # Pattern name details.   127    128         self.matching = None   129    130     def rewind(self, length):   131    132         "Rewind in the string by 'length'."   133    134         self.pos -= min(length, self.pos)   135    136     def queue_match(self):   137    138         "Rewind in the string to the start of the last match."   139    140         self.queued = self.match   141    142     def read_until(self, patterns, remaining=True):   143    144         """   145         Find the first match for the given 'patterns'. Return the text preceding   146         any match, the remaining text if no match was found, or None if no match   147         was found and 'remaining' is given as a false value.   148         """   149    150         if self.queued:   151             self.match = self.queued   152             self.queued = None   153         else:   154             self.match_start = None   155             self.matching = None   156    157             # Find the first matching pattern.   158    159             for pattern_name, pattern in patterns.items():   160                 match = pattern.search(self.s, self.pos)   161                 if match:   162                     start, end = match.span()   163                     if self.matching is None or start < self.start:   164                         self.start = start   165                         self.matching = pattern_name   166                         self.match = match   167    168         if self.matching is None:   169             if remaining:   170                 return self.s[self.pos:]   171             else:   172                 return None   173         else:   174             return self.s[self.pos:self.start]   175    176     def match_group(self, group=1):   177    178         """   179         Return the matched text, updating the position in the stream. If 'group'   180         is specified, the indicated group in a match will be returned.   181         Typically, group 1 should contain all pertinent data, but groups defined   182         within group 1 can provide sections of the data.   183         """   184    185         self.update_pos()   186    187         if self.match:   188             try:   189                 return self.match.group(group)   190             except IndexError:   191                 return ""   192         else:   193             return None   194    195     def match_groups(self, groups=None):   196    197         "Return the match 'groups', or all groups if unspecified."   198    199         self.update_pos()   200    201         if self.match:   202             if groups is None:   203                 return self.match.groups()   204             else:   205                 return self.match.groups(groups)   206         else:   207             return []   208    209     def update_pos(self):   210    211         "Update the position in the stream."   212    213         if self.match:   214             _start, self.pos = self.match.span()   215         else:   216             self.pos = len(self.s)   217    218    219    220 # Parser abstractions.   221    222 class ParserBase:   223    224     "Common parsing methods."   225    226     region_pattern_names = None   227    228     def __init__(self, metadata, parsers=None, root=None):   229    230         """   231         Initialise the parser with the given 'metadata' and optional 'parsers'.   232         An optional 'root' indicates the document-level parser.   233         """   234    235         self.metadata = metadata   236         self.parsers = parsers   237         self.root = root   238    239     def get_parser(self, format_type):   240    241         """   242         Return a parser for 'format_type' or None if no suitable parser is found.   243         """   244    245         cls = self.parsers and self.parsers.get(format_type)   246         if cls:   247             return cls(self.metadata, self.parsers, self.root or self)   248         else:   249             return None   250    251     def get_patterns(self, pattern_names):   252    253         "Return a mapping of the given 'pattern_names' to patterns."   254    255         return get_subset(self.patterns, pattern_names)   256    257     def get_items(self, s, pos=0):   258    259         "Return a sequence of token items for 's' and 'pos'."   260    261         return TokenStream(s, pos)   262    263     def set_region(self, items, region):   264    265         "Set the 'items' used to populate the given 'region'."   266    267         self.items = items   268         self.region = region   269    270     def read_until(self, pattern_names, remaining=True):   271    272         """   273         Read the next portion of input, matching using 'pattern_names'. Return   274         the text preceding any match, the remaining text if no match was found,   275         or None if no match was found and 'remaining' is given as a false value.   276         """   277    278         return self.items.read_until(self.get_patterns(pattern_names))   279    280     def match_group(self, group=1):   281    282         """   283         Return the group of the matching pattern with the given 'group' number.   284         """   285    286         return self.items.match_group(group)   287    288     def matching_pattern(self):   289    290         "Return the name of the matching pattern."   291    292         return self.items.matching   293    294     def match_groups(self):   295    296         "Return the number of groups in the match."   297    298         return self.items.match_groups()   299    300     # Parser methods invoked from other objects.   301    302     def parse(self, s):   303    304         """   305         Parse page text 's'. Pages consist of regions delimited by markers.   306         """   307    308         self.items = self.get_items(s)   309         self.region = self.parse_region()   310         return self.region   311    312     def parse_region_content(self, items, region):   313    314         "Parse the data provided by 'items' to populate a 'region'."   315    316         self.set_region(items, region)   317    318         # Parse inline and opaque regions.   319    320         if not region.transparent:   321             pattern_names = ["regionend"]   322    323         # Define a block to hold text.   324    325         else:   326             self.new_block(region)   327             pattern_names = self.region_pattern_names   328    329         # Start parsing.   330    331         if pattern_names:   332             self.parse_region_details(region, pattern_names)   333    334         # Reset the type if the region was not inline.   335    336         if region.type == "inline":   337             first = region.nodes and region.nodes[0]   338             if first and isinstance(first, Text) and first.multiline():   339                 region.type = None   340    341     # Top-level parser handler methods.   342    343     def parse_region(self, level=0, indent=0, type=None):   344    345         """   346         Parse the data to populate a region with the given 'level' at the given   347         'indent' having the given initial 'type'.   348         """   349    350         region = Region([], level, indent, type)   351    352         # Parse section headers and directives, then parse according to region   353         # type.   354    355         self.parse_region_header(region)   356         self.parse_region_directives(region)   357         self.parse_region_type(region)   358    359         return region   360    361     def parse_region_type(self, region):   362    363         """   364         Use configured parsers to parse 'region' based on its type.   365         """   366    367         # Find an appropriate parser given the type.   368    369         parser = self.get_parser(region.type)   370         if not parser:   371             region.transparent = False   372         parser = parser or self.get_parser("moin")   373         parser.parse_region_content(self.items, region)   374    375     def parse_region_header(self, region):   376    377         """   378         Parse the region header, setting it on the 'region' object.   379         """   380    381         if self.read_until(["header"], False) == "": # None means no header   382             region.args = self.match_group("args")   383             region.type = region.args.split(" ", 1)[0]   384    385     def parse_region_directives(self, region):   386    387         """   388         Parse any directives immediately after the region header, adding them to   389         the 'region' object.   390         """   391    392         while True:   393             preceding = self.read_until(["directive"], False)   394    395             # With an immediately-appearing directive, handle its details.   396    397             if preceding == "":   398                 handler = self.handlers.get(self.matching_pattern())   399                 if handler:   400                     handler(self, region)   401                 else:   402                     break   403    404             # Otherwise, with no immediate directive (or none at all), stop.   405    406             else:   407                 break   408    409     # Parsing utilities.   410    411     def parse_region_details(self, region, pattern_names, strict=False):   412    413         """   414         Search 'region' using the 'pattern_names'. If 'strict' is set to a true   415         value, forbid the accumulation of additional textual padding.   416         """   417    418         try:   419             while True:   420    421                 # Obtain text before any marker or the end of the input.   422    423                 preceding = self.read_until(pattern_names)   424                 if preceding:   425                     if not strict:   426                         region.append_inline(Text(preceding))   427                     else:   428                         break   429    430                 # End of input.   431    432                 if not self.matching_pattern():   433                     break   434    435                 # Obtain any feature.   436    437                 feature = self.match_group("feature") or self.match_group()   438                 handler = self.handlers.get(self.matching_pattern())   439    440                 # Handle each feature or add text to the region.   441    442                 if handler:   443                     handler(self, region)   444                 elif not strict:   445                     region.append_inline(Text(feature))   446                 else:   447                     break   448    449         except StopIteration:   450             pass   451    452         region.normalise()   453    454     def add_node(self, region, node):   455    456         "Add to 'region' the given 'node'."   457    458         region.add(node)   459    460     def append_node(self, region, node):   461    462         "Append to 'region' the given 'node'."   463    464         region.append(node)   465    466     def end_region(self, region):   467    468         "End the parsing of 'region', breaking out of the parsing loop."   469    470         raise StopIteration   471    472     def queue_match(self):   473    474         "Queue the current match."   475    476         self.items.queue_match()   477    478     def new_block(self, region):   479    480         "Start a new block in 'region'."   481    482         self.add_node(region, Block([]))   483    484     # Common handler methods.   485    486     def parse_region_end(self, node):   487    488         "Handle the end of a region occurring within 'node'."   489    490         level = self.match_group("level")   491         feature = self.match_group("feature")   492         self.region.extra = self.match_group("extra")   493    494         if self.region.have_end(level):   495             raise StopIteration   496         else:   497             node.append_inline(Text(feature))   498    499 # vim: tabstop=4 expandtab shiftwidth=4