MoinLight (file moinformat/parsers/common.py at 69cb676460b1)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki parsing functionality.     5      6 Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from collections import defaultdict    23 from moinformat.tree.moin import Block, Region, Text    24 import re    25     26 # Pattern management.    27     28 ws_excl_nl = r"[ \f\r\t\v]"    29 quotes = "['" '"]'              # ['"]    30 dotall = r"(.|\n)"    31     32 def choice(l):    33     34     "Return a pattern matching a choice of patterns in 'l'."    35     36     return "(%s)" % "|".join(l)    37     38 def excl(s):    39     40     "Return a non-matching pattern for 's'."    41     42     return "(?!%s)" % s    43     44 def expect(s):    45     46     "Return a pattern expecting 's'."    47     48     return "(?=%s)" % s    49     50 def group(name, s):    51     52     "Return a pattern group having 'name' and the pattern string 's'."    53     54     return "(?P<%s>%s)" % (name, s)    55     56 def optional(s):    57     58     "Return an optional pattern."    59     60     return "(?:%s)?" % s    61     62 def recur(name):    63     64     "Return a test for a recurrence of group 'name'."    65     66     return "(?P=%s)" % name    67     68 def repeat(s, min=None, max=None):    69     70     "Return a pattern matching 's' for the given 'min' and 'max' limits."    71     72     return "%s{%s,%s}" % (s, min is not None and min or "",    73                              max is not None and max or "")    74     75 def get_pattern(s):    76     77     "Return a compiled regular expression for the given pattern 's'."    78     79     return re.compile(s, re.UNICODE | re.MULTILINE)    80     81 def get_patterns(syntax):    82     83     """    84     Define patterns for the regular expressions in the 'syntax' mapping. In each    85     pattern, replace \N with a pattern for matching whitespace excluding    86     newlines.    87     """    88     89     patterns = {}    90     for name, value in syntax.items():    91         value = value.replace(r"\N", ws_excl_nl)    92         value = value.replace(r"\Q", quotes)    93         value = value.replace(r"\E", dotall)    94         patterns[name] = get_pattern(value)    95     return patterns    96     97 def get_subset(d, keys):    98     99     "Return a subset of 'd' having the given 'keys'."   100    101     subset = {}   102     for key in keys:   103         subset[key] = d[key]   104     return subset   105    106    107    108 # Tokenising functions.   109    110 class TokenStream:   111    112     "A stream of tokens taken from a string."   113    114     def __init__(self, s, pos=0):   115         self.s = s   116         self.pos = pos   117    118         # Match details.   119    120         self.match = None   121         self.queued = None   122         self.match_start = None   123    124         # Pattern name details.   125    126         self.matching = None   127    128     def rewind(self, length):   129    130         "Rewind in the string by 'length'."   131    132         self.pos -= min(length, self.pos)   133    134     def queue_match(self):   135    136         "Rewind in the string to the start of the last match."   137    138         self.queued = self.match   139    140     def read_until(self, patterns, remaining=True):   141    142         """   143         Find the first match for the given 'patterns'. Return the text preceding   144         any match, the remaining text if no match was found, or None if no match   145         was found and 'remaining' is given as a false value.   146         """   147    148         if self.queued:   149             self.match = self.queued   150             self.queued = None   151         else:   152             self.match_start = None   153             self.matching = None   154    155             # Find the first matching pattern.   156    157             for pattern_name, pattern in patterns.items():   158                 match = pattern.search(self.s, self.pos)   159                 if match:   160                     start, end = match.span()   161                     if self.matching is None or start < self.start:   162                         self.start = start   163                         self.matching = pattern_name   164                         self.match = match   165    166         if self.matching is None:   167             if remaining:   168                 return self.s[self.pos:]   169             else:   170                 return None   171         else:   172             return self.s[self.pos:self.start]   173    174     def match_group(self, group=1):   175    176         """   177         Return the matched text, updating the position in the stream. If 'group'   178         is specified, the indicated group in a match will be returned.   179         Typically, group 1 should contain all pertinent data, but groups defined   180         within group 1 can provide sections of the data.   181         """   182    183         self.update_pos()   184    185         if self.match:   186             try:   187                 return self.match.group(group)   188             except IndexError:   189                 return ""   190         else:   191             return None   192    193     def match_groups(self, groups=None):   194    195         "Return the match 'groups', or all groups if unspecified."   196    197         self.update_pos()   198    199         if self.match:   200             if groups is None:   201                 return self.match.groups()   202             else:   203                 return self.match.groups(groups)   204         else:   205             return []   206    207     def update_pos(self):   208    209         "Update the position in the stream."   210    211         if self.match:   212             _start, self.pos = self.match.span()   213         else:   214             self.pos = len(self.s)   215    216    217    218 # Parser abstractions.   219    220 class ParserBase:   221    222     "Common parsing methods."   223    224     region_pattern_names = None   225    226     def __init__(self, metadata, parsers=None, root=None):   227    228         """   229         Initialise the parser with the given 'metadata' and optional 'parsers'.   230         An optional 'root' indicates the document-level parser.   231         """   232    233         self.metadata = metadata   234         self.parsers = parsers   235         self.root = root   236    237     def get_parser(self, format_type):   238    239         """   240         Return a parser for 'format_type' or None if no suitable parser is found.   241         """   242    243         cls = self.parsers and self.parsers.get(format_type)   244         if cls:   245             return cls(self.metadata, self.parsers, self.root or self)   246         else:   247             return None   248    249     def get_patterns(self, pattern_names):   250    251         "Return a mapping of the given 'pattern_names' to patterns."   252    253         return get_subset(self.patterns, pattern_names)   254    255     def get_items(self, s, pos=0):   256    257         "Return a sequence of token items for 's' and 'pos'."   258    259         return TokenStream(s, pos)   260    261     def set_region(self, items, region):   262    263         "Set the 'items' used to populate the given 'region'."   264    265         self.items = items   266         self.region = region   267    268     def read_until(self, pattern_names, remaining=True):   269    270         """   271         Read the next portion of input, matching using 'pattern_names'. Return   272         the text preceding any match, the remaining text if no match was found,   273         or None if no match was found and 'remaining' is given as a false value.   274         """   275    276         return self.items.read_until(self.get_patterns(pattern_names))   277    278     def match_group(self, group=1):   279    280         """   281         Return the group of the matching pattern with the given 'group' number.   282         """   283    284         return self.items.match_group(group)   285    286     def matching_pattern(self):   287    288         "Return the name of the matching pattern."   289    290         return self.items.matching   291    292     def match_groups(self):   293    294         "Return the number of groups in the match."   295    296         return self.items.match_groups()   297    298     # Parser methods invoked from other objects.   299    300     def parse(self, s):   301    302         """   303         Parse page text 's'. Pages consist of regions delimited by markers.   304         """   305    306         self.items = self.get_items(s)   307         self.region = self.parse_region()   308         return self.region   309    310     def parse_region_content(self, items, region):   311    312         "Parse the data provided by 'items' to populate a 'region'."   313    314         self.set_region(items, region)   315    316         # Parse inline and opaque regions.   317    318         if not region.transparent:   319             pattern_names = ["regionend"]   320    321         # Define a block to hold text.   322    323         else:   324             self.new_block(region)   325             pattern_names = self.region_pattern_names   326    327         # Start parsing.   328    329         if pattern_names:   330             self.parse_region_details(region, pattern_names)   331    332         # Reset the type if the region was not inline.   333    334         if region.type == "inline":   335             first = region.nodes and region.nodes[0]   336             if first and isinstance(first, Text) and first.multiline():   337                 region.type = None   338    339     # Top-level parser handler methods.   340    341     def parse_region(self, level=0, indent=0, type=None):   342    343         """   344         Parse the data to populate a region with the given 'level' at the given   345         'indent' having the given initial 'type'.   346         """   347    348         region = Region([], level, indent, type)   349    350         # Parse section headers and directives, then parse according to region   351         # type.   352    353         self.parse_region_header(region)   354         self.parse_region_directives(region)   355         self.parse_region_type(region)   356    357         return region   358    359     def parse_region_type(self, region):   360    361         """   362         Use configured parsers to parse 'region' based on its type.   363         """   364    365         # Find an appropriate parser given the type.   366    367         parser = self.get_parser(region.type)   368         if not parser:   369             region.transparent = False   370         parser = parser or self.get_parser("moin")   371         parser.parse_region_content(self.items, region)   372    373     def parse_region_header(self, region):   374    375         """   376         Parse the region header, setting it on the 'region' object.   377         """   378    379         if self.read_until(["header"], False) == "": # None means no header   380             region.args = self.match_group("args")   381             region.type = region.args.split(" ", 1)[0]   382    383     def parse_region_directives(self, region):   384    385         """   386         Parse any directives immediately after the region header, adding them to   387         the 'region' object.   388         """   389    390         while True:   391             preceding = self.read_until(["directive"], False)   392    393             # With an immediately-appearing directive, handle its details.   394    395             if preceding == "":   396                 handler = self.handlers.get(self.matching_pattern())   397                 if handler:   398                     handler(self, region)   399                 else:   400                     break   401    402             # Otherwise, with no immediate directive (or none at all), stop.   403    404             else:   405                 break   406    407     # Parsing utilities.   408    409     def parse_region_details(self, region, pattern_names, strict=False):   410    411         """   412         Search 'region' using the 'pattern_names'. If 'strict' is set to a true   413         value, forbid the accumulation of additional textual padding.   414         """   415    416         try:   417             while True:   418    419                 # Obtain text before any marker or the end of the input.   420    421                 preceding = self.read_until(pattern_names)   422                 if preceding:   423                     if not strict:   424                         region.append_inline(Text(preceding))   425                     else:   426                         break   427    428                 # End of input.   429    430                 if not self.matching_pattern():   431                     break   432    433                 # Obtain any feature.   434    435                 feature = self.match_group("feature") or self.match_group()   436                 handler = self.handlers.get(self.matching_pattern())   437    438                 # Handle each feature or add text to the region.   439    440                 if handler:   441                     handler(self, region)   442                 elif not strict:   443                     region.append_inline(Text(feature))   444                 else:   445                     break   446    447         except StopIteration:   448             pass   449    450         region.normalise()   451    452     def add_node(self, region, node):   453    454         "Add to 'region' the given 'node'."   455    456         region.add(node)   457    458     def append_node(self, region, node):   459    460         "Append to 'region' the given 'node'."   461    462         region.append(node)   463    464     def end_region(self, region):   465    466         "End the parsing of 'region', breaking out of the parsing loop."   467    468         raise StopIteration   469    470     def queue_match(self):   471    472         "Queue the current match."   473    474         self.items.queue_match()   475    476     def new_block(self, region):   477    478         "Start a new block in 'region'."   479    480         self.add_node(region, Block([]))   481    482     # Common handler methods.   483    484     def parse_region_end(self, node):   485    486         "Handle the end of a region occurring within 'node'."   487    488         level = self.match_group("level")   489         feature = self.match_group("feature")   490         self.region.extra = self.match_group("extra")   491    492         if self.region.have_end(level):   493             raise StopIteration   494         else:   495             node.append_inline(Text(feature))   496    497 # vim: tabstop=4 expandtab shiftwidth=4