#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

try:
    # cgi.escape was deprecated in Python 3.2 and removed in Python 3.8 (the
    # whole cgi module was removed in 3.13). html.escape is the replacement;
    # its quote parameter defaults to True, which is harmless in the HTML
    # text and attribute contexts where escape is used below.
    from html import escape
except ImportError:
    from cgi import escape

import re

# Regular expressions.

syntax = {
    # Page regions:
    "marker" : (r"^\s*([{]{3,}|[}]{3,})", re.MULTILINE | re.DOTALL), # {{{... or }}}...

    # Region contents:
    "header" : (r"#!(.*?)\n", 0),                                    # #! char-excl-nl
    "break"  : (r"^\s*?\n", re.MULTILINE),                           # blank line
    }

# Define patterns for the regular expressions.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):
        self.nodes = nodes

    def append(self, node):

        "Add 'node' to the end of this container."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        # Subclasses are expected to provide a real implementation.

        pass

class Region(Container):

    "A region of the page."

    # Region types whose contents are themselves parsed as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', marker 'level' (the
        number of brace characters opening the region, 0 for the top-level
        page) and region 'type' (the #! header value, if any).
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_start(self, s):

        "Return whether 's' opens a subregion within this region."

        return self.is_transparent() and s.startswith("{")

    def have_end(self, s):

        "Return whether 's' is the end marker corresponding to this region."

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        """
        Return whether this region's contents are wiki markup: either the
        top-level page (level 0) or a region of a transparent type.
        """

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):
        l = ["%sRegion: level=%d type=%s" % (indent, self.level, self.type)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise this region and its nodes using the 'out' serialiser."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. 'final' is set to a false
        value when the block was terminated by a paragraph break rather than
        by the end of its region.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):
        l = ["%sBlock: final=%s" % (indent, self.final)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise this block and its nodes using the 'out' serialiser."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):
        self.s = s

    def merge(self, text):

        "Absorb the given 'text' node into this one."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):
        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):
        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        # 'out' is a callable accepting a string fragment, such as list.append.

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)    # marker
        if type and level:
            out("#!%s\n" % type) # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)    # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # Restore the blank line that terminated a non-final block.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level) # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True)) # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):
        self.s = s
        self.pos = 0

        # The most recent match object and the name of the pattern producing
        # it, maintained by read_until and consumed by read_match.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None
        self.matching = None

        # Find the first matching pattern.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        "Return the matched text, updating the position in the stream."

        if self.match:
            _start, self.pos = self.match.span()

            # Return the first captured group where the matched pattern
            # defines one (marker, header). The break pattern captures
            # nothing, so the whole match is returned instead: reading
            # group(1) unconditionally would raise IndexError there.

            if self.match.re.groups:
                s = self.match.group(1)
            else:
                s = self.match.group()
            self.match = None
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the resulting top-level Region.
    """

    items = TokenStream(s)

    # Define a region for the page and parse it.

    region = Region([])
    parse_region(items, region)
    return region

def parse_region(items, region):

    "Parse the data provided by 'items' to populate 'region'."

    # Parse section headers.

    parse_region_header(items, region)

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Process exposed text and sections.

    block = Block([])
    region.append(block)

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["break", "marker"])
        if preceding:
            block.append(Text(preceding))

        # Obtain any feature.

        feature = items.read_match()

        # End of input.

        if not items.matching:
            break

        # Start a section if an appropriate marker is given.

        if region.have_start(feature):

            # Define the section and parse it.

            _region = Region([], len(feature))
            region.append(_region)
            parse_region(items, _region)

            # Start a new block after the section.

            block = Block([])
            region.append(block)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif region.have_end(feature):
            break

        # Start a new block if a paragraph break is found.

        elif items.matching == "break":
            block.final = False
            block = Block([])
            region.append(block)

        # Add any inappropriate marker to the text.

        else:
            block.append(Text(feature))

    region.normalise()

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Process exposed text and sections.

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["marker"])
        if preceding:
            region.append(Text(preceding))

        # Obtain any marker.

        marker = items.read_match()

        # End of input.

        if not marker:
            break

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        if region.have_end(marker):
            break

        # Add any inappropriate marker to the text.

        else:
            region.append(Text(marker))

    region.normalise()

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # An empty string as the preceding text means the header occurs at the
    # very start of the region; None or non-empty text means no header.

    if items.read_until(["header"], False) == "":
        region.type = items.read_match()



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Serialise the given 'doc' using the given 'serialiser' class, returning
    the serialised text.
    """

    l = []
    doc.to_string(serialiser(l.append))
    return "".join(l)

# vim: tabstop=4 expandtab shiftwidth=4