#!/usr/bin/env python

"""
Moin wiki format parser.

Copyright (C) 2012, 2013, 2015, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program. If not, see <http://www.gnu.org/licenses/>.
"""

# cgi.escape was removed in Python 3.8 and the cgi module itself in Python
# 3.13, so prefer html.escape, keeping cgi.escape as a fallback for very old
# interpreters. (html.escape additionally escapes quotes by default, which
# only makes the HTML output safer.)

try:
    from html import escape
except ImportError:
    from cgi import escape

import re

# Regular expressions: name -> (pattern, extra flags).

syntax = {
    # Page regions:
    "regionstart" : (r"^\s*([{]{3,})", re.MULTILINE | re.DOTALL), # {{{...
    "regionend" : (r"^\s*([}]{3,})", re.MULTILINE | re.DOTALL),   # }}}...
    "header" : (r"#!(.*?)\n", 0),                                 # #! char-excl-nl

    # Region contents:
    "break" : (r"^(\s*?)\n", re.MULTILINE),                       # blank line
    }

# Compile the patterns for the regular expressions once, at import time.

patterns = {}
for name, (value, flags) in syntax.items():
    patterns[name] = re.compile(value, re.UNICODE | flags)



# Document nodes.

class Container:

    "A container of document nodes."

    def __init__(self, nodes):

        "Initialise the container with the given list of 'nodes'."

        self.nodes = nodes

    def append(self, node):

        "Append 'node' to the container's nodes."

        self.nodes.append(node)

    def normalise(self):

        "Combine adjacent text nodes."

        nodes = self.nodes
        self.nodes = []
        text = None

        for node in nodes:

            # Open a text node or merge text into an open node.

            if isinstance(node, Text):
                if not text:
                    text = node
                else:
                    text.merge(node)

            # Close any open text node and append the current node.

            else:
                if text:
                    self.append(text)
                    text = None
                self.append(node)

        # Add any open text node.

        if text:
            self.append(text)

    def __str__(self):
        return self.prettyprint()

    def prettyprint(self, indent=""):

        # Subclasses are expected to provide a real implementation.

        pass

class Region(Container):

    "A region of the page."

    # Types of region whose contents are treated as wiki markup.

    transparent_region_types = ["wiki"]

    def __init__(self, nodes, level=0, type=None):

        """
        Initialise the region with the given 'nodes', the marker 'level' (the
        number of braces in the enclosing markers, zero for the top-level
        page), and the region 'type' (from any #! header, None if absent).
        """

        Container.__init__(self, nodes)
        self.level = level
        self.type = type

    def have_end(self, s):

        "Return whether 's' is the closing marker for this region."

        return self.level and s.startswith("}") and self.level == len(s)

    def is_transparent(self):

        """
        Return whether the region contents are wiki markup, this being the
        case for the top-level page and for regions of a transparent type.
        """

        return not self.level or self.type in self.transparent_region_types

    def __repr__(self):
        return "Region(%r, %r, %r)" % (self.nodes, self.level, self.type)

    def prettyprint(self, indent=""):

        "Return a string showing the region and its nodes, using 'indent'."

        l = ["%sRegion: level=%d type=%s" % (indent, self.level, self.type)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise the region and its nodes using the 'out' serialiser."

        out.start_region(self.level, self.type)
        for node in self.nodes:
            node.to_string(out)
        out.end_region(self.level, self.type)

class Block(Container):

    "A block in the page."

    def __init__(self, nodes, final=True):

        """
        Initialise the block with the given 'nodes'. The 'final' flag is unset
        by the parser on blocks that were terminated by a paragraph break.
        """

        Container.__init__(self, nodes)
        self.final = final

    def __repr__(self):
        return "Block(%r)" % self.nodes

    def prettyprint(self, indent=""):

        "Return a string showing the block and its nodes, using 'indent'."

        l = ["%sBlock: final=%s" % (indent, self.final)]
        for node in self.nodes:
            l.append(node.prettyprint(indent + "  "))
        return "\n".join(l)

    def to_string(self, out):

        "Serialise the block and its nodes using the 'out' serialiser."

        out.start_block(self.final)
        for node in self.nodes:
            node.to_string(out)
        out.end_block(self.final)

class Text:

    "A text node."

    def __init__(self, s):

        "Initialise the node with the text 's'."

        self.s = s

    def merge(self, text):

        "Append the contents of 'text' to this node."

        self.s += text.s

    def __repr__(self):
        return "Text(%r)" % self.s

    def prettyprint(self, indent=""):

        "Return a string showing the text, using 'indent'."

        return "%sText: %r" % (indent, self.s)

    def to_string(self, out):

        "Serialise the text using the 'out' serialiser."

        out.text(self.s)



# Serialisation.

class Serialiser:

    "General serialisation support."

    def __init__(self, out):

        "Initialise with 'out', a callable invoked with each output string."

        self.out = out

class MoinSerialiser(Serialiser):

    "Serialisation of the page back to Moin markup."

    def start_region(self, level, type):
        out = self.out
        if level:
            out("{" * level)    # marker
        if type and level:
            out("#!%s\n" % type)    # header

    def end_region(self, level, type):
        out = self.out
        if level:
            out("}" * level)    # marker

    def start_block(self, final):
        pass

    def end_block(self, final):

        # Non-final blocks were followed by a paragraph break in the input,
        # so restore the blank line.

        if not final:
            self.out("\n")

    def text(self, s):
        self.out(s)

class HTMLSerialiser(Serialiser):

    "Serialisation of the page as HTML."

    def start_region(self, level, type):
        l = []
        out = l.append
        if level:
            out("level-%d" % level)    # marker

        # NOTE: Encode type details for CSS.

        if type:
            out("type-%s" % escape(type, True))    # header

        self.out("<span class='%s'>" % " ".join(l))

    def end_region(self, level, type):
        self.out("</span>")

    def start_block(self, final):
        self.out("<p>")

    def end_block(self, final):
        self.out("</p>")

    def text(self, s):
        self.out(escape(s))



# Tokenising functions.

class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s):

        "Initialise the stream with the string 's'."

        self.s = s
        self.pos = 0

        # Details of the most recent match, if any.

        self.match = None
        self.matching = None

    def read_until(self, pattern_names, remaining=True):

        """
        Find the first match for the given 'pattern_names'. Return the text
        preceding any match, the remaining text if no match was found, or None
        if no match was found and 'remaining' is given as a false value.
        """

        first = None

        # Discard any previous match so that a failed search does not leave a
        # stale match behind for read_match to reuse.

        self.match = None
        self.matching = None

        # Find the pattern matching earliest in the text.

        for pattern_name in pattern_names:
            match = patterns[pattern_name].search(self.s, self.pos)
            if match:
                start, end = match.span()
                if self.matching is None or start < first:
                    first = start
                    self.matching = pattern_name
                    self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:first]

    def read_match(self):

        """
        Return the matched text, updating the position in the stream. Where
        no match was found, return None and move the position to the end of
        the stream.
        """

        if self.match:
            _start, self.pos = self.match.span()
            s = self.match.group(1)
            return s
        else:
            self.pos = len(self.s)
            return None



# Parser functions.

def parse_page(s):

    """
    Parse page text 's'. Pages consist of regions delimited by markers.
    Return the top-level Region.
    """

    return parse_region(TokenStream(s))

def parse_region(items, level=0):

    """
    Parse the data provided by 'items' to populate a region at the given
    'level'.
    """

    region = Region([], level)

    # Parse section headers.

    parse_region_header(items, region)

    if region.is_transparent():
        parse_region_wiki(items, region)
    else:
        parse_region_opaque(items, region)

    return region

def parse_region_header(items, region):

    """
    Parse the region header from the 'items', setting it for the given 'region'.
    """

    # An empty preceding string means the header occurs immediately.

    if items.read_until(["header"], False) == "": # None means no header
        region.type = items.read_match()

def parse_region_wiki(items, region):

    "Parse the data provided by 'items' to populate a wiki 'region'."

    # Process exposed text and sections.

    block = new_block(region)

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["break", "regionstart", "regionend"])
        if preceding:
            block.append(Text(preceding))

        # Obtain any feature.

        feature = items.read_match()

        # End of input.

        if not items.matching:
            break

        # Start a section if an appropriate marker is given.

        if items.matching == "regionstart":
            block = parse_region_within_wiki_region(items, region)

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        elif items.matching == "regionend" and region.have_end(feature):
            break

        # Start a new block if a paragraph break is found.

        elif items.matching == "break":
            block = parse_block(items, region)

        # Add any inappropriate marker to the text.

        else:
            block.append(Text(feature))

    region.normalise()

def parse_region_within_wiki_region(items, region):

    """
    Parse a subregion of the wiki 'region' from 'items', adding it to the
    region and returning a new block for any subsequent text.
    """

    # Parse the section and start a new block after the section.

    feature = items.read_match()
    region.append(parse_region(items, len(feature)))
    return new_block(region)

def parse_block(items, region):

    "Upon a paragraph break, start and return a new block in 'region'."

    # Mark any previous block as not being the final one in a sequence.

    block = region.nodes[-1]
    block.final = False
    return new_block(region)

def parse_region_opaque(items, region):

    "Parse the data provided by 'items' to populate an opaque 'region'."

    # Process exposed text and the section end.

    while True:

        # Obtain text before any marker or the end of the input.

        preceding = items.read_until(["regionend"])
        if preceding:
            region.append(Text(preceding))

        # Obtain any marker.

        marker = items.read_match()

        # End of input.

        if not marker:
            break

        # Interpret the given marker, closing the current section if the
        # given marker is the corresponding end marker for the current
        # section.

        if region.have_end(marker):
            break

        # Add any inappropriate marker to the text.

        else:
            region.append(Text(marker))

    region.normalise()

def new_block(region):

    "Start a new block in 'region'."

    block = Block([])
    region.append(block)
    return block



# Top-level functions.

parse = parse_page

def serialise(doc, serialiser=MoinSerialiser):

    """
    Serialise the document 'doc' using the given 'serialiser' class, returning
    the serialised text.
    """

    l = []
    doc.to_string(serialiser(l.append))
    return "".join(l)

# vim: tabstop=4 expandtab shiftwidth=4