MoinLight (file moinformat/__init__.py)

     1 #!/usr/bin/env python     2      3 """     4 Moin wiki format parser.     5      6 Copyright (C) 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from moinformat.serialisers import serialise    23 from moinformat.tree import Block, Break, DefItem, DefTerm, FontStyle, Heading, \    24                             Larger, ListItem, Monospace, Region, Rule, Smaller, \    25                             Subscript, Superscript, TableAttr, TableAttrs, \    26                             TableCell, TableRow, Text, Underline    27 import re    28     29 # Regular expressions.    30     31 syntax = {    32     # Page regions:    33     "regionstart"   : r"((^\s*)([{]{3,}))",                         # {{{...    34     "regionend"     : r"^\s*([}]{3,})",                             # }}}...    35     "header"        : r"#!(.*?)\n",                                 # #! char-excl-nl    36     37     # Region contents:    38     # Line-oriented patterns:    39                       # blank line    40     "break"         : r"^(\s*?)\n",    41                       # ws... expecting text ::    42     "defterm"       : r"^(\s+)(?=.+?::)",    43                       # ws... expecting :: ws...    44     "defterm_empty" : r"^(\s+)(?=::\s+)",    45                       # [ws...] =... ws... 
expecting headingend    46     "heading"       : r"^(\s*)(?P<x>=+)(\s+)(?=.*?\s+(?P=x)\s*\n)",    47                       # ws... list-item [ws...]    48     "listitem"      : r"^(\s+)(\*)(\s*)",    49                       # ws... number-item ws...    50     "listitem_num"  : r"^(\s+)(\d+\.)(\s+)",    51                       # ws... alpha-item ws...    52     "listitem_alpha": r"^(\s+)([aA]\.)(\s+)",    53                       # ws... roman-item ws...    54     "listitem_roman": r"^(\s+)([iI]\.)(\s+)",    55                       # ws... dot-item [ws...]    56     "listitem_dot"  : r"^(\s+)(\.)(\s*)",    57                       # ||    58     "tablerow"      : r"^\|\|",    59     60     # Region contents:    61     # Inline patterns:    62     "fontstyle"     : r"('{2,6})",    63     "larger"        : r"~\+",    64     "monospace"     : r"`",    65     "rule"          : r"(-----*)",                                  # ----...    66     "smaller"       : r"~-",    67     "sub"           : r",,",    68     "super"         : r"\^",    69     "underline"     : r"__",    70     71     # Inline contents:    72     "largerend"     : r"\+~",    73     "monospaceend"  : r"`",    74     "smallerend"    : r"-~",    75     "subend"        : r",,",    76     "superend"      : r"\^",    77     "underlineend"  : r"__",    78     79     # Heading contents:    80     "headingend"    : r"(\s+)(=+)(\s*\n)",                          # ws... =... [ws...] nl    81     82     # List contents:    83     "deftermend"    : r"::(\s*?\n)",    84     "deftermsep"    : r"::(\s+)",    85     "listitemend"   : r"^",                                         # next line    86     87     # Table contents:    88     "tableattrs"    : r"<",    89     "tablecell"     : r"\|\|",    90     "tableend"      : r"(\s*?)^",                                   # [ws...] 
next line    91     92     # Table attributes:    93     "tableattrsend" : r">",    94     "halign"        : r"([(:)])",    95     "valign"        : r"([v^])",    96     "colour"        : r"(\#[0-9A-F]{6})",    97     "colspan"       : r"-(\d+)",    98     "rowspan"       : r"\|(\d+)",    99     "width"         : r"(\d+%)",   100     "attrname"      : r"((?![-\d])[-\w]+)",                         # not-dash-or-digit dash-or-word-char...   101     "attrvalue"     : r"""=(?P<x>['"])(.*?)(?P=x)""",   102     }   103    104 # Define pattern details.   105    106 table_pattern_names = ["attrname", "colour", "colspan", "halign", "rowspan", "tableattrsend", "valign", "width"]   107    108 inline_pattern_names = ["fontstyle", "larger", "monospace", "smaller", "sub", "super", "underline"]   109    110 def inline_patterns_for(name):   111     names = inline_pattern_names[:]   112     names[names.index(name)] = "%send" % name   113     return names   114    115 # Define patterns for the regular expressions.   116    117 patterns = {}   118 for name, value in syntax.items():   119     patterns[name] = re.compile(value, re.UNICODE | re.MULTILINE)   120    121    122    123 # Tokenising functions.   124    125 class TokenStream:   126    127     "A stream of tokens taken from a string."   128    129     def __init__(self, s):   130         self.s = s   131         self.pos = 0   132         self.match = None   133         self.matching = None   134    135     def rewind(self, length):   136    137         "Rewind in the string by 'length'."   138    139         self.pos -= min(length, self.pos)   140    141     def read_until(self, pattern_names, remaining=True):   142    143         """   144         Find the first match for the given 'pattern_names'. Return the text   145         preceding any match, the remaining text if no match was found, or None   146         if no match was found and 'remaining' is given as a false value.   
147         """   148    149         first = None   150         self.matching = None   151    152         # Find the first matching pattern.   153    154         for pattern_name in pattern_names:   155             match = patterns[pattern_name].search(self.s, self.pos)   156             if match:   157                 start, end = match.span()   158                 if self.matching is None or start < first:   159                     first = start   160                     self.matching = pattern_name   161                     self.match = match   162    163         if self.matching is None:   164             if remaining:   165                 return self.s[self.pos:]   166             else:   167                 return None   168         else:   169             return self.s[self.pos:first]   170    171     def read_match(self, group=1):   172    173         """   174         Return the matched text, updating the position in the stream. If 'group'   175         is specified, the indicated group in a match will be returned.   176         Typically, group 1 should contain all pertinent data, but groups defined   177         within group 1 can provide sections of the data.   178         """   179    180         if self.match:   181             _start, self.pos = self.match.span()   182             try:   183                 return self.match.group(group)   184             except IndexError:   185                 return ""   186         else:   187             self.pos = len(self.s)   188             return None   189    190    191    192 # Parser functions.   193    194 def parse_page(s):   195    196     """   197     Parse page text 's'. Pages consist of regions delimited by markers.   198     """   199    200     return parse_region(TokenStream(s))   201    202 def parse_region(items, level=0, indent=0):   203    204     """   205     Parse the data provided by 'items' to populate a region with the given   206     'level' at the given 'indent'.   
207     """   208    209     region = Region([], level, indent)   210    211     # Parse section headers.   212    213     parse_region_header(items, region)   214    215     # Parse section body.   216    217     if region.is_transparent():   218         parse_region_wiki(items, region)   219     else:   220         parse_region_opaque(items, region)   221    222     return region   223    224 def parse_region_header(items, region):   225    226     """   227     Parse the region header from the 'items', setting it for the given 'region'.   228     """   229    230     if items.read_until(["header"], False) == "": # None means no header   231         region.type = items.read_match()   232    233 def parse_region_wiki(items, region):   234    235     "Parse the data provided by 'items' to populate a wiki 'region'."   236    237     new_block(region)   238     parse_region_details(items, region, inline_pattern_names + [   239         "break", "heading",   240         "defterm", "defterm_empty",   241         "listitem", "listitem_alpha", "listitem_dot", "listitem_num",   242         "listitem_roman",   243         "regionstart", "regionend",   244         "rule",   245         "tablerow",   246         ])   247    248 def parse_region_opaque(items, region):   249    250     "Parse the data provided by 'items' to populate an opaque 'region'."   251    252     parse_region_details(items, region, ["regionend"])   253    254 def parse_region_details(items, region, pattern_names):   255    256     "Parse 'items' within 'region' searching using 'pattern_names'."   257    258     try:   259         while True:   260    261             # Obtain text before any marker or the end of the input.   262    263             preceding = items.read_until(pattern_names)   264             if preceding:   265                 region.append_inline(Text(preceding))   266    267             # End of input.   
268    269             if not items.matching:   270                 break   271    272             # Obtain any feature.   273    274             feature = items.read_match()   275             handler = handlers.get(items.matching)   276    277             # Handle each feature or add text to the region.   278    279             if handler:   280                 handler(items, region)   281             else:   282                 region.append_inline(Text(feature))   283    284     except StopIteration:   285         pass   286    287     region.normalise()   288    289 def end_region(items, region):   290    291     "End the parsing of 'region'."   292    293     raise StopIteration   294    295 def parse_attrname(items, attrs):   296    297     "Handle an attribute name within 'attrs'."   298    299     name = items.read_match()   300     attr = TableAttr(name)   301    302     preceding = items.read_until(["attrvalue"], False)   303     if preceding == "":   304         attr.quote = items.read_match(1)   305         attr.value = items.read_match(2)   306    307     attrs.append(attr)   308    309 def parse_break(items, region):   310    311     "Handle a paragraph break within 'region'."   312    313     region.add(Break())   314     new_block(region)   315    316 def parse_defitem(items, region, extra=""):   317    318     "Handle a definition item within 'region'."   319    320     pad = items.read_match(1)   321     item = DefItem([], pad, extra)   322     parse_region_details(items, item, ["listitemend"])   323     region.add(item)   324     new_block(region)   325    326 def parse_defterm(items, region):   327    328     "Handle a definition term within 'region'."   
329    330     pad = items.read_match(1)   331     term = DefTerm([], pad)   332     parse_region_details(items, term, ["deftermend", "deftermsep"])   333     region.add(term)   334     if items.matching == "deftermsep":   335         parse_defitem(items, region)   336    337 def parse_defterm_empty(items, region):   338    339     "Handle an empty definition term within 'region'."   340    341     extra = items.read_match(1)   342     parse_region_details(items, region, ["deftermsep"])   343     parse_defitem(items, region, extra)   344    345 def parse_fontstyle(items, region):   346    347     "Handle emphasis and strong styles."   348    349     n = len(items.read_match(1))   350    351     # Handle endings.   352    353     if isinstance(region, FontStyle):   354         emphasis = n in (2, 4, 5)   355         strong = n in (3, 5, 6)   356         active = True   357    358         if region.emphasis and emphasis:   359             active = region.close_emphasis()   360             n -= 2   361         if region.strong and strong:   362             active = region.close_strong()   363             n -= 3   364    365         if not active:   366             if n:   367                 items.rewind(n)   368             raise StopIteration   369    370         elif not n:   371             return   372    373     # Handle new styles.   374    375     emphasis = n in (2, 4, 5)   376     strong = n in (3, 5, 6)   377     double = n in (4, 6)   378    379     span = FontStyle([], emphasis, strong)   380     if not double:   381         parse_region_details(items, span, inline_pattern_names)   382     region.append_inline(span)   383    384 def parse_halign(items, attrs):   385    386     "Handle horizontal alignment within 'attrs'."   
387    388     value = items.read_match()   389     attr = TableAttr("halign", value == "(" and "left" or value == ")" and "right" or "center", True)   390     attrs.append(attr)   391    392 def parse_heading(items, region):   393    394     "Handle a heading."   395    396     start_extra = items.read_match(1)   397     level = len(items.read_match(2))   398     start_pad = items.read_match(3)   399     heading = Heading([], level, start_extra, start_pad)   400     parse_region_details(items, heading, ["headingend"] + inline_pattern_names)   401     region.add(heading)   402     new_block(region)   403    404 def parse_heading_end(items, heading):   405    406     "Handle the end of a heading."   407    408     level = len(items.read_match(2))   409     if heading.level == level:   410         heading.end_pad = items.read_match(1)   411         heading.end_extra = items.read_match(3)   412         raise StopIteration   413    414 def parse_listitem(items, region):   415    416     "Handle a list item marker within 'region'."   417    418     indent = len(items.read_match(1))   419     marker = items.read_match(2)   420     space = items.read_match(3)   421     item = ListItem([], indent, marker, space)   422     parse_region_details(items, item, ["listitemend"])   423     region.add(item)   424     new_block(region)   425    426 def parse_rule(items, region):   427    428     "Handle a horizontal rule within 'region'."   429    430     length = len(items.read_match(1))   431     rule = Rule(length)   432     region.add(rule)   433     new_block(region)   434    435 def parse_section(items, region):   436    437     "Handle the start of a new section within 'region'."   438    439     # Parse the section and start a new block after the section.   
440    441     indent = len(items.read_match(2))   442     level = len(items.read_match(3))   443     region.add(parse_region(items, level, indent))   444     new_block(region)   445    446 def parse_section_end(items, region):   447    448     "Handle the end of a new section within 'region'."   449    450     feature = items.read_match()   451     if region.have_end(feature):   452         raise StopIteration   453     else:   454         region.append_inline(Text(feature))   455    456 def parse_table_attrs(items, cell):   457    458     "Handle the start of table attributes within 'cell'."   459    460     attrs = TableAttrs([])   461     parse_region_details(items, attrs, table_pattern_names)   462     cell.attrs = attrs   463    464 def parse_table_row(items, region):   465    466     "Handle the start of a table row within 'region'."   467    468     row = TableRow([])   469    470     while True:   471         cell = TableCell([])   472         parse_region_details(items, cell, ["tableattrs", "tablecell", "tableend"])   473    474         # Handle the end of the row.   475    476         if items.matching == "tableend":   477             trailing = items.read_match()   478    479             # If the cell was started but not finished, convert the row into text.   480    481             if not row.nodes or not cell.empty():   482                 for node in row.nodes:   483                     region.append_inline(Text(serialise(node)))   484                 region.append_inline(Text(serialise(cell)))   485                 region.append_inline(Text(trailing))   486    487                 new_block(region)   488                 return   489    490             # Append the final cell, if not empty.   491    492             else:   493                 row.trailing = trailing   494    495                 if not cell.empty():   496                     row.append(cell)   497                 break   498    499         # A cell separator has been found.   
500    501         row.append(cell)   502    503     region.add(row)   504     new_block(region)   505    506 def parse_valign(items, attrs):   507    508     "Handle vertical alignment within 'attrs'."   509    510     value = items.read_match()   511     attr = TableAttr("valign", value == "^" and "top" or "bottom", True)   512     attrs.append(attr)   513    514 # Inline formatting handlers.   515    516 def parse_inline(items, region, cls, pattern_name):   517    518     "Handle an inline region."   519    520     span = cls([])   521     parse_region_details(items, span, inline_patterns_for(pattern_name))   522     region.append_inline(span)   523    524 parse_larger = lambda items, region: parse_inline(items, region, Larger, "larger")   525 parse_monospace = lambda items, region: parse_inline(items, region, Monospace, "monospace")   526 parse_smaller = lambda items, region: parse_inline(items, region, Smaller, "smaller")   527 parse_sub = lambda items, region: parse_inline(items, region, Subscript, "sub")   528 parse_super = lambda items, region: parse_inline(items, region, Superscript, "super")   529 parse_underline = lambda items, region: parse_inline(items, region, Underline, "underline")   530    531 # Table attribute handlers.   532    533 def parse_table_attr(items, attrs, pattern_name):   534    535     "Handle a table attribute."   536    537     value = items.read_match()   538     attrs.append(TableAttr(pattern_name, value, True))   539    540 parse_colour = lambda items, cell: parse_table_attr(items, cell, "colour")   541 parse_colspan = lambda items, cell: parse_table_attr(items, cell, "colspan")   542 parse_rowspan = lambda items, cell: parse_table_attr(items, cell, "rowspan")   543 parse_width = lambda items, cell: parse_table_attr(items, cell, "width")   544    545 # Pattern handlers.   
# Each pattern name is mapped to the function invoked when that pattern is
# matched. Handlers take the token stream and the node being populated.

handlers = {
    # Fallback entry for the absence of a pattern name.
    None : end_region,

    # Page regions.
    "regionstart" : parse_section,
    "regionend" : parse_section_end,

    # Line-oriented features.
    "break" : parse_break,
    "heading" : parse_heading,
    "headingend" : parse_heading_end,
    "rule" : parse_rule,

    # Definition lists.
    "defterm" : parse_defterm,
    "defterm_empty" : parse_defterm_empty,
    "deftermend" : end_region,
    "deftermsep" : end_region,

    # List items.
    "listitem" : parse_listitem,
    "listitem_alpha" : parse_listitem,
    "listitem_dot" : parse_listitem,
    "listitem_num" : parse_listitem,
    "listitem_roman" : parse_listitem,
    "listitemend" : end_region,

    # Inline spans and their end markers.
    "fontstyle" : parse_fontstyle,
    "larger" : parse_larger,
    "largerend" : end_region,
    "monospace" : parse_monospace,
    "monospaceend" : end_region,
    "smaller" : parse_smaller,
    "smallerend" : end_region,
    "sub" : parse_sub,
    "subend" : end_region,
    "super" : parse_super,
    "superend" : end_region,
    "underline" : parse_underline,
    "underlineend" : end_region,

    # Tables.
    "tablerow" : parse_table_row,
    "tablecell" : end_region,
    "tableend" : end_region,

    # Table attributes.
    "tableattrs" : parse_table_attrs,
    "tableattrsend" : end_region,
    "attrname" : parse_attrname,
    "colour" : parse_colour,
    "colspan" : parse_colspan,
    "halign" : parse_halign,
    "rowspan" : parse_rowspan,
    "valign" : parse_valign,
    "width" : parse_width,
    }

def new_block(region):

    "Add a new, empty block to 'region'."

    region.add(Block([]))



# Top-level functions.

parse = parse_page

# vim: tabstop=4 expandtab shiftwidth=4
MoinLight

moinformat/__init__.py

moinformat/__init__.py