MoinLight (annotate moinformat/parsers/common.py in 21bc17cf1000)

MoinLight

Annotated moinformat/parsers/common.py

51:21bc17cf1000

2018-07-14

Paul Boddie

Added list item renumbering support.

paul@32	1	#!/usr/bin/env python
paul@32	2
paul@32	3	"""
paul@32	4	Moin wiki parsing functionality.
paul@32	5
paul@45	6	Copyright (C) 2017, 2018 Paul Boddie <paul@boddie.org.uk>
paul@32	7
paul@32	8	This program is free software; you can redistribute it and/or modify it under
paul@32	9	the terms of the GNU General Public License as published by the Free Software
paul@32	10	Foundation; either version 3 of the License, or (at your option) any later
paul@32	11	version.
paul@32	12
paul@32	13	This program is distributed in the hope that it will be useful, but WITHOUT
paul@32	14	ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
paul@32	15	FOR A PARTICULAR PURPOSE. See the GNU General Public License for more
paul@32	16	details.
paul@32	17
paul@32	18	You should have received a copy of the GNU General Public License along with
paul@32	19	this program. If not, see <http://www.gnu.org/licenses/>.
paul@32	20	"""
paul@32	21
paul@43	22	from collections import defaultdict
paul@32	23	from moinformat.tree import Block, Region, Text
paul@33	24	import re
paul@33	25
paul@33	26	# Pattern management.
paul@33	27
# Character class matching any whitespace character except a newline. It is
# substituted for the \N shorthand in syntax definitions by get_patterns.

ws_excl_nl = r"[ \f\r\t\v]"

def get_patterns(syntax):

    r"""
    Define patterns for the regular expressions in the 'syntax' mapping. In each
    pattern, replace \N with a pattern for matching whitespace excluding
    newlines.

    Return a new dictionary mapping each name in 'syntax' to the compiled
    regular expression, compiled with the UNICODE and MULTILINE flags.
    """

    # NOTE: the docstring above is deliberately a raw string. In an ordinary
    # string literal, Python 3 treats \N as the start of a named Unicode
    # character escape and refuses to compile the module.

    flags = re.UNICODE | re.MULTILINE

    patterns = {}
    for name, value in syntax.items():
        value = value.replace(r"\N", ws_excl_nl)
        patterns[name] = re.compile(value, flags)
    return patterns
paul@33	43
def get_subset(d, keys):

    """
    Build and return a new dictionary holding only the entries of 'd' that are
    named in 'keys'. Every key must be present in 'd'; a missing key causes a
    KeyError to be raised by the lookup.
    """

    return dict((name, d[name]) for name in keys)
paul@36	52
paul@36	53
paul@32	54
paul@32	55	# Tokenising functions.
paul@32	56
class TokenStream:

    "A stream of tokens taken from a string."

    def __init__(self, s, pos=0):

        "Initialise the stream for the string 's', reading from 'pos'."

        self.s = s
        self.pos = pos

        # Match details.

        self.match = None
        self.queued = None
        self.match_start = None

        # The start of the current match has been recorded as 'start' by
        # read_until, so that name is retained and kept in step with
        # 'match_start' for any code that reads it.

        self.start = None

        # Pattern name details.

        self.matching = None

    def rewind(self, length):

        """
        Rewind in the string by 'length', stopping at the start of the string.
        """

        self.pos -= min(length, self.pos)

    def queue_match(self):

        """
        Queue the current match so that the next call to read_until presents it
        again instead of searching for a new match.
        """

        self.queued = self.match

    def read_until(self, patterns, remaining=True):

        """
        Find the first match for the given 'patterns'. Return the text preceding
        any match, the remaining text if no match was found, or None if no match
        was found and 'remaining' is given as a false value.

        The 'patterns' mapping associates pattern names with compiled regular
        expressions. The name of the earliest matching pattern is recorded in
        the 'matching' attribute, which is None if nothing matched.

        If a match has been queued using queue_match, no search is performed:
        the queued match and its pattern name are presented again.
        """

        if self.queued:
            self.match = self.queued
            self.queued = None
        else:
            # Discard any previous match so that a failed search cannot leave
            # stale details behind for read_match and match_groups.

            self.match = None
            self.match_start = self.start = None
            self.matching = None

            # Find the first matching pattern.

            for pattern_name, pattern in patterns.items():
                match = pattern.search(self.s, self.pos)
                if match:
                    start = match.start()
                    if self.matching is None or start < self.match_start:
                        self.match_start = self.start = start
                        self.matching = pattern_name
                        self.match = match

        if self.matching is None:
            if remaining:
                return self.s[self.pos:]
            else:
                return None
        else:
            return self.s[self.pos:self.match_start]

    def read_match(self, group=1):

        """
        Return the matched text, updating the position in the stream. If 'group'
        is specified, the indicated group in a match will be returned.
        Typically, group 1 should contain all pertinent data, but groups defined
        within group 1 can provide sections of the data.

        If the match does not define the requested group, an empty string is
        returned. If there is no current match, the position is moved to the
        end of the string and None is returned.
        """

        if self.match is not None:
            self.pos = self.match.end()
            try:
                return self.match.group(group)
            except IndexError:
                return ""
        else:
            self.pos = len(self.s)
            return None

    def match_groups(self):

        """
        Return the match groups of the current match, or an empty list if there
        is no current match.
        """

        if self.match is not None:
            return self.match.groups()
        else:
            return []
paul@51	148
paul@32	149
paul@32	150
paul@32	151	# Parser abstractions.
paul@32	152
class ParserBase:

    """
    Common parsing methods.

    This class refers to 'patterns' (a mapping from pattern names to compiled
    patterns) and 'handlers' (a mapping from pattern names to callables invoked
    with the parser and the region) without defining them, so they are
    presumably supplied by subclasses.
    """

    # Names of the patterns used to parse region content, or None if the
    # region content is not to be searched by parse_region_content.

    region_pattern_names = None

    def __init__(self, formats=None):

        """
        Initialise the parser with any given 'formats' mapping from region type
        names to parser objects. Entries are called with the 'formats' mapping
        by get_parser in order to obtain a parser.
        """

        self.formats = formats

    def get_parser(self, format_type):

        """
        Return a parser for 'format_type' or None if no suitable parser is found.
        """

        if not self.formats:
            return None

        cls = self.formats.get(format_type)
        if cls:
            return cls(self.formats)
        else:
            return None

    def get_patterns(self, pattern_names):

        "Return a mapping of the given 'pattern_names' to patterns."

        return get_subset(self.patterns, pattern_names)

    def get_items(self, s, pos=0):

        "Return a sequence of token items for 's' and 'pos'."

        return TokenStream(s, pos)

    def set_region(self, items, region):

        "Set the 'items' used to populate the given 'region'."

        self.items = items
        self.region = region

    def read_until(self, pattern_names, remaining=True):

        """
        Read the next portion of input, matching using 'pattern_names'. Return
        the text preceding any match, the remaining text if no match was found,
        or None if no match was found and 'remaining' is given as a false value.
        """

        # The 'remaining' flag must be forwarded: parse_region_header relies on
        # None being returned when no header is present.

        return self.items.read_until(self.get_patterns(pattern_names), remaining)

    def read_match(self, group=1):

        """
        Return the group of the matching pattern with the given 'group' number.
        """

        return self.items.read_match(group)

    def read_matching(self):

        "Return the name of the matching pattern."

        return self.items.matching

    def match_groups(self):

        "Return the groups in the match."

        return self.items.match_groups()

    # Parser methods invoked from other objects.

    def parse(self, s):

        """
        Parse page text 's'. Pages consist of regions delimited by markers.
        Return the top-level region, which is also retained as the 'region'
        attribute.
        """

        self.items = self.get_items(s)
        self.region = self.parse_region()
        return self.region

    def parse_region_content(self, items, region):

        "Parse the data provided by 'items' to populate a 'region'."

        self.set_region(items, region)

        # Define a block to hold text and start parsing.

        self.new_block(region)

        if self.region_pattern_names:
            self.parse_region_details(region, self.region_pattern_names)

    # Top-level parser handler methods.

    def parse_region(self, level=0, indent=0):

        """
        Parse the data to populate a region with the given 'level' at the given
        'indent'. Return the populated region.
        """

        region = Region([], level, indent)

        # Parse section headers, then parse according to region type.

        self.parse_region_header(region)
        self.parse_region_type(region)

        return region

    def parse_region_type(self, region):

        """
        Use configured parsers to parse 'region' based on its type.
        """

        # Find an appropriate parser given the type.

        parser = self.get_parser(region.type)

        if parser:
            parser.parse_region_content(self.items, region)

        # Otherwise, treat the section as opaque.

        else:
            self.parse_region_opaque(region)

    def parse_region_header(self, region):

        """
        Parse the region header, setting it on the 'region' object. A header is
        only recognised if it occurs immediately at the current position, this
        being indicated by an empty string of preceding text.
        """

        if self.read_until(["header"], False) == "": # None means no header
            region.type = self.read_match()

    def parse_region_opaque(self, region):

        "Parse the data to populate an opaque 'region'."

        region.transparent = False
        self.parse_region_details(region, ["regionend"])

    # Parsing utilities.

    def parse_region_details(self, region, pattern_names, strict=False):

        """
        Search 'region' using the 'pattern_names'. If 'strict' is set to a true
        value, forbid the accumulation of additional textual padding.

        Parsing continues until the end of the input is reached, until a handler
        ends the region by raising StopIteration (see end_region), or, when
        'strict' is set, until text or a feature without a handler is
        encountered. The region is normalised before returning.
        """

        try:
            while True:

                # Obtain text before any marker or the end of the input.

                preceding = self.read_until(pattern_names)
                if preceding:
                    if not strict:
                        region.append_inline(Text(preceding))
                    else:
                        break

                # End of input.

                if not self.read_matching():
                    break

                # Obtain any feature.

                feature = self.read_match()
                handler = self.handlers.get(self.read_matching())

                # Handle each feature or add text to the region.

                if handler:
                    handler(self, region)
                elif not strict:
                    region.append_inline(Text(feature))
                else:
                    break

        # Raised by end_region to leave the parsing loop.

        except StopIteration:
            pass

        region.normalise()

    def add_node(self, region, node):

        "Add to 'region' the given 'node'."

        region.add(node)

    def append_node(self, region, node):

        "Append to 'region' the given 'node'."

        region.append(node)

    def end_region(self, region):

        """
        End the parsing of 'region', breaking out of the parsing loop by raising
        StopIteration, which is caught by parse_region_details.
        """

        raise StopIteration

    def queue_match(self):

        "Queue the current match."

        self.items.queue_match()

    def new_block(self, region):

        "Start a new block in 'region'."

        self.add_node(region, Block([]))
paul@43	383
paul@32	384	# vim: tabstop=4 expandtab shiftwidth=4