ConfluenceConverter (annotate parser.py in 192a2a30aff4)

ConfluenceConverter

Annotated parser.py

7:192a2a30aff4

2012-04-01

Paul Boddie

Made the section extraction slightly more strict when recognising the markers. Added extraction of other block types such as lists, headings and blockquotes.

paul@6	1	#!/usr/bin/env python
paul@6	2
"""
Confluence Wiki syntax parsing.

1. Wiki pages are first split up into regions.
2. Then, within these regions, the text is split into blocks.
   1. First, lists are identified.
   2. Additionally, other block-like elements are identified.
3. Each block is then parsed.
"""
paul@7	12
paul@6	13	import re
paul@6	14
paul@6	15	# Section extraction.
paul@6	16
# A section is an opening "{type}" marker, any text (newlines included, hence
# DOTALL) and the nearest following "{type}" marker naming the same type. The
# lookbehind rejects an opening marker that is directly preceded by "{".
sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text outside any section is reported with a type of None. Such a region is
    emitted before every section and once after the last one, even when the
    text involved is empty. Each section found is described by whatever
    get_section_details returns for it.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Plain text between the previous section (or the start) and this one.

        regions.append((None, s[last:start]))

        # The section itself, split into its type and enclosed text.

        regions.append(get_section_details(s[start:end]))
        last = end

    # Whatever follows the final section (the whole of 's' if none matched).

    regions.append((None, s[last:]))
    return regions
paul@6	36
paul@7	37	# Section inspection.
paul@7	38
# Applied to a single, already-extracted section: the type is the text inside
# the opening braces and the section text is everything up to the last closing
# marker of the same type. DOTALL lets the section text span several lines.
section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    The text is the content found between the opening and closing markers. If
    's' does not start with a recognisable section, (None, s) is returned so
    that the text is passed through unchanged.
    """

    match = section_regexp.match(s)
    if match:
        return match.group("sectiontype"), match.group("section")
    else:
        return None, s
paul@6	51
paul@7	52	# List extraction.
paul@7	53
# A list starts on a line beginning with "#" or "-" (optionally followed by
# more such markers) and continues over every directly following line that
# begins with the same first marker character. The first line must end with a
# newline; later lines may end with a newline or the end of the string.
list_regexp_str = r"^(?P<listtype>[#-])[#-]*.*\n((?P=listtype).*(?:\n|$))*"
list_regexp = re.compile(list_regexp_str, re.MULTILINE)

def get_lists(s):

    """
    Extract lists from the given string 's'.

    Return a list of (type, text) tuples covering all of 's' in order. List
    text has the type "list"; the text before each list and after the last one
    has the type None and may be empty.
    """

    last = 0
    blocks = []
    for match in list_regexp.finditer(s):
        start, end = match.span()

        # Non-list text preceding this list, followed by the list itself.

        blocks.append((None, s[last:start]))
        blocks.append(("list", s[start:end]))
        last = end

    # Remaining non-list text.

    blocks.append((None, s[last:]))
    return blocks
paul@7	72
paul@7	73	# Block extraction.
paul@7	74
# One or more consecutive lines containing nothing but whitespace.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.

    Pieces consisting only of whitespace are dropped, so an empty or blank 's'
    yields an empty list. A block keeps the newline that ended its last line.
    """

    return [b for b in block_regexp.split(s) if b.strip()]
paul@7	86
paul@7	87	# Block inspection.
paul@7	88
# A heading ("h1." to "h9.") or blockquote ("bq.") occupying a single line:
# the marker, a full stop, whitespace and then the text of the element.
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples. Lists found by get_lists are
    passed through with their type. All other text is split into basic blocks,
    and within each of these, heading and blockquote lines become blocks of
    their own, typed with the marker (such as "h1" or "bq") and holding only
    the text after the marker. Remaining text is given the type None, and
    text consisting only of whitespace is discarded.
    """

    blocks = []

    for blocktype, blocktext in get_lists(s):

        # Collect list blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                last = 0
                for match in blocktext_regexp.finditer(block):
                    start, end = match.span()

                    # Add preceding non-block text.

                    preceding = block[last:start]
                    if preceding.strip():
                        blocks.append((None, preceding))

                    # Add the subblock.

                    blocks.append((match.group("type"), match.group("text")))
                    last = end

                # Add trailing non-block text.

                trailing = block[last:]
                if trailing.strip():
                    blocks.append((None, trailing))

    return blocks
paul@7	134
# A single list item line: one or more "#" or "-" markers, one whitespace
# character and the item text. Group 1 holds only the last marker character
# (the group is repeated); group 2 holds the item text.
listitem_regexp_str = r"^([#-])+\s(.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
paul@7	137
# Test program: read wiki text from standard input and print the regions and
# blocks found in it.

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()

    # Single-argument print calls are used so that the output is the same
    # under Python 2 (print statement) and Python 3 (print function).

    for regiontype, text in get_regions(s):

        # Text outside any section is broken down further into blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):
                print("Block type: %s" % (blocktype,))
                print(blocktext)
                print("")

        # Sections are shown as they are.

        else:
            print("Region type: %s" % (regiontype,))
            print(text)
            print("")

        # NOTE(review): the separator is assumed to follow every region; the
        # original indentation of this line could not be recovered.

        print("-" * 60)
paul@6	155
paul@6	156	# vim: tabstop=4 expandtab shiftwidth=4