ConfluenceConverter (annotate parser.py in cdbfc82274f8)

ConfluenceConverter

Annotated parser.py

8:cdbfc82274f8

2012-04-01

Paul Boddie

Added copyright and licensing boilerplate.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@8	6	Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	   1. First, lists are identified.
paul@7	30	   2. Additionally, other block-like elements are identified.
paul@7	31	3. Each block is then parsed.
paul@7	32	"""
paul@7	33
paul@6	34	import re
paul@6	35
paul@6	36	# Section extraction.
paul@6	37
# A section is "{type}...{type}" where both markers carry the same type text.
# The lookbehind stops a "{" that directly follows another "{" from opening a
# section, and the lazy ".*?" ends the section at the nearest closing marker.
sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text found outside any "{type}...{type}" section is given the type None.
    Such a region is emitted before every section and once more after the last
    one, even where the text involved is empty. Sections themselves are
    described by get_section_details.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Plain text between the previous section (or the start) and this one.

        regions.append((None, s[last:start]))

        # The section itself, split into its type and enclosed text.

        regions.append(get_section_details(s[start:end]))
        last = end

    # Whatever follows the final section.

    regions.append((None, s[last:]))
    return regions
paul@6	57
paul@7	58	# Section inspection.
paul@7	59
# The type is matched lazily so that it stops at the first "}", whereas the
# body is matched greedily (across newlines, thanks to DOTALL) and then backs
# off until the text ends with a closing marker bearing the same type.
section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    Where 's' starts with a "{type}...{type}" section, the type is the text
    between the braces of the opening marker and the text is everything
    between the opening and closing markers. Otherwise, (None, s) is returned.
    """

    match = section_regexp.match(s)
    if match:
        return match.group("sectiontype"), match.group("section")
    else:
        return None, s
paul@6	72
paul@7	73	# List extraction.
paul@7	74
# A list starts on a line beginning with one of the markers "*", "#" or "-"
# (possibly repeated for nesting) and continues over all directly following
# lines that begin with the same first marker. Each line may be terminated by
# a newline or by the end of the text, so that a list ending the text without
# a trailing newline is still recognised.
list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"
list_regexp = re.compile(list_regexp_str, re.MULTILINE)

def get_lists(s):

    """
    Extract lists from the given string 's'.

    Return a list of (type, text) tuples in document order where the type is
    "list" for the text of a list and None for the text outside lists. A None
    entry is emitted before every list and once more after the last list, even
    where the text involved is empty.
    """

    last = 0
    blocks = []
    for match in list_regexp.finditer(s):
        start, end = match.span()

        # Text preceding this list, followed by the list itself.

        blocks.append((None, s[last:start]))
        blocks.append(("list", s[start:end]))
        last = end

    # Text following the final list.

    blocks.append((None, s[last:]))
    return blocks
paul@7	93
paul@7	94	# Block extraction.
paul@7	95
# One or more consecutive lines containing nothing but whitespace.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, flags=re.MULTILINE)

def get_basic_blocks(s):

    """
    Split the given string 's' wherever runs of blank lines occur and return
    the pieces found in between as a list. The blank lines themselves are
    discarded, as is any piece consisting only of whitespace.
    """

    blocks = []
    for chunk in block_regexp.split(s):

        # Splitting can leave empty or whitespace-only pieces at the edges.

        if chunk.strip():
            blocks.append(chunk)

    return blocks
paul@7	107
paul@7	108	# Block inspection.
paul@7	109
# A line of the form "hN. text" (a heading, N being a digit) or "bq. text".
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"
blocktext_regexp = re.compile(blocktext_regexp_str, re.MULTILINE)

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples in document order. The type is
    "list" for lists found by get_lists, the matched prefix (such as "h1" or
    "bq") for lines recognised by blocktext_regexp, and None for any other
    non-blank text.
    """

    blocks = []

    for blocktype, blocktext in get_lists(s):

        # Collect list blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            for block in get_basic_blocks(blocktext):
                last = 0
                for match in blocktext_regexp.finditer(block):
                    start, end = match.span()

                    # Add preceding non-block text.

                    preceding = block[last:start]
                    if preceding.strip():
                        blocks.append((None, preceding))

                    # Add the subblock.

                    blocks.append((match.group("type"), match.group("text")))
                    last = end

                # Add trailing non-block text.

                trailing = block[last:]
                if trailing.strip():
                    blocks.append((None, trailing))

    return blocks
paul@7	155
# A single list item line: one or more list markers ("*", "#" or "-"), then
# optional whitespace, then the item text. Group 1 holds the last marker
# matched and group 2 holds the item text.
listitem_regexp_str = r"^([*#-])+\s*(.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)
paul@7	158
if __name__ == "__main__":
    import sys

    # Read a complete wiki page from standard input and show how it is broken
    # down into regions and, for text outside sections, into blocks.

    s = sys.stdin.read()

    for region_type, text in get_regions(s):

        # Text outside sections is split further into blocks.

        if region_type is None:
            for blocktype, blocktext in get_blocks(text):

                # Single-argument print calls give the same output whether
                # print is a statement or a function.

                print("Block type: %s" % blocktype)
                print(blocktext)
                print("")

        # Sections are shown verbatim along with their type.

        else:
            print("Region type: %s" % region_type)
            print(text)
            print("")

        # Separate the output for each region.

        print("-" * 60)
paul@6	176
paul@6	177	# vim: tabstop=4 expandtab shiftwidth=4