ConfluenceConverter (annotate parser.py in f4b8774961a0)

ConfluenceConverter

Annotated parser.py

14:f4b8774961a0

2012-04-22

Paul Boddie

Merged list and "block element" (heading and blockquote) extraction, including also table extraction in order to simplify the processing hierarchy. Added elementary translation of list items and table rows, attempting to avoid link and image syntax being interpreted as table cell separators.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@8	6	Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	1. First, lists are identified.
paul@7	30	2. Additionally, other block-like elements are identified.
paul@7	31	3. Each block is then parsed.
paul@7	32	"""
paul@7	33
paul@6	34	import re
paul@6	35
paul@6	36	# Section extraction.
paul@6	37
# A section is a "{type}...{type}" span. The negative lookbehind prevents a
# match from starting immediately after another "{". DOTALL lets the body of
# a section extend over several lines.

sections_regexp_str = r"(?<!{)(?P<section>{(?P<type>[^{}\n]+)}.*?{(?P=type)})"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text found outside any section is reported with a type of None; such an
    entry is emitted before every section and once more after the last
    section, even where the text concerned is empty. The span of each section
    is handed to get_section_details, whose result is used as the region.
    """

    regions = []
    last = 0

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Plain text between the previous section (or the start) and this one.

        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end

    # Whatever remains after the final section.

    regions.append((None, s[last:]))
    return regions
paul@6	57
paul@7	58	# Section inspection.
paul@7	59
# NOTE(review): the "*" quantifiers below are reconstructed. Without them the
# pattern only accepts a section type of at most one character and a body of
# exactly one character, which contradicts the "[^{}\n]+" type accepted by the
# section extraction pattern. Confirm against the repository copy.

section_regexp_str = r"{(?P<sectiontype>[^\n]*?)}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    The type is the text between the opening braces and the text is everything
    between the opening "{type}" and the last closing "{type}" in 's'. Where
    's' does not start with such a section, (None, s) is returned so that the
    caller can treat it as ordinary text.
    """

    match = section_regexp.match(s)
    if match is None:
        return None, s
    return match.group("sectiontype"), match.group("section")
paul@6	72
paul@14	73	# Heading, table and list extraction.
paul@7	74
# A list is a line starting with "#" or "-" followed by any further lines
# starting with the same character. The first line must be terminated by a
# newline for the pattern to apply.
# NOTE(review): the "*" quantifiers in the list pattern are reconstructed from
# a copy in which they had been lost. Confirm against the repository copy.

list_regexp_str = r"^(?P<listtype>[#-])[#-]*.*\n((?P=listtype).*(?:\n|$))*"

# A table is a run of lines, each starting with "|" or "||" and containing at
# least one further occurrence of the separator that started the line.

table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# Headings ("h1." and so on) and blockquotes ("bq.") with their text.

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples in document order. Lists have the
    type "list" and tables the type "table", both with the complete matched
    text; headings and blockquotes have the matched type ("h1", "bq" and so
    on) with only the text following the prefix, or the complete matched text
    where that text is empty. Everything between such elements is reported
    with a type of None, including empty text before, between and after them.
    """

    blocks = []
    last = 0

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the three alternatives will have populated its group.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7	104
paul@7	105	# Block extraction.
paul@7	106
# One or more consecutive lines consisting only of whitespace.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.

    Fragments consisting solely of whitespace are discarded. Each remaining
    block keeps the newline ending its last line where one was present.
    """

    return [block for block in block_regexp.split(s) if block.strip()]
paul@7	118
paul@7	119	# Block inspection.
paul@7	120
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples. Typed elements reported by
    get_block_elements are passed through unchanged, whereas the untyped text
    between them is divided up by get_basic_blocks, with each resulting block
    being given a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend((None, block) for block in get_basic_blocks(blocktext))

    return blocks
paul@7	144
paul@14	145	# List item inspection.
paul@14	146
# The marker group covers the complete run of marker characters. Capturing
# only a single repeated character would retain just the last one, losing the
# nesting depth that the marker length conveys.

listitem_regexp_str = r"^(?P<marker>[#-]+)\s(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text'.

    Each line consisting of a run of "#" or "-" characters, a single
    whitespace character and the item text yields one tuple, the marker being
    the complete run of marker characters. Other lines are ignored.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]
paul@14	160
paul@14	161	# Table row inspection.
paul@14	162
# Links and images are recognised alongside cell separators so that any "|"
# appearing inside "[...]" or "!...!" is consumed as part of that construct
# instead of being taken for a separator.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"
content_regexp = re.compile(
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    "|"
    "(" + cellsep_regexp_str + ")"
    )

def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' containing at least one cell separator yields one
    tuple. The cellsep is the first separator found on the line ("|" or "||")
    and columns is the list of cell texts, excluding whatever precedes the
    first separator and whatever follows the last. Lines without any separator
    are omitted.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in content_regexp.finditer(line):
            start, end = match.span()

            # Plain text preceding this match belongs to the current cell.

            columns[-1] += line[last:start]

            # A separator closes the current cell and opens the next one, the
            # first separator seen deciding the kind of row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Links and images are kept verbatim within the current cell.

            else:
                columns[-1] += line[start:end]

            last = end

        columns[-1] += line[last:]

        # Discard the fragments outside the outermost separators.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14	203
paul@14	204	# General parsing and translation.
paul@14	205
# Output templates for headings and blockquotes, keyed by the block type
# found in the source text. Each template takes the block text as its only
# "%s" argument.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }
paul@11	215
# Output list symbols keyed by the final character of a source list marker.
# The empty key caters for an empty marker.

markers = {
    "" : "",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result consists of one space for each character in 'marker' followed
    by the symbol corresponding to the final character of 'marker'. An empty
    marker yields an empty string. A KeyError is raised where the final
    character has no entry in the markers table.
    """

    # Slicing, unlike indexing, yields "" for an empty marker, allowing the
    # empty entry in the table to be found instead of raising IndexError.

    return " " * len(marker) + markers[marker[-1:]]
paul@14	227
# Output cell separators keyed by the source separator. Both kinds of source
# separator are written as "||".

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed around the text of each cell, keyed by the source separator.
# Cells from rows using "||" are wrapped in "'''".

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    """
    Translate the given 'cellsep' to a suitable Moin representation.

    A KeyError is raised where 'cellsep' is neither "|" nor "||".
    """

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text'.

    The text is returned with the markup associated with 'cellsep' added
    before and after it. A KeyError is raised where 'cellsep' is neither "|"
    nor "||".
    """

    extra = cellextra[cellsep]
    return extra + text + extra
paul@14	249
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The 'out' object need only provide a write method accepting strings.
    Untyped regions are divided into blocks, each of which is translated and
    followed by a blank line; sections are written between "{{{" and "}}}",
    also followed by a blank line.
    """

    # Output is produced using out.write instead of the print statement, and
    # membership is tested using "in" instead of has_key, so that the function
    # works with both Python 2 and Python 3.

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktype in blocktypes:
                    out.write(blocktypes[blocktype] % blocktext + "\n")

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        out.write("%s %s\n" % (translate_marker(listmarker), listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        cells = [translate_cell(cellsep, column) for column in columns]
                        out.write(moinsep + moinsep.join(cells) + moinsep + "\n")

                # Handle anonymous blocks.

                else:
                    out.write(blocktext.rstrip() + "\n")

                # NOTE(review): the blank line is assumed to follow every
                # block, not only anonymous ones; the indentation of this
                # statement was lost in the copy under review - confirm.

                out.write("\n")

        # Handle sections.

        else:

            # The print statements formerly used here separated the items
            # with single spaces, but omitted the space before "}}}" where
            # the text ended with whitespace other than a space (typically a
            # newline). The same output is reproduced here.

            final = text[-1:]
            if final.isspace() and final != " ":
                separator = ""
            else:
                separator = " "

            out.write("{{{ " + text + separator + "}}}\n")
            out.write("\n")
paul@11	293
# When run as a program, translate standard input to standard output.

if __name__ == "__main__":
    import sys

    parse(sys.stdin.read(), sys.stdout)
paul@6	299
paul@6	300	# vim: tabstop=4 expandtab shiftwidth=4