ConfluenceConverter (annotate parser.py in 0a5ff722fee3)

ConfluenceConverter

Annotated parser.py

16:0a5ff722fee3

2012-04-23

Paul Boddie

Introduced general content translation more widely, attempting to translate links and images.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@8	6	Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	   1. First, lists are identified.
paul@7	30	   2. Additionally, other block-like elements are identified.
paul@7	31	3. Each block is then parsed.
paul@7	32	"""
paul@7	33
paul@6	34	import re
paul@6	35
# Section extraction.

# A section starts with a {type} or {type:options} marker that is not itself
# preceded by "{", and extends over the shortest run of text ending in a
# matching {type} marker. DOTALL lets the enclosed text span several lines.

sections_regexp_str = r"(?<!{){(?P<type>[^{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
paul@6	40
def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple of
    the form (type, text).

    Text found outside any section is given a type of None; such a region is
    always produced before each section and after the last one, even where the
    text involved is empty. Each section matched by 'sections_regexp' is
    described by the result of get_section_details for the matched text, whose
    type is a (sectiontype, options) tuple when the section is recognised.
    """

    last = 0
    regions = []

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Emit the plain text preceding the section, then the section itself.

        regions.append((None, s[last:start]))
        regions.append(get_section_details(s[start:end]))
        last = end

    # Emit whatever follows the final section (or the whole text if there were
    # no sections at all).

    regions.append((None, s[last:]))
    return regions
paul@6	57
# Section inspection.

# The type and the options are matched lazily so that they end at the first "}"
# of the opening marker, whereas the body is matched greedily so that it extends
# to the last closing {type} marker in the inspected text.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    Where 's' has the form {sectiontype:options}body{sectiontype}, the type is a
    tuple of the form (sectiontype, options), with options being None if the
    opening marker carried none, and the text is the body found between the
    markers. Otherwise, the type is None and the text is 's' itself.
    """

    match = section_regexp.match(s)
    if match is None:
        return None, s

    return (match.group("sectiontype"), match.group("options")), match.group("section")
paul@6	72
# Heading, table and list extraction.

# A list is a run of consecutive lines that all start with the same marker
# character. The first line may end at the end of the text as well as at a
# newline so that a final, unterminated single-item list is still recognised.

list_regexp_str = r"^(?P<listtype>[*#-])[*#-]*.*(?:\n|$)((?P=listtype).*(?:\n|$))*"

# A table is a run of consecutive lines that start with "|" or "||" and contain
# at least one more separator of the same kind.

table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# Headings ("h1." to "h9.") and blockquotes ("bq.") occupy a single line.

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples covering the whole of 's' in order.
    Lists have the type "list" and tables the type "table", with the text being
    the complete matched lines; headings and blockquotes have the prefix found
    in the text (such as "h1" or "bq") as their type, with the text being the
    remainder of the line. The text around and between such elements is given a
    type of None, and is included even where it is empty.
    """

    last = 0
    blocks = []

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the alternatives takes part in any given match, so the
        # named group that is set identifies the kind of element.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Lists and tables have no "text" group and so keep their full text, as
        # does a heading or blockquote whose text is empty.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7	104
# Block extraction.

# A separator is one or more consecutive lines containing nothing but
# whitespace.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines.

    Blocks consisting only of whitespace are discarded, so an empty or blank
    string yields an empty list. Each remaining block retains the newline that
    ended its last line, where one was present.
    """

    return [block for block in block_regexp.split(s) if block.strip()]
paul@7	118
# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples. Elements reported with a type
    by get_block_elements are passed through unchanged; the untyped text
    between them is divided up using get_basic_blocks, each resulting block
    being given a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks
paul@7	144
# List item inspection.

# The marker group spans the whole run of marker characters so that its length
# indicates the nesting depth of the item. Only spaces and tabs are skipped
# after the marker: \s would also consume the newline ending an empty item and
# thereby swallow the following line.

listitem_regexp_str = r"^(?P<marker>[*#-]+)[ \t]*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text'.

    Each line of 'text' starting with marker characters yields one tuple, the
    marker being the complete leading run of such characters and the text being
    the rest of the line with leading spaces and tabs removed. Lines not
    starting with a marker character are ignored.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]
paul@14	160
# Table row inspection.

# Links are written [text] and images !text!, both matched lazily so that each
# ends at the first closing character. The opening bracket is escaped instead of
# being written as a one-character set, since "[[" in a pattern provokes a
# nested set warning in later versions of Python.

link_regexp_str = r"\[(?P<linktext>.*?)\]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# In table rows, links and images are tried before cell separators so that a
# "|" appearing inside them is not taken to end the cell.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
paul@16	181
def _translate_link(parts):

    "Return a Moin link for the '|'-separated 'parts' of a Confluence link."

    # A single part is the target itself; otherwise the label comes first and
    # the target second.

    if len(parts) == 1:
        target = parts[0]
    else:
        target = parts[1]

    # The prefix must be chosen by inspecting the target, not the label.
    # NOTE: Proper detection of external links required.

    if target.startswith("http") or target.startswith("#"):
        prefix = ""
    elif target.startswith("^"):

        # The caret only marks the link as an attachment and is not part of
        # the filename.

        prefix = "attachment:"
        target = target[1:]
    else:
        prefix = "../"

    if len(parts) == 1:
        return "[[%s%s]]" % (prefix, target)
    elif len(parts) == 2:
        return "[[%s%s|%s]]" % (prefix, target, parts[0])
    else:
        return "[[%s%s|%s|title=%s]]" % (prefix, target, parts[0], parts[2])

def _translate_image(parts):

    "Return a Moin image for the '|'-separated 'parts' of a Confluence image."

    # The image location always comes first, with any options following it.
    # NOTE: Proper detection of external links required.

    target = parts[0]

    if target.startswith("http"):
        prefix = ""
    else:
        prefix = "attachment:"

    # NOTE: Proper options conversion required.

    if len(parts) == 1:
        return "{{%s%s}}" % (prefix, target)
    else:
        return "{{%s%s|%s}}" % (prefix, target, parts[1])

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' must provide the named groups "linktext" and "imagetext". A
    non-empty link is rewritten as [[target]], [[target|label]] or
    [[target|label|title=tip]], and a non-empty image as {{target}} or
    {{target|option}}. Targets not starting with "http" are prefixed: links
    with "../" unless they start with "#" (no prefix) or "^" (rewritten as an
    attachment), and images with "attachment:". Anything else, including empty
    links and images, is returned as matched.
    """

    if match.group("linktext"):
        return _translate_link(match.group("linktext").split("|"))

    elif match.group("imagetext"):
        return _translate_image(match.group("imagetext").split("|"))

    else:
        return match.group()
paul@16	226
def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' containing at least one cell separator yields a tuple
    in which cellsep is the first separator found on the line and columns is a
    list of the texts found between successive separators. Any text before the
    first separator or after the last one is discarded. Lines without
    separators are ignored.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(line):
            start, end = match.span()

            # Text preceding the match belongs to the current column.

            columns[-1] += line[last:start]

            # A separator ends the current column and starts a new one, the
            # first separator also deciding the kind of row.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Links and images are retained intact within the current column.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Drop the texts outside the first and last separators.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14	256
def translate_content(text):

    """
    Return a translation of the given 'text'.

    Every link or image found by 'content_regexp' is replaced by the result of
    translate_content_match, with all other text being left untouched.
    """

    # A callable replacement is given each match in turn and has its result
    # inserted literally, leaving the text between matches as it is.

    return content_regexp.sub(translate_content_match, text)
paul@16	272
# Translation helpers.

# Format strings, indexed by Confluence block prefix, into which the text of a
# heading or blockquote is substituted.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }
paul@11	284
# Moin list markers, indexed by Confluence list marker character.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result is indented by one space for each character of 'marker', and the
    kind of list is decided by the last character of 'marker'. A KeyError is
    raised if that character is not a known marker, and an IndexError if
    'marker' is empty.
    """

    return " " * len(marker) + markers[marker[-1]]
paul@14	296
# Moin cell separators, indexed by Confluence cell separator. Normal and
# heading cells share the same separator in the output.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed either side of the content of a cell, indexed by Confluence
# cell separator, so that heading cells are emphasised.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    """
    Translate the given 'cellsep' to a suitable Moin representation.

    A KeyError is raised if 'cellsep' is neither "|" nor "||".
    """

    return cellseps[cellsep]
paul@14	312
def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text'.

    The text is translated using translate_content and the result is enclosed
    in the markup that 'cellextra' provides for 'cellsep'.
    """

    extra = cellextra[cellsep]
    return extra + translate_content(text) + extra
paul@14	318
# Moin parser or region names, indexed by Confluence section type. An empty
# string indicates that no parser is named and that a plain {{{...}}} region is
# to be used instead.

sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }
paul@15	328
# General parsing.

def _write_blocks(text, out):

    "Write to 'out' a translation of each block found in the plain 'text'."

    for blocktype, blocktext in get_blocks(text):

        # Translate headings and blockquotes.

        if blocktype in blocktypes:
            out.write(blocktypes[blocktype] % blocktext + "\n")

        # Translate list items.

        elif blocktype == "list":
            for listmarker, listitem in get_list_items(blocktext):
                out.write("%s %s\n" % (translate_marker(listmarker), translate_content(listitem)))

        # Translate table items.

        elif blocktype == "table":
            for cellsep, columns in get_table_rows(blocktext):
                moinsep = translate_cellsep(cellsep)
                cells = [translate_cell(cellsep, column) for column in columns]
                out.write(moinsep + moinsep.join(cells) + moinsep + "\n")

        # Handle anonymous blocks.

        else:
            out.write(translate_content(blocktext.rstrip()) + "\n")

        # Separate each block from the next using a blank line.

        out.write("\n")

def _write_section(sectiontype, options, text, out):

    "Write to 'out' the section 'text' of the given 'sectiontype' and 'options'."

    # Direct translations of sections.

    mointype = sectiontypes.get(sectiontype)

    if mointype:
        out.write("{{{#!%s\n" % mointype)
        if options:
            out.write("## %s\n" % options)

    # Sections without a named parser start a plain region, with a space
    # separating the opening marker from the text.

    else:
        out.write("{{{ ")

    # Verbatim sections must not have their content rewritten: brackets and
    # exclamation marks in code are not links or images.

    if sectiontype in ("code", "noformat"):
        body = text
    else:
        body = translate_content(text)

    out.write(body)

    # A space separates the text from the closing marker unless the text ends
    # with a line break or similar whitespace, this being the output that
    # successive print statements with trailing commas would have produced.

    if not body or body[-1] not in "\t\n\v\f\r":
        out.write(" ")

    out.write("}}}\n")
    out.write("\n")

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The 'out' object need only provide a write method accepting strings. Text
    outside sections is divided into blocks, each being written in translated
    form followed by a blank line. Sections are written as {{{...}}} regions,
    naming a Moin parser where 'sectiontypes' provides one, and are also
    followed by a blank line.
    """

    for regiontype, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if regiontype is None:
            _write_blocks(text, out)

        # Handle sections.

        else:
            sectiontype, options = regiontype
            _write_section(sectiontype, options, text, out)
paul@11	384
# Act as a filter when run as a program, translating standard input to standard
# output.

if __name__ == "__main__":
    import sys

    s = sys.stdin.read()
    parse(s, sys.stdout)
paul@6	390
paul@6	391	# vim: tabstop=4 expandtab shiftwidth=4