ConfluenceConverter (annotate parser.py in 74d7f8137a83)

ConfluenceConverter

Annotated parser.py

19:74d7f8137a83

2012-04-23

Paul Boddie

Added monospaced text region support. Added various characters to the excluded set in section type (and macro) names, since certain characters can appear within { and } (for example, asterisk can be used as {*}) in order to mark the start and end of text effect regions within words. Improved link processing, adding support for space-qualified links. Fixed the ordering of image targets and options.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@8	6	Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	    1. First, lists are identified.
paul@7	30	    2. Additionally, other block-like elements are identified.
paul@7	31	3. Each block is then parsed.
paul@7	32	"""
paul@7	33
paul@6	34	import re
paul@6	35
# Prefixes before the first ":" of a link target that are treated as URL
# schemes; any other prefix is rewritten as "prefix/rest" by
# translate_content_match.
URL_SCHEMES = ("http", "https", "ftp", "mailto")

# Section extraction.

# A section is "{type}...{type}" or "{type:options}...{type}". The opening
# brace must not itself follow a brace (so "{{monospace}}" is not a section)
# and the type name may not contain any of - * _ + { } : or a newline (so
# text effect markers such as "{*}" are not sections either).
# NOTE(review): the "*" in the excluded set and in ".*?" were reconstructed
# from a copy of this file in which asterisks had been stripped; confirm
# against the repository.
sections_regexp_str = r"(?<!{){(?P<type>[^-*_+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)
paul@6	42
def get_regions(s):

    """
    Return a list of regions from 's'. Each region is specified using a tuple
    of the form (type, text).

    Text lying outside any section is reported with a type of None. Each
    matched section is reported using whatever get_section_details returns
    for the matched text. A (None, text) entry is always emitted before each
    section and after the last one, even if its text is empty.
    """

    regions = []
    last = 0

    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Plain text between the previous section (or the start) and this one.

        regions.append((None, s[last:start]))

        # The section itself, broken down into its type details and body.

        regions.append(get_section_details(s[start:end]))
        last = end

    # Whatever follows the final section.

    regions.append((None, s[last:]))
    return regions
paul@6	59
paul@7	60	# Section inspection.
paul@7	61
# A complete section: "{type}body{type}" or "{type:options}body{type}".
# NOTE(review): the lazy "*?" quantifiers on the type and options groups were
# reconstructed from a copy of this file in which asterisks had been
# stripped; confirm against the repository.
section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form (type, text).

    When 's' matches the section pattern, the type is itself a tuple of the
    form (sectiontype, options), where options is None if no ":options" part
    was given, and the text is the body found between the opening and closing
    markers. Otherwise, (None, s) is returned.
    """

    match = section_regexp.match(s)

    if not match:
        return None, s

    details = (match.group("sectiontype"), match.group("options"))
    return details, match.group("section")
paul@6	74
paul@14	75	# Heading, table and list extraction.
paul@7	76
# A list is a line starting with a list marker character, followed by any
# number of further lines starting with the same marker character.
# NOTE(review): every "*" in this pattern (the quantifiers and the "*" marker
# in the character classes) was reconstructed from a copy of this file in
# which asterisks had been stripped; confirm against the repository.
list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"

# A table is a run of lines each made of cells delimited by "|" or "||".
table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# Headings ("h1." etc.) and blockquotes ("bq.") followed by their text.
blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples covering the whole of 's' in order.
    The type is "list", "table", or the heading/blockquote type (such as "h1"
    or "bq"); text not belonging to any such element has a type of None. For
    headings and blockquotes the text is the content following the prefix
    (or the whole match if that content is empty); for lists and tables it is
    the whole matched text.
    """

    blocks = []
    last = 0

    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one of the three alternatives can have matched.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))
        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7	106
paul@7	107	# Block extraction.
paul@7	108
# One or more consecutive lines containing nothing but whitespace.
block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank
    lines and eliminating those lines. Pieces consisting only of whitespace
    are discarded, so an empty or blank 's' yields an empty list.
    """

    pieces = block_regexp.split(s)
    return [piece for piece in pieces if piece.strip()]
paul@7	120
paul@7	121	# Block inspection.
paul@7	122
def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples. Elements identified by
    get_block_elements keep their type; the remaining text is split into
    basic blocks by get_basic_blocks, each given a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks
paul@7	146
paul@14	147	# List item inspection.
paul@14	148
# A list item line: optional leading spaces plus a run of marker characters
# (both captured as the marker), then the item text.
# NOTE(review): the four "*" characters in this pattern were reconstructed
# from a copy of this file in which asterisks had been stripped; confirm
# against the repository.
listitem_regexp_str = r"^(?P<marker> *[-*#]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    "Return a list of (marker, text) tuples for the given list 'text'."

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]
paul@14	162
paul@14	163	# Table row inspection.
paul@14	164
# Inline content: "{{monospace}}", "[link]" and "!image!". The cell separator
# pattern is only used when scanning table rows.
# NOTE: "\[" is used instead of the equivalent "[[]" to avoid the "possible
# nested set" FutureWarning issued by newer versions of the re module.
monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"\[(?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Table rows recognise the same inline content, so that a "|" inside a link
# or image is not mistaken for a cell separator, plus the separators.
table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)

def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' must come from a pattern providing the "monotext", "linktext"
    and "imagetext" groups. Monospaced text becomes "{{{text}}}", links become
    "[[target]]", "[[target|label]]" or "[[target|label|title=title]]", and
    images become "{{target}}" or "{{target|options}}". A match with none of
    these groups populated is returned unchanged.

    A link with more than three "|"-separated parts raises ValueError.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        # Confluence puts the label first: [label|target|title].

        if len(parts) == 1:
            label, target = None, parts[0]
        elif len(parts) == 2:
            label, target = parts
        else:
            label, target, title = parts

        if ":" in target:
            prefix = ""
            space, rest = target.split(":", 1)

            # Anything before the colon that is not a URL scheme is treated
            # as a qualifier and becomes a leading path component.

            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)
        elif target.startswith("#"):
            prefix = ""
        elif target.startswith("^"):
            prefix = "attachment:"
        else:
            prefix = "../"

        if len(parts) == 1:
            return "[[%s%s]]" % (prefix, target)
        elif len(parts) == 2:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    else:
        return match.group()
paul@16	244
def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' containing at least one cell separator produces one
    tuple, where cellsep is the first separator found on the line and columns
    is the list of cell texts found between separators. Text before the first
    separator and after the last one is discarded. Lines without a separator
    are skipped.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(line):
            start, end = match.span()
            columns[-1] += line[last:start]

            # A separator ends the current column and starts a new one. The
            # first separator on the line decides the row's separator type.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Links, images and monospaced text are kept verbatim, so any
            # "|" they contain does not split the cell.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Drop the text outside the outermost separators.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14	274
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated.

    Inline content found by content_regexp is translated using
    translate_content_match, except within "code" and "noformat" sections,
    where it is copied through unchanged. All other text is kept as it is.
    """

    # Handle unformatted sections.

    unformatted = sectiontype in ("code", "noformat")

    parts = []
    last = 0

    for match in content_regexp.finditer(text):
        start, end = match.span()
        parts.append(text[last:start])

        if unformatted:
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    parts.append(text[last:])
    return "".join(parts)
paul@16	301
paul@15	302	# Translation helpers.
paul@14	303
# Output templates for headings and blockquotes, keyed by block type.
blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }

# Output list markers, keyed by the input list marker character.
# NOTE(review): the "*" entry was reconstructed from a copy of this file in
# which asterisks had been stripped (leaving '"" : ""'); confirm against the
# repository.
markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result is indented by one space per character of 'marker' and ends
    with the output marker for the final character of 'marker'. A KeyError
    is raised if that character is not a known marker, and an IndexError if
    'marker' is empty.
    """

    indent = " " * len(marker)
    return indent + markers[marker[-1]]
paul@14	325
# Output cell separators, keyed by the input separator. Both kinds of cell
# use the same output separator.
cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Markup placed on both sides of a cell's text, keyed by the input separator.
# Cells delimited by "||" are wrapped in "'''".
cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    """
    Translate the given 'cellsep' to a suitable Moin representation. A
    KeyError is raised if 'cellsep' is neither "|" nor "||".
    """

    return cellseps[cellsep]
paul@14	341
def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text'.

    The text is translated using translate_content and wrapped on both sides
    in the extra markup registered for 'cellsep' in cellextra.
    """

    extra = cellextra[cellsep]
    return extra + translate_content(text) + extra
paul@14	347
# Output section declarations, keyed by input section type. The parse
# function emits "{{{#!<value>" for non-empty values and a plain "{{{" for
# empty values and for section types not listed here.
sectiontypes = {
    "code" : "",
    "noformat" : "",
    "quote" : "",
    "info" : "wiki important",
    "note" : "wiki caution",
    "tip" : "wiki tip",
    "warning" : "wiki warning",
    }
paul@15	357
paul@15	358	# General parsing.
paul@15	359
def _leaves_soft_space(text):

    """
    Return whether the Python 2 statement "print >>out, text," would cause a
    space to be written before the next printed item. This is the case unless
    'text' ends with a whitespace character other than a space.
    """

    return text[-1:] not in ("\t", "\n", "\v", "\f", "\r")

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The 'out' object only needs to provide a write method. The text written
    is the same as that produced by the Python 2 print statements this
    function originally used, including the separating spaces they inserted
    around section content.
    """

    write = out.write

    for region_type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if region_type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktype in blocktypes:
                    write(blocktypes[blocktype] % blocktext + "\n")

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        write("%s %s\n" % (translate_marker(listmarker), translate_content(listitem)))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        cells = [translate_cell(cellsep, column) for column in columns]
                        write(moinsep + moinsep.join(cells) + moinsep + "\n")

                # Handle anonymous blocks.

                else:
                    write(translate_content(blocktext.rstrip()) + "\n")

                # Follow every block with a blank line.

                write("\n")

        # Handle sections.

        else:
            sectiontype, options = region_type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)

            if mointype:
                write("{{{#!%s\n" % mointype)
                if options:
                    write("## %s\n" % options)
            else:

                # The opening marker was printed without a newline, so a
                # space separates it from the content.

                write("{{{ ")

            content = translate_content(text, sectiontype)
            write(content)

            # The content was also printed without a newline, so a space
            # precedes the closing marker unless the content already ends a
            # line.

            if _leaves_soft_space(content):
                write(" ")

            write("}}}\n")
            write("\n")
paul@11	413
if __name__ == "__main__":
    import sys

    # Translate the text read from standard input, writing the result to
    # standard output.

    s = sys.stdin.read()
    parse(s, sys.stdout)
paul@6	419
paul@6	420	# vim: tabstop=4 expandtab shiftwidth=4