ConfluenceConverter (annotate parser.py in e0920cd59970)

ConfluenceConverter

Annotated parser.py

25:e0920cd59970

2012-12-17

Paul Boddie

Added initial support for parsing and converting Confluence 4 XHTML content.

paul@6	1	#!/usr/bin/env python
paul@6	2
paul@7	3	"""
paul@7	4	Confluence Wiki syntax parsing.
paul@7	5
paul@8	6	Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22
paul@8	23	--------
paul@8	24
paul@8	25	The basic procedure is as follows:
paul@8	26
paul@7	27	1. Wiki pages are first split up into regions.
paul@7	28	2. Then, within these regions, the text is split into blocks.
paul@7	29	1. First, lists are identified.
paul@7	30	2. Additionally, other block-like elements are identified.
paul@7	31	3. Each block is then parsed.
paul@7	32	"""
paul@7	33
paul@25	34	try:
paul@25	35	from cStringIO import StringIO
paul@25	36	except ImportError:
paul@25	37	from StringIO import StringIO
paul@25	38
paul@25	39	from xmlread import Parser
paul@6	40	import re
paul@25	41	import sys
paul@6	42
# URL schemes identifying external links: a link target whose text before the
# first ":" is one of these is not treated as a "space:page" reference.

URL_SCHEMES = ("http", "https", "ftp", "mailto")
paul@19	44
# Section extraction.

# A section is a macro such as {code}...{code} or {info:title=x}...{info}: an
# opening "{type}" tag not preceded by another "{" (so that "{{monospace}}" is
# left alone), optionally carrying ":options", followed by the shortest run of
# text (newlines included) up to the matching closing "{type}" tag.

sections_regexp_str = r"(?<!{){(?P<type>[^-_*+{}\n:]+)(:[^}\n]+)?}.*?{(?P=type)}"
sections_regexp = re.compile(sections_regexp_str, re.DOTALL | re.MULTILINE)

def get_regions(s):

    """
    Return a list of regions from 's'. Each region is a two-element tuple whose
    last element is the region text.

    Text found outside any section is reported as (None, text), and such an
    entry is emitted before every section and after the last one even when the
    text is empty. Each section is reported using whatever get_section_details
    returns for the complete section text, opening and closing tags included.
    """

    last = 0
    regions = []
    for match in sections_regexp.finditer(s):
        start, end = match.span()

        # Plain text preceding this section.

        regions.append((None, s[last:start]))

        # The section itself.

        regions.append(get_section_details(s[start:end]))
        last = end

    # Plain text following the final section (or all of 's' if there were none).

    regions.append((None, s[last:]))
    return regions
paul@6	66
# Section inspection.

# The section type is the shortest text after "{" that lets the rest match, the
# options are whatever follows an optional ":" up to "}", and the section body
# is everything up to the final closing "{type}" tag.

section_regexp_str = r"{(?P<sectiontype>[^\n:]*?)(?::(?P<options>.*?))?}(?P<section>.*){(?P=sectiontype)}"
section_regexp = re.compile(section_regexp_str, re.DOTALL | re.MULTILINE)

def get_section_details(s):

    """
    Return the details of a section 's' in the form ((type, options), text),
    where 'options' is None if the opening tag has no ":options" part and 'text'
    is the content between the opening and closing tags.

    If 's' does not start with a section, return (None, s) instead.
    """

    match = section_regexp.match(s)
    if match:
        return (match.group("sectiontype"), match.group("options")), match.group("section")
    else:
        return None, s
paul@6	81
# Heading, table and list extraction.

# Lists: a line starting with a marker character ("*", "#" or "-"), followed by
# any number of lines starting with that same character.

list_regexp_str = r"^\s*(?P<listtype>[*#-])[*#-]*.*\n(\s*(?P=listtype).*(?:\n|$))*"

# Tables: consecutive lines made of cells delimited by "|" or "||".

table_regexp_str = r"^((?P<celltype>[|]{1,2})(.+?(?P=celltype))+(\n|$))+"

# Headings and blockquotes: "h1. text" to "h6. text" and "bq. text".

blocktext_regexp_str = r"^(?P<type>h\d|bq)\.\s+(?P<text>.*)$"

blockelement_regexp = re.compile(
    "(" + list_regexp_str + ")"
    "|"
    "(" + table_regexp_str + ")"
    "|"
    "(" + blocktext_regexp_str + ")",
    re.MULTILINE
    )

def get_block_elements(s):

    """
    Extract headings, tables and lists from the given string 's'.

    Return a list of (type, text) tuples covering all of 's' in order. Text
    between recognised elements has a type of None (and may be empty); lists
    have the type "list" and tables the type "table", both with the complete
    matched text; headings and blockquotes have the type "h1".."h6" or "bq" and
    only the text following the prefix.
    """

    last = 0
    blocks = []
    for match in blockelement_regexp.finditer(s):
        start, end = match.span()

        # Only one alternative can have matched: identify it by its named group.

        if match.group("listtype"):
            matchtype = "list"
        elif match.group("celltype"):
            matchtype = "table"
        else:
            matchtype = match.group("type")

        blocks.append((None, s[last:start]))

        # Only headings and blockquotes define "text"; otherwise (or where that
        # text is empty) the whole match is used.

        blocks.append((matchtype, match.group("text") or s[start:end]))
        last = end

    blocks.append((None, s[last:]))
    return blocks
paul@7	113
# Block extraction.

# One or more consecutive lines containing nothing but whitespace.

block_regexp_str = r"^(?:\s*\n)+"
block_regexp = re.compile(block_regexp_str, re.MULTILINE)

def get_basic_blocks(s):

    """
    Return blocks from the given string 's' by splitting the text on blank lines
    and eliminating those lines. Blocks consisting only of whitespace are
    dropped, so an empty or blank 's' yields an empty list.
    """

    return [block for block in block_regexp.split(s) if block.strip()]
paul@7	127
# Block inspection.

def get_blocks(s):

    """
    Return blocks from the given string 's', inspecting the basic blocks and
    generating additional block-level text where appropriate.

    The result is a list of (type, text) tuples. Elements identified by
    get_block_elements keep their type; the untyped text between them is split
    using get_basic_blocks, each resulting block having a type of None.
    """

    blocks = []

    for blocktype, blocktext in get_block_elements(s):

        # Collect heading, list and table blocks.

        if blocktype is not None:
            blocks.append((blocktype, blocktext))

        # Attempt to find new subblocks in other regions.

        else:
            blocks.extend([(None, block) for block in get_basic_blocks(blocktext)])

    return blocks
paul@7	153
# List item inspection.

# Each item is a line consisting of optional leading spaces, a run of marker
# characters ("-", "#" or "*") and then the item text.

listitem_regexp_str = r"^(?P<marker> *[-#*]+)\s*(?P<text>.*)$"
listitem_regexp = re.compile(listitem_regexp_str, re.MULTILINE)

def get_list_items(text):

    """
    Return a list of (marker, text) tuples for the given list 'text', one for
    each line starting with a list marker. The marker retains any leading
    spaces; the whitespace separating it from the item text is discarded.
    """

    return [
        (match.group("marker"), match.group("text"))
        for match in listitem_regexp.finditer(text)
        ]
paul@14	169
# Table row inspection.

# Inline content: {{monospace}}, [link] and !image! markup, plus the "|" or
# "||" cell separators used only when scanning table rows.

monospace_regexp_str = r"{{(?P<monotext>.*?)}}"
link_regexp_str = r"\[(?P<linktext>.*?)]"
image_regexp_str = r"!(?P<imagetext>.*?)!"
cellsep_regexp_str = r"(?P<celltype>[|]{1,2})"

# Inline markup found in any text.

content_regexp_str = (
    "(" + monospace_regexp_str + ")"
    "|"
    "(" + link_regexp_str + ")"
    "|"
    "(" + image_regexp_str + ")"
    )

# Inline markup and cell separators. The separators come last so that a "|"
# inside a link or image is consumed as part of that markup and does not split
# the cell.

table_content_regexp_str = (
    content_regexp_str +
    "|"
    "(" + cellsep_regexp_str + ")"
    )

content_regexp = re.compile(content_regexp_str)
table_content_regexp = re.compile(table_content_regexp_str)
paul@16	193
def translate_content_match(match):

    """
    Translate the content described by the given 'match', returning a string.

    The 'match' must provide the named groups "monotext", "linktext" and
    "imagetext". The first of these to be non-empty selects the translation:

      {{text}}               ->  {{{text}}}
      [target]               ->  [[target|target]]
      [label|target]         ->  [[target|label]]
      [label|target|title]   ->  [[target|label|title=title]]
      !image!                ->  {{image}}
      !image|options!        ->  {{image|options}}

    Link targets are prefixed with "../" unless they contain ":" or start with
    "#"; targets starting with "^" become "attachment:" links. Image names not
    starting with "http" are prefixed with "attachment:". If none of the groups
    has any content, the matched text is returned unchanged.
    """

    if match.group("monotext"):
        return "{{{%s}}}" % match.group("monotext")

    elif match.group("linktext"):
        parts = match.group("linktext").split("|")

        # NOTE: Proper detection of external links required.

        if len(parts) == 1:
            label, target, title = None, parts[0], None
        elif len(parts) == 2:
            (label, target), title = parts, None
        else:

            # Any text beyond the third part is kept within the title instead
            # of causing an unpacking error.

            label, target, title = parts[0], parts[1], "|".join(parts[2:])

        target = target.strip()

        # Look for namespace links and rewrite them.

        if ":" in target:
            prefix = ""
            space, rest = target.split(":", 1)

            # Anything that is not a URL is taken to be "space:page".

            if space not in URL_SCHEMES:
                target = "%s/%s" % (space, rest)

        # Detect anchors.

        elif target.startswith("#"):
            prefix = ""

        # Detect attachments, removing the "^" which introduces the filename
        # but is not part of it.

        elif target.startswith("^"):
            prefix = "attachment:"
            target = target[1:]

        # Link to other pages within a space.

        else:
            prefix = "../"

        # Make the link tidier by making a label if none was given.

        if not label:
            label = target

        # The label is only still missing here if the target is itself empty.

        if not label and not title:
            return "[[%s%s]]" % (prefix, target)
        elif not title:
            return "[[%s%s|%s]]" % (prefix, target, label)
        else:
            return "[[%s%s|%s|title=%s]]" % (prefix, target, label, title)

    elif match.group("imagetext"):
        parts = match.group("imagetext").split("|")

        # NOTE: Proper detection of external links required.

        if parts[0].startswith("http"):
            prefix = ""
        else:
            prefix = "attachment:"

        # NOTE: Proper options conversion required.

        if len(parts) == 1:
            return "{{%s%s}}" % (prefix, parts[0])
        else:
            return "{{%s%s|%s}}" % (prefix, parts[0], parts[1])

    else:
        return match.group()
paul@16	269
def get_table_rows(text):

    """
    Return a list of (cellsep, columns) tuples for the given table 'text'.

    Each line of 'text' containing at least one cell separator produces one
    tuple, where 'cellsep' is the first separator found on the line and
    'columns' is a list of the cell texts. Lines without a separator are
    ignored.
    """

    rows = []

    for line in text.split("\n"):
        cellsep = None
        columns = [""]
        last = 0

        for match in table_content_regexp.finditer(line):
            start, end = match.span()

            # Plain text before the match belongs to the current cell.

            columns[-1] += line[last:start]

            # A separator ends the current cell and starts another, with the
            # first separator deciding the row's cell type.

            if match.group("celltype"):
                if cellsep is None:
                    cellsep = match.group("celltype")
                columns.append("")

            # Links, images and monospace text are kept verbatim, including
            # any "|" they contain.

            else:
                columns[-1] += match.group()

            last = end

        columns[-1] += line[last:]

        # Discard the text before the first separator and after the last one.

        if cellsep:
            rows.append((cellsep, columns[1:-1]))

    return rows
paul@14	299
def translate_content(text, sectiontype=None):

    """
    Return a translation of the given 'text'. If the optional 'sectiontype' is
    specified, the translation may be modified to a form appropriate to the
    section being translated: within "code" and "noformat" sections, inline
    markup is copied through untranslated.
    """

    # Markup in unformatted sections must be preserved as written.

    verbatim = sectiontype in ("code", "noformat")

    parts = []
    last = 0

    for match in content_regexp.finditer(text):
        start, end = match.span()

        # Copy the text between the previous match and this one.

        parts.append(text[last:start])

        if verbatim:
            parts.append(match.group())
        else:
            parts.append(translate_content_match(match))

        last = end

    # Copy any text following the final match.

    parts.append(text[last:])
    return "".join(parts)
paul@16	326
# Translation helpers.

# Format strings producing output for each heading level and for blockquotes,
# indexed by the block type appearing before the "." in the source text.

blocktypes = {
    "h1" : "= %s =",
    "h2" : "== %s ==",
    "h3" : "=== %s ===",
    "h4" : "==== %s ====",
    "h5" : "===== %s =====",
    "h6" : "====== %s ======",
    "bq" : "{{{%s}}}",
    }
paul@11	338
# List markers: "*" and "-" both produce bullets, "#" produces numbered items.

markers = {
    "*" : "*",
    "#" : "1.",
    "-" : "*",
    }

def translate_marker(marker):

    """
    Translate the given 'marker' to a suitable Moin representation.

    The result is indented by one space for each character in 'marker', and the
    final character of 'marker' selects the kind of list item. A KeyError is
    raised if that character is not a known marker.
    """

    return " " * len(marker) + markers[marker[-1]]
paul@14	350
# Both kinds of cell separator are written as "||" in the output.

cellseps = {
    "|" : "||",
    "||" : "||",
    }

# Text placed either side of the cell content: cells in "||" rows are wrapped
# in ''' markers.

cellextra = {
    "|" : "",
    "||" : "'''",
    }

def translate_cellsep(cellsep):

    """
    Translate the given 'cellsep' to a suitable Moin representation. A KeyError
    is raised if 'cellsep' is neither "|" nor "||".
    """

    return cellseps[cellsep]

def translate_cell(cellsep, text):

    """
    Using 'cellsep', translate the cell 'text', returning the translated content
    wrapped in any extra markup required for that kind of cell.
    """

    extra = cellextra[cellsep]
    return extra + translate_content(text) + extra
paul@14	372
# Arguments for the "{{{#!" line opening the output for each section type. An
# empty string means that the section is written as a plain "{{{" region.

sectiontypes = {
    "code"      : "",
    "noformat"  : "",
    "quote"     : "",
    "info"      : "wiki important",
    "note"      : "wiki caution",
    "tip"       : "wiki tip",
    "warning"   : "wiki warning",
    }
paul@15	382
# XML dialect syntax parsing.

# Format strings applied to the text of each element, indexed by element name.
# The "ac:link" entry takes three values: prefix, target and label.

tags = {
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "pre"                   : "{{{%s}}}",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s\n\n",
    "ac:plain-text-body"    : "{{{%s}}}",
    "ac:link"               : "[[%s%s|%s]]",
    }

# Heading elements share their names, and thus their formats, with the
# heading block types.

tags.update(blocktypes)

# Format strings for list items, indexed by the name of the enclosing element.

list_tags = {
    "ol" : " 1. %s\n",
    "ul" : " * %s\n",
    }

# The attribute holding the link target for each kind of resource element.

link_target_tags = {
    "ri:page"       : "ri:content-title",
    "ri:attachment" : "ri:filename",
    }

# Runs of two or more newlines, collapsed to exactly two on output.

normalise_regexp_str = r"\n\n+"
normalise_regexp = re.compile(normalise_regexp_str, re.DOTALL)
paul@25	416
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, writing a translation of
    each completed top-level element to the 'out' stream given on creation.

    NOTE(review): the 'text', 'elements' and 'attributes' stacks used here are
    presumably maintained by the Parser base class, with the last entry of each
    describing the element being completed - confirm against xmlread.
    """

    def __init__(self, out):

        "Initialise the parser with the stream 'out' to receive the output."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

    def handleElement(self, name):

        """
        Convert the text collected for the completed element having the given
        'name', adding the result to the text of the parent element or, where
        there is no parent text to add it to, writing it out.
        """

        text = "".join(self.text[-1])

        # Not every branch below selects a conversion.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            else:
                prefix = "../"

            # Use the target as the label where the link has no text.

            text = conversion % (prefix, self.target, text or self.target)

        # Handle the common case.

        elif text and conversion:
            text = conversion % text

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            self.text[-2].append(text)

        # Otherwise, emit the text with normalised newlines.

        else:
            self.out.write(normalise_regexp.sub("\n\n", text))
paul@25	475
def xmlparse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is a document fragment, so it is wrapped in a complete XHTML
    document before being encoded as UTF-8 and passed to a
    ConfluenceXMLParser.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
  PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
  "http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()
paul@25	499
# General parsing.

def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    Text outside sections is split into blocks, each being written with a blank
    line after it. Sections are written as "{{{" regions, opened with a "#!"
    line where the section type has a Moin equivalent, in which case any
    section options follow in a "##" comment.

    Output uses the Python 2 print statement, so 'out' need only be a
    file-like object.
    """

    for region_type, text in get_regions(s):

        # Handle list, heading, blockquote or anonymous blocks.

        if region_type is None:
            for blocktype, blocktext in get_blocks(text):

                # Translate headings and blockquotes.

                if blocktype in blocktypes:
                    print >>out, blocktypes[blocktype] % blocktext

                # Translate list items.

                elif blocktype == "list":
                    for listmarker, listitem in get_list_items(blocktext):
                        print >>out, "%s %s" % (translate_marker(listmarker), translate_content(listitem))

                # Translate table items.

                elif blocktype == "table":
                    for cellsep, columns in get_table_rows(blocktext):
                        moinsep = translate_cellsep(cellsep)
                        print >>out, moinsep + moinsep.join([translate_cell(cellsep, column) for column in columns]) + moinsep

                # Handle anonymous blocks.

                else:
                    print >>out, translate_content(blocktext.rstrip())

                print >>out

        # Handle sections.

        else:
            sectiontype, options = region_type

            # Direct translations of sections.

            mointype = sectiontypes.get(sectiontype)
            if mointype:
                print >>out, "{{{#!%s" % mointype
                if options:
                    print >>out, "##", options

            # Sections without a Moin equivalent, including unknown ones, are
            # written as plain preformatted regions.

            else:
                print >>out, "{{{",

            print >>out, translate_content(text, sectiontype),
            print >>out, "}}}"
            print >>out
paul@11	555
# Act as a filter when run as a program, translating standard input to
# standard output.

if __name__ == "__main__":
    s = sys.stdin.read()
    parse(s, sys.stdout)
paul@6	559
paul@6	560	# vim: tabstop=4 expandtab shiftwidth=4