ConfluenceConverter (annotate xmlparser.py in 9f83d179c5c3)

ConfluenceConverter

Annotated xmlparser.py

51:9f83d179c5c3

2013-03-22

Paul Boddie

Supported anchor links and nested sections. Fixed preformatted region detection. Added ill-formed element nesting prevention.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@35	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@51	29	from MoinMoin import wikiutil
paul@35	30	from common import *
paul@35	31	from xmlread import Parser
paul@35	32	import re
paul@35	33	import sys
paul@35	34	import operator
paul@35	35	import htmlentitydefs
paul@41	36	import codecs
paul@35	37
paul@35	38	# XML dialect syntax parsing.
paul@35	39
# Translation templates for elements whose converted content is substituted
# into a MoinMoin construct. Link and image templates take three values: a
# target prefix, the target itself and a label. The target and label are
# separated by a plain "|" as required by MoinMoin link/transclusion syntax.

tags = {
    # XHTML tag         MoinMoin syntax
    "strong"            : "'''%s'''",
    "em"                : "''%s''",
    "u"                 : "__%s__",
    "del"               : "--(%s)--",
    "sup"               : "^%s^",
    "sub"               : ",,%s,,",
    "code"              : "`%s`",
    "tbody"             : "%s",
    "tr"                : "%s",
    "th"                : "'''%s'''",
    "td"                : "%s",
    "blockquote"        : " %s",
    "small"             : "~-%s-~",
    "big"               : "~+%s+~",
    "p"                 : "%s",
    "ol"                : "%s",
    "ul"                : "%s",
    "ac:link"           : "[[%s%s|%s]]",
    "ac:image"          : "{{%s%s|%s}}",
    }
paul@35	62
# Add every (tag, translation) pair from blocktypes to the tag translations,
# overriding any entry already defined for the same tag.

tags.update(blocktypes.items())
paul@35	65
# Elements translated to fixed MoinMoin markup regardless of their content.

simple_tags = dict(br="<<BR>>")

# List item templates, selected according to the enclosing list element.

list_tags = dict(ol="1. %s", ul="* %s")

# Elements whose converted text is indented to the current list depth.

indented_tags = "li p".split()

# Element classifications used when tracking nesting.

preformatted_tags = "pre ac:plain-text-body".split()
single_level_tags = "strong em u del sup sub code".split()
formatted_tags = "ac:rich-text-body table".split()

# Confluence resource elements and the attribute providing the link target.

link_target_tags = dict([
    ("ri:page", "ri:content-title"),
    ("ri:attachment", "ri:filename"),
    ("ri:user", "ri:username"),
    ])

# NOTE: User links should support the intended user namespace prefix.

# Confluence elements and the MoinMoin link prefix used for their targets.

link_target_types = dict([
    ("ri:attachment", "attachment:"),
    ("ri:user", ""),
    ("ac:link-body", "#"),
    ])

# Confluence macro names and the corresponding MoinMoin admonition styles.

macro_rich_text_styles = dict(
    note="caution",
    warning="warning",
    info="important",
    tip="tip",
    )

# Pattern matching runs of whitespace for normalisation.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	109
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, translating elements into
    MoinMoin wiki syntax.

    The converted text of each completed element is appended to the text nodes
    of its parent element; when there is no parent, the text is written to the
    'out' stream given upon initialisation.

    NOTE: The 'elements', 'attributes' and 'text' stacks used here are not
    NOTE: defined in this class and are presumably maintained by Parser.
    """

    def __init__(self, out):

        "Initialise the parser with the given 'out' stream for the translation."

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the indentation and nesting states for the element with the
        given 'name' and 'attrs' before passing it on to the base class.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1

        # Track the nesting depth of sections needing delimiters.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Complete the element with the given 'name' and then restore the
        indentation and nesting states changed when the element was started.
        """

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Outside all sections, forget the depth of the previous nesting.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Record the given character 'content', normalising whitespace unless
        the content appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text nodes. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            self.target_type = name
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences of the same tag are
        # passed through without repeating the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:
            conversion = self._get_section_conversion(name)

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: If neither an anchor nor a target has been recorded, the text
        # NOTE: "None" is formatted into the link.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "../")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text or self.target)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows: all but the first column of a
        # row and all but the first row of a table are preceded by separators.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate the text from any existing content in the parent.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags or name in list_tags:
                    nodes.append("\n")

            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def _get_section_conversion(self, name):

        """
        Return a format string accepting the content of the preformatted or
        formatted section element with the given 'name', delimiting the content
        according to the current nesting level.
        """

        # Nest the section appropriately: outer sections get longer delimiters
        # than the sections they contain.

        level = 3 + self.max_level - self.level
        opening = "{" * level
        closing = "}" * level

        # Macro name information is used to style rich text body regions.

        if name != "table" and self.macro and self.macro in macro_rich_text_styles:
            details = macro_rich_text_styles[self.macro]
            title = self.macro_parameters.get("title")
            if title:
                details = "%s\n\n%s" % (details, title)

            # The result is itself used as a format string, so any literal
            # percent signs from the title must be escaped.

            details = details.replace("%", "%%")
            return "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

        elif name == "table":
            return "%s#!table\n%%s\n%s" % (opening, closing)

        else:
            return "%s%%s%s" % (opening, closing)

    def is_preformatted(self):

        "Return whether the parser is currently inside any preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace found directly within
        the element with the given 'name': nothing for purely structural
        elements and a single space for all others.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with each run of whitespace replaced as
        appropriate for the enclosing element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	339
def parse(s, out):

    """
    Translate the Confluence content in the string 's', writing the result to
    the 'out' stream. The content is wrapped in an XHTML document before being
    parsed.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    content = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    # Present the document to the parser as a UTF-8 encoded stream.

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
paul@35	363
if __name__ == "__main__":

    # Decode standard input explicitly as UTF-8: the parse function inserts
    # the content into a Unicode template, which would otherwise attempt an
    # ASCII decoding of the raw bytes and fail on any non-ASCII character.

    s = codecs.getreader("utf-8")(sys.stdin).read()

    # Encode the translation as UTF-8 when writing to standard output.

    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)
paul@35	368
paul@35	369	# vim: tabstop=4 expandtab shiftwidth=4