ConfluenceConverter (annotate xmlparser.py in a2449a212f99)

ConfluenceConverter

Annotated xmlparser.py

55:a2449a212f99

2013-04-10

Paul Boddie

Added support for conventional "a" links, thus fixing various link issues.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@35	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@51	29	from MoinMoin import wikiutil
paul@35	30	from common import *
paul@35	31	from xmlread import Parser
paul@35	32	import re
paul@35	33	import sys
paul@35	34	import operator
paul@35	35	import htmlentitydefs
paul@41	36	import codecs
paul@35	37
paul@35	38	# XML dialect syntax parsing.
paul@35	39
# Translations from XHTML/Confluence elements to MoinMoin syntax. Each value
# is a format string receiving the converted content of the element; the link
# and image entries additionally receive the target details (see the parser's
# handleElement method for the arguments supplied).
#
# The "|" separating link targets from labels must be a plain character: an
# escaped "\|" would put a literal backslash into every generated link.

tags = {
    # XHTML tag         MoinMoin syntax
    "strong"            : "'''%s'''",
    "em"                : "''%s''",
    "u"                 : "__%s__",
    "del"               : "--(%s)--",
    "sup"               : "^%s^",
    "sub"               : ",,%s,,",
    "code"              : "`%s`",
    "tbody"             : "%s",
    "tr"                : "%s",
    "th"                : "'''%s'''",
    "td"                : "%s",
    "blockquote"        : " %s",
    "small"             : "~-%s-~",
    "big"               : "~+%s+~",
    "p"                 : "%s",
    "ol"                : "%s",
    "ul"                : "%s",
    "ac:link"           : "[[%s%s|%s]]",
    "ac:image"          : "{{%s%s|%s}}",
    "a"                 : "[[%s|%s]]",
    }

# Merge in the block-level translations. 'blocktypes' is not defined in this
# module and is presumably provided by "from common import *".

tags.update(blocktypes)
paul@35	66
# Elements without content, mapped directly to their MoinMoin equivalent.

simple_tags = dict(br="<<BR>>")

# List container elements, mapped to the MoinMoin syntax used for each of
# their list items.

list_tags = dict(ol="1. %s", ul="* %s")

# Elements whose converted text is indented according to the list depth.

indented_tags = ["li", "p"]

# Elements affecting how nested content is treated: preformatted elements
# suppress whitespace normalisation, single-level elements are only marked up
# at their outermost level, and formatted elements open nested regions.

preformatted_tags = ["pre", "ac:plain-text-body"]
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]
formatted_tags = ["ac:rich-text-body", "table"]

# Confluence elements describing link targets, together with the attributes
# providing the target details.

link_target_tags = dict([
    ("ri:page",         ("ri:space-key", "ri:content-title")),
    ("ri:attachment",   ("ri:filename",)),
    ("ri:user",         ("ri:username",)),
    ])

# Attributes providing target details, together with the prefix ensuring a
# correct relative link.

link_target_prefixes = dict([
    ("ri:space-key",        ".."),
    ("ri:content-title",    ".."),
    ])

# Names tested when choosing a default label for a link.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

# Confluence elements identifying the kind of link target, together with the
# MoinMoin link prefix used for that kind.

link_target_types = dict([
    ("ri:attachment",   "attachment:"),
    ("ri:user",         ""),
    ("ac:link-body",    "#"),
    ])

# Confluence macro styles, mapped to MoinMoin admonition styles.

macro_rich_text_styles = dict(
    note="caution",
    warning="warning",
    info="important",
    tip="tip",
    )

# Runs of whitespace are collapsed when normalising text.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	118
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, converting the parsed
    elements to MoinMoin syntax and writing the result to an output stream.

    The 'elements', 'attributes' and 'text' stacks used throughout are not
    defined in this class and are presumably maintained by the Parser base
    class, with the last entry of each describing the current element.
    """

    def __init__(self, out):

        """
        Initialise the parser with 'out', an object whose 'write' method
        receives the converted text.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.states = {}
        self.max_level = self.level = 0

        for name in preformatted_tags + single_level_tags:
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Record the start of the element with the given 'name' and 'attrs',
        updating the indentation and nesting states before delegating to the
        base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate
        # depth indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Record the end of the element with the given 'name', delegating to
        the base class before undoing the state changes made when the element
        was started.
        """

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost region resets the recorded depth.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Handle the character 'content', normalising its whitespace unless it
        appears within a preformatted element.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entity names produce no output.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = "".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)
                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)
                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = "/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the
        # formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags: markup nested within the same kind of
        # markup is passed through unchanged.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: outer regions need more braces
            # than the regions they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (opening, details, closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text or self.target)
            self.target = self.target_type = self.label = None

        elif name == "a":

            # Anchors lacking an href cannot be links: emit only their text
            # instead of producing a link to the text "None".

            if self.target:
                text = conversion % (self.target, self.label)
            else:
                text = self.label or ""
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows: all but the first column or row
        # are preceded by the appropriate separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate the text from any preceding content in the parent.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags or name in list_tags:
                    nodes.append("\n")

            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the current content is within a preformatted element."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing runs of whitespace within the element
        having the given 'name': nothing within purely structural elements,
        a single space elsewhere.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the element having the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	386
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in a minimal XHTML document before being parsed.
    's' may be a Unicode string or a UTF-8 encoded byte string.
    """

    # Decode byte strings explicitly: interpolating non-ASCII bytes into the
    # Unicode template below would otherwise raise a UnicodeDecodeError.

    if isinstance(s, bytes):
        s = s.decode("utf-8")

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # The parser reads from a file-like object providing encoded content.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()
paul@35	410
if __name__ == "__main__":

    # Decode standard input and encode standard output as UTF-8 so that
    # non-ASCII content survives the conversion: raw bytes from standard
    # input cannot be combined with the Unicode document template in parse.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)
paul@35	415
paul@35	416	# vim: tabstop=4 expandtab shiftwidth=4