ConfluenceConverter (annotate xmlparser.py in b7133a21ad01)

ConfluenceConverter

Annotated xmlparser.py

59:b7133a21ad01

2013-04-13

Paul Boddie

Introduced explicit Unicode usage and handled blank but not empty link labels.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@35	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@51	29	from MoinMoin import wikiutil
paul@35	30	from common import *
paul@35	31	from xmlread import Parser
paul@35	32	import re
paul@35	33	import sys
paul@35	34	import operator
paul@35	35	import htmlentitydefs
paul@41	36	import codecs
paul@35	37
paul@35	38	# XML dialect syntax parsing.
paul@35	39
# Translation table from XHTML/Confluence element names to MoinMoin markup
# templates. Each template is applied with the % operator to the element's
# converted text. The link and image templates take a tuple instead:
# "ac:link" and "ac:image" take (prefix, target, label), "a" takes
# (target, label).
#
# The label separator in links and images must be a bare pipe character: any
# other character placed in front of it would be copied verbatim into the
# generated markup.

tags = {
    # XHTML tag         MoinMoin syntax
    "strong"            : "'''%s'''",
    "em"                : "''%s''",
    "u"                 : "__%s__",
    "del"               : "--(%s)--",
    "sup"               : "^%s^",
    "sub"               : ",,%s,,",
    "code"              : "`%s`",
    "tbody"             : "%s",
    "tr"                : "%s",
    "th"                : "'''%s'''",
    "td"                : "%s",
    "blockquote"        : " %s",
    "small"             : "~-%s-~",
    "big"               : "~+%s+~",
    "p"                 : "%s",
    "ol"                : "%s",
    "ul"                : "%s",
    "ac:link"           : "[[%s%s|%s]]",
    "ac:image"          : "{{%s%s|%s}}",
    "a"                 : "[[%s|%s]]",
    }
paul@35	63
# Merge the block-level translations provided by the common module. They are
# applied last and therefore replace any entry above with the same tag name.

tags.update(blocktypes)
paul@35	66
# Elements translated to fixed markup, regardless of any content.

simple_tags = {
    # XHTML tag         MoinMoin syntax
    "br"                : "<<BR>>",
    }

# Templates applied to each list item, selected by the enclosing list element.

list_tags = {
    # XHTML list tag    MoinMoin list item syntax
    "ol"                : "1. %s",
    "ul"                : "* %s",
    }
paul@35	77
# Elements whose character data is kept verbatim (no whitespace normalisation).
preformatted_tags = ["pre", "ac:plain-text-body"]

# Inline styles that are only emitted once, however deeply they are nested.
single_level_tags = ["strong", "em", "u", "del", "sup", "sub", "code"]

# Elements that are wrapped in a brace-delimited region in the output.
formatted_tags = ["ac:rich-text-body", "table"]

# Elements whose text is indented according to the current list depth.
indented_tags = ["li", "p"] + preformatted_tags + formatted_tags

# Elements that always start on a new line.
block_tags = indented_tags + list(blocktypes) + list(list_tags)

# Inline elements that do not inherit a pending block separator.
span_override_tags = ["ac:link"]
paul@56	85
# Attributes that are consulted, in order, to build a link target.

link_target_tags = {
    # Confluence element    Attributes providing the target
    "ri:page"               : ("ri:space-key", "ri:content-title"),
    "ri:attachment"         : ("ri:filename",),
    "ri:user"               : ("ri:username",),
    }

# Each of these attributes, when present, contributes one parent-directory
# step in front of the target so that the link remains relative.

link_target_prefixes = dict.fromkeys(("ri:space-key", "ri:content-title"), "..")

# Attributes (or elements) able to supply a link label.

link_label_attributes = ("ri:content-title", "ac:link-body")

# NOTE: User links should support the intended user namespace prefix.

link_target_types = {
    # Confluence element    MoinMoin link prefix
    "ri:attachment"         : "attachment:",
    "ri:user"               : "",
    "ac:link-body"          : "#",
    }

macro_rich_text_styles = {
    # Confluence style      MoinMoin admonition style
    "note"                  : "caution",
    "warning"               : "warning",
    "info"                  : "important",
    "tip"                   : "tip",
    }

# Runs of whitespace are collapsed when normalising non-preformatted text.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	120
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Elements are translated to MoinMoin markup as they are completed. The
    translation of each element is appended to the text collected for its
    parent, and the outermost element's translation is written to the output
    stream.

    This class relies on the following stacks, which are used here but
    maintained by the Parser base class (not visible in this module):
    'elements' (names of the open elements), 'attributes' (their attributes)
    and 'text' (a list of text nodes for each open element).
    """

    def __init__(self, out):

        """
        Initialise the parser, sending the translated document to 'out', an
        object providing a 'write' method accepting text.
        """

        Parser.__init__(self)
        self.out = out

        # Link target and label information.

        self.target = None
        self.target_type = None
        self.label = None

        # Macro information.

        self.macro = None
        self.macro_parameters = {}

        # Indentation and element nesting states.

        self.indent = 0
        self.max_level = self.level = 0
        self.states = dict.fromkeys(preformatted_tags + single_level_tags, 0)

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

        # Block states.

        self.have_block = False

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the nesting state for the element with the given 'name' and
        'attrs' before passing the event on to the base class.
        """

        # Track indentation for lists.

        if name in list_tags:
            self.indent += 1

        # Track element nesting.

        elif name in self.states:
            self.states[name] += 1

        # Track cumulative element nesting in order to produce appropriate
        # depth indicators in the formatted output.

        if name in preformatted_tags or name in formatted_tags:
            self.level += 1
            self.max_level = max(self.level, self.max_level)

        Parser.startElement(self, name, attrs)

        # Remember macro information for use within the element.

        if name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

    def endElement(self, name):

        """
        Let the base class complete the element with the given 'name', then
        undo the nesting state recorded when the element was started.
        """

        # NOTE: The nesting counters are only reduced after the base class has
        # NOTE: processed the element; handleElement reads them, so it is
        # NOTE: presumably invoked from Parser.endElement.

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

        if name in preformatted_tags or name in formatted_tags:
            self.level -= 1

            # Leaving the outermost region resets the depth record.

            if not self.level:
                self.max_level = 0

    def characters(self, content):

        """
        Record the character data 'content', normalising whitespace unless a
        preformatted element is open.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current element's text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Handle the completion of the element with the given 'name'. Any content
        will either be recorded for later use (by an enclosing element, for
        example) or emitted in some form.
        """

        text = u"".join(self.text[-1])

        # Handle state.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            target_details = []

            # Get target details from the element's attributes.

            for attrname in link_target_tags[name]:
                attrvalue = self.attributes[-1].get(attrname)
                if attrvalue:
                    target_details.append(attrvalue)

                    # Each contributing attribute may add a leading path
                    # component to keep the link relative.

                    prefix = link_target_prefixes.get(attrname)
                    if prefix:
                        target_details.insert(0, prefix)

                    if attrname in link_label_attributes and not self.label:
                        self.label = attrvalue

            # Make a link based on the details.

            self.target = u"/".join(target_details)
            self.target_type = name
            text = ""

        # For anchor links, just use the raw text and let Moin do the formatting.

        elif name == "ac:link-body":
            if not self.target_type:
                self.target_type = name
            self.label = text.strip()
            text = ""

        # For conventional links, remember the href attribute as the target.

        elif name == "a":
            self.target = self.attributes[-1].get("href")
            self.label = text.strip()
            text = ""

        # Discard macro state.

        elif name == "ac:macro":
            self.macro = None
            self.macro_parameters = {}

        # Remember macro information.

        elif name in ("ac:parameter", "ac:default-parameter"):
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        # Handle single-level tags: nested occurrences are passed through
        # without repeating the markup.

        elif name in single_level_tags and self.states[name] > 1:
            conversion = "%s"

        # Handle preformatted sections.

        elif name in preformatted_tags or name in formatted_tags:

            # Nest the section appropriately: outer regions use more braces
            # than the regions they contain.

            level = 3 + self.max_level - self.level
            opening = "{" * level
            closing = "}" * level

            # Macro name information is used to style rich text body regions.

            if name != "table" and self.macro and self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)

                # The details become part of a format string applied to the
                # text below, so any percent signs must be escaped.

                conversion = "%s#!wiki %s\n\n%%s\n%s" % (
                    opening, details.replace("%", "%%"), closing)

            elif name == "table":
                conversion = "%s#!table\n%%s\n%s" % (opening, closing)

            else:
                conversion = "%s%%s%s" % (opening, closing)

        # Handle the common case and simpler special cases.

        if not conversion:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.

        if name in ("ac:link", "ac:image"):
            prefix = link_target_types.get(self.target_type, "")
            anchor = self.attributes[-1].get("ac:anchor")
            text = conversion % (prefix, anchor or self.target, self.label or text.strip() or self.target)
            self.target = self.target_type = self.label = None

        elif name == "a":
            text = conversion % (self.target, self.label or self.target)
            self.target = self.target_type = self.label = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows: every column after the first is
        # preceded by a column separator, every row after the first by a row
        # separator.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]
            parent = self.elements[-2]

            # Where preceding text exists, add any blank line separators.

            if u"".join(nodes):

                # All top-level elements are separated with blank lines.

                if parent == "body":
                    nodes.append("\n")

                # Block elements always cause a new line to be started.

                if name in block_tags or (self.have_block and name not in span_override_tags):
                    nodes.append("\n")

                self.have_block = False

            # Lists inside lists require separation.

            elif name in list_tags and parent == "li":
                nodes.append("\n")

            # Without preceding text, save any block node state for non-block
            # elements so that new line separators can be added at another
            # level.

            elif name in block_tags and parent not in block_tags:
                self.have_block = True

            elif name not in block_tags and self.have_block and name not in span_override_tags:
                self.have_block = True

            else:
                self.have_block = False

            nodes.append(text)

        # Otherwise, emit the text (at the top level of the document).

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether any preformatted element is currently open."

        return any(self.states[tag] for tag in preformatted_tags)

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the text replacing each run of whitespace found directly inside
        the element with the given 'name': nothing for purely structural
        elements (document, table and list containers), a single space
        otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced as appropriate for
        the enclosing element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	422
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is a page body fragment: it is wrapped in a minimal XHTML
    document before being parsed. 's' may be a Unicode string or a UTF-8
    encoded byte string; 'out' must provide a 'write' method accepting text.
    """

    # Decode byte strings explicitly. Interpolating a byte string into the
    # Unicode template below would otherwise rely on an implicit ASCII
    # conversion, which fails for any non-ASCII content.

    if isinstance(s, bytes):
        s = s.decode("utf-8")

    # NOTE: CDATA sections appear to have erroneous endings.

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace(u"]] >", u"]]>")

    # The parser is given a byte stream, so encode the complete document.

    f = StringIO(document.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()
paul@35	446
# Act as a filter when run as a program: read Confluence markup from standard
# input and write the translation to standard output encoded as UTF-8.

if __name__ == "__main__":
    parse(sys.stdin.read(), codecs.getwriter("utf-8")(sys.stdout))
paul@35	451
paul@35	452	# vim: tabstop=4 expandtab shiftwidth=4