ConfluenceConverter (annotate xmlparser.py in 8479d1acb570)

ConfluenceConverter

Annotated xmlparser.py

42:8479d1acb570

2013-03-03

Paul Boddie

Added macro support to the Wiki markup parser, changing the block spacing approach. Added image support to the XHTML parser, also changing the block spacing and normalisation approach. Changed the conversion error message to show the filename of any failing file.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@35	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@35	29	from common import *
paul@35	30	from xmlread import Parser
paul@35	31	import re
paul@35	32	import sys
paul@35	33	import operator
paul@35	34	import htmlentitydefs
paul@41	35	import codecs
paul@35	36
paul@35	37	# XML dialect syntax parsing.
paul@35	38
# Templates converting the text content of an XHTML/Confluence element into
# MoinMoin markup. Each value is a format string applied with the "%" operator.
# Most take the element text alone; the link and image templates take a
# (prefix, target, label) tuple.
#
# NOTE: The link and image templates use a plain "|" separator. A "\|" inside
# NOTE: a Python string literal retains the backslash and would emit it into
# NOTE: the generated markup.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "pre"                   : "{{{%s}}}",
    "table"                 : "{{{#!table\n%s\n}}}",
    "tbody"                 : "%s",
    "tr"                    : "%s",
    "th"                    : "'''%s'''",
    "td"                    : "%s",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:plain-text-body"    : "{{{%s}}}",
    "ac:link"               : "[[%s%s|%s]]",
    "ac:image"              : "{{%s%s|%s}}",
    }
paul@35	64
# Add the block type translations provided by the common module, overriding
# any entries above that use the same tag name. Using update avoids leaving
# loop variables behind in the module namespace.

tags.update(blocktypes)
paul@35	67
# Elements whose translation is fixed markup, regardless of any content.

simple_tags = dict([
    # XHTML tag     MoinMoin syntax
    ("br",          "<<BR>>"),
    ])

# Templates applied to the text of a list item, chosen by the enclosing list.

list_tags = dict([
    # XHTML list tag    MoinMoin list item syntax
    ("ol",              "1. %s"),
    ("ul",              "* %s"),
    ])

# Elements whose text is indented according to the current list depth.

indented_tags = list(("li", "p"))

# Resource identifier elements together with the attribute naming the target.

link_target_tags = dict([
    # Confluence element    Attribute providing the target
    ("ri:page",             "ri:content-title"),
    ("ri:attachment",       "ri:filename"),
    ("ri:user",             "ri:username"),
    ])

# Macro names mapped to the admonition styles used in the output.

macro_rich_text_styles = dict([
    # Confluence style  MoinMoin admonition style
    ("note",            "caution"),
    ("warning",         "warning"),
    ("info",            "important"),
    ("tip",             "tip"),
    ])

# Pattern matching runs of whitespace for normalisation purposes.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	98
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions.

    Element text is converted to MoinMoin markup as each element is handled,
    and the result is appended to the text collected for the parent element.
    The text of the outermost element is written to the 'out' stream.

    NOTE: The 'elements', 'text' and 'attributes' stacks used below are not
    NOTE: defined in this class and are presumably maintained by the base
    NOTE: Parser class, which presumably also calls handleElement when an
    NOTE: element ends.
    """

    def __init__(self, out):

        "Initialise the parser with the given 'out' stream for the output."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information. Parameters belonging to enclosing macros are kept
        # on a stack while a nested macro is being processed.

        self.macro = None
        self.macro_parameters = {}
        self.macro_parameter_stack = []

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

        # Table states.

        self.table_rows = 0
        self.table_columns = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Update the list depth, preformatted state or macro parameter scope for
        the element with the given 'name' before passing the element and its
        'attrs' to the base class.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        elif name == "ac:macro":

            # Give each macro its own parameters so that nested or successive
            # macros cannot see each other's values.

            self.macro_parameter_stack.append(self.macro_parameters)
            self.macro_parameters = {}

        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        """
        Let the base class finish the element with the given 'name' and then
        leave any list level or preformatted region the element introduced.
        """

        # The state is updated afterwards so that the handling of the element
        # still sees the indentation level in effect inside it.

        Parser.endElement(self, name)
        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1

    def characters(self, content):

        """
        Pass the given 'content' to the base class, normalising whitespace
        unless the content appears inside a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity with the given 'name' to the
        current text. Unknown entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the text collected for the element with the given 'name' to
        MoinMoin markup, adding the result to the text of the parent element
        or, where there is no parent text, writing it to the output stream.
        """

        text = "".join(self.text[-1])

        # Handle state.
        # Counters are reset once their container is complete, ready for the
        # next table or row.

        if name == "table":
            self.table_rows = 0
        elif name == "tr":
            self.table_columns = 0

        # Find conversions.

        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.

        if name in ("ac:link", "ac:image"):

            # Without a recorded target, the label text is left as it is
            # instead of formatting None into the markup.

            if self.target is not None:
                if self.target_type == "ri:attachment":
                    prefix = "attachment:"
                elif self.target_type == "ri:user":
                    prefix = ""
                else:
                    prefix = "../"

                text = conversion % (prefix, self.target, text or self.target)

            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.

        elif name == "ac:macro":
            details = macro_rich_text_styles.get(self.macro)
            if details is not None:
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)
                text = "{{{#!wiki %s\n\n%s}}}" % (details, text)

            # Discard the details of this macro whether or not it was
            # converted, restoring the parameters of any enclosing macro.

            self.macro = None
            if self.macro_parameter_stack:
                self.macro_parameters = self.macro_parameter_stack.pop()
            else:
                self.macro_parameters = {}

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Postprocess table columns and rows.
        # Separators are only placed before the second and later cells or rows.

        if name in ("th", "td"):
            if self.table_columns:
                text = "\n|| %s" % text
            self.table_columns += 1
        elif name == "tr":
            if self.table_rows:
                text = "\n==\n%s" % text
            self.table_rows += 1

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate this text from any text already held by the parent.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags or name in list_tags:
                    nodes.append("\n")
            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the parser is inside any preformatted element."

        return any(self.states.values())

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs inside the element with the
        given 'name': nothing for structural elements, a space otherwise.
        """

        if name in ("html", "body", "table", "tbody", "tr") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return the given 'text' with each run of whitespace replaced in the
        manner appropriate for the element with the given 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	280
def parse(s, out):

    """
    Translate the page content in the string 's', writing the result to 'out'.
    The content is wrapped in an XHTML document, encoded as UTF-8 and passed
    to a ConfluenceXMLParser instance.
    """

    # NOTE: CDATA sections appear to have erroneous endings.

    content = s.replace("]] >", "]]>")

    document = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % content

    # Always release the in-memory stream, even if parsing fails.

    stream = StringIO(document.encode("utf-8"))
    try:
        ConfluenceXMLParser(out).parse(stream)
    finally:
        stream.close()
paul@35	304
if __name__ == "__main__":

    # Decode standard input as UTF-8 so that parse receives Unicode text.
    # A plain byte string would be decoded as ASCII when interpolated into the
    # Unicode document template, failing for any non-ASCII page content.

    s = codecs.getreader("utf-8")(sys.stdin).read()
    out = codecs.getwriter("utf-8")(sys.stdout)
    parse(s, out)
paul@35	309
paul@35	310	# vim: tabstop=4 expandtab shiftwidth=4