ConfluenceConverter (annotate xmlparser.py in 9d8a9c36829b)

ConfluenceConverter

Annotated xmlparser.py

35:9d8a9c36829b

2013-02-22

Paul Boddie

Replaced the parser module with separate modules covering the different areas of functionality.

paul@35	1	#!/usr/bin/env python
paul@35	2
paul@35	3	"""
paul@35	4	Confluence Wiki XML/XHTML syntax parsing.
paul@35	5
paul@35	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@35	7
paul@35	8	This software is free software; you can redistribute it and/or
paul@35	9	modify it under the terms of the GNU General Public License as
paul@35	10	published by the Free Software Foundation; either version 2 of
paul@35	11	the License, or (at your option) any later version.
paul@35	12
paul@35	13	This software is distributed in the hope that it will be useful,
paul@35	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@35	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@35	16	GNU General Public License for more details.
paul@35	17
paul@35	18	You should have received a copy of the GNU General Public
paul@35	19	License along with this library; see the file LICENCE.txt
paul@35	20	If not, write to the Free Software Foundation, Inc.,
paul@35	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@35	22	"""
paul@35	23
paul@35	24	try:
paul@35	25	from cStringIO import StringIO
paul@35	26	except ImportError:
paul@35	27	from StringIO import StringIO
paul@35	28
paul@35	29	from common import *
paul@35	30	from xmlread import Parser
paul@35	31	import re
paul@35	32	import sys
paul@35	33	import operator
paul@35	34	import htmlentitydefs
paul@35	35
paul@35	36	# XML dialect syntax parsing.
paul@35	37
# Mapping from XHTML/Confluence element names to MoinMoin format strings. Each
# value is applied to an element's converted text using the % operator. The
# "ac:link" entry is the exception: it is applied to a (prefix, target, label)
# tuple, producing MoinMoin's [[target|label]] link form.

tags = {
    # XHTML tag             MoinMoin syntax
    "strong"                : "'''%s'''",
    "em"                    : "''%s''",
    "u"                     : "__%s__",
    "del"                   : "--(%s)--",
    "sup"                   : "^%s^",
    "sub"                   : ",,%s,,",
    "code"                  : "`%s`",
    "pre"                   : "{{{%s}}}",
    "blockquote"            : " %s",
    "small"                 : "~-%s-~",
    "big"                   : "~+%s+~",
    "p"                     : "%s",
    "ol"                    : "%s",
    "ul"                    : "%s",
    "ac:plain-text-body"    : "{{{%s}}}",

    # The target and label are separated by a bare "|": a backslash before it
    # would be emitted literally and corrupt the link target.

    "ac:link"               : "[[%s%s|%s]]",
    }
paul@35	57
# Add the block type translations, letting them override any entries above
# that share the same element name.

tags.update(blocktypes)
paul@35	60
# Elements replaced outright by fixed MoinMoin markup, regardless of content.

simple_tags = {"br" : "<<BR>>"}

# List container elements mapped to the format applied to each of their items.

list_tags = {
    "ol" : "1. %s",
    "ul" : "* %s",
    }

# Elements whose converted text is indented according to the list depth.

indented_tags = ["li", "p"]

# Confluence resource elements mapped to the attribute providing the target.

link_target_tags = {
    "ri:page"       : "ri:content-title",
    "ri:attachment" : "ri:filename",
    "ri:user"       : "ri:username",
    }

# Confluence macro (style) names mapped to MoinMoin admonition styles.

macro_rich_text_styles = {
    "note"    : "caution",
    "warning" : "warning",
    "info"    : "important",
    "tip"     : "tip",
    }

# Pattern matching runs of whitespace, used when normalising character data.

normalise_regexp_str = r"\s+"
normalise_regexp = re.compile(normalise_regexp_str)
paul@35	91
class ConfluenceXMLParser(Parser):

    """
    Handle content from Confluence 4 page revisions, converting each element
    to MoinMoin markup using the module's translation tables and writing the
    result for the outermost element to the output stream.

    NOTE(review): the 'elements', 'attributes' and 'text' stacks used below
    are presumably maintained by the base Parser, which is also presumed to
    call handleElement when an element ends - confirm against xmlread.
    """

    def __init__(self, out):

        "Initialise the parser, writing converted text to 'out'."

        Parser.__init__(self)
        self.out = out

        # Link target information.

        self.target = None
        self.target_type = None

        # Macro information. Parameters are collected per macro: while a
        # nested macro is open, the parameters of the enclosing macros wait
        # on a stack so that they are neither overwritten nor discarded.

        self.macro = None
        self.macro_parameters = {}
        self._enclosing_macro_parameters = []

        # Indentation and preformatted states.

        self.indent = 0
        self.states = {}
        for name in ("pre", "ac:plain-text-body"):
            self.states[name] = 0

    # ContentHandler-related methods.

    def startElement(self, name, attrs):

        """
        Track list depth, preformatted regions and macro nesting for the
        element 'name' before passing the event to the base parser.
        """

        if name in list_tags:
            self.indent += 1
        elif name in self.states:
            self.states[name] += 1
        elif name == "ac:macro":

            # Give the macro its own, initially empty, set of parameters.

            self._enclosing_macro_parameters.append(self.macro_parameters)
            self.macro_parameters = {}

        Parser.startElement(self, name, attrs)

    def endElement(self, name):

        """
        Pass the end of element 'name' to the base parser and then undo the
        state changes made when the element was started.
        """

        Parser.endElement(self, name)

        if name in list_tags:
            self.indent -= 1
        elif name in self.states:
            self.states[name] -= 1
        elif name == "ac:macro":

            # Discard the finished macro's parameters, restoring those of any
            # enclosing macro so that nothing leaks into later macros.

            if self._enclosing_macro_parameters:
                self.macro_parameters = self._enclosing_macro_parameters.pop()
            else:
                self.macro_parameters = {}

    def characters(self, content):

        """
        Receive character data, normalising its whitespace unless the data
        appears within a preformatted region.
        """

        if not self.is_preformatted():
            content = self.normalise(content, self.elements[-1])
        Parser.characters(self, content)

    def skippedEntity(self, name):

        """
        Add the character for the HTML entity 'name' to the current text.
        Entities that are not known HTML entities are ignored.
        """

        ch = htmlentitydefs.name2codepoint.get(name)
        if ch is not None:
            self.text[-1].append(unichr(ch))

    # Parser-related methods.

    def handleElement(self, name):

        """
        Convert the text collected for the element 'name' to MoinMoin markup,
        adding the result to the parent element's text or, where there is no
        parent, writing it to the output stream.
        """

        text = "".join(self.text[-1])
        conversion = None

        # Handle list elements.

        if name == "li" and len(self.elements) > 1:
            list_tag = self.elements[-2]
            conversion = list_tags.get(list_tag)

        # Remember link target information.

        elif name in link_target_tags:
            self.target = self.attributes[-1].get(link_target_tags[name])
            self.target_type = name
            text = ""

        # Remember macro information.

        elif name == "ac:parameter":
            self.macro_parameters[self.attributes[-1].get("ac:name")] = text
            text = ""

        elif name == "ac:macro":
            self.macro = self.attributes[-1].get("ac:name")

        # Handle the common case.

        else:
            conversion = tags.get(name)

        # Attempt to convert the text.

        # Links require target information.
        # NOTE: User links should support the intended user namespace prefix.
        # NOTE(review): a link without a recognised resource child leaves the
        # NOTE(review): target as None, which is then formatted literally.

        if name == "ac:link":
            if self.target_type == "ri:attachment":
                prefix = "attachment:"
            elif self.target_type == "ri:user":
                prefix = ""
            else:
                prefix = "../"

            text = conversion % (prefix, self.target, text or self.target)
            self.target = self.target_type = None

        # Macro name information is used to style rich text body regions.
        # Other macros have their text passed through unchanged.

        elif name == "ac:macro":
            if self.macro in macro_rich_text_styles:
                details = macro_rich_text_styles[self.macro]
                title = self.macro_parameters.get("title")
                if title:
                    details = "%s\n\n%s" % (details, title)
                text = "{{{#!wiki %s\n\n%s}}}" % (details, text)

            # Forget the macro name whatever kind of macro this was. The
            # parameters are discarded when the element is finally closed.

            self.macro = None

        # Handle the common case.

        elif text and conversion:
            text = conversion % text
        elif name in simple_tags:
            text = simple_tags[name]

        # Normalise leading whitespace and indent the text if appropriate.

        if name in indented_tags:
            text = " " * self.indent + text.lstrip()

        # Add the converted text to the end of the parent element's text nodes.

        if len(self.text) > 1:
            nodes = self.text[-2]

            # Separate the text from any content the parent already has.

            if "".join(nodes):
                parent = self.elements[-2]
                if parent == "body":
                    nodes.append("\n\n")
                elif parent in list_tags:
                    nodes.append("\n")
                elif name in list_tags and parent == "li":
                    nodes.append("\n")

            nodes.append(text)

        # Otherwise, emit the text.

        else:
            self.out.write(text)

    def is_preformatted(self):

        "Return whether the parser is inside any preformatted element."

        return any(self.states.values())

    # Whitespace normalisation.

    def get_replacement(self, name):

        """
        Return the string replacing whitespace runs found directly within the
        element 'name': nothing for document and list containers, a single
        space for everything else.
        """

        if name in ("html", "body") or name in list_tags:
            return ""
        else:
            return " "

    def normalise(self, text, name):

        """
        Return 'text' with each run of whitespace replaced by the string
        appropriate for the enclosing element 'name'.
        """

        return normalise_regexp.sub(self.get_replacement(name), text)
paul@35	247
def parse(s, out):

    """
    Parse the content in the string 's', writing a translation to 'out'.

    The content is wrapped in a minimal XHTML document before being parsed by
    ConfluenceXMLParser. Where 's' is a byte string, it is decoded as UTF-8
    first; a UnicodeDecodeError is raised if it is not valid UTF-8.
    """

    # Byte strings (such as data read from standard input) must be decoded
    # explicitly: combining them with the Unicode template below would
    # otherwise use the default codec and fail on any non-ASCII content.

    if isinstance(s, bytes):
        s = s.decode("utf-8")

    # NOTE: CDATA sections appear to have erroneous endings.

    s = u"""\
<?xml version="1.0"?>
<!DOCTYPE html
PUBLIC "-//W3C//DTD XHTML 1.0 Strict//EN"
"http://www.w3.org/TR/xhtml1/DTD/xhtml1-strict.dtd">
<html xmlns="http://www.w3.org/1999/xhtml">
<body>
%s
</body>
</html>""" % s.replace("]] >", "]]>")

    # The parser reads from a file-like object providing the encoded document.

    f = StringIO(s.encode("utf-8"))
    try:
        parser = ConfluenceXMLParser(out)
        parser.parse(f)
    finally:
        f.close()
paul@35	271
if __name__ == "__main__":

    # Translate the content supplied on standard input, writing the result to
    # standard output.

    parse(sys.stdin.read(), sys.stdout)
paul@35	275
paul@35	276	# vim: tabstop=4 expandtab shiftwidth=4