ConfluenceConverter (annotate convert.py in 4df6e1afb172)

ConfluenceConverter

Annotated convert.py

24:4df6e1afb172

2012-12-14

Paul Boddie

Added recording of child pages for declared parent pages.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@8	3	"""
paul@8	4	Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8	5
paul@8	6	Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22	"""
paul@8	23
paul@3	24	from os import listdir, mkdir, makedirs
paul@1	25	from os.path import exists, extsep, join, splitext
paul@0	26	from zipfile import ZipFile
paul@0	27	from cStringIO import StringIO
paul@0	28	import codecs
paul@0	29	import xmlread
paul@11	30	import parser
paul@0	31
# Maximum number of characters retained from a page title. Longer titles are
# truncated to this length in order to avoid filesystem issues.
MAX_TITLE_LENGTH = 120
paul@23	33
class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', whose name is used both
        as the output directory and as the prefix of every page title.

        If 'no_translate' is set to a true value, page bodies are written
        unchanged instead of being passed to the translator.
        """

        # Properties collected for the object currently being read.

        self.content = {}

        # Elements collected for the collection currently being read.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        The object type is taken from the "class" attribute in the last entry
        of 'attributes' and the object identifier from 'text'. Pages, comments
        and blog posts are recorded as revisions in the manifest of the page
        they belong to, whereas body content objects have their body written
        to the versions directory. Objects of any other type are ignored.

        The collected properties are discarded afterwards.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            pageid = content.get("originalVersion", identifier)
            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = content["title"][:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)

            # The leading version field is what the manifest is later sorted by.
            # NOTE(review): the field separator is written "\|" in the original
            # source, which Python reads as a backslash followed by a pipe; that
            # value is preserved here - confirm that a bare pipe is not intended.

            append(join(pages_dir, pageid, "manifest"),
                "%s\\|AddRevision\\|%s\\|%s\\|%s\\|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]
                ))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"] or "## Empty page."

            # Consult the handler's own setting: a bare 'no_translate' would only
            # resolve when a module-level variable of that name happens to exist.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute in the last entry of 'attributes' as the key and the stripped
        'text' as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the second-to-last entry of 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary under the "name"
        attribute in the last entry of 'attributes', and begin a new, empty
        list of elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the class is the "class" attribute in the last entry of 'attributes'
        and the text is the stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0	162
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth.

    A directory that already exists is left alone, so the operation may be
    repeated safely. Any other failure, such as a lack of permissions or a
    non-directory already occupying 'name', raises OSError instead of being
    silently ignored.
    """

    from os.path import isdir

    try:
        makedirs(name)
    except OSError:

        # Only an already-present directory is an acceptable reason to fail.

        if not isdir(name):
            raise
paul@0	171
def append(filename, s):

    "Add the string 's' to the end of the file with the given 'filename'."

    write(filename, s, append=True)
paul@0	177
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename' using the UTF-8
    encoding. Any existing content is replaced unless the optional 'append'
    parameter is set to a true value, in which case 's' is added to the end of
    the file.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    with codecs.open(filename, mode, encoding="utf-8") as out:
        out.write(s)
paul@0	190
def read(filename):

    """
    Return the entire contents of the file with the given 'filename' as a
    string, decoding the file as UTF-8.
    """

    with codecs.open(filename, encoding="utf-8") as source:
        return source.read()
paul@3	203
def translate(filename, body):

    """
    Translate the given 'body' using the parser module, sending the result to
    the file with the given 'filename', which is opened for writing with the
    UTF-8 encoding.
    """

    with codecs.open(filename, "w", encoding="utf-8") as target:
        parser.parse(body, target)
paul@11	216
def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision.

    Each manifest line is a sequence of fields whose first field is the
    integer revision. Lines are ordered by that revision, lines with equal
    revisions keeping their relative order, and the revision field is then
    dropped from each line.

    If a 'pagetitle' file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    A ValueError is raised if the first field of a line is not an integer.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # NOTE(review): the separator is written "\|" in the original source, which
    # Python reads as a backslash followed by a pipe; that value is preserved
    # here - confirm that a bare pipe is not intended.

    separator = "\\|"

    with codecs.open(filename, "r", encoding="utf-8") as f:
        lines = [x.split(separator) for x in f.readlines()]

    # Order by revision number. A key function parses each revision only once
    # and, unlike a comparison function, is accepted by every Python version
    # providing sort keys.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])
        result.append(separator.join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)
paul@3	257
if __name__ == "__main__":
    import sys

    # Obtain the dump filename and the name of the workspace from the
    # command line.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the output of an earlier conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. An archive has to be read as binary data, whereas a
    # plain XML file is opened in the default mode as before.

    f = open(filename, "rb" if is_zipfile else "r")

    # Everything using the open file happens inside the try block so that the
    # file is closed even if the archive cannot be read.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage\\|1\n")

    # NOTE(review): sort_manifest opens the manifest unconditionally, so every
    # directory found under the pages directory is expected to contain one.

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")
        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()
paul@3	345
paul@0	346	# vim: tabstop=4 expandtab shiftwidth=4