ConfluenceConverter (annotate convert.py in 61c251cf2202)

ConfluenceConverter

Annotated convert.py

140:61c251cf2202

2014-08-12

Paul Boddie

Support the reset parameter, redirecting to FindPage.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@8	3	"""
paul@8	4	Confluence XML dump conversion to a MoinMoin-compatible representation.
paul@8	5
paul@33	6	Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>
paul@8	7
paul@8	8	This software is free software; you can redistribute it and/or
paul@8	9	modify it under the terms of the GNU General Public License as
paul@8	10	published by the Free Software Foundation; either version 2 of
paul@8	11	the License, or (at your option) any later version.
paul@8	12
paul@8	13	This software is distributed in the hope that it will be useful,
paul@8	14	but WITHOUT ANY WARRANTY; without even the implied warranty of
paul@8	15	MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
paul@8	16	GNU General Public License for more details.
paul@8	17
paul@8	18	You should have received a copy of the GNU General Public
paul@8	19	License along with this library; see the file LICENCE.txt
paul@8	20	If not, write to the Free Software Foundation, Inc.,
paul@8	21	51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
paul@8	22	"""
paul@8	23
paul@40	24	from os import chdir, getcwd, listdir, mkdir, makedirs, walk
paul@40	25	from os.path import exists, extsep, join, split, splitext
paul@0	26	from zipfile import ZipFile
paul@0	27	from cStringIO import StringIO
paul@40	28	from MoinMoin import wikiutil
paul@0	29	import codecs
paul@0	30	import xmlread
paul@35	31	import wikiparser, xmlparser
paul@25	32	import sys
paul@123	33	import time, calendar
paul@0	34
paul@84	35	from common import get_page_title
paul@23	36
def date_to_seconds(s):

    """
    Convert the date string 's' of the form "YYYY-MM-DD HH:MM:SS", optionally
    followed by "." and a fractional part which is discarded, to a number of
    seconds since the epoch, interpreting the date as UTC.
    """

    whole_seconds = s.split(".", 1)[0]
    parsed = time.strptime(whole_seconds, "%Y-%m-%d %H:%M:%S")
    return calendar.timegm(parsed)
paul@123	39
class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    Instances provide callbacks for the "object", "property", "id",
    "collection" and "element" elements of the dump. Properties and collections
    are gathered in the 'content' dictionary until the enclosing object is
    handled, at which point files are written beneath the space directory and
    the dictionary is reset.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', this being both the
        workspace name and the directory receiving the output. If 'no_translate'
        is set to a true value, page bodies are written without translation.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        The object type is taken from the "class" attribute in the final entry
        of 'attributes'. The 'name', 'elements' and 'all_text' parameters are
        not used here.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            # NOTE: This only makes the current title available to comments.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = get_page_title(title)

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # Note the type of the page.

            write(join(pages_dir, pageid, "pagetype"), objecttype)

            # See sort_manifest for access to this data.
            # NOTE(review): the "\|" field separator is reproduced as found;
            # confirm that it is not an export artefact of a plain "|".

            append(join(pages_dir, pageid, "manifest"),
                "%s\|AddRevision\|_\|%s\|%s\|%s\|%s\|%d\n" % ( # blank added for consistency with AddAttachment
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"],
                date_to_seconds(content["lastModificationDate"])
                ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s\|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.
            # The handler's own setting is consulted rather than any global of
            # the same name, so that the class also works when imported.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            try:
                fn(join(versions_dir, content["content"]), body)
            except:
                # Report the affected content and let the error propagate.
                err = codecs.getwriter("utf-8")(sys.stderr)
                print >>err, "Error parsing", content["content"]
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            append(join(pages_dir, pageid, "attachments"),
                "%s\|AddAttachment\|%s\|%s\|%s\|%s\|%s\|%d\n" % (
                version,
                # Have to "taint" archive filenames, although Moin will
                # probably handle package script filename tainting.
                wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                wikiutil.taintfilename(content["fileName"]),
                "", # pagename is substituted later
                content["lastModifierName"],
                content["comment"],
                date_to_seconds(content["lastModificationDate"])
                ))

        # Start afresh for the next object, whatever the type of this one.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute as the key and the stripped 'text' as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the penultimate entry of 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary under the "name"
        attribute, and start a new list for subsequent elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, with
        the text being stripped.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))
paul@0	233
def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    path is tolerated, but any other failure to create the directory (such as
    a lack of permissions) is raised as OSError instead of being ignored.
    """

    try:
        makedirs(name)
    except OSError:

        # Only the case of the path already being present is acceptable.

        if not exists(name):
            raise
paul@0	242
def append(filename, s):

    """
    Add the string 's' to the end of the file with the given 'filename',
    delegating to the write function in its appending mode.
    """

    write(filename, s, append=True)
paul@0	248
def write(filename, s, append=False):

    """
    Store the string 's' in the file with the given 'filename' using the UTF-8
    encoding. Any existing content is replaced unless the optional 'append'
    parameter is set to a true value, in which case 's' is added to the end of
    the file.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    stream = codecs.open(filename, mode, encoding="utf-8")
    try:
        stream.write(s)
    finally:
        stream.close()
paul@0	261
def read(filename):

    """
    Return the complete contents of the file with the given 'filename' as a
    string, decoding the file as UTF-8.
    """

    stream = codecs.open(filename, encoding="utf-8")
    try:
        text = stream.read()
    finally:
        stream.close()

    return text
paul@3	274
def translate(filename, body, fn=None):

    """
    Produce a translation of the given 'body' in the file with the given
    'filename'. The file starts with a "#pragma page-filename" line naming the
    file itself, after which the translation function 'fn' is called with the
    body and the open file. Where 'fn' is omitted, wikiparser.parse is used.
    """

    if not fn:
        fn = wikiparser.parse

    stream = codecs.open(filename, "w", encoding="utf-8")
    try:
        print >>stream, "#pragma page-filename", filename
        fn(body, stream)
    finally:
        stream.close()
paul@11	290
def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', using xmlparser.parse as the translation function.
    """

    translate(filename, body, fn=xmlparser.parse)
paul@25	293
def sort_comments(pages_dir, pageid):

    """
    Where the page identified by 'pageid' within 'pages_dir' has a "comments"
    file, sort the entries in that file (each giving a creation date and a
    comment identifier) and rewrite the file in sorted order.

    Each comment is then made a subpage of the owner page by writing a
    "pagetitle" file in the comment's own page directory, combining the owner
    page's title with the zero-based position of the comment in the sorted
    sequence.

    Nothing is done for pages without a "comments" file.
    """

    page_dir = join(pages_dir, pageid)
    comments = join(page_dir, "comments")

    if not exists(comments):
        return

    title = read(join(page_dir, "pagetitle"))

    # Collect the non-empty records and order them by their columns, the
    # first of which is the creation date.

    details = []
    for record in read(comments).split("\n"):
        if record:
            details.append(record.split("\|"))
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["\|".join(entry) for entry in details]))

    # In the page directory for each comment, write the page title in a
    # special file for later processing.

    position = 0
    for _created, commentid in details:
        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))
        position += 1
paul@31	326
def _manifest_to_mapping(manifest, output_mapping):

    """
    Read the file with the given 'manifest' filename and append to the file
    with the given 'output_mapping' filename one line for each manifest entry
    having a title. Each such line consists of the final path component of the
    entry's version filename, a tab character, and the title.

    Every manifest line must provide exactly eight columns.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        mapping_lines = []

        for record in f.readlines():
            version, _action, _archive_filename, filename, title, username, comment, mtime = record.split("\|")
            if title:
                mapping_lines.append("%s\t%s\n" % (split(filename)[-1], title))

        append(output_mapping, "".join(mapping_lines))

    finally:
        f.close()
paul@95	348
paul@40	349	def _sort_manifest(manifest, title):
paul@40	350
paul@40	351	"""
paul@40	352	Open the given 'manifest' and sort it according to revision so that it will
paul@40	353	be added to MoinMoin in the correct order.
paul@40	354
paul@40	355	If a 'title' is provided, the title column in the manifest will be augmented
paul@40	356	with that information. This is typically done for comments and is necessary
paul@40	357	for attachments.
paul@40	358
paul@40	359	A list of manifest entries is returned.
paul@40	360	"""
paul@40	361
paul@40	362	f = codecs.open(manifest, "r", encoding="utf-8")
paul@40	363	try:
paul@109	364	lines = [x.rstrip("\n").split("\|") for x in f.readlines()]
paul@40	365	lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))
paul@40	366
paul@40	367	# Reconstruct the lines, optionally changing the titles.
paul@40	368
paul@40	369	result = []
paul@40	370
paul@40	371	for line in lines:
paul@123	372	version, _action, _archive_filename, filename, old_title, username, comment, mtime = line
paul@40	373
paul@40	374	# Replace title information with the information already present.
paul@40	375
paul@53	376	if not old_title:
paul@40	377	new_title = title
paul@40	378	else:
paul@40	379	new_title = old_title
paul@40	380
paul@40	381	# The version is omitted now that the manifest is ordered.
paul@40	382
paul@123	383	line = _action, _archive_filename, filename, new_title, username, comment, mtime
paul@40	384	result.append(line)
paul@40	385
paul@40	386	return result
paul@40	387
paul@40	388	finally:
paul@40	389	f.close()
paul@40	390
def serialise_manifest(manifest):

    """
    Convert the 'manifest', a sequence of entries each being a sequence of
    string columns, to a single string with one line per entry. For
    "AddRevision" entries, the superfluous second column is removed.
    """

    lines = []

    for entry in manifest:
        fields = list(entry)

        # Revisions carry a placeholder after the action which is not wanted
        # in the final manifest.

        if fields[0] == "AddRevision":
            fields.pop(1)

        lines.append("\|".join(fields) + "\n")

    return "".join(lines)
paul@40	407
def sort_manifest(pages_dir, pageid, output_mapping=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page within
    'pages_dir' and any page title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that historical operations
    such as page renaming can be detected. Where the title changes between
    revisions, an additional revision is inserted under the previous title
    whose content redirects to the new title.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If the "pagetype" file indicates a comment, each revision's content is
    prefixed with an ACL and a pragma naming the revision's user.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content. If a "comments" file exists and the
    page has a title, the comments section is also added. Neither is done if
    'no_translate' is set to a true value.

    If 'output_mapping' is given, a mapping from version identifiers to page
    titles will be appended to the file having that filename.

    If an "attachments" file exists, its sorted entries follow the revisions.

    A list of manifest entries is returned, each being a tuple of the form
    (action, archive filename, filename, title, username, comment, mtime).
    """

    page_dir = join(pages_dir, pageid)

    pagetype = join(page_dir, "pagetype")
    manifest = join(page_dir, "manifest")
    attachments = join(page_dir, "attachments")
    pagetitle = join(page_dir, "pagetitle")
    children = join(page_dir, "children")
    comments = join(page_dir, "comments")

    if exists(pagetype):
        page_type = read(pagetype)
    else:
        page_type = None

    if exists(pagetitle):
        title = read(pagetitle)
        space, _page_name = get_space_and_name(title)
    else:
        title = space = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Output a mapping of identifiers to page names.

    if output_mapping:
        _manifest_to_mapping(manifest, output_mapping)

    # Prepare the child page section once, since it is the same for every
    # revision of the page.

    child_pages_text = None

    if exists(children) and not no_translate:
        child_pages = []
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()

        # Produce links which hide the space prefix.

        for child_page_name in child_page_names:
            child_space, page_name = get_space_and_name(child_page_name)
            if child_space == space:
                child_page_label = page_name
            else:
                child_page_label = child_page_name

            child_pages.append(" * [[%s\|%s]]" % (child_page_name, child_page_label))

        child_pages_text = child_page_section % "\n".join(child_pages)

    # Comments are only included for pages having a title.

    add_comments = exists(comments) and title and not no_translate

    # Modify the content to include child pages and comments.

    last_title = None
    final_result = []

    for details in result:
        action, _archive_filename, filename, new_title, username, comment, mtime = details

        # Detect renamed pages and add a redirect revision.

        if last_title and last_title != new_title and action == "AddRevision":
            renaming_versionfile = filename + ".rename"
            final_result.append((action, "_", renaming_versionfile, last_title, username, "Page renamed to %s" % new_title, mtime))
            write(renaming_versionfile, "#REDIRECT %s" % new_title)

        last_title = new_title

        # Add this revision to the manifest.

        final_result.append(details)

        # Obtain the text only if modifications are to be made.

        text = None

        # Add an ACL to comment pages so that people cannot change other
        # people's comments.
        # NOTE: This should match the PostComment action.

        if page_type == "Comment":
            text = (
                "#acl %s:read,write,delete,revert All:read\n"
                "#pragma comment-owner %s\n"
                "%s"
                ) % (username, username, read(filename))

        # Add child page information to the content.

        if child_pages_text is not None:
            text = (text or read(filename)) + child_pages_text

        # Add comments to the content.

        if add_comments:
            text = (text or read(filename)) + comment_section

        # Rewrite the file if necessary.

        if text:
            write(filename, text)

    # Add the attachments to the manifest.

    if exists(attachments):
        final_result += _sort_manifest(attachments, title)

    return final_result
paul@123	523
def sort_final_manifest(entries, output):

    """
    Sort the manifest 'entries' in place by last modified time and serialise
    them. The manifest details will be appended to the file named by 'output'.

    The final item of each entry is taken to be the modification time and must
    be convertible to an integer. Entries with equal times retain their
    existing relative order.
    """

    # A key function is evaluated once per entry and yields the same ordering
    # as a comparison function would.

    entries.sort(key=lambda entry: int(entry[-1]))

    # Serialise the manifest.

    s = serialise_manifest(entries)
    append(output, s)
paul@3	539
def get_space_and_name(page_name):

    """
    Split the given 'page_name' of the form "space/name" at the first "/",
    returning the space and the remainder of the name as a pair.

    Where 'page_name' contains no "/", None is returned as the space together
    with the complete 'page_name', so that the result can always be unpacked
    into two items.
    """

    # Splitting never raises an exception for a missing separator: it merely
    # yields a single item, so the number of items must be tested.

    parts = page_name.split("/", 1)

    if len(parts) == 2:
        return parts

    return None, page_name
paul@61	545
paul@28	546	# Template for child page information.
paul@28	547
# Text added after a page's content to present its child pages: a rule
# followed by the list substituted for the single string placeholder.

child_page_section = (
    "\n"
    "----\n"
    "\n"
    "%s\n"
    )

# Text added after a page's content to present its comments: a rule followed
# by the IncludeComments macro.

comment_section = (
    "\n"
    "----\n"
    "\n"
    "<<IncludeComments>>\n"
    )
paul@32	561
paul@28	562	# Main program.
paul@28	563
if __name__ == "__main__":

    # Obtain the dump filename, the space name and any attachments directory.

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
        if len(sys.argv) > 3 and sys.argv[3]:
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

%(progname)s com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

%(progname)s com_entities.xml COM '' --no-translate

An archive can be used instead of the XML file, and since this may include
attachments, no additional attachments directory needs to be specified:

%(progname)s COM-123456-789012.zip COM
""" % {"progname" : split(sys.argv[0])[-1]}

        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    # Refuse to overwrite the results of a previous conversion.

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used because the file may be a zip
    # archive, whose content must not undergo newline translation.

    f = open(filename, "rb")

    if is_zipfile:
        zf = ZipFile(f)
        ff = StringIO(zf.read("entities.xml"))
    else:
        ff = f

    # Parse the data.

    try:
        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_mapping = join(space, "MAPPING")

        # NOTE(review): the "\|" field separator is reproduced as found;
        # confirm that it is not an export artefact of a plain "|".

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage\|1\n")

        entries = []

        for pageid in listdir(pages_dir):
            entries += sort_manifest(pages_dir, pageid, output_mapping, no_translate)

        sort_final_manifest(entries, output_manifest)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Separate the directory name from its location, ignoring any
                # trailing separator which would otherwise leave no name to be
                # walked. A bare directory name has an empty location, for
                # which the current directory must be used.

                attachments_parent, attachments_name = split(attachments)
                if not attachments_name:
                    attachments_parent, attachments_name = split(attachments_parent)

                cwd = getcwd()
                chdir(attachments_parent or ".")
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            attachment_path = join(path, attachment_filename)
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()
paul@0	695
paul@0	696	# vim: tabstop=4 expandtab shiftwidth=4