# HG changeset patch # User Paul Boddie # Date 1361837246 -3600 # Node ID 52c4c0976745815b918bfe2a1cdce952c1b7cacc # Parent f9771c857a296a2e56711d1bd4a967262e657f67 Added support for attachments in pages, making manifest handling more general. diff -r f9771c857a29 -r 52c4c0976745 README.txt --- a/README.txt Sun Feb 24 23:42:06 2013 +0100 +++ b/README.txt Tue Feb 26 01:07:26 2013 +0100 @@ -17,6 +17,9 @@ The xmlread.py file from the xmlread distribution can be copied into the ConfluenceConverter directory. +ConfluenceConverter also requires access to the MoinMoin.wikiutil module found +in the MoinMoin distribution. + The moinsetup program is highly recommended for the installation of page packages and the management of MoinMoin Wiki instances: @@ -34,11 +37,13 @@ file is called com_entities.xml), the following command can be used to prepare a page package for MoinMoin: -python convert.py com_entities.xml COM +python convert.py com_entities.xml COM attachments In addition to the filename, a workspace name is required. Confluence appears to require a workspace as a container for collections of pages, but this also -permits us to selectively import parts of a Wiki into MoinMoin. +permits us to selectively import parts of a Wiki into MoinMoin. If a directory +of attachments is also specified, these will be imported into the page +package. The result of the above command will be a directory having the same name as the chosen workspace, together with a zip archive for that directory's @@ -67,13 +72,16 @@ Each page directory contains the following things: - * manifest (a list of version entries in a format similar to the MoinMoin - page package manifest format) + * manifest (a list of version entries in a format similar to the MoinMoin + page package manifest format) - * pagetitle (an optional page title imposed on the page by another content - item) + * attachments (a list of attachment version entries in a format similar to + the MoinMoin page package manifest format) - * children (a list of child page names defined for the page) + * pagetitle (an optional page title imposed on the page by another content + item) + + * children (a list of child page names defined for the page) In the output structure, content items such as comments are represented as pages and each reference a content version. Since comments will ultimately be diff -r f9771c857a29 -r 52c4c0976745 convert.py --- a/convert.py Sun Feb 24 23:42:06 2013 +0100 +++ b/convert.py Tue Feb 26 01:07:26 2013 +0100 @@ -21,10 +21,11 @@ 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA """ -from os import listdir, mkdir, makedirs -from os.path import exists, extsep, join, splitext +from os import chdir, getcwd, listdir, mkdir, makedirs, walk +from os.path import exists, extsep, join, split, splitext from zipfile import ZipFile from cStringIO import StringIO +from MoinMoin import wikiutil import codecs import xmlread import wikiparser, xmlparser @@ -44,7 +45,24 @@ def handle_object(self, name, elements, attributes, all_text, text): - "Handle objects according to type." + """ + Handle objects according to type. Objects appear as follows: + + + ... + ... + + + Within objects, one finds things like properties and collections, which + are handled by their own methods but which are stored in the content + dictionary associated with the current object. + + By the time this method is called, the contents of the object will have + been gathered and the properties and collections populated in the + content dictionary. Any identifier will have been assigned to the + textual content of the object element and will be available in the + 'text' parameter. + """ objecttype = attributes[-1]["class"] @@ -98,7 +116,7 @@ # See sort_manifest for access to this data. append(join(pages_dir, pageid, "manifest"), - "%s|AddRevision|%s|%s|%s|%s\n" % ( + "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment content["version"], versionfile, title, # comment titles will incorporate the comment's position @@ -151,6 +169,29 @@ print >>sys.stderr, body raise + # Handle attachments. + + elif objecttype == "Attachment": + pageid = content["content"] + version = content["attachmentVersion"] + + if content.has_key("originalVersion"): + attachid = content["originalVersion"] + else: + attachid = identifier + + append(join(pages_dir, pageid, "attachments"), + "%s|AddAttachment|%s|%s|%s|%s|%s\n" % ( + version, + # Have to "taint" archive filenames, although Moin will + # probably handle package script filename tainting. + wikiutil.taintfilename(join("attachments", pageid, attachid, version)), + wikiutil.taintfilename(content["fileName"]), + "", # pagename is substituted later + content["lastModifierName"], + content["comment"] + )) + self.content = {} def handle_property(self, name, elements, attributes, all_text, text): @@ -270,6 +311,65 @@ write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position)) +def _sort_manifest(manifest, title): + + """ + Open the given 'manifest' and sort it according to revision so that it will + be added to MoinMoin in the correct order. + + If a 'title' is provided, the title column in the manifest will be augmented + with that information. This is typically done for comments and is necessary + for attachments. + + A list of manifest entries is returned. + """ + + f = codecs.open(manifest, "r", encoding="utf-8") + try: + lines = [x.split("|") for x in f.readlines()] + lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0]))) + + # Reconstruct the lines, optionally changing the titles. + + result = [] + + for line in lines: + version, _action, _archive_filename, filename, old_title, username, comment = line + + # Replace title information with the information already present. + + if title is not None: + new_title = title + else: + new_title = old_title + + # The version is omitted now that the manifest is ordered. + + line = _action, _archive_filename, filename, new_title, username, comment + result.append(line) + + return result + + finally: + f.close() + +def serialise_manifest(manifest): + + """ + Process the 'manifest' consisting of entries, removing superfluous columns. + """ + + result = [] + + for columns in manifest: + action = columns[0] + if action == "AddRevision": + columns = list(columns) + del columns[1] + result.append("|".join(columns)) + + return "".join(result) + def sort_manifest(pages_dir, pageid, output=None, no_translate=False): """ @@ -292,6 +392,7 @@ """ manifest = join(pages_dir, pageid, "manifest") + attachments = join(pages_dir, pageid, "attachments") pagetitle = join(pages_dir, pageid, "pagetitle") children = join(pages_dir, pageid, "children") comments = join(pages_dir, pageid, "comments") @@ -301,51 +402,37 @@ else: title = None - f = codecs.open(manifest, "r", encoding="utf-8") - try: - lines = [x.split("|") for x in f.readlines()] - lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0]))) + # Sort the revision manifest. + + result = _sort_manifest(manifest, title) - # Reconstruct the lines, optionally changing the titles. + for _action, _archive_filename, filename, new_title, username, comment in result: - result = [] + # Add child page information to the content. - for line in lines: - version, _addrevision, filename, old_title, username, comment = line - - # Replace title information with the information already present. + if exists(children) and not no_translate: + child_pages = [] + child_page_names = [x for x in read(children).split("\n") if x] + child_page_names.sort() - if title is not None: - new_title = title - else: - new_title = old_title - - # The version is omitted now that the manifest is ordered. + for child_page_name in child_page_names: + child_pages.append(" * [[%s]]" % child_page_name) - line = _addrevision, filename, new_title, username, comment - result.append("|".join(line)) - - # Add child page information to the content. + append(filename, child_page_section % "\n".join(child_pages)) - if exists(children) and not no_translate: - child_pages = [] - child_page_names = [x for x in read(children).split("\n") if x] - child_page_names.sort() + # Add comments to the content. + + if exists(comments) and title and not no_translate: + append(filename, comment_section % title) - for child_page_name in child_page_names: - child_pages.append(" * [[%s]]" % child_page_name) - - append(filename, child_page_section % "\n".join(child_pages)) - - # Add comments to the content. + # Add the attachments to the manifest. - if exists(comments) and title and not no_translate: - append(filename, comment_section % title) + if exists(attachments): + result += _sort_manifest(attachments, title) - finally: - f.close() + # Serialise the manifest. - s = "".join(result) + s = serialise_manifest(result) if output is None: write(manifest, s) @@ -375,8 +462,13 @@ filename = sys.argv[1] is_zipfile = splitext(filename)[-1] == extsep + "zip" space = sys.argv[2] + if len(sys.argv) > 3: + attachments = sys.argv[3] + else: + attachments = None except IndexError: - print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name." + print >>sys.stderr, "Please specify an XML file containing Wiki data, a workspace name," + print >>sys.stderr, "and an optional attachments directory location." print >>sys.stderr, "For example: com_entities.xml COM" sys.exit(1) @@ -421,40 +513,59 @@ try: p.parse(ff) + + # Tidy up the import manifests, sorting each of them by revision and + # finalising them. + + pages_dir = join(space, "pages") + + for pageid in listdir(pages_dir): + sort_comments(pages_dir, pageid) + + output_manifest = join(space, "MOIN_PACKAGE") + append(output_manifest, "MoinMoinPackage|1\n") + + for pageid in listdir(pages_dir): + sort_manifest(pages_dir, pageid, output_manifest, no_translate) + + # Write the page package. + + page_package = ZipFile(package_zip, "w") + + try: + # Include the page revisions. + + versions_dir = join(space, "versions") + + for versionid in listdir(versions_dir): + page_package.write(join(versions_dir, versionid)) + + # Include the attachments. + + if attachments: + cwd = getcwd() + chdir(split(attachments)[0]) + try: + for path, dirnames, filenames in walk(split(attachments)[1]): + for filename in filenames: + # Have to "taint" archive filenames. + page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename))) + finally: + chdir(cwd) + elif is_zipfile: + for filename in zf.namelist(): + if filename.startswith("attachments"): + # Have to "taint" archive filenames. + page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename)) + + # Include only the top-level manifest. + + page_package.write(output_manifest, "MOIN_PACKAGE") + + finally: + page_package.close() + finally: f.close() - # Tidy up the import manifests, sorting each of them by revision and - # finalising them. - - pages_dir = join(space, "pages") - - for pageid in listdir(pages_dir): - sort_comments(pages_dir, pageid) - - output_manifest = join(space, "MOIN_PACKAGE") - append(output_manifest, "MoinMoinPackage|1\n") - - for pageid in listdir(pages_dir): - sort_manifest(pages_dir, pageid, output_manifest, no_translate) - - # Write the page package. - - page_package = ZipFile(package_zip, "w") - - try: - # Include the page revisions. - - versions_dir = join(space, "versions") - - for versionid in listdir(versions_dir): - page_package.write(join(versions_dir, versionid)) - - # Include only the top-level manifest. - - page_package.write(output_manifest, "MOIN_PACKAGE") - - finally: - page_package.close() - # vim: tabstop=4 expandtab shiftwidth=4