ConfluenceConverter (file convert.py at a22497c19c3e)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import listdir, mkdir, makedirs    25 from os.path import exists, extsep, join, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 import codecs    29 import xmlread    30     31 class ConfluenceHandler:    32     33     "Handle content from a Confluence Wiki dump."    34     35     def __init__(self, directory):    36         self.content = {}    37         self.elements = []    38         self.directory = directory    39     40     def handle_object(self, name, elements, attributes, all_text, text):    41     42         "Handle objects according to type."    43     44         objecttype = attributes[-1]["class"]    45         identifier = text.strip()    46         content = self.content    47     48         pages_dir = join(self.directory, "pages")    49         versions_dir = join(self.directory, "versions")    50     51         # Handle particular types.    52     53         if objecttype in ("Page", "Comment", "BlogPost"):    54     55             # Handle pages and revisions, adding revisions to the page manifest.    56             # The original version is used as a unifying identifier for all the    57             # different revisions (each of which being defined by a Page    58             # element). Although "original" implies the first identifier used,    59             # it actually appears to be the latest and will have the highest    60             # version number.    61     62             if content.has_key("originalVersion"):    63                 pageid = content["originalVersion"]    64             else:    65                 pageid = identifier    66     67             versionfile = join(versions_dir, identifier)    68     69             # Note page metadata, not necessarily in the correct order.    70             # For comments, the title will need to be rewritten, since they    71             # should be defined in terms of their owner page.    72     73             mkdirs(join(pages_dir, pageid))    74     75             append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (    76                 content["version"],    77                 versionfile,    78                 content["title"] or content["version"], # comment titles will incorporate the version    79                 content["lastModifierName"],    80                 content["versionComment"]))    81     82             # Write comments as subpages.    83     84             if content.has_key("comments"):    85     86                 # Define a page directory for each comment, and write the page    87                 # title in a special file for later processing.    88     89                 for _comment, commentid in content["comments"]:    90                     mkdirs(join(pages_dir, commentid))    91                     append(join(pages_dir, commentid, "pagetitle"), content["title"])    92     93             # Some metadata is not particularly relevant. For example,    94             # ancestors, children, parent are navigation-related.    95     96             # Other metadata could be added to the page content itself.    97             # For example, labelling could be converted to categories.    98     99         # Handle revisions.   100    101         elif objecttype == "BodyContent":   102             write(join(versions_dir, content["content"]), content["body"])   103    104         self.content = {}   105    106     def handle_property(self, name, elements, attributes, all_text, text):   107    108         "Record properties in the current content dictionary."   109    110         self.content[attributes[-1]["name"]] = text.strip()   111    112     def handle_id(self, name, elements, attributes, all_text, text):   113    114         "Promote identifiers to the parent element's text."   115    116         all_text[-2].append(text)   117    118     def handle_collection(self, name, elements, attributes, all_text, text):   119    120         "Record collections in the current content dictionary."   121    122         self.content[attributes[-1]["name"]] = self.elements   123         self.elements = []   124    125     def handle_element(self, name, elements, attributes, all_text, text):   126    127         "Add elements to the current collection."   128    129         self.elements.append((attributes[-1]["class"], text.strip()))   130    131 def mkdirs(name):   132    133     "Make the directory with the given 'name' at any depth."   134    135     try:   136         makedirs(name)   137     except OSError:   138         pass   139    140 def append(filename, s):   141    142     "Append to the file with the given 'filename' the string 's'."   143    144     write(filename, s, True)   145    146 def write(filename, s, append=False):   147    148     """   149     Write to the file with the given 'filename' the string 's'. If the optional   150     'append' parameter is set to a true value, 's' will be appended to the file.   151     """   152    153     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   154     try:   155         f.write(s)   156     finally:   157         f.close()   158    159 def read(filename):   160    161     """   162     Read from the file with the given 'filename', returning a string containing   163     its contents.   164     """   165    166     f = codecs.open(filename, encoding="utf-8")   167     try:   168         return f.read()   169     finally:   170         f.close()   171    172 def sort_manifest(filename, pagetitle, output=None):   173    174     """   175     Sort the manifest given in 'filename' according to revision. If a   176     'pagetitle' file exists, the title column in the manifest will be augmented   177     with the contents of that file. If 'output' is given, the manifest details   178     will be appended to the file having that filename instead of being rewritten   179     to the original manifest file.   180     """   181    182     if exists(pagetitle):   183         title = read(pagetitle)   184     else:   185         title = None   186    187     f = codecs.open(filename, "r", encoding="utf-8")   188     try:   189         lines = [x.split("|") for x in f.readlines()]   190         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   191    192         # Reconstruct the lines, optionally changing the titles.   193    194         result = []   195         for x in lines:   196             if title is not None:   197                 x[3] = "%s/%s" % (title, x[3])   198             result.append("|".join(x[1:]))   199     finally:   200         f.close()   201    202     s = "".join(result)   203    204     if output is None:   205         write(filename, s)   206     else:   207         append(output, s)   208    209 if __name__ == "__main__":   210     import sys   211    212     filename = sys.argv[1]   213     is_zipfile = splitext(filename)[-1] == extsep + "zip"   214    215     directory = sys.argv[2]   216    217     if exists(directory):   218         print >>sys.stderr, "Directory exists. Please choose another or remove its contents."   219         sys.exit(1)   220    221     mkdir(directory)   222     mkdirs(join(directory, "pages"))   223     mkdirs(join(directory, "versions"))   224    225     p = xmlread.ConfigurableParser()   226     handler = ConfluenceHandler(directory)   227    228     p["object"] = handler.handle_object   229     p["property"] = handler.handle_property   230     p["id"] = handler.handle_id   231     p["collection"] = handler.handle_collection   232     p["element"] = handler.handle_element   233    234     # Open the XML dump.   235    236     f = open(filename)   237    238     if is_zipfile:   239         zf = ZipFile(f)   240         ff = StringIO(zf.read("entities.xml"))   241     else:   242         ff = f   243    244     # Parse the data.   245    246     try:   247         p.parse(ff)   248     finally:   249         f.close()   250    251     # Tidy up the import manifests, sorting each of them by revision and   252     # finalising them.   253    254     pages_dir = join(directory, "pages")   255    256     output_manifest = join(directory, "MOIN_PACKAGE")   257     append(output_manifest, "MoinMoinPackage|1\n")   258    259     for pageid in listdir(pages_dir):   260         manifest = join(pages_dir, pageid, "manifest")   261         pagetitle = join(pages_dir, pageid, "pagetitle")   262         sort_manifest(manifest, pagetitle, output_manifest)   263    264     # Write the page package.   265    266     page_package = ZipFile(directory + extsep + "zip", "w")   267    268     try:   269         # Include the page revisions.   270    271         versions_dir = join(directory, "versions")   272    273         for versionid in listdir(versions_dir):   274             page_package.write(join(versions_dir, versionid))   275    276         # Include only the top-level manifest.   277    278         page_package.write(output_manifest, "MOIN_PACKAGE")   279    280     finally:   281         page_package.close()   282    283 # vim: tabstop=4 expandtab shiftwidth=4