ConfluenceConverter (file convert.py at e0920cd59970)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import listdir, mkdir, makedirs    25 from os.path import exists, extsep, join, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 import codecs    29 import xmlread    30 import parser    31 import sys    32     33 MAX_TITLE_LENGTH = 120    34     35 class ConfluenceHandler:    36     37     "Handle content from a Confluence Wiki dump."    38     39     def __init__(self, space, no_translate=False):    40         self.content = {}    41         self.elements = []    42         self.space = space    43         self.no_translate = no_translate    44     45     def handle_object(self, name, elements, attributes, all_text, text):    46     47         "Handle objects according to type."    48     49         objecttype = attributes[-1]["class"]    50     51         # Any identifier is stored as the object's textual content.    52     53         identifier = text.strip()    54     55         # The content is a dictionary mapping names to properties and    56         # collections.    57     58         content = self.content    59     60         pages_dir = join(self.space, "pages")    61         versions_dir = join(self.space, "versions")    62     63         # Handle particular types.    64     65         if objecttype in ("Page", "Comment", "BlogPost"):    66     67             # Handle pages and revisions, adding revisions to the page manifest.    68             # The original version is used as a unifying identifier for all the    69             # different revisions (each of which being defined by a Page    70             # element). Although "original" implies the first identifier used,    71             # it actually appears to be the latest and will have the highest    72             # version number.    73     74             if content.has_key("originalVersion"):    75                 pageid = content["originalVersion"]    76             else:    77                 pageid = identifier    78     79             versionfile = join(versions_dir, identifier)    80     81             # Note page metadata, not necessarily in the correct order.    82             # For comments, the title will need to be rewritten, since they    83             # should be defined in terms of their owner page.    84     85             mkdirs(join(pages_dir, pageid))    86     87             title = content["title"]    88     89             # Limit the title to a "safe" number of characters in order to avoid    90             # filesystem issues.    91     92             title = title[:MAX_TITLE_LENGTH]    93     94             if title:    95                 title = "%s/%s" % (self.space, title)    96     97             append(join(pages_dir, pageid, "manifest"),    98                 "%s|AddRevision|%s|%s|%s|%s\n" % (    99                     content["version"],   100                     versionfile,   101                     title or content["version"], # comment titles will incorporate the version   102                     content["lastModifierName"],   103                     content["versionComment"]   104                 ))   105    106             # Write comments as subpages.   107    108             if content.has_key("comments"):   109    110                 # Define a page directory for each comment, and write the page   111                 # title in a special file for later processing.   112    113                 for _comment, commentid in content["comments"]:   114                     mkdirs(join(pages_dir, commentid))   115                     append(join(pages_dir, commentid, "pagetitle"), title)   116    117             # Add information to parent pages for child page lists.   118    119             if content.has_key("parent"):   120                 parentid = content["parent"]   121                 mkdirs(join(pages_dir, parentid))   122                 append(join(pages_dir, parentid, "children"), title + "\n")   123    124             # Some metadata is not particularly relevant. For example,   125             # ancestors, children, parent are navigation-related.   126    127             # Other metadata could be added to the page content itself.   128             # For example, labelling could be converted to categories.   129    130         # Handle revisions.   131    132         elif objecttype == "BodyContent":   133             body = content["body"]   134             if not body:   135                 body = "## Empty page."   136    137             # NOTE: Very simple technique employed for guessing the format.   138    139             if no_translate:   140                 fn = write   141             elif body.startswith("<"):   142                 fn = xmltranslate   143             else:   144                 fn = translate   145    146             try:   147                 fn(join(versions_dir, content["content"]), body)   148             except:   149                 print >>sys.stderr, "Error parsing..."   150                 print >>sys.stderr, body   151                 raise   152    153         self.content = {}   154    155     def handle_property(self, name, elements, attributes, all_text, text):   156    157         "Record properties in the current content dictionary."   158    159         self.content[attributes[-1]["name"]] = text.strip()   160    161     def handle_id(self, name, elements, attributes, all_text, text):   162    163         "Promote identifiers to the parent element's text."   164    165         all_text[-2].append(text)   166    167     def handle_collection(self, name, elements, attributes, all_text, text):   168    169         "Record collections in the current content dictionary."   170    171         self.content[attributes[-1]["name"]] = self.elements   172         self.elements = []   173    174     def handle_element(self, name, elements, attributes, all_text, text):   175    176         "Add elements to the current collection."   177    178         self.elements.append((attributes[-1]["class"], text.strip()))   179    180 def mkdirs(name):   181    182     "Make the directory with the given 'name' at any depth."   183    184     try:   185         makedirs(name)   186     except OSError:   187         pass   188    189 def append(filename, s):   190    191     "Append to the file with the given 'filename' the string 's'."   192    193     write(filename, s, True)   194    195 def write(filename, s, append=False):   196    197     """   198     Write to the file with the given 'filename' the string 's'. If the optional   199     'append' parameter is set to a true value, 's' will be appended to the file.   200     """   201    202     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   203     try:   204         f.write(s)   205     finally:   206         f.close()   207    208 def read(filename):   209    210     """   211     Read from the file with the given 'filename', returning a string containing   212     its contents.   213     """   214    215     f = codecs.open(filename, encoding="utf-8")   216     try:   217         return f.read()   218     finally:   219         f.close()   220    221 def translate(filename, body, fn=None):   222    223     """   224     Write to the file with the given 'filename' a translation of the given   225     'body'.   226     """   227    228     fn = fn or parser.parse   229    230     out = codecs.open(filename, "w", encoding="utf-8")   231     try:   232         fn(body, out)   233     finally:   234         out.close()   235    236 def xmltranslate(filename, body):   237     translate(filename, body, parser.xmlparse)   238    239 def sort_manifest(filename, pagetitle, output=None):   240    241     """   242     Sort the manifest given in 'filename' according to revision.   243    244     If a 'pagetitle' file exists, the title column in the manifest will be   245     augmented with the contents of that file. This is typically done for   246     comments.   247    248     If 'output' is given, the manifest details will be appended to the file   249     having that filename instead of being rewritten to the original manifest   250     file.   251     """   252    253     if exists(pagetitle):   254         title = read(pagetitle)   255     else:   256         title = None   257    258     f = codecs.open(filename, "r", encoding="utf-8")   259     try:   260         lines = [x.split("|") for x in f.readlines()]   261         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   262    263         # Reconstruct the lines, optionally changing the titles.   264    265         result = []   266         for x in lines:   267             if title is not None:   268                 x[3] = "%s/%s" % (title, x[3])   269             result.append("|".join(x[1:]))   270     finally:   271         f.close()   272    273     s = "".join(result)   274    275     if output is None:   276         write(filename, s)   277     else:   278         append(output, s)   279    280 if __name__ == "__main__":   281     try:   282         filename = sys.argv[1]   283         is_zipfile = splitext(filename)[-1] == extsep + "zip"   284         space = sys.argv[2]   285     except IndexError:   286         print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."   287         print >>sys.stderr, "For example: com_entities.xml COM"   288         sys.exit(1)   289    290     no_translate = "--no-translate" in sys.argv   291    292     if exists(space):   293         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   294         sys.exit(1)   295    296     package_zip = space + extsep + "zip"   297    298     if exists(package_zip):   299         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   300         sys.exit(1)   301    302     mkdir(space)   303     mkdirs(join(space, "pages"))   304     mkdirs(join(space, "versions"))   305    306     p = xmlread.ConfigurableParser()   307     handler = ConfluenceHandler(space, no_translate)   308    309     # Register handlers in the parser for different elements.   310    311     p["object"] = handler.handle_object   312     p["property"] = handler.handle_property   313     p["id"] = handler.handle_id   314     p["collection"] = handler.handle_collection   315     p["element"] = handler.handle_element   316    317     # Open the XML dump.   318    319     f = open(filename)   320    321     if is_zipfile:   322         zf = ZipFile(f)   323         ff = StringIO(zf.read("entities.xml"))   324     else:   325         ff = f   326    327     # Parse the data.   328    329     try:   330         p.parse(ff)   331     finally:   332         f.close()   333    334     # Tidy up the import manifests, sorting each of them by revision and   335     # finalising them.   336    337     pages_dir = join(space, "pages")   338    339     output_manifest = join(space, "MOIN_PACKAGE")   340     append(output_manifest, "MoinMoinPackage|1\n")   341    342     for pageid in listdir(pages_dir):   343         manifest = join(pages_dir, pageid, "manifest")   344         pagetitle = join(pages_dir, pageid, "pagetitle")   345         sort_manifest(manifest, pagetitle, output_manifest)   346    347     # Write the page package.   348    349     page_package = ZipFile(package_zip, "w")   350    351     try:   352         # Include the page revisions.   353    354         versions_dir = join(space, "versions")   355    356         for versionid in listdir(versions_dir):   357             page_package.write(join(versions_dir, versionid))   358    359         # Include only the top-level manifest.   360    361         page_package.write(output_manifest, "MOIN_PACKAGE")   362    363     finally:   364         page_package.close()   365    366 # vim: tabstop=4 expandtab shiftwidth=4