# ConfluenceConverter (file convert.py at 4df6e1afb172)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import listdir, mkdir, makedirs    25 from os.path import exists, extsep, join, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 import codecs    29 import xmlread    30 import parser    31     32 MAX_TITLE_LENGTH = 120    33     34 class ConfluenceHandler:    35     36     "Handle content from a Confluence Wiki dump."    37     38     def __init__(self, space, no_translate=False):    39         self.content = {}    40         self.elements = []    41         self.space = space    42         self.no_translate = no_translate    43     44     def handle_object(self, name, elements, attributes, all_text, text):    45     46         "Handle objects according to type."    
47     48         objecttype = attributes[-1]["class"]    49         identifier = text.strip()    50         content = self.content    51     52         pages_dir = join(self.space, "pages")    53         versions_dir = join(self.space, "versions")    54     55         # Handle particular types.    56     57         if objecttype in ("Page", "Comment", "BlogPost"):    58     59             # Handle pages and revisions, adding revisions to the page manifest.    60             # The original version is used as a unifying identifier for all the    61             # different revisions (each of which being defined by a Page    62             # element). Although "original" implies the first identifier used,    63             # it actually appears to be the latest and will have the highest    64             # version number.    65     66             if content.has_key("originalVersion"):    67                 pageid = content["originalVersion"]    68             else:    69                 pageid = identifier    70     71             versionfile = join(versions_dir, identifier)    72     73             # Note page metadata, not necessarily in the correct order.    74             # For comments, the title will need to be rewritten, since they    75             # should be defined in terms of their owner page.    76     77             mkdirs(join(pages_dir, pageid))    78     79             title = content["title"]    80     81             # Limit the title to a "safe" number of characters in order to avoid    82             # filesystem issues.    
83     84             title = title[:MAX_TITLE_LENGTH]    85     86             if title:    87                 title = "%s/%s" % (self.space, title)    88     89             append(join(pages_dir, pageid, "manifest"),    90                 "%s|AddRevision|%s|%s|%s|%s\n" % (    91                     content["version"],    92                     versionfile,    93                     title or content["version"], # comment titles will incorporate the version    94                     content["lastModifierName"],    95                     content["versionComment"]    96                 ))    97     98             # Write comments as subpages.    99    100             if content.has_key("comments"):   101    102                 # Define a page directory for each comment, and write the page   103                 # title in a special file for later processing.   104    105                 for _comment, commentid in content["comments"]:   106                     mkdirs(join(pages_dir, commentid))   107                     append(join(pages_dir, commentid, "pagetitle"), title)   108    109             # Add information to parent pages for child page lists.   110    111             if content.has_key("parent"):   112                 parentid = content["parent"]   113                 mkdirs(join(pages_dir, parentid))   114                 append(join(pages_dir, parentid, "children"), title + "\n")   115    116             # Some metadata is not particularly relevant. For example,   117             # ancestors, children, parent are navigation-related.   118    119             # Other metadata could be added to the page content itself.   120             # For example, labelling could be converted to categories.   121    122         # Handle revisions.   123    124         elif objecttype == "BodyContent":   125             body = content["body"]   126             if not body:   127                 body = "## Empty page."   
128    129             if no_translate:   130                 fn = write   131             else:   132                 fn = translate   133    134             fn(join(versions_dir, content["content"]), body)   135    136         self.content = {}   137    138     def handle_property(self, name, elements, attributes, all_text, text):   139    140         "Record properties in the current content dictionary."   141    142         self.content[attributes[-1]["name"]] = text.strip()   143    144     def handle_id(self, name, elements, attributes, all_text, text):   145    146         "Promote identifiers to the parent element's text."   147    148         all_text[-2].append(text)   149    150     def handle_collection(self, name, elements, attributes, all_text, text):   151    152         "Record collections in the current content dictionary."   153    154         self.content[attributes[-1]["name"]] = self.elements   155         self.elements = []   156    157     def handle_element(self, name, elements, attributes, all_text, text):   158    159         "Add elements to the current collection."   160    161         self.elements.append((attributes[-1]["class"], text.strip()))   162    163 def mkdirs(name):   164    165     "Make the directory with the given 'name' at any depth."   166    167     try:   168         makedirs(name)   169     except OSError:   170         pass   171    172 def append(filename, s):   173    174     "Append to the file with the given 'filename' the string 's'."   175    176     write(filename, s, True)   177    178 def write(filename, s, append=False):   179    180     """   181     Write to the file with the given 'filename' the string 's'. If the optional   182     'append' parameter is set to a true value, 's' will be appended to the file.   
183     """   184    185     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   186     try:   187         f.write(s)   188     finally:   189         f.close()   190    191 def read(filename):   192    193     """   194     Read from the file with the given 'filename', returning a string containing   195     its contents.   196     """   197    198     f = codecs.open(filename, encoding="utf-8")   199     try:   200         return f.read()   201     finally:   202         f.close()   203    204 def translate(filename, body):   205    206     """   207     Write to the file with the given 'filename' a translation of the given   208     'body'.   209     """   210    211     out = codecs.open(filename, "w", encoding="utf-8")   212     try:   213         parser.parse(body, out)   214     finally:   215         out.close()   216    217 def sort_manifest(filename, pagetitle, output=None):   218    219     """   220     Sort the manifest given in 'filename' according to revision.   221    222     If a 'pagetitle' file exists, the title column in the manifest will be   223     augmented with the contents of that file. This is typically done for   224     comments.   225    226     If 'output' is given, the manifest details will be appended to the file   227     having that filename instead of being rewritten to the original manifest   228     file.   229     """   230    231     if exists(pagetitle):   232         title = read(pagetitle)   233     else:   234         title = None   235    236     f = codecs.open(filename, "r", encoding="utf-8")   237     try:   238         lines = [x.split("|") for x in f.readlines()]   239         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   240    241         # Reconstruct the lines, optionally changing the titles.   
242    243         result = []   244         for x in lines:   245             if title is not None:   246                 x[3] = "%s/%s" % (title, x[3])   247             result.append("|".join(x[1:]))   248     finally:   249         f.close()   250    251     s = "".join(result)   252    253     if output is None:   254         write(filename, s)   255     else:   256         append(output, s)   257    258 if __name__ == "__main__":   259     import sys   260    261     try:   262         filename = sys.argv[1]   263         is_zipfile = splitext(filename)[-1] == extsep + "zip"   264         space = sys.argv[2]   265     except IndexError:   266         print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."   267         print >>sys.stderr, "For example: com_entities.xml COM"   268         sys.exit(1)   269    270     no_translate = "--no-translate" in sys.argv   271    272     if exists(space):   273         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   274         sys.exit(1)   275    276     package_zip = space + extsep + "zip"   277    278     if exists(package_zip):   279         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   280         sys.exit(1)   281    282     mkdir(space)   283     mkdirs(join(space, "pages"))   284     mkdirs(join(space, "versions"))   285    286     p = xmlread.ConfigurableParser()   287     handler = ConfluenceHandler(space, no_translate)   288    289     # Register handlers in the parser for different elements.   290    291     p["object"] = handler.handle_object   292     p["property"] = handler.handle_property   293     p["id"] = handler.handle_id   294     p["collection"] = handler.handle_collection   295     p["element"] = handler.handle_element   296    297     # Open the XML dump.   
298    299     f = open(filename)   300    301     if is_zipfile:   302         zf = ZipFile(f)   303         ff = StringIO(zf.read("entities.xml"))   304     else:   305         ff = f   306    307     # Parse the data.   308    309     try:   310         p.parse(ff)   311     finally:   312         f.close()   313    314     # Tidy up the import manifests, sorting each of them by revision and   315     # finalising them.   316    317     pages_dir = join(space, "pages")   318    319     output_manifest = join(space, "MOIN_PACKAGE")   320     append(output_manifest, "MoinMoinPackage|1\n")   321    322     for pageid in listdir(pages_dir):   323         manifest = join(pages_dir, pageid, "manifest")   324         pagetitle = join(pages_dir, pageid, "pagetitle")   325         sort_manifest(manifest, pagetitle, output_manifest)   326    327     # Write the page package.   328    329     page_package = ZipFile(package_zip, "w")   330    331     try:   332         # Include the page revisions.   333    334         versions_dir = join(space, "versions")   335    336         for versionid in listdir(versions_dir):   337             page_package.write(join(versions_dir, versionid))   338    339         # Include only the top-level manifest.   340    341         page_package.write(output_manifest, "MOIN_PACKAGE")   342    343     finally:   344         page_package.close()   345    346 # vim: tabstop=4 expandtab shiftwidth=4