# ConfluenceConverter (file convert.py at 702a040785d7)

     #!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import sys
import xmlread
import parser

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the workspace directory 'space'. If
        'no_translate' is set to a true value, page bodies are written to the
        version files as they are instead of being translated.
        """

        # Properties collected for the object currently being read.

        self.content = {}

        # Elements collected for the collection currently being read.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        The object type is taken from the "class" attribute of the innermost
        entry in 'attributes', and the object identifier from 'text' (see
        handle_id). Page-like objects add a revision entry to a per-page
        manifest below the "pages" directory; body content objects write a
        version file below the "versions" directory. The collected properties
        are discarded afterwards, whatever the object type.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]
            if title:
                title = "%s/%s" % (self.space, title)

            # NOTE(review): the fields are written unescaped, whereas
            # sort_manifest splits the file into lines and each line on "|".
            # A "|" or newline inside a title or version comment would
            # therefore corrupt the entry - confirm whether escaping is needed.

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # Use the handler's own setting here: a bare 'no_translate' would
            # only resolve when this module is run as a script and happens to
            # define a global of that name.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is not an error, but any other failure is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the case where the directory is already present;
        # failures such as missing permissions must not pass unnoticed.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', as produced by the parser module. The file is encoded as UTF-8.
    """

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        parser.parse(body, out)
    finally:
        out.close()

def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision. If a
    'pagetitle' file exists, the title column in the manifest will be augmented
    with the contents of that file. If 'output' is given, the manifest details
    will be appended to the file having that filename instead of being rewritten
    to the original manifest file.

    The leading revision column is only used for sorting and is not included in
    the details written out.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries numerically by the revision number in the first column.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])
        result.append("|".join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)

def main():

    """
    Convert the dump named by the first program argument into the workspace
    directory named by the second, and write a page package archive named after
    the workspace. The "--no-translate" option causes page bodies to be stored
    without translation. The program exits with status 1 on usage errors or if
    the workspace directory or the package archive already exists.
    """

    try:
        filename = sys.argv[1]
        space = sys.argv[2]
    except IndexError:
        sys.stderr.write("Please specify an XML file containing Wiki data and a workspace name.\n")
        sys.stderr.write("For example: com_entities.xml COM\n")
        sys.exit(1)

    is_zipfile = splitext(filename)[-1] == extsep + "zip"
    no_translate = "--no-translate" in sys.argv

    if exists(space):
        sys.stderr.write("Directory exists for space %s. Please choose another or remove its contents.\n" % space)
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        sys.stderr.write("Page package exists. Please remove or rename it: %s\n" % package_zip)
        sys.exit(1)

    # Check the input before creating anything, so that a mistyped filename
    # does not leave behind a workspace directory that blocks the next attempt.

    if not exists(filename):
        sys.stderr.write("File not found: %s\n" % filename)
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is needed for zip archives on platforms
    # which distinguish text files from binary files.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")

        # A directory made for a comment which was referenced by a page but
        # never defined in the dump has a title file but no manifest.

        if not exists(manifest):
            continue

        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4