ConfluenceConverter (file convert.py at 33c7019a881e)

     #!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser
import sys

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    Properties and collections are accumulated in the 'content' dictionary as
    they are encountered, and are consumed (and the dictionary reset) each time
    an object has been handled. Output is written beneath the directory named
    by 'space'.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (used both as the output
        directory and as the prefix of page titles). If 'no_translate' is set
        to a true value, page bodies are written out without translation.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, using the properties collected in
        the current content dictionary, and then reset that dictionary.

        The object type is taken from the "class" attribute of the innermost
        entry in 'attributes'; any identifier is taken from 'text'.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier, content)

        # Handle revisions.

        elif objecttype == "BodyContent":
            self._handle_body(content)

        self.content = {}

    def _handle_page(self, objecttype, identifier, content):

        "Record page, comment and blog post metadata for 'identifier'."

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        pageid = content.get("originalVersion", identifier)
        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = content["title"][:MAX_TITLE_LENGTH]

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # See sort_manifest for access to this data.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
            ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self, content):

        "Write the revision body described by 'content' to its version file."

        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.
        # The handler's own setting is consulted here rather than a
        # module-level name, so that the class also works when this module is
        # imported instead of being run as a script.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        # Show the offending body before propagating any error.

        try:
            fn(join(versions_dir, content["content"]), body)
        except:
            print >>sys.stderr, "Error parsing..."
            print >>sys.stderr, body
            raise

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, starting a new,
        empty collection for any subsequent elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection as (class, text) tuples."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is tolerated; any other failure is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only an already-existing directory is an acceptable outcome.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the UTF-8 encoded file with the given 'filename', returning a
    string containing its contents.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is called with the body and the open output
    stream; where omitted, parser.parse is used.
    """

    fn = fn or parser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' produced using parser.xmlparse.
    """

    translate(filename, body, parser.xmlparse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationDate|commentid", so that sorting the
    # split lines orders the comments by creation date.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%s" % (title, position))

# Template for child page information.

child_page_section = """
----

%s
"""

def sort_manifest(pages_dir, pageid, output=None):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    replaced with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    Where the page has no manifest at all, nothing is done.
    """

    manifest = join(pages_dir, pageid, "manifest")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")

    # A page directory may exist only because the page was mentioned as a
    # parent or comment owner: without a manifest there are no revisions.

    if not exists(manifest):
        return

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Prepare any child page information once for use with every revision.

    if exists(children):
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()
        child_pages = [" * [[%s]]" % child_page_name for child_page_name in child_page_names]
        child_section = child_page_section % "\n".join(child_pages)
    else:
        child_section = None

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the revisions numerically by version.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:
        version, _addrevision, filename, old_title, username, comment = line

        # Replace title information with the information already present.

        if title is not None:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered.
        # The final field retains the newline read from the manifest.

        result.append("|".join((_addrevision, filename, new_title, username, comment)))

        # Add child page information to the content.

        if child_section is not None:
            append(filename, child_section)

    s = "".join(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Main program.

def main():

    """
    Convert the dump named by the first program argument into a page package
    for the workspace named by the second program argument.
    """

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used since the file may be a zip
    # archive.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")
    pageids = listdir(pages_dir)

    # All comments are given their titles before any manifest is finalised.

    for pageid in pageids:
        sort_comments(pages_dir, pageid)

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in pageids:
        sort_manifest(pages_dir, pageid, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4