ConfluenceConverter (file convert.py at 9d8a9c36829b)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import listdir, mkdir, makedirs    25 from os.path import exists, extsep, join, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 import codecs    29 import xmlread    30 import wikiparser, xmlparser    31 import sys    32     33 MAX_TITLE_LENGTH = 120    34     35 class ConfluenceHandler:    36     37     "Handle content from a Confluence Wiki dump."    38     39     def __init__(self, space, no_translate=False):    40         self.content = {}    41         self.elements = []    42         self.space = space    43         self.no_translate = no_translate    44     45     def handle_object(self, name, elements, attributes, all_text, text):    46     47         "Handle objects according to type."    48     49         objecttype = attributes[-1]["class"]    50     51         # Any identifier is stored as the object's textual content.    52     53         identifier = text.strip()    54     55         # The content is a dictionary mapping names to properties and    56         # collections.    57     58         content = self.content    59     60         pages_dir = join(self.space, "pages")    61         versions_dir = join(self.space, "versions")    62     63         # Handle particular types.    64     65         if objecttype in ("Page", "Comment", "BlogPost"):    66     67             # Handle pages and revisions, adding revisions to the page manifest.    68             # The original version is used as a unifying identifier for all the    69             # different revisions (each of which being defined by a Page    70             # element). Although "original" implies the first identifier used,    71             # it actually appears to be the latest and will have the highest    72             # version number.    73     74             if content.has_key("originalVersion"):    75                 pageid = content["originalVersion"]    76             else:    77                 pageid = identifier    78     79             versionfile = join(versions_dir, identifier)    80     81             # Note page metadata, not necessarily in the correct order.    82             # For comments, the title will need to be rewritten, since they    83             # should be defined in terms of their owner page.    84     85             mkdirs(join(pages_dir, pageid))    86     87             title = content["title"]    88     89             # Limit the title to a "safe" number of characters in order to avoid    90             # filesystem issues.    91     92             title = title[:MAX_TITLE_LENGTH]    93     94             if title:    95                 title = "%s/%s" % (self.space, title)    96                 write(join(pages_dir, pageid, "pagetitle"), title)    97     98             # See sort_manifest for access to this data.    99    100             append(join(pages_dir, pageid, "manifest"),   101                 "%s|AddRevision|%s|%s|%s|%s\n" % (   102                     content["version"],   103                     versionfile,   104                     title, # comment titles will incorporate the comment's position   105                     content["lastModifierName"],   106                     content["versionComment"]   107                 ))   108    109             # Add information to parent pages for child page lists.   110    111             if content.has_key("parent"):   112                 parentid = content["parent"]   113                 mkdirs(join(pages_dir, parentid))   114                 append(join(pages_dir, parentid, "children"), title + "\n")   115    116             # Add creation details for comments to the owner page.   117             # Since comments can be versioned, the date of the original version   118             # is used, and only this "original" version has the owner property.   119    120             if objecttype == "Comment" and content.has_key("owner"):   121                 ownerid = content["owner"]   122                 mkdirs(join(pages_dir, ownerid))   123                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   124    125             # Some metadata is not particularly relevant. For example,   126             # ancestors, children, parent are navigation-related.   127    128             # Other metadata could be added to the page content itself.   129             # For example, labelling could be converted to categories.   130    131         # Handle revisions.   132    133         elif objecttype == "BodyContent":   134             body = content["body"]   135             if not body:   136                 body = "## Empty page."   137    138             # NOTE: Very simple technique employed for guessing the format.   139    140             if no_translate:   141                 fn = write   142             elif body.startswith("<"):   143                 fn = xmltranslate   144             else:   145                 fn = translate   146    147             try:   148                 fn(join(versions_dir, content["content"]), body)   149             except:   150                 print >>sys.stderr, "Error parsing..."   151                 print >>sys.stderr, body   152                 raise   153    154         self.content = {}   155    156     def handle_property(self, name, elements, attributes, all_text, text):   157    158         "Record properties in the current content dictionary."   159    160         self.content[attributes[-1]["name"]] = text.strip()   161    162     def handle_id(self, name, elements, attributes, all_text, text):   163    164         "Promote identifiers to the parent element's text."   165    166         all_text[-2].append(text)   167    168     def handle_collection(self, name, elements, attributes, all_text, text):   169    170         "Record collections in the current content dictionary."   171    172         self.content[attributes[-1]["name"]] = self.elements   173         self.elements = []   174    175     def handle_element(self, name, elements, attributes, all_text, text):   176    177         "Add elements to the current collection."   178    179         self.elements.append((attributes[-1]["class"], text.strip()))   180    181 def mkdirs(name):   182    183     "Make the directory with the given 'name' at any depth."   184    185     try:   186         makedirs(name)   187     except OSError:   188         pass   189    190 def append(filename, s):   191    192     "Append to the file with the given 'filename' the string 's'."   193    194     write(filename, s, True)   195    196 def write(filename, s, append=False):   197    198     """   199     Write to the file with the given 'filename' the string 's'. If the optional   200     'append' parameter is set to a true value, 's' will be appended to the file.   201     """   202    203     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   204     try:   205         f.write(s)   206     finally:   207         f.close()   208    209 def read(filename):   210    211     """   212     Read from the file with the given 'filename', returning a string containing   213     its contents.   214     """   215    216     f = codecs.open(filename, encoding="utf-8")   217     try:   218         return f.read()   219     finally:   220         f.close()   221    222 def translate(filename, body, fn=None):   223    224     """   225     Write to the file with the given 'filename' a translation of the given   226     'body'.   227     """   228    229     fn = fn or wikiparser.parse   230    231     out = codecs.open(filename, "w", encoding="utf-8")   232     try:   233         fn(body, out)   234     finally:   235         out.close()   236    237 def xmltranslate(filename, body):   238     translate(filename, body, xmlparser.parse)   239    240 def sort_comments(pages_dir, pageid):   241    242     """   243     Where 'pageid' has comments associated with it, sort them chronologically   244     and label the comment pages with the owner page's title and comment's   245     position in the chronological sequence. Such labelling is done by writing   246     a "pagetitle" file in each comment page's directory.   247     """   248    249     comments = join(pages_dir, pageid, "comments")   250    251     if not exists(comments):   252         return   253    254     title = read(join(pages_dir, pageid, "pagetitle"))   255    256     details = [line.split("|") for line in read(comments).split("\n") if line]   257     details.sort()   258    259     # Write the sorted comments list for testing purposes.   260    261     write(comments, "\n".join(["|".join(x) for x in details]))   262    263     # Define comments as subpages by setting their titles using this   264     # page's name/title and their position in the comments collection.   265    266     for position, (_lastmodified, commentid) in enumerate(details):   267    268         # In the page directory for each comment, write the page title in a   269         # special file for later processing.   270    271         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   272    273 def sort_manifest(pages_dir, pageid, output=None, no_translate=False):   274    275     """   276     Using the given 'pageid', locate the manifest for the page and any page   277     title information written to a "pagetitle" file.   278    279     Then sort the manifest according to revision so that it will be added to   280     MoinMoin in the correct order.   281    282     If a "pagetitle" file exists, the title column in the manifest will be   283     augmented with the contents of that file. This is typically done for   284     comments.   285    286     If a "children" file exists, the pages in that file will be added as a list   287     to the end of each revision's content.   288    289     If 'output' is given, the manifest details will be appended to the file   290     having that filename instead of being rewritten to the original manifest   291     file.   292     """   293    294     manifest = join(pages_dir, pageid, "manifest")   295     pagetitle = join(pages_dir, pageid, "pagetitle")   296     children = join(pages_dir, pageid, "children")   297     comments = join(pages_dir, pageid, "comments")   298    299     if exists(pagetitle):   300         title = read(pagetitle)   301     else:   302         title = None   303    304     f = codecs.open(manifest, "r", encoding="utf-8")   305     try:   306         lines = [x.split("|") for x in f.readlines()]   307         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   308    309         # Reconstruct the lines, optionally changing the titles.   310    311         result = []   312    313         for line in lines:   314             version, _addrevision, filename, old_title, username, comment = line   315    316             # Replace title information with the information already present.   317    318             if title is not None:   319                 new_title = title   320             else:   321                 new_title = old_title   322    323             # The version is omitted now that the manifest is ordered.   324    325             line = _addrevision, filename, new_title, username, comment   326             result.append("|".join(line))   327    328             # Add child page information to the content.   329    330             if exists(children) and not no_translate:   331                 child_pages = []   332                 child_page_names = [x for x in read(children).split("\n") if x]   333                 child_page_names.sort()   334    335                 for child_page_name in child_page_names:   336                     child_pages.append(" * [[%s]]" % child_page_name)   337    338                 append(filename, child_page_section % "\n".join(child_pages))   339    340             # Add comments to the content.   341    342             if exists(comments) and title and not no_translate:   343                 append(filename, comment_section % title)   344    345     finally:   346         f.close()   347    348     s = "".join(result)   349    350     if output is None:   351         write(manifest, s)   352     else:   353         append(output, s)   354    355 # Template for child page information.   356    357 child_page_section = """   358 ----   359    360 %s   361 """   362    363 # Template for comments.   364    365 comment_section = """   366 ----   367    368 <<Include("^%s/")>>   369 """   370    371 # Main program.   372    373 if __name__ == "__main__":   374     try:   375         filename = sys.argv[1]   376         is_zipfile = splitext(filename)[-1] == extsep + "zip"   377         space = sys.argv[2]   378     except IndexError:   379         print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."   380         print >>sys.stderr, "For example: com_entities.xml COM"   381         sys.exit(1)   382    383     no_translate = "--no-translate" in sys.argv   384    385     if exists(space):   386         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   387         sys.exit(1)   388    389     package_zip = space + extsep + "zip"   390    391     if exists(package_zip):   392         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   393         sys.exit(1)   394    395     mkdir(space)   396     mkdirs(join(space, "pages"))   397     mkdirs(join(space, "versions"))   398    399     p = xmlread.ConfigurableParser()   400     handler = ConfluenceHandler(space, no_translate)   401    402     # Register handlers in the parser for different elements.   403    404     p["object"] = handler.handle_object   405     p["property"] = handler.handle_property   406     p["id"] = handler.handle_id   407     p["collection"] = handler.handle_collection   408     p["element"] = handler.handle_element   409    410     # Open the XML dump.   411    412     f = open(filename)   413    414     if is_zipfile:   415         zf = ZipFile(f)   416         ff = StringIO(zf.read("entities.xml"))   417     else:   418         ff = f   419    420     # Parse the data.   421    422     try:   423         p.parse(ff)   424     finally:   425         f.close()   426    427     # Tidy up the import manifests, sorting each of them by revision and   428     # finalising them.   429    430     pages_dir = join(space, "pages")   431    432     for pageid in listdir(pages_dir):   433         sort_comments(pages_dir, pageid)   434    435     output_manifest = join(space, "MOIN_PACKAGE")   436     append(output_manifest, "MoinMoinPackage|1\n")   437    438     for pageid in listdir(pages_dir):   439         sort_manifest(pages_dir, pageid, output_manifest, no_translate)   440    441     # Write the page package.   442    443     page_package = ZipFile(package_zip, "w")   444    445     try:   446         # Include the page revisions.   447    448         versions_dir = join(space, "versions")   449    450         for versionid in listdir(versions_dir):   451             page_package.write(join(versions_dir, versionid))   452    453         # Include only the top-level manifest.   454    455         page_package.write(output_manifest, "MOIN_PACKAGE")   456    457     finally:   458         page_package.close()   459    460 # vim: tabstop=4 expandtab shiftwidth=4