ConfluenceConverter (file convert.py at 1b5923977b00)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33     34 MAX_TITLE_LENGTH = 120    35     36 class ConfluenceHandler:    37     38     "Handle content from a Confluence Wiki dump."    39     40     def __init__(self, space, no_translate=False):    41         self.content = {}    42         self.elements = []    43         self.space = space    44         self.no_translate = no_translate    45     46     def handle_object(self, name, elements, attributes, all_text, text):    47     48         """    49         Handle objects according to type. Objects appear as follows:    50     51         <object class="Page" package="...">    52         <id name="id">...</id>    53         ...    54         </object>    55     56         Within objects, one finds things like properties and collections, which    57         are handled by their own methods but which are stored in the content    58         dictionary associated with the current object.    59     60         By the time this method is called, the contents of the object will have    61         been gathered and the properties and collections populated in the    62         content dictionary. Any identifier will have been assigned to the    63         textual content of the object element and will be available in the    64         'text' parameter.    65         """    66     67         objecttype = attributes[-1]["class"]    68     69         # Any identifier is stored as the object's textual content.    70     71         identifier = text.strip()    72     73         # The content is a dictionary mapping names to properties and    74         # collections.    75     76         content = self.content    77     78         pages_dir = join(self.space, "pages")    79         versions_dir = join(self.space, "versions")    80     81         # Handle particular types.    82     83         if objecttype in ("Page", "Comment", "BlogPost"):    84     85             # Handle pages and revisions, adding revisions to the page manifest.    86             # The original version is used as a unifying identifier for all the    87             # different revisions (each of which being defined by a Page    88             # element). Although "original" implies the first identifier used,    89             # it actually appears to be the latest and will have the highest    90             # version number.    91     92             if content.has_key("originalVersion"):    93                 pageid = content["originalVersion"]    94             else:    95                 pageid = identifier    96     97             versionfile = join(versions_dir, identifier)    98     99             # Note page metadata, not necessarily in the correct order.   100             # For comments, the title will need to be rewritten, since they   101             # should be defined in terms of their owner page.   102    103             # NOTE: This only makes the current title available to comments.   104    105             mkdirs(join(pages_dir, pageid))   106    107             title = content["title"]   108    109             # Limit the title to a "safe" number of characters in order to avoid   110             # filesystem issues.   111    112             title = title[:MAX_TITLE_LENGTH]   113    114             if title:   115                 title = "%s/%s" % (self.space, title)   116                 write(join(pages_dir, pageid, "pagetitle"), title)   117    118             # See sort_manifest for access to this data.   119    120             append(join(pages_dir, pageid, "manifest"),   121                 "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment   122                     content["version"],   123                     versionfile,   124                     title, # comment titles will incorporate the comment's position   125                     content["lastModifierName"],   126                     content["versionComment"]   127                 ))   128    129             # Add information to parent pages for child page lists.   130    131             if content.has_key("parent"):   132                 parentid = content["parent"]   133                 mkdirs(join(pages_dir, parentid))   134                 append(join(pages_dir, parentid, "children"), title + "\n")   135    136             # Add creation details for comments to the owner page.   137             # Since comments can be versioned, the date of the original version   138             # is used, and only this "original" version has the owner property.   139    140             if objecttype == "Comment" and content.has_key("owner"):   141                 ownerid = content["owner"]   142                 mkdirs(join(pages_dir, ownerid))   143                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   144    145             # Some metadata is not particularly relevant. For example,   146             # ancestors, children, parent are navigation-related.   147    148             # Other metadata could be added to the page content itself.   149             # For example, labelling could be converted to categories.   150    151         # Handle revisions.   152    153         elif objecttype == "BodyContent":   154             body = content["body"]   155             if not body:   156                 body = "## Empty page."   157    158             # NOTE: Very simple technique employed for guessing the format.   159    160             if no_translate:   161                 fn = write   162             elif body.startswith("<"):   163                 fn = xmltranslate   164             else:   165                 fn = translate   166    167             try:   168                 fn(join(versions_dir, content["content"]), body)   169             except:   170                 err = codecs.getwriter("utf-8")(sys.stderr)   171                 print >>err, "Error parsing", content["content"]   172                 raise   173    174         # Handle attachments.   175    176         elif objecttype == "Attachment":   177             pageid = content["content"]   178             version = content["attachmentVersion"]   179    180             if content.has_key("originalVersion"):   181                 attachid = content["originalVersion"]   182             else:   183                 attachid = identifier   184    185             append(join(pages_dir, pageid, "attachments"),   186                 "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (   187                     version,   188                     # Have to "taint" archive filenames, although Moin will   189                     # probably handle package script filename tainting.   190                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   191                     wikiutil.taintfilename(content["fileName"]),   192                     "", # pagename is substituted later   193                     content["lastModifierName"],   194                     content["comment"]   195                 ))   196    197         self.content = {}   198    199     def handle_property(self, name, elements, attributes, all_text, text):   200    201         "Record properties in the current content dictionary."   202    203         self.content[attributes[-1]["name"]] = text.strip()   204    205     def handle_id(self, name, elements, attributes, all_text, text):   206    207         "Promote identifiers to the parent element's text."   208    209         all_text[-2].append(text)   210    211     def handle_collection(self, name, elements, attributes, all_text, text):   212    213         "Record collections in the current content dictionary."   214    215         self.content[attributes[-1]["name"]] = self.elements   216         self.elements = []   217    218     def handle_element(self, name, elements, attributes, all_text, text):   219    220         "Add elements to the current collection."   221    222         self.elements.append((attributes[-1]["class"], text.strip()))   223    224 def mkdirs(name):   225    226     "Make the directory with the given 'name' at any depth."   227    228     try:   229         makedirs(name)   230     except OSError:   231         pass   232    233 def append(filename, s):   234    235     "Append to the file with the given 'filename' the string 's'."   236    237     write(filename, s, True)   238    239 def write(filename, s, append=False):   240    241     """   242     Write to the file with the given 'filename' the string 's'. If the optional   243     'append' parameter is set to a true value, 's' will be appended to the file.   244     """   245    246     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   247     try:   248         f.write(s)   249     finally:   250         f.close()   251    252 def read(filename):   253    254     """   255     Read from the file with the given 'filename', returning a string containing   256     its contents.   257     """   258    259     f = codecs.open(filename, encoding="utf-8")   260     try:   261         return f.read()   262     finally:   263         f.close()   264    265 def translate(filename, body, fn=None):   266    267     """   268     Write to the file with the given 'filename' a translation of the given   269     'body'.   270     """   271    272     fn = fn or wikiparser.parse   273    274     out = codecs.open(filename, "w", encoding="utf-8")   275     try:   276         print >>out, "#pragma page-filename", filename   277         fn(body, out)   278     finally:   279         out.close()   280    281 def xmltranslate(filename, body):   282     translate(filename, body, xmlparser.parse)   283    284 def sort_comments(pages_dir, pageid):   285    286     """   287     Where 'pageid' has comments associated with it, sort them chronologically   288     and label the comment pages with the owner page's title and comment's   289     position in the chronological sequence. Such labelling is done by writing   290     a "pagetitle" file in each comment page's directory.   291     """   292    293     comments = join(pages_dir, pageid, "comments")   294    295     if not exists(comments):   296         return   297    298     title = read(join(pages_dir, pageid, "pagetitle"))   299    300     details = [line.split("|") for line in read(comments).split("\n") if line]   301     details.sort()   302    303     # Write the sorted comments list for testing purposes.   304    305     write(comments, "\n".join(["|".join(x) for x in details]))   306    307     # Define comments as subpages by setting their titles using this   308     # page's name/title and their position in the comments collection.   309    310     for position, (_lastmodified, commentid) in enumerate(details):   311    312         # In the page directory for each comment, write the page title in a   313         # special file for later processing.   314    315         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   316    317 def _sort_manifest(manifest, title):   318    319     """   320     Open the given 'manifest' and sort it according to revision so that it will   321     be added to MoinMoin in the correct order.   322    323     If a 'title' is provided, the title column in the manifest will be augmented   324     with that information. This is typically done for comments and is necessary   325     for attachments.   326    327     A list of manifest entries is returned.   328     """   329    330     f = codecs.open(manifest, "r", encoding="utf-8")   331     try:   332         lines = [x.split("|") for x in f.readlines()]   333         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   334    335         # Reconstruct the lines, optionally changing the titles.   336    337         result = []   338    339         for line in lines:   340             version, _action, _archive_filename, filename, old_title, username, comment = line   341    342             # Replace title information with the information already present.   343    344             if not old_title:   345                 new_title = title   346             else:   347                 new_title = old_title   348    349             # The version is omitted now that the manifest is ordered.   350    351             line = _action, _archive_filename, filename, new_title, username, comment   352             result.append(line)   353    354         return result   355    356     finally:   357         f.close()   358    359 def serialise_manifest(manifest):   360    361     """   362     Process the 'manifest' consisting of entries, removing superfluous columns.   363     """   364    365     result = []   366    367     for columns in manifest:   368         action = columns[0]   369         if action == "AddRevision":   370             columns = list(columns)   371             del columns[1]   372         result.append("|".join(columns))   373    374     return "".join(result)   375                376 def sort_manifest(pages_dir, pageid, output=None, no_translate=False):   377    378     """   379     Using the given 'pageid', locate the manifest for the page and any page   380     title information written to a "pagetitle" file.   381    382     Then sort the manifest according to revision so that it will be added to   383     MoinMoin in the correct order.   384    385     If a "pagetitle" file exists, the title column in the manifest will be   386     augmented with the contents of that file. This is typically done for   387     comments.   388    389     If a "children" file exists, the pages in that file will be added as a list   390     to the end of each revision's content.   391    392     If 'output' is given, the manifest details will be appended to the file   393     having that filename instead of being rewritten to the original manifest   394     file.   395     """   396    397     manifest = join(pages_dir, pageid, "manifest")   398     attachments = join(pages_dir, pageid, "attachments")   399     pagetitle = join(pages_dir, pageid, "pagetitle")   400     children = join(pages_dir, pageid, "children")   401     comments = join(pages_dir, pageid, "comments")   402    403     if exists(pagetitle):   404         title = read(pagetitle)   405     else:   406         title = None   407    408     # Sort the revision manifest.   409    410     result = _sort_manifest(manifest, title)   411    412     for _action, _archive_filename, filename, new_title, username, comment in result:   413    414         # Add child page information to the content.   415    416         if exists(children) and not no_translate:   417             child_pages = []   418             child_page_names = [x for x in read(children).split("\n") if x]   419             child_page_names.sort()   420    421             for child_page_name in child_page_names:   422                 child_pages.append(" * [[%s]]" % child_page_name)   423    424             append(filename, child_page_section % "\n".join(child_pages))   425    426         # Add comments to the content.   427    428         if exists(comments) and title and not no_translate:   429             append(filename, comment_section % title)   430    431     # Add the attachments to the manifest.   432    433     if exists(attachments):   434         result += _sort_manifest(attachments, title)   435    436     # Serialise the manifest.   437    438     s = serialise_manifest(result)   439    440     if output is None:   441         write(manifest, s)   442     else:   443         append(output, s)   444    445 # Template for child page information.   446    447 child_page_section = """   448 ----   449    450 %s   451 """   452    453 # Template for comments.   454    455 comment_section = """   456 ----   457    458 <<Include("^%s/")>>   459 """   460    461 # Main program.   462    463 if __name__ == "__main__":   464     try:   465         filename = sys.argv[1]   466         is_zipfile = splitext(filename)[-1] == extsep + "zip"   467         space = sys.argv[2]   468         if len(sys.argv) > 3 and sys.argv[3]:   469             attachments = sys.argv[3]   470         else:   471             attachments = None   472     except IndexError:   473         print >>sys.stderr, """   474 Please specify an XML file containing Wiki data, a workspace name, and an   475 optional attachments directory location. For example:   476    477 com_entities.xml COM attachments   478    479 Adding --no-translate will unpack the Wiki but not translate the content.   480 When doing so without an attachments directory, add an empty argument as   481 follows:   482    483 com_entities.xml COM '' --no-translate   484 """   485         sys.exit(1)   486    487     no_translate = "--no-translate" in sys.argv   488    489     if exists(space):   490         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   491         sys.exit(1)   492    493     package_zip = space + extsep + "zip"   494    495     if exists(package_zip):   496         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   497         sys.exit(1)   498    499     mkdir(space)   500     mkdirs(join(space, "pages"))   501     mkdirs(join(space, "versions"))   502    503     p = xmlread.ConfigurableParser()   504     handler = ConfluenceHandler(space, no_translate)   505    506     # Register handlers in the parser for different elements.   507    508     p["object"] = handler.handle_object   509     p["property"] = handler.handle_property   510     p["id"] = handler.handle_id   511     p["collection"] = handler.handle_collection   512     p["element"] = handler.handle_element   513    514     # Open the XML dump.   515    516     f = open(filename)   517    518     if is_zipfile:   519         zf = ZipFile(f)   520         ff = StringIO(zf.read("entities.xml"))   521     else:   522         ff = f   523    524     # Parse the data.   525    526     try:   527         p.parse(ff)   528    529         # Tidy up the import manifests, sorting each of them by revision and   530         # finalising them.   531    532         pages_dir = join(space, "pages")   533    534         for pageid in listdir(pages_dir):   535             sort_comments(pages_dir, pageid)   536    537         output_manifest = join(space, "MOIN_PACKAGE")   538         append(output_manifest, "MoinMoinPackage|1\n")   539    540         for pageid in listdir(pages_dir):   541             sort_manifest(pages_dir, pageid, output_manifest, no_translate)   542    543         # Write the page package.   544    545         page_package = ZipFile(package_zip, "w")   546    547         try:   548             # Include the page revisions.   549    550             versions_dir = join(space, "versions")   551    552             for versionid in listdir(versions_dir):   553                 page_package.write(join(versions_dir, versionid))   554    555             # Include the attachments.   556    557             if attachments:   558                 cwd = getcwd()   559                 chdir(split(attachments)[0])   560                 try:   561                     for path, dirnames, filenames in walk(split(attachments)[1]):   562                         for filename in filenames:   563                             # Have to "taint" archive filenames.   564                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   565                 finally:   566                     chdir(cwd)   567             elif is_zipfile:   568                 for filename in zf.namelist():   569                     if filename.startswith("attachments"):   570                         # Have to "taint" archive filenames.   571                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   572    573             # Include only the top-level manifest.   574    575             page_package.write(output_manifest, "MOIN_PACKAGE")   576    577         finally:   578             page_package.close()   579    580     finally:   581         f.close()   582    583 # vim: tabstop=4 expandtab shiftwidth=4