ConfluenceConverter (file convert.py at 78dd5bd3e830)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33     34 MAX_TITLE_LENGTH = 120    35     36 class ConfluenceHandler:    37     38     "Handle content from a Confluence Wiki dump."    39     40     def __init__(self, space, no_translate=False):    41         self.content = {}    42         self.elements = []    43         self.space = space    44         self.no_translate = no_translate    45     46     def handle_object(self, name, elements, attributes, all_text, text):    47     48         """    49         Handle objects according to type. Objects appear as follows:    50     51         <object class="Page" package="...">    52         <id name="id">...</id>    53         ...    54         </object>    55     56         Within objects, one finds things like properties and collections, which    57         are handled by their own methods but which are stored in the content    58         dictionary associated with the current object.    59     60         By the time this method is called, the contents of the object will have    61         been gathered and the properties and collections populated in the    62         content dictionary. Any identifier will have been assigned to the    63         textual content of the object element and will be available in the    64         'text' parameter.    65         """    66     67         objecttype = attributes[-1]["class"]    68     69         # Any identifier is stored as the object's textual content.    70     71         identifier = text.strip()    72     73         # The content is a dictionary mapping names to properties and    74         # collections.    75     76         content = self.content    77     78         pages_dir = join(self.space, "pages")    79         versions_dir = join(self.space, "versions")    80     81         # Handle particular types.    82     83         if objecttype in ("Page", "Comment", "BlogPost"):    84     85             # Handle pages and revisions, adding revisions to the page manifest.    86             # The original version is used as a unifying identifier for all the    87             # different revisions (each of which being defined by a Page    88             # element). Although "original" implies the first identifier used,    89             # it actually appears to be the latest and will have the highest    90             # version number.    91     92             if content.has_key("originalVersion"):    93                 pageid = content["originalVersion"]    94             else:    95                 pageid = identifier    96     97             versionfile = join(versions_dir, identifier)    98     99             # Note page metadata, not necessarily in the correct order.   100             # For comments, the title will need to be rewritten, since they   101             # should be defined in terms of their owner page.   102    103             mkdirs(join(pages_dir, pageid))   104    105             title = content["title"]   106    107             # Limit the title to a "safe" number of characters in order to avoid   108             # filesystem issues.   109    110             title = title[:MAX_TITLE_LENGTH]   111    112             if title:   113                 title = "%s/%s" % (self.space, title)   114                 write(join(pages_dir, pageid, "pagetitle"), title)   115    116             # See sort_manifest for access to this data.   117    118             append(join(pages_dir, pageid, "manifest"),   119                 "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment   120                     content["version"],   121                     versionfile,   122                     title, # comment titles will incorporate the comment's position   123                     content["lastModifierName"],   124                     content["versionComment"]   125                 ))   126    127             # Add information to parent pages for child page lists.   128    129             if content.has_key("parent"):   130                 parentid = content["parent"]   131                 mkdirs(join(pages_dir, parentid))   132                 append(join(pages_dir, parentid, "children"), title + "\n")   133    134             # Add creation details for comments to the owner page.   135             # Since comments can be versioned, the date of the original version   136             # is used, and only this "original" version has the owner property.   137    138             if objecttype == "Comment" and content.has_key("owner"):   139                 ownerid = content["owner"]   140                 mkdirs(join(pages_dir, ownerid))   141                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   142    143             # Some metadata is not particularly relevant. For example,   144             # ancestors, children, parent are navigation-related.   145    146             # Other metadata could be added to the page content itself.   147             # For example, labelling could be converted to categories.   148    149         # Handle revisions.   150    151         elif objecttype == "BodyContent":   152             body = content["body"]   153             if not body:   154                 body = "## Empty page."   155    156             # NOTE: Very simple technique employed for guessing the format.   157    158             if no_translate:   159                 fn = write   160             elif body.startswith("<"):   161                 fn = xmltranslate   162             else:   163                 fn = translate   164    165             try:   166                 fn(join(versions_dir, content["content"]), body)   167             except:   168                 err = codecs.getwriter("utf-8")(sys.stderr)   169                 print >>err, "Error parsing", content["content"]   170                 raise   171    172         # Handle attachments.   173    174         elif objecttype == "Attachment":   175             pageid = content["content"]   176             version = content["attachmentVersion"]   177    178             if content.has_key("originalVersion"):   179                 attachid = content["originalVersion"]   180             else:   181                 attachid = identifier   182    183             append(join(pages_dir, pageid, "attachments"),   184                 "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (   185                     version,   186                     # Have to "taint" archive filenames, although Moin will   187                     # probably handle package script filename tainting.   188                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   189                     wikiutil.taintfilename(content["fileName"]),   190                     "", # pagename is substituted later   191                     content["lastModifierName"],   192                     content["comment"]   193                 ))   194    195         self.content = {}   196    197     def handle_property(self, name, elements, attributes, all_text, text):   198    199         "Record properties in the current content dictionary."   200    201         self.content[attributes[-1]["name"]] = text.strip()   202    203     def handle_id(self, name, elements, attributes, all_text, text):   204    205         "Promote identifiers to the parent element's text."   206    207         all_text[-2].append(text)   208    209     def handle_collection(self, name, elements, attributes, all_text, text):   210    211         "Record collections in the current content dictionary."   212    213         self.content[attributes[-1]["name"]] = self.elements   214         self.elements = []   215    216     def handle_element(self, name, elements, attributes, all_text, text):   217    218         "Add elements to the current collection."   219    220         self.elements.append((attributes[-1]["class"], text.strip()))   221    222 def mkdirs(name):   223    224     "Make the directory with the given 'name' at any depth."   225    226     try:   227         makedirs(name)   228     except OSError:   229         pass   230    231 def append(filename, s):   232    233     "Append to the file with the given 'filename' the string 's'."   234    235     write(filename, s, True)   236    237 def write(filename, s, append=False):   238    239     """   240     Write to the file with the given 'filename' the string 's'. If the optional   241     'append' parameter is set to a true value, 's' will be appended to the file.   242     """   243    244     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   245     try:   246         f.write(s)   247     finally:   248         f.close()   249    250 def read(filename):   251    252     """   253     Read from the file with the given 'filename', returning a string containing   254     its contents.   255     """   256    257     f = codecs.open(filename, encoding="utf-8")   258     try:   259         return f.read()   260     finally:   261         f.close()   262    263 def translate(filename, body, fn=None):   264    265     """   266     Write to the file with the given 'filename' a translation of the given   267     'body'.   268     """   269    270     fn = fn or wikiparser.parse   271    272     out = codecs.open(filename, "w", encoding="utf-8")   273     try:   274         print >>out, "#pragma page-filename", filename   275         fn(body, out)   276     finally:   277         out.close()   278    279 def xmltranslate(filename, body):   280     translate(filename, body, xmlparser.parse)   281    282 def sort_comments(pages_dir, pageid):   283    284     """   285     Where 'pageid' has comments associated with it, sort them chronologically   286     and label the comment pages with the owner page's title and comment's   287     position in the chronological sequence. Such labelling is done by writing   288     a "pagetitle" file in each comment page's directory.   289     """   290    291     comments = join(pages_dir, pageid, "comments")   292    293     if not exists(comments):   294         return   295    296     title = read(join(pages_dir, pageid, "pagetitle"))   297    298     details = [line.split("|") for line in read(comments).split("\n") if line]   299     details.sort()   300    301     # Write the sorted comments list for testing purposes.   302    303     write(comments, "\n".join(["|".join(x) for x in details]))   304    305     # Define comments as subpages by setting their titles using this   306     # page's name/title and their position in the comments collection.   307    308     for position, (_lastmodified, commentid) in enumerate(details):   309    310         # In the page directory for each comment, write the page title in a   311         # special file for later processing.   312    313         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   314    315 def _sort_manifest(manifest, title):   316    317     """   318     Open the given 'manifest' and sort it according to revision so that it will   319     be added to MoinMoin in the correct order.   320    321     If a 'title' is provided, the title column in the manifest will be augmented   322     with that information. This is typically done for comments and is necessary   323     for attachments.   324    325     A list of manifest entries is returned.   326     """   327    328     f = codecs.open(manifest, "r", encoding="utf-8")   329     try:   330         lines = [x.split("|") for x in f.readlines()]   331         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   332    333         # Reconstruct the lines, optionally changing the titles.   334    335         result = []   336    337         for line in lines:   338             version, _action, _archive_filename, filename, old_title, username, comment = line   339    340             # Replace title information with the information already present.   341    342             if title is not None:   343                 new_title = title   344             else:   345                 new_title = old_title   346    347             # The version is omitted now that the manifest is ordered.   348    349             line = _action, _archive_filename, filename, new_title, username, comment   350             result.append(line)   351    352         return result   353    354     finally:   355         f.close()   356    357 def serialise_manifest(manifest):   358    359     """   360     Process the 'manifest' consisting of entries, removing superfluous columns.   361     """   362    363     result = []   364    365     for columns in manifest:   366         action = columns[0]   367         if action == "AddRevision":   368             columns = list(columns)   369             del columns[1]   370         result.append("|".join(columns))   371    372     return "".join(result)   373                374 def sort_manifest(pages_dir, pageid, output=None, no_translate=False):   375    376     """   377     Using the given 'pageid', locate the manifest for the page and any page   378     title information written to a "pagetitle" file.   379    380     Then sort the manifest according to revision so that it will be added to   381     MoinMoin in the correct order.   382    383     If a "pagetitle" file exists, the title column in the manifest will be   384     augmented with the contents of that file. This is typically done for   385     comments.   386    387     If a "children" file exists, the pages in that file will be added as a list   388     to the end of each revision's content.   389    390     If 'output' is given, the manifest details will be appended to the file   391     having that filename instead of being rewritten to the original manifest   392     file.   393     """   394    395     manifest = join(pages_dir, pageid, "manifest")   396     attachments = join(pages_dir, pageid, "attachments")   397     pagetitle = join(pages_dir, pageid, "pagetitle")   398     children = join(pages_dir, pageid, "children")   399     comments = join(pages_dir, pageid, "comments")   400    401     if exists(pagetitle):   402         title = read(pagetitle)   403     else:   404         title = None   405    406     # Sort the revision manifest.   407    408     result = _sort_manifest(manifest, title)   409    410     for _action, _archive_filename, filename, new_title, username, comment in result:   411    412         # Add child page information to the content.   413    414         if exists(children) and not no_translate:   415             child_pages = []   416             child_page_names = [x for x in read(children).split("\n") if x]   417             child_page_names.sort()   418    419             for child_page_name in child_page_names:   420                 child_pages.append(" * [[%s]]" % child_page_name)   421    422             append(filename, child_page_section % "\n".join(child_pages))   423    424         # Add comments to the content.   425    426         if exists(comments) and title and not no_translate:   427             append(filename, comment_section % title)   428    429     # Add the attachments to the manifest.   430    431     if exists(attachments):   432         result += _sort_manifest(attachments, title)   433    434     # Serialise the manifest.   435    436     s = serialise_manifest(result)   437    438     if output is None:   439         write(manifest, s)   440     else:   441         append(output, s)   442    443 # Template for child page information.   444    445 child_page_section = """   446 ----   447    448 %s   449 """   450    451 # Template for comments.   452    453 comment_section = """   454 ----   455    456 <<Include("^%s/")>>   457 """   458    459 # Main program.   460    461 if __name__ == "__main__":   462     try:   463         filename = sys.argv[1]   464         is_zipfile = splitext(filename)[-1] == extsep + "zip"   465         space = sys.argv[2]   466         if len(sys.argv) > 3 and sys.argv[3]:   467             attachments = sys.argv[3]   468         else:   469             attachments = None   470     except IndexError:   471         print >>sys.stderr, "Please specify an XML file containing Wiki data, a workspace name,"   472         print >>sys.stderr, "and an optional attachments directory location."   473         print >>sys.stderr, "For example: com_entities.xml COM"   474         sys.exit(1)   475    476     no_translate = "--no-translate" in sys.argv   477    478     if exists(space):   479         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   480         sys.exit(1)   481    482     package_zip = space + extsep + "zip"   483    484     if exists(package_zip):   485         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   486         sys.exit(1)   487    488     mkdir(space)   489     mkdirs(join(space, "pages"))   490     mkdirs(join(space, "versions"))   491    492     p = xmlread.ConfigurableParser()   493     handler = ConfluenceHandler(space, no_translate)   494    495     # Register handlers in the parser for different elements.   496    497     p["object"] = handler.handle_object   498     p["property"] = handler.handle_property   499     p["id"] = handler.handle_id   500     p["collection"] = handler.handle_collection   501     p["element"] = handler.handle_element   502    503     # Open the XML dump.   504    505     f = open(filename)   506    507     if is_zipfile:   508         zf = ZipFile(f)   509         ff = StringIO(zf.read("entities.xml"))   510     else:   511         ff = f   512    513     # Parse the data.   514    515     try:   516         p.parse(ff)   517    518         # Tidy up the import manifests, sorting each of them by revision and   519         # finalising them.   520    521         pages_dir = join(space, "pages")   522    523         for pageid in listdir(pages_dir):   524             sort_comments(pages_dir, pageid)   525    526         output_manifest = join(space, "MOIN_PACKAGE")   527         append(output_manifest, "MoinMoinPackage|1\n")   528    529         for pageid in listdir(pages_dir):   530             sort_manifest(pages_dir, pageid, output_manifest, no_translate)   531    532         # Write the page package.   533    534         page_package = ZipFile(package_zip, "w")   535    536         try:   537             # Include the page revisions.   538    539             versions_dir = join(space, "versions")   540    541             for versionid in listdir(versions_dir):   542                 page_package.write(join(versions_dir, versionid))   543    544             # Include the attachments.   545    546             if attachments:   547                 cwd = getcwd()   548                 chdir(split(attachments)[0])   549                 try:   550                     for path, dirnames, filenames in walk(split(attachments)[1]):   551                         for filename in filenames:   552                             # Have to "taint" archive filenames.   553                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   554                 finally:   555                     chdir(cwd)   556             elif is_zipfile:   557                 for filename in zf.namelist():   558                     if filename.startswith("attachments"):   559                         # Have to "taint" archive filenames.   560                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   561    562             # Include only the top-level manifest.   563    564             page_package.write(output_manifest, "MOIN_PACKAGE")   565    566         finally:   567             page_package.close()   568    569     finally:   570         f.close()   571    572 # vim: tabstop=4 expandtab shiftwidth=4