ConfluenceConverter (file convert.py at 8479d1acb570)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import chdir, getcwd, listdir, mkdir, makedirs, walk    25 from os.path import exists, extsep, join, split, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 from MoinMoin import wikiutil    29 import codecs    30 import xmlread    31 import wikiparser, xmlparser    32 import sys    33     34 MAX_TITLE_LENGTH = 120    35     36 class ConfluenceHandler:    37     38     "Handle content from a Confluence Wiki dump."    39     40     def __init__(self, space, no_translate=False):    41         self.content = {}    42         self.elements = []    43         self.space = space    44         self.no_translate = no_translate    45     46     def handle_object(self, name, elements, attributes, all_text, text):    47     48         """    49         Handle objects according to type. Objects appear as follows:    50     51         <object class="Page" package="...">    52         <id name="id">...</id>    53         ...    54         </object>    55     56         Within objects, one finds things like properties and collections, which    57         are handled by their own methods but which are stored in the content    58         dictionary associated with the current object.    59     60         By the time this method is called, the contents of the object will have    61         been gathered and the properties and collections populated in the    62         content dictionary. Any identifier will have been assigned to the    63         textual content of the object element and will be available in the    64         'text' parameter.    65         """    66     67         objecttype = attributes[-1]["class"]    68     69         # Any identifier is stored as the object's textual content.    70     71         identifier = text.strip()    72     73         # The content is a dictionary mapping names to properties and    74         # collections.    75     76         content = self.content    77     78         pages_dir = join(self.space, "pages")    79         versions_dir = join(self.space, "versions")    80     81         # Handle particular types.    82     83         if objecttype in ("Page", "Comment", "BlogPost"):    84     85             # Handle pages and revisions, adding revisions to the page manifest.    86             # The original version is used as a unifying identifier for all the    87             # different revisions (each of which being defined by a Page    88             # element). Although "original" implies the first identifier used,    89             # it actually appears to be the latest and will have the highest    90             # version number.    91     92             if content.has_key("originalVersion"):    93                 pageid = content["originalVersion"]    94             else:    95                 pageid = identifier    96     97             versionfile = join(versions_dir, identifier)    98     99             # Note page metadata, not necessarily in the correct order.   100             # For comments, the title will need to be rewritten, since they   101             # should be defined in terms of their owner page.   102    103             mkdirs(join(pages_dir, pageid))   104    105             title = content["title"]   106    107             # Limit the title to a "safe" number of characters in order to avoid   108             # filesystem issues.   109    110             title = title[:MAX_TITLE_LENGTH]   111    112             if title:   113                 title = "%s/%s" % (self.space, title)   114                 write(join(pages_dir, pageid, "pagetitle"), title)   115    116             # See sort_manifest for access to this data.   117    118             append(join(pages_dir, pageid, "manifest"),   119                 "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment   120                     content["version"],   121                     versionfile,   122                     title, # comment titles will incorporate the comment's position   123                     content["lastModifierName"],   124                     content["versionComment"]   125                 ))   126    127             # Add information to parent pages for child page lists.   128    129             if content.has_key("parent"):   130                 parentid = content["parent"]   131                 mkdirs(join(pages_dir, parentid))   132                 append(join(pages_dir, parentid, "children"), title + "\n")   133    134             # Add creation details for comments to the owner page.   135             # Since comments can be versioned, the date of the original version   136             # is used, and only this "original" version has the owner property.   137    138             if objecttype == "Comment" and content.has_key("owner"):   139                 ownerid = content["owner"]   140                 mkdirs(join(pages_dir, ownerid))   141                 append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))   142    143             # Some metadata is not particularly relevant. For example,   144             # ancestors, children, parent are navigation-related.   145    146             # Other metadata could be added to the page content itself.   147             # For example, labelling could be converted to categories.   148    149         # Handle revisions.   150    151         elif objecttype == "BodyContent":   152             body = content["body"]   153             if not body:   154                 body = "## Empty page."   155    156             # NOTE: Very simple technique employed for guessing the format.   157    158             if no_translate:   159                 fn = write   160             elif body.startswith("<"):   161                 fn = xmltranslate   162             else:   163                 fn = translate   164    165             try:   166                 fn(join(versions_dir, content["content"]), body)   167             except:   168                 err = codecs.getwriter("utf-8")(sys.stderr)   169                 print >>err, "Error parsing", content["content"]   170                 raise   171    172         # Handle attachments.   173    174         elif objecttype == "Attachment":   175             pageid = content["content"]   176             version = content["attachmentVersion"]   177    178             if content.has_key("originalVersion"):   179                 attachid = content["originalVersion"]   180             else:   181                 attachid = identifier   182    183             append(join(pages_dir, pageid, "attachments"),   184                 "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (   185                     version,   186                     # Have to "taint" archive filenames, although Moin will   187                     # probably handle package script filename tainting.   188                     wikiutil.taintfilename(join("attachments", pageid, attachid, version)),   189                     wikiutil.taintfilename(content["fileName"]),   190                     "", # pagename is substituted later   191                     content["lastModifierName"],   192                     content["comment"]   193                 ))   194    195         self.content = {}   196    197     def handle_property(self, name, elements, attributes, all_text, text):   198    199         "Record properties in the current content dictionary."   200    201         self.content[attributes[-1]["name"]] = text.strip()   202    203     def handle_id(self, name, elements, attributes, all_text, text):   204    205         "Promote identifiers to the parent element's text."   206    207         all_text[-2].append(text)   208    209     def handle_collection(self, name, elements, attributes, all_text, text):   210    211         "Record collections in the current content dictionary."   212    213         self.content[attributes[-1]["name"]] = self.elements   214         self.elements = []   215    216     def handle_element(self, name, elements, attributes, all_text, text):   217    218         "Add elements to the current collection."   219    220         self.elements.append((attributes[-1]["class"], text.strip()))   221    222 def mkdirs(name):   223    224     "Make the directory with the given 'name' at any depth."   225    226     try:   227         makedirs(name)   228     except OSError:   229         pass   230    231 def append(filename, s):   232    233     "Append to the file with the given 'filename' the string 's'."   234    235     write(filename, s, True)   236    237 def write(filename, s, append=False):   238    239     """   240     Write to the file with the given 'filename' the string 's'. If the optional   241     'append' parameter is set to a true value, 's' will be appended to the file.   242     """   243    244     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   245     try:   246         f.write(s)   247     finally:   248         f.close()   249    250 def read(filename):   251    252     """   253     Read from the file with the given 'filename', returning a string containing   254     its contents.   255     """   256    257     f = codecs.open(filename, encoding="utf-8")   258     try:   259         return f.read()   260     finally:   261         f.close()   262    263 def translate(filename, body, fn=None):   264    265     """   266     Write to the file with the given 'filename' a translation of the given   267     'body'.   268     """   269    270     fn = fn or wikiparser.parse   271    272     out = codecs.open(filename, "w", encoding="utf-8")   273     try:   274         fn(body, out)   275     finally:   276         out.close()   277    278 def xmltranslate(filename, body):   279     translate(filename, body, xmlparser.parse)   280    281 def sort_comments(pages_dir, pageid):   282    283     """   284     Where 'pageid' has comments associated with it, sort them chronologically   285     and label the comment pages with the owner page's title and comment's   286     position in the chronological sequence. Such labelling is done by writing   287     a "pagetitle" file in each comment page's directory.   288     """   289    290     comments = join(pages_dir, pageid, "comments")   291    292     if not exists(comments):   293         return   294    295     title = read(join(pages_dir, pageid, "pagetitle"))   296    297     details = [line.split("|") for line in read(comments).split("\n") if line]   298     details.sort()   299    300     # Write the sorted comments list for testing purposes.   301    302     write(comments, "\n".join(["|".join(x) for x in details]))   303    304     # Define comments as subpages by setting their titles using this   305     # page's name/title and their position in the comments collection.   306    307     for position, (_lastmodified, commentid) in enumerate(details):   308    309         # In the page directory for each comment, write the page title in a   310         # special file for later processing.   311    312         write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))   313    314 def _sort_manifest(manifest, title):   315    316     """   317     Open the given 'manifest' and sort it according to revision so that it will   318     be added to MoinMoin in the correct order.   319    320     If a 'title' is provided, the title column in the manifest will be augmented   321     with that information. This is typically done for comments and is necessary   322     for attachments.   323    324     A list of manifest entries is returned.   325     """   326    327     f = codecs.open(manifest, "r", encoding="utf-8")   328     try:   329         lines = [x.split("|") for x in f.readlines()]   330         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   331    332         # Reconstruct the lines, optionally changing the titles.   333    334         result = []   335    336         for line in lines:   337             version, _action, _archive_filename, filename, old_title, username, comment = line   338    339             # Replace title information with the information already present.   340    341             if title is not None:   342                 new_title = title   343             else:   344                 new_title = old_title   345    346             # The version is omitted now that the manifest is ordered.   347    348             line = _action, _archive_filename, filename, new_title, username, comment   349             result.append(line)   350    351         return result   352    353     finally:   354         f.close()   355    356 def serialise_manifest(manifest):   357    358     """   359     Process the 'manifest' consisting of entries, removing superfluous columns.   360     """   361    362     result = []   363    364     for columns in manifest:   365         action = columns[0]   366         if action == "AddRevision":   367             columns = list(columns)   368             del columns[1]   369         result.append("|".join(columns))   370    371     return "".join(result)   372                373 def sort_manifest(pages_dir, pageid, output=None, no_translate=False):   374    375     """   376     Using the given 'pageid', locate the manifest for the page and any page   377     title information written to a "pagetitle" file.   378    379     Then sort the manifest according to revision so that it will be added to   380     MoinMoin in the correct order.   381    382     If a "pagetitle" file exists, the title column in the manifest will be   383     augmented with the contents of that file. This is typically done for   384     comments.   385    386     If a "children" file exists, the pages in that file will be added as a list   387     to the end of each revision's content.   388    389     If 'output' is given, the manifest details will be appended to the file   390     having that filename instead of being rewritten to the original manifest   391     file.   392     """   393    394     manifest = join(pages_dir, pageid, "manifest")   395     attachments = join(pages_dir, pageid, "attachments")   396     pagetitle = join(pages_dir, pageid, "pagetitle")   397     children = join(pages_dir, pageid, "children")   398     comments = join(pages_dir, pageid, "comments")   399    400     if exists(pagetitle):   401         title = read(pagetitle)   402     else:   403         title = None   404    405     # Sort the revision manifest.   406    407     result = _sort_manifest(manifest, title)   408    409     for _action, _archive_filename, filename, new_title, username, comment in result:   410    411         # Add child page information to the content.   412    413         if exists(children) and not no_translate:   414             child_pages = []   415             child_page_names = [x for x in read(children).split("\n") if x]   416             child_page_names.sort()   417    418             for child_page_name in child_page_names:   419                 child_pages.append(" * [[%s]]" % child_page_name)   420    421             append(filename, child_page_section % "\n".join(child_pages))   422    423         # Add comments to the content.   424    425         if exists(comments) and title and not no_translate:   426             append(filename, comment_section % title)   427    428     # Add the attachments to the manifest.   429    430     if exists(attachments):   431         result += _sort_manifest(attachments, title)   432    433     # Serialise the manifest.   434    435     s = serialise_manifest(result)   436    437     if output is None:   438         write(manifest, s)   439     else:   440         append(output, s)   441    442 # Template for child page information.   443    444 child_page_section = """   445 ----   446    447 %s   448 """   449    450 # Template for comments.   451    452 comment_section = """   453 ----   454    455 <<Include("^%s/")>>   456 """   457    458 # Main program.   459    460 if __name__ == "__main__":   461     try:   462         filename = sys.argv[1]   463         is_zipfile = splitext(filename)[-1] == extsep + "zip"   464         space = sys.argv[2]   465         if len(sys.argv) > 3:   466             attachments = sys.argv[3]   467         else:   468             attachments = None   469     except IndexError:   470         print >>sys.stderr, "Please specify an XML file containing Wiki data, a workspace name,"   471         print >>sys.stderr, "and an optional attachments directory location."   472         print >>sys.stderr, "For example: com_entities.xml COM"   473         sys.exit(1)   474    475     no_translate = "--no-translate" in sys.argv   476    477     if exists(space):   478         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   479         sys.exit(1)   480    481     package_zip = space + extsep + "zip"   482    483     if exists(package_zip):   484         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   485         sys.exit(1)   486    487     mkdir(space)   488     mkdirs(join(space, "pages"))   489     mkdirs(join(space, "versions"))   490    491     p = xmlread.ConfigurableParser()   492     handler = ConfluenceHandler(space, no_translate)   493    494     # Register handlers in the parser for different elements.   495    496     p["object"] = handler.handle_object   497     p["property"] = handler.handle_property   498     p["id"] = handler.handle_id   499     p["collection"] = handler.handle_collection   500     p["element"] = handler.handle_element   501    502     # Open the XML dump.   503    504     f = open(filename)   505    506     if is_zipfile:   507         zf = ZipFile(f)   508         ff = StringIO(zf.read("entities.xml"))   509     else:   510         ff = f   511    512     # Parse the data.   513    514     try:   515         p.parse(ff)   516    517         # Tidy up the import manifests, sorting each of them by revision and   518         # finalising them.   519    520         pages_dir = join(space, "pages")   521    522         for pageid in listdir(pages_dir):   523             sort_comments(pages_dir, pageid)   524    525         output_manifest = join(space, "MOIN_PACKAGE")   526         append(output_manifest, "MoinMoinPackage|1\n")   527    528         for pageid in listdir(pages_dir):   529             sort_manifest(pages_dir, pageid, output_manifest, no_translate)   530    531         # Write the page package.   532    533         page_package = ZipFile(package_zip, "w")   534    535         try:   536             # Include the page revisions.   537    538             versions_dir = join(space, "versions")   539    540             for versionid in listdir(versions_dir):   541                 page_package.write(join(versions_dir, versionid))   542    543             # Include the attachments.   544    545             if attachments:   546                 cwd = getcwd()   547                 chdir(split(attachments)[0])   548                 try:   549                     for path, dirnames, filenames in walk(split(attachments)[1]):   550                         for filename in filenames:   551                             # Have to "taint" archive filenames.   552                             page_package.write(join(path, filename), wikiutil.taintfilename(join(path, filename)))   553                 finally:   554                     chdir(cwd)   555             elif is_zipfile:   556                 for filename in zf.namelist():   557                     if filename.startswith("attachments"):   558                         # Have to "taint" archive filenames.   559                         page_package.writestr(wikiutil.taintfilename(filename), zf.read(filename))   560    561             # Include only the top-level manifest.   562    563             page_package.write(output_manifest, "MOIN_PACKAGE")   564    565         finally:   566             page_package.close()   567    568     finally:   569         f.close()   570    571 # vim: tabstop=4 expandtab shiftwidth=4