ConfluenceConverter (file convert.py at 0e41fd332cf5)

     1 #!/usr/bin/env python     2      3 """     4 Confluence XML dump conversion to a MoinMoin-compatible representation.     5      6 Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>     7      8 This software is free software; you can redistribute it and/or     9 modify it under the terms of the GNU General Public License as    10 published by the Free Software Foundation; either version 2 of    11 the License, or (at your option) any later version.    12     13 This software is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the    16 GNU General Public License for more details.    17     18 You should have received a copy of the GNU General Public    19 License along with this library; see the file LICENCE.txt    20 If not, write to the Free Software Foundation, Inc.,    21 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    22 """    23     24 from os import listdir, mkdir, makedirs    25 from os.path import exists, extsep, join, splitext    26 from zipfile import ZipFile    27 from cStringIO import StringIO    28 import codecs    29 import xmlread    30 import parser    31     32 MAX_TITLE_LENGTH = 120    33     34 class ConfluenceHandler:    35     36     "Handle content from a Confluence Wiki dump."    37     38     def __init__(self, space, no_translate=False):    39         self.content = {}    40         self.elements = []    41         self.space = space    42         self.no_translate = no_translate    43     44     def handle_object(self, name, elements, attributes, all_text, text):    45     46         "Handle objects according to type."    47     48         objecttype = attributes[-1]["class"]    49         identifier = text.strip()    50         content = self.content    51     52         pages_dir = join(self.space, "pages")    53         versions_dir = join(self.space, "versions")    54     55         # Handle particular types.    56     57         if objecttype in ("Page", "Comment", "BlogPost"):    58     59             # Handle pages and revisions, adding revisions to the page manifest.    60             # The original version is used as a unifying identifier for all the    61             # different revisions (each of which being defined by a Page    62             # element). Although "original" implies the first identifier used,    63             # it actually appears to be the latest and will have the highest    64             # version number.    65     66             if content.has_key("originalVersion"):    67                 pageid = content["originalVersion"]    68             else:    69                 pageid = identifier    70     71             versionfile = join(versions_dir, identifier)    72     73             # Note page metadata, not necessarily in the correct order.    74             # For comments, the title will need to be rewritten, since they    75             # should be defined in terms of their owner page.    76     77             mkdirs(join(pages_dir, pageid))    78     79             title = content["title"]    80     81             # Limit the title to a "safe" number of characters in order to avoid    82             # filesystem issues.    83     84             title = title[:MAX_TITLE_LENGTH]    85     86             if title:    87                 title = "%s/%s" % (self.space, title)    88     89             append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (    90                 content["version"],    91                 versionfile,    92                 title or content["version"], # comment titles will incorporate the version    93                 content["lastModifierName"],    94                 content["versionComment"]))    95     96             # Write comments as subpages.    97     98             if content.has_key("comments"):    99    100                 # Define a page directory for each comment, and write the page   101                 # title in a special file for later processing.   102    103                 for _comment, commentid in content["comments"]:   104                     mkdirs(join(pages_dir, commentid))   105                     append(join(pages_dir, commentid, "pagetitle"), title)   106    107             # Some metadata is not particularly relevant. For example,   108             # ancestors, children, parent are navigation-related.   109    110             # Other metadata could be added to the page content itself.   111             # For example, labelling could be converted to categories.   112    113         # Handle revisions.   114    115         elif objecttype == "BodyContent":   116             body = content["body"]   117             if not body:   118                 body = "## Empty page."   119    120             if no_translate:   121                 fn = write   122             else:   123                 fn = translate   124    125             fn(join(versions_dir, content["content"]), body)   126    127         self.content = {}   128    129     def handle_property(self, name, elements, attributes, all_text, text):   130    131         "Record properties in the current content dictionary."   132    133         self.content[attributes[-1]["name"]] = text.strip()   134    135     def handle_id(self, name, elements, attributes, all_text, text):   136    137         "Promote identifiers to the parent element's text."   138    139         all_text[-2].append(text)   140    141     def handle_collection(self, name, elements, attributes, all_text, text):   142    143         "Record collections in the current content dictionary."   144    145         self.content[attributes[-1]["name"]] = self.elements   146         self.elements = []   147    148     def handle_element(self, name, elements, attributes, all_text, text):   149    150         "Add elements to the current collection."   151    152         self.elements.append((attributes[-1]["class"], text.strip()))   153    154 def mkdirs(name):   155    156     "Make the directory with the given 'name' at any depth."   157    158     try:   159         makedirs(name)   160     except OSError:   161         pass   162    163 def append(filename, s):   164    165     "Append to the file with the given 'filename' the string 's'."   166    167     write(filename, s, True)   168    169 def write(filename, s, append=False):   170    171     """   172     Write to the file with the given 'filename' the string 's'. If the optional   173     'append' parameter is set to a true value, 's' will be appended to the file.   174     """   175    176     f = codecs.open(filename, append and "a" or "w", encoding="utf-8")   177     try:   178         f.write(s)   179     finally:   180         f.close()   181    182 def read(filename):   183    184     """   185     Read from the file with the given 'filename', returning a string containing   186     its contents.   187     """   188    189     f = codecs.open(filename, encoding="utf-8")   190     try:   191         return f.read()   192     finally:   193         f.close()   194    195 def translate(filename, body):   196    197     """   198     Write to the file with the given 'filename' a translation of the given   199     'body'.   200     """   201    202     out = codecs.open(filename, "w", encoding="utf-8")   203     try:   204         parser.parse(body, out)   205     finally:   206         out.close()   207    208 def sort_manifest(filename, pagetitle, output=None):   209    210     """   211     Sort the manifest given in 'filename' according to revision.   212    213     If a 'pagetitle' file exists, the title column in the manifest will be   214     augmented with the contents of that file. This is typically done for   215     comments.   216    217     If 'output' is given, the manifest details will be appended to the file   218     having that filename instead of being rewritten to the original manifest   219     file.   220     """   221    222     if exists(pagetitle):   223         title = read(pagetitle)   224     else:   225         title = None   226    227     f = codecs.open(filename, "r", encoding="utf-8")   228     try:   229         lines = [x.split("|") for x in f.readlines()]   230         lines.sort(cmp=lambda x, y: cmp(int(x[0]), int(y[0])))   231    232         # Reconstruct the lines, optionally changing the titles.   233    234         result = []   235         for x in lines:   236             if title is not None:   237                 x[3] = "%s/%s" % (title, x[3])   238             result.append("|".join(x[1:]))   239     finally:   240         f.close()   241    242     s = "".join(result)   243    244     if output is None:   245         write(filename, s)   246     else:   247         append(output, s)   248    249 if __name__ == "__main__":   250     import sys   251    252     try:   253         filename = sys.argv[1]   254         is_zipfile = splitext(filename)[-1] == extsep + "zip"   255         space = sys.argv[2]   256     except IndexError:   257         print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."   258         print >>sys.stderr, "For example: com_entities.xml COM"   259         sys.exit(1)   260    261     no_translate = "--no-translate" in sys.argv   262    263     if exists(space):   264         print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space   265         sys.exit(1)   266    267     package_zip = space + extsep + "zip"   268    269     if exists(package_zip):   270         print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip   271         sys.exit(1)   272    273     mkdir(space)   274     mkdirs(join(space, "pages"))   275     mkdirs(join(space, "versions"))   276    277     p = xmlread.ConfigurableParser()   278     handler = ConfluenceHandler(space, no_translate)   279    280     p["object"] = handler.handle_object   281     p["property"] = handler.handle_property   282     p["id"] = handler.handle_id   283     p["collection"] = handler.handle_collection   284     p["element"] = handler.handle_element   285    286     # Open the XML dump.   287    288     f = open(filename)   289    290     if is_zipfile:   291         zf = ZipFile(f)   292         ff = StringIO(zf.read("entities.xml"))   293     else:   294         ff = f   295    296     # Parse the data.   297    298     try:   299         p.parse(ff)   300     finally:   301         f.close()   302    303     # Tidy up the import manifests, sorting each of them by revision and   304     # finalising them.   305    306     pages_dir = join(space, "pages")   307    308     output_manifest = join(space, "MOIN_PACKAGE")   309     append(output_manifest, "MoinMoinPackage|1\n")   310    311     for pageid in listdir(pages_dir):   312         manifest = join(pages_dir, pageid, "manifest")   313         pagetitle = join(pages_dir, pageid, "pagetitle")   314         sort_manifest(manifest, pagetitle, output_manifest)   315    316     # Write the page package.   317    318     page_package = ZipFile(package_zip, "w")   319    320     try:   321         # Include the page revisions.   322    323         versions_dir = join(space, "versions")   324    325         for versionid in listdir(versions_dir):   326             page_package.write(join(versions_dir, versionid))   327    328         # Include only the top-level manifest.   329    330         page_package.write(output_manifest, "MOIN_PACKAGE")   331    332     finally:   333         page_package.close()   334    335 # vim: tabstop=4 expandtab shiftwidth=4