#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
import codecs
import errno

try:
    from cStringIO import StringIO
except ImportError:
    # No cStringIO available (Python 3): archive members are read as bytes,
    # so an in-memory bytes buffer is the equivalent wrapper.
    from io import BytesIO as StringIO

# NOTE: xmlread is imported where the dump is actually parsed (see
# _parse_dump), so that the handler and the file helpers below can be imported
# and used without the parser module being available.

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    Properties, collection elements and collections are accumulated as they
    are reported, and flushed to files below 'directory' whenever an enclosing
    object is completed:

      pages/<pageid>/manifest     one "AddRevision" line per page revision
      pages/<commentid>/pagetitle the title of the page owning a comment
      versions/<id>               the body text of a revision
    """

    def __init__(self, directory):

        "Initialise the handler to write its output below 'directory'."

        self.content = {}       # properties of the object being read
        self.elements = []      # (class, id) pairs of the current collection
        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        Only the last entry of 'attributes' (a mapping providing "class") and
        'text' (the object's identifier, see handle_id) are used here; the
        other parameters are accepted because all handlers are registered with
        the same signature.

        A KeyError is raised if a page-like object lacks any of the "version",
        "title", "lastModifierName" or "versionComment" properties, or if a
        BodyContent object lacks "content" or "body".
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            pageid = content.get("originalVersion", identifier)
            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            # NOTE(review): fields are joined with "|" without escaping, so a
            # title or version comment containing "|" or a line break would
            # confuse sort_manifest - confirm whether the dump can contain
            # such values.

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                content["title"] or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing. The file is
                # overwritten rather than appended to: the same comment may be
                # listed by several revisions of its owner page, and appending
                # would then concatenate the title with itself.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    write(join(pages_dir, commentid, "pagetitle"), content["title"])

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            write(join(versions_dir, content["content"]), content["body"])

        # Start afresh for the next object, whatever its type.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text, by appending 'text'
        to the second-last entry of 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the collection's name and starting a
        new element list.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection as (class, identifier) pairs."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not an error; any other failure (including a non-directory
    occupying 'name') is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's', encoded as
    UTF-8. If the optional 'append' parameter is set to a true value, 's' will
    be appended to the file.
    """

    mode = "a" if append else "w"

    with codecs.open(filename, mode, encoding="utf-8") as f:
        f.write(s)

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents decoded from UTF-8.
    """

    with codecs.open(filename, encoding="utf-8") as f:
        return f.read()

def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision. If a
    'pagetitle' file exists, the title column in the manifest will be augmented
    with the contents of that file. If 'output' is given, the manifest details
    will be appended to the file having that filename instead of being rewritten
    to the original manifest file.

    The leading revision number of each line is used for sorting only and is
    not written out. If 'filename' does not exist, nothing is written.
    """

    # A page directory may hold only a title file, if a comment was listed by
    # its owner page but never defined as an object of its own.

    if not exists(filename):
        return

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    with codecs.open(filename, "r", encoding="utf-8") as f:
        rows = [line.split("|") for line in f.readlines()]

    rows.sort(key=lambda row: int(row[0]))

    # Reconstruct the lines, optionally changing the titles. Each row is
    # version, "AddRevision", version file, title, modifier, comment (the last
    # field keeping its newline).

    result = []
    for row in rows:
        if title is not None:
            row[3] = "%s/%s" % (title, row[3])
        result.append("|".join(row[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)

def _parse_dump(filename, directory):

    """
    Parse the XML dump in 'filename' (either the XML itself or a zip archive
    containing "entities.xml"), writing page manifests and revisions below
    'directory'.
    """

    import xmlread

    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(directory)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is used so that archive data never
    # undergoes newline translation, and so that plain XML reaches the parser
    # as the bytes stored in the file.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

def _write_package_manifest(directory):

    """
    Sort each page's import manifest by revision and collect them all in the
    top-level "MOIN_PACKAGE" file below 'directory', returning that filename.
    """

    pages_dir = join(directory, "pages")

    output_manifest = join(directory, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    # Sorted so that the manifest does not depend on directory listing order.

    for pageid in sorted(listdir(pages_dir)):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")
        sort_manifest(manifest, pagetitle, output_manifest)

    return output_manifest

def _write_package(directory, output_manifest):

    """
    Write the page package alongside 'directory' as "<directory>.zip",
    containing every revision file and 'output_manifest' as "MOIN_PACKAGE".
    """

    page_package = ZipFile(directory + extsep + "zip", "w")

    try:
        # Include the page revisions.

        versions_dir = join(directory, "versions")

        for versionid in sorted(listdir(versions_dir)):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

def main(args=None):

    """
    Convert the dump named by the first of 'args' using the working directory
    named by the second (defaulting to the program arguments), returning an
    exit status: zero on success, one if the arguments are missing or the
    directory already exists.
    """

    import sys

    if args is None:
        args = sys.argv[1:]

    if len(args) < 2:
        sys.stderr.write("Usage: %s <dump file> <output directory>\n" % sys.argv[0])
        return 1

    filename = args[0]
    directory = args[1]

    if exists(directory):
        sys.stderr.write("Directory exists. Please choose another or remove its contents.\n")
        return 1

    mkdir(directory)
    mkdirs(join(directory, "pages"))
    mkdirs(join(directory, "versions"))

    _parse_dump(filename, directory)

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them, then write the page package.

    output_manifest = _write_package_manifest(directory)
    _write_package(directory, output_manifest)

    return 0

if __name__ == "__main__":
    import sys
    sys.exit(main())

# vim: tabstop=4 expandtab shiftwidth=4