#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser
import sys

# Titles are truncated to this many characters before being used in manifests.

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The methods of this class are registered as element callbacks on an
    xmlread.ConfigurableParser and all share the signature
    (name, elements, attributes, all_text, text). Properties and collections
    are accumulated in 'content' until the enclosing object element is handled,
    at which point files are written below the 'space' directory.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler to write below the directory named 'space'. If
        'no_translate' is set to a true value, body content is written out
        verbatim instead of being translated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        Page, Comment and BlogPost objects add a revision entry to the manifest
        of the page they belong to; BodyContent objects cause the revision text
        to be written to the versions directory. The accumulated content is
        discarded once the object has been handled.
        """

        # NOTE(review): attributes[-1] is presumably the attribute mapping of
        # NOTE(review): the element being closed - confirm against xmlread.

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = title[:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)

            # NOTE(review): the fields are written unescaped, so a "|" or a
            # NOTE(review): newline inside a title or version comment would
            # NOTE(review): corrupt the manifest - confirm whether the dump can
            # NOTE(review): contain such values.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|%s|%s|%s|%s\n" % (
                    content["version"],
                    versionfile,
                    title or content["version"], # comment titles will incorporate the version
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.

            # The handler's own setting is consulted here: relying on a
            # module-level 'no_translate' name only works when this file is run
            # as a script and fails with a NameError when it is imported.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            # Show the offending body before propagating any failure. Only
            # genuine errors are reported this way; interrupts pass straight
            # through without dumping the body.

            try:
                fn(join(versions_dir, content["content"]), body)
            except Exception:
                print >>sys.stderr, "Error parsing..."
                print >>sys.stderr, body
                raise

        # Start afresh for the next object so that properties do not leak from
        # one object into another.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the
        property's "name" attribute as the key and its stripped text as the
        value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the penultimate entry of 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the collection's "name" attribute and
        starting a new, empty list of elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the class is the element's "class" attribute and the text is stripped.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is not an error, but any other failure (such as insufficient
    permissions or a file occupying the path) raises OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the directory is actually there.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is written using the UTF-8 encoding.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is called with the body and the open output
    stream in order to perform the translation; parser.parse is used if 'fn' is
    omitted.
    """

    fn = fn or parser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' produced by parser.xmlparse.
    """

    translate(filename, body, parser.xmlparse)

def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision.

    If a 'pagetitle' file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    The leading revision column is used only for sorting and is omitted from
    the details that are written out.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]

        # Order numerically by revision; the sort is stable, so entries having
        # the same revision keep their original relative order.

        lines.sort(key=lambda x: int(x[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []
        for x in lines:
            if title is not None:
                x[3] = "%s/%s" % (title, x[3])
            result.append("|".join(x[1:]))
    finally:
        f.close()

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Archives must be read in binary mode, since text mode
    # can corrupt their contents on some platforms.

    if is_zipfile:
        f = open(filename, "rb")
    else:
        f = open(filename)

    # Parse the data, making sure that the file is closed even if the archive
    # cannot be read.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")

        # Page directories are also created for parents and comments that are
        # merely referenced by other pages. Where the dump provides no
        # revisions for them, there is no manifest to process.

        if not exists(manifest):
            print >>sys.stderr, "No revisions found for page %s; skipping." % pageid
            continue

        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

# vim: tabstop=4 expandtab shiftwidth=4