#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser
import sys

# Page titles are truncated to this many characters before being used.

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods all share the callback signature
    (name, elements, attributes, all_text, text) and are registered with the
    XML parser by the main program. Properties and collections are accumulated
    in 'self.content' until the enclosing object is handled, at which point
    files are written beneath the 'space' directory and the accumulated content
    is discarded.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', which is used both as the
        output directory name and as the prefix of page titles. If
        'no_translate' is set to a true value, page bodies are written out
        verbatim instead of being translated.
        """

        self.content = {}       # properties/collections of the current object
        self.elements = []      # elements of the collection being read
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        "Page", "Comment" and "BlogPost" objects cause a manifest entry (and
        possibly "pagetitle", "children" and "comments" entries) to be written
        in the "pages" directory. "BodyContent" objects cause the body text to
        be written or translated into a file in the "versions" directory.

        The accumulated content is reset after every object, whatever its type.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = content["title"][:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|%s|%s|%s|%s\n" % (
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"),
                    "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.
            # The instance setting is consulted here (not a module global) so
            # that the class also works when imported from another module.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            # Show the offending body before propagating any failure.

            try:
                fn(join(versions_dir, content["content"]), body)
            except Exception:
                print >>sys.stderr, "Error parsing..."
                print >>sys.stderr, body
                raise

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the collection's name and starting a new,
        empty list of elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection as (class, text) tuples."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is not treated as an error, but any other failure to create the
    directory is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the directory is actually present.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is called with the body and the open output file;
    if omitted, parser.parse is used.
    """

    fn = fn or parser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' produced using parser.xmlparse.
    """

    translate(filename, body, parser.xmlparse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.

    Nothing is done if the page has no "comments" file.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationDate|commentid", so sorting the split
    # lines orders the comments by creation date.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%s" % (title, position))

def sort_manifest(pages_dir, pageid, output=None):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.
    """

    manifest = join(pages_dir, pageid, "manifest")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Read the manifest, keeping the file open only for as long as necessary.

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the revisions numerically by the leading version column.

    lines.sort(key=lambda x: int(x[0]))

    # The child page section is the same for every revision, so prepare it
    # only once.

    if exists(children):
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()
        child_pages = [" * [[%s]]" % child_page_name
                       for child_page_name in child_page_names]
        child_pages_text = child_page_section % "\n".join(child_pages)
    else:
        child_pages_text = None

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:
        version, _addrevision, filename, old_title, username, comment = line

        # Replace title information with the information already present.

        if title is not None:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered.
        # The final column retains the line ending read from the manifest.

        line = _addrevision, filename, new_title, username, comment
        result.append("|".join(line))

        # Add child page information to the content.

        if child_pages_text is not None:
            append(filename, child_pages_text)

    s = "".join(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Template for child page information.

child_page_section = """
----

%s
"""

# Main program.

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Zip archives are binary data and must not be subject
    # to any line ending translation.

    if is_zipfile:
        f = open(filename, "rb")
    else:
        f = open(filename)

    # Parse the data, making sure that the file is closed even if the archive
    # cannot be read.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    for pageid in listdir(pages_dir):
        sort_comments(pages_dir, pageid)

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        sort_manifest(pages_dir, pageid, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

# vim: tabstop=4 expandtab shiftwidth=4