#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, isdir, join, normpath, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser
import xmlparser
import sys

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (used both as the output
        directory name and as the prefix of page titles). If 'no_translate' is
        set to a true value, page bodies are written out without translation.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Only the last entry of 'attributes' and the 'text' parameter are used
        here; the remaining parameters are accepted because the parser supplies
        them to every handler.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = title[:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.
            # The instance setting is consulted (rather than any module-level
            # name) so that the handler also works when this module is
            # imported instead of being run as a script.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            try:
                fn(join(versions_dir, content["content"]), body)
            except Exception:

                # Report which content failed before propagating the error.

                err = codecs.getwriter("utf-8")(sys.stderr)
                print >>err, "Error parsing", content["content"]
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            # The attachment may be encountered before its page, so make sure
            # that the page directory exists before appending to a file in it.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "attachments"),
                "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                    version,
                    # Have to "taint" archive filenames, although Moin will
                    # probably handle package script filename tainting.
                    wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                    wikiutil.taintfilename(content["fileName"]),
                    "", # pagename is substituted later
                    content["lastModifierName"],
                    content["comment"]
                    ))

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        # The second-to-last entry of 'all_text' is treated as the text
        # collected for the enclosing element.

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not an error, but any other failure to create the directory
    is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the directory is actually there.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    """

    f = codecs.open(filename, "a" if append else "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is called with the body and the open output
    stream; wikiparser.parse is used when it is omitted.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' produced by xmlparser.parse.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.

    Nothing is done if the page has no comments, and the comments are left
    unlabelled (with a warning) if the owner page has no recorded title.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    pagetitle = join(pages_dir, pageid, "pagetitle")

    # An owner that was referenced by comments but never defined with a title
    # cannot be used to label its comments.

    if not exists(pagetitle):
        print >>sys.stderr, "No title found for page %s; its comments are not relabelled." % pageid
        return

    title = read(pagetitle)

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided (is not None), the title column in the manifest
    will be replaced with that information. This is typically done for comments
    and is necessary for attachments.

    A list of manifest entries is returned, each being a tuple of the form
    (action, archive filename, filename, title, username, comment).
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]

        # Order numerically by the leading version column.

        lines.sort(key=lambda columns: int(columns[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            version, _action, _archive_filename, filename, old_title, username, comment = line

            # Replace title information with the information already present.

            if title is not None:
                new_title = title
            else:
                new_title = old_title

            # The version is omitted now that the manifest is ordered.

            line = _action, _archive_filename, filename, new_title, username, comment
            result.append(line)

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns,
    and return the entries joined together as a single string.
    """

    result = []

    for columns in manifest:
        action = columns[0]

        # Revisions carry a placeholder archive filename column which is only
        # meaningful for attachments.

        if action == "AddRevision":
            columns = list(columns)
            del columns[1]
        result.append("|".join(columns))

    # The final column of each entry retains its line ending.

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'no_translate' is set to a true value, no child page or comment sections
    are added to the revision content.

    Page directories without a manifest (created only because other objects
    referred to the page) are skipped with a warning.
    """

    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if not exists(manifest):
        print >>sys.stderr, "No manifest found for page %s; skipping." % pageid
        return

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Prepare the child page information once for all revisions.

    if exists(children) and not no_translate:
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()
        child_pages = [" * [[%s]]" % child_page_name for child_page_name in child_page_names]
        child_pages_text = child_page_section % "\n".join(child_pages)
    else:
        child_pages_text = None

    add_comments = exists(comments) and title and not no_translate

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    for _action, _archive_filename, filename, new_title, username, comment in result:

        # Add child page information to the content.

        if child_pages_text is not None:
            append(filename, child_pages_text)

        # Add comments to the content.

        if add_comments:
            append(filename, comment_section % title)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

if __name__ == "__main__":

    # Separate the option from the positional arguments so that it is never
    # mistaken for the attachments directory.

    no_translate = "--no-translate" in sys.argv
    args = [arg for arg in sys.argv[1:] if arg != "--no-translate"]

    try:
        filename = args[0]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = args[1]
        if len(args) > 2:
            attachments = args[2]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data, a workspace name,"
        print >>sys.stderr, "and an optional attachments directory location."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is needed for archives and is harmless
    # for plain XML files.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:

                # Work relative to the parent of the attachments directory so
                # that archive names start with the directory's own name.
                # Normalising removes any trailing separator, and a directory
                # given without a parent is found in the current directory.

                parent, leaf = split(normpath(attachments))
                cwd = getcwd()
                if parent:
                    chdir(parent)
                try:
                    for path, dirnames, filenames in walk(leaf):
                        for attachment_name in filenames:
                            attachment_path = join(path, attachment_name)
                            # Have to "taint" archive filenames.
                            page_package.write(attachment_path, wikiutil.taintfilename(attachment_path))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member in zf.namelist():
                    if member.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member), zf.read(member))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4