#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, isdir, join, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (used both as the output
        directory name and as the prefix of page titles). If 'no_translate' is
        set to a true value, page bodies are written out unchanged instead of
        being passed through a parser.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Once the object has been processed, the content dictionary is reset
        ready for the next object.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types. Other object types are ignored.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier, content, pages_dir, versions_dir)

        elif objecttype == "BodyContent":
            self._handle_body(content, versions_dir)

        elif objecttype == "Attachment":
            self._handle_attachment(identifier, content, pages_dir)

        self.content = {}

    def _handle_page(self, objecttype, identifier, content, pages_dir, versions_dir):

        """
        Record a revision of a page, comment or blog post of the given
        'objecttype' having the given 'identifier' and 'content', writing the
        page title, a manifest entry, and any child page or comment details
        beneath 'pages_dir'. The manifest entry refers to a version file in
        'versions_dir'.
        """

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        pageid = content.get("originalVersion", identifier)
        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        # NOTE: This only makes the current title available to comments.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = content["title"][:MAX_TITLE_LENGTH]

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # See sort_manifest for access to this data.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
                ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self, content, versions_dir):

        """
        Write the body held by 'content' to the appropriate version file in
        'versions_dir', translating it unless translation has been disabled
        for this handler.
        """

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.

        # The handler's own setting is consulted (not a module global) so that
        # the class also works when used from other programs.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        # Report which revision failed before letting the error propagate.

        try:
            fn(join(versions_dir, content["content"]), body)
        except:
            err = codecs.getwriter("utf-8")(sys.stderr)
            print >>err, "Error parsing", content["content"]
            raise

    def _handle_attachment(self, identifier, content, pages_dir):

        """
        Record an attachment having the given 'identifier' and 'content' in the
        "attachments" file of its page beneath 'pages_dir'.
        """

        pageid = content["content"]
        version = content["attachmentVersion"]
        attachid = content.get("originalVersion", identifier)

        # The attachment may be encountered before any revision of its page,
        # so make sure that the page directory exists.

        mkdirs(join(pages_dir, pageid))

        append(join(pages_dir, pageid, "attachments"),
            "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                version,
                # Have to "taint" archive filenames, although Moin will
                # probably handle package script filename tainting.
                wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                wikiutil.taintfilename(content["fileName"]),
                "", # pagename is substituted later
                content["lastModifierName"],
                content["comment"]
                ))

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is not an error, but any other failure is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the directory already being present: permission
        # problems or a plain file in the way must not be hidden.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    f = codecs.open(filename, append and "a" or "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is the parsing function to be called with the
    body and the output stream; wikiparser.parse is used if it is omitted.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        print >>out, "#pragma page-filename", filename
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' using xmlparser.parse.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationDate|commentid".

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries numerically by version (the first column).

    lines.sort(key=lambda columns: int(columns[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:
        version, _action, _archive_filename, filename, old_title, username, comment = line

        # Replace title information with the information already present.

        if not old_title:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered.

        result.append((_action, _archive_filename, filename, new_title, username, comment))

    return result

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns.
    A string containing the concatenated entries is returned, with each entry
    relying on its final column to provide the line ending.
    """

    result = []

    for columns in manifest:
        action = columns[0]

        # Revisions carry a placeholder in the second column which is only
        # present for consistency with attachment entries.

        if action == "AddRevision":
            columns = list(columns)
            del columns[1]

        result.append("|".join(columns))

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'no_translate' is set to a true value, no child page or comment
    information is added to the revision content.
    """

    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Prepare the child page information once, since it is the same for every
    # revision.

    child_section = None

    if exists(children) and not no_translate:
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()
        child_pages = [" * [[%s]]" % child_page_name for child_page_name in child_page_names]
        child_section = child_page_section % "\n".join(child_pages)

    add_comments = exists(comments) and title and not no_translate

    for entry in result:
        version_filename = entry[2]

        # Add child page information to the content.

        if child_section is not None:
            append(version_filename, child_section)

        # Add comments to the content.

        if add_comments:
            append(version_filename, comment_section % title)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
        if len(sys.argv) > 3 and sys.argv[3]:
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

com_entities.xml COM '' --no-translate
"""
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for ZIP archives and avoids
    # any newline translation of the XML data.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:
                attachments_parent, attachments_name = split(attachments)
                cwd = getcwd()

                # A bare directory name has an empty parent, which chdir would
                # reject, so stay in the current directory in that case.

                chdir(attachments_parent or ".")
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            page_package.write(join(path, attachment_filename),
                                wikiutil.taintfilename(join(path, attachment_filename)))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member_name in zf.namelist():
                    if member_name.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member_name), zf.read(member_name))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4