#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, isdir, join, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space' (used both as the output
        directory and as the prefix of page titles). If 'no_translate' is set
        to a true value, page bodies are written out untranslated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = title[:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.

            # The handler's own setting is consulted so that the class also
            # works when this module is imported rather than run as a script.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            try:
                fn(join(versions_dir, content["content"]), body)
            except Exception:
                # Report which item failed before propagating the error.
                err = codecs.getwriter("utf-8")(sys.stderr)
                print >>err, "Error parsing", content["content"]
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            # The page directory may not exist yet if the attachment is
            # encountered before its page.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "attachments"),
                "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                    version,
                    # Have to "taint" archive filenames, although Moin will
                    # probably handle package script filename tainting.
                    wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                    wikiutil.taintfilename(content["fileName"]),
                    "", # pagename is substituted later
                    content["lastModifierName"],
                    content["comment"]
                    ))

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not treated as an error, but any other failure is raised.
    """

    try:
        makedirs(name)
    except OSError:
        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    """

    f = codecs.open(filename, append and "a" or "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is the parsing function called with the body and
    the output stream; the wiki markup parser is used if it is omitted.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        print >>out, "#pragma page-filename", filename
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' using the XML parser.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned.
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]

        # Order numerically by the version column; the sort is stable, so
        # entries having the same version keep their file order.

        lines.sort(key=lambda x: int(x[0]))

        # Reconstruct the lines, optionally changing the titles.

        result = []

        for line in lines:
            _version, _action, _archive_filename, filename, old_title, username, comment = line

            # Replace title information with the information already present.

            if title is not None:
                new_title = title
            else:
                new_title = old_title

            # The version is omitted now that the manifest is ordered.

            line = _action, _archive_filename, filename, new_title, username, comment
            result.append(line)

        return result

    finally:
        f.close()

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns.
    A single string containing all of the entries is returned.
    """

    result = []

    for columns in manifest:
        action = columns[0]

        # Revisions carry a placeholder archive filename column which is
        # dropped here.

        if action == "AddRevision":
            columns = list(columns)
            del columns[1]
        result.append("|".join(columns))

    # Entries retain the line endings read from the manifest files.

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'no_translate' is set to a true value, no child page or comment
    sections are added to the revision content.
    """

    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Prepare the sections to be added to each revision. These are the same
    # for every revision and are therefore only built once.

    sections = []

    if not no_translate:

        # Add child page information to the content.

        if exists(children):
            child_page_names = [x for x in read(children).split("\n") if x]
            child_page_names.sort()
            child_pages = [" * [[%s]]" % child_page_name for child_page_name in child_page_names]
            sections.append(child_page_section % "\n".join(child_pages))

        # Add comments to the content.

        if exists(comments) and title:
            sections.append(comment_section % title)

    for _action, _archive_filename, filename, _new_title, _username, _comment in result:
        for section in sections:
            append(filename, section)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

if __name__ == "__main__":
    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
        if len(sys.argv) > 3 and sys.argv[3]:
            attachments = sys.argv[3]
        else:
            attachments = None
    except IndexError:
        print >>sys.stderr, """
Please specify an XML file containing Wiki data, a workspace name, and an
optional attachments directory location. For example:

com_entities.xml COM attachments

Adding --no-translate will unpack the Wiki but not translate the content.
When doing so without an attachments directory, add an empty argument as
follows:

com_entities.xml COM '' --no-translate
"""
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is needed for ZIP archives on platforms
    # that distinguish text files from binary files.

    f = open(filename, "rb")

    if is_zipfile:
        zf = ZipFile(f)
        ff = StringIO(zf.read("entities.xml"))
    else:
        ff = f

    # Parse the data.

    try:
        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:
                cwd = getcwd()

                # Walk the attachments directory relative to its parent so
                # that the archive names start with the directory's own name.

                chdir(split(attachments)[0])
                try:
                    for path, dirnames, filenames in walk(split(attachments)[1]):
                        for attachment_name in filenames:
                            # Have to "taint" archive filenames.
                            page_package.write(join(path, attachment_name), wikiutil.taintfilename(join(path, attachment_name)))
                finally:
                    chdir(cwd)
            elif is_zipfile:
                for member_name in zf.namelist():
                    if member_name.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member_name), zf.read(member_name))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4