#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

from os import chdir, getcwd, listdir, mkdir, makedirs, walk
from os.path import exists, extsep, join, split, splitext
from zipfile import ZipFile
from cStringIO import StringIO
from MoinMoin import wikiutil
import codecs
import xmlread
import wikiparser, xmlparser
import sys

# Page titles are truncated to this many characters before being used, in
# order to avoid filesystem issues.

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', this being the name of
        the directory beneath which "pages" and "versions" content is written
        and the prefix used for page titles.

        If 'no_translate' is set to a true value, body content is written out
        unchanged instead of being passed through one of the parsers.
        """

        # Properties and collections gathered for the object being read.

        self.content = {}

        # Elements gathered for the collection being read.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type. Objects appear as follows:

        <object class="Page" package="...">
          <id name="id">...</id>
          ...
        </object>

        Within objects, one finds things like properties and collections, which
        are handled by their own methods but which are stored in the content
        dictionary associated with the current object.

        By the time this method is called, the contents of the object will have
        been gathered and the properties and collections populated in the
        content dictionary. Any identifier will have been assigned to the
        textual content of the object element and will be available in the
        'text' parameter.

        Only the last entry of 'attributes' (the attributes of the object
        element itself) and 'text' are used here; 'name', 'elements' and
        'all_text' are accepted for compatibility with the parser's handler
        signature but are not consulted.

        The content dictionary is reset once the object has been processed.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = content["title"][:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)
                write(join(pages_dir, pageid, "pagetitle"), title)

            # See sort_manifest for access to this data.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|_|%s|%s|%s|%s\n" % ( # blank added for consistency with AddAttachment
                    content["version"],
                    versionfile,
                    title, # comment titles will incorporate the comment's position
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Add creation details for comments to the owner page.
            # Since comments can be versioned, the date of the original version
            # is used, and only this "original" version has the owner property.

            if objecttype == "Comment" and "owner" in content:
                ownerid = content["owner"]
                mkdirs(join(pages_dir, ownerid))
                append(join(pages_dir, ownerid, "comments"), "%s|%s\n" % (content["creationDate"], pageid))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # NOTE: Very simple technique employed for guessing the format.
            # The handler's own setting is consulted: relying on a module-level
            # 'no_translate' name only works when this file is run as a script.

            if self.no_translate:
                fn = write
            elif body.startswith("<"):
                fn = xmltranslate
            else:
                fn = translate

            # Report the affected content identifier before propagating any
            # failure, so that the offending revision can be located.

            try:
                fn(join(versions_dir, content["content"]), body)
            except:
                err = codecs.getwriter("utf-8")(sys.stderr)
                print >>err, "Error parsing", content["content"]
                raise

        # Handle attachments.

        elif objecttype == "Attachment":
            pageid = content["content"]
            version = content["attachmentVersion"]

            if "originalVersion" in content:
                attachid = content["originalVersion"]
            else:
                attachid = identifier

            append(join(pages_dir, pageid, "attachments"),
                "%s|AddAttachment|%s|%s|%s|%s|%s\n" % (
                    version,
                    # Have to "taint" archive filenames, although Moin will
                    # probably handle package script filename tainting.
                    wikiutil.taintfilename(join("attachments", pageid, attachid, version)),
                    wikiutil.taintfilename(content["fileName"]),
                    "", # pagename is substituted later
                    content["lastModifierName"],
                    content["comment"]
                    ))

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute of the property element as the key and the stripped 'text'
        as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text'
        to the penultimate entry in 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, using the "name"
        attribute of the collection element as the key and the elements
        gathered so far as the value. The gathered elements are then reset.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection, each being recorded as a tuple
        of the element's "class" attribute and its stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not considered an error, but any failure leaving 'name' absent
    is propagated as OSError instead of being silently ignored.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the directory is actually present.

        if not exists(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is written using the UTF-8 encoding.
    """

    mode = "a" if append else "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded using the UTF-8 encoding.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is the parsing function called with the body and
    the output stream; if omitted, the wiki markup parser is used.

    A "#pragma page-filename" line naming the file precedes the translation.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        print >>out, "#pragma page-filename", filename
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' using the XML parser instead of the wiki markup parser.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationdate|commentid", and so sorting the
    # split lines orders the comments by creation date.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def _sort_manifest(manifest, title):

    """
    Open the given 'manifest' and sort it according to revision so that it will
    be added to MoinMoin in the correct order.

    If a 'title' is provided, the title column in the manifest will be augmented
    with that information. This is typically done for comments and is necessary
    for attachments.

    A list of manifest entries is returned, each being a tuple of the form
    (action, archive filename, filename, title, username, comment).
    """

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries numerically by version. The sort is stable and so
    # entries with equal versions retain their relative order.

    lines.sort(key=lambda columns: int(columns[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:
        version, _action, _archive_filename, filename, old_title, username, comment = line

        # Replace title information with the information already present.

        if title is not None:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered.

        result.append((_action, _archive_filename, filename, new_title, username, comment))

    return result

def serialise_manifest(manifest):

    """
    Process the 'manifest' consisting of entries, removing superfluous columns,
    and return the entries as a single string. Entries are joined without any
    separator since the final column of each entry, as read from a manifest
    file, retains its line ending.
    """

    result = []

    for columns in manifest:
        action = columns[0]

        # Revisions have a placeholder archive filename column which is not
        # wanted in the final manifest.

        if action == "AddRevision":
            columns = list(columns)
            del columns[1]

        result.append("|".join(columns))

    return "".join(result)

def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If a "comments" file exists and the page has a title, an inclusion of the
    comment pages will be added to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'no_translate' is set to a true value, revision content is left
    untouched: neither child page lists nor comment inclusions are added.
    """

    manifest = join(pages_dir, pageid, "manifest")
    attachments = join(pages_dir, pageid, "attachments")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    # Sort the revision manifest.

    result = _sort_manifest(manifest, title)

    # Prepare the additional content once, since it is the same for every
    # revision of the page.

    child_pages_text = None
    comments_text = None

    if result and not no_translate:

        # Child page information.

        if exists(children):
            child_page_names = sorted(x for x in read(children).split("\n") if x)
            child_pages = [" * [[%s]]" % child_page_name for child_page_name in child_page_names]
            child_pages_text = child_page_section % "\n".join(child_pages)

        # Comments.

        if exists(comments) and title:
            comments_text = comment_section % title

    for _action, _archive_filename, filename, new_title, username, comment in result:

        # Add child page information to the content.

        if child_pages_text is not None:
            append(filename, child_pages_text)

        # Add comments to the content.

        if comments_text is not None:
            append(filename, comments_text)

    # Add the attachments to the manifest.

    if exists(attachments):
        result += _sort_manifest(attachments, title)

    # Serialise the manifest.

    s = serialise_manifest(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

if __name__ == "__main__":

    # Separate the option from the positional arguments so that it is never
    # mistaken for the dump filename, workspace or attachments location.

    no_translate = "--no-translate" in sys.argv
    args = [arg for arg in sys.argv[1:] if arg != "--no-translate"]

    try:
        filename = args[0]
        space = args[1]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data, a workspace name,"
        print >>sys.stderr, "and an optional attachments directory location."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    if len(args) > 2 and args[2]:
        attachments = args[2]
    else:
        attachments = None

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for archives and avoids any
    # newline translation of the data.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

        # Tidy up the import manifests, sorting each of them by revision and
        # finalising them.

        pages_dir = join(space, "pages")

        for pageid in listdir(pages_dir):
            sort_comments(pages_dir, pageid)

        output_manifest = join(space, "MOIN_PACKAGE")
        append(output_manifest, "MoinMoinPackage|1\n")

        for pageid in listdir(pages_dir):
            sort_manifest(pages_dir, pageid, output_manifest, no_translate)

        # Write the page package.

        page_package = ZipFile(package_zip, "w")

        try:
            # Include the page revisions.

            versions_dir = join(space, "versions")

            for versionid in listdir(versions_dir):
                page_package.write(join(versions_dir, versionid))

            # Include the attachments.

            if attachments:
                cwd = getcwd()
                attachments_parent, attachments_name = split(attachments)

                # Work from the parent of the attachments directory so that
                # archive names begin with the directory's own name. A plain
                # relative name has no parent part, and the current directory
                # is then already the right place to be.

                if attachments_parent:
                    chdir(attachments_parent)
                try:
                    for path, dirnames, filenames in walk(attachments_name):
                        for attachment_filename in filenames:
                            # Have to "taint" archive filenames.
                            page_package.write(join(path, attachment_filename),
                                wikiutil.taintfilename(join(path, attachment_filename)))
                finally:
                    chdir(cwd)

            elif is_zipfile:
                for member_name in zf.namelist():
                    if member_name.startswith("attachments"):
                        # Have to "taint" archive filenames.
                        page_package.writestr(wikiutil.taintfilename(member_name), zf.read(member_name))

            # Include only the top-level manifest.

            page_package.write(output_manifest, "MOIN_PACKAGE")

        finally:
            page_package.close()

    finally:
        f.close()

# vim: tabstop=4 expandtab shiftwidth=4