#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012, 2013 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import wikiparser, xmlparser
import sys

# Titles are truncated to this many characters before being used in page names.

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handle_* methods are registered by the main program as callbacks for
    the "object", "property", "id", "collection" and "element" elements of the
    dump. Properties and collections are accumulated in 'content' until the
    enclosing object is handled, at which point the accumulated details are
    written to files beneath the space directory and 'content' is reset.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', which is used both as the
        name of the output directory and as the prefix of page titles. If
        'no_translate' is set to a true value, page bodies are written
        verbatim instead of being translated.
        """

        self.content = {}
        self.elements = []
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, using the "class" attribute found in
        the last entry of 'attributes' as the type and the stripped 'text' as
        the object's identifier. Pages, comments and blog posts are recorded in
        page manifests; body content is written to a version file. Other types
        are ignored. The accumulated content is discarded afterwards.
        """

        objecttype = attributes[-1]["class"]

        # Any identifier is stored as the object's textual content.

        identifier = text.strip()

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):
            self._handle_page(objecttype, identifier)

        # Handle revisions.

        elif objecttype == "BodyContent":
            self._handle_body()

        # The collected properties belonged to the object just handled.

        self.content = {}

    def _handle_page(self, objecttype, identifier):

        """
        Record the page, comment or blog post of the given 'objecttype' having
        the given 'identifier' in the manifest of the page it is a revision of,
        together with any child page and comment details.
        """

        # The content is a dictionary mapping names to properties and
        # collections.

        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle pages and revisions, adding revisions to the page manifest.
        # The original version is used as a unifying identifier for all the
        # different revisions (each of which being defined by a Page
        # element). Although "original" implies the first identifier used,
        # it actually appears to be the latest and will have the highest
        # version number.

        if "originalVersion" in content:
            pageid = content["originalVersion"]
        else:
            pageid = identifier

        versionfile = join(versions_dir, identifier)

        # Note page metadata, not necessarily in the correct order.
        # For comments, the title will need to be rewritten, since they
        # should be defined in terms of their owner page.

        mkdirs(join(pages_dir, pageid))

        # Limit the title to a "safe" number of characters in order to avoid
        # filesystem issues.

        title = content["title"][:MAX_TITLE_LENGTH]

        if title:
            title = "%s/%s" % (self.space, title)
            write(join(pages_dir, pageid, "pagetitle"), title)

        # See sort_manifest for access to this data.

        append(join(pages_dir, pageid, "manifest"),
            "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title, # comment titles will incorporate the comment's position
                content["lastModifierName"],
                content["versionComment"]
                ))

        # Add information to parent pages for child page lists.

        if "parent" in content:
            parentid = content["parent"]
            mkdirs(join(pages_dir, parentid))
            append(join(pages_dir, parentid, "children"), title + "\n")

        # Add creation details for comments to the owner page.
        # Since comments can be versioned, the date of the original version
        # is used, and only this "original" version has the owner property.

        if objecttype == "Comment" and "owner" in content:
            ownerid = content["owner"]
            mkdirs(join(pages_dir, ownerid))
            append(join(pages_dir, ownerid, "comments"),
                "%s|%s\n" % (content["creationDate"], pageid))

        # Some metadata is not particularly relevant. For example,
        # ancestors, children, parent are navigation-related.

        # Other metadata could be added to the page content itself.
        # For example, labelling could be converted to categories.

    def _handle_body(self):

        """
        Write the body held by the current content to the version file named
        by the "content" property, translating it unless translation has been
        disabled for this handler.
        """

        content = self.content
        versions_dir = join(self.space, "versions")

        body = content["body"]
        if not body:
            body = "## Empty page."

        # NOTE: Very simple technique employed for guessing the format.
        # NOTE: The handler's own setting is consulted: a bare 'no_translate'
        # NOTE: would only resolve when a global of that name happens to exist.

        if self.no_translate:
            fn = write
        elif body.startswith("<"):
            fn = xmltranslate
        else:
            fn = translate

        # Show the offending body before letting the error propagate.

        try:
            fn(join(versions_dir, content["content"]), body)
        except Exception:
            print >>sys.stderr, "Error parsing..."
            print >>sys.stderr, body
            raise

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute as the key and the stripped 'text' as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the "name" attribute and starting a new
        element list.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, using
        the "class" attribute and the stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is not an error, but any other failure to create the directory
    is raised as OSError instead of being silently ignored.
    """

    try:
        makedirs(name)
    except OSError:
        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents decoded from UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body, fn=None):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The optional 'fn' is called with the body and the open output file
    to perform the translation; wikiparser.parse is used if it is omitted.
    """

    fn = fn or wikiparser.parse

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        fn(body, out)
    finally:
        out.close()

def xmltranslate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body' produced by xmlparser.parse.
    """

    translate(filename, body, xmlparser.parse)

def sort_comments(pages_dir, pageid):

    """
    Where 'pageid' has comments associated with it, sort them chronologically
    and label the comment pages with the owner page's title and comment's
    position in the chronological sequence. Such labelling is done by writing
    a "pagetitle" file in each comment page's directory.

    The owner page's own "pagetitle" file is read and must therefore exist when
    a "comments" file is present.
    """

    comments = join(pages_dir, pageid, "comments")

    if not exists(comments):
        return

    title = read(join(pages_dir, pageid, "pagetitle"))

    # Each line has the form "creationDate|commentid", so sorting the split
    # lines orders them by creation date first.

    details = [line.split("|") for line in read(comments).split("\n") if line]
    details.sort()

    # Write the sorted comments list for testing purposes.

    write(comments, "\n".join(["|".join(x) for x in details]))

    # Define comments as subpages by setting their titles using this
    # page's name/title and their position in the comments collection.

    for position, (_lastmodified, commentid) in enumerate(details):

        # In the page directory for each comment, write the page title in a
        # special file for later processing.

        write(join(pages_dir, commentid, "pagetitle"), "%s/%04d" % (title, position))

def sort_manifest(pages_dir, pageid, output=None, no_translate=False):

    """
    Using the given 'pageid', locate the manifest for the page and any page
    title information written to a "pagetitle" file.

    Then sort the manifest according to revision so that it will be added to
    MoinMoin in the correct order.

    If a "pagetitle" file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If a "children" file exists, the pages in that file will be added as a list
    to the end of each revision's content.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    If 'no_translate' is set to a true value, no child page or comment
    information is added to the revision content.

    Where no manifest exists for the page, nothing is done.
    """

    manifest = join(pages_dir, pageid, "manifest")
    pagetitle = join(pages_dir, pageid, "pagetitle")
    children = join(pages_dir, pageid, "children")
    comments = join(pages_dir, pageid, "comments")

    # A page directory may exist only because another page named it as a
    # parent or owner, in which case there are no revisions to process.

    if not exists(manifest):
        return

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    f = codecs.open(manifest, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # The first column of each line is the revision number.

    lines.sort(key=lambda x: int(x[0]))

    # Prepare the text added to every revision once, since it does not depend
    # on the revision being processed.

    child_page_text = None

    if exists(children) and not no_translate:
        child_page_names = [x for x in read(children).split("\n") if x]
        child_page_names.sort()
        child_pages = [" * [[%s]]" % x for x in child_page_names]
        child_page_text = child_page_section % "\n".join(child_pages)

    comment_text = None

    if exists(comments) and title and not no_translate:
        comment_text = comment_section % title

    # Reconstruct the lines, optionally changing the titles.

    result = []

    for line in lines:

        # Exactly six columns are expected; the last one retains the line
        # ending read from the manifest.

        _version, _addrevision, filename, old_title, username, comment = line

        # Replace title information with the information already present.

        if title is not None:
            new_title = title
        else:
            new_title = old_title

        # The version is omitted now that the manifest is ordered.

        line = _addrevision, filename, new_title, username, comment
        result.append("|".join(line))

        # Add child page information to the content.

        if child_page_text is not None:
            append(filename, child_page_text)

        # Add comments to the content.

        if comment_text is not None:
            append(filename, comment_text)

    s = "".join(result)

    if output is None:
        write(manifest, s)
    else:
        append(output, s)

# Template for child page information.

child_page_section = """
----

%s
"""

# Template for comments.

comment_section = """
----

<<Include("^%s/")>>
"""

# Main program.

def main():

    """
    Convert the dump named by the first program argument into a page package
    for the space named by the second argument, writing the intermediate files
    to a new directory named after the space and the package to a zip archive
    of the same name. The "--no-translate" option disables the translation of
    page bodies.
    """

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for zip archives to be read
    # correctly on all platforms.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    for pageid in listdir(pages_dir):
        sort_comments(pages_dir, pageid)

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        sort_manifest(pages_dir, pageid, output_manifest, no_translate)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4