#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser

# Titles are truncated to this many characters before being written to
# manifests, in order to avoid filesystem issues with very long names.

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    Instances provide callbacks to be registered with an
    xmlread.ConfigurableParser. Properties and collections are accumulated in
    'content' until the enclosing object element is handled, at which point
    files are written below the 'space' directory and the accumulated content
    is discarded.

    NOTE(review): the callback arguments (name, elements, attributes, all_text,
    text) are supplied by xmlread; 'attributes' and 'all_text' appear to be
    stacks whose last item belongs to the current element - confirm against
    the xmlread module.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler to write files below the 'space' directory. If
        'no_translate' is set to a true value, page bodies are written verbatim
        instead of being translated by the parser module.
        """

        self.content = {}       # properties of the object being read
        self.elements = []      # (class, text) items of the current collection
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        Page, Comment and BlogPost objects add a revision entry to the manifest
        of the page to which they belong. BodyContent objects cause the body to
        be written to the appropriate file in the versions directory. Objects
        of other types are ignored. In all cases, the accumulated content is
        reset afterwards.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = content["title"][:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)

            # NOTE(review): the fields are written unescaped, so a "|" or a
            # newline in any of them would corrupt the manifest line.

            append(join(pages_dir, pageid, "manifest"),
                "%s|AddRevision|%s|%s|%s|%s\n" % (
                    content["version"],
                    versionfile,
                    title or content["version"], # comment titles will incorporate the version
                    content["lastModifierName"],
                    content["versionComment"]
                    ))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                # NOTE(review): the title is appended, so a comment listed by
                # more than one revision would accumulate repeated titles in
                # its pagetitle file - confirm whether this can occur.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Add information to parent pages for child page lists.

            if "parent" in content:
                parentid = content["parent"]
                mkdirs(join(pages_dir, parentid))
                append(join(pages_dir, parentid, "children"), title + "\n")

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # Use the setting given to this instance rather than relying upon a
            # module-level name that only exists when run as a program.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        # Discard the accumulated properties before the next object is read.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, starting a new,
        empty collection for subsequent elements.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection as (class, text) tuples."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is not treated as an error, but any other failure to create the
    directory causes the OSError to be raised.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the failure if the directory is actually present.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is written using the UTF-8 encoding.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is read using the UTF-8 encoding.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', as produced by the parser module's parse function.
    """

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        parser.parse(body, out)
    finally:
        out.close()

def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision.

    If a 'pagetitle' file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    The leading revision number column is removed from each line in the
    result.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries numerically by the revision number in the first column.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])
        result.append("|".join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)

def main():

    """
    Convert the Confluence dump named by the first program argument into a
    MoinMoin page package for the workspace named by the second argument.

    A directory named after the workspace is created to hold intermediate
    files, and a zip archive named after the workspace is written as the page
    package. The program exits with status 1 if the arguments are missing or if
    either the directory or the archive already exists.
    """

    import sys

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    # Register handlers in the parser for different elements.

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Archives must be read as binary data.

    if is_zipfile:
        f = open(filename, "rb")
    else:
        f = open(filename)

    # Parse the data, making sure that the file is closed even if the archive
    # cannot be read.

    try:
        if is_zipfile:
            zf = ZipFile(f)
            ff = StringIO(zf.read("entities.xml"))
        else:
            ff = f

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")

        # Directories are also created for parents and comments referenced by
        # other pages; without any revisions of their own, they have no
        # manifest to process.

        if not exists(manifest):
            print >>sys.stderr, "No manifest found for page %s; skipping." % pageid
            continue

        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4