#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser

MAX_TITLE_LENGTH = 120

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the workspace directory named 'space'. If
        'no_translate' is set to a true value, page bodies are written out
        verbatim instead of being passed through the translator.
        """

        # Properties collected for the object currently being read.

        self.content = {}

        # Collection members collected since the last collection was recorded.

        self.elements = []

        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        The object type is taken from the "class" entry of the last item in
        'attributes' and the object identifier from the stripped 'text'.

        For "Page", "Comment" and "BlogPost" objects, a revision entry is
        appended to the manifest in the page's directory. For "BodyContent"
        objects, the body is written to (or translated into) a file in the
        versions directory. In all cases, the collected properties are
        discarded afterwards.

        NOTE: the parameters are presumably supplied by the xmlread parser as
        NOTE: stacks describing the current element and its ancestors; this
        NOTE: should be confirmed against that module.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]

            # Limit the title to a "safe" number of characters in order to avoid
            # filesystem issues.

            title = title[:MAX_TITLE_LENGTH]

            if title:
                title = "%s/%s" % (self.space, title)

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # Use the handler's own setting rather than a module-level name,
            # which is only defined when this file is run as a script.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        # Start collecting properties afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        entry of the last item in 'attributes' as the key and the stripped
        'text' as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the penultimate item in 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the "name" entry of the last item in
        'attributes', and then start a new, empty collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, identifier) tuples,
        where the class is the "class" entry of the last item in 'attributes'
        and the identifier is the stripped 'text'.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is tolerated, but any other failure is raised as OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only the "already exists" situation is expected here: failures such
        # as insufficient permissions should not be silently ignored.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    mode = "a" if append else "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents. The file is decoded as UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body'. The translation is produced by the imported parser module, which
    writes to the UTF-8 encoded output file.
    """

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        parser.parse(body, out)
    finally:
        out.close()

def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision.

    If a 'pagetitle' file exists, the title column in the manifest will be
    augmented with the contents of that file. This is typically done for
    comments.

    If 'output' is given, the manifest details will be appended to the file
    having that filename instead of being rewritten to the original manifest
    file.

    The leading revision column used for sorting is not included in the
    resulting manifest lines.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries numerically by the revision in the first column.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])
        result.append("|".join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)

if __name__ == "__main__":
    import sys

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump in binary mode, since it may be a zip archive.

    f = open(filename, "rb")

    try:
        if is_zipfile:

            # Read the entities document into memory and release the archive.

            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")
        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

# vim: tabstop=4 expandtab shiftwidth=4