#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread
import parser

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    The handler methods are registered with an xmlread parser, keyed by element
    name. Each receives the arguments 'name', 'elements', 'attributes',
    'all_text' and 'text'; only the last entry of 'attributes' (a mapping of
    attribute names to values), the second-to-last entry of 'all_text' (a list)
    and 'text' (a string) are used here.
    """

    def __init__(self, space, no_translate=False):

        """
        Initialise the handler for the given 'space', this being both the name
        of the workspace and the directory under which output is written. If
        'no_translate' is set to a true value, page bodies are written verbatim
        instead of being translated.
        """

        self.content = {}       # properties gathered for the current object
        self.elements = []      # elements gathered for the current collection
        self.space = space
        self.no_translate = no_translate

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        For "Page", "Comment" and "BlogPost" objects, a revision entry is
        appended to the manifest of the page concerned. For "BodyContent"
        objects, the body is written (translated unless translation has been
        disabled) to the versions directory. Other object types are ignored.
        In all cases, the collected properties are discarded afterwards.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.space, "pages")
        versions_dir = join(self.space, "versions")

        # Handle particular types.

        if objecttype in ("Page", "Comment", "BlogPost"):

            # Handle pages and revisions, adding revisions to the page manifest.
            # The original version is used as a unifying identifier for all the
            # different revisions (each of which being defined by a Page
            # element). Although "original" implies the first identifier used,
            # it actually appears to be the latest and will have the highest
            # version number.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # For comments, the title will need to be rewritten, since they
            # should be defined in terms of their owner page.

            mkdirs(join(pages_dir, pageid))

            title = content["title"]
            if title:
                title = "%s/%s" % (self.space, title)

            # NOTE(review): fields are joined with "|" and terminated with a
            # newline without escaping; a version comment containing either
            # character would presumably corrupt the manifest - confirm.

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                title or content["version"], # comment titles will incorporate the version
                content["lastModifierName"],
                content["versionComment"]))

            # Write comments as subpages.

            if "comments" in content:

                # Define a page directory for each comment, and write the page
                # title in a special file for later processing.

                for _comment, commentid in content["comments"]:
                    mkdirs(join(pages_dir, commentid))
                    append(join(pages_dir, commentid, "pagetitle"), title)

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            body = content["body"]
            if not body:
                body = "## Empty page."

            # Use the handler's own setting rather than relying on a module
            # global that only exists when this file is run as a script.

            if self.no_translate:
                fn = write
            else:
                fn = translate

            fn(join(versions_dir, content["content"]), body)

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute as the key and the stripped 'text' as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        """
        Promote identifiers to the parent element's text by appending 'text' to
        the second-to-last entry in 'all_text'.
        """

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, storing the
        elements gathered so far under the "name" attribute and then starting a
        new, empty collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as ("class" attribute, stripped
        'text') tuples.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not an error; any other failure to create it is raised as
    OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Only tolerate the directory already being present: other failures
        # would otherwise resurface later as confusing file errors.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's', encoding it as
    UTF-8. If the optional 'append' parameter is set to a true value, 's' will
    be appended to the file.
    """

    f = codecs.open(filename, "a" if append else "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def read(filename):

    """
    Read from the file with the given 'filename', returning a string containing
    its contents decoded from UTF-8.
    """

    f = codecs.open(filename, encoding="utf-8")
    try:
        return f.read()
    finally:
        f.close()

def translate(filename, body):

    """
    Write to the file with the given 'filename' a translation of the given
    'body', as produced by the parser module, encoding the output as UTF-8.
    """

    out = codecs.open(filename, "w", encoding="utf-8")
    try:
        parser.parse(body, out)
    finally:
        out.close()

def sort_manifest(filename, pagetitle, output=None):

    """
    Sort the manifest given in 'filename' according to revision. If a
    'pagetitle' file exists, the title column in the manifest will be augmented
    with the contents of that file. If 'output' is given, the manifest details
    will be appended to the file having that filename instead of being rewritten
    to the original manifest file.

    The leading revision column used for sorting is removed from each line
    written out.
    """

    if exists(pagetitle):
        title = read(pagetitle)
    else:
        title = None

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order the entries numerically by the revision held in the first column.

    lines.sort(key=lambda x: int(x[0]))

    # Reconstruct the lines, optionally changing the titles.

    result = []
    for x in lines:
        if title is not None:
            x[3] = "%s/%s" % (title, x[3])
        result.append("|".join(x[1:]))

    s = "".join(result)

    if output is None:
        write(filename, s)
    else:
        append(output, s)

if __name__ == "__main__":
    import sys

    try:
        filename = sys.argv[1]
        is_zipfile = splitext(filename)[-1] == extsep + "zip"
        space = sys.argv[2]
    except IndexError:
        print >>sys.stderr, "Please specify an XML file containing Wiki data and a workspace name."
        print >>sys.stderr, "For example: com_entities.xml COM"
        sys.exit(1)

    no_translate = "--no-translate" in sys.argv

    if exists(space):
        print >>sys.stderr, "Directory exists for space %s. Please choose another or remove its contents." % space
        sys.exit(1)

    package_zip = space + extsep + "zip"

    if exists(package_zip):
        print >>sys.stderr, "Page package exists. Please remove or rename it:", package_zip
        sys.exit(1)

    mkdir(space)
    mkdirs(join(space, "pages"))
    mkdirs(join(space, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(space, no_translate)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Archives must be read in binary mode.

    f = open(filename, "rb" if is_zipfile else "r")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(space, "pages")

    output_manifest = join(space, "MOIN_PACKAGE")
    append(output_manifest, "MoinMoinPackage|1\n")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        pagetitle = join(pages_dir, pageid, "pagetitle")
        sort_manifest(manifest, pagetitle, output_manifest)

    # Write the page package.

    page_package = ZipFile(package_zip, "w")

    try:
        # Include the page revisions.

        versions_dir = join(space, "versions")

        for versionid in listdir(versions_dir):
            page_package.write(join(versions_dir, versionid))

        # Include only the top-level manifest.

        page_package.write(output_manifest, "MOIN_PACKAGE")

    finally:
        page_package.close()

# vim: tabstop=4 expandtab shiftwidth=4