#!/usr/bin/env python

"""
Confluence XML dump conversion to a MoinMoin-compatible representation.

Copyright (C) 2012 Paul Boddie <paul@boddie.org.uk>

This software is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License as
published by the Free Software Foundation; either version 2 of
the License, or (at your option) any later version.

This software is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
GNU General Public License for more details.

You should have received a copy of the GNU General Public
License along with this library; see the file LICENCE.txt
If not, write to the Free Software Foundation, Inc.,
51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
from cStringIO import StringIO
import codecs
import xmlread

class ConfluenceHandler:

    """
    Handle content from a Confluence Wiki dump.

    Instances provide callbacks that are registered on an
    xmlread.ConfigurableParser, one per element name. All callbacks share the
    signature (name, elements, attributes, all_text, text). From the way the
    arguments are used here, 'attributes[-1]' is the attribute mapping of the
    element being handled, 'all_text[-2]' is a list collecting text for the
    parent element, and 'text' is the element's own text.

    NOTE(review): the exact calling convention is defined by xmlread, which is
    not part of this file - confirm there before relying on the above.
    """

    def __init__(self, directory):

        "Initialise the handler to write its output below 'directory'."

        # Properties gathered for the object currently being read.
        self.content = {}

        # Elements gathered for the collection currently being read.
        self.elements = []

        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        For "Page" objects, a record is appended to the manifest of the page
        concerned. For "BodyContent" objects, the body is written to a file in
        the versions directory. Objects of other types produce no output. In
        every case, the gathered properties are discarded afterwards.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        # Handle particular types.

        if objecttype == "Page":

            # Handle pages and revisions, adding revisions to the page manifest.
            # A page having an "originalVersion" property is filed under that
            # identifier; otherwise it is filed under its own identifier.

            pageid = content.get("originalVersion", identifier)

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.
            # The leading version field is used by sort_manifest to order the
            # records and is then removed by it.

            mkdirs(join(pages_dir, pageid))

            # NOTE(review): a "|" or newline inside any of these fields would
            # corrupt the manifest record - confirm whether the dump can
            # contain such values.

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                content["title"],
                content["lastModifierName"],
                content["versionComment"]))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            write(join(versions_dir, content["content"]), content["body"])

        # Start afresh for the next object.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, using the "name"
        attribute as the key and the stripped text as the value.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, using the "name"
        attribute as the key, and start a new, empty collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, where
        the class is taken from the "class" attribute and the text is stripped.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. It is not an error
    for the directory to exist already, but any other failure to create it
    raises OSError.
    """

    try:
        makedirs(name)
    except OSError:

        # Tolerate only the case where the directory is already present:
        # other failures (permissions, a regular file in the way) would
        # otherwise resurface later as a less obvious error.

        if not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    f = codecs.open(filename, "a" if append else "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

def sort_manifest(filename):

    """
    Sort the manifest given in 'filename' according to revision. Each record
    is expected to start with an integer revision followed by "|"; this
    leading field is removed from the records written back to the file.
    """

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        lines = [x.split("|") for x in f.readlines()]
    finally:
        f.close()

    # Order numerically by revision, then drop the revision field.

    lines.sort(key=lambda x: int(x[0]))
    lines = ["|".join(x[1:]) for x in lines]

    write(filename, "".join(lines))

if __name__ == "__main__":
    import sys

    if len(sys.argv) < 3:
        print >>sys.stderr, "Usage: %s <XML dump or zip file> <output directory>" % sys.argv[0]
        sys.exit(1)

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    directory = sys.argv[2]

    if exists(directory):
        print >>sys.stderr, "Directory exists. Please choose another or remove its contents."
        sys.exit(1)

    mkdir(directory)
    mkdirs(join(directory, "pages"))
    mkdirs(join(directory, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(directory)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for zip archives and avoids
    # newline translation of the XML data.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(directory, "pages")

    for pageid in listdir(pages_dir):
        manifest = join(pages_dir, pageid, "manifest")
        sort_manifest(manifest)

# vim: tabstop=4 expandtab shiftwidth=4