#!/usr/bin/env python

"""
Convert a Confluence Wiki XML dump into per-page manifests and revision files.

Run as a script with two arguments: the dump (an XML file, or a ".zip" archive
containing "entities.xml") and the name of a directory to create for the
output.
"""

from os import listdir, mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
import codecs
import sys

try:
    from cStringIO import StringIO
except ImportError:
    # cStringIO does not exist on Python 3; BytesIO gives the same file-like
    # wrapper around the bytes returned by ZipFile.read.
    from io import BytesIO as StringIO


class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, directory):

        """
        Initialise the handler to write output beneath 'directory'. The
        "versions" subdirectory must exist before any BodyContent object is
        handled, since revision bodies are written there directly.
        """

        # Properties and collections seen since the last completed object.
        self.content = {}
        # Elements seen since the last completed collection.
        self.elements = []
        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type, using the properties recorded since
        the previous object.

        The type is the "class" entry of the last item in 'attributes' and the
        identifier is the stripped 'text' (see handle_id). "Page" objects add
        a line to a page manifest; "BodyContent" objects write a revision
        body. Other types are ignored. The recorded properties are discarded
        afterwards in every case.

        A KeyError is raised if a Page lacks any of "version", "title",
        "lastModifierName" or "versionComment", or a BodyContent lacks
        "content" or "body".
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        if objecttype == "Page":

            # Revisions are filed under the page named by "originalVersion";
            # an object without that property is filed under its own
            # identifier.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Objects are not necessarily seen in revision order, so the
            # version number leads each line to let sort_manifest order the
            # manifest once parsing has finished.
            # NOTE(review): fields are written unescaped. A newline in the
            # title or comment would start a manifest line with no leading
            # version number, which sort_manifest cannot handle - confirm
            # whether such values occur in real dumps.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "manifest"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                content["title"],
                content["lastModifierName"],
                content["versionComment"]))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        elif objecttype == "BodyContent":

            # The "content" property names the file under "versions" that
            # receives this body; presumably it is the identifier of the
            # owning Page object - verify against a real dump.

            write(join(versions_dir, content["content"]), content["body"])

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        """
        Record properties in the current content dictionary, keyed by the
        "name" attribute of the property and holding the stripped 'text'.
        """

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        # The second-to-last entry of 'all_text' is presumably the text
        # collected for the enclosing element - confirm against xmlread.
        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        """
        Record collections in the current content dictionary, keyed by the
        "name" attribute, and start a fresh list for the next collection.
        """

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        """
        Add elements to the current collection as (class, text) tuples, the
        text being stripped of surrounding whitespace.
        """

        self.elements.append((attributes[-1]["class"], text.strip()))


def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An existing
    directory is accepted silently; any other failure raises OSError.
    """

    try:
        makedirs(name)
    except OSError:
        # Only "already there as a directory" is harmless. Permission errors
        # and a path occupied by a regular file must reach the caller.
        if not isdir(name):
            raise


def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)


def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's', encoded as
    UTF-8. If the optional 'append' parameter is set to a true value, 's' will
    be appended to the file; otherwise any existing content is replaced.
    """

    if append:
        mode = "a"
    else:
        mode = "w"

    f = codecs.open(filename, mode, encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()


def sort_manifest(filename):

    """
    Sort the manifest given in 'filename' according to revision, rewriting the
    file with the leading revision number removed from every line.

    Each non-empty line must start with an integer followed by "|"; a
    ValueError is raised otherwise. Since the numbers are removed, the
    function cannot be applied twice to the same file.
    """

    f = codecs.open(filename, "r", encoding="utf-8")
    try:
        text = f.read()
    finally:
        f.close()

    # Split on "\n" only. The codecs readlines() method also breaks lines at
    # Unicode separators such as U+2028, which would cut an entry in two if a
    # title or comment contained one.
    entries = [line.split("|") for line in text.split("\n") if line]

    # The sort is stable, so entries with equal revision numbers keep the
    # order in which they were recorded.
    entries.sort(key=lambda fields: int(fields[0]))

    write(filename, "".join(["|".join(fields[1:]) + "\n" for fields in entries]))


def main():

    """
    Convert the dump named by the first command-line argument, writing the
    result to a new directory named by the second. Exit with status 1 if
    arguments are missing or the directory already exists.
    """

    if len(sys.argv) < 3:
        sys.stderr.write("Usage: %s <dump file (.xml or .zip)> <new directory>\n" % sys.argv[0])
        sys.exit(1)

    # Imported here, and before anything is created on disk, because only
    # the script entry point needs the parser. The handler and the file
    # helpers stay importable without it.
    import xmlread

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    directory = sys.argv[2]

    if exists(directory):
        sys.stderr.write("Directory exists. Please choose another or remove its contents.\n")
        sys.exit(1)

    mkdir(directory)
    mkdirs(join(directory, "pages"))
    mkdirs(join(directory, "versions"))

    p = xmlread.ConfigurableParser()
    handler = ConfluenceHandler(directory)

    p["object"] = handler.handle_object
    p["property"] = handler.handle_property
    p["id"] = handler.handle_id
    p["collection"] = handler.handle_collection
    p["element"] = handler.handle_element

    # Open the XML dump. Binary mode is required for zip archives.
    # NOTE(review): plain XML is now handed to the parser in binary mode as
    # well - confirm that xmlread accepts a binary file object.

    f = open(filename, "rb")

    try:
        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)
    finally:
        f.close()

    # Tidy up the import manifests, sorting each of them by revision and
    # finalising them.

    pages_dir = join(directory, "pages")

    for pageid in listdir(pages_dir):
        sort_manifest(join(pages_dir, pageid, "manifest"))


if __name__ == "__main__":
    main()

# vim: tabstop=4 expandtab shiftwidth=4