#!/usr/bin/env python

"""
Convert a Confluence Wiki XML dump into import manifests and revision files.

Usage: <this script> <dump.xml or dump.zip> <output directory>

For each "Page" object a line is appended to 'pages/<page id>/import' beneath
the output directory; for each "BodyContent" object the body text is written
to 'versions/<content id>'.
"""

from os import mkdir, makedirs
from os.path import exists, extsep, isdir, join, splitext
from zipfile import ZipFile
import codecs
import errno

try:
    from cStringIO import StringIO
except ImportError:
    # Python 3: archive members are read as bytes, so wrap them in BytesIO.
    from io import BytesIO as StringIO

class ConfluenceHandler:

    "Handle content from a Confluence Wiki dump."

    def __init__(self, directory):

        "Initialise the handler to write its output beneath 'directory'."

        # Properties collected for the object currently being parsed.
        self.content = {}

        # Elements collected for the collection currently being parsed.
        self.elements = []

        self.directory = directory

    def handle_object(self, name, elements, attributes, all_text, text):

        """
        Handle objects according to type.

        The object type is read from the "class" entry of 'attributes[-1]' and
        the identifier is the stripped 'text'. The 'pages' and 'versions'
        directories beneath the output directory are expected to exist already.

        NOTE(review): 'attributes' is presumably the parser's stack of
        attribute mappings with the current element last -- confirm against
        xmlread.
        """

        objecttype = attributes[-1]["class"]
        identifier = text.strip()
        content = self.content

        pages_dir = join(self.directory, "pages")
        versions_dir = join(self.directory, "versions")

        # Handle particular types.

        if objecttype == "Page":

            # Handle pages and revisions, adding revisions to the page
            # manifest. A revision is filed under the page named by its
            # "originalVersion" property; without that property the object's
            # own identifier is used.

            if "originalVersion" in content:
                pageid = content["originalVersion"]
            else:
                pageid = identifier

            versionfile = join(versions_dir, identifier)

            # Note page metadata, not necessarily in the correct order.

            mkdirs(join(pages_dir, pageid))

            append(join(pages_dir, pageid, "import"), "%s|AddRevision|%s|%s|%s|%s\n" % (
                content["version"],
                versionfile,
                content["title"],
                content["lastModifierName"],
                content["versionComment"]))

            # Some metadata is not particularly relevant. For example,
            # ancestors, children, parent are navigation-related.

            # Other metadata could be added to the page content itself.
            # For example, labelling could be converted to categories.

        # Handle revisions.

        elif objecttype == "BodyContent":
            write(join(versions_dir, content["content"]), content["body"])

        # Start afresh for the next object, whatever its type.

        self.content = {}

    def handle_property(self, name, elements, attributes, all_text, text):

        "Record properties in the current content dictionary."

        self.content[attributes[-1]["name"]] = text.strip()

    def handle_id(self, name, elements, attributes, all_text, text):

        "Promote identifiers to the parent element's text."

        # NOTE(review): 'all_text[-2]' is presumably the text list of the
        # enclosing element -- confirm against xmlread.

        all_text[-2].append(text)

    def handle_collection(self, name, elements, attributes, all_text, text):

        "Record collections in the current content dictionary."

        self.content[attributes[-1]["name"]] = self.elements
        self.elements = []

    def handle_element(self, name, elements, attributes, all_text, text):

        "Add elements to the current collection."

        self.elements.append((attributes[-1]["class"], text.strip()))

def mkdirs(name):

    """
    Make the directory with the given 'name' at any depth. An already existing
    directory is not an error; any other failure (such as a lack of permission
    or 'name' being an existing file) raises OSError.
    """

    try:
        makedirs(name)
    except OSError as exc:
        if exc.errno != errno.EEXIST or not isdir(name):
            raise

def append(filename, s):

    "Append to the file with the given 'filename' the string 's'."

    write(filename, s, True)

def write(filename, s, append=False):

    """
    Write to the file with the given 'filename' the string 's'. If the optional
    'append' parameter is set to a true value, 's' will be appended to the file.
    The file is encoded as UTF-8.
    """

    f = codecs.open(filename, "a" if append else "w", encoding="utf-8")
    try:
        f.write(s)
    finally:
        f.close()

if __name__ == "__main__":
    import sys

    # Only the command-line conversion needs the parser, so the handler and
    # helpers above remain importable without it.

    import xmlread

    if len(sys.argv) < 3:
        sys.stderr.write("Usage: %s <XML or zip dump> <output directory>\n" % sys.argv[0])
        sys.exit(1)

    filename = sys.argv[1]
    is_zipfile = splitext(filename)[-1] == extsep + "zip"

    directory = sys.argv[2]

    if exists(directory):
        sys.stderr.write("Directory exists. Please choose another or remove its contents.\n")
        sys.exit(1)

    # Open the XML dump before creating any output, so that an unreadable
    # input does not leave an empty directory behind. Binary mode is needed
    # for zip archives.

    f = open(filename, "rb")

    try:
        mkdir(directory)
        mkdirs(join(directory, "pages"))
        mkdirs(join(directory, "versions"))

        p = xmlread.ConfigurableParser()
        handler = ConfluenceHandler(directory)

        p["object"] = handler.handle_object
        p["property"] = handler.handle_property
        p["id"] = handler.handle_id
        p["collection"] = handler.handle_collection
        p["element"] = handler.handle_element

        if is_zipfile:
            zf = ZipFile(f)
            try:
                ff = StringIO(zf.read("entities.xml"))
            finally:
                zf.close()
        else:
            ff = f

        # Parse the data.

        p.parse(ff)

    finally:
        f.close()

    # TODO: tidy up the import manifests, sorting each of them by revision
    # and finalising them (no code for this step exists yet).

# vim: tabstop=4 expandtab shiftwidth=4