#!/usr/bin/env python

"""
Generic file access.

Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import vint
from cStringIO import StringIO
import bz2, zlib

# Constants.

# Number of buffered bytes at which FileWriter.write flushes to the file.
WRITE_CACHE_SIZE = 100000

# Minimum number of bytes FileReader.read requests from the file at once.
READ_CACHE_SIZE = 10000

# Once the read pointer passes this many bytes into the cache, the consumed
# part of the cache is discarded.
READ_CACHE_RESIZE = 5000

# Compressors are tried in this order by FileWriter.write_string; the flag is
# stored before the data so that FileReader.read_string can pick the matching
# decompressor.
compressors = [("b", bz2.compress), ("z", zlib.compress)]
decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}

class File:

    "A basic file abstraction."

    def __init__(self, f):

        "Wrap the file-like object 'f' and reset any per-record state."

        self.f = f
        self.reset()

    def reset(self):

        "To be used to reset the state of the reader or writer between records."

        pass

    def rewind(self):

        "Seek to the start of the file and reset any per-record state."

        self.seek(0)
        self.reset()

    def seek(self, offset):

        "To be defined by readers."

        pass

    def flush(self):

        "To be defined by writers."

        pass

    def close(self):

        """
        Flush pending data and close the underlying file. Calling this method
        again after the file has been closed does nothing.
        """

        if self.f is None:
            return

        f = self.f

        # Release the underlying file even if flushing fails, so that an error
        # while writing does not leak the file handle.

        try:
            self.flush()
        finally:
            self.f = None
            f.close()

class FileWriter(File):

    "Writing basic data types to files."

    def __init__(self, f):

        "Wrap the file-like object 'f' and start with an empty write cache."

        File.__init__(self, f)
        self.cache = StringIO()

        # Retained for compatibility; this class does not update it.

        self.cache_length = 0

    def write_number(self, number):

        "Write 'number' to the file using a variable length encoding."

        self.write(vint(number))

    def write_string(self, s, compress=0):

        """
        Write 's' to the file, recording its length and compressing the string
        if 'compress' is set to a true value.

        When 'compress' is true, a one character flag precedes the length: the
        flag of the compressor used, or "-" if no compressor made the data
        shorter. When 'compress' is false, no flag is written at all.
        """

        # Convert Unicode objects to strings.

        if isinstance(s, unicode):
            s = s.encode("utf-8")

        # Compress the string if requested.

        if compress:
            for flag, fn in compressors:
                compressed = fn(s)

                # Take the first string shorter than the original.

                if len(compressed) < len(s):
                    s = compressed
                    break
            else:

                # No compressor helped: store the data as it is.

                flag = "-"
        else:
            flag = ""

        # Write the length of the data before the data itself.

        self.write(flag + vint(len(s)) + s)

    # Cache-affected methods.

    def write(self, s):

        """
        Append 's' to the write cache, flushing the cache to the file once it
        holds at least WRITE_CACHE_SIZE bytes.
        """

        self.cache.write(s)
        if self.cache.tell() >= WRITE_CACHE_SIZE:
            self.flush()

    def tell(self):

        """
        Return the position of the underlying file plus the amount of data
        still held in the write cache.
        """

        return self.f.tell() + self.cache.tell()

    def flush(self):

        "Write the cached data to the underlying file and empty the cache."

        self.f.write(self.cache.getvalue())
        self.cache = StringIO()

class FileReader(File):

    "Reading basic data types from files."

    def __init__(self, f):

        "Wrap the file-like object 'f' and position it at offset 0."

        File.__init__(self, f)
        self.reset_cache(0)

    def reset_cache(self, offset):

        """
        Discard the read cache and position the underlying file at 'offset',
        which becomes the file offset of the start of the (empty) cache.
        """

        self.cache = ""
        self.cache_length = 0
        self.cache_start = 0
        self.cache_offset = offset
        self.f.seek(offset)

    def resize_cache(self, next_start):

        """
        Discard the first 'next_start' bytes of the cache, leaving the read
        pointer at the start of what remains.
        """

        self.cache = self.cache[next_start:]
        self.cache_length = len(self.cache)
        self.cache_start = 0
        self.cache_offset += next_start

    def read_number(self):

        """
        Read a number from the file.

        Each byte contributes its low seven bits, least significant group
        first; a set high bit indicates that another byte follows. EOFError is
        raised if the data ends before the number is complete.
        """

        shift = 0
        number = 0
        read = self.read

        # Read each byte, adding it to the number.

        try:
            csd = ord(read(1))
            while csd & 128:
                number += ((csd & 127) << shift)
                shift += 7
                csd = ord(read(1))

        # An empty read makes ord raise TypeError: there is no more data.

        except TypeError:
            raise EOFError

        # The final byte has no continuation bit and is used whole.

        return number + (csd << shift)

    def read_string(self, decompress=0):

        """
        Read a string from the file, decompressing the stored data if
        'decompress' is set to a true value, and return it as a Unicode object
        decoded from UTF-8.

        'decompress' must match the 'compress' setting used when the string
        was written, since it decides whether a flag character is expected
        before the length.

        EOFError is raised if the data ends before the whole string has been
        read. KeyError is raised for a flag with no known decompressor.
        """

        # A compression flag precedes the length only for compressed records.

        if decompress:
            flag = self.read(1)
        else:
            flag = "-"

        length = self.read_number()
        s = self.read(length)

        # Report truncated data instead of returning a partial string.

        if len(s) < length:
            raise EOFError("expected %d bytes but read only %d" % (length, len(s)))

        # Perform decompression if applicable.

        if flag != "-":
            s = decompressors[flag](s)

        # Convert strings to Unicode objects.

        return unicode(s, "utf-8")

    # Cache-affected methods.

    def read(self, n):

        """
        Return up to 'n' bytes from the current position, advancing the
        position by the number of bytes returned. Fewer than 'n' bytes are
        returned if the underlying file cannot supply them.
        """

        needed = n - (self.cache_length - self.cache_start)

        # Read the needed number of characters, if possible, fetching at least
        # READ_CACHE_SIZE bytes so that small reads are served from the cache.

        if needed > 0:
            data = self.f.read(max(needed, READ_CACHE_SIZE))
            self.cache += data
            self.cache_length += len(data)

        # Get the end of the requested block.

        next_start = self.cache_start + n
        s = self.cache[self.cache_start:next_start]

        # Reposition the pointer to the cache.

        self._seek_cache(len(s))
        return s

    def tell(self):

        "Return the current read position as an offset into the file."

        return self.cache_offset + self.cache_start

    def seek(self, offset):

        """
        Move the read position to 'offset'. Forward seeks try to stay within
        the cache; backward seeks always discard it.
        """

        current = self.tell()

        # If seeking forward, attempt to navigate the cache.

        if offset >= current:
            self._seek_cache(offset - current)
        else:
            self.reset_cache(offset)

    def _seek_cache(self, delta):

        "Advance the read position by 'delta' bytes relative to the cache."

        next_start = self.cache_start + delta

        # If the new position is at or beyond the end of the cache, nothing
        # cached is of further use: start again at the new file position.

        if next_start > 0 and next_start >= len(self.cache):
            self.reset_cache(self.cache_offset + next_start)

        # If the cache is too big, resize it.

        elif next_start > READ_CACHE_RESIZE:
            self.resize_cache(next_start)

        # Otherwise, just reference the next part of the cache.

        else:
            self.cache_start = next_start

class FileOpener:

    "Opening files using their filenames."

    def __init__(self, filename):

        "Remember 'filename' for later calls to open."

        self.filename = filename

    def open(self, mode):

        "Open the file with the given 'mode' and return the file object."

        return open(self.filename, mode)

    def close(self):

        "Do nothing: files returned by open are closed by their users."

        pass

# vim: tabstop=4 expandtab shiftwidth=4