#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import *
from iixr.files import *
from bisect import bisect_right # to find terms in the dictionary index

DOCUMENT_CACHE_LIMIT = 10000

class FieldWriter(FileWriter):

    "Writing field data to files."

    def begin(self, docnum_size):

        """
        Write a header record containing 'docnum_size', then remember that size
        and the position at which the actual data starts.
        """

        self.write_number(docnum_size)
        self.end_record()
        self.docnum_size = docnum_size
        self.data_start = self.tell()

    def reset(self):

        """
        End the current record and forget the previous document number, so that
        the next document number is written as given rather than relative to
        its predecessor.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None

    def _write_docnum(self, docnum):

        """
        Write 'docnum' as a sequence value: as given if it is the first since
        the last reset, otherwise as the result of applying the subtractor to
        it and the previously written document number.

        NOTE: This does not update 'last_docnum'; callers do so once the rest
        NOTE: of their entry has been written.
        """

        if self.last_docnum is not None:
            docnum_seq = self.subtractor(docnum, self.last_docnum)
        else:
            # The first document number selects the subtractor used for the
            # remainder of the sequence.

            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        self.write_sequence_value(docnum_seq, self.docnum_size)

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        """

        # Write the document number.

        self._write_docnum(docnum)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for i, field in fields:
            self.write_number(i)
            self.write_string(field, 1) # compress

        self.last_docnum = docnum

class FieldReader(FileReader):

    "Reading field data from files."

    def begin(self):

        """
        Read the header record providing the document number size, and remember
        the position at which the actual data starts.
        """

        self.begin_record()
        try:
            self.docnum_size = self.read_number()
        except EOFError:
            self.docnum_size = 0 # NOTE: No fields!
        self.data_start = self.tell()

    def reset(self):

        """
        Forget the previous document number, so that the next document number
        read is taken as given, and begin a new record.
        """

        self.last_docnum = None
        self.adder = None
        self.begin_record()

    def _read_docnum(self):

        """
        Read a document number sequence value, combine it with the previously
        read document number (if any) using the adder, remember the result as
        'last_docnum' and return it.
        """

        docnum = self.read_sequence_value(self.docnum_size)

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)
        else:
            # The first document number selects the adder used for the
            # remainder of the sequence.

            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        return self.last_docnum

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Read the document number.

        docnum = self._read_docnum()

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.seek(offset)

        # The document number produced here may have been combined with a
        # document number left over from earlier reading, so it is discarded
        # in favour of the number supplied by the caller.

        bad_docnum, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FieldWriter):

    "Writing field index details to files."

    def reset(self):

        "Reset the document number state and the previous offset."

        FieldWriter.reset(self)
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Write the document number.

        self._write_docnum(docnum)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FieldReader):

    "Reading field index details from files."

    def reset(self):

        "Reset the document number state and the accumulated offset."

        FieldReader.reset(self)
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset."

        # Read the document number.

        docnum = self._read_docnum()

        # Read the offset delta, accumulating the absolute offset.

        self.last_offset += self.read_number()

        return docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer', adding an index entry for every document whose
        position in the sequence of written documents is a multiple of
        'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the given 'docnum' and 'fields'."

        # The first document determines the size of document number values
        # recorded in the headers of both files.

        if self.entry == 0:
            docnum_size = sizeof(docnum)
            self.field_writer.begin(docnum_size)
            self.field_index_writer.begin(docnum_size)
            self.field_index_writer.reset()

        # Every 'interval' documents, start a new record in the fields file and
        # note its position in the index.

        if self.entry % self.interval == 0:
            self.field_writer.reset()
            offset = self.field_writer.tell()
            self.field_writer.write_fields(docnum, fields)
            self.field_index_writer.write_document(docnum, offset)
        else:
            self.field_writer.write_fields(docnum, fields)

        self.entry += 1

    def close(self):

        "Close both writers, closing the index writer even if the first fails."

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', reading every (document number, offset) entry
        from the index into memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.field_reader.reset()
        self.field_index_reader.reset()

        self.cache = {}

        self.entry = 0
        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        "Return the next document number and fields, ending the iteration at EOF."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    __next__ = next # the name used by the Python 3 iterator protocol

    # Sequential access methods.

    def rewind(self):

        "Return to the start of the field data for sequential reading."

        self.field_reader.rewind()

        # The index position must follow the reader back to the start,
        # otherwise a repeated traversal would stop after the first record.

        self.entry = 0

    def read_fields(self):

        "Return the next document number and fields."

        try:
            return self.field_reader.read_fields()
        except EOFError:

            # The current record is exhausted: continue with the next record
            # if the index indicates that one exists.

            self.entry += 1

            if self.entry >= len(self.docs):
                raise EOFError

            self.field_reader.reset()
            return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning None
        if no such document is found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Get the index entry providing the document or one preceding it.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is not found, there is nothing to return.

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached.

        if self.cache and len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close both readers, closing the index reader even if the first fails."

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4