#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import *
from iixr.files import *
from bisect import bisect_right # to find terms in the dictionary index

# The maximum number of documents whose fields are retained by the cache of a
# FieldDictionaryReader.

DOCUMENT_CACHE_LIMIT = 10000

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        """
        End the current record and forget the previous document number, so that
        the next document number written is stored as given instead of being
        combined with its predecessor.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        """

        # Encode the document number relative to the previous one, if any.
        # The first document number after a reset selects the subtractor used
        # for all subsequent document numbers.

        if self.last_docnum is not None:
            docnum_seq = self.subtractor(docnum, self.last_docnum)
        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        # Write the document number.

        self.write_sequence_value(docnum_seq)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        """
        Forget the previous document number and begin a new record, so that the
        next document number read is taken as given instead of being combined
        with its predecessor.
        """

        self.last_docnum = None
        self.adder = None
        self.begin_record()

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Read the document number, combining it with the previous one if
        # there is one. The first document number after a reset selects the
        # adder used for all subsequent document numbers.

        docnum = self.read_sequence_value()

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)
        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields, reading the identifier before the value for each
        # one, mirroring the order in which FieldWriter.write_fields emits them.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.

        A tuple containing 'docnum' and the list of field (identifier, value)
        pairs is returned.
        """

        self.seek(offset)

        # The document number decoded here may have been combined with a stale
        # previous document number, so it is discarded in favour of the given
        # 'docnum', which then becomes the basis for subsequent reads.

        bad_docnum, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        """
        End the current record and forget the previous document number and
        offset, so that the next entry is written without reference to an
        earlier one.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Encode the document number relative to the previous one, if any.

        if self.last_docnum is not None:
            docnum_seq = self.subtractor(docnum, self.last_docnum)
        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        # Write the document number.

        self.write_sequence_value(docnum_seq)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        """
        Forget the previous document number and offset, and begin a new record.
        """

        self.last_docnum = None
        self.adder = None
        self.last_offset = 0
        self.begin_record()

    def read_document(self):

        "Read a document number and field file offset."

        # Read the document number, combining it with the previous one if
        # there is one.

        docnum = self.read_sequence_value()

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)
        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        # Read the offset, which is stored as a delta from the previous offset.

        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer', adding an index entry for the first document and
        for every document 'interval' entries after it.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        if self.entry % self.interval == 0:

            # Start a new record in the fields file and note where it begins
            # so that the index can refer to it.

            self.field_writer.reset()
            offset = self.field_writer.tell()
            self.field_writer.write_fields(docnum, fields)
            self.field_index_writer.write_document(docnum, offset)
        else:
            self.field_writer.write_fields(docnum, fields)

        self.entry += 1

    def close(self):

        "Close the fields file writer and the index file writer."

        self.field_writer.close()
        self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', loading all (document number, offset) entries
        from the index.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # The position in 'docs' of the index entry whose record is currently
        # being read sequentially.

        self.entry = 0

        # A mapping from document numbers to fields for documents retrieved
        # using random access.

        self.cache = {}

        # Read the index entries until the index file is exhausted.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        """
        Return the next document number and fields, raising StopIteration when
        no more documents are available.
        """

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol where '__next__' is used instead of 'next'.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        """
        Restart sequential reading by rewinding the field reader and restoring
        the index position established by the initialiser.
        """

        self.field_reader.rewind()

        # Without this, a stale position left by a previous pass would make
        # 'read_fields' give up at the end of the first record.

        self.entry = 0

    def read_fields(self):

        """
        Return the next document number and fields. EOFError is raised when
        there are no more documents.
        """

        try:
            return self.field_reader.read_fields()
        except EOFError:

            # The current record is exhausted: advance to the next index entry
            # and, if there is one, continue with the record it refers to.

            self.entry += 1

            if self.entry >= len(self.docs):
                raise EOFError

            self.field_reader.reset()
            return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning None
        if no such document is found.

        NOTE: this repositions the field reader shared with the sequential
        access methods without updating the sequential position, so a rewind is
        advisable before resuming sequential reading.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Pairing 'docnum' with the largest offset makes the search position
        # fall after any index entry having the same document number.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the term or one preceding it.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached. Only the size and a single key are needed, so the
        # complete list of keys is not built.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close the fields file reader and the index file reader."

        self.field_reader.close()
        self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4