#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import *
from iixr.files import *
from bisect import bisect_right # to find documents in the field index

# The maximum number of documents whose fields are retained in the cache of a
# FieldDictionaryReader.

DOCUMENT_CACHE_LIMIT = 10000

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Forget the previously written document number."

        self.last_docnum = None
        self.subtractor = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).

        The first document number is written as given; each later one is
        written as the result of applying the subtractor to it and the
        previously written document number.
        """

        # Encode the document number relative to the previous one, choosing the
        # subtractor from the first document number seen.

        if self.last_docnum is None:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum
        else:
            docnum_seq = self.subtractor(docnum, self.last_docnum)

        self.begin_record()

        # Write the document number.

        self.write_sequence_value(docnum_seq)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.end_record()

        self.last_docnum = docnum

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Forget the previously read document number."

        self.last_docnum = None
        self.adder = None

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.

        The returned document number is obtained by combining the stored value
        with the previously read document number, and is thus only meaningful
        when records are read in sequence.
        """

        self.begin_record()

        # Read the document number, choosing the adder from the first value
        # seen.

        docnum = self.read_sequence_value()

        if self.last_docnum is None:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum
        else:
            self.last_docnum = self.adder(docnum, self.last_docnum)

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields, reading the identifier before the value for each
        # of them.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        self.end_record()

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.

        The given 'docnum' is returned together with the fields, and becomes
        the basis for the document numbers of records read subsequently.
        """

        self.seek(offset)

        # The document number decoded here is discarded: it was combined with
        # whatever document number was read before seeking, not with that of
        # the record preceding 'offset'. The caller provides the real value.

        _, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Forget the previously written document number and offset."

        self.last_docnum = None
        self.subtractor = None
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.

        The offset is written as the difference from the previously written
        offset, the first being relative to zero.
        """

        # Encode the document number relative to the previous one, choosing the
        # subtractor from the first document number seen.

        if self.last_docnum is None:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum
        else:
            docnum_seq = self.subtractor(docnum, self.last_docnum)

        self.begin_record()

        # Write the document number.

        self.write_sequence_value(docnum_seq)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)
        self.end_record()

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Forget the previously read document number and offset."

        self.last_docnum = None
        self.adder = None
        self.last_offset = 0

    def read_document(self):

        """
        Read a document number and field file offset, returning them as a
        tuple.
        """

        self.begin_record()

        # Read the document number, choosing the adder from the first value
        # seen.

        docnum = self.read_sequence_value()

        if self.last_docnum is None:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum
        else:
            self.last_docnum = self.adder(docnum, self.last_docnum)

        # Read the offset delta, accumulating the actual offset.

        self.last_offset += self.read_number()
        self.end_record()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer', adding an index entry for the first document and
        for every document 'interval' entries after it.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        indexed = self.entry % self.interval == 0

        # The offset must be obtained before the fields are written since it
        # indicates where the record for the document starts.

        if indexed:
            offset = self.field_writer.tell()

        self.field_writer.write_fields(docnum, fields)

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close both files, even if closing the fields file fails."

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', reading the entire field index into memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # A mapping from document numbers to fields for found documents.

        self.cache = {}

        # The (document number, offset) entries of the index in file order.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        "Return the next document number and fields, ending at end of file."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol under its newer name as well.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the fields file."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning None
        if no such document is found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Get the entry position providing the document or one preceding it.
        # Using the offset of the last index entry in the probe is intended to
        # make an entry having the same document number sort no later than the
        # probe, so that it is the entry selected.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # Without a match, there is nothing to return or to cache.

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached. The size and the entry to remove are obtained without
        # copying all of the keys.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close both files, even if closing the fields file fails."

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4