1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 26 class TermWriter(FileWriter): 27 28 "Writing term information to files." 29 30 def reset(self): 31 self.last_term = "" 32 self.last_offset = 0 33 34 def write_term(self, term, offset, frequency, doc_frequency): 35 36 """ 37 Write the given 'term', its position file 'offset', its 'frequency' and 38 its 'doc_frequency' (number of documents in which it appears) to the 39 term information file. 40 """ 41 42 # Write the prefix length and term suffix. 43 44 common = len(commonprefix([self.last_term, term])) 45 suffix = term[common:] 46 47 self.write_number(common) 48 self.write_string(suffix) 49 50 # Write the offset delta. 51 52 self.write_number(offset - self.last_offset) 53 54 # Write the frequency. 55 56 self.write_number(frequency) 57 58 # Write the document frequency. 59 60 self.write_number(doc_frequency) 61 62 self.last_term = term 63 self.last_offset = offset 64 65 class TermReader(FileReader): 66 67 "Reading term information from files." 68 69 def reset(self): 70 self.last_term = "" 71 self.last_offset = 0 72 73 def read_term(self): 74 75 """ 76 Read a term, its position file offset, its frequency and its document 77 frequency from the term information file. 78 """ 79 80 # Read the prefix length and term suffix. 81 82 common = self.read_number() 83 suffix = self.read_string() 84 85 self.last_term = self.last_term[:common] + suffix 86 87 # Read the offset delta. 88 89 self.last_offset += self.read_number() 90 91 # Read the frequency. 92 93 frequency = self.read_number() 94 95 # Read the document frequency. 96 97 doc_frequency = self.read_number() 98 99 return self.last_term, self.last_offset, frequency, doc_frequency 100 101 def go_to_term(self, term, offset, info_offset): 102 103 """ 104 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 105 permits the scanning for later terms from the specified term. 106 """ 107 108 self.f.seek(info_offset) 109 self.last_term = term 110 self.last_offset = offset 111 112 class TermIndexWriter(TermWriter): 113 114 "Writing term dictionary index details to files." 115 116 def reset(self): 117 TermWriter.reset(self) 118 self.last_info_offset = 0 119 120 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 121 122 """ 123 Write the given 'term', its position file 'offset', its 'frequency' and 124 its 'doc_frequency' to the term dictionary index file, along with the 125 'info_offset' in the term information file. 126 """ 127 128 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 129 130 # Write the information file offset delta. 131 132 self.write_number(info_offset - self.last_info_offset) 133 self.last_info_offset = info_offset 134 135 class TermIndexReader(TermReader): 136 137 "Reading term dictionary index details from files." 138 139 def reset(self): 140 TermReader.reset(self) 141 self.last_info_offset = 0 142 143 def read_term(self): 144 145 """ 146 Read a term, its position file offset, its frequency, its document 147 frequency and a term information file offset from the term dictionary 148 index file. 149 """ 150 151 term, offset, frequency, doc_frequency = TermReader.read_term(self) 152 153 # Read the offset delta. 154 155 self.last_info_offset += self.read_number() 156 157 return term, offset, frequency, doc_frequency, self.last_info_offset 158 159 class TermDictionaryWriter: 160 161 "Writing term dictionaries." 162 163 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 164 self.info_writer = info_writer 165 self.index_writer = index_writer 166 self.position_dict_writer = position_dict_writer 167 self.interval = interval 168 self.entry = 0 169 170 def _write_term(self, term, offset, frequency, doc_frequency): 171 172 """ 173 Write the given 'term', its position file 'offset', its 'frequency' and 174 its 'doc_frequency' (number of documents in which it appears) to the 175 term information file. Return the offset after the term information was 176 written to the file. 177 """ 178 179 self.info_writer.write_term(term, offset, frequency, doc_frequency) 180 181 if self.entry % self.interval == 0: 182 info_offset = self.info_writer.f.tell() 183 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 184 185 self.entry += 1 186 187 def write_term_positions(self, term, doc_positions): 188 189 """ 190 Write the given 'term' and the 'doc_positions' recording the documents 191 and positions at which the term is found. 192 """ 193 194 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 195 self._write_term(term, offset, frequency, doc_frequency) 196 197 def close(self): 198 self.info_writer.close() 199 self.index_writer.close() 200 self.position_dict_writer.close() 201 202 class TermDictionaryReader: 203 204 "Reading term dictionaries." 205 206 def __init__(self, info_reader, index_reader, position_dict_reader): 207 self.info_reader = info_reader 208 self.index_reader = index_reader 209 self.position_dict_reader = position_dict_reader 210 self.position_dict_iterator = None # for sequential/iterator access 211 212 self.terms = [] 213 try: 214 while 1: 215 self.terms.append(self.index_reader.read_term()) 216 except EOFError: 217 pass 218 219 # Large numbers for ordering purposes. 220 221 if self.terms: 222 self.max_offset = self.terms[-1][1] + 1 223 else: 224 self.max_offset = None 225 226 def _find_closest_entry(self, term): 227 228 """ 229 Find the offsets and frequencies of 'term' from the term dictionary or 230 the closest term starting with the value of 'term'. 231 232 Return the closest index entry consisting of a term, the position file 233 offset, the term frequency, the document frequency, and the term details 234 file offset. 235 """ 236 237 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 238 239 # Get the entry position providing the term or one preceding it. 240 # If no entry precedes the requested term, return the very first entry 241 # as the closest. 242 243 if i == -1: 244 return self.terms[0] 245 else: 246 return self.terms[i] 247 248 def _find_closest_term(self, term): 249 250 """ 251 Find the offsets and frequencies of 'term' from the term dictionary or 252 the closest term starting with the value of 'term'. 253 254 Return the closest term (or the term itself), the position file offset, 255 the term frequency, the document frequency, and the term details file 256 offset (or None if the reader is already positioned). 257 """ 258 259 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 260 261 # Where the term is found immediately, return the offset and 262 # frequencies. If the term does not appear, return the details of the 263 # closest entry. 264 265 if term <= found_term: 266 return found_term, offset, frequency, doc_frequency, info_offset 267 268 # Otherwise, seek past the index term's entry in the information file 269 # and scan for the desired term. 270 271 else: 272 self.info_reader.go_to_term(found_term, offset, info_offset) 273 try: 274 while term > found_term: 275 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 276 except EOFError: 277 pass 278 279 return found_term, offset, frequency, doc_frequency, None 280 281 def _find_term(self, term): 282 283 """ 284 Find the position file offset and frequency of 'term' from the term 285 dictionary. 286 """ 287 288 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 289 290 # If the term is found, return the offset and frequencies. 291 292 if term == found_term: 293 return offset, frequency, doc_frequency 294 else: 295 return None 296 297 def _get_positions(self, offset, doc_frequency): 298 299 """ 300 Obtain positions from the position index 'offset' expecting a number of 301 documents equal to the given 'doc_frequency'. 302 """ 303 304 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 305 306 # Iterator convenience methods. 307 308 def __iter__(self): 309 self.rewind() 310 return self 311 312 def next(self): 313 try: 314 return self.read_term() 315 except EOFError: 316 raise StopIteration 317 318 # Sequential access methods. 319 320 def rewind(self): 321 self.info_reader.rewind() 322 323 def read_term(self): 324 325 """ 326 Return the next term, its frequency, its document frequency, and the 327 documents and positions at which the term is found. 328 """ 329 330 term, offset, frequency, doc_frequency = self.info_reader.read_term() 331 332 # For sequential access, attempt to reuse any iterator. 333 334 if self.position_dict_iterator is None: 335 self.position_dict_iterator = self._get_positions(offset, doc_frequency) 336 337 return term, frequency, doc_frequency, ResetPositionDictionaryIterator(self.position_dict_iterator, offset, doc_frequency) 338 339 # Query methods. 340 341 def find_terms(self, term): 342 343 "Return all terms whose values start with the value of 'term'." 344 345 terms = [] 346 347 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 348 349 # Position the reader, if necessary. 350 351 if info_offset is not None: 352 self.info_reader.go_to_term(found_term, offset, info_offset) 353 354 # Read and record terms. 355 356 try: 357 # Add the found term if it starts with the specified term. 358 359 while found_term.startswith(term): 360 terms.append(found_term) 361 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 362 363 except EOFError: 364 pass 365 366 return terms 367 368 def find_positions(self, term): 369 370 "Return the documents and positions at which the given 'term' is found." 371 372 t = self._find_term(term) 373 if t is None: 374 return None 375 else: 376 offset, frequency, doc_frequency = t 377 return self._get_positions(offset, doc_frequency) 378 379 def get_frequency(self, term): 380 381 "Return the frequency of the given 'term'." 382 383 t = self._find_term(term) 384 if t is None: 385 return None 386 else: 387 offset, frequency, doc_frequency = t 388 return frequency 389 390 def get_document_frequency(self, term): 391 392 "Return the document frequency of the given 'term'." 393 394 t = self._find_term(term) 395 if t is None: 396 return None 397 else: 398 offset, frequency, doc_frequency = t 399 return doc_frequency 400 401 def close(self): 402 self.info_reader.close() 403 self.index_reader.close() 404 self.position_dict_reader.close() 405 if self.position_dict_iterator is not None: 406 self.position_dict_iterator.close() 407 self.position_dict_iterator = None 408 409 # vim: tabstop=4 expandtab shiftwidth=4