1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from iixr.phrases import PhraseIterator 24 from os.path import commonprefix # to find common string prefixes 25 from bisect import bisect_right # to find terms in the dictionary index 26 27 class TermWriter(FileWriter): 28 29 "Writing term information to files." 30 31 def reset(self): 32 self.end_record() 33 self.last_term = "" 34 self.last_offset = 0 35 36 def write_term(self, term, offset, frequency, doc_frequency): 37 38 """ 39 Write the given 'term', its position file 'offset', its 'frequency' and 40 its 'doc_frequency' (number of documents in which it appears) to the 41 term information file. 42 """ 43 44 if term <= self.last_term: 45 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) 46 47 # Write the prefix length and term suffix. 48 49 common = len(commonprefix([self.last_term, term])) 50 suffix = term[common:] 51 52 self.write_number(common) 53 self.write_string(suffix) 54 55 # Write the offset delta. 56 # Write the frequency. 57 # Write the document frequency. 58 59 self.write_numbers(( 60 offset - self.last_offset, 61 frequency, 62 doc_frequency 63 )) 64 65 self.last_term = term 66 self.last_offset = offset 67 68 class TermReader(FileReader): 69 70 "Reading term information from files." 71 72 def reset(self): 73 self.last_term = "" 74 self.last_offset = 0 75 self.begin_record() 76 77 def read_term(self): 78 79 """ 80 Read a term, its position file offset, its frequency and its document 81 frequency from the term information file. 82 """ 83 84 # Read the prefix length and term suffix. 85 86 common = self.read_number() 87 suffix = self.read_string() 88 89 self.last_term = self.last_term[:common] + suffix 90 91 # Read the offset delta. 92 93 self.last_offset += self.read_number() 94 95 # Read the frequency. 96 97 frequency = self.read_number() 98 99 # Read the document frequency. 100 101 doc_frequency = self.read_number() 102 103 return self.last_term, self.last_offset, frequency, doc_frequency 104 105 def go_to_term(self, term, offset, info_offset): 106 107 """ 108 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 109 permits the scanning for later terms from the specified term. 110 """ 111 112 self.seek(info_offset) 113 self.last_term = term 114 self.last_offset = offset 115 116 class TermIndexWriter(TermWriter): 117 118 "Writing term dictionary index details to files." 119 120 def reset(self): 121 TermWriter.reset(self) 122 self.last_info_offset = 0 123 124 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 125 126 """ 127 Write the given 'term', its position file 'offset', its 'frequency' and 128 its 'doc_frequency' to the term dictionary index file, along with the 129 'info_offset' in the term information file. 130 """ 131 132 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 133 134 # Write the information file offset delta. 135 136 self.write_number(info_offset - self.last_info_offset) 137 138 self.last_info_offset = info_offset 139 140 class TermIndexReader(TermReader): 141 142 "Reading term dictionary index details from files." 143 144 def reset(self): 145 TermReader.reset(self) 146 self.last_info_offset = 0 147 148 def read_term(self): 149 150 """ 151 Read a term, its position file offset, its frequency, its document 152 frequency and a term information file offset from the term dictionary 153 index file. 154 """ 155 156 term, offset, frequency, doc_frequency = TermReader.read_term(self) 157 158 # Read the offset delta. 159 160 self.last_info_offset += self.read_number() 161 162 return term, offset, frequency, doc_frequency, self.last_info_offset 163 164 class TermDictionaryWriter: 165 166 "Writing term dictionaries." 167 168 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 169 self.info_writer = info_writer 170 self.index_writer = index_writer 171 self.position_dict_writer = position_dict_writer 172 self.interval = interval 173 self.entry = 0 174 175 def _write_term(self, term, offset, frequency, doc_frequency): 176 177 """ 178 Write the given 'term', its position file 'offset', its 'frequency' and 179 its 'doc_frequency' (number of documents in which it appears) to the 180 term information file. Return the offset before the term information was 181 written to the file. 182 """ 183 184 if self.entry % self.interval == 0: 185 self.info_writer.reset() 186 info_offset = self.info_writer.tell() 187 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 188 189 self.info_writer.write_term(term, offset, frequency, doc_frequency) 190 self.entry += 1 191 192 def write_term_positions(self, term, doc_positions): 193 194 """ 195 Write the given 'term' and the 'doc_positions' recording the documents 196 and positions at which the term is found. 197 """ 198 199 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 200 201 if not frequency or not doc_frequency: 202 raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions) 203 204 self._write_term(term, offset, frequency, doc_frequency) 205 206 def close(self): 207 self.info_writer.close() 208 self.index_writer.close() 209 self.position_dict_writer.close() 210 211 class TermDictionaryReader: 212 213 "Reading term dictionaries." 214 215 def __init__(self, info_reader, index_reader, position_dict_reader): 216 self.info_reader = info_reader 217 self.index_reader = index_reader 218 self.position_dict_reader = position_dict_reader 219 self.entry = 0 220 221 self.terms = [] 222 try: 223 while 1: 224 self.terms.append(self.index_reader.read_term()) 225 except EOFError: 226 pass 227 228 # Large numbers for ordering purposes. 229 230 if self.terms: 231 self.max_offset = self.terms[-1][1] + 1 232 else: 233 self.max_offset = None 234 235 def _find_closest_entry(self, term): 236 237 """ 238 Find the offsets and frequencies of 'term' from the term dictionary or 239 the closest term starting with the value of 'term'. 240 241 Return the closest index entry consisting of a term, the position file 242 offset, the term frequency, the document frequency, and the term details 243 file offset. 244 """ 245 246 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 247 248 # Get the entry position providing the term or one preceding it. 249 # If no entry precedes the requested term, return the very first entry 250 # as the closest. 251 252 if i == -1: 253 self.entry = 0 254 return self.terms[0] 255 else: 256 self.entry = i 257 return self.terms[i] 258 259 def _find_closest_term(self, term): 260 261 """ 262 Find the offsets and frequencies of 'term' from the term dictionary or 263 the closest term starting with the value of 'term'. 264 265 Return the closest term (or the term itself), the position file offset, 266 the term frequency, the document frequency, and the term details file 267 offset (or None if the reader is already positioned). 268 """ 269 270 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 271 272 # Where the term is found immediately, return the offset and 273 # frequencies. If the term does not appear, return the details of the 274 # closest entry. 275 276 if term <= found_term: 277 return found_term, offset, frequency, doc_frequency, info_offset 278 279 # Otherwise, seek past the index term's entry in the information file 280 # and scan for the desired term. 281 282 else: 283 # Reset the term and offset for the new page. 284 self.info_reader.go_to_term("", 0, info_offset) 285 try: 286 while term > found_term: 287 found_term, offset, frequency, doc_frequency = self._read_term() 288 except EOFError: 289 pass 290 291 return found_term, offset, frequency, doc_frequency, None 292 293 def _find_term(self, term): 294 295 """ 296 Find the position file offset and frequency of 'term' from the term 297 dictionary. 298 """ 299 300 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 301 302 # If the term is found, return the offset and frequencies. 303 304 if term == found_term: 305 return offset, frequency, doc_frequency 306 else: 307 return None 308 309 def _get_term_and_positions(self, term, offset, frequency, doc_frequency): 310 311 """ 312 Return the term plus positions details using the given 'term', 'offset', 313 'frequency' and 'doc_frequency'. 314 """ 315 316 return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) 317 318 def _get_positions(self, offset, doc_frequency): 319 320 """ 321 Obtain positions from the position index 'offset' expecting a number of 322 documents equal to the given 'doc_frequency'. 323 """ 324 325 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 326 327 # Iterator convenience methods. 328 329 def __iter__(self): 330 self.rewind() 331 return self 332 333 def next(self): 334 try: 335 return self.read_term() 336 except EOFError: 337 raise StopIteration 338 339 # Sequential access methods. 340 341 def rewind(self): 342 self.entry = 0 343 self.info_reader.rewind() 344 345 def read_term(self): 346 347 """ 348 Return the next term, its frequency, its document frequency, and the 349 documents and positions at which the term is found. 350 """ 351 352 return self._get_term_and_positions(*self._read_term()) 353 354 def _read_term(self): 355 356 try: 357 term, offset, frequency, doc_frequency = self.info_reader.read_term() 358 except EOFError: 359 self.entry += 1 360 try: 361 term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry] 362 except IndexError: 363 raise EOFError 364 else: 365 # Reset the term and offset for the new page. 366 367 self.info_reader.go_to_term("", 0, info_offset) 368 369 # Skip the term in the information file. 370 371 self.info_reader.read_term() 372 373 return term, offset, frequency, doc_frequency 374 375 def go_to_term(self, term): 376 377 """ 378 Navigate to 'term' in the dictionary, returning the details from its 379 entry. The returned details can be augmented with position information 380 when presented to the _get_term_and_positions method. 381 """ 382 383 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 384 385 # Position the reader, if necessary. 386 387 if info_offset is not None: 388 389 # Reset the term and offset for the new page. 390 391 self.info_reader.go_to_term("", 0, info_offset) 392 393 # Skip the term in the information file. 394 395 self.info_reader.read_term() 396 397 return found_term, offset, frequency, doc_frequency 398 399 # Query methods. 400 401 def get_terms(self): 402 403 "Return a list of all terms." 404 405 return iter(self) 406 407 def find_terms(self, term): 408 409 "Return all terms whose values start with the value of 'term'." 410 411 terms = [] 412 413 found_term, offset, frequency, doc_frequency = self.go_to_term(term) 414 415 # Read and record terms. 416 417 try: 418 # Add the found term if it starts with the specified term. 419 420 while found_term.startswith(term): 421 terms.append(found_term) 422 found_term, offset, frequency, doc_frequency = self._read_term() 423 424 except EOFError: 425 pass 426 427 return terms 428 429 def find_positions(self, term): 430 431 "Return the documents and positions at which the given 'term' is found." 432 433 t = self._find_term(term) 434 if t is None: 435 return [] 436 else: 437 offset, frequency, doc_frequency = t 438 return self._get_positions(offset, doc_frequency) 439 440 def find_common_positions(self, terms): 441 442 """ 443 Return the documents and positions at which all the given 'terms' are 444 found, where only common documents are returned. 445 """ 446 447 return PhraseIterator([self.find_positions(term) for term in terms]) 448 449 def get_frequency(self, term): 450 451 "Return the frequency of the given 'term'." 452 453 t = self._find_term(term) 454 if t is None: 455 return None 456 else: 457 offset, frequency, doc_frequency = t 458 return frequency 459 460 def get_document_frequency(self, term): 461 462 "Return the document frequency of the given 'term'." 463 464 t = self._find_term(term) 465 if t is None: 466 return None 467 else: 468 offset, frequency, doc_frequency = t 469 return doc_frequency 470 471 def close(self): 472 self.info_reader.close() 473 self.index_reader.close() 474 self.position_dict_reader.close() 475 476 # vim: tabstop=4 expandtab shiftwidth=4