1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from iixr.phrases import PhraseIterator 24 from os.path import commonprefix # to find common string prefixes 25 from bisect import bisect_right # to find terms in the dictionary index 26 27 class TermWriter(FileWriter): 28 29 "Writing term information to files." 30 31 def reset(self): 32 self.end_record() 33 self.last_term = "" 34 self.last_offset = 0 35 36 def write_term(self, term, offset, frequency, doc_frequency): 37 38 """ 39 Write the given 'term', its position file 'offset', its 'frequency' and 40 its 'doc_frequency' (number of documents in which it appears) to the 41 term information file. 42 """ 43 44 if term <= self.last_term: 45 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) 46 47 # Write the prefix length and term suffix. 48 49 common = len(commonprefix([self.last_term, term])) 50 suffix = term[common:] 51 52 self.write_number(common) 53 self.write_string(suffix) 54 55 # Write the offset delta. 56 # Write the frequency. 57 # Write the document frequency. 58 59 self.write_numbers(( 60 offset - self.last_offset, 61 frequency, 62 doc_frequency 63 )) 64 65 self.last_term = term 66 self.last_offset = offset 67 68 class TermReader(FileReader): 69 70 "Reading term information from files." 71 72 def reset(self): 73 self.last_term = "" 74 self.last_offset = 0 75 self.begin_record() 76 77 def read_term(self): 78 79 """ 80 Read a term, its position file offset, its frequency and its document 81 frequency from the term information file. 82 """ 83 84 # Read the prefix length and term suffix. 85 86 common = self.read_number() 87 suffix = self.read_string() 88 89 self.last_term = self.last_term[:common] + suffix 90 91 # Read the offset delta. 92 93 self.last_offset += self.read_number() 94 95 # Read the frequency. 96 97 frequency = self.read_number() 98 99 # Read the document frequency. 100 101 doc_frequency = self.read_number() 102 103 return self.last_term, self.last_offset, frequency, doc_frequency 104 105 def go_to_term(self, term, offset, info_offset): 106 107 """ 108 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 109 permits the scanning for later terms from the specified term. 110 """ 111 112 self.seek(info_offset) 113 self.last_term = term 114 self.last_offset = offset 115 116 class TermIndexWriter(TermWriter): 117 118 "Writing term dictionary index details to files." 119 120 def reset(self): 121 TermWriter.reset(self) 122 self.last_info_offset = 0 123 124 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 125 126 """ 127 Write the given 'term', its position file 'offset', its 'frequency' and 128 its 'doc_frequency' to the term dictionary index file, along with the 129 'info_offset' in the term information file. 130 """ 131 132 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 133 134 # Write the information file offset delta. 135 136 self.write_number(info_offset - self.last_info_offset) 137 138 self.last_info_offset = info_offset 139 140 class TermIndexReader(TermReader): 141 142 "Reading term dictionary index details from files." 143 144 def reset(self): 145 TermReader.reset(self) 146 self.last_info_offset = 0 147 148 def read_term(self): 149 150 """ 151 Read a term, its position file offset, its frequency, its document 152 frequency and a term information file offset from the term dictionary 153 index file. 154 """ 155 156 term, offset, frequency, doc_frequency = TermReader.read_term(self) 157 158 # Read the offset delta. 159 160 self.last_info_offset += self.read_number() 161 162 return term, offset, frequency, doc_frequency, self.last_info_offset 163 164 class TermDictionaryWriter: 165 166 "Writing term dictionaries." 167 168 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 169 self.info_writer = info_writer 170 self.index_writer = index_writer 171 self.position_dict_writer = position_dict_writer 172 self.interval = interval 173 self.entry = 0 174 175 self.index_writer.reset() 176 177 def _write_term(self, term, offset, frequency, doc_frequency): 178 179 """ 180 Write the given 'term', its position file 'offset', its 'frequency' and 181 its 'doc_frequency' (number of documents in which it appears) to the 182 term information file. Return the offset before the term information was 183 written to the file. 184 """ 185 186 if self.entry % self.interval == 0: 187 self.info_writer.reset() 188 info_offset = self.info_writer.tell() 189 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 190 191 self.info_writer.write_term(term, offset, frequency, doc_frequency) 192 self.entry += 1 193 194 def write_term_positions(self, term, doc_positions): 195 196 """ 197 Write the given 'term' and the 'doc_positions' recording the documents 198 and positions at which the term is found. 199 """ 200 201 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 202 203 if not frequency or not doc_frequency: 204 raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions) 205 206 self._write_term(term, offset, frequency, doc_frequency) 207 208 def close(self): 209 self.info_writer.close() 210 self.index_writer.close() 211 self.position_dict_writer.close() 212 213 class TermDictionaryReader: 214 215 "Reading term dictionaries." 216 217 def __init__(self, info_reader, index_reader, position_dict_reader): 218 self.info_reader = info_reader 219 self.index_reader = index_reader 220 self.position_dict_reader = position_dict_reader 221 222 self.info_reader.reset() 223 self.index_reader.reset() 224 225 self.entry = 0 226 self.terms = [] 227 try: 228 while 1: 229 self.terms.append(self.index_reader.read_term()) 230 except EOFError: 231 pass 232 233 # Large numbers for ordering purposes. 234 235 if self.terms: 236 self.max_offset = self.terms[-1][1] + 1 237 else: 238 self.max_offset = None 239 240 def _find_closest_entry(self, term): 241 242 """ 243 Find the offsets and frequencies of 'term' from the term dictionary or 244 the closest term starting with the value of 'term'. 245 246 Return the closest index entry consisting of a term, the position file 247 offset, the term frequency, the document frequency, and the term details 248 file offset. 249 """ 250 251 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 252 253 # Get the entry position providing the term or one preceding it. 254 # If no entry precedes the requested term, return the very first entry 255 # as the closest. 256 257 if i == -1: 258 self.entry = 0 259 return self.terms[0] 260 else: 261 self.entry = i 262 return self.terms[i] 263 264 def _find_closest_term(self, term): 265 266 """ 267 Find the offsets and frequencies of 'term' from the term dictionary or 268 the closest term starting with the value of 'term'. 269 270 Return the closest term (or the term itself), the position file offset, 271 the term frequency, the document frequency, and the term details file 272 offset (or None if the reader is already positioned). 273 """ 274 275 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 276 277 # Where the term is found immediately, return the offset and 278 # frequencies. If the term does not appear, return the details of the 279 # closest entry. 280 281 if term <= found_term: 282 return found_term, offset, frequency, doc_frequency, info_offset 283 284 # Otherwise, seek past the index term's entry in the information file 285 # and scan for the desired term. 286 287 else: 288 # Reset the term and offset for the new page. 289 self.info_reader.go_to_term("", 0, info_offset) 290 try: 291 while term > found_term: 292 found_term, offset, frequency, doc_frequency = self._read_term() 293 except EOFError: 294 pass 295 296 return found_term, offset, frequency, doc_frequency, None 297 298 def _find_term(self, term): 299 300 """ 301 Find the position file offset and frequency of 'term' from the term 302 dictionary. 303 """ 304 305 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 306 307 # If the term is found, return the offset and frequencies. 308 309 if term == found_term: 310 return offset, frequency, doc_frequency 311 else: 312 return None 313 314 def _get_term_and_positions(self, term, offset, frequency, doc_frequency): 315 316 """ 317 Return the term plus positions details using the given 'term', 'offset', 318 'frequency' and 'doc_frequency'. 319 """ 320 321 return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) 322 323 def _get_positions(self, offset, doc_frequency): 324 325 """ 326 Obtain positions from the position index 'offset' expecting a number of 327 documents equal to the given 'doc_frequency'. 328 """ 329 330 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 331 332 # Iterator convenience methods. 333 334 def __iter__(self): 335 self.rewind() 336 return self 337 338 def next(self): 339 try: 340 return self.read_term() 341 except EOFError: 342 raise StopIteration 343 344 # Sequential access methods. 345 346 def rewind(self): 347 self.entry = 0 348 self.info_reader.rewind() 349 350 def read_term(self): 351 352 """ 353 Return the next term, its frequency, its document frequency, and the 354 documents and positions at which the term is found. 355 """ 356 357 return self._get_term_and_positions(*self._read_term()) 358 359 def _read_term(self): 360 361 try: 362 term, offset, frequency, doc_frequency = self.info_reader.read_term() 363 except EOFError: 364 self.entry += 1 365 try: 366 term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry] 367 except IndexError: 368 raise EOFError 369 else: 370 # Reset the term and offset for the new page. 371 372 self.info_reader.go_to_term("", 0, info_offset) 373 374 # Skip the term in the information file. 375 376 self.info_reader.read_term() 377 378 return term, offset, frequency, doc_frequency 379 380 def go_to_term(self, term): 381 382 """ 383 Navigate to 'term' in the dictionary, returning the details from its 384 entry. The returned details can be augmented with position information 385 when presented to the _get_term_and_positions method. 386 """ 387 388 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 389 390 # Position the reader, if necessary. 391 392 if info_offset is not None: 393 394 # Reset the term and offset for the new page. 395 396 self.info_reader.go_to_term("", 0, info_offset) 397 398 # Skip the term in the information file. 399 400 self.info_reader.read_term() 401 402 return found_term, offset, frequency, doc_frequency 403 404 # Query methods. 405 406 def get_terms(self): 407 408 "Return a list of all terms." 409 410 return iter(self) 411 412 def find_terms(self, term): 413 414 "Return all terms whose values start with the value of 'term'." 415 416 terms = [] 417 418 found_term, offset, frequency, doc_frequency = self.go_to_term(term) 419 420 # Read and record terms. 421 422 try: 423 # Add the found term if it starts with the specified term. 424 425 while found_term.startswith(term): 426 terms.append(found_term) 427 found_term, offset, frequency, doc_frequency = self._read_term() 428 429 except EOFError: 430 pass 431 432 return terms 433 434 def find_positions(self, term): 435 436 "Return the documents and positions at which the given 'term' is found." 437 438 t = self._find_term(term) 439 if t is None: 440 return [] 441 else: 442 offset, frequency, doc_frequency = t 443 return self._get_positions(offset, doc_frequency) 444 445 def find_common_positions(self, terms): 446 447 """ 448 Return the documents and positions at which all the given 'terms' are 449 found, where only common documents are returned. 450 """ 451 452 return PhraseIterator([self.find_positions(term) for term in terms]) 453 454 def get_frequency(self, term): 455 456 "Return the frequency of the given 'term'." 457 458 t = self._find_term(term) 459 if t is None: 460 return None 461 else: 462 offset, frequency, doc_frequency = t 463 return frequency 464 465 def get_document_frequency(self, term): 466 467 "Return the document frequency of the given 'term'." 468 469 t = self._find_term(term) 470 if t is None: 471 return None 472 else: 473 offset, frequency, doc_frequency = t 474 return doc_frequency 475 476 def close(self): 477 self.info_reader.close() 478 self.index_reader.close() 479 self.position_dict_reader.close() 480 481 # vim: tabstop=4 expandtab shiftwidth=4