1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from iixr.phrases import PhraseIterator 24 from os.path import commonprefix # to find common string prefixes 25 from bisect import bisect_right # to find terms in the dictionary index 26 27 class TermWriter(FileWriter): 28 29 "Writing term information to files." 30 31 def reset(self): 32 self.last_term = "" 33 self.last_offset = 0 34 35 def write_term(self, term, offset, frequency, doc_frequency): 36 37 """ 38 Write the given 'term', its position file 'offset', its 'frequency' and 39 its 'doc_frequency' (number of documents in which it appears) to the 40 term information file. 41 """ 42 43 self.begin_record() 44 self._write_term(term, offset, frequency, doc_frequency) 45 self.end_record() 46 47 def _write_term(self, term, offset, frequency, doc_frequency): 48 49 "Performs the term writing for 'write_term'." 50 51 if term <= self.last_term: 52 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) 53 54 # Write the prefix length and term suffix. 55 56 common = len(commonprefix([self.last_term, term])) 57 suffix = term[common:] 58 59 self.write_number(common) 60 self.write_string(suffix) 61 62 # Write the offset delta. 63 # Write the frequency. 64 # Write the document frequency. 65 66 self.write_numbers(( 67 offset - self.last_offset, 68 frequency, 69 doc_frequency 70 )) 71 72 self.last_term = term 73 self.last_offset = offset 74 75 class TermReader(FileReader): 76 77 "Reading term information from files." 78 79 def reset(self): 80 self.last_term = "" 81 self.last_offset = 0 82 83 def read_term(self): 84 85 """ 86 Read a term, its position file offset, its frequency and its document 87 frequency from the term information file. 88 """ 89 90 self.begin_record() 91 try: 92 return self._read_term() 93 finally: 94 self.end_record() 95 96 def _read_term(self): 97 98 "Performs the term reading for 'read_term'." 99 100 # Read the prefix length and term suffix. 101 102 common = self.read_number() 103 suffix = self.read_string() 104 105 self.last_term = self.last_term[:common] + suffix 106 107 # Read the offset delta. 108 109 self.last_offset += self.read_number() 110 111 # Read the frequency. 112 113 frequency = self.read_number() 114 115 # Read the document frequency. 116 117 doc_frequency = self.read_number() 118 119 return self.last_term, self.last_offset, frequency, doc_frequency 120 121 def go_to_term(self, term, offset, info_offset): 122 123 """ 124 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 125 permits the scanning for later terms from the specified term. 126 """ 127 128 self.seek(info_offset) 129 self.last_term = term 130 self.last_offset = offset 131 132 class TermIndexWriter(TermWriter): 133 134 "Writing term dictionary index details to files." 135 136 def reset(self): 137 TermWriter.reset(self) 138 self.last_info_offset = 0 139 140 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 141 142 """ 143 Write the given 'term', its position file 'offset', its 'frequency' and 144 its 'doc_frequency' to the term dictionary index file, along with the 145 'info_offset' in the term information file. 146 """ 147 148 self.begin_record() 149 TermWriter._write_term(self, term, offset, frequency, doc_frequency) 150 151 # Write the information file offset delta. 152 153 self.write_number(info_offset - self.last_info_offset) 154 self.end_record() 155 156 self.last_info_offset = info_offset 157 158 class TermIndexReader(TermReader): 159 160 "Reading term dictionary index details from files." 161 162 def reset(self): 163 TermReader.reset(self) 164 self.last_info_offset = 0 165 166 def read_term(self): 167 168 """ 169 Read a term, its position file offset, its frequency, its document 170 frequency and a term information file offset from the term dictionary 171 index file. 172 """ 173 174 self.begin_record() 175 term, offset, frequency, doc_frequency = TermReader._read_term(self) 176 177 # Read the offset delta. 178 179 self.last_info_offset += self.read_number() 180 self.end_record() 181 182 return term, offset, frequency, doc_frequency, self.last_info_offset 183 184 class TermDictionaryWriter: 185 186 "Writing term dictionaries." 187 188 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 189 self.info_writer = info_writer 190 self.index_writer = index_writer 191 self.position_dict_writer = position_dict_writer 192 self.interval = interval 193 self.entry = 0 194 195 def _write_term(self, term, offset, frequency, doc_frequency): 196 197 """ 198 Write the given 'term', its position file 'offset', its 'frequency' and 199 its 'doc_frequency' (number of documents in which it appears) to the 200 term information file. Return the offset after the term information was 201 written to the file. 202 """ 203 204 self.info_writer.write_term(term, offset, frequency, doc_frequency) 205 206 if self.entry % self.interval == 0: 207 info_offset = self.info_writer.tell() 208 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 209 210 self.entry += 1 211 212 def write_term_positions(self, term, doc_positions): 213 214 """ 215 Write the given 'term' and the 'doc_positions' recording the documents 216 and positions at which the term is found. 217 """ 218 219 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 220 221 if not frequency or not doc_frequency: 222 raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions) 223 224 self._write_term(term, offset, frequency, doc_frequency) 225 226 def close(self): 227 self.info_writer.close() 228 self.index_writer.close() 229 self.position_dict_writer.close() 230 231 class TermDictionaryReader: 232 233 "Reading term dictionaries." 234 235 def __init__(self, info_reader, index_reader, position_dict_reader): 236 self.info_reader = info_reader 237 self.index_reader = index_reader 238 self.position_dict_reader = position_dict_reader 239 240 self.terms = [] 241 try: 242 while 1: 243 self.terms.append(self.index_reader.read_term()) 244 except EOFError: 245 pass 246 247 # Large numbers for ordering purposes. 248 249 if self.terms: 250 self.max_offset = self.terms[-1][1] + 1 251 else: 252 self.max_offset = None 253 254 def _find_closest_entry(self, term): 255 256 """ 257 Find the offsets and frequencies of 'term' from the term dictionary or 258 the closest term starting with the value of 'term'. 259 260 Return the closest index entry consisting of a term, the position file 261 offset, the term frequency, the document frequency, and the term details 262 file offset. 263 """ 264 265 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 266 267 # Get the entry position providing the term or one preceding it. 268 # If no entry precedes the requested term, return the very first entry 269 # as the closest. 270 271 if i == -1: 272 return self.terms[0] 273 else: 274 return self.terms[i] 275 276 def _find_closest_term(self, term): 277 278 """ 279 Find the offsets and frequencies of 'term' from the term dictionary or 280 the closest term starting with the value of 'term'. 281 282 Return the closest term (or the term itself), the position file offset, 283 the term frequency, the document frequency, and the term details file 284 offset (or None if the reader is already positioned). 285 """ 286 287 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 288 289 # Where the term is found immediately, return the offset and 290 # frequencies. If the term does not appear, return the details of the 291 # closest entry. 292 293 if term <= found_term: 294 return found_term, offset, frequency, doc_frequency, info_offset 295 296 # Otherwise, seek past the index term's entry in the information file 297 # and scan for the desired term. 298 299 else: 300 self.info_reader.go_to_term(found_term, offset, info_offset) 301 try: 302 while term > found_term: 303 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 304 except EOFError: 305 pass 306 307 return found_term, offset, frequency, doc_frequency, None 308 309 def _find_term(self, term): 310 311 """ 312 Find the position file offset and frequency of 'term' from the term 313 dictionary. 314 """ 315 316 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 317 318 # If the term is found, return the offset and frequencies. 319 320 if term == found_term: 321 return offset, frequency, doc_frequency 322 else: 323 return None 324 325 def _get_term_and_positions(self, term, offset, frequency, doc_frequency): 326 327 """ 328 Return the term plus positions details using the given 'term', 'offset', 329 'frequency' and 'doc_frequency'. 330 """ 331 332 return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) 333 334 def _get_positions(self, offset, doc_frequency): 335 336 """ 337 Obtain positions from the position index 'offset' expecting a number of 338 documents equal to the given 'doc_frequency'. 339 """ 340 341 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 342 343 # Iterator convenience methods. 344 345 def __iter__(self): 346 self.rewind() 347 return self 348 349 def next(self): 350 try: 351 return self.read_term() 352 except EOFError: 353 raise StopIteration 354 355 # Sequential access methods. 356 357 def rewind(self): 358 self.info_reader.rewind() 359 360 def read_term(self): 361 362 """ 363 Return the next term, its frequency, its document frequency, and the 364 documents and positions at which the term is found. 365 """ 366 367 term, offset, frequency, doc_frequency = self.info_reader.read_term() 368 return self._get_term_and_positions(term, offset, frequency, doc_frequency) 369 370 def go_to_term(self, term): 371 372 """ 373 Navigate to 'term' in the dictionary, returning the details from its 374 entry. The returned details can be augmented with position information 375 when presented to the _get_term_and_positions method. 376 """ 377 378 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 379 380 # Position the reader, if necessary. 381 382 if info_offset is not None: 383 self.info_reader.go_to_term(found_term, offset, info_offset) 384 385 return found_term, offset, frequency, doc_frequency 386 387 # Query methods. 388 389 def get_terms(self): 390 391 "Return a list of all terms." 392 393 return iter(self) 394 395 def find_terms(self, term): 396 397 "Return all terms whose values start with the value of 'term'." 398 399 terms = [] 400 401 found_term, offset, frequency, doc_frequency = self.go_to_term(term) 402 403 # Read and record terms. 404 405 try: 406 # Add the found term if it starts with the specified term. 407 408 while found_term.startswith(term): 409 terms.append(found_term) 410 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 411 412 except EOFError: 413 pass 414 415 return terms 416 417 def find_positions(self, term): 418 419 "Return the documents and positions at which the given 'term' is found." 420 421 t = self._find_term(term) 422 if t is None: 423 return [] 424 else: 425 offset, frequency, doc_frequency = t 426 return self._get_positions(offset, doc_frequency) 427 428 def find_common_positions(self, terms): 429 430 """ 431 Return the documents and positions at which all the given 'terms' are 432 found, where only common documents are returned. 433 """ 434 435 return PhraseIterator([self.find_positions(term) for term in terms]) 436 437 def get_frequency(self, term): 438 439 "Return the frequency of the given 'term'." 440 441 t = self._find_term(term) 442 if t is None: 443 return None 444 else: 445 offset, frequency, doc_frequency = t 446 return frequency 447 448 def get_document_frequency(self, term): 449 450 "Return the document frequency of the given 'term'." 451 452 t = self._find_term(term) 453 if t is None: 454 return None 455 else: 456 offset, frequency, doc_frequency = t 457 return doc_frequency 458 459 def close(self): 460 self.info_reader.close() 461 self.index_reader.close() 462 self.position_dict_reader.close() 463 464 # vim: tabstop=4 expandtab shiftwidth=4