1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from iixr.phrases import PhraseIterator 24 from os.path import commonprefix # to find common string prefixes 25 from bisect import bisect_right # to find terms in the dictionary index 26 27 class TermWriter(FileWriter): 28 29 "Writing term information to files." 30 31 def reset(self): 32 self.last_term = "" 33 self.last_offset = 0 34 35 def write_term(self, term, offset, frequency, doc_frequency): 36 37 """ 38 Write the given 'term', its position file 'offset', its 'frequency' and 39 its 'doc_frequency' (number of documents in which it appears) to the 40 term information file. 41 """ 42 43 if term <= self.last_term: 44 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) 45 46 # Write the prefix length and term suffix. 47 48 common = len(commonprefix([self.last_term, term])) 49 suffix = term[common:] 50 51 self.write_number(common) 52 self.write_string(suffix) 53 54 # Write the offset delta. 55 # Write the frequency. 56 # Write the document frequency. 57 58 self.write_numbers(( 59 offset - self.last_offset, 60 frequency, 61 doc_frequency 62 )) 63 64 self.last_term = term 65 self.last_offset = offset 66 67 class TermReader(FileReader): 68 69 "Reading term information from files." 70 71 def reset(self): 72 self.last_term = "" 73 self.last_offset = 0 74 75 def read_term(self): 76 77 """ 78 Read a term, its position file offset, its frequency and its document 79 frequency from the term information file. 80 """ 81 82 # Read the prefix length and term suffix. 83 84 common = self.read_number() 85 suffix = self.read_string() 86 87 self.last_term = self.last_term[:common] + suffix 88 89 # Read the offset delta. 90 91 self.last_offset += self.read_number() 92 93 # Read the frequency. 94 95 frequency = self.read_number() 96 97 # Read the document frequency. 98 99 doc_frequency = self.read_number() 100 101 return self.last_term, self.last_offset, frequency, doc_frequency 102 103 def go_to_term(self, term, offset, info_offset): 104 105 """ 106 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 107 permits the scanning for later terms from the specified term. 108 """ 109 110 self.seek(info_offset) 111 self.last_term = term 112 self.last_offset = offset 113 114 class TermIndexWriter(TermWriter): 115 116 "Writing term dictionary index details to files." 117 118 def reset(self): 119 TermWriter.reset(self) 120 self.last_info_offset = 0 121 122 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 123 124 """ 125 Write the given 'term', its position file 'offset', its 'frequency' and 126 its 'doc_frequency' to the term dictionary index file, along with the 127 'info_offset' in the term information file. 128 """ 129 130 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 131 132 # Write the information file offset delta. 133 134 self.write_number(info_offset - self.last_info_offset) 135 self.last_info_offset = info_offset 136 137 class TermIndexReader(TermReader): 138 139 "Reading term dictionary index details from files." 140 141 def reset(self): 142 TermReader.reset(self) 143 self.last_info_offset = 0 144 145 def read_term(self): 146 147 """ 148 Read a term, its position file offset, its frequency, its document 149 frequency and a term information file offset from the term dictionary 150 index file. 151 """ 152 153 term, offset, frequency, doc_frequency = TermReader.read_term(self) 154 155 # Read the offset delta. 156 157 self.last_info_offset += self.read_number() 158 159 return term, offset, frequency, doc_frequency, self.last_info_offset 160 161 class TermDictionaryWriter: 162 163 "Writing term dictionaries." 164 165 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 166 self.info_writer = info_writer 167 self.index_writer = index_writer 168 self.position_dict_writer = position_dict_writer 169 self.interval = interval 170 self.entry = 0 171 172 def _write_term(self, term, offset, frequency, doc_frequency): 173 174 """ 175 Write the given 'term', its position file 'offset', its 'frequency' and 176 its 'doc_frequency' (number of documents in which it appears) to the 177 term information file. Return the offset after the term information was 178 written to the file. 179 """ 180 181 self.info_writer.write_term(term, offset, frequency, doc_frequency) 182 183 if self.entry % self.interval == 0: 184 info_offset = self.info_writer.f.tell() 185 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 186 187 self.entry += 1 188 189 def write_term_positions(self, term, doc_positions): 190 191 """ 192 Write the given 'term' and the 'doc_positions' recording the documents 193 and positions at which the term is found. 194 """ 195 196 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 197 198 if not frequency or not doc_frequency: 199 raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions) 200 201 self._write_term(term, offset, frequency, doc_frequency) 202 203 def close(self): 204 self.info_writer.close() 205 self.index_writer.close() 206 self.position_dict_writer.close() 207 208 class TermDictionaryReader: 209 210 "Reading term dictionaries." 211 212 def __init__(self, info_reader, index_reader, position_dict_reader): 213 self.info_reader = info_reader 214 self.index_reader = index_reader 215 self.position_dict_reader = position_dict_reader 216 217 self.terms = [] 218 try: 219 while 1: 220 self.terms.append(self.index_reader.read_term()) 221 except EOFError: 222 pass 223 224 # Large numbers for ordering purposes. 225 226 if self.terms: 227 self.max_offset = self.terms[-1][1] + 1 228 else: 229 self.max_offset = None 230 231 def _find_closest_entry(self, term): 232 233 """ 234 Find the offsets and frequencies of 'term' from the term dictionary or 235 the closest term starting with the value of 'term'. 236 237 Return the closest index entry consisting of a term, the position file 238 offset, the term frequency, the document frequency, and the term details 239 file offset. 240 """ 241 242 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 243 244 # Get the entry position providing the term or one preceding it. 245 # If no entry precedes the requested term, return the very first entry 246 # as the closest. 247 248 if i == -1: 249 return self.terms[0] 250 else: 251 return self.terms[i] 252 253 def _find_closest_term(self, term): 254 255 """ 256 Find the offsets and frequencies of 'term' from the term dictionary or 257 the closest term starting with the value of 'term'. 258 259 Return the closest term (or the term itself), the position file offset, 260 the term frequency, the document frequency, and the term details file 261 offset (or None if the reader is already positioned). 262 """ 263 264 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 265 266 # Where the term is found immediately, return the offset and 267 # frequencies. If the term does not appear, return the details of the 268 # closest entry. 269 270 if term <= found_term: 271 return found_term, offset, frequency, doc_frequency, info_offset 272 273 # Otherwise, seek past the index term's entry in the information file 274 # and scan for the desired term. 275 276 else: 277 self.info_reader.go_to_term(found_term, offset, info_offset) 278 try: 279 while term > found_term: 280 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 281 except EOFError: 282 pass 283 284 return found_term, offset, frequency, doc_frequency, None 285 286 def _find_term(self, term): 287 288 """ 289 Find the position file offset and frequency of 'term' from the term 290 dictionary. 291 """ 292 293 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 294 295 # If the term is found, return the offset and frequencies. 296 297 if term == found_term: 298 return offset, frequency, doc_frequency 299 else: 300 return None 301 302 def _get_term_and_positions(self, term, offset, frequency, doc_frequency): 303 304 """ 305 Return the term plus positions details using the given 'term', 'offset', 306 'frequency' and 'doc_frequency'. 307 """ 308 309 return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency) 310 311 def _get_positions(self, offset, doc_frequency): 312 313 """ 314 Obtain positions from the position index 'offset' expecting a number of 315 documents equal to the given 'doc_frequency'. 316 """ 317 318 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 319 320 # Iterator convenience methods. 321 322 def __iter__(self): 323 self.rewind() 324 return self 325 326 def next(self): 327 try: 328 return self.read_term() 329 except EOFError: 330 raise StopIteration 331 332 # Sequential access methods. 333 334 def rewind(self): 335 self.info_reader.rewind() 336 337 def read_term(self): 338 339 """ 340 Return the next term, its frequency, its document frequency, and the 341 documents and positions at which the term is found. 342 """ 343 344 term, offset, frequency, doc_frequency = self.info_reader.read_term() 345 return self._get_term_and_positions(term, offset, frequency, doc_frequency) 346 347 def go_to_term(self, term): 348 349 """ 350 Navigate to 'term' in the dictionary, returning the details from its 351 entry. The returned details can be augmented with position information 352 when presented to the _get_term_and_positions method. 353 """ 354 355 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 356 357 # Position the reader, if necessary. 358 359 if info_offset is not None: 360 self.info_reader.go_to_term(found_term, offset, info_offset) 361 362 return found_term, offset, frequency, doc_frequency 363 364 # Query methods. 365 366 def get_terms(self): 367 368 "Return a list of all terms." 369 370 return iter(self) 371 372 def find_terms(self, term): 373 374 "Return all terms whose values start with the value of 'term'." 375 376 terms = [] 377 378 found_term, offset, frequency, doc_frequency = self.go_to_term(term) 379 380 # Read and record terms. 381 382 try: 383 # Add the found term if it starts with the specified term. 384 385 while found_term.startswith(term): 386 terms.append(found_term) 387 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 388 389 except EOFError: 390 pass 391 392 return terms 393 394 def find_positions(self, term): 395 396 "Return the documents and positions at which the given 'term' is found." 397 398 t = self._find_term(term) 399 if t is None: 400 return [] 401 else: 402 offset, frequency, doc_frequency = t 403 return self._get_positions(offset, doc_frequency) 404 405 def find_common_positions(self, terms): 406 407 """ 408 Return the documents and positions at which all the given 'terms' are 409 found, where only common documents are returned. 410 """ 411 412 return PhraseIterator([self.find_positions(term) for term in terms]) 413 414 def get_frequency(self, term): 415 416 "Return the frequency of the given 'term'." 417 418 t = self._find_term(term) 419 if t is None: 420 return None 421 else: 422 offset, frequency, doc_frequency = t 423 return frequency 424 425 def get_document_frequency(self, term): 426 427 "Return the document frequency of the given 'term'." 428 429 t = self._find_term(term) 430 if t is None: 431 return None 432 else: 433 offset, frequency, doc_frequency = t 434 return doc_frequency 435 436 def close(self): 437 self.info_reader.close() 438 self.index_reader.close() 439 self.position_dict_reader.close() 440 441 # vim: tabstop=4 expandtab shiftwidth=4