1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from iixr.phrases import PhraseIterator 24 from os.path import commonprefix # to find common string prefixes 25 from bisect import bisect_right # to find terms in the dictionary index 26 27 class TermWriter(FileWriter): 28 29 "Writing term information to files." 30 31 def reset(self): 32 self.last_term = "" 33 self.last_offset = 0 34 35 def write_term(self, term, offset, frequency, doc_frequency): 36 37 """ 38 Write the given 'term', its position file 'offset', its 'frequency' and 39 its 'doc_frequency' (number of documents in which it appears) to the 40 term information file. 41 """ 42 43 # Write the prefix length and term suffix. 44 45 common = len(commonprefix([self.last_term, term])) 46 suffix = term[common:] 47 48 self.write_number(common) 49 self.write_string(suffix) 50 51 # Write the offset delta. 52 53 self.write_number(offset - self.last_offset) 54 55 # Write the frequency. 56 57 self.write_number(frequency) 58 59 # Write the document frequency. 60 61 self.write_number(doc_frequency) 62 63 self.last_term = term 64 self.last_offset = offset 65 66 class TermReader(FileReader): 67 68 "Reading term information from files." 69 70 def reset(self): 71 self.last_term = "" 72 self.last_offset = 0 73 74 def read_term(self): 75 76 """ 77 Read a term, its position file offset, its frequency and its document 78 frequency from the term information file. 79 """ 80 81 # Read the prefix length and term suffix. 82 83 common = self.read_number() 84 suffix = self.read_string() 85 86 self.last_term = self.last_term[:common] + suffix 87 88 # Read the offset delta. 89 90 self.last_offset += self.read_number() 91 92 # Read the frequency. 93 94 frequency = self.read_number() 95 96 # Read the document frequency. 97 98 doc_frequency = self.read_number() 99 100 return self.last_term, self.last_offset, frequency, doc_frequency 101 102 def go_to_term(self, term, offset, info_offset): 103 104 """ 105 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 106 permits the scanning for later terms from the specified term. 107 """ 108 109 self.f.seek(info_offset) 110 self.last_term = term 111 self.last_offset = offset 112 113 class TermIndexWriter(TermWriter): 114 115 "Writing term dictionary index details to files." 116 117 def reset(self): 118 TermWriter.reset(self) 119 self.last_info_offset = 0 120 121 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 122 123 """ 124 Write the given 'term', its position file 'offset', its 'frequency' and 125 its 'doc_frequency' to the term dictionary index file, along with the 126 'info_offset' in the term information file. 127 """ 128 129 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 130 131 # Write the information file offset delta. 132 133 self.write_number(info_offset - self.last_info_offset) 134 self.last_info_offset = info_offset 135 136 class TermIndexReader(TermReader): 137 138 "Reading term dictionary index details from files." 139 140 def reset(self): 141 TermReader.reset(self) 142 self.last_info_offset = 0 143 144 def read_term(self): 145 146 """ 147 Read a term, its position file offset, its frequency, its document 148 frequency and a term information file offset from the term dictionary 149 index file. 150 """ 151 152 term, offset, frequency, doc_frequency = TermReader.read_term(self) 153 154 # Read the offset delta. 155 156 self.last_info_offset += self.read_number() 157 158 return term, offset, frequency, doc_frequency, self.last_info_offset 159 160 class TermDictionaryWriter: 161 162 "Writing term dictionaries." 163 164 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 165 self.info_writer = info_writer 166 self.index_writer = index_writer 167 self.position_dict_writer = position_dict_writer 168 self.interval = interval 169 self.entry = 0 170 171 def _write_term(self, term, offset, frequency, doc_frequency): 172 173 """ 174 Write the given 'term', its position file 'offset', its 'frequency' and 175 its 'doc_frequency' (number of documents in which it appears) to the 176 term information file. Return the offset after the term information was 177 written to the file. 178 """ 179 180 self.info_writer.write_term(term, offset, frequency, doc_frequency) 181 182 if self.entry % self.interval == 0: 183 info_offset = self.info_writer.f.tell() 184 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 185 186 self.entry += 1 187 188 def write_term_positions(self, term, doc_positions): 189 190 """ 191 Write the given 'term' and the 'doc_positions' recording the documents 192 and positions at which the term is found. 193 """ 194 195 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 196 self._write_term(term, offset, frequency, doc_frequency) 197 198 def close(self): 199 self.info_writer.close() 200 self.index_writer.close() 201 self.position_dict_writer.close() 202 203 class TermDictionaryReader: 204 205 "Reading term dictionaries." 206 207 def __init__(self, info_reader, index_reader, position_dict_reader): 208 self.info_reader = info_reader 209 self.index_reader = index_reader 210 self.position_dict_reader = position_dict_reader 211 self.position_dict_iterator = None # for sequential/iterator access 212 213 self.terms = [] 214 try: 215 while 1: 216 self.terms.append(self.index_reader.read_term()) 217 except EOFError: 218 pass 219 220 # Large numbers for ordering purposes. 221 222 if self.terms: 223 self.max_offset = self.terms[-1][1] + 1 224 else: 225 self.max_offset = None 226 227 def _find_closest_entry(self, term): 228 229 """ 230 Find the offsets and frequencies of 'term' from the term dictionary or 231 the closest term starting with the value of 'term'. 232 233 Return the closest index entry consisting of a term, the position file 234 offset, the term frequency, the document frequency, and the term details 235 file offset. 236 """ 237 238 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 239 240 # Get the entry position providing the term or one preceding it. 241 # If no entry precedes the requested term, return the very first entry 242 # as the closest. 243 244 if i == -1: 245 return self.terms[0] 246 else: 247 return self.terms[i] 248 249 def _find_closest_term(self, term): 250 251 """ 252 Find the offsets and frequencies of 'term' from the term dictionary or 253 the closest term starting with the value of 'term'. 254 255 Return the closest term (or the term itself), the position file offset, 256 the term frequency, the document frequency, and the term details file 257 offset (or None if the reader is already positioned). 258 """ 259 260 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 261 262 # Where the term is found immediately, return the offset and 263 # frequencies. If the term does not appear, return the details of the 264 # closest entry. 265 266 if term <= found_term: 267 return found_term, offset, frequency, doc_frequency, info_offset 268 269 # Otherwise, seek past the index term's entry in the information file 270 # and scan for the desired term. 271 272 else: 273 self.info_reader.go_to_term(found_term, offset, info_offset) 274 try: 275 while term > found_term: 276 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 277 except EOFError: 278 pass 279 280 return found_term, offset, frequency, doc_frequency, None 281 282 def _find_term(self, term): 283 284 """ 285 Find the position file offset and frequency of 'term' from the term 286 dictionary. 287 """ 288 289 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 290 291 # If the term is found, return the offset and frequencies. 292 293 if term == found_term: 294 return offset, frequency, doc_frequency 295 else: 296 return None 297 298 def _get_positions(self, offset, doc_frequency): 299 300 """ 301 Obtain positions from the position index 'offset' expecting a number of 302 documents equal to the given 'doc_frequency'. 303 """ 304 305 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 306 307 # Iterator convenience methods. 308 309 def __iter__(self): 310 self.rewind() 311 return self 312 313 def next(self): 314 try: 315 return self.read_term() 316 except EOFError: 317 raise StopIteration 318 319 # Sequential access methods. 320 321 def rewind(self): 322 self.info_reader.rewind() 323 324 def read_term(self): 325 326 """ 327 Return the next term, its frequency, its document frequency, and the 328 documents and positions at which the term is found. 329 """ 330 331 term, offset, frequency, doc_frequency = self.info_reader.read_term() 332 333 # For sequential access, attempt to reuse any iterator. 334 335 if self.position_dict_iterator is None: 336 self.position_dict_iterator = self._get_positions(offset, doc_frequency) 337 338 return term, frequency, doc_frequency, ResetPositionDictionaryIterator(self.position_dict_iterator, offset, doc_frequency) 339 340 # Query methods. 341 342 def find_terms(self, term): 343 344 "Return all terms whose values start with the value of 'term'." 345 346 terms = [] 347 348 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 349 350 # Position the reader, if necessary. 351 352 if info_offset is not None: 353 self.info_reader.go_to_term(found_term, offset, info_offset) 354 355 # Read and record terms. 356 357 try: 358 # Add the found term if it starts with the specified term. 359 360 while found_term.startswith(term): 361 terms.append(found_term) 362 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 363 364 except EOFError: 365 pass 366 367 return terms 368 369 def find_positions(self, term): 370 371 "Return the documents and positions at which the given 'term' is found." 372 373 t = self._find_term(term) 374 if t is None: 375 return None 376 else: 377 offset, frequency, doc_frequency = t 378 return self._get_positions(offset, doc_frequency) 379 380 def find_common_positions(self, terms): 381 382 """ 383 Return the documents and positions at which all the given 'terms' are 384 found, where only common documents are returned. 385 """ 386 387 return PhraseIterator([self.find_positions(term) for term in terms]) 388 389 def get_frequency(self, term): 390 391 "Return the frequency of the given 'term'." 392 393 t = self._find_term(term) 394 if t is None: 395 return None 396 else: 397 offset, frequency, doc_frequency = t 398 return frequency 399 400 def get_document_frequency(self, term): 401 402 "Return the document frequency of the given 'term'." 403 404 t = self._find_term(term) 405 if t is None: 406 return None 407 else: 408 offset, frequency, doc_frequency = t 409 return doc_frequency 410 411 def close(self): 412 self.info_reader.close() 413 self.index_reader.close() 414 self.position_dict_reader.close() 415 if self.position_dict_iterator is not None: 416 self.position_dict_iterator.close() 417 self.position_dict_iterator = None 418 419 # vim: tabstop=4 expandtab shiftwidth=4