1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.positions import * 23 from os.path import commonprefix # to find common string prefixes 24 from bisect import bisect_right # to find terms in the dictionary index 25 26 class TermWriter(FileWriter): 27 28 "Writing term information to files." 29 30 def reset(self): 31 self.last_term = "" 32 self.last_offset = 0 33 34 def write_term(self, term, offset, frequency, doc_frequency): 35 36 """ 37 Write the given 'term', its position file 'offset', its 'frequency' and 38 its 'doc_frequency' (number of documents in which it appears) to the 39 term information file. Return the offset after the term information was 40 written to the file. 41 """ 42 43 # Write the prefix length and term suffix. 44 45 common = len(commonprefix([self.last_term, term])) 46 suffix = term[common:] 47 48 self.write_number(common) 49 self.write_string(suffix) 50 51 # Write the offset delta. 52 53 self.write_number(offset - self.last_offset) 54 55 # Write the frequency. 56 57 self.write_number(frequency) 58 59 # Write the document frequency. 60 61 self.write_number(doc_frequency) 62 63 self.last_term = term 64 self.last_offset = offset 65 66 return self.tell() 67 68 class TermReader(FileReader): 69 70 "Reading term information from files." 71 72 def reset(self): 73 self.last_term = "" 74 self.last_offset = 0 75 76 def read_term(self): 77 78 """ 79 Read a term, its position file offset, its frequency and its document 80 frequency from the term information file. 81 """ 82 83 # Read the prefix length and term suffix. 84 85 common = self.read_number() 86 suffix = self.read_string() 87 88 self.last_term = self.last_term[:common] + suffix 89 90 # Read the offset delta. 91 92 self.last_offset += self.read_number() 93 94 # Read the frequency. 95 96 frequency = self.read_number() 97 98 # Read the document frequency. 99 100 doc_frequency = self.read_number() 101 102 return self.last_term, self.last_offset, frequency, doc_frequency 103 104 def go_to_term(self, term, offset, info_offset): 105 106 """ 107 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 108 permits the scanning for later terms from the specified term. 109 """ 110 111 self.seek(info_offset) 112 self.last_term = term 113 self.last_offset = offset 114 115 class TermIndexWriter(TermWriter): 116 117 "Writing term dictionary index details to files." 118 119 def reset(self): 120 TermWriter.reset(self) 121 self.last_info_offset = 0 122 123 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 124 125 """ 126 Write the given 'term', its position file 'offset', its 'frequency' and 127 its 'doc_frequency' to the term dictionary index file, along with the 128 'info_offset' in the term information file. 129 """ 130 131 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 132 133 # Write the information file offset delta. 134 135 self.write_number(info_offset - self.last_info_offset) 136 self.last_info_offset = info_offset 137 138 class TermIndexReader(TermReader): 139 140 "Reading term dictionary index details from files." 141 142 def reset(self): 143 TermReader.reset(self) 144 self.last_info_offset = 0 145 146 def read_term(self): 147 148 """ 149 Read a term, its position file offset, its frequency, its document 150 frequency and a term information file offset from the term dictionary 151 index file. 152 """ 153 154 term, offset, frequency, doc_frequency = TermReader.read_term(self) 155 156 # Read the offset delta. 157 158 self.last_info_offset += self.read_number() 159 160 return term, offset, frequency, doc_frequency, self.last_info_offset 161 162 class TermDictionaryWriter: 163 164 "Writing term dictionaries." 165 166 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 167 self.info_writer = info_writer 168 self.index_writer = index_writer 169 self.position_dict_writer = position_dict_writer 170 self.interval = interval 171 self.entry = 0 172 173 def _write_term(self, term, offset, frequency, doc_frequency): 174 175 """ 176 Write the given 'term', its position file 'offset', its 'frequency' and 177 its 'doc_frequency' (number of documents in which it appears) to the 178 term information file. Return the offset after the term information was 179 written to the file. 180 """ 181 182 info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency) 183 184 if self.entry % self.interval == 0: 185 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 186 187 self.entry += 1 188 189 def write_term_positions(self, term, doc_positions): 190 191 """ 192 Write the given 'term' and the 'doc_positions' recording the documents 193 and positions at which the term is found. 194 """ 195 196 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 197 self._write_term(term, offset, frequency, doc_frequency) 198 199 def close(self): 200 self.info_writer.close() 201 self.index_writer.close() 202 self.position_dict_writer.close() 203 204 class TermDictionaryReader: 205 206 "Reading term dictionaries." 207 208 def __init__(self, info_reader, index_reader, position_dict_reader): 209 self.info_reader = info_reader 210 self.index_reader = index_reader 211 self.position_dict_reader = position_dict_reader 212 self.position_dict_iterator = None # for sequential/iterator access 213 214 self.terms = [] 215 try: 216 while 1: 217 self.terms.append(self.index_reader.read_term()) 218 except EOFError: 219 pass 220 221 # Large numbers for ordering purposes. 222 223 if self.terms: 224 self.max_offset = self.terms[-1][1] + 1 225 else: 226 self.max_offset = None 227 228 def _find_closest_entry(self, term): 229 230 """ 231 Find the offsets and frequencies of 'term' from the term dictionary or 232 the closest term starting with the value of 'term'. 233 234 Return the closest index entry consisting of a term, the position file 235 offset, the term frequency, the document frequency, and the term details 236 file offset. 237 """ 238 239 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 240 241 # Get the entry position providing the term or one preceding it. 242 # If no entry precedes the requested term, return the very first entry 243 # as the closest. 244 245 if i == -1: 246 return self.terms[0] 247 else: 248 return self.terms[i] 249 250 def _find_closest_term(self, term): 251 252 """ 253 Find the offsets and frequencies of 'term' from the term dictionary or 254 the closest term starting with the value of 'term'. 255 256 Return the closest term (or the term itself), the position file offset, 257 the term frequency, the document frequency, and the term details file 258 offset (or None if the reader is already positioned). 259 """ 260 261 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 262 263 # Where the term is found immediately, return the offset and 264 # frequencies. If the term does not appear, return the details of the 265 # closest entry. 266 267 if term <= found_term: 268 return found_term, offset, frequency, doc_frequency, info_offset 269 270 # Otherwise, seek past the index term's entry in the information file 271 # and scan for the desired term. 272 273 else: 274 self.info_reader.go_to_term(found_term, offset, info_offset) 275 try: 276 while term > found_term: 277 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 278 except EOFError: 279 pass 280 281 return found_term, offset, frequency, doc_frequency, None 282 283 def _find_term(self, term): 284 285 """ 286 Find the position file offset and frequency of 'term' from the term 287 dictionary. 288 """ 289 290 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 291 292 # If the term is found, return the offset and frequencies. 293 294 if term == found_term: 295 return offset, frequency, doc_frequency 296 else: 297 return None 298 299 def _get_positions(self, offset, doc_frequency): 300 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 301 302 # Iterator convenience methods. 303 304 def __iter__(self): 305 self.rewind() 306 return self 307 308 def next(self): 309 try: 310 return self.read_term() 311 except EOFError: 312 raise StopIteration 313 314 # Sequential access methods. 315 316 def rewind(self): 317 self.info_reader.rewind() 318 319 def read_term(self): 320 321 """ 322 Return the next term, its frequency, its document frequency, and the 323 documents and positions at which the term is found. 324 """ 325 326 term, offset, frequency, doc_frequency = self.info_reader.read_term() 327 328 # For sequential access, attempt to reuse any iterator. 329 330 if self.position_dict_iterator is None: 331 self.position_dict_iterator = self._get_positions(offset, doc_frequency) 332 333 return term, frequency, doc_frequency, ResetPositionDictionaryIterator(self.position_dict_iterator, offset, doc_frequency) 334 335 # Query methods. 336 337 def find_terms(self, term): 338 339 "Return all terms whose values start with the value of 'term'." 340 341 terms = [] 342 343 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 344 345 # Position the reader, if necessary. 346 347 if info_offset is not None: 348 self.info_reader.go_to_term(found_term, offset, info_offset) 349 350 # Read and record terms. 351 352 try: 353 # Add the found term if it starts with the specified term. 354 355 while found_term.startswith(term): 356 terms.append(found_term) 357 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 358 359 except EOFError: 360 pass 361 362 return terms 363 364 def find_positions(self, term): 365 366 "Return the documents and positions at which the given 'term' is found." 367 368 t = self._find_term(term) 369 if t is None: 370 return None 371 else: 372 offset, frequency, doc_frequency = t 373 return self._get_positions(offset, doc_frequency) 374 375 def get_frequency(self, term): 376 377 "Return the frequency of the given 'term'." 378 379 t = self._find_term(term) 380 if t is None: 381 return None 382 else: 383 offset, frequency, doc_frequency = t 384 return frequency 385 386 def get_document_frequency(self, term): 387 388 "Return the document frequency of the given 'term'." 389 390 t = self._find_term(term) 391 if t is None: 392 return None 393 else: 394 offset, frequency, doc_frequency = t 395 return doc_frequency 396 397 def close(self): 398 self.info_reader.close() 399 self.index_reader.close() 400 self.position_dict_reader.close() 401 402 # vim: tabstop=4 expandtab shiftwidth=4