1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = 0 31 32 def write_positions(self, docnum, positions): 33 34 """ 35 Write for the document 'docnum' the given 'positions'. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Make sure that the positions are sorted. 42 43 positions.sort() 44 45 # Write the document number delta. 46 # Write the number of positions. 47 48 output = array('B') 49 vint_to_array(docnum - self.last_docnum, output) 50 vint_to_array(len(positions), output) 51 52 # Write the position deltas. 53 54 last = 0 55 56 for position in positions: 57 vint_to_array(position - last, output) 58 last = position 59 60 output.tofile(self.f) 61 62 self.last_docnum = docnum 63 64 class PositionOpener(FileOpener): 65 66 "Reading position information from files." 67 68 def read_term_positions(self, offset, count): 69 70 """ 71 Read all positions from 'offset', seeking to that position in the file 72 before reading. The number of documents available for reading is limited 73 to 'count'. 74 """ 75 76 # Duplicate the file handle. 77 78 f = self.open("rb") 79 return PositionIterator(f, offset, count) 80 81 class PositionIndexWriter(FileWriter): 82 83 "Writing position index information to files." 84 85 def reset(self): 86 self.last_docnum = 0 87 self.last_pos_offset = 0 88 89 def write_positions(self, docnum, pos_offset, count): 90 91 """ 92 Write the given 'docnum, 'pos_offset' and document 'count' to the 93 position index file. 94 """ 95 96 # Write the document number delta. 97 # Write the position file offset delta. 98 # Write the document count. 99 100 output = array('B') 101 vint_to_array(docnum - self.last_docnum, output) 102 vint_to_array(pos_offset - self.last_pos_offset, output) 103 vint_to_array(count, output) 104 105 # Actually write the data. 106 107 output.tofile(self.f) 108 109 self.last_pos_offset = pos_offset 110 self.last_docnum = docnum 111 112 class PositionIndexOpener(FileOpener): 113 114 "Reading position index information from files." 115 116 def read_term_positions(self, offset, doc_frequency): 117 118 """ 119 Read all positions from 'offset', seeking to that position in the file 120 before reading. The number of documents available for reading is limited 121 to 'doc_frequency'. 122 """ 123 124 # Duplicate the file handle. 125 126 f = self.open("rb") 127 return PositionIndexIterator(f, offset, doc_frequency) 128 129 # Iterators for position-related files. 130 131 class IteratorBase: 132 133 def __init__(self, count): 134 self.replenish(count) 135 136 def replenish(self, count): 137 self.count = count 138 self.read_documents = 0 139 140 def __len__(self): 141 return self.count 142 143 def sort(self): 144 pass # Stored document positions are already sorted. 145 146 def __iter__(self): 147 return self 148 149 class PositionIterator(FileReader, IteratorBase): 150 151 "Iterating over document positions." 152 153 def __init__(self, f, offset, count): 154 FileReader.__init__(self, f) 155 IteratorBase.__init__(self, count) 156 self.f.seek(offset) 157 158 def reset(self): 159 self.last_docnum = 0 160 161 def read_positions(self): 162 163 "Read positions, returning a document number and a list of positions." 164 165 # Read the document number delta and add it to the last number. 166 167 self.last_docnum += self.read_number() 168 169 # Read the number of positions. 170 171 npositions = self.read_number() 172 173 # Read the position deltas, adding each previous position to get the 174 # appropriate collection of absolute positions. 175 176 i = 0 177 last = 0 178 positions = [] 179 180 while i < npositions: 181 last += self.read_number() 182 positions.append(last) 183 i += 1 184 185 return self.last_docnum, positions 186 187 def next(self): 188 189 "Read positions for a single document." 190 191 if self.read_documents < self.count: 192 self.read_documents += 1 193 return self.read_positions() 194 else: 195 raise StopIteration 196 197 class PositionIndexIterator(FileReader, IteratorBase): 198 199 "Iterating over document positions." 200 201 def __init__(self, f, offset, count): 202 FileReader.__init__(self, f) 203 IteratorBase.__init__(self, count) 204 self.f.seek(offset) 205 206 def reset(self): 207 self.last_docnum = 0 208 self.last_pos_offset = 0 209 self.section_count = 0 210 211 def read_positions(self): 212 213 """ 214 Read a document number, a position file offset for the position index 215 file, and the number of documents in a section of that file. 216 """ 217 218 # Read the document number delta. 219 220 self.last_docnum += self.read_number() 221 222 # Read the offset delta. 223 224 self.last_pos_offset += self.read_number() 225 226 # Read the document count. 227 228 count = self.read_number() 229 230 return self.last_docnum, self.last_pos_offset, count 231 232 def next(self): 233 234 "Read positions for a single document." 235 236 self.read_documents += self.section_count 237 if self.read_documents < self.count: 238 docnum, pos_offset, self.section_count = t = self.read_positions() 239 return t 240 else: 241 #assert self.read_documents == self.count # not upheld by from_document 242 raise StopIteration 243 244 class PositionDictionaryWriter: 245 246 "Writing position dictionaries." 247 248 def __init__(self, position_writer, position_index_writer, interval): 249 self.position_writer = position_writer 250 self.position_index_writer = position_index_writer 251 self.interval = interval 252 253 def write_term_positions(self, doc_positions): 254 255 """ 256 Write all 'doc_positions' - a collection of tuples of the form (document 257 number, position list) - to the file. 258 259 Add some records to the index, making dictionary entries. 260 261 Return a tuple containing the offset of the written data, the frequency 262 (number of positions), and document frequency (number of documents) for 263 the term involved. 264 """ 265 266 # Reset the writers. 267 268 self.position_writer.reset() 269 self.position_index_writer.reset() 270 271 # Remember the first index entry offset. 272 273 index_offset = self.position_index_writer.f.tell() 274 275 # Write the positions. 276 277 frequency = 0 278 count = 0 279 280 if doc_positions: 281 282 # Retain the first record offset for a subsequent index entry. 283 284 first_offset = self.position_writer.f.tell() 285 first_docnum = None 286 287 doc_positions.sort() 288 289 for docnum, positions in doc_positions: 290 if first_docnum is None: 291 first_docnum = docnum 292 293 self.position_writer.write_positions(docnum, positions) 294 295 frequency += len(positions) 296 count += 1 297 298 # Every {interval} entries, write an index entry. 299 300 if count % self.interval == 0: 301 302 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 303 304 first_offset = self.position_writer.f.tell() 305 first_docnum = None 306 307 # Reset the position writer so that position readers accessing 308 # a section start with the correct document number. 309 310 self.position_writer.reset() 311 312 # Finish writing an index entry for the remaining documents. 313 314 else: 315 if first_docnum is not None: 316 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 317 318 return index_offset, frequency, count 319 320 def close(self): 321 self.position_writer.close() 322 self.position_index_writer.close() 323 324 class PositionDictionaryReader: 325 326 "Reading position dictionaries." 327 328 def __init__(self, position_opener, position_index_opener): 329 self.position_opener = position_opener 330 self.position_index_opener = position_index_opener 331 self.position_dict_iterators = [] 332 333 def read_term_positions(self, offset, doc_frequency): 334 335 """ 336 Return an iterator for dictionary entries starting at 'offset' with the 337 given 'doc_frequency'. 338 """ 339 340 it = PositionDictionaryIterator(self.position_opener, 341 self.position_index_opener, offset, doc_frequency) 342 self.position_dict_iterators.append(it) 343 return it 344 345 def close(self): 346 for it in self.position_dict_iterators: 347 it.close() 348 self.position_dict_iterators = [] 349 350 class PositionDictionaryIterator: 351 352 "Iteration over position dictionary entries." 353 354 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 355 self.position_opener = position_opener 356 self.position_index_opener = position_index_opener 357 self.doc_frequency = doc_frequency 358 359 self.index_iterator = None 360 self.iterator = None 361 362 # Initialise the iterators. 363 364 self.reset(offset, doc_frequency) 365 366 def reset(self, offset, doc_frequency): 367 368 # Remember the last values. 369 370 self.found_docnum, self.found_positions = None, None 371 372 # Attempt to reuse the index iterator. 373 374 if self.index_iterator is not None: 375 ii = self.index_iterator 376 ii.replenish(doc_frequency) 377 ii.f.seek(offset) 378 ii.reset() 379 380 # Or make a new index iterator. 381 382 else: 383 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 384 385 # Maintain state for the next index entry, if read. 386 387 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 388 389 # Initialise the current index entry and current position file iterator. 390 391 self._next_section() 392 self._init_section() 393 394 # Sequence methods. 395 396 def __len__(self): 397 return self.doc_frequency 398 399 def sort(self): 400 pass 401 402 # Iterator methods. 403 404 def __iter__(self): 405 return self 406 407 def next(self): 408 409 """ 410 Attempt to get the next document record from the section in the 411 positions file. 412 """ 413 414 # Return any visited but unrequested record. 415 416 if self.found_docnum is not None: 417 t = self.found_docnum, self.found_positions 418 self.found_docnum, self.found_positions = None, None 419 return t 420 421 # Or search for the next record. 422 423 while 1: 424 425 # Either return the next record. 426 427 try: 428 return self.iterator.next() 429 430 # Or, where a section is finished, get the next section and try again. 431 432 except StopIteration: 433 434 # Where a section follows, update the index iterator, but keep 435 # reading using the same file iterator (since the data should 436 # just follow on from the last section). 437 438 self._next_section() 439 self.iterator.replenish(self.section_count) 440 441 # Reset the state of the iterator to make sure that document 442 # numbers are correct. 443 444 self.iterator.reset() 445 446 def from_document(self, docnum): 447 448 """ 449 Attempt to navigate to a positions entry for the given 'docnum', 450 returning the positions for 'docnum', or None otherwise. 451 """ 452 453 # Return any unrequested document positions. 454 455 if docnum == self.found_docnum: 456 return self.found_positions 457 458 # Read ahead in the index until the next entry refers to a document 459 # later than the desired document. 460 461 try: 462 if self.next_docnum is None: 463 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 464 465 # Read until the next entry is after the desired document number, 466 # or until the end of the results. 467 468 while self.next_docnum <= docnum: 469 self._next_read_section() 470 if self.docnum < docnum: 471 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 472 else: 473 break 474 475 except StopIteration: 476 pass 477 478 # Navigate in the position file to the document. 479 480 self._init_section() 481 482 try: 483 while 1: 484 found_docnum, found_positions = self.iterator.next() 485 486 # Return the desired document positions or None (retaining the 487 # positions for the document immediately after). 488 489 if docnum == found_docnum: 490 return found_positions 491 elif docnum < found_docnum: 492 self.found_docnum, self.found_positions = found_docnum, found_positions 493 return None 494 495 except StopIteration: 496 return None 497 498 # Internal methods. 499 500 def _next_section(self): 501 502 "Attempt to get the next section in the index." 503 504 if self.next_docnum is None: 505 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 506 else: 507 self._next_read_section() 508 509 def _next_read_section(self): 510 511 """ 512 Make the next index entry the current one without reading from the 513 index. 514 """ 515 516 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 517 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 518 519 def _init_section(self): 520 521 "Initialise the iterator for the section in the position file." 522 523 # Attempt to reuse any correctly positioned iterator. 524 525 if self.iterator is not None: 526 i = self.iterator 527 i.replenish(self.section_count) 528 i.f.seek(self.pos_offset) 529 i.reset() 530 531 # Otherwise, obtain a new iterator. 532 533 else: 534 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 535 536 def close(self): 537 if self.iterator is not None: 538 self.iterator.close() 539 self.iterator = None 540 if self.index_iterator is not None: 541 self.index_iterator.close() 542 self.index_iterator = None 543 544 class ResetPositionDictionaryIterator: 545 546 """ 547 A helper class which permits the reuse of iterators without modifying their 548 state. 549 """ 550 551 def __init__(self, iterator, offset, doc_frequency): 552 self.iterator = iterator 553 self.offset = offset 554 self.doc_frequency = doc_frequency 555 556 def __iter__(self): 557 self.iterator.reset(self.offset, self.doc_frequency) 558 return iter(self.iterator) 559 560 # vim: tabstop=4 expandtab shiftwidth=4