1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint 23 24 class PositionWriter(FileWriter): 25 26 "Writing position information to files." 27 28 def reset(self): 29 self.last_docnum = 0 30 31 def write_positions(self, docnum, positions): 32 33 """ 34 Write for the document 'docnum' the given 'positions'. 35 Return the offset of the written record. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Record the offset of this record. 42 43 offset = self.tell() 44 45 # Make sure that the positions are sorted. 46 47 positions.sort() 48 49 # Write the position deltas. 50 51 output = [] 52 last = 0 53 54 for position in positions: 55 output.append(vint(position - last)) 56 last = position 57 58 # Write the document number delta. 59 # Write the number of positions. 60 # Then write the positions. 61 62 self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output)) 63 64 self.last_docnum = docnum 65 return offset 66 67 class PositionOpener(FileOpener): 68 69 "Reading position information from files." 70 71 def read_term_positions(self, offset, count): 72 73 """ 74 Read all positions from 'offset', seeking to that position in the file 75 before reading. The number of documents available for reading is limited 76 to 'count'. 77 """ 78 79 # Duplicate the file handle. 80 81 f = self.open("rb") 82 return PositionIterator(f, offset, count) 83 84 class PositionIndexWriter(FileWriter): 85 86 "Writing position index information to files." 87 88 def reset(self): 89 self.last_docnum = 0 90 self.last_pos_offset = 0 91 92 def write_positions(self, docnum, pos_offset, count): 93 94 """ 95 Write the given 'docnum, 'pos_offset' and document 'count' to the 96 position index file. 97 """ 98 99 # Record the offset of this record. 100 101 offset = self.tell() 102 output = [] 103 104 # Write the document number delta. 105 106 output.append(vint(docnum - self.last_docnum)) 107 self.last_docnum = docnum 108 109 # Write the position file offset delta. 110 111 output.append(vint(pos_offset - self.last_pos_offset)) 112 self.last_pos_offset = pos_offset 113 114 # Write the document count. 115 116 output.append(vint(count)) 117 118 # Actually write the data. 119 120 self.write("".join(output)) 121 122 return offset 123 124 class PositionIndexOpener(FileOpener): 125 126 "Reading position index information from files." 127 128 def read_term_positions(self, offset, doc_frequency): 129 130 """ 131 Read all positions from 'offset', seeking to that position in the file 132 before reading. The number of documents available for reading is limited 133 to 'doc_frequency'. 134 """ 135 136 # Duplicate the file handle. 137 138 f = self.open("rb") 139 return PositionIndexIterator(f, offset, doc_frequency) 140 141 # Iterators for position-related files. 142 143 class IteratorBase: 144 145 def __init__(self, count): 146 self.replenish(count) 147 148 def replenish(self, count): 149 self.count = count 150 self.read_documents = 0 151 152 def __len__(self): 153 return self.count 154 155 def sort(self): 156 pass # Stored document positions are already sorted. 157 158 def __iter__(self): 159 return self 160 161 class PositionIterator(FileReader, IteratorBase): 162 163 "Iterating over document positions." 164 165 def __init__(self, f, offset, count): 166 FileReader.__init__(self, f) 167 IteratorBase.__init__(self, count) 168 self.seek(offset) 169 170 def reset(self): 171 self.last_docnum = 0 172 173 def read_positions(self): 174 175 "Read positions, returning a document number and a list of positions." 176 177 # Read the document number delta and add it to the last number. 178 179 self.last_docnum += self.read_number() 180 181 # Read the number of positions. 182 183 npositions = self.read_number() 184 185 # Read the position deltas, adding each previous position to get the 186 # appropriate collection of absolute positions. 187 188 i = 0 189 last = 0 190 positions = [] 191 192 while i < npositions: 193 last += self.read_number() 194 positions.append(last) 195 i += 1 196 197 return self.last_docnum, positions 198 199 def next(self): 200 201 "Read positions for a single document." 202 203 if self.read_documents < self.count: 204 self.read_documents += 1 205 return self.read_positions() 206 else: 207 raise StopIteration 208 209 class PositionIndexIterator(FileReader, IteratorBase): 210 211 "Iterating over document positions." 212 213 def __init__(self, f, offset, count): 214 FileReader.__init__(self, f) 215 IteratorBase.__init__(self, count) 216 self.seek(offset) 217 218 def reset(self): 219 self.last_docnum = 0 220 self.last_pos_offset = 0 221 self.section_count = 0 222 223 def read_positions(self): 224 225 """ 226 Read a document number, a position file offset for the position index 227 file, and the number of documents in a section of that file. 228 """ 229 230 # Read the document number delta. 231 232 self.last_docnum += self.read_number() 233 234 # Read the offset delta. 235 236 self.last_pos_offset += self.read_number() 237 238 # Read the document count. 239 240 count = self.read_number() 241 242 return self.last_docnum, self.last_pos_offset, count 243 244 def next(self): 245 246 "Read positions for a single document." 247 248 self.read_documents += self.section_count 249 if self.read_documents < self.count: 250 docnum, pos_offset, self.section_count = t = self.read_positions() 251 return t 252 else: 253 assert self.read_documents == self.count 254 raise StopIteration 255 256 class PositionDictionaryWriter: 257 258 "Writing position dictionaries." 259 260 def __init__(self, position_writer, position_index_writer, interval): 261 self.position_writer = position_writer 262 self.position_index_writer = position_index_writer 263 self.interval = interval 264 265 def write_term_positions(self, doc_positions): 266 267 """ 268 Write all 'doc_positions' - a collection of tuples of the form (document 269 number, position list) - to the file. 270 271 Add some records to the index, making dictionary entries. 272 273 Return a tuple containing the offset of the written data, the frequency 274 (number of positions), and document frequency (number of documents) for 275 the term involved. 276 """ 277 278 # Reset the writers. 279 280 self.position_writer.reset() 281 self.position_index_writer.reset() 282 283 index_offset = None 284 285 # Write the positions. 286 287 frequency = 0 288 first_docnum = None 289 first_offset = None 290 count = 0 291 292 doc_positions.sort() 293 294 for docnum, positions in doc_positions: 295 pos_offset = self.position_writer.write_positions(docnum, positions) 296 297 # Retain the first record offset for a subsequent index entry. 298 299 if first_offset is None: 300 first_offset = pos_offset 301 first_docnum = docnum 302 303 frequency += len(positions) 304 count += 1 305 306 # Every {interval} entries, write an index entry. 307 308 if count % self.interval == 0: 309 io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 310 311 # Remember the first index entry offset. 312 313 if index_offset is None: 314 index_offset = io 315 316 first_offset = None 317 first_docnum = None 318 319 # Reset the position writer so that position readers accessing 320 # a section start with the correct document number. 321 322 self.position_writer.reset() 323 324 # Finish writing an index entry for the remaining documents. 325 326 else: 327 if first_offset is not None: 328 io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 329 330 # Remember the first index entry offset. 331 332 if index_offset is None: 333 index_offset = io 334 335 return index_offset, frequency, count 336 337 def close(self): 338 self.position_writer.close() 339 self.position_index_writer.close() 340 341 class PositionDictionaryReader: 342 343 "Reading position dictionaries." 344 345 def __init__(self, position_opener, position_index_opener): 346 self.position_opener = position_opener 347 self.position_index_opener = position_index_opener 348 349 def read_term_positions(self, offset, doc_frequency): 350 351 """ 352 Return an iterator for dictionary entries starting at 'offset' with the 353 given 'doc_frequency'. 354 """ 355 356 return PositionDictionaryIterator(self.position_opener, 357 self.position_index_opener, offset, doc_frequency) 358 359 def close(self): 360 pass 361 362 class PositionDictionaryIterator: 363 364 "Iteration over position dictionary entries." 365 366 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 367 self.position_opener = position_opener 368 self.position_index_opener = position_index_opener 369 self.doc_frequency = doc_frequency 370 371 self.index_iterator = None 372 self.iterator = None 373 374 # Initialise the iterators. 375 376 self.reset(offset, doc_frequency) 377 378 def reset(self, offset, doc_frequency): 379 380 # Remember the last values. 381 382 self.found_docnum, self.found_positions = None, None 383 384 # Attempt to reuse the index iterator. 385 386 if self.index_iterator is not None: 387 self.index_iterator.replenish(doc_frequency) 388 self.index_iterator.seek(offset) 389 self.index_iterator.reset() 390 391 # Or make a new index iterator. 392 393 else: 394 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 395 396 # Maintain state for the next index entry, if read. 397 398 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 399 400 # Initialise the current index entry and current position file iterator. 401 402 self._next_section() 403 self._init_section() 404 405 # Sequence methods. 406 407 def __len__(self): 408 return self.doc_frequency 409 410 def sort(self): 411 pass 412 413 # Iterator methods. 414 415 def __iter__(self): 416 return self 417 418 def next(self): 419 420 """ 421 Attempt to get the next document record from the section in the 422 positions file. 423 """ 424 425 # Return any visited but unrequested record. 426 427 if self.found_docnum is not None: 428 t = self.found_docnum, self.found_positions 429 self.found_docnum, self.found_positions = None, None 430 return t 431 432 # Or search for the next record. 433 434 while 1: 435 436 # Either return the next record. 437 438 try: 439 return self.iterator.next() 440 441 # Or, where a section is finished, get the next section and try again. 442 443 except StopIteration: 444 445 # Where a section follows, update the index iterator, but keep 446 # reading using the same file iterator (since the data should 447 # just follow on from the last section). 448 449 self._next_section() 450 self.iterator.replenish(self.section_count) 451 452 # Reset the state of the iterator to make sure that document 453 # numbers are correct. 454 455 self.iterator.reset() 456 457 def from_document(self, docnum): 458 459 """ 460 Attempt to navigate to a positions entry for the given 'docnum', 461 returning the positions for 'docnum', or None otherwise. 462 """ 463 464 # Return any unrequested document positions. 465 466 if docnum == self.found_docnum: 467 return self.found_positions 468 469 # Read ahead in the index until the next entry refers to a document 470 # later than the desired document. 471 472 try: 473 if self.next_docnum is None: 474 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 475 476 # Read until the next entry is after the desired document number, 477 # or until the end of the results. 478 479 while self.next_docnum <= docnum: 480 self._next_read_section() 481 if self.docnum < docnum: 482 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 483 else: 484 break 485 486 except StopIteration: 487 pass 488 489 # Navigate in the position file to the document. 490 491 self._init_section() 492 493 try: 494 while 1: 495 found_docnum, found_positions = self.iterator.next() 496 497 # Return the desired document positions or None (retaining the 498 # positions for the document immediately after). 499 500 if docnum == found_docnum: 501 return found_positions 502 elif docnum < found_docnum: 503 self.found_docnum, self.found_positions = found_docnum, found_positions 504 return None 505 506 except StopIteration: 507 return None 508 509 # Internal methods. 510 511 def _next_section(self): 512 513 "Attempt to get the next section in the index." 514 515 if self.next_docnum is None: 516 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 517 else: 518 self._next_read_section() 519 520 def _next_read_section(self): 521 522 """ 523 Make the next index entry the current one without reading from the 524 index. 525 """ 526 527 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 528 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 529 530 def _init_section(self): 531 532 "Initialise the iterator for the section in the position file." 533 534 # Attempt to reuse any correctly positioned iterator. 535 536 if self.iterator is not None: 537 self.iterator.replenish(self.section_count) 538 self.iterator.seek(self.pos_offset) 539 self.iterator.reset() 540 541 # Otherwise, obtain a new iterator. 542 543 else: 544 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 545 546 def close(self): 547 if self.iterator is not None: 548 self.iterator.close() 549 self.iterator = None 550 if self.index_iterator is not None: 551 self.index_iterator.close() 552 self.index_iterator = None 553 554 class ResetPositionDictionaryIterator: 555 556 """ 557 A helper class which permits the reuse of iterators without modifying their 558 state. 559 """ 560 561 def __init__(self, iterator, offset, doc_frequency): 562 self.iterator = iterator 563 self.offset = offset 564 self.doc_frequency = doc_frequency 565 566 def __iter__(self): 567 self.iterator.reset(self.offset, self.doc_frequency) 568 return iter(self.iterator) 569 570 # vim: tabstop=4 expandtab shiftwidth=4