1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint 23 24 class PositionWriter(FileWriter): 25 26 "Writing position information to files." 27 28 def reset(self): 29 self.last_docnum = 0 30 31 def write_positions(self, docnum, positions): 32 33 """ 34 Write for the document 'docnum' the given 'positions'. 35 Return the offset of the written record. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Record the offset of this record. 42 43 offset = self.f.tell() 44 45 # Make sure that the positions are sorted. 46 47 positions.sort() 48 49 # Write the document number delta. 50 # Write the number of positions. 51 52 output = [docnum - self.last_docnum, len(positions)] 53 54 # Write the position deltas. 55 56 append = output.append 57 last = 0 58 59 for position in positions: 60 append(position - last) 61 last = position 62 63 self.f.write("".join([vint(x) for x in output])) 64 65 self.last_docnum = docnum 66 return offset 67 68 class PositionOpener(FileOpener): 69 70 "Reading position information from files." 71 72 def read_term_positions(self, offset, count): 73 74 """ 75 Read all positions from 'offset', seeking to that position in the file 76 before reading. The number of documents available for reading is limited 77 to 'count'. 78 """ 79 80 # Duplicate the file handle. 81 82 f = self.open("rb") 83 return PositionIterator(f, offset, count) 84 85 class PositionIndexWriter(FileWriter): 86 87 "Writing position index information to files." 88 89 def reset(self): 90 self.last_docnum = 0 91 self.last_pos_offset = 0 92 93 def write_positions(self, docnum, pos_offset, count): 94 95 """ 96 Write the given 'docnum, 'pos_offset' and document 'count' to the 97 position index file. 98 """ 99 100 # Record the offset of this record. 101 102 offset = self.f.tell() 103 104 # Write the document number delta. 105 # Write the position file offset delta. 106 # Write the document count. 107 108 output = [docnum - self.last_docnum, pos_offset - self.last_pos_offset, count] 109 110 # Actually write the data. 111 112 self.f.write("".join([vint(x) for x in output])) 113 114 self.last_pos_offset = pos_offset 115 self.last_docnum = docnum 116 return offset 117 118 class PositionIndexOpener(FileOpener): 119 120 "Reading position index information from files." 121 122 def read_term_positions(self, offset, doc_frequency): 123 124 """ 125 Read all positions from 'offset', seeking to that position in the file 126 before reading. The number of documents available for reading is limited 127 to 'doc_frequency'. 128 """ 129 130 # Duplicate the file handle. 131 132 f = self.open("rb") 133 return PositionIndexIterator(f, offset, doc_frequency) 134 135 # Iterators for position-related files. 136 137 class IteratorBase: 138 139 def __init__(self, count): 140 self.replenish(count) 141 142 def replenish(self, count): 143 self.count = count 144 self.read_documents = 0 145 146 def __len__(self): 147 return self.count 148 149 def sort(self): 150 pass # Stored document positions are already sorted. 151 152 def __iter__(self): 153 return self 154 155 class PositionIterator(FileReader, IteratorBase): 156 157 "Iterating over document positions." 158 159 def __init__(self, f, offset, count): 160 FileReader.__init__(self, f) 161 IteratorBase.__init__(self, count) 162 self.f.seek(offset) 163 164 def reset(self): 165 self.last_docnum = 0 166 167 def read_positions(self): 168 169 "Read positions, returning a document number and a list of positions." 170 171 # Read the document number delta and add it to the last number. 172 173 self.last_docnum += self.read_number() 174 175 # Read the number of positions. 176 177 npositions = self.read_number() 178 179 # Read the position deltas, adding each previous position to get the 180 # appropriate collection of absolute positions. 181 182 i = 0 183 last = 0 184 positions = [] 185 186 while i < npositions: 187 last += self.read_number() 188 positions.append(last) 189 i += 1 190 191 return self.last_docnum, positions 192 193 def next(self): 194 195 "Read positions for a single document." 196 197 if self.read_documents < self.count: 198 self.read_documents += 1 199 return self.read_positions() 200 else: 201 raise StopIteration 202 203 class PositionIndexIterator(FileReader, IteratorBase): 204 205 "Iterating over document positions." 206 207 def __init__(self, f, offset, count): 208 FileReader.__init__(self, f) 209 IteratorBase.__init__(self, count) 210 self.f.seek(offset) 211 212 def reset(self): 213 self.last_docnum = 0 214 self.last_pos_offset = 0 215 self.section_count = 0 216 217 def read_positions(self): 218 219 """ 220 Read a document number, a position file offset for the position index 221 file, and the number of documents in a section of that file. 222 """ 223 224 # Read the document number delta. 225 226 self.last_docnum += self.read_number() 227 228 # Read the offset delta. 229 230 self.last_pos_offset += self.read_number() 231 232 # Read the document count. 233 234 count = self.read_number() 235 236 return self.last_docnum, self.last_pos_offset, count 237 238 def next(self): 239 240 "Read positions for a single document." 241 242 self.read_documents += self.section_count 243 if self.read_documents < self.count: 244 docnum, pos_offset, self.section_count = t = self.read_positions() 245 return t 246 else: 247 assert self.read_documents == self.count 248 raise StopIteration 249 250 class PositionDictionaryWriter: 251 252 "Writing position dictionaries." 253 254 def __init__(self, position_writer, position_index_writer, interval): 255 self.position_writer = position_writer 256 self.position_index_writer = position_index_writer 257 self.interval = interval 258 259 def write_term_positions(self, doc_positions): 260 261 """ 262 Write all 'doc_positions' - a collection of tuples of the form (document 263 number, position list) - to the file. 264 265 Add some records to the index, making dictionary entries. 266 267 Return a tuple containing the offset of the written data, the frequency 268 (number of positions), and document frequency (number of documents) for 269 the term involved. 270 """ 271 272 # Reset the writers. 273 274 self.position_writer.reset() 275 self.position_index_writer.reset() 276 277 index_offset = None 278 279 # Write the positions. 280 281 frequency = 0 282 first_docnum = None 283 first_offset = None 284 count = 0 285 286 doc_positions.sort() 287 288 for docnum, positions in doc_positions: 289 pos_offset = self.position_writer.write_positions(docnum, positions) 290 291 # Retain the first record offset for a subsequent index entry. 292 293 if first_offset is None: 294 first_offset = pos_offset 295 first_docnum = docnum 296 297 frequency += len(positions) 298 count += 1 299 300 # Every {interval} entries, write an index entry. 301 302 if count % self.interval == 0: 303 io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 304 305 # Remember the first index entry offset. 306 307 if index_offset is None: 308 index_offset = io 309 310 first_offset = None 311 first_docnum = None 312 313 # Reset the position writer so that position readers accessing 314 # a section start with the correct document number. 315 316 self.position_writer.reset() 317 318 # Finish writing an index entry for the remaining documents. 319 320 else: 321 if first_offset is not None: 322 io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 323 324 # Remember the first index entry offset. 325 326 if index_offset is None: 327 index_offset = io 328 329 return index_offset, frequency, count 330 331 def close(self): 332 self.position_writer.close() 333 self.position_index_writer.close() 334 335 class PositionDictionaryReader: 336 337 "Reading position dictionaries." 338 339 def __init__(self, position_opener, position_index_opener): 340 self.position_opener = position_opener 341 self.position_index_opener = position_index_opener 342 343 def read_term_positions(self, offset, doc_frequency): 344 345 """ 346 Return an iterator for dictionary entries starting at 'offset' with the 347 given 'doc_frequency'. 348 """ 349 350 return PositionDictionaryIterator(self.position_opener, 351 self.position_index_opener, offset, doc_frequency) 352 353 def close(self): 354 pass 355 356 class PositionDictionaryIterator: 357 358 "Iteration over position dictionary entries." 359 360 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 361 self.position_opener = position_opener 362 self.position_index_opener = position_index_opener 363 self.doc_frequency = doc_frequency 364 365 self.index_iterator = None 366 self.iterator = None 367 368 # Initialise the iterators. 369 370 self.reset(offset, doc_frequency) 371 372 def reset(self, offset, doc_frequency): 373 374 # Remember the last values. 375 376 self.found_docnum, self.found_positions = None, None 377 378 # Attempt to reuse the index iterator. 379 380 if self.index_iterator is not None: 381 ii = self.index_iterator 382 ii.replenish(doc_frequency) 383 ii.f.seek(offset) 384 ii.reset() 385 386 # Or make a new index iterator. 387 388 else: 389 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 390 391 # Maintain state for the next index entry, if read. 392 393 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 394 395 # Initialise the current index entry and current position file iterator. 396 397 self._next_section() 398 self._init_section() 399 400 # Sequence methods. 401 402 def __len__(self): 403 return self.doc_frequency 404 405 def sort(self): 406 pass 407 408 # Iterator methods. 409 410 def __iter__(self): 411 return self 412 413 def next(self): 414 415 """ 416 Attempt to get the next document record from the section in the 417 positions file. 418 """ 419 420 # Return any visited but unrequested record. 421 422 if self.found_docnum is not None: 423 t = self.found_docnum, self.found_positions 424 self.found_docnum, self.found_positions = None, None 425 return t 426 427 # Or search for the next record. 428 429 while 1: 430 431 # Either return the next record. 432 433 try: 434 return self.iterator.next() 435 436 # Or, where a section is finished, get the next section and try again. 437 438 except StopIteration: 439 440 # Where a section follows, update the index iterator, but keep 441 # reading using the same file iterator (since the data should 442 # just follow on from the last section). 443 444 self._next_section() 445 self.iterator.replenish(self.section_count) 446 447 # Reset the state of the iterator to make sure that document 448 # numbers are correct. 449 450 self.iterator.reset() 451 452 def from_document(self, docnum): 453 454 """ 455 Attempt to navigate to a positions entry for the given 'docnum', 456 returning the positions for 'docnum', or None otherwise. 457 """ 458 459 # Return any unrequested document positions. 460 461 if docnum == self.found_docnum: 462 return self.found_positions 463 464 # Read ahead in the index until the next entry refers to a document 465 # later than the desired document. 466 467 try: 468 if self.next_docnum is None: 469 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 470 471 # Read until the next entry is after the desired document number, 472 # or until the end of the results. 473 474 while self.next_docnum <= docnum: 475 self._next_read_section() 476 if self.docnum < docnum: 477 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 478 else: 479 break 480 481 except StopIteration: 482 pass 483 484 # Navigate in the position file to the document. 485 486 self._init_section() 487 488 try: 489 while 1: 490 found_docnum, found_positions = self.iterator.next() 491 492 # Return the desired document positions or None (retaining the 493 # positions for the document immediately after). 494 495 if docnum == found_docnum: 496 return found_positions 497 elif docnum < found_docnum: 498 self.found_docnum, self.found_positions = found_docnum, found_positions 499 return None 500 501 except StopIteration: 502 return None 503 504 # Internal methods. 505 506 def _next_section(self): 507 508 "Attempt to get the next section in the index." 509 510 if self.next_docnum is None: 511 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 512 else: 513 self._next_read_section() 514 515 def _next_read_section(self): 516 517 """ 518 Make the next index entry the current one without reading from the 519 index. 520 """ 521 522 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 523 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 524 525 def _init_section(self): 526 527 "Initialise the iterator for the section in the position file." 528 529 # Attempt to reuse any correctly positioned iterator. 530 531 if self.iterator is not None: 532 i = self.iterator 533 i.replenish(self.section_count) 534 i.f.seek(self.pos_offset) 535 i.reset() 536 537 # Otherwise, obtain a new iterator. 538 539 else: 540 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 541 542 def close(self): 543 if self.iterator is not None: 544 self.iterator.close() 545 self.iterator = None 546 if self.index_iterator is not None: 547 self.index_iterator.close() 548 self.index_iterator = None 549 550 class ResetPositionDictionaryIterator: 551 552 """ 553 A helper class which permits the reuse of iterators without modifying their 554 state. 555 """ 556 557 def __init__(self, iterator, offset, doc_frequency): 558 self.iterator = iterator 559 self.offset = offset 560 self.doc_frequency = doc_frequency 561 562 def __iter__(self): 563 self.iterator.reset(self.offset, self.doc_frequency) 564 return iter(self.iterator) 565 566 # vim: tabstop=4 expandtab shiftwidth=4