1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = 0 31 32 def write_positions(self, docnum, positions): 33 34 """ 35 Write for the document 'docnum' the given 'positions'. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Make sure that the positions are sorted. 42 43 positions.sort() 44 45 # Write the document number delta. 46 # Write the number of positions. 47 48 output = array('B') 49 vint_to_array(docnum - self.last_docnum, output) 50 vint_to_array(len(positions), output) 51 52 # Write the position deltas. 53 54 last = 0 55 56 for position in positions: 57 vint_to_array(position - last, output) 58 last = position 59 60 output.tofile(self.f) 61 62 self.last_docnum = docnum 63 64 class PositionOpener(FileOpener): 65 66 "Reading position information from files." 67 68 def read_term_positions(self, offset, count): 69 70 """ 71 Read all positions from 'offset', seeking to that position in the file 72 before reading. The number of documents available for reading is limited 73 to 'count'. 74 """ 75 76 # Duplicate the file handle. 77 78 f = self.open("rb") 79 return PositionIterator(f, offset, count) 80 81 class PositionIndexWriter(FileWriter): 82 83 "Writing position index information to files." 84 85 def reset(self): 86 self.last_docnum = 0 87 self.last_pos_offset = 0 88 89 def write_positions(self, docnum, pos_offset, count): 90 91 """ 92 Write the given 'docnum, 'pos_offset' and document 'count' to the 93 position index file. 94 """ 95 96 # Write the document number delta. 97 # Write the position file offset delta. 98 # Write the document count. 99 100 output = array('B') 101 vint_to_array(docnum - self.last_docnum, output) 102 vint_to_array(pos_offset - self.last_pos_offset, output) 103 vint_to_array(count, output) 104 105 # Actually write the data. 106 107 output.tofile(self.f) 108 109 self.last_pos_offset = pos_offset 110 self.last_docnum = docnum 111 112 class PositionIndexOpener(FileOpener): 113 114 "Reading position index information from files." 115 116 def read_term_positions(self, offset, doc_frequency): 117 118 """ 119 Read all positions from 'offset', seeking to that position in the file 120 before reading. The number of documents available for reading is limited 121 to 'doc_frequency'. 122 """ 123 124 # Duplicate the file handle. 125 126 f = self.open("rb") 127 return PositionIndexIterator(f, offset, doc_frequency) 128 129 # Iterators for position-related files. 130 131 class IteratorBase: 132 133 def __init__(self, count): 134 self.replenish(count) 135 136 def replenish(self, count): 137 self.count = count 138 self.read_documents = 0 139 140 def __len__(self): 141 return self.count 142 143 def sort(self): 144 pass # Stored document positions are already sorted. 145 146 def __iter__(self): 147 return self 148 149 class PositionIterator(FileReader, IteratorBase): 150 151 "Iterating over document positions." 152 153 def __init__(self, f, offset, count): 154 FileReader.__init__(self, f) 155 IteratorBase.__init__(self, count) 156 self.f.seek(offset) 157 158 def reset(self): 159 self.last_docnum = 0 160 161 def read_positions(self): 162 163 "Read positions, returning a document number and a list of positions." 164 165 # Read the document number delta and add it to the last number. 166 167 self.last_docnum += self.read_number() 168 169 # Read the number of positions. 170 171 npositions = self.read_number() 172 173 # Read the position deltas, adding each previous position to get the 174 # appropriate collection of absolute positions. 175 176 i = 0 177 last = 0 178 positions = [] 179 180 while i < npositions: 181 last += self.read_number() 182 positions.append(last) 183 i += 1 184 185 return self.last_docnum, positions 186 187 def next(self): 188 189 "Read positions for a single document." 190 191 if self.read_documents < self.count: 192 self.read_documents += 1 193 return self.read_positions() 194 else: 195 raise StopIteration 196 197 class PositionIndexIterator(FileReader, IteratorBase): 198 199 "Iterating over document positions." 200 201 def __init__(self, f, offset, count): 202 FileReader.__init__(self, f) 203 IteratorBase.__init__(self, count) 204 self.f.seek(offset) 205 206 def reset(self): 207 self.last_docnum = 0 208 self.last_pos_offset = 0 209 self.section_count = 0 210 211 def read_positions(self): 212 213 """ 214 Read a document number, a position file offset for the position index 215 file, and the number of documents in a section of that file. 216 """ 217 218 # Read the document number delta. 219 220 self.last_docnum += self.read_number() 221 222 # Read the offset delta. 223 224 self.last_pos_offset += self.read_number() 225 226 # Read the document count. 227 228 count = self.read_number() 229 230 return self.last_docnum, self.last_pos_offset, count 231 232 def next(self): 233 234 "Read positions for a single document." 235 236 self.read_documents += self.section_count 237 if self.read_documents < self.count: 238 docnum, pos_offset, self.section_count = t = self.read_positions() 239 return t 240 else: 241 assert self.read_documents == self.count 242 raise StopIteration 243 244 class PositionDictionaryWriter: 245 246 "Writing position dictionaries." 247 248 def __init__(self, position_writer, position_index_writer, interval): 249 self.position_writer = position_writer 250 self.position_index_writer = position_index_writer 251 self.interval = interval 252 253 def write_term_positions(self, doc_positions): 254 255 """ 256 Write all 'doc_positions' - a collection of tuples of the form (document 257 number, position list) - to the file. 258 259 Add some records to the index, making dictionary entries. 260 261 Return a tuple containing the offset of the written data, the frequency 262 (number of positions), and document frequency (number of documents) for 263 the term involved. 264 """ 265 266 # Reset the writers. 267 268 self.position_writer.reset() 269 self.position_index_writer.reset() 270 271 # Remember the first index entry offset. 272 273 index_offset = self.position_index_writer.f.tell() 274 275 # Write the positions. 276 277 frequency = 0 278 count = 0 279 280 if doc_positions: 281 282 # Retain the first record offset for a subsequent index entry. 283 284 first_offset = self.position_writer.f.tell() 285 first_docnum = None 286 287 doc_positions.sort() 288 289 for docnum, positions in doc_positions: 290 if first_docnum is None: 291 first_docnum = docnum 292 293 self.position_writer.write_positions(docnum, positions) 294 295 frequency += len(positions) 296 count += 1 297 298 # Every {interval} entries, write an index entry. 299 300 if count % self.interval == 0: 301 302 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 303 304 first_offset = self.position_writer.f.tell() 305 first_docnum = None 306 307 # Reset the position writer so that position readers accessing 308 # a section start with the correct document number. 309 310 self.position_writer.reset() 311 312 # Finish writing an index entry for the remaining documents. 313 314 else: 315 if first_docnum is not None: 316 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 317 318 return index_offset, frequency, count 319 320 def close(self): 321 self.position_writer.close() 322 self.position_index_writer.close() 323 324 class PositionDictionaryReader: 325 326 "Reading position dictionaries." 327 328 def __init__(self, position_opener, position_index_opener): 329 self.position_opener = position_opener 330 self.position_index_opener = position_index_opener 331 332 def read_term_positions(self, offset, doc_frequency): 333 334 """ 335 Return an iterator for dictionary entries starting at 'offset' with the 336 given 'doc_frequency'. 337 """ 338 339 return PositionDictionaryIterator(self.position_opener, 340 self.position_index_opener, offset, doc_frequency) 341 342 def close(self): 343 pass 344 345 class PositionDictionaryIterator: 346 347 "Iteration over position dictionary entries." 348 349 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 350 self.position_opener = position_opener 351 self.position_index_opener = position_index_opener 352 self.doc_frequency = doc_frequency 353 354 self.index_iterator = None 355 self.iterator = None 356 357 # Initialise the iterators. 358 359 self.reset(offset, doc_frequency) 360 361 def reset(self, offset, doc_frequency): 362 363 # Remember the last values. 364 365 self.found_docnum, self.found_positions = None, None 366 367 # Attempt to reuse the index iterator. 368 369 if self.index_iterator is not None: 370 ii = self.index_iterator 371 ii.replenish(doc_frequency) 372 ii.f.seek(offset) 373 ii.reset() 374 375 # Or make a new index iterator. 376 377 else: 378 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 379 380 # Maintain state for the next index entry, if read. 381 382 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 383 384 # Initialise the current index entry and current position file iterator. 385 386 self._next_section() 387 self._init_section() 388 389 # Sequence methods. 390 391 def __len__(self): 392 return self.doc_frequency 393 394 def sort(self): 395 pass 396 397 # Iterator methods. 398 399 def __iter__(self): 400 return self 401 402 def next(self): 403 404 """ 405 Attempt to get the next document record from the section in the 406 positions file. 407 """ 408 409 # Return any visited but unrequested record. 410 411 if self.found_docnum is not None: 412 t = self.found_docnum, self.found_positions 413 self.found_docnum, self.found_positions = None, None 414 return t 415 416 # Or search for the next record. 417 418 while 1: 419 420 # Either return the next record. 421 422 try: 423 return self.iterator.next() 424 425 # Or, where a section is finished, get the next section and try again. 426 427 except StopIteration: 428 429 # Where a section follows, update the index iterator, but keep 430 # reading using the same file iterator (since the data should 431 # just follow on from the last section). 432 433 self._next_section() 434 self.iterator.replenish(self.section_count) 435 436 # Reset the state of the iterator to make sure that document 437 # numbers are correct. 438 439 self.iterator.reset() 440 441 def from_document(self, docnum): 442 443 """ 444 Attempt to navigate to a positions entry for the given 'docnum', 445 returning the positions for 'docnum', or None otherwise. 446 """ 447 448 # Return any unrequested document positions. 449 450 if docnum == self.found_docnum: 451 return self.found_positions 452 453 # Read ahead in the index until the next entry refers to a document 454 # later than the desired document. 455 456 try: 457 if self.next_docnum is None: 458 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 459 460 # Read until the next entry is after the desired document number, 461 # or until the end of the results. 462 463 while self.next_docnum <= docnum: 464 self._next_read_section() 465 if self.docnum < docnum: 466 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 467 else: 468 break 469 470 except StopIteration: 471 pass 472 473 # Navigate in the position file to the document. 474 475 self._init_section() 476 477 try: 478 while 1: 479 found_docnum, found_positions = self.iterator.next() 480 481 # Return the desired document positions or None (retaining the 482 # positions for the document immediately after). 483 484 if docnum == found_docnum: 485 return found_positions 486 elif docnum < found_docnum: 487 self.found_docnum, self.found_positions = found_docnum, found_positions 488 return None 489 490 except StopIteration: 491 return None 492 493 # Internal methods. 494 495 def _next_section(self): 496 497 "Attempt to get the next section in the index." 498 499 if self.next_docnum is None: 500 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 501 else: 502 self._next_read_section() 503 504 def _next_read_section(self): 505 506 """ 507 Make the next index entry the current one without reading from the 508 index. 509 """ 510 511 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 512 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 513 514 def _init_section(self): 515 516 "Initialise the iterator for the section in the position file." 517 518 # Attempt to reuse any correctly positioned iterator. 519 520 if self.iterator is not None: 521 i = self.iterator 522 i.replenish(self.section_count) 523 i.f.seek(self.pos_offset) 524 i.reset() 525 526 # Otherwise, obtain a new iterator. 527 528 else: 529 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 530 531 def close(self): 532 if self.iterator is not None: 533 self.iterator.close() 534 self.iterator = None 535 if self.index_iterator is not None: 536 self.index_iterator.close() 537 self.index_iterator = None 538 539 class ResetPositionDictionaryIterator: 540 541 """ 542 A helper class which permits the reuse of iterators without modifying their 543 state. 544 """ 545 546 def __init__(self, iterator, offset, doc_frequency): 547 self.iterator = iterator 548 self.offset = offset 549 self.doc_frequency = doc_frequency 550 551 def __iter__(self): 552 self.iterator.reset(self.offset, self.doc_frequency) 553 return iter(self.iterator) 554 555 # vim: tabstop=4 expandtab shiftwidth=4