1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = 0 31 32 def write_positions(self, docnum, positions): 33 34 """ 35 Write for the document 'docnum' the given 'positions'. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Make sure that the positions are sorted. 42 43 positions.sort() 44 45 # Write the document number delta. 46 # Write the number of positions. 47 48 output = array('B') 49 vint_to_array(docnum - self.last_docnum, output) 50 vint_to_array(len(positions), output) 51 52 # Write the position deltas. 53 54 last = 0 55 56 for position in positions: 57 vint_to_array(position - last, output) 58 last = position 59 60 output.tofile(self.f) 61 62 self.last_docnum = docnum 63 64 class PositionOpener(FileOpener): 65 66 "Reading position information from files." 67 68 def read_term_positions(self, offset, count): 69 70 """ 71 Read all positions from 'offset', seeking to that position in the file 72 before reading. The number of documents available for reading is limited 73 to 'count'. 74 """ 75 76 # Duplicate the file handle. 77 78 f = self.open("rb") 79 return PositionIterator(f, offset, count) 80 81 class PositionIndexWriter(FileWriter): 82 83 "Writing position index information to files." 84 85 def reset(self): 86 self.last_docnum = 0 87 self.last_pos_offset = 0 88 89 def write_positions(self, docnum, pos_offset, count): 90 91 """ 92 Write the given 'docnum, 'pos_offset' and document 'count' to the 93 position index file. 94 """ 95 96 # Write the document number delta. 97 # Write the position file offset delta. 98 # Write the document count. 99 100 output = [docnum - self.last_docnum, pos_offset - self.last_pos_offset, count] 101 102 # Actually write the data. 103 104 self.f.write("".join([vint(x) for x in output])) 105 106 self.last_pos_offset = pos_offset 107 self.last_docnum = docnum 108 109 class PositionIndexOpener(FileOpener): 110 111 "Reading position index information from files." 112 113 def read_term_positions(self, offset, doc_frequency): 114 115 """ 116 Read all positions from 'offset', seeking to that position in the file 117 before reading. The number of documents available for reading is limited 118 to 'doc_frequency'. 119 """ 120 121 # Duplicate the file handle. 122 123 f = self.open("rb") 124 return PositionIndexIterator(f, offset, doc_frequency) 125 126 # Iterators for position-related files. 127 128 class IteratorBase: 129 130 def __init__(self, count): 131 self.replenish(count) 132 133 def replenish(self, count): 134 self.count = count 135 self.read_documents = 0 136 137 def __len__(self): 138 return self.count 139 140 def sort(self): 141 pass # Stored document positions are already sorted. 142 143 def __iter__(self): 144 return self 145 146 class PositionIterator(FileReader, IteratorBase): 147 148 "Iterating over document positions." 149 150 def __init__(self, f, offset, count): 151 FileReader.__init__(self, f) 152 IteratorBase.__init__(self, count) 153 self.f.seek(offset) 154 155 def reset(self): 156 self.last_docnum = 0 157 158 def read_positions(self): 159 160 "Read positions, returning a document number and a list of positions." 161 162 # Read the document number delta and add it to the last number. 163 164 self.last_docnum += self.read_number() 165 166 # Read the number of positions. 167 168 npositions = self.read_number() 169 170 # Read the position deltas, adding each previous position to get the 171 # appropriate collection of absolute positions. 172 173 i = 0 174 last = 0 175 positions = [] 176 177 while i < npositions: 178 last += self.read_number() 179 positions.append(last) 180 i += 1 181 182 return self.last_docnum, positions 183 184 def next(self): 185 186 "Read positions for a single document." 187 188 if self.read_documents < self.count: 189 self.read_documents += 1 190 return self.read_positions() 191 else: 192 raise StopIteration 193 194 class PositionIndexIterator(FileReader, IteratorBase): 195 196 "Iterating over document positions." 197 198 def __init__(self, f, offset, count): 199 FileReader.__init__(self, f) 200 IteratorBase.__init__(self, count) 201 self.f.seek(offset) 202 203 def reset(self): 204 self.last_docnum = 0 205 self.last_pos_offset = 0 206 self.section_count = 0 207 208 def read_positions(self): 209 210 """ 211 Read a document number, a position file offset for the position index 212 file, and the number of documents in a section of that file. 213 """ 214 215 # Read the document number delta. 216 217 self.last_docnum += self.read_number() 218 219 # Read the offset delta. 220 221 self.last_pos_offset += self.read_number() 222 223 # Read the document count. 224 225 count = self.read_number() 226 227 return self.last_docnum, self.last_pos_offset, count 228 229 def next(self): 230 231 "Read positions for a single document." 232 233 self.read_documents += self.section_count 234 if self.read_documents < self.count: 235 docnum, pos_offset, self.section_count = t = self.read_positions() 236 return t 237 else: 238 assert self.read_documents == self.count 239 raise StopIteration 240 241 class PositionDictionaryWriter: 242 243 "Writing position dictionaries." 244 245 def __init__(self, position_writer, position_index_writer, interval): 246 self.position_writer = position_writer 247 self.position_index_writer = position_index_writer 248 self.interval = interval 249 250 def write_term_positions(self, doc_positions): 251 252 """ 253 Write all 'doc_positions' - a collection of tuples of the form (document 254 number, position list) - to the file. 255 256 Add some records to the index, making dictionary entries. 257 258 Return a tuple containing the offset of the written data, the frequency 259 (number of positions), and document frequency (number of documents) for 260 the term involved. 261 """ 262 263 # Reset the writers. 264 265 self.position_writer.reset() 266 self.position_index_writer.reset() 267 268 # Remember the first index entry offset. 269 270 index_offset = self.position_index_writer.f.tell() 271 272 # Write the positions. 273 274 frequency = 0 275 count = 0 276 277 if doc_positions: 278 279 # Retain the first record offset for a subsequent index entry. 280 281 first_offset = self.position_writer.f.tell() 282 first_docnum = None 283 284 doc_positions.sort() 285 286 for docnum, positions in doc_positions: 287 if first_docnum is None: 288 first_docnum = docnum 289 290 self.position_writer.write_positions(docnum, positions) 291 292 frequency += len(positions) 293 count += 1 294 295 # Every {interval} entries, write an index entry. 296 297 if count % self.interval == 0: 298 299 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 300 301 first_offset = self.position_writer.f.tell() 302 first_docnum = None 303 304 # Reset the position writer so that position readers accessing 305 # a section start with the correct document number. 306 307 self.position_writer.reset() 308 309 # Finish writing an index entry for the remaining documents. 310 311 else: 312 if first_docnum is not None: 313 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 314 315 return index_offset, frequency, count 316 317 def close(self): 318 self.position_writer.close() 319 self.position_index_writer.close() 320 321 class PositionDictionaryReader: 322 323 "Reading position dictionaries." 324 325 def __init__(self, position_opener, position_index_opener): 326 self.position_opener = position_opener 327 self.position_index_opener = position_index_opener 328 329 def read_term_positions(self, offset, doc_frequency): 330 331 """ 332 Return an iterator for dictionary entries starting at 'offset' with the 333 given 'doc_frequency'. 334 """ 335 336 return PositionDictionaryIterator(self.position_opener, 337 self.position_index_opener, offset, doc_frequency) 338 339 def close(self): 340 pass 341 342 class PositionDictionaryIterator: 343 344 "Iteration over position dictionary entries." 345 346 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 347 self.position_opener = position_opener 348 self.position_index_opener = position_index_opener 349 self.doc_frequency = doc_frequency 350 351 self.index_iterator = None 352 self.iterator = None 353 354 # Initialise the iterators. 355 356 self.reset(offset, doc_frequency) 357 358 def reset(self, offset, doc_frequency): 359 360 # Remember the last values. 361 362 self.found_docnum, self.found_positions = None, None 363 364 # Attempt to reuse the index iterator. 365 366 if self.index_iterator is not None: 367 ii = self.index_iterator 368 ii.replenish(doc_frequency) 369 ii.f.seek(offset) 370 ii.reset() 371 372 # Or make a new index iterator. 373 374 else: 375 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 376 377 # Maintain state for the next index entry, if read. 378 379 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 380 381 # Initialise the current index entry and current position file iterator. 382 383 self._next_section() 384 self._init_section() 385 386 # Sequence methods. 387 388 def __len__(self): 389 return self.doc_frequency 390 391 def sort(self): 392 pass 393 394 # Iterator methods. 395 396 def __iter__(self): 397 return self 398 399 def next(self): 400 401 """ 402 Attempt to get the next document record from the section in the 403 positions file. 404 """ 405 406 # Return any visited but unrequested record. 407 408 if self.found_docnum is not None: 409 t = self.found_docnum, self.found_positions 410 self.found_docnum, self.found_positions = None, None 411 return t 412 413 # Or search for the next record. 414 415 while 1: 416 417 # Either return the next record. 418 419 try: 420 return self.iterator.next() 421 422 # Or, where a section is finished, get the next section and try again. 423 424 except StopIteration: 425 426 # Where a section follows, update the index iterator, but keep 427 # reading using the same file iterator (since the data should 428 # just follow on from the last section). 429 430 self._next_section() 431 self.iterator.replenish(self.section_count) 432 433 # Reset the state of the iterator to make sure that document 434 # numbers are correct. 435 436 self.iterator.reset() 437 438 def from_document(self, docnum): 439 440 """ 441 Attempt to navigate to a positions entry for the given 'docnum', 442 returning the positions for 'docnum', or None otherwise. 443 """ 444 445 # Return any unrequested document positions. 446 447 if docnum == self.found_docnum: 448 return self.found_positions 449 450 # Read ahead in the index until the next entry refers to a document 451 # later than the desired document. 452 453 try: 454 if self.next_docnum is None: 455 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 456 457 # Read until the next entry is after the desired document number, 458 # or until the end of the results. 459 460 while self.next_docnum <= docnum: 461 self._next_read_section() 462 if self.docnum < docnum: 463 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 464 else: 465 break 466 467 except StopIteration: 468 pass 469 470 # Navigate in the position file to the document. 471 472 self._init_section() 473 474 try: 475 while 1: 476 found_docnum, found_positions = self.iterator.next() 477 478 # Return the desired document positions or None (retaining the 479 # positions for the document immediately after). 480 481 if docnum == found_docnum: 482 return found_positions 483 elif docnum < found_docnum: 484 self.found_docnum, self.found_positions = found_docnum, found_positions 485 return None 486 487 except StopIteration: 488 return None 489 490 # Internal methods. 491 492 def _next_section(self): 493 494 "Attempt to get the next section in the index." 495 496 if self.next_docnum is None: 497 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 498 else: 499 self._next_read_section() 500 501 def _next_read_section(self): 502 503 """ 504 Make the next index entry the current one without reading from the 505 index. 506 """ 507 508 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 509 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 510 511 def _init_section(self): 512 513 "Initialise the iterator for the section in the position file." 514 515 # Attempt to reuse any correctly positioned iterator. 516 517 if self.iterator is not None: 518 i = self.iterator 519 i.replenish(self.section_count) 520 i.f.seek(self.pos_offset) 521 i.reset() 522 523 # Otherwise, obtain a new iterator. 524 525 else: 526 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 527 528 def close(self): 529 if self.iterator is not None: 530 self.iterator.close() 531 self.iterator = None 532 if self.index_iterator is not None: 533 self.index_iterator.close() 534 self.index_iterator = None 535 536 class ResetPositionDictionaryIterator: 537 538 """ 539 A helper class which permits the reuse of iterators without modifying their 540 state. 541 """ 542 543 def __init__(self, iterator, offset, doc_frequency): 544 self.iterator = iterator 545 self.offset = offset 546 self.doc_frequency = doc_frequency 547 548 def __iter__(self): 549 self.iterator.reset(self.offset, self.doc_frequency) 550 return iter(self.iterator) 551 552 # vim: tabstop=4 expandtab shiftwidth=4