1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint 23 24 class PositionWriter(FileWriter): 25 26 "Writing position information to files." 27 28 def reset(self): 29 self.last_docnum = 0 30 31 def write_positions(self, docnum, positions): 32 33 """ 34 Write for the document 'docnum' the given 'positions'. 35 """ 36 37 if docnum < self.last_docnum: 38 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 39 40 # Make sure that the positions are sorted. 41 42 positions.sort() 43 44 # Write the document number delta. 45 # Write the number of positions. 46 47 output = [docnum - self.last_docnum, len(positions)] 48 49 # Write the position deltas. 50 51 append = output.append 52 last = 0 53 54 for position in positions: 55 append(position - last) 56 last = position 57 58 self.f.write("".join([vint(x) for x in output])) 59 60 self.last_docnum = docnum 61 62 class PositionOpener(FileOpener): 63 64 "Reading position information from files." 65 66 def read_term_positions(self, offset, count): 67 68 """ 69 Read all positions from 'offset', seeking to that position in the file 70 before reading. The number of documents available for reading is limited 71 to 'count'. 72 """ 73 74 # Duplicate the file handle. 75 76 f = self.open("rb") 77 return PositionIterator(f, offset, count) 78 79 class PositionIndexWriter(FileWriter): 80 81 "Writing position index information to files." 82 83 def reset(self): 84 self.last_docnum = 0 85 self.last_pos_offset = 0 86 87 def write_positions(self, docnum, pos_offset, count): 88 89 """ 90 Write the given 'docnum, 'pos_offset' and document 'count' to the 91 position index file. 92 """ 93 94 # Write the document number delta. 95 # Write the position file offset delta. 96 # Write the document count. 97 98 output = [docnum - self.last_docnum, pos_offset - self.last_pos_offset, count] 99 100 # Actually write the data. 101 102 self.f.write("".join([vint(x) for x in output])) 103 104 self.last_pos_offset = pos_offset 105 self.last_docnum = docnum 106 107 class PositionIndexOpener(FileOpener): 108 109 "Reading position index information from files." 110 111 def read_term_positions(self, offset, doc_frequency): 112 113 """ 114 Read all positions from 'offset', seeking to that position in the file 115 before reading. The number of documents available for reading is limited 116 to 'doc_frequency'. 117 """ 118 119 # Duplicate the file handle. 120 121 f = self.open("rb") 122 return PositionIndexIterator(f, offset, doc_frequency) 123 124 # Iterators for position-related files. 125 126 class IteratorBase: 127 128 def __init__(self, count): 129 self.replenish(count) 130 131 def replenish(self, count): 132 self.count = count 133 self.read_documents = 0 134 135 def __len__(self): 136 return self.count 137 138 def sort(self): 139 pass # Stored document positions are already sorted. 140 141 def __iter__(self): 142 return self 143 144 class PositionIterator(FileReader, IteratorBase): 145 146 "Iterating over document positions." 147 148 def __init__(self, f, offset, count): 149 FileReader.__init__(self, f) 150 IteratorBase.__init__(self, count) 151 self.f.seek(offset) 152 153 def reset(self): 154 self.last_docnum = 0 155 156 def read_positions(self): 157 158 "Read positions, returning a document number and a list of positions." 159 160 # Read the document number delta and add it to the last number. 161 162 self.last_docnum += self.read_number() 163 164 # Read the number of positions. 165 166 npositions = self.read_number() 167 168 # Read the position deltas, adding each previous position to get the 169 # appropriate collection of absolute positions. 170 171 i = 0 172 last = 0 173 positions = [] 174 175 while i < npositions: 176 last += self.read_number() 177 positions.append(last) 178 i += 1 179 180 return self.last_docnum, positions 181 182 def next(self): 183 184 "Read positions for a single document." 185 186 if self.read_documents < self.count: 187 self.read_documents += 1 188 return self.read_positions() 189 else: 190 raise StopIteration 191 192 class PositionIndexIterator(FileReader, IteratorBase): 193 194 "Iterating over document positions." 195 196 def __init__(self, f, offset, count): 197 FileReader.__init__(self, f) 198 IteratorBase.__init__(self, count) 199 self.f.seek(offset) 200 201 def reset(self): 202 self.last_docnum = 0 203 self.last_pos_offset = 0 204 self.section_count = 0 205 206 def read_positions(self): 207 208 """ 209 Read a document number, a position file offset for the position index 210 file, and the number of documents in a section of that file. 211 """ 212 213 # Read the document number delta. 214 215 self.last_docnum += self.read_number() 216 217 # Read the offset delta. 218 219 self.last_pos_offset += self.read_number() 220 221 # Read the document count. 222 223 count = self.read_number() 224 225 return self.last_docnum, self.last_pos_offset, count 226 227 def next(self): 228 229 "Read positions for a single document." 230 231 self.read_documents += self.section_count 232 if self.read_documents < self.count: 233 docnum, pos_offset, self.section_count = t = self.read_positions() 234 return t 235 else: 236 assert self.read_documents == self.count 237 raise StopIteration 238 239 class PositionDictionaryWriter: 240 241 "Writing position dictionaries." 242 243 def __init__(self, position_writer, position_index_writer, interval): 244 self.position_writer = position_writer 245 self.position_index_writer = position_index_writer 246 self.interval = interval 247 248 def write_term_positions(self, doc_positions): 249 250 """ 251 Write all 'doc_positions' - a collection of tuples of the form (document 252 number, position list) - to the file. 253 254 Add some records to the index, making dictionary entries. 255 256 Return a tuple containing the offset of the written data, the frequency 257 (number of positions), and document frequency (number of documents) for 258 the term involved. 259 """ 260 261 # Reset the writers. 262 263 self.position_writer.reset() 264 self.position_index_writer.reset() 265 266 # Remember the first index entry offset. 267 268 index_offset = self.position_index_writer.f.tell() 269 270 # Write the positions. 271 272 frequency = 0 273 count = 0 274 275 if doc_positions: 276 277 # Retain the first record offset for a subsequent index entry. 278 279 first_offset = self.position_writer.f.tell() 280 first_docnum = None 281 282 doc_positions.sort() 283 284 for docnum, positions in doc_positions: 285 if first_docnum is None: 286 first_docnum = docnum 287 288 self.position_writer.write_positions(docnum, positions) 289 290 frequency += len(positions) 291 count += 1 292 293 # Every {interval} entries, write an index entry. 294 295 if count % self.interval == 0: 296 297 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 298 299 first_offset = self.position_writer.f.tell() 300 first_docnum = None 301 302 # Reset the position writer so that position readers accessing 303 # a section start with the correct document number. 304 305 self.position_writer.reset() 306 307 # Finish writing an index entry for the remaining documents. 308 309 else: 310 if first_docnum is not None: 311 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 312 313 return index_offset, frequency, count 314 315 def close(self): 316 self.position_writer.close() 317 self.position_index_writer.close() 318 319 class PositionDictionaryReader: 320 321 "Reading position dictionaries." 322 323 def __init__(self, position_opener, position_index_opener): 324 self.position_opener = position_opener 325 self.position_index_opener = position_index_opener 326 327 def read_term_positions(self, offset, doc_frequency): 328 329 """ 330 Return an iterator for dictionary entries starting at 'offset' with the 331 given 'doc_frequency'. 332 """ 333 334 return PositionDictionaryIterator(self.position_opener, 335 self.position_index_opener, offset, doc_frequency) 336 337 def close(self): 338 pass 339 340 class PositionDictionaryIterator: 341 342 "Iteration over position dictionary entries." 343 344 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 345 self.position_opener = position_opener 346 self.position_index_opener = position_index_opener 347 self.doc_frequency = doc_frequency 348 349 self.index_iterator = None 350 self.iterator = None 351 352 # Initialise the iterators. 353 354 self.reset(offset, doc_frequency) 355 356 def reset(self, offset, doc_frequency): 357 358 # Remember the last values. 359 360 self.found_docnum, self.found_positions = None, None 361 362 # Attempt to reuse the index iterator. 363 364 if self.index_iterator is not None: 365 ii = self.index_iterator 366 ii.replenish(doc_frequency) 367 ii.f.seek(offset) 368 ii.reset() 369 370 # Or make a new index iterator. 371 372 else: 373 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 374 375 # Maintain state for the next index entry, if read. 376 377 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 378 379 # Initialise the current index entry and current position file iterator. 380 381 self._next_section() 382 self._init_section() 383 384 # Sequence methods. 385 386 def __len__(self): 387 return self.doc_frequency 388 389 def sort(self): 390 pass 391 392 # Iterator methods. 393 394 def __iter__(self): 395 return self 396 397 def next(self): 398 399 """ 400 Attempt to get the next document record from the section in the 401 positions file. 402 """ 403 404 # Return any visited but unrequested record. 405 406 if self.found_docnum is not None: 407 t = self.found_docnum, self.found_positions 408 self.found_docnum, self.found_positions = None, None 409 return t 410 411 # Or search for the next record. 412 413 while 1: 414 415 # Either return the next record. 416 417 try: 418 return self.iterator.next() 419 420 # Or, where a section is finished, get the next section and try again. 421 422 except StopIteration: 423 424 # Where a section follows, update the index iterator, but keep 425 # reading using the same file iterator (since the data should 426 # just follow on from the last section). 427 428 self._next_section() 429 self.iterator.replenish(self.section_count) 430 431 # Reset the state of the iterator to make sure that document 432 # numbers are correct. 433 434 self.iterator.reset() 435 436 def from_document(self, docnum): 437 438 """ 439 Attempt to navigate to a positions entry for the given 'docnum', 440 returning the positions for 'docnum', or None otherwise. 441 """ 442 443 # Return any unrequested document positions. 444 445 if docnum == self.found_docnum: 446 return self.found_positions 447 448 # Read ahead in the index until the next entry refers to a document 449 # later than the desired document. 450 451 try: 452 if self.next_docnum is None: 453 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 454 455 # Read until the next entry is after the desired document number, 456 # or until the end of the results. 457 458 while self.next_docnum <= docnum: 459 self._next_read_section() 460 if self.docnum < docnum: 461 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 462 else: 463 break 464 465 except StopIteration: 466 pass 467 468 # Navigate in the position file to the document. 469 470 self._init_section() 471 472 try: 473 while 1: 474 found_docnum, found_positions = self.iterator.next() 475 476 # Return the desired document positions or None (retaining the 477 # positions for the document immediately after). 478 479 if docnum == found_docnum: 480 return found_positions 481 elif docnum < found_docnum: 482 self.found_docnum, self.found_positions = found_docnum, found_positions 483 return None 484 485 except StopIteration: 486 return None 487 488 # Internal methods. 489 490 def _next_section(self): 491 492 "Attempt to get the next section in the index." 493 494 if self.next_docnum is None: 495 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 496 else: 497 self._next_read_section() 498 499 def _next_read_section(self): 500 501 """ 502 Make the next index entry the current one without reading from the 503 index. 504 """ 505 506 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 507 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 508 509 def _init_section(self): 510 511 "Initialise the iterator for the section in the position file." 512 513 # Attempt to reuse any correctly positioned iterator. 514 515 if self.iterator is not None: 516 i = self.iterator 517 i.replenish(self.section_count) 518 i.f.seek(self.pos_offset) 519 i.reset() 520 521 # Otherwise, obtain a new iterator. 522 523 else: 524 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 525 526 def close(self): 527 if self.iterator is not None: 528 self.iterator.close() 529 self.iterator = None 530 if self.index_iterator is not None: 531 self.index_iterator.close() 532 self.index_iterator = None 533 534 class ResetPositionDictionaryIterator: 535 536 """ 537 A helper class which permits the reuse of iterators without modifying their 538 state. 539 """ 540 541 def __init__(self, iterator, offset, doc_frequency): 542 self.iterator = iterator 543 self.offset = offset 544 self.doc_frequency = doc_frequency 545 546 def __iter__(self): 547 self.iterator.reset(self.offset, self.doc_frequency) 548 return iter(self.iterator) 549 550 # vim: tabstop=4 expandtab shiftwidth=4