1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = 0 31 32 def write_positions(self, docnum, positions): 33 34 """ 35 Write for the document 'docnum' the given 'positions'. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Make sure that the positions are sorted. 42 43 positions.sort() 44 45 # Write the document number delta. 46 # Write the number of positions. 47 48 output = array('B') 49 vint_to_array(docnum - self.last_docnum, output) 50 vint_to_array(len(positions), output) 51 52 # Write the position deltas. 53 54 last = 0 55 56 for position in positions: 57 vint_to_array(position - last, output) 58 last = position 59 60 output.tofile(self.f) 61 62 self.last_docnum = docnum 63 64 class PositionOpener(FileOpener): 65 66 "Reading position information from files." 67 68 def read_term_positions(self, offset, count): 69 70 """ 71 Read all positions from 'offset', seeking to that position in the file 72 before reading. The number of documents available for reading is limited 73 to 'count'. 74 """ 75 76 f = self.open("rb") 77 return PositionIterator(f, offset, count) 78 79 class PositionIndexWriter(FileWriter): 80 81 "Writing position index information to files." 82 83 def reset(self): 84 self.last_docnum = 0 85 self.last_pos_offset = 0 86 87 def write_positions(self, docnum, pos_offset, count): 88 89 """ 90 Write the given 'docnum, 'pos_offset' and document 'count' to the 91 position index file. 92 """ 93 94 # Write the document number delta. 95 # Write the position file offset delta. 96 # Write the document count. 97 98 output = array('B') 99 vint_to_array(docnum - self.last_docnum, output) 100 vint_to_array(pos_offset - self.last_pos_offset, output) 101 vint_to_array(count, output) 102 103 # Actually write the data. 104 105 output.tofile(self.f) 106 107 self.last_pos_offset = pos_offset 108 self.last_docnum = docnum 109 110 class PositionIndexOpener(FileOpener): 111 112 "Reading position index information from files." 113 114 def read_term_positions(self, offset, doc_frequency): 115 116 """ 117 Read all positions from 'offset', seeking to that position in the file 118 before reading. The number of documents available for reading is limited 119 to 'doc_frequency'. 120 """ 121 122 f = self.open("rb") 123 return PositionIndexIterator(f, offset, doc_frequency) 124 125 # Iterators for position-related files. 126 127 class IteratorBase: 128 129 def __init__(self, count): 130 self.replenish(count) 131 132 def replenish(self, count): 133 self.count = count 134 self.read_documents = 0 135 136 def __len__(self): 137 return self.count 138 139 def sort(self): 140 pass # Stored document positions are already sorted. 141 142 def __iter__(self): 143 return self 144 145 class PositionIterator(FileReader, IteratorBase): 146 147 "Iterating over document positions." 148 149 def __init__(self, f, offset, count): 150 FileReader.__init__(self, f) 151 IteratorBase.__init__(self, count) 152 self.f.seek(offset) 153 154 def reset(self): 155 self.last_docnum = 0 156 157 def read_positions(self): 158 159 "Read positions, returning a document number and a list of positions." 160 161 # Read the document number delta and add it to the last number. 162 163 self.last_docnum += self.read_number() 164 165 # Read the number of positions. 166 167 npositions = self.read_number() 168 169 # Read the position deltas, adding each previous position to get the 170 # appropriate collection of absolute positions. 171 172 i = 0 173 last = 0 174 positions = [] 175 176 while i < npositions: 177 last += self.read_number() 178 positions.append(last) 179 i += 1 180 181 return self.last_docnum, positions 182 183 def next(self): 184 185 "Read positions for a single document." 186 187 if self.read_documents < self.count: 188 self.read_documents += 1 189 return self.read_positions() 190 else: 191 raise StopIteration 192 193 class PositionIndexIterator(FileReader, IteratorBase): 194 195 "Iterating over document positions." 196 197 def __init__(self, f, offset, count): 198 FileReader.__init__(self, f) 199 IteratorBase.__init__(self, count) 200 self.f.seek(offset) 201 202 def reset(self): 203 self.last_docnum = 0 204 self.last_pos_offset = 0 205 self.section_count = 0 206 207 def read_positions(self): 208 209 """ 210 Read a document number, a position file offset for the position index 211 file, and the number of documents in a section of that file. 212 """ 213 214 # Read the document number delta. 215 216 self.last_docnum += self.read_number() 217 218 # Read the offset delta. 219 220 self.last_pos_offset += self.read_number() 221 222 # Read the document count. 223 224 count = self.read_number() 225 226 return self.last_docnum, self.last_pos_offset, count 227 228 def next(self): 229 230 "Read positions for a single document." 231 232 self.read_documents += self.section_count 233 if self.read_documents < self.count: 234 docnum, pos_offset, self.section_count = t = self.read_positions() 235 return t 236 else: 237 #assert self.read_documents == self.count # not upheld by from_document 238 raise StopIteration 239 240 class PositionDictionaryWriter: 241 242 "Writing position dictionaries." 243 244 def __init__(self, position_writer, position_index_writer, interval): 245 self.position_writer = position_writer 246 self.position_index_writer = position_index_writer 247 self.interval = interval 248 249 def write_term_positions(self, doc_positions): 250 251 """ 252 Write all 'doc_positions' - a collection of tuples of the form (document 253 number, position list) - to the file. 254 255 Add some records to the index, making dictionary entries. 256 257 Return a tuple containing the offset of the written data, the frequency 258 (number of positions), and document frequency (number of documents) for 259 the term involved. 260 """ 261 262 # Reset the writers. 263 264 self.position_writer.reset() 265 self.position_index_writer.reset() 266 267 # Remember the first index entry offset. 268 269 index_offset = self.position_index_writer.f.tell() 270 271 # Write the positions. 272 273 frequency = 0 274 count = 0 275 276 if doc_positions: 277 278 # Retain the first record offset for a subsequent index entry. 279 280 first_offset = self.position_writer.f.tell() 281 first_docnum = None 282 283 doc_positions.sort() 284 285 for docnum, positions in doc_positions: 286 if first_docnum is None: 287 first_docnum = docnum 288 289 self.position_writer.write_positions(docnum, positions) 290 291 frequency += len(positions) 292 count += 1 293 294 # Every {interval} entries, write an index entry. 295 296 if count % self.interval == 0: 297 298 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 299 300 first_offset = self.position_writer.f.tell() 301 first_docnum = None 302 303 # Reset the position writer so that position readers accessing 304 # a section start with the correct document number. 305 306 self.position_writer.reset() 307 308 # Finish writing an index entry for the remaining documents. 309 310 else: 311 if first_docnum is not None: 312 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 313 314 return index_offset, frequency, count 315 316 def close(self): 317 self.position_writer.close() 318 self.position_index_writer.close() 319 320 class PositionDictionaryReader: 321 322 "Reading position dictionaries." 323 324 def __init__(self, position_opener, position_index_opener): 325 self.position_opener = position_opener 326 self.position_index_opener = position_index_opener 327 328 def read_term_positions(self, offset, doc_frequency): 329 330 """ 331 Return an iterator for dictionary entries starting at 'offset' with the 332 given 'doc_frequency'. 333 """ 334 335 return PositionDictionaryIterator(self.position_opener, 336 self.position_index_opener, offset, doc_frequency) 337 338 def close(self): 339 pass 340 341 class PositionDictionaryIterator: 342 343 "Iteration over position dictionary entries." 344 345 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 346 self.position_opener = position_opener 347 self.position_index_opener = position_index_opener 348 self.doc_frequency = doc_frequency 349 350 self.index_iterator = None 351 self.iterator = None 352 353 # Initialise the iterators. 354 355 self.reset(offset, doc_frequency) 356 357 def reset(self, offset, doc_frequency): 358 359 # Remember the last values. 360 361 self.found_docnum, self.found_positions = None, None 362 363 # Attempt to reuse the index iterator. 364 365 if self.index_iterator is not None: 366 ii = self.index_iterator 367 ii.replenish(doc_frequency) 368 ii.f.seek(offset) 369 ii.reset() 370 371 # Or make a new index iterator. 372 373 else: 374 self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency) 375 376 # Maintain state for the next index entry, if read. 377 378 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 379 380 # Initialise the current index entry and current position file iterator. 381 382 self._next_section() 383 self._init_section() 384 385 # Sequence methods. 386 387 def __len__(self): 388 return self.doc_frequency 389 390 def sort(self): 391 pass 392 393 # Iterator methods. 394 395 def __iter__(self): 396 return self 397 398 def next(self): 399 400 """ 401 Attempt to get the next document record from the section in the 402 positions file. 403 """ 404 405 # Return any visited but unrequested record. 406 407 if self.found_docnum is not None: 408 t = self.found_docnum, self.found_positions 409 self.found_docnum, self.found_positions = None, None 410 return t 411 412 # Or search for the next record. 413 414 while 1: 415 416 # Either return the next record. 417 418 try: 419 return self.iterator.next() 420 421 # Or, where a section is finished, get the next section and try again. 422 423 except StopIteration: 424 425 # Where a section follows, update the index iterator, but keep 426 # reading using the same file iterator (since the data should 427 # just follow on from the last section). 428 429 self._next_section() 430 self.iterator.replenish(self.section_count) 431 432 # Reset the state of the iterator to make sure that document 433 # numbers are correct. 434 435 self.iterator.reset() 436 437 def from_document(self, docnum): 438 439 """ 440 Attempt to navigate to a positions entry for the given 'docnum', 441 returning the positions for 'docnum', or None otherwise. 442 """ 443 444 # Return any unrequested document positions. 445 446 if docnum == self.found_docnum: 447 return self.found_positions 448 449 # Read ahead in the index until the next entry refers to a document 450 # later than the desired document. 451 452 try: 453 if self.next_docnum is None: 454 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 455 456 # Read until the next entry is after the desired document number, 457 # or until the end of the results. 458 459 while self.next_docnum <= docnum: 460 self._next_read_section() 461 if self.docnum < docnum: 462 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 463 else: 464 break 465 466 except StopIteration: 467 pass 468 469 # Navigate in the position file to the document. 470 471 self._init_section() 472 473 try: 474 while 1: 475 found_docnum, found_positions = self.iterator.next() 476 477 # Return the desired document positions or None (retaining the 478 # positions for the document immediately after). 479 480 if docnum == found_docnum: 481 return found_positions 482 elif docnum < found_docnum: 483 self.found_docnum, self.found_positions = found_docnum, found_positions 484 return None 485 486 except StopIteration: 487 return None 488 489 # Internal methods. 490 491 def _next_section(self): 492 493 "Attempt to get the next section in the index." 494 495 if self.next_docnum is None: 496 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 497 else: 498 self._next_read_section() 499 500 def _next_read_section(self): 501 502 """ 503 Make the next index entry the current one without reading from the 504 index. 505 """ 506 507 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 508 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 509 510 def _init_section(self): 511 512 "Initialise the iterator for the section in the position file." 513 514 # Attempt to reuse any correctly positioned iterator. 515 516 if self.iterator is not None: 517 i = self.iterator 518 i.replenish(self.section_count) 519 i.f.seek(self.pos_offset) 520 i.reset() 521 522 # Otherwise, obtain a new iterator. 523 524 else: 525 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 526 527 def close(self): 528 if self.iterator is not None: 529 self.iterator.close() 530 self.iterator = None 531 if self.index_iterator is not None: 532 self.index_iterator.close() 533 self.index_iterator = None 534 535 class ResetPositionDictionaryIterator: 536 537 """ 538 A helper class which permits the reuse of iterators without modifying their 539 state. 540 """ 541 542 def __init__(self, iterator, offset, doc_frequency): 543 self.iterator = iterator 544 self.offset = offset 545 self.doc_frequency = doc_frequency 546 547 def __iter__(self): 548 self.iterator.reset(self.offset, self.doc_frequency) 549 return iter(self.iterator) 550 551 # vim: tabstop=4 expandtab shiftwidth=4