1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.data import * 22 from iixr.files import * 23 24 class PositionWriter(FileWriter): 25 26 "Writing position information to files." 27 28 def begin(self, docnum_size, position_size): 29 self.write_numbers((docnum_size, position_size)) 30 self.end_record() 31 self.data_start = self.tell() 32 self.docnum_size = docnum_size 33 self.position_size = position_size 34 35 def reset(self): 36 self.end_record() 37 self.last_docnum = None 38 self.subtractor = None 39 40 def write_positions(self, docnum, positions): 41 42 """ 43 Write for the document 'docnum' the given 'positions'. 44 """ 45 46 if not positions: 47 return 48 49 # Make sure that the positions are sorted. 50 51 positions.sort() 52 53 # Calculate an ongoing delta. 54 55 if self.last_docnum is not None: 56 if docnum < self.last_docnum: 57 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 58 59 docnum_seq = self.subtractor(docnum, self.last_docnum) 60 61 # Or preserve the document number and prepare for future deltas. 62 63 else: 64 self.subtractor = get_subtractor(docnum) 65 docnum_seq = docnum 66 67 self.write_sequence_value(docnum_seq, self.docnum_size) 68 self.write_monotonic_sequence(positions, self.position_size) 69 70 self.last_docnum = docnum 71 72 class PositionReader(FileReader): 73 74 "Reading position information within term-specific regions of a file." 75 76 def begin(self): 77 self.begin_record() 78 try: 79 self.docnum_size, self.position_size = self.read_numbers(2) 80 except EOFError: 81 self.docnum_size, self.position_size = 0, 0 # NOTE: No positions! 82 self.data_start = self.tell() 83 84 def reset(self): 85 self.last_docnum = None 86 self.adder = None 87 self.begin_record() 88 89 def read_positions(self): 90 91 """ 92 Read positions, returning a document number and a list of positions. 93 """ 94 95 # Read the document number. 96 97 docnum = self.read_sequence_value(self.docnum_size) 98 99 # Calculate an ongoing delta. 100 101 if self.last_docnum is not None: 102 self.last_docnum = self.adder(docnum, self.last_docnum) 103 104 # Or preserve the document number and prepare for future deltas. 105 106 else: 107 self.adder = get_adder(docnum) 108 self.last_docnum = docnum 109 110 positions = self.read_monotonic_sequence(self.position_size) 111 112 return self.last_docnum, positions 113 114 class PositionIndexWriter(PositionWriter): 115 116 "Writing position index information to files." 117 118 def begin(self, docnum_size): 119 PositionWriter.begin(self, docnum_size, 0) 120 121 def reset(self): 122 PositionWriter.reset(self) 123 self.last_pos_offset = 0 124 125 def write_positions(self, docnum, pos_offset, count): 126 127 """ 128 Write the given 'docnum, 'pos_offset' and document 'count' to the 129 position index file. 130 """ 131 132 # Find the size of document number values. 133 134 if self.last_docnum is not None: 135 docnum_seq = self.subtractor(docnum, self.last_docnum) 136 else: 137 self.subtractor = get_subtractor(docnum) 138 docnum_seq = docnum 139 140 self.write_sequence_value(docnum_seq, self.docnum_size) 141 self.write_number(pos_offset - self.last_pos_offset) 142 self.write_number(count) 143 144 self.last_docnum = docnum 145 self.last_pos_offset = pos_offset 146 147 class PositionIndexReader(PositionReader): 148 149 "Reading position index information within term-specific regions of a file." 150 151 def reset(self): 152 PositionReader.reset(self) 153 self.last_pos_offset = 0 154 155 def read_positions(self): 156 157 """ 158 Read a document number, a position file offset for the position index 159 file, and the number of documents in a section of that file. 160 """ 161 162 # Read the document number. 163 164 docnum = self.read_sequence_value(self.docnum_size) 165 166 if self.last_docnum is not None: 167 self.last_docnum = self.adder(docnum, self.last_docnum) 168 else: 169 self.adder = get_adder(docnum) 170 self.last_docnum = docnum 171 172 # Read the offset delta. 173 174 self.last_pos_offset += self.read_number() 175 176 # Read the document count. 177 178 count = self.read_number() 179 180 return self.last_docnum, self.last_pos_offset, count 181 182 # Iterators for position-related files. 183 184 class IteratorBase: 185 186 "Support for iterating over results." 187 188 def __init__(self, reader): 189 190 "Initialise the iterator using the given 'reader'." 191 192 self.reader = reader 193 self.replenish(0) # no iteration initially permitted 194 195 def replenish(self, count): 196 197 "Replenish the iterator with 'count' results." 198 199 self.count = count 200 self.read_documents = 0 201 202 def __len__(self): 203 204 "Return the total number of results." 205 206 return self.count 207 208 def sort(self): 209 pass # Stored document positions are already sorted. 210 211 def __iter__(self): 212 return self 213 214 class PositionIterator(IteratorBase): 215 216 "Iterating over document positions." 217 218 def replenish(self, count): 219 IteratorBase.replenish(self, count) 220 221 # Fill a cache of positions. 222 223 self.cache = [] 224 n = 0 225 226 while n < self.count: 227 self.cache.append(self.reader.read_positions()) 228 n += 1 229 230 def seek(self, offset, count): 231 232 """ 233 Seek to 'offset' in the file, limiting the number of documents available 234 for reading to 'count'. 235 """ 236 237 self.reader.seek(offset) 238 self.replenish(count) 239 240 def next(self): 241 242 "Read positions for a single document." 243 244 if self.read_documents < self.count: 245 positions = self.cache[self.read_documents] 246 self.read_documents += 1 247 return positions 248 else: 249 raise StopIteration 250 251 class PositionIndexIterator(IteratorBase): 252 253 "Iterating over document positions." 254 255 def replenish(self, count): 256 IteratorBase.replenish(self, count) 257 258 # Fill a cache of offsets. 259 260 self.cache = [] 261 self.current = 0 262 n = 0 263 264 while n < self.count: 265 docnum, pos_offset, section_count = t = self.reader.read_positions() 266 self.cache.append(t) 267 n += section_count 268 269 def seek(self, offset, doc_frequency): 270 271 """ 272 Seek to 'offset' in the file, limiting the number of documents available 273 for reading to 'doc_frequency'. 274 """ 275 276 self.reader.seek(offset) 277 self.replenish(doc_frequency) 278 279 def next(self): 280 281 "Read positions for a single document." 282 283 if self.current < len(self.cache): 284 docnum, pos_offset, self.section_count = t = self.cache[self.current] 285 self.current += 1 286 return t 287 else: 288 raise StopIteration 289 290 class PositionDictionaryWriter: 291 292 "Writing position dictionaries." 293 294 def __init__(self, position_writer, position_index_writer, interval): 295 self.position_writer = position_writer 296 self.position_index_writer = position_index_writer 297 self.interval = interval 298 299 def write_term_positions(self, doc_positions): 300 301 """ 302 Write all 'doc_positions' - a collection of tuples of the form (document 303 number, position list) - to the file. 304 305 Add some records to the index, making dictionary entries. 306 307 Return a tuple containing the offset of the written data, the frequency 308 (number of positions), and document frequency (number of documents) for 309 the term involved. 310 """ 311 312 # Write the positions. 313 314 frequency = 0 315 count = 0 316 317 if doc_positions: 318 doc_positions.sort() 319 320 # Look ahead at the first document record. 321 # NOTE: Any iterator would need to support this. 322 323 first_docnum, first_positions = doc_positions[0] 324 first_position = first_positions[0] 325 326 # Write out size details. 327 328 docnum_size, position_size = sizeof(first_docnum), sizeof(first_position) 329 self.position_writer.begin(docnum_size, position_size) 330 self.position_index_writer.begin(docnum_size) 331 332 # Reset the writers. 333 334 self.position_writer.reset() 335 self.position_index_writer.reset() 336 337 # Remember the first index entry offset. 338 339 index_offset = self.position_index_writer.tell() 340 341 # Retain the first record offset for a subsequent index entry. 342 343 first_offset = self.position_writer.tell() 344 345 for docnum, positions in doc_positions: 346 if first_docnum is None: 347 first_docnum = docnum 348 349 self.position_writer.write_positions(docnum, positions) 350 351 frequency += len(positions) 352 count += 1 353 354 # Every {interval} entries, write an index entry. 355 356 if count % self.interval == 0: 357 358 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 359 360 # Reset the position writer so that position readers accessing 361 # a section start with the correct document number. 362 363 self.position_writer.reset() 364 365 first_offset = self.position_writer.tell() 366 first_docnum = None 367 368 # Finish writing an index entry for the remaining documents. 369 370 else: 371 if first_docnum is not None: 372 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 373 374 return index_offset, frequency, count 375 376 def close(self): 377 self.position_writer.close() 378 self.position_index_writer.close() 379 380 class PositionDictionaryReader: 381 382 "Access to position dictionary entries through iterators." 383 384 def __init__(self, position_reader, position_index_reader): 385 self.position_reader = position_reader 386 self.position_index_reader = position_index_reader 387 388 def read_term_positions(self, offset, doc_frequency): 389 iterator = PositionDictionaryIterator( 390 PositionIterator(self.position_reader), 391 PositionIndexIterator(self.position_index_reader) 392 ) 393 iterator.seek(offset, doc_frequency) 394 return iterator 395 396 def close(self): 397 self.position_reader.close() 398 self.position_index_reader.close() 399 400 class PositionDictionaryIterator: 401 402 "Iteration over position dictionary entries." 403 404 def __init__(self, position_iterator, position_index_iterator): 405 self.position_iterator = position_iterator 406 self.position_index_iterator = position_index_iterator 407 self.reset() 408 409 def reset(self): 410 411 # Remember the last values. 412 413 self.found_docnum, self.found_positions = None, None 414 415 # Maintain state for the next index entry, if read. 416 417 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 418 419 def seek(self, offset, doc_frequency): 420 421 """ 422 Seek to 'offset' in the index file, limiting the number of documents 423 available for reading to 'doc_frequency'. 424 """ 425 426 self.reset() 427 428 # Seek to the appropriate index entry. 429 430 self.position_index_iterator.seek(offset, doc_frequency) 431 432 # Initialise the current index entry and current position file iterator. 433 434 self._next_section() 435 self._init_section() 436 437 # Sequence methods. 438 439 def __len__(self): 440 return len(self.position_index_iterator) 441 442 def sort(self): 443 pass 444 445 # Iterator methods. 446 447 def __iter__(self): 448 return self 449 450 def next(self): 451 452 """ 453 Attempt to get the next document record from the section in the 454 positions file. 455 """ 456 457 # Return any visited but unrequested record. 458 459 if self.found_docnum is not None: 460 t = self.found_docnum, self.found_positions 461 self.found_docnum, self.found_positions = None, None 462 return t 463 464 # Or search for the next record. 465 466 while 1: 467 468 # Either return the next record. 469 470 try: 471 return self.position_iterator.next() 472 473 # Or, where a section is finished, get the next section and try again. 474 475 except StopIteration: 476 477 # Although, where a single iterator is in use, the file reader 478 # would be positioned appropriately, this is not guaranteed in a 479 # multiple iterator situation. 480 481 self._next_section() 482 self._init_section() 483 484 def from_document(self, docnum): 485 486 """ 487 Attempt to navigate to a positions entry for the given 'docnum', 488 returning the positions for 'docnum', or None otherwise. 489 """ 490 491 # Return any unrequested document positions. 492 493 if docnum == self.found_docnum: 494 return self.found_positions 495 496 # Read ahead in the index until the next entry refers to a document 497 # later than the desired document. 498 499 try: 500 if self.next_docnum is None: 501 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 502 503 # Read until the next entry is after the desired document number, 504 # or until the end of the results. 505 506 while self.next_docnum <= docnum: 507 self._next_read_section() 508 if self.docnum < docnum: 509 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 510 else: 511 break 512 513 except StopIteration: 514 pass 515 516 # Navigate in the position file to the document. 517 518 self._init_section() 519 520 try: 521 while 1: 522 found_docnum, found_positions = self.position_iterator.next() 523 524 # Return the desired document positions or None (retaining the 525 # positions for the document immediately after). 526 527 if docnum <= found_docnum: 528 self.found_docnum, self.found_positions = found_docnum, found_positions 529 if docnum == found_docnum: 530 return found_positions 531 elif docnum < found_docnum: 532 return None 533 534 except StopIteration: 535 return None 536 537 # Internal methods. 538 539 def _next_section(self): 540 541 "Attempt to get the next section in the index." 542 543 if self.next_docnum is None: 544 self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next() 545 else: 546 self._next_read_section() 547 548 def _next_read_section(self): 549 550 """ 551 Make the next index entry the current one without reading from the 552 index. 553 """ 554 555 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 556 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 557 558 def _init_section(self): 559 560 "Initialise the iterator for the section in the position file." 561 562 # Seek to the position entry. 563 564 self.position_iterator.seek(self.pos_offset, self.section_count) 565 566 # vim: tabstop=4 expandtab shiftwidth=4