1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = None 31 self.docnum_size = None 32 33 def write_positions(self, docnum, positions): 34 35 """ 36 Write for the document 'docnum' the given 'positions'. 37 """ 38 39 # Find the size of document number values. 40 41 if self.docnum_size is None: 42 self.docnum_size = self.get_value_size(docnum) 43 self.last_docnum = self.get_initial_value(self.docnum_size) 44 45 if docnum < self.last_docnum: 46 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 47 48 # Make sure that the positions are sorted. 49 50 positions.sort() 51 52 # Find the size of position values. 53 54 size = self.get_value_size(positions[0]) 55 56 # Write the number of values per document number. 57 # Write the document number delta. 58 # Write the number of positions. 59 # Write the number of values per position. 60 61 output = array('B') 62 vint_to_array(self.docnum_size, output) 63 self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) 64 vint_to_array(len(positions), output) 65 vint_to_array(size, output) 66 67 # Write the position deltas. 68 69 last = self.get_initial_value(size) 70 71 for position in positions: 72 last = self.write_sequence(output, position, last, size) 73 74 output.tofile(self.f) 75 76 self.last_docnum = docnum 77 78 class PositionReader(FileReader): 79 80 "Reading position information within term-specific regions of a file." 81 82 def reset(self): 83 self.last_docnum = None 84 85 def read_positions(self): 86 87 """ 88 Read positions, returning a document number and a list of positions. 89 """ 90 91 # Read the number of values per document number. 92 93 docnum_size = self.read_number() 94 95 if self.last_docnum is None: 96 self.last_docnum = self.get_initial_value(docnum_size) 97 98 # Read the document number delta and add it to the last number. 99 100 self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) 101 102 # Read the number of positions. 103 104 npositions = self.read_number() 105 106 # Read the number of values per position. 107 108 size = self.read_number() 109 110 # Read the position deltas, adding each previous position to get the 111 # appropriate collection of absolute positions. 112 113 i = 0 114 115 last = self.get_initial_value(size) 116 117 positions = [] 118 119 while i < npositions: 120 last = self.read_sequence(last, size) 121 positions.append(last) 122 i += 1 123 124 return self.last_docnum, positions 125 126 class PositionIndexWriter(FileWriter): 127 128 "Writing position index information to files." 129 130 def reset(self): 131 self.last_docnum = None 132 self.docnum_size = None 133 self.last_pos_offset = 0 134 135 def write_positions(self, docnum, pos_offset, count): 136 137 """ 138 Write the given 'docnum, 'pos_offset' and document 'count' to the 139 position index file. 140 """ 141 142 # Find the size of document number values. 143 144 if self.docnum_size is None: 145 self.docnum_size = self.get_value_size(docnum) 146 self.last_docnum = self.get_initial_value(self.docnum_size) 147 148 # Write the number of values per document number. 149 # Write the document number delta. 150 # Write the position file offset delta. 151 # Write the document count. 152 153 output = array('B') 154 vint_to_array(self.docnum_size, output) 155 self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) 156 vint_to_array(pos_offset - self.last_pos_offset, output) 157 vint_to_array(count, output) 158 159 # Actually write the data. 160 161 output.tofile(self.f) 162 163 self.last_pos_offset = pos_offset 164 165 class PositionIndexReader(FileReader): 166 167 "Reading position index information within term-specific regions of a file." 168 169 def reset(self): 170 self.last_docnum = None 171 self.last_pos_offset = 0 172 173 def read_positions(self): 174 175 """ 176 Read a document number, a position file offset for the position index 177 file, and the number of documents in a section of that file. 178 """ 179 180 # Read the number of values per document number. 181 182 docnum_size = self.read_number() 183 184 if self.last_docnum is None: 185 self.last_docnum = self.get_initial_value(docnum_size) 186 187 # Read the document number delta and add it to the last number. 188 189 self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) 190 191 # Read the offset delta. 192 193 self.last_pos_offset += self.read_number() 194 195 # Read the document count. 196 197 count = self.read_number() 198 199 return self.last_docnum, self.last_pos_offset, count 200 201 # Iterators for position-related files. 202 203 class IteratorBase: 204 205 "Support for iterating over results." 206 207 def __init__(self, reader): 208 209 "Initialise the iterator using the given 'reader'." 210 211 self.reader = reader 212 self.replenish(0) # no iteration initially permitted 213 214 def replenish(self, count): 215 216 "Replenish the iterator with 'count' results." 217 218 self.count = count 219 self.read_documents = 0 220 221 def __len__(self): 222 223 "Return the total number of results." 224 225 return self.count 226 227 def sort(self): 228 pass # Stored document positions are already sorted. 229 230 def __iter__(self): 231 return self 232 233 class PositionIterator(IteratorBase): 234 235 "Iterating over document positions." 236 237 def replenish(self, count): 238 IteratorBase.replenish(self, count) 239 240 # Fill a cache of positions. 241 242 self.cache = [] 243 n = 0 244 245 while n < self.count: 246 self.cache.append(self.reader.read_positions()) 247 n += 1 248 249 def seek(self, offset, count): 250 251 """ 252 Seek to 'offset' in the file, limiting the number of documents available 253 for reading to 'count'. 254 """ 255 256 self.reader.seek(offset) 257 self.replenish(count) 258 259 def next(self): 260 261 "Read positions for a single document." 262 263 if self.read_documents < self.count: 264 positions = self.cache[self.read_documents] 265 self.read_documents += 1 266 return positions 267 else: 268 raise StopIteration 269 270 class PositionIndexIterator(IteratorBase): 271 272 "Iterating over document positions." 273 274 def replenish(self, count): 275 IteratorBase.replenish(self, count) 276 277 # Fill a cache of offsets. 278 279 self.cache = [] 280 self.current = 0 281 n = 0 282 283 while n < self.count: 284 docnum, pos_offset, section_count = t = self.reader.read_positions() 285 self.cache.append(t) 286 n += section_count 287 288 def seek(self, offset, doc_frequency): 289 290 """ 291 Seek to 'offset' in the file, limiting the number of documents available 292 for reading to 'doc_frequency'. 293 """ 294 295 self.reader.seek(offset) 296 self.replenish(doc_frequency) 297 298 def next(self): 299 300 "Read positions for a single document." 301 302 if self.current < len(self.cache): 303 docnum, pos_offset, self.section_count = t = self.cache[self.current] 304 self.current += 1 305 return t 306 else: 307 raise StopIteration 308 309 class PositionDictionaryWriter: 310 311 "Writing position dictionaries." 312 313 def __init__(self, position_writer, position_index_writer, interval): 314 self.position_writer = position_writer 315 self.position_index_writer = position_index_writer 316 self.interval = interval 317 318 def write_term_positions(self, doc_positions): 319 320 """ 321 Write all 'doc_positions' - a collection of tuples of the form (document 322 number, position list) - to the file. 323 324 Add some records to the index, making dictionary entries. 325 326 Return a tuple containing the offset of the written data, the frequency 327 (number of positions), and document frequency (number of documents) for 328 the term involved. 329 """ 330 331 # Reset the writers. 332 333 self.position_writer.reset() 334 self.position_index_writer.reset() 335 336 # Remember the first index entry offset. 337 338 index_offset = self.position_index_writer.f.tell() 339 340 # Write the positions. 341 342 frequency = 0 343 count = 0 344 345 if doc_positions: 346 347 # Retain the first record offset for a subsequent index entry. 348 349 first_offset = self.position_writer.f.tell() 350 first_docnum = None 351 352 doc_positions.sort() 353 354 for docnum, positions in doc_positions: 355 if first_docnum is None: 356 first_docnum = docnum 357 358 self.position_writer.write_positions(docnum, positions) 359 360 frequency += len(positions) 361 count += 1 362 363 # Every {interval} entries, write an index entry. 364 365 if count % self.interval == 0: 366 367 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 368 369 first_offset = self.position_writer.f.tell() 370 first_docnum = None 371 372 # Reset the position writer so that position readers accessing 373 # a section start with the correct document number. 374 375 self.position_writer.reset() 376 377 # Finish writing an index entry for the remaining documents. 378 379 else: 380 if first_docnum is not None: 381 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 382 383 return index_offset, frequency, count 384 385 def close(self): 386 self.position_writer.close() 387 self.position_index_writer.close() 388 389 class PositionDictionaryReader: 390 391 "Access to position dictionary entries through iterators." 392 393 def __init__(self, position_reader, position_index_reader): 394 self.position_reader = position_reader 395 self.position_index_reader = position_index_reader 396 397 def read_term_positions(self, offset, doc_frequency): 398 iterator = PositionDictionaryIterator( 399 PositionIterator(self.position_reader), 400 PositionIndexIterator(self.position_index_reader) 401 ) 402 iterator.seek(offset, doc_frequency) 403 return iterator 404 405 def close(self): 406 self.position_reader.close() 407 self.position_index_reader.close() 408 409 class PositionDictionaryIterator: 410 411 "Iteration over position dictionary entries." 412 413 def __init__(self, position_iterator, position_index_iterator): 414 self.position_iterator = position_iterator 415 self.position_index_iterator = position_index_iterator 416 self.reset() 417 418 def reset(self): 419 420 # Remember the last values. 421 422 self.found_docnum, self.found_positions = None, None 423 424 # Maintain state for the next index entry, if read. 425 426 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 427 428 def seek(self, offset, doc_frequency): 429 430 """ 431 Seek to 'offset' in the index file, limiting the number of documents 432 available for reading to 'doc_frequency'. 433 """ 434 435 self.reset() 436 437 # Seek to the appropriate index entry. 438 439 self.position_index_iterator.seek(offset, doc_frequency) 440 441 # Initialise the current index entry and current position file iterator. 442 443 self._next_section() 444 self._init_section() 445 446 # Sequence methods. 447 448 def __len__(self): 449 return len(self.position_index_iterator) 450 451 def sort(self): 452 pass 453 454 # Iterator methods. 455 456 def __iter__(self): 457 return self 458 459 def next(self): 460 461 """ 462 Attempt to get the next document record from the section in the 463 positions file. 464 """ 465 466 # Return any visited but unrequested record. 467 468 if self.found_docnum is not None: 469 t = self.found_docnum, self.found_positions 470 self.found_docnum, self.found_positions = None, None 471 return t 472 473 # Or search for the next record. 474 475 while 1: 476 477 # Either return the next record. 478 479 try: 480 return self.position_iterator.next() 481 482 # Or, where a section is finished, get the next section and try again. 483 484 except StopIteration: 485 486 # Although, where a single iterator is in use, the file reader 487 # would be positioned appropriately, this is not guaranteed in a 488 # multiple iterator situation. 489 490 self._next_section() 491 self._init_section() 492 493 def from_document(self, docnum): 494 495 """ 496 Attempt to navigate to a positions entry for the given 'docnum', 497 returning the positions for 'docnum', or None otherwise. 498 """ 499 500 # Return any unrequested document positions. 501 502 if docnum == self.found_docnum: 503 return self.found_positions 504 505 # Read ahead in the index until the next entry refers to a document 506 # later than the desired document. 507 508 try: 509 if self.next_docnum is None: 510 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 511 512 # Read until the next entry is after the desired document number, 513 # or until the end of the results. 514 515 while self.next_docnum <= docnum: 516 self._next_read_section() 517 if self.docnum < docnum: 518 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 519 else: 520 break 521 522 except StopIteration: 523 pass 524 525 # Navigate in the position file to the document. 526 527 self._init_section() 528 529 try: 530 while 1: 531 found_docnum, found_positions = self.position_iterator.next() 532 533 # Return the desired document positions or None (retaining the 534 # positions for the document immediately after). 535 536 if docnum <= found_docnum: 537 self.found_docnum, self.found_positions = found_docnum, found_positions 538 if docnum == found_docnum: 539 return found_positions 540 elif docnum < found_docnum: 541 return None 542 543 except StopIteration: 544 return None 545 546 # Internal methods. 547 548 def _next_section(self): 549 550 "Attempt to get the next section in the index." 551 552 if self.next_docnum is None: 553 self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next() 554 else: 555 self._next_read_section() 556 557 def _next_read_section(self): 558 559 """ 560 Make the next index entry the current one without reading from the 561 index. 562 """ 563 564 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 565 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 566 567 def _init_section(self): 568 569 "Initialise the iterator for the section in the position file." 570 571 # Seek to the position entry. 572 573 self.position_iterator.seek(self.pos_offset, self.section_count) 574 575 # vim: tabstop=4 expandtab shiftwidth=4