#!/usr/bin/env python

"""
Specific classes for storing position information.

Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.files import *
from iixr.data import vint, vint_to_array

class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number and the document number size."

        self.last_docnum = None
        self.docnum_size = None

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        The 'positions' list is sorted in place and must not be empty, since the
        first position is used to determine the size of position values.

        A ValueError is raised if 'docnum' is less than the previously written
        document number.
        """

        # Find the size of document number values.
        # NOTE(review): get_value_size and get_initial_value are defined outside
        # this file; presumably they yield the number of values in a number and
        # a suitable "zero" value of that size - confirm against iixr.files.

        if self.docnum_size is None:
            self.docnum_size = self.get_value_size(docnum)
            self.last_docnum = self.get_initial_value(self.docnum_size)

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Make sure that the positions are sorted.

        positions.sort()

        # Find the size of position values.

        size = self.get_value_size(positions[0])

        # Write the number of values per document number.
        # Write the document number delta.
        # Write the number of positions.
        # Write the number of values per position.

        self.write_number(self.docnum_size)
        self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
        self.write_number(len(positions))
        self.write_number(size)

        # Write the position deltas.

        last = self.get_initial_value(size)

        for position in positions:
            last = self.write_sequence(position, last, size)

        # Record the document number for the ordering check on the next call.

        self.last_docnum = docnum

class PositionReader(FileReader):

    "Reading position information within term-specific regions of a file."

    def reset(self):

        "Forget the last document number."

        self.last_docnum = None

    def read_positions(self):

        """
        Read positions, returning a document number and a list of positions.
        """

        # Read the number of values per document number.

        docnum_size = self.read_number()

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(docnum_size)

        # Read the document number delta and add it to the last number.

        self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)

        # Read the number of positions.

        npositions = self.read_number()

        # Read the number of values per position.

        size = self.read_number()

        # Read the position deltas, adding each previous position to get the
        # appropriate collection of absolute positions.

        last = self.get_initial_value(size)
        positions = []

        for _ in range(npositions):
            last = self.read_sequence(last, size)
            positions.append(last)

        return self.last_docnum, positions

class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Forget the last document number, its size and the last offset."

        self.last_docnum = None
        self.docnum_size = None
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file.

        The offset is written as a delta relative to the offset given in the
        previous call (or zero after a reset).
        """

        # Find the size of document number values.

        if self.docnum_size is None:
            self.docnum_size = self.get_value_size(docnum)
            self.last_docnum = self.get_initial_value(self.docnum_size)

        # Write the number of values per document number.
        # Write the document number delta.
        # Write the position file offset delta.
        # Write the document count.

        self.write_number(self.docnum_size)
        self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
        self.write_number(pos_offset - self.last_pos_offset)
        self.write_number(count)

        self.last_pos_offset = pos_offset

class PositionIndexReader(FileReader):

    "Reading position index information within term-specific regions of a file."

    def reset(self):

        "Forget the last document number and the last offset."

        self.last_docnum = None
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.
        """

        # Read the number of values per document number.

        docnum_size = self.read_number()

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(docnum_size)

        # Read the document number delta and add it to the last number.

        self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)

        # Read the offset delta.

        self.last_pos_offset += self.read_number()

        # Read the document count.

        count = self.read_number()

        return self.last_docnum, self.last_pos_offset, count

# Iterators for position-related files.

class IteratorBase:

    "Support for iterating over results."

    def __init__(self, reader):

        "Initialise the iterator using the given 'reader'."

        self.reader = reader
        self.replenish(0) # no iteration initially permitted

    def replenish(self, count):

        "Replenish the iterator with 'count' results."

        self.count = count
        self.read_documents = 0

    def __len__(self):

        "Return the total number of results."

        return self.count

    def sort(self):
        pass # Stored document positions are already sorted.

    def __iter__(self):
        return self

class PositionIterator(IteratorBase):

    "Iterating over document positions."

    def replenish(self, count):

        """
        Replenish the iterator with 'count' results, reading that many records
        from the reader into a cache immediately.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of positions.

        self.cache = []

        for _ in range(self.count):
            self.cache.append(self.reader.read_positions())

    def seek(self, offset, count):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'count'.
        """

        self.reader.seek(offset)
        self.replenish(count)

    def next(self):

        "Read positions for a single document."

        if self.read_documents >= self.count:
            raise StopIteration

        positions = self.cache[self.read_documents]
        self.read_documents += 1
        return positions

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

class PositionIndexIterator(IteratorBase):

    "Iterating over position index entries."

    def replenish(self, count):

        """
        Replenish the iterator so that index entries covering 'count' documents
        are available, reading those entries into a cache immediately.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of offsets, reading entries until the sections they
        # describe account for all of the documents.

        self.cache = []
        self.current = 0
        covered = 0

        while covered < self.count:
            entry = self.reader.read_positions()
            _docnum, _pos_offset, section_count = entry
            self.cache.append(entry)
            covered += section_count

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'doc_frequency'.
        """

        self.reader.seek(offset)
        self.replenish(doc_frequency)

    def next(self):

        """
        Return the next index entry: a tuple of the form (document number,
        position file offset, section document count).
        """

        if self.current >= len(self.cache):
            raise StopIteration

        entry = self.cache[self.current]

        # The section count of the returned entry is also kept on the iterator.

        _docnum, _pos_offset, self.section_count = entry
        self.current += 1
        return entry

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', writing an index entry for every 'interval'
        documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file. The collection is sorted in place.

        Add some records to the index, making dictionary entries.

        Return a tuple containing the offset of the written data, the frequency
        (number of positions), and document frequency (number of documents) for
        the term involved.
        """

        # Reset the writers.

        self.position_writer.reset()
        self.position_index_writer.reset()

        # Remember the first index entry offset.

        index_offset = self.position_index_writer.tell()

        # Write the positions.

        frequency = 0
        count = 0

        if doc_positions:

            # Retain the first record offset for a subsequent index entry.

            first_offset = self.position_writer.tell()
            first_docnum = None

            doc_positions.sort()

            for docnum, positions in doc_positions:
                if first_docnum is None:
                    first_docnum = docnum

                self.position_writer.write_positions(docnum, positions)

                frequency += len(positions)
                count += 1

                # Every {interval} entries, write an index entry.

                if count % self.interval == 0:

                    self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)

                    first_offset = self.position_writer.tell()
                    first_docnum = None

                    # Reset the position writer so that position readers accessing
                    # a section start with the correct document number.

                    self.position_writer.reset()

            # Finish writing an index entry for the remaining documents: those
            # written since the last complete section, if any.

            if first_docnum is not None:
                self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)

        return index_offset, frequency, count

    def close(self):

        "Close both underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()

class PositionDictionaryReader:

    "Access to position dictionary entries through iterators."

    def __init__(self, position_reader, position_index_reader):

        """
        Initialise the reader with the given 'position_reader' and
        'position_index_reader'.
        """

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the document records found via the index entry
        at 'offset', limited to 'doc_frequency' documents.
        """

        iterator = PositionDictionaryIterator(
            PositionIterator(self.position_reader),
            PositionIndexIterator(self.position_index_reader)
            )
        iterator.seek(offset, doc_frequency)
        return iterator

    def close(self):

        "Close both underlying readers."

        self.position_reader.close()
        self.position_index_reader.close()

class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_iterator, position_index_iterator):

        """
        Initialise the iterator with the given 'position_iterator' and
        'position_index_iterator'.
        """

        self.position_iterator = position_iterator
        self.position_index_iterator = position_index_iterator
        self.reset()

    def reset(self):

        "Discard any retained record and any index entry read in advance."

        # Remember the last values.

        self.found_docnum, self.found_positions = None, None

        # Maintain state for the next index entry, if read.

        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the index file, limiting the number of documents
        available for reading to 'doc_frequency'.
        """

        self.reset()

        # Seek to the appropriate index entry.

        self.position_index_iterator.seek(offset, doc_frequency)

        # Initialise the current index entry and current position file iterator.

        self._next_section()
        self._init_section()

    # Sequence methods.

    def __len__(self):

        "Return the length reported by the position index iterator."

        return len(self.position_index_iterator)

    def sort(self):
        pass

    # Iterator methods.

    def __iter__(self):
        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            t = self.found_docnum, self.found_positions
            self.found_docnum, self.found_positions = None, None
            return t

        # Or search for the next record.

        while 1:

            # Either return the next record.

            try:
                return self.position_iterator.next()

            # Or, where a section is finished, get the next section and try again.
            # When the index is exhausted, the StopIteration raised while getting
            # the next section propagates and ends the iteration.

            except StopIteration:

                # Although, where a single iterator is in use, the file reader
                # would be positioned appropriately, this is not guaranteed in a
                # multiple iterator situation.

                self._next_section()
                self._init_section()

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()
                if self.docnum < docnum:
                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
                else:
                    break

        except StopIteration:
            pass

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while 1:
                found_docnum, found_positions = self.position_iterator.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum <= found_docnum:
                    self.found_docnum, self.found_positions = found_docnum, found_positions
                    if docnum == found_docnum:
                        return found_positions
                    else:
                        return None

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        "Attempt to get the next section in the index."

        if self.next_docnum is None:
            self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
        else:
            self._next_read_section()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        # Seek to the position entry.

        self.position_iterator.seek(self.pos_offset, self.section_count)

# vim: tabstop=4 expandtab shiftwidth=4