#!/usr/bin/env python

"""
Specific classes for storing position information.

Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import *
from iixr.files import *

# NOTE(review): FileWriter, FileReader, get_subtractor and get_adder are not
# defined in this module; presumably they are supplied by the star imports
# above, along with the record/value encoding methods used on 'self' below
# (begin_record, end_record, write_number, read_number, tell, seek and so on).

class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        """
        Forget the previously written document number so that the next record
        stores its document number in full rather than as a delta.
        """

        self.last_docnum = None
        self.subtractor = None

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        Nothing is written if 'positions' is empty. The 'positions' list is
        sorted in place before being written.

        A ValueError is raised if 'docnum' is less than the document number
        written by the previous call since the last reset.
        """

        if not positions:
            return

        # Make sure that the positions are sorted.

        positions.sort()

        # Calculate an ongoing delta.

        if self.last_docnum is not None:
            if docnum < self.last_docnum:

                # The call form of raise is valid in Python 2 and Python 3,
                # unlike the "raise Class, value" statement form.

                raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

            docnum_seq = self.subtractor(docnum, self.last_docnum)

        # Or preserve the document number and prepare for future deltas.

        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        self.begin_record()
        self.write_sequence_value(docnum_seq)
        self.write_monotonic_sequence(positions)
        self.end_record()

        self.last_docnum = docnum

class PositionReader(FileReader):

    "Reading position information within term-specific regions of a file."

    def reset(self):

        """
        Forget the previously read document number so that the next record is
        interpreted as holding a document number in full.
        """

        self.last_docnum = None
        self.adder = None

    def read_positions(self):

        """
        Read positions, returning a document number and a list of positions.
        """

        self.begin_record()

        # Read the document number.

        docnum = self.read_sequence_value()

        # Calculate an ongoing delta.

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)

        # Or preserve the document number and prepare for future deltas.

        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        positions = self.read_monotonic_sequence()
        self.end_record()

        return self.last_docnum, positions

class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        """
        Forget the previously written document number and position offset so
        that the next entry stores both in full.
        """

        self.last_docnum = None
        self.subtractor = None
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file.

        The document number and the offset are each stored relative to the
        values given in the previous call since the last reset.
        """

        # Find the size of document number values.

        if self.last_docnum is not None:
            docnum_seq = self.subtractor(docnum, self.last_docnum)
        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        self.begin_record()
        self.write_sequence_value(docnum_seq)
        self.write_number(pos_offset - self.last_pos_offset)
        self.write_number(count)
        self.end_record()

        self.last_docnum = docnum
        self.last_pos_offset = pos_offset

class PositionIndexReader(FileReader):

    "Reading position index information within term-specific regions of a file."

    def reset(self):

        """
        Forget the previously read document number and position offset so that
        the next entry is interpreted as holding both in full.
        """

        self.last_docnum = None
        self.adder = None
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.

        Return these as a tuple of the form (document number, offset, count).
        """

        self.begin_record()

        # Read the document number.

        docnum = self.read_sequence_value()

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)
        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        # Read the offset delta.

        self.last_pos_offset += self.read_number()

        # Read the document count.

        count = self.read_number()
        self.end_record()

        return self.last_docnum, self.last_pos_offset, count

# Iterators for position-related files.

class IteratorBase:

    "Support for iterating over results."

    def __init__(self, reader):

        "Initialise the iterator using the given 'reader'."

        self.reader = reader
        self.replenish(0) # no iteration initially permitted

    def replenish(self, count):

        "Replenish the iterator with 'count' results."

        self.count = count
        self.read_documents = 0

    def __len__(self):

        "Return the total number of results."

        return self.count

    def sort(self):
        pass # Stored document positions are already sorted.

    def __iter__(self):
        return self

    def __next__(self):

        """
        Support the Python 3 iterator protocol by delegating to the 'next'
        method provided by subclasses.
        """

        return self.next()

class PositionIterator(IteratorBase):

    "Iterating over document positions."

    def replenish(self, count):

        """
        Replenish the iterator by reading 'count' records from the reader
        immediately, holding them in a cache.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of positions.

        self.cache = []
        n = 0

        while n < self.count:
            self.cache.append(self.reader.read_positions())
            n += 1

    def seek(self, offset, count):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'count'.
        """

        self.reader.seek(offset)
        self.replenish(count)

    def next(self):

        """
        Return the next cached record as produced by the reader's
        'read_positions' method, raising StopIteration when all cached records
        have been returned.
        """

        if self.read_documents < self.count:
            positions = self.cache[self.read_documents]
            self.read_documents += 1
            return positions
        else:
            raise StopIteration

class PositionIndexIterator(IteratorBase):

    "Iterating over position index entries."

    def replenish(self, count):

        """
        Replenish the iterator by reading index entries from the reader until
        the section counts of the entries read account for 'count' documents.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of offsets.

        self.cache = []
        self.current = 0
        n = 0

        while n < self.count:
            docnum, pos_offset, section_count = t = self.reader.read_positions()
            self.cache.append(t)
            n += section_count

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'doc_frequency'.
        """

        self.reader.seek(offset)
        self.replenish(doc_frequency)

    def next(self):

        """
        Return the next cached index entry as a tuple of the form (document
        number, position offset, section count), also recording the section
        count on this object. StopIteration is raised when no entries remain.
        """

        if self.current < len(self.cache):
            docnum, pos_offset, self.section_count = t = self.cache[self.current]
            self.current += 1
            return t
        else:
            raise StopIteration

class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', writing one index entry for every 'interval'
        documents written to the position file.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file. The collection is sorted in place.

        Add some records to the index, making dictionary entries.

        Documents with an empty position list are ignored: the position writer
        stores no record for them, and so they must not be counted in the index
        entries or in the returned document frequency.

        Return a tuple containing the offset of the written data, the frequency
        (number of positions), and document frequency (number of documents) for
        the term involved.
        """

        # Reset the writers.

        self.position_writer.reset()
        self.position_index_writer.reset()

        # Remember the first index entry offset.

        index_offset = self.position_index_writer.tell()

        # Write the positions.

        frequency = 0
        count = 0

        if doc_positions:

            # Retain the first record offset for a subsequent index entry.

            first_offset = self.position_writer.tell()
            first_docnum = None

            doc_positions.sort()

            for docnum, positions in doc_positions:

                # Skip documents for which no record would be written, keeping
                # the section counts in step with the position file contents.

                if not positions:
                    continue

                if first_docnum is None:
                    first_docnum = docnum

                self.position_writer.write_positions(docnum, positions)

                frequency += len(positions)
                count += 1

                # Every {interval} entries, write an index entry.

                if count % self.interval == 0:

                    self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)

                    first_offset = self.position_writer.tell()
                    first_docnum = None

                    # Reset the position writer so that position readers accessing
                    # a section start with the correct document number.

                    self.position_writer.reset()

            # Finish writing an index entry for the remaining documents.

            if first_docnum is not None:
                self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)

        return index_offset, frequency, count

    def close(self):

        "Close both underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()

class PositionDictionaryReader:

    "Access to position dictionary entries through iterators."

    def __init__(self, position_reader, position_index_reader):

        """
        Initialise the reader with the given 'position_reader' and
        'position_index_reader'.
        """

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the document positions found using the index
        entries at 'offset' in the index file, limited to 'doc_frequency'
        documents.
        """

        iterator = PositionDictionaryIterator(
            PositionIterator(self.position_reader),
            PositionIndexIterator(self.position_index_reader)
            )
        iterator.seek(offset, doc_frequency)
        return iterator

    def close(self):

        "Close both underlying readers."

        self.position_reader.close()
        self.position_index_reader.close()

class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_iterator, position_index_iterator):

        """
        Initialise the iterator with the given 'position_iterator' and
        'position_index_iterator'.
        """

        self.position_iterator = position_iterator
        self.position_index_iterator = position_index_iterator
        self.reset()

    def reset(self):

        "Discard any retained record and any index entry read in advance."

        # Remember the last values.

        self.found_docnum, self.found_positions = None, None

        # Maintain state for the next index entry, if read.

        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the index file, limiting the number of documents
        available for reading to 'doc_frequency'.
        """

        self.reset()

        # Seek to the appropriate index entry.

        self.position_index_iterator.seek(offset, doc_frequency)

        # Initialise the current index entry and current position file iterator.

        self._next_section()
        self._init_section()

    # Sequence methods.

    def __len__(self):

        "Return the length reported by the index iterator."

        return len(self.position_index_iterator)

    def sort(self):
        pass

    # Iterator methods.

    def __iter__(self):
        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file.

        StopIteration is raised when the index has no further sections.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            t = self.found_docnum, self.found_positions
            self.found_docnum, self.found_positions = None, None
            return t

        # Or search for the next record.

        while 1:

            # Either return the next record.

            try:
                return self.position_iterator.next()

            # Or, where a section is finished, get the next section and try again.

            except StopIteration:

                # Although, where a single iterator is in use, the file reader
                # would be positioned appropriately, this is not guaranteed in a
                # multiple iterator situation.

                self._next_section()
                self._init_section()

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.

        The first record found at or after 'docnum' is retained, and so it will
        be returned by a subsequent call to the 'next' method.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()
                if self.docnum < docnum:
                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
                else:
                    break

        except StopIteration:
            pass

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while 1:
                found_docnum, found_positions = self.position_iterator.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum <= found_docnum:
                    self.found_docnum, self.found_positions = found_docnum, found_positions
                    if docnum == found_docnum:
                        return found_positions
                    elif docnum < found_docnum:
                        return None

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        "Attempt to get the next section in the index."

        if self.next_docnum is None:
            self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
        else:
            self._next_read_section()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        # Seek to the position entry.

        self.position_iterator.seek(self.pos_offset, self.section_count)

# vim: tabstop=4 expandtab shiftwidth=4