#!/usr/bin/env python

"""
Specific classes for storing position information.

Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import *
from iixr.files import *

class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        """
        End the current record and forget the previous document number and
        subtractor, so that the next document number is written as given.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        Nothing is written if 'positions' is empty. The 'positions' list is
        sorted in place before being written.

        The first document number written after a reset is written as given;
        each later one is written as the result of applying the subtractor
        (obtained from get_subtractor, which is not defined in this module) to
        it and the previously written document number.

        Raise ValueError if 'docnum' is less than the previous document number.
        """

        if not positions:
            return

        # Make sure that the positions are sorted.

        positions.sort()

        # Calculate an ongoing delta.

        if self.last_docnum is not None:
            if docnum < self.last_docnum:
                raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

            docnum_seq = self.subtractor(docnum, self.last_docnum)

        # Or preserve the document number and prepare for future deltas.

        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        self.write_sequence_value(docnum_seq)
        self.write_monotonic_sequence(positions)

        self.last_docnum = docnum

class PositionReader(FileReader):

    "Reading position information within term-specific regions of a file."

    def reset(self):

        """
        Forget the previous document number and adder, then begin a new record.
        """

        self.last_docnum = None
        self.adder = None
        self.begin_record()

    def read_positions(self):

        """
        Read positions, returning a document number and a list of positions.

        The first value read after a reset is used as the document number
        directly; each later value is combined with the previous document
        number using the adder (obtained from get_adder, which is not defined
        in this module).
        """

        # Read the document number.

        docnum = self.read_sequence_value()

        # Calculate an ongoing delta.

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)

        # Or preserve the document number and prepare for future deltas.

        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        positions = self.read_monotonic_sequence()

        return self.last_docnum, positions

class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        """
        End the current record and forget the previous document number,
        subtractor and position file offset.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file.

        The document number is written as given for the first entry after a
        reset and as the subtractor's result against the previous document
        number thereafter. The offset is written as the difference from the
        previously written offset (zero after a reset).
        """

        # Find the size of document number values.

        if self.last_docnum is not None:
            docnum_seq = self.subtractor(docnum, self.last_docnum)
        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        self.write_sequence_value(docnum_seq)
        self.write_number(pos_offset - self.last_pos_offset)
        self.write_number(count)

        self.last_docnum = docnum
        self.last_pos_offset = pos_offset

class PositionIndexReader(FileReader):

    "Reading position index information within term-specific regions of a file."

    def reset(self):

        """
        Forget the previous document number, adder and position file offset,
        then begin a new record.
        """

        self.last_docnum = None
        self.adder = None
        self.last_pos_offset = 0
        self.begin_record()

    def read_positions(self):

        """
        Read an entry from the position index file, returning a document
        number, an offset into the position file, and the number of documents
        in the section of the position file starting at that offset.
        """

        # Read the document number.

        docnum = self.read_sequence_value()

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)
        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        # Read the offset delta.

        self.last_pos_offset += self.read_number()

        # Read the document count.

        count = self.read_number()

        return self.last_docnum, self.last_pos_offset, count

# Iterators for position-related files.

class IteratorBase:

    "Support for iterating over results."

    def __init__(self, reader):

        "Initialise the iterator using the given 'reader'."

        self.reader = reader
        self.replenish(0) # no iteration initially permitted

    def replenish(self, count):

        "Replenish the iterator with 'count' results."

        self.count = count
        self.read_documents = 0

    def __len__(self):

        "Return the total number of results."

        return self.count

    def sort(self):
        pass # Stored document positions are already sorted.

    def __iter__(self):
        return self

    def __next__(self):

        """
        Support the Python 3 iterator protocol by delegating to the 'next'
        method provided by subclasses.
        """

        return self.next()

class PositionIterator(IteratorBase):

    "Iterating over document positions."

    def replenish(self, count):

        """
        Replenish the iterator with 'count' results, reading that many records
        from the reader immediately and caching them.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of positions.

        read_positions = self.reader.read_positions
        self.cache = [read_positions() for _ in range(self.count)]

    def seek(self, offset, count):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'count'.
        """

        self.reader.seek(offset)
        self.replenish(count)

    def next(self):

        """
        Read positions for a single document, returning the cached (document
        number, positions) record. Raise StopIteration when all cached records
        have been returned.
        """

        if self.read_documents < self.count:
            positions = self.cache[self.read_documents]
            self.read_documents += 1
            return positions
        else:
            raise StopIteration

class PositionIndexIterator(IteratorBase):

    "Iterating over position index entries."

    def replenish(self, count):

        """
        Replenish the iterator so that it covers 'count' documents, reading
        index entries from the reader until the section counts of the entries
        read add up to at least 'count'.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of offsets.

        self.cache = []
        self.current = 0
        n = 0

        while n < self.count:

            # Each entry is (document number, position offset, section count).

            entry = self.reader.read_positions()
            self.cache.append(entry)
            n += entry[2]

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'doc_frequency'.
        """

        self.reader.seek(offset)
        self.replenish(doc_frequency)

    def next(self):

        """
        Return the next cached index entry as a tuple of the form (document
        number, position offset, section count), also recording the section
        count on the iterator. Raise StopIteration when no entries remain.
        """

        if self.current < len(self.cache):
            entry = self.cache[self.current]
            self.section_count = entry[2]
            self.current += 1
            return entry
        else:
            raise StopIteration

class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', writing an index entry for every 'interval'
        documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file. The collection is sorted in place.

        Add some records to the index, making dictionary entries.

        Documents with no positions are not stored and are not counted.

        Return a tuple containing the offset of the written data in the index
        file, the frequency (number of positions), and document frequency
        (number of documents) for the term involved.
        """

        # Reset the writers.

        self.position_writer.reset()
        self.position_index_writer.reset()

        # Remember the first index entry offset.

        index_offset = self.position_index_writer.tell()

        # Write the positions.

        frequency = 0
        count = 0

        if doc_positions:

            # Retain the first record offset for a subsequent index entry.

            first_offset = self.position_writer.tell()
            first_docnum = None

            doc_positions.sort()

            for docnum, positions in doc_positions:

                # The position writer stores nothing for a document without
                # positions, so counting it would make the index claim more
                # records than the position file section actually holds.

                if not positions:
                    continue

                if first_docnum is None:
                    first_docnum = docnum

                self.position_writer.write_positions(docnum, positions)

                frequency += len(positions)
                count += 1

                # Every {interval} entries, write an index entry.

                if count % self.interval == 0:

                    self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)

                    # Reset the position writer so that position readers accessing
                    # a section start with the correct document number.

                    self.position_writer.reset()

                    first_offset = self.position_writer.tell()
                    first_docnum = None

            # Finish writing an index entry for the remaining documents.

            if first_docnum is not None:
                self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)

        return index_offset, frequency, count

    def close(self):

        "Close both underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()

class PositionDictionaryReader:

    "Access to position dictionary entries through iterators."

    def __init__(self, position_reader, position_index_reader):

        """
        Initialise the reader with the given 'position_reader' and
        'position_index_reader'.
        """

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return an iterator over the document positions whose index entries
        start at 'offset' in the index file, covering 'doc_frequency'
        documents.
        """

        iterator = PositionDictionaryIterator(
            PositionIterator(self.position_reader),
            PositionIndexIterator(self.position_index_reader)
            )
        iterator.seek(offset, doc_frequency)
        return iterator

    def close(self):

        "Close both underlying readers."

        self.position_reader.close()
        self.position_index_reader.close()

class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_iterator, position_index_iterator):

        """
        Initialise the iterator with the given 'position_iterator' and
        'position_index_iterator'. No section is current until 'seek' is used.
        """

        self.position_iterator = position_iterator
        self.position_index_iterator = position_index_iterator

        # The current index entry; None until a section has been selected.

        self.docnum, self.pos_offset, self.section_count = None, None, None

        self.reset()

    def reset(self):

        "Forget any retained document record and any read-ahead index entry."

        # Remember the last values.

        self.found_docnum, self.found_positions = None, None

        # Maintain state for the next index entry, if read.

        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the index file, limiting the number of documents
        available for reading to 'doc_frequency'.

        Where no index entries are available (such as when 'doc_frequency' is
        zero), the iterator is left empty instead of raising StopIteration.
        """

        self.reset()
        self.docnum, self.pos_offset, self.section_count = None, None, None

        # Seek to the appropriate index entry.

        self.position_index_iterator.seek(offset, doc_frequency)

        # Initialise the current index entry and current position file iterator.

        try:
            self._next_section()

        # Without any index entries there is no section to read: make sure that
        # the position iterator offers no stale records and stop here, since a
        # StopIteration escaping from this method could silently terminate an
        # enclosing iteration in the caller.

        except StopIteration:
            self.position_iterator.replenish(0)
            return

        self._init_section()

    # Sequence methods.

    def __len__(self):
        return len(self.position_index_iterator)

    def sort(self):
        pass

    # Iterator methods.

    def __iter__(self):
        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file, returning a (document number, positions) tuple.

        Raise StopIteration when no more sections are available.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            t = self.found_docnum, self.found_positions
            self.found_docnum, self.found_positions = None, None
            return t

        # Or search for the next record.

        while 1:

            # Either return the next record.

            try:
                return self.position_iterator.next()

            # Or, where a section is finished, get the next section and try again.

            except StopIteration:

                # Although, where a single iterator is in use, the file reader
                # would be positioned appropriately, this is not guaranteed in a
                # multiple iterator situation.

                # When the index is exhausted, the StopIteration raised here
                # ends the iteration.

                self._next_section()
                self._init_section()

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()
                if self.docnum < docnum:
                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
                else:
                    break

        except StopIteration:
            pass

        # Without a current section (no index entries were available), there is
        # nothing to navigate to.

        if self.pos_offset is None:
            return None

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while 1:
                found_docnum, found_positions = self.position_iterator.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum <= found_docnum:
                    self.found_docnum, self.found_positions = found_docnum, found_positions
                    if docnum == found_docnum:
                        return found_positions
                    return None

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        """
        Attempt to get the next section in the index, using any entry already
        read ahead before reading from the index. StopIteration is raised by
        the index iterator if no more entries are available.
        """

        if self.next_docnum is None:
            self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
        else:
            self._next_read_section()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        # Seek to the position entry.

        self.position_iterator.seek(self.pos_offset, self.section_count)

# vim: tabstop=4 expandtab shiftwidth=4