1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = 0 31 32 def write_positions(self, docnum, positions): 33 34 """ 35 Write for the document 'docnum' the given 'positions'. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Make sure that the positions are sorted. 42 43 positions.sort() 44 45 # Write the document number delta. 46 # Write the number of positions. 47 48 output = array('B') 49 vint_to_array(docnum - self.last_docnum, output) 50 vint_to_array(len(positions), output) 51 52 # Write the position deltas. 53 54 last = 0 55 56 for position in positions: 57 vint_to_array(position - last, output) 58 last = position 59 60 output.tofile(self.f) 61 62 self.last_docnum = docnum 63 64 class PositionReader(FileReader): 65 66 "Reading position information within term-specific regions of a file." 67 68 def reset(self): 69 self.last_docnum = 0 70 71 def read_positions(self): 72 73 "Read positions, returning a document number and a list of positions." 74 75 # Read the document number delta and add it to the last number. 76 77 self.last_docnum += self.read_number() 78 79 # Read the number of positions. 80 81 npositions = self.read_number() 82 83 # Read the position deltas, adding each previous position to get the 84 # appropriate collection of absolute positions. 85 86 i = 0 87 last = 0 88 positions = [] 89 90 while i < npositions: 91 last += self.read_number() 92 positions.append(last) 93 i += 1 94 95 return self.last_docnum, positions 96 97 class PositionIndexWriter(FileWriter): 98 99 "Writing position index information to files." 100 101 def reset(self): 102 self.last_docnum = 0 103 self.last_pos_offset = 0 104 105 def write_positions(self, docnum, pos_offset, count): 106 107 """ 108 Write the given 'docnum, 'pos_offset' and document 'count' to the 109 position index file. 110 """ 111 112 # Write the document number delta. 113 # Write the position file offset delta. 114 # Write the document count. 115 116 output = array('B') 117 vint_to_array(docnum - self.last_docnum, output) 118 vint_to_array(pos_offset - self.last_pos_offset, output) 119 vint_to_array(count, output) 120 121 # Actually write the data. 122 123 output.tofile(self.f) 124 125 self.last_pos_offset = pos_offset 126 self.last_docnum = docnum 127 128 class PositionIndexReader(FileReader): 129 130 "Reading position index information within term-specific regions of a file." 131 132 def reset(self): 133 self.last_docnum = 0 134 self.last_pos_offset = 0 135 136 def read_positions(self): 137 138 """ 139 Read a document number, a position file offset for the position index 140 file, and the number of documents in a section of that file. 141 """ 142 143 # Read the document number delta. 144 145 self.last_docnum += self.read_number() 146 147 # Read the offset delta. 148 149 self.last_pos_offset += self.read_number() 150 151 # Read the document count. 152 153 count = self.read_number() 154 155 return self.last_docnum, self.last_pos_offset, count 156 157 # Iterators for position-related files. 158 159 class IteratorBase: 160 161 "Support for iterating over results." 162 163 def __init__(self, reader): 164 165 "Initialise the iterator using the given 'reader'." 166 167 self.reader = reader 168 self.replenish(0) # no iteration initially permitted 169 170 def replenish(self, count): 171 172 "Replenish the iterator with 'count' results." 173 174 self.count = count 175 self.read_documents = 0 176 177 def __len__(self): 178 179 "Return the total number of results." 180 181 return self.count 182 183 def sort(self): 184 pass # Stored document positions are already sorted. 185 186 def __iter__(self): 187 return self 188 189 class PositionIterator(IteratorBase): 190 191 "Iterating over document positions." 192 193 def replenish(self, count): 194 IteratorBase.replenish(self, count) 195 196 # Fill a cache of positions. 197 198 self.cache = [] 199 n = 0 200 201 while n < self.count: 202 self.cache.append(self.reader.read_positions()) 203 n += 1 204 205 def seek(self, offset, count): 206 207 """ 208 Seek to 'offset' in the file, limiting the number of documents available 209 for reading to 'count'. 210 """ 211 212 self.reader.seek(offset) 213 self.replenish(count) 214 215 def next(self): 216 217 "Read positions for a single document." 218 219 if self.read_documents < self.count: 220 positions = self.cache[self.read_documents] 221 self.read_documents += 1 222 return positions 223 else: 224 raise StopIteration 225 226 class PositionIndexIterator(IteratorBase): 227 228 "Iterating over document positions." 229 230 def replenish(self, count): 231 IteratorBase.replenish(self, count) 232 233 # Fill a cache of offsets. 234 235 self.cache = [] 236 self.current = 0 237 n = 0 238 239 while n < self.count: 240 docnum, pos_offset, section_count = t = self.reader.read_positions() 241 self.cache.append(t) 242 n += section_count 243 244 def seek(self, offset, doc_frequency): 245 246 """ 247 Seek to 'offset' in the file, limiting the number of documents available 248 for reading to 'doc_frequency'. 249 """ 250 251 self.reader.seek(offset) 252 self.replenish(doc_frequency) 253 254 def next(self): 255 256 "Read positions for a single document." 257 258 if self.current < len(self.cache): 259 docnum, pos_offset, self.section_count = t = self.cache[self.current] 260 self.current += 1 261 return t 262 else: 263 raise StopIteration 264 265 class PositionDictionaryWriter: 266 267 "Writing position dictionaries." 268 269 def __init__(self, position_writer, position_index_writer, interval): 270 self.position_writer = position_writer 271 self.position_index_writer = position_index_writer 272 self.interval = interval 273 274 def write_term_positions(self, doc_positions): 275 276 """ 277 Write all 'doc_positions' - a collection of tuples of the form (document 278 number, position list) - to the file. 279 280 Add some records to the index, making dictionary entries. 281 282 Return a tuple containing the offset of the written data, the frequency 283 (number of positions), and document frequency (number of documents) for 284 the term involved. 285 """ 286 287 # Reset the writers. 288 289 self.position_writer.reset() 290 self.position_index_writer.reset() 291 292 # Remember the first index entry offset. 293 294 index_offset = self.position_index_writer.f.tell() 295 296 # Write the positions. 297 298 frequency = 0 299 count = 0 300 301 if doc_positions: 302 303 # Retain the first record offset for a subsequent index entry. 304 305 first_offset = self.position_writer.f.tell() 306 first_docnum = None 307 308 doc_positions.sort() 309 310 for docnum, positions in doc_positions: 311 if first_docnum is None: 312 first_docnum = docnum 313 314 self.position_writer.write_positions(docnum, positions) 315 316 frequency += len(positions) 317 count += 1 318 319 # Every {interval} entries, write an index entry. 320 321 if count % self.interval == 0: 322 323 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 324 325 first_offset = self.position_writer.f.tell() 326 first_docnum = None 327 328 # Reset the position writer so that position readers accessing 329 # a section start with the correct document number. 330 331 self.position_writer.reset() 332 333 # Finish writing an index entry for the remaining documents. 334 335 else: 336 if first_docnum is not None: 337 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 338 339 return index_offset, frequency, count 340 341 def close(self): 342 self.position_writer.close() 343 self.position_index_writer.close() 344 345 class PositionDictionaryReader: 346 347 "Access to position dictionary entries through iterators." 348 349 def __init__(self, position_reader, position_index_reader): 350 self.position_reader = position_reader 351 self.position_index_reader = position_index_reader 352 353 def read_term_positions(self, offset, doc_frequency): 354 iterator = PositionDictionaryIterator( 355 PositionIterator(self.position_reader), 356 PositionIndexIterator(self.position_index_reader) 357 ) 358 iterator.seek(offset, doc_frequency) 359 return iterator 360 361 def close(self): 362 self.position_reader.close() 363 self.position_index_reader.close() 364 365 class PositionDictionaryIterator: 366 367 "Iteration over position dictionary entries." 368 369 def __init__(self, position_iterator, position_index_iterator): 370 self.position_iterator = position_iterator 371 self.position_index_iterator = position_index_iterator 372 self.reset() 373 374 def reset(self): 375 376 # Remember the last values. 377 378 self.found_docnum, self.found_positions = None, None 379 380 # Maintain state for the next index entry, if read. 381 382 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 383 384 def seek(self, offset, doc_frequency): 385 386 """ 387 Seek to 'offset' in the index file, limiting the number of documents 388 available for reading to 'doc_frequency'. 389 """ 390 391 self.reset() 392 393 # Seek to the appropriate index entry. 394 395 self.position_index_iterator.seek(offset, doc_frequency) 396 397 # Initialise the current index entry and current position file iterator. 398 399 self._next_section() 400 self._init_section() 401 402 # Sequence methods. 403 404 def __len__(self): 405 return len(self.position_index_iterator) 406 407 def sort(self): 408 pass 409 410 # Iterator methods. 411 412 def __iter__(self): 413 return self 414 415 def next(self): 416 417 """ 418 Attempt to get the next document record from the section in the 419 positions file. 420 """ 421 422 # Return any visited but unrequested record. 423 424 if self.found_docnum is not None: 425 t = self.found_docnum, self.found_positions 426 self.found_docnum, self.found_positions = None, None 427 return t 428 429 # Or search for the next record. 430 431 while 1: 432 433 # Either return the next record. 434 435 try: 436 return self.position_iterator.next() 437 438 # Or, where a section is finished, get the next section and try again. 439 440 except StopIteration: 441 442 # Although, where a single iterator is in use, the file reader 443 # would be positioned appropriately, this is not guaranteed in a 444 # multiple iterator situation. 445 446 self._next_section() 447 self._init_section() 448 449 def from_document(self, docnum): 450 451 """ 452 Attempt to navigate to a positions entry for the given 'docnum', 453 returning the positions for 'docnum', or None otherwise. 454 """ 455 456 # Return any unrequested document positions. 457 458 if docnum == self.found_docnum: 459 return self.found_positions 460 461 # Read ahead in the index until the next entry refers to a document 462 # later than the desired document. 463 464 try: 465 if self.next_docnum is None: 466 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 467 468 # Read until the next entry is after the desired document number, 469 # or until the end of the results. 470 471 while self.next_docnum <= docnum: 472 self._next_read_section() 473 if self.docnum < docnum: 474 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 475 else: 476 break 477 478 except StopIteration: 479 pass 480 481 # Navigate in the position file to the document. 482 483 self._init_section() 484 485 try: 486 while 1: 487 found_docnum, found_positions = self.position_iterator.next() 488 489 # Return the desired document positions or None (retaining the 490 # positions for the document immediately after). 491 492 if docnum == found_docnum: 493 return found_positions 494 elif docnum < found_docnum: 495 self.found_docnum, self.found_positions = found_docnum, found_positions 496 return None 497 498 except StopIteration: 499 return None 500 501 # Internal methods. 502 503 def _next_section(self): 504 505 "Attempt to get the next section in the index." 506 507 if self.next_docnum is None: 508 self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next() 509 else: 510 self._next_read_section() 511 512 def _next_read_section(self): 513 514 """ 515 Make the next index entry the current one without reading from the 516 index. 517 """ 518 519 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 520 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 521 522 def _init_section(self): 523 524 "Initialise the iterator for the section in the position file." 525 526 # Seek to the position entry. 527 528 self.position_iterator.seek(self.pos_offset, self.section_count) 529 530 # vim: tabstop=4 expandtab shiftwidth=4