1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint, vint_to_array, string_to_array 23 from array import array 24 25 class PositionWriter(FileWriter): 26 27 "Writing position information to files." 28 29 def reset(self): 30 self.last_docnum = 0 31 32 def write_positions(self, docnum, positions): 33 34 """ 35 Write for the document 'docnum' the given 'positions'. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Make sure that the positions are sorted. 42 43 positions.sort() 44 45 # Write the document number delta. 46 # Write the number of positions. 47 48 output = array('B') 49 vint_to_array(docnum - self.last_docnum, output) 50 vint_to_array(len(positions), output) 51 52 # Write the position deltas. 53 54 last = 0 55 56 # Handle tuples incorporating preceding text. 57 58 for position, preceding in positions: 59 vint_to_array(position - last, output) 60 string_to_array(preceding, output) 61 last = position 62 63 output.tofile(self.f) 64 65 self.last_docnum = docnum 66 67 class PositionReader(FileReader): 68 69 "Reading position information within term-specific regions of a file." 70 71 def reset(self): 72 self.last_docnum = 0 73 74 def read_positions(self): 75 76 "Read positions, returning a document number and a list of positions." 77 78 # Read the document number delta and add it to the last number. 79 80 self.last_docnum += self.read_number() 81 82 # Read the number of positions. 83 84 npositions = self.read_number() 85 86 # Read the position deltas, adding each previous position to get the 87 # appropriate collection of absolute positions. 88 89 i = 0 90 last = 0 91 positions = [] 92 93 while i < npositions: 94 last += self.read_number() 95 preceding = self.read_string() 96 positions.append((last, preceding)) 97 i += 1 98 99 return self.last_docnum, positions 100 101 class PositionIndexWriter(FileWriter): 102 103 "Writing position index information to files." 104 105 def reset(self): 106 self.last_docnum = 0 107 self.last_pos_offset = 0 108 109 def write_positions(self, docnum, pos_offset, count): 110 111 """ 112 Write the given 'docnum, 'pos_offset' and document 'count' to the 113 position index file. 114 """ 115 116 # Write the document number delta. 117 # Write the position file offset delta. 118 # Write the document count. 119 120 output = array('B') 121 vint_to_array(docnum - self.last_docnum, output) 122 vint_to_array(pos_offset - self.last_pos_offset, output) 123 vint_to_array(count, output) 124 125 # Actually write the data. 126 127 output.tofile(self.f) 128 129 self.last_pos_offset = pos_offset 130 self.last_docnum = docnum 131 132 class PositionIndexReader(FileReader): 133 134 "Reading position index information within term-specific regions of a file." 135 136 def reset(self): 137 self.last_docnum = 0 138 self.last_pos_offset = 0 139 140 def read_positions(self): 141 142 """ 143 Read a document number, a position file offset for the position index 144 file, and the number of documents in a section of that file. 145 """ 146 147 # Read the document number delta. 148 149 self.last_docnum += self.read_number() 150 151 # Read the offset delta. 152 153 self.last_pos_offset += self.read_number() 154 155 # Read the document count. 156 157 count = self.read_number() 158 159 return self.last_docnum, self.last_pos_offset, count 160 161 # Iterators for position-related files. 162 163 class IteratorBase: 164 165 "Support for iterating over results." 166 167 def __init__(self, reader): 168 169 "Initialise the iterator using the given 'reader'." 170 171 self.reader = reader 172 self.replenish(0) # no iteration initially permitted 173 174 def replenish(self, count): 175 176 "Replenish the iterator with 'count' results." 177 178 self.count = count 179 self.read_documents = 0 180 181 def __len__(self): 182 183 "Return the total number of results." 184 185 return self.count 186 187 def sort(self): 188 pass # Stored document positions are already sorted. 189 190 def __iter__(self): 191 return self 192 193 class PositionIterator(IteratorBase): 194 195 "Iterating over document positions." 196 197 def replenish(self, count): 198 IteratorBase.replenish(self, count) 199 200 # Fill a cache of positions. 201 202 self.cache = [] 203 n = 0 204 205 while n < self.count: 206 self.cache.append(self.reader.read_positions()) 207 n += 1 208 209 def seek(self, offset, count): 210 211 """ 212 Seek to 'offset' in the file, limiting the number of documents available 213 for reading to 'count'. 214 """ 215 216 self.reader.seek(offset) 217 self.replenish(count) 218 219 def next(self): 220 221 "Read positions for a single document." 222 223 if self.read_documents < self.count: 224 positions = self.cache[self.read_documents] 225 self.read_documents += 1 226 return positions 227 else: 228 raise StopIteration 229 230 class PositionIndexIterator(IteratorBase): 231 232 "Iterating over document positions." 233 234 def replenish(self, count): 235 IteratorBase.replenish(self, count) 236 237 # Fill a cache of offsets. 238 239 self.cache = [] 240 self.current = 0 241 n = 0 242 243 while n < self.count: 244 docnum, pos_offset, section_count = t = self.reader.read_positions() 245 self.cache.append(t) 246 n += section_count 247 248 def seek(self, offset, doc_frequency): 249 250 """ 251 Seek to 'offset' in the file, limiting the number of documents available 252 for reading to 'doc_frequency'. 253 """ 254 255 self.reader.seek(offset) 256 self.replenish(doc_frequency) 257 258 def next(self): 259 260 "Read positions for a single document." 261 262 if self.current < len(self.cache): 263 docnum, pos_offset, self.section_count = t = self.cache[self.current] 264 self.current += 1 265 return t 266 else: 267 raise StopIteration 268 269 class PositionDictionaryWriter: 270 271 "Writing position dictionaries." 272 273 def __init__(self, position_writer, position_index_writer, interval): 274 self.position_writer = position_writer 275 self.position_index_writer = position_index_writer 276 self.interval = interval 277 278 def write_term_positions(self, doc_positions): 279 280 """ 281 Write all 'doc_positions' - a collection of tuples of the form (document 282 number, position list) - to the file. 283 284 Add some records to the index, making dictionary entries. 285 286 Return a tuple containing the offset of the written data, the frequency 287 (number of positions), and document frequency (number of documents) for 288 the term involved. 289 """ 290 291 # Reset the writers. 292 293 self.position_writer.reset() 294 self.position_index_writer.reset() 295 296 # Remember the first index entry offset. 297 298 index_offset = self.position_index_writer.f.tell() 299 300 # Write the positions. 301 302 frequency = 0 303 count = 0 304 305 if doc_positions: 306 307 # Retain the first record offset for a subsequent index entry. 308 309 first_offset = self.position_writer.f.tell() 310 first_docnum = None 311 312 doc_positions.sort() 313 314 for docnum, positions in doc_positions: 315 if first_docnum is None: 316 first_docnum = docnum 317 318 self.position_writer.write_positions(docnum, positions) 319 320 frequency += len(positions) 321 count += 1 322 323 # Every {interval} entries, write an index entry. 324 325 if count % self.interval == 0: 326 327 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 328 329 first_offset = self.position_writer.f.tell() 330 first_docnum = None 331 332 # Reset the position writer so that position readers accessing 333 # a section start with the correct document number. 334 335 self.position_writer.reset() 336 337 # Finish writing an index entry for the remaining documents. 338 339 else: 340 if first_docnum is not None: 341 self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 342 343 return index_offset, frequency, count 344 345 def close(self): 346 self.position_writer.close() 347 self.position_index_writer.close() 348 349 class PositionDictionaryReader: 350 351 "Access to position dictionary entries through iterators." 352 353 def __init__(self, position_reader, position_index_reader): 354 self.position_reader = position_reader 355 self.position_index_reader = position_index_reader 356 357 def read_term_positions(self, offset, doc_frequency): 358 iterator = PositionDictionaryIterator( 359 PositionIterator(self.position_reader), 360 PositionIndexIterator(self.position_index_reader) 361 ) 362 iterator.seek(offset, doc_frequency) 363 return iterator 364 365 def close(self): 366 self.position_reader.close() 367 self.position_index_reader.close() 368 369 class PositionDictionaryIterator: 370 371 "Iteration over position dictionary entries." 372 373 def __init__(self, position_iterator, position_index_iterator): 374 self.position_iterator = position_iterator 375 self.position_index_iterator = position_index_iterator 376 self.reset() 377 378 def reset(self): 379 380 # Remember the last values. 381 382 self.found_docnum, self.found_positions = None, None 383 384 # Maintain state for the next index entry, if read. 385 386 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 387 388 def seek(self, offset, doc_frequency): 389 390 """ 391 Seek to 'offset' in the index file, limiting the number of documents 392 available for reading to 'doc_frequency'. 393 """ 394 395 self.reset() 396 397 # Seek to the appropriate index entry. 398 399 self.position_index_iterator.seek(offset, doc_frequency) 400 401 # Initialise the current index entry and current position file iterator. 402 403 self._next_section() 404 self._init_section() 405 406 # Sequence methods. 407 408 def __len__(self): 409 return len(self.position_index_iterator) 410 411 def sort(self): 412 pass 413 414 # Iterator methods. 415 416 def __iter__(self): 417 return self 418 419 def next(self): 420 421 """ 422 Attempt to get the next document record from the section in the 423 positions file. 424 """ 425 426 # Return any visited but unrequested record. 427 428 if self.found_docnum is not None: 429 t = self.found_docnum, self.found_positions 430 self.found_docnum, self.found_positions = None, None 431 return t 432 433 # Or search for the next record. 434 435 while 1: 436 437 # Either return the next record. 438 439 try: 440 return self.position_iterator.next() 441 442 # Or, where a section is finished, get the next section and try again. 443 444 except StopIteration: 445 446 # Although, where a single iterator is in use, the file reader 447 # would be positioned appropriately, this is not guaranteed in a 448 # multiple iterator situation. 449 450 self._next_section() 451 self._init_section() 452 453 def from_document(self, docnum): 454 455 """ 456 Attempt to navigate to a positions entry for the given 'docnum', 457 returning the positions for 'docnum', or None otherwise. 458 """ 459 460 # Return any unrequested document positions. 461 462 if docnum == self.found_docnum: 463 return self.found_positions 464 465 # Read ahead in the index until the next entry refers to a document 466 # later than the desired document. 467 468 try: 469 if self.next_docnum is None: 470 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 471 472 # Read until the next entry is after the desired document number, 473 # or until the end of the results. 474 475 while self.next_docnum <= docnum: 476 self._next_read_section() 477 if self.docnum < docnum: 478 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next() 479 else: 480 break 481 482 except StopIteration: 483 pass 484 485 # Navigate in the position file to the document. 486 487 self._init_section() 488 489 try: 490 while 1: 491 found_docnum, found_positions = self.position_iterator.next() 492 493 # Return the desired document positions or None (retaining the 494 # positions for the document immediately after). 495 496 if docnum == found_docnum: 497 return found_positions 498 elif docnum < found_docnum: 499 self.found_docnum, self.found_positions = found_docnum, found_positions 500 return None 501 502 except StopIteration: 503 return None 504 505 # Internal methods. 506 507 def _next_section(self): 508 509 "Attempt to get the next section in the index." 510 511 if self.next_docnum is None: 512 self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next() 513 else: 514 self._next_read_section() 515 516 def _next_read_section(self): 517 518 """ 519 Make the next index entry the current one without reading from the 520 index. 521 """ 522 523 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 524 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 525 526 def _init_section(self): 527 528 "Initialise the iterator for the section in the position file." 529 530 # Seek to the position entry. 531 532 self.position_iterator.seek(self.pos_offset, self.section_count) 533 534 # vim: tabstop=4 expandtab shiftwidth=4