1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing position information. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.files import * 22 from iixr.data import vint 23 24 class PositionWriter(FileWriter): 25 26 "Writing position information to files." 27 28 def reset(self): 29 self.last_docnum = 0 30 31 def write_positions(self, docnum, positions): 32 33 """ 34 Write for the document 'docnum' the given 'positions'. 35 Return the offset of the written record. 36 """ 37 38 if docnum < self.last_docnum: 39 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 40 41 # Record the offset of this record. 42 43 offset = self.tell() 44 45 # Make sure that the positions are sorted. 46 47 positions.sort() 48 49 # Write the position deltas. 50 51 output = [] 52 last = 0 53 54 for position in positions: 55 output.append(vint(position - last)) 56 last = position 57 58 # Write the document number delta. 59 # Write the number of positions. 60 # Then write the positions. 61 62 self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output)) 63 64 self.last_docnum = docnum 65 return offset 66 67 class PositionOpener(FileOpener): 68 69 "Reading position information from files." 70 71 def read_term_positions(self, offset, count): 72 73 """ 74 Read all positions from 'offset', seeking to that position in the file 75 before reading. The number of documents available for reading is limited 76 to 'count'. 77 """ 78 79 # Duplicate the file handle. 80 81 f = self.open("rb") 82 return PositionIterator(f, offset, count) 83 84 class PositionIndexWriter(FileWriter): 85 86 "Writing position index information to files." 87 88 def reset(self): 89 self.last_docnum = 0 90 self.last_pos_offset = 0 91 92 def write_positions(self, docnum, pos_offset, count): 93 94 """ 95 Write the given 'docnum, 'pos_offset' and document 'count' to the 96 position index file. 97 """ 98 99 # Record the offset of this record. 100 101 offset = self.tell() 102 output = [] 103 104 # Write the document number delta. 105 106 output.append(vint(docnum - self.last_docnum)) 107 self.last_docnum = docnum 108 109 # Write the position file offset delta. 110 111 output.append(vint(pos_offset - self.last_pos_offset)) 112 self.last_pos_offset = pos_offset 113 114 # Write the document count. 115 116 output.append(vint(count)) 117 118 # Actually write the data. 119 120 self.write("".join(output)) 121 122 return offset 123 124 class PositionIndexOpener(FileOpener): 125 126 "Reading position index information from files." 127 128 def read_term_positions(self, offset, doc_frequency): 129 130 """ 131 Read all positions from 'offset', seeking to that position in the file 132 before reading. The number of documents available for reading is limited 133 to 'doc_frequency'. 134 """ 135 136 # Duplicate the file handle. 137 138 f = self.open("rb") 139 return PositionIndexIterator(f, offset, doc_frequency) 140 141 # Iterators for position-related files. 142 143 class IteratorBase: 144 145 def __init__(self, count): 146 self.replenish(count) 147 148 def replenish(self, count): 149 self.count = count 150 self.read_documents = 0 151 152 def __len__(self): 153 return self.count 154 155 def sort(self): 156 pass # Stored document positions are already sorted. 157 158 def __iter__(self): 159 return self 160 161 class PositionIterator(FileReader, IteratorBase): 162 163 "Iterating over document positions." 164 165 def __init__(self, f, offset, count): 166 FileReader.__init__(self, f) 167 IteratorBase.__init__(self, count) 168 self.seek(offset) 169 170 def reset(self): 171 self.last_docnum = 0 172 173 def read_positions(self): 174 175 "Read positions, returning a document number and a list of positions." 176 177 # Read the document number delta and add it to the last number. 178 179 self.last_docnum += self.read_number() 180 181 # Read the number of positions. 182 183 npositions = self.read_number() 184 185 # Read the position deltas, adding each previous position to get the 186 # appropriate collection of absolute positions. 187 188 i = 0 189 last = 0 190 positions = [] 191 192 while i < npositions: 193 last += self.read_number() 194 positions.append(last) 195 i += 1 196 197 return self.last_docnum, positions 198 199 def next(self): 200 201 "Read positions for a single document." 202 203 if self.read_documents < self.count: 204 self.read_documents += 1 205 return self.read_positions() 206 else: 207 raise StopIteration 208 209 class PositionIndexIterator(FileReader, IteratorBase): 210 211 "Iterating over document positions." 212 213 def __init__(self, f, offset, count): 214 FileReader.__init__(self, f) 215 IteratorBase.__init__(self, count) 216 self.seek(offset) 217 self.section_count = 0 218 219 def reset(self): 220 self.last_docnum = 0 221 self.last_pos_offset = 0 222 223 def read_positions(self): 224 225 """ 226 Read a document number, a position file offset for the position index 227 file, and the number of documents in a section of that file. 228 """ 229 230 # Read the document number delta. 231 232 self.last_docnum += self.read_number() 233 234 # Read the offset delta. 235 236 self.last_pos_offset += self.read_number() 237 238 # Read the document count. 239 240 count = self.read_number() 241 242 return self.last_docnum, self.last_pos_offset, count 243 244 def next(self): 245 246 "Read positions for a single document." 247 248 self.read_documents += self.section_count 249 if self.read_documents < self.count: 250 docnum, pos_offset, self.section_count = t = self.read_positions() 251 return t 252 else: 253 raise StopIteration 254 255 class PositionDictionaryWriter: 256 257 "Writing position dictionaries." 258 259 def __init__(self, position_writer, position_index_writer, interval): 260 self.position_writer = position_writer 261 self.position_index_writer = position_index_writer 262 self.interval = interval 263 264 def write_term_positions(self, doc_positions): 265 266 """ 267 Write all 'doc_positions' - a collection of tuples of the form (document 268 number, position list) - to the file. 269 270 Add some records to the index, making dictionary entries. 271 272 Return a tuple containing the offset of the written data, the frequency 273 (number of positions), and document frequency (number of documents) for 274 the term involved. 275 """ 276 277 # Reset the writers. 278 279 self.position_writer.reset() 280 self.position_index_writer.reset() 281 282 index_offset = None 283 284 # Write the positions. 285 286 frequency = 0 287 first_docnum = None 288 first_offset = None 289 count = 0 290 291 doc_positions.sort() 292 293 for docnum, positions in doc_positions: 294 pos_offset = self.position_writer.write_positions(docnum, positions) 295 296 # Retain the first record offset for a subsequent index entry. 297 298 if first_offset is None: 299 first_offset = pos_offset 300 first_docnum = docnum 301 302 frequency += len(positions) 303 count += 1 304 305 # Every {interval} entries, write an index entry. 306 307 if count % self.interval == 0: 308 io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 309 310 # Remember the first index entry offset. 311 312 if index_offset is None: 313 index_offset = io 314 315 first_offset = None 316 first_docnum = None 317 318 # Reset the position writer so that position readers accessing 319 # a section start with the correct document number. 320 321 self.position_writer.reset() 322 323 # Finish writing an index entry for the remaining documents. 324 325 else: 326 if first_offset is not None: 327 io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 328 329 # Remember the first index entry offset. 330 331 if index_offset is None: 332 index_offset = io 333 334 return index_offset, frequency, count 335 336 def close(self): 337 self.position_writer.close() 338 self.position_index_writer.close() 339 340 class PositionDictionaryReader: 341 342 "Reading position dictionaries." 343 344 def __init__(self, position_opener, position_index_opener): 345 self.position_opener = position_opener 346 self.position_index_opener = position_index_opener 347 348 def read_term_positions(self, offset, doc_frequency): 349 350 """ 351 Return an iterator for dictionary entries starting at 'offset' with the 352 given 'doc_frequency'. 353 """ 354 355 return PositionDictionaryIterator(self.position_opener, 356 self.position_index_opener, offset, doc_frequency) 357 358 def close(self): 359 pass 360 361 class PositionDictionaryIterator: 362 363 "Iteration over position dictionary entries." 364 365 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 366 self.position_opener = position_opener 367 self.doc_frequency = doc_frequency 368 self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency) 369 self.iterator = None 370 371 # Remember the last values. 372 373 self.found_docnum, self.found_positions = None, None 374 375 # Maintain state for the next index entry, if read. 376 377 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 378 379 # Initialise the current index entry and current position file iterator. 380 381 self._next_section() 382 self._init_section() 383 384 # Sequence methods. 385 386 def __len__(self): 387 return self.doc_frequency 388 389 def sort(self): 390 pass 391 392 # Iterator methods. 393 394 def __iter__(self): 395 return self 396 397 def next(self): 398 399 """ 400 Attempt to get the next document record from the section in the 401 positions file. 402 """ 403 404 # Return any visited but unrequested record. 405 406 if self.found_docnum is not None: 407 t = self.found_docnum, self.found_positions 408 self.found_docnum, self.found_positions = None, None 409 return t 410 411 # Or search for the next record. 412 413 while 1: 414 415 # Either return the next record. 416 417 try: 418 return self.iterator.next() 419 420 # Or, where a section is finished, get the next section and try again. 421 422 except StopIteration: 423 424 # Where a section follows, update the index iterator, but keep 425 # reading using the same file iterator (since the data should 426 # just follow on from the last section). 427 428 self._next_section() 429 self.iterator.replenish(self.section_count) 430 431 # Reset the state of the iterator to make sure that document 432 # numbers are correct. 433 434 self.iterator.reset() 435 436 def from_document(self, docnum): 437 438 """ 439 Attempt to navigate to a positions entry for the given 'docnum', 440 returning the positions for 'docnum', or None otherwise. 441 """ 442 443 # Return any unrequested document positions. 444 445 if docnum == self.found_docnum: 446 return self.found_positions 447 448 # Read ahead in the index until the next entry refers to a document 449 # later than the desired document. 450 451 try: 452 if self.next_docnum is None: 453 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 454 455 # Read until the next entry is after the desired document number, 456 # or until the end of the results. 457 458 while self.next_docnum <= docnum: 459 self._next_read_section() 460 if self.docnum < docnum: 461 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 462 else: 463 break 464 465 except StopIteration: 466 pass 467 468 # Navigate in the position file to the document. 469 470 self._init_section() 471 472 try: 473 while 1: 474 found_docnum, found_positions = self.iterator.next() 475 476 # Return the desired document positions or None (retaining the 477 # positions for the document immediately after). 478 479 if docnum == found_docnum: 480 return found_positions 481 elif docnum < found_docnum: 482 self.found_docnum, self.found_positions = found_docnum, found_positions 483 return None 484 485 except StopIteration: 486 return None 487 488 # Internal methods. 489 490 def _next_section(self): 491 492 "Attempt to get the next section in the index." 493 494 if self.next_docnum is None: 495 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 496 else: 497 self._next_read_section() 498 499 def _next_read_section(self): 500 501 """ 502 Make the next index entry the current one without reading from the 503 index. 504 """ 505 506 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 507 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 508 509 def _init_section(self): 510 511 "Initialise the iterator for the section in the position file." 512 513 if self.iterator is not None: 514 self.iterator.close() 515 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 516 517 def close(self): 518 if self.iterator is not None: 519 self.iterator.close() 520 self.iterator = None 521 if self.index_iterator is not None: 522 self.index_iterator.close() 523 self.index_iterator = None 524 525 # vim: tabstop=4 expandtab shiftwidth=4