#!/usr/bin/env python

"""
Specific classes for storing position information.

Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.files import *
from iixr.data import vint, vint_to_array
from array import array

# NOTE(review): FileWriter and FileReader are presumably provided by the
# wildcard import from iixr.files; this module relies on their 'f' attribute
# (the underlying file), and on 'read_number' and 'close' - confirm there.

class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Restart document number delta encoding from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        The 'positions' collection is sorted in place (via its 'sort' method)
        before being written. The record consists of the document number
        delta, the number of positions, and then the position deltas.

        A ValueError is raised if 'docnum' is less than the document number
        most recently written since the last reset.
        """

        if docnum < self.last_docnum:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

        # Make sure that the positions are sorted.

        positions.sort()

        # Write the document number delta.
        # Write the number of positions.

        output = array('B')
        vint_to_array(docnum - self.last_docnum, output)
        vint_to_array(len(positions), output)

        # Write the position deltas.

        last = 0

        for position in positions:
            vint_to_array(position - last, output)
            last = position

        # Actually write the data.

        output.tofile(self.f)

        self.last_docnum = docnum

class PositionIndexWriter(FileWriter):

    "Writing position index information to files."

    def reset(self):

        "Restart document number and position offset delta encoding from zero."

        self.last_docnum = 0
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and document 'count' to the
        position index file.

        The document number and the offset are stored as deltas relative to
        the values given in the previous call since the last reset; the count
        is stored as given.
        """

        # Write the document number delta.
        # Write the position file offset delta.
        # Write the document count.

        output = array('B')
        vint_to_array(docnum - self.last_docnum, output)
        vint_to_array(pos_offset - self.last_pos_offset, output)
        vint_to_array(count, output)

        # Actually write the data.

        output.tofile(self.f)

        self.last_pos_offset = pos_offset
        self.last_docnum = docnum

# Iterators for position-related files.

class IteratorBase:

    """
    Common support for iterators limited to a given number of documents.

    Subclasses provide the 'next' method.
    """

    def __init__(self, count):
        self.replenish(count)

    def replenish(self, count):

        "Permit 'count' documents to be read, forgetting any read so far."

        self.count = count
        self.read_documents = 0

    def __len__(self):
        return self.count

    def sort(self):
        pass # Stored document positions are already sorted.

    def __iter__(self):
        return self

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

class PositionReader(FileReader, IteratorBase):

    "Iterating over document positions."

    def __init__(self, f):
        FileReader.__init__(self, f)
        IteratorBase.__init__(self, 0) # no iteration initially permitted
        self.reset()

    def reset(self):

        "Restart document number delta decoding from zero."

        self.last_docnum = 0

    def seek(self, offset, count):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'count'.
        """

        self.f.seek(offset)
        self.replenish(count)
        self.reset()

    def read_positions(self):

        "Read positions, returning a document number and a list of positions."

        # Read the document number delta and add it to the last number.

        self.last_docnum += self.read_number()

        # Read the number of positions.

        npositions = self.read_number()

        # Read the position deltas, adding each previous position to get the
        # appropriate collection of absolute positions.

        last = 0
        positions = []

        for _ in range(npositions):
            last += self.read_number()
            positions.append(last)

        return self.last_docnum, positions

    def next(self):

        """
        Read positions for a single document, raising StopIteration once the
        permitted number of documents has been read.
        """

        if self.read_documents < self.count:
            self.read_documents += 1
            return self.read_positions()
        else:
            raise StopIteration

class PositionIndexReader(FileReader, IteratorBase):

    "Iterating over position index entries."

    def __init__(self, f):
        FileReader.__init__(self, f)
        IteratorBase.__init__(self, 0) # no iteration initially permitted
        self.reset()

    def reset(self):

        "Restart delta decoding from zero and forget the last section count."

        self.last_docnum = 0
        self.last_pos_offset = 0
        self.section_count = 0

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'doc_frequency'.
        """

        self.f.seek(offset)
        self.replenish(doc_frequency)
        self.reset()

    def read_positions(self):

        """
        Read a single index entry, returning a document number, an offset into
        the position file, and the number of documents in the section of the
        position file starting at that offset.
        """

        # Read the document number delta.

        self.last_docnum += self.read_number()

        # Read the offset delta.

        self.last_pos_offset += self.read_number()

        # Read the document count.

        count = self.read_number()

        return self.last_docnum, self.last_pos_offset, count

    def next(self):

        """
        Read the next index entry, returning a (document number, position file
        offset, section document count) tuple.

        StopIteration is raised once the sections already returned account for
        all of the permitted documents.
        """

        # Account for the documents covered by the previously returned section.

        self.read_documents += self.section_count

        if self.read_documents < self.count:
            entry = self.read_positions()
            self.section_count = entry[2]
            return entry
        else:
            #assert self.read_documents == self.count # not upheld by from_document
            raise StopIteration

class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', writing an index entry for every 'interval'
        documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file. The collection is sorted in place
        (via its 'sort' method) before being written.

        Add some records to the index, making dictionary entries.

        Return a tuple containing the offset of the first index entry written
        (an offset in the position index file), the frequency (number of
        positions), and document frequency (number of documents) for the term
        involved.
        """

        # Reset the writers.

        self.position_writer.reset()
        self.position_index_writer.reset()

        # Remember the first index entry offset.

        index_offset = self.position_index_writer.f.tell()

        # Write the positions.

        frequency = 0
        count = 0

        if doc_positions:

            # Retain the first record offset for a subsequent index entry.

            first_offset = self.position_writer.f.tell()
            first_docnum = None

            doc_positions.sort()

            for docnum, positions in doc_positions:
                if first_docnum is None:
                    first_docnum = docnum

                self.position_writer.write_positions(docnum, positions)

                frequency += len(positions)
                count += 1

                # Every {interval} entries, write an index entry.

                if count % self.interval == 0:

                    self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)

                    first_offset = self.position_writer.f.tell()
                    first_docnum = None

                    # Reset the position writer so that position readers accessing
                    # a section start with the correct document number.

                    self.position_writer.reset()

            # Finish writing an index entry for the remaining documents. A
            # first document number is only retained where documents have been
            # written since the last index entry.

            if first_docnum is not None:
                self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)

        return index_offset, frequency, count

    def close(self):

        "Close both underlying writers."

        self.position_writer.close()
        self.position_index_writer.close()

class PositionDictionaryReader:

    "Iteration over position dictionary entries."

    def __init__(self, position_reader, position_index_reader):

        """
        Initialise the reader with the given 'position_reader' and
        'position_index_reader'.
        """

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader
        self.reset()

    def reset(self):

        "Forget any retained document record and any read-ahead index entry."

        # Remember the last values.

        self.found_docnum, self.found_positions = None, None

        # Maintain state for the next index entry, if read.

        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the index file, limiting the number of documents
        available for reading to 'doc_frequency'.
        """

        self.reset()

        # Seek to the appropriate index entry.

        self.position_index_reader.seek(offset, doc_frequency)

        # Initialise the current index entry and current position file reader.

        self._next_section()
        self._init_section()

    # Sequence methods.

    def __len__(self):
        return len(self.position_index_reader)

    def sort(self):
        pass

    # Iterator methods.

    def __iter__(self):
        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file, returning a (document number, positions) tuple.

        StopIteration is raised when the index provides no further sections.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            t = self.found_docnum, self.found_positions
            self.found_docnum, self.found_positions = None, None
            return t

        # Or search for the next record.

        while True:

            # Either return the next record.

            try:
                return self.position_reader.next()

            # Or, where a section is finished, get the next section and try again.

            except StopIteration:

                # Where a section follows, update the index reader, but keep
                # reading using the same file reader (since the data should just
                # follow on from the last section). Where no section follows,
                # the StopIteration from the index reader ends the iteration.

                self._next_section()
                self.position_reader.replenish(self.section_count)

                # Reset the state of the reader to make sure that document
                # numbers are correct.

                self.position_reader.reset()

    def __next__(self):

        "Support the Python 3 iterator protocol by delegating to 'next'."

        return self.next()

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_reader.next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()
                if self.docnum < docnum:
                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_reader.next()
                else:
                    break

        except StopIteration:
            pass

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while True:
                found_docnum, found_positions = self.position_reader.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum == found_docnum:
                    return found_positions
                elif docnum < found_docnum:
                    self.found_docnum, self.found_positions = found_docnum, found_positions
                    return None

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        "Attempt to get the next section in the index."

        if self.next_docnum is None:
            self.docnum, self.pos_offset, self.section_count = self.position_index_reader.next()
        else:
            self._next_read_section()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def _init_section(self):

        "Initialise the reader for the section in the position file."

        # Seek to the position entry.

        self.position_reader.seek(self.pos_offset, self.section_count)

    def close(self):

        "Close both underlying readers."

        self.position_reader.close()
        self.position_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4