1.1 --- a/iixr/positions.py Sat Feb 12 01:23:58 2011 +0100
1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
1.3 @@ -1,566 +0,0 @@
1.4 -#!/usr/bin/env python
1.5 -
1.6 -"""
1.7 -Specific classes for storing position information.
1.8 -
1.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
1.10 -
1.11 -This program is free software; you can redistribute it and/or modify it under
1.12 -the terms of the GNU General Public License as published by the Free Software
1.13 -Foundation; either version 3 of the License, or (at your option) any later
1.14 -version.
1.15 -
1.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 -
1.20 -You should have received a copy of the GNU General Public License along
1.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 -"""
1.23 -
1.24 -from iixr.data import *
1.25 -from iixr.files import *
1.26 -
class PositionWriter(FileWriter):

    "Writing position information to files."

    def begin(self, docnum_size, position_size):

        """
        Write a header record containing 'docnum_size' and 'position_size',
        then remember both sizes and the offset at which the data following the
        header starts.
        """

        self.write_numbers((docnum_size, position_size))
        self.end_record()
        self.data_start = self.tell()
        self.docnum_size = docnum_size
        self.position_size = position_size

    def reset(self):

        """
        End the current record and forget the last document number, so that the
        next document number is written in full instead of as a delta.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None

    def write_positions(self, docnum, positions):

        """
        Write for the document 'docnum' the given 'positions'.

        Nothing is written when 'positions' is empty. The 'positions' list is
        sorted in place before being written.

        The first document number written after a reset is written as given;
        each later one is written as the result of applying the subtractor to
        it and the previously written document number.

        Raise ValueError if 'docnum' is less than the previously written
        document number.
        """

        if not positions:
            return

        # Make sure that the positions are sorted.
        # NOTE: This modifies the caller's list.

        positions.sort()

        # Calculate an ongoing delta.

        if self.last_docnum is not None:
            if docnum < self.last_docnum:
                raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

            docnum_seq = self.subtractor(docnum, self.last_docnum)

        # Or preserve the document number and prepare for future deltas.

        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        self.write_sequence_value(docnum_seq, self.docnum_size)
        self.write_monotonic_sequence(positions, self.position_size)

        self.last_docnum = docnum
1.74 -
class PositionReader(FileReader):

    "Reading position information within term-specific regions of a file."

    def begin(self):

        """
        Begin a record and read the document number and position sizes from
        it, using zero for both if the end of the file is encountered. Remember
        the offset reached afterwards as the start of the data.
        """

        self.begin_record()

        try:
            sizes = self.read_numbers(2)
        except EOFError:
            sizes = 0, 0 # NOTE: No positions!

        self.docnum_size, self.position_size = sizes
        self.data_start = self.tell()

    def reset(self):

        """
        Forget the last document number so that the next value read is taken
        as a complete document number, then begin a new record.
        """

        self.adder = None
        self.last_docnum = None
        self.begin_record()

    def read_positions(self):

        """
        Read positions, returning a document number and a list of positions.
        """

        # The stored value is either a complete document number or a delta.

        value = self.read_sequence_value(self.docnum_size)

        # With no previous document number, the value is complete: keep it and
        # obtain an adder for the deltas that follow.

        if self.last_docnum is None:
            self.adder = get_adder(value)
            self.last_docnum = value

        # Otherwise, combine the delta with the previous document number.

        else:
            self.last_docnum = self.adder(value, self.last_docnum)

        doc_positions = self.read_monotonic_sequence(self.position_size)

        return self.last_docnum, doc_positions
1.116 -
class PositionIndexWriter(PositionWriter):

    "Writing position index information to files."

    def begin(self, docnum_size):

        """
        Write the header record using 'docnum_size' and a position size of
        zero.
        """

        PositionWriter.begin(self, docnum_size, 0)

    def reset(self):

        """
        Reset the document number state and restart position file offset
        deltas from zero.
        """

        PositionWriter.reset(self)
        self.last_pos_offset = 0

    def write_positions(self, docnum, pos_offset, count):

        """
        Write the given 'docnum', 'pos_offset' and 'count' to the position
        index file.

        The first document number written after a reset is written as given;
        each later one is written as the result of applying the subtractor to
        it and the previously written document number. The 'pos_offset' is
        written as the difference from the previously written offset, and the
        'count' is written as given.

        Raise ValueError if 'docnum' is less than the previously written
        document number, as is done when writing positions themselves.
        """

        # Calculate an ongoing delta.

        if self.last_docnum is not None:
            if docnum < self.last_docnum:
                raise ValueError("Document number %r is less than previous number %r." % (docnum, self.last_docnum))

            docnum_seq = self.subtractor(docnum, self.last_docnum)

        # Or preserve the document number and prepare for future deltas.

        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        self.write_sequence_value(docnum_seq, self.docnum_size)
        self.write_number(pos_offset - self.last_pos_offset)
        self.write_number(count)

        self.last_docnum = docnum
        self.last_pos_offset = pos_offset
1.149 -
class PositionIndexReader(PositionReader):

    "Reading position index information within term-specific regions of a file."

    def reset(self):

        """
        Reset the document number state and restart the accumulation of
        position file offsets from zero.
        """

        PositionReader.reset(self)
        self.last_pos_offset = 0

    def read_positions(self):

        """
        Read a document number, a position file offset for the position index
        file, and the number of documents in a section of that file.
        """

        # The stored value is either a complete document number or a delta.

        value = self.read_sequence_value(self.docnum_size)

        if self.last_docnum is None:
            self.adder = get_adder(value)
            self.last_docnum = value
        else:
            self.last_docnum = self.adder(value, self.last_docnum)

        # Offsets are stored as deltas, so accumulate them.

        self.last_pos_offset = self.last_pos_offset + self.read_number()

        # The document count is stored as is.

        section_count = self.read_number()

        return self.last_docnum, self.last_pos_offset, section_count
1.184 -
1.185 -# Iterators for position-related files.
1.186 -
1.187 -class IteratorBase:
1.188 -
1.189 - "Support for iterating over results."
1.190 -
1.191 - def __init__(self, reader):
1.192 -
1.193 - "Initialise the iterator using the given 'reader'."
1.194 -
1.195 - self.reader = reader
1.196 - self.replenish(0) # no iteration initially permitted
1.197 -
1.198 - def replenish(self, count):
1.199 -
1.200 - "Replenish the iterator with 'count' results."
1.201 -
1.202 - self.count = count
1.203 - self.read_documents = 0
1.204 -
1.205 - def __len__(self):
1.206 -
1.207 - "Return the total number of results."
1.208 -
1.209 - return self.count
1.210 -
1.211 - def sort(self):
1.212 - pass # Stored document positions are already sorted.
1.213 -
1.214 - def __iter__(self):
1.215 - return self
1.216 -
class PositionIterator(IteratorBase):

    "Iterating over document positions."

    def replenish(self, count):

        """
        Make 'count' documents available, reading their positions from the
        reader immediately and holding them in a cache.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of positions.

        self.cache = []

        for _ in range(self.count):
            self.cache.append(self.reader.read_positions())

    def seek(self, offset, count):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'count'.
        """

        self.reader.seek(offset)
        self.replenish(count)

    def next(self):

        """
        Return the next cached result of reading positions for a single
        document, raising StopIteration when all available documents have been
        returned.
        """

        if self.read_documents >= self.count:
            raise StopIteration

        positions = self.cache[self.read_documents]
        self.read_documents += 1
        return positions

    # Support the iterator protocol where __next__ is required instead of next.

    __next__ = next
1.253 -
class PositionIndexIterator(IteratorBase):

    "Iterating over position index entries."

    def replenish(self, count):

        """
        Make index entries available for 'count' documents, reading entries
        from the reader immediately and holding them in a cache. Entries are
        read until the section counts they provide add up to at least 'count'.
        """

        IteratorBase.replenish(self, count)

        # Fill a cache of offsets.

        self.cache = []
        self.current = 0
        n = 0

        # Each entry accounts for a number of documents, not just one.

        while n < self.count:
            entry = self.reader.read_positions()
            docnum, pos_offset, section_count = entry
            self.cache.append(entry)
            n += section_count

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the file, limiting the number of documents available
        for reading to 'doc_frequency'.
        """

        self.reader.seek(offset)
        self.replenish(doc_frequency)

    def next(self):

        """
        Return the next cached index entry as a tuple of the form (document
        number, position file offset, section count), raising StopIteration
        when all cached entries have been returned. The section count of the
        returned entry is also retained as an attribute.
        """

        if self.current >= len(self.cache):
            raise StopIteration

        entry = self.cache[self.current]
        docnum, pos_offset, self.section_count = entry
        self.current += 1
        return entry

    # Support the iterator protocol where __next__ is required instead of next.

    __next__ = next
1.292 -
class PositionDictionaryWriter:

    "Writing position dictionaries."

    def __init__(self, position_writer, position_index_writer, interval):

        """
        Initialise the writer with the given 'position_writer' and
        'position_index_writer', writing an index entry for every 'interval'
        documents.
        """

        self.position_writer = position_writer
        self.position_index_writer = position_index_writer
        self.interval = interval

    def write_term_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file.

        Add some records to the index, making dictionary entries.

        Return a tuple containing the offset of the written data, the frequency
        (number of positions), and document frequency (number of documents) for
        the term involved.

        The 'doc_positions' list is sorted in place. Documents having no
        positions are ignored and are not counted, since nothing is written to
        the position file for them. Where no documents remain, nothing is
        written and the current index file offset is returned together with
        zero for both frequencies.
        """

        if doc_positions:
            doc_positions.sort()

            # Only documents with positions produce position records, so only
            # these may be counted or given index entries.

            entries = [(docnum, positions) for (docnum, positions) in doc_positions if positions]
        else:
            entries = []

        if not entries:
            return self.position_index_writer.tell(), 0, 0

        # Look ahead at the first document record.
        # NOTE: Any iterator would need to support this.

        first_docnum, first_positions = entries[0]
        first_position = first_positions[0]

        # Write out size details.

        docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)
        self.position_writer.begin(docnum_size, position_size)
        self.position_index_writer.begin(docnum_size)

        # Reset the writers.

        self.position_writer.reset()
        self.position_index_writer.reset()

        # Remember the first index entry offset.

        index_offset = self.position_index_writer.tell()

        # Retain the first record offset for a subsequent index entry.

        section_offset = self.position_writer.tell()

        # The first document number of the current section, or None if no
        # document has been written to the section yet.

        section_docnum = None

        frequency = 0
        count = 0

        for docnum, positions in entries:
            if section_docnum is None:
                section_docnum = docnum

            self.position_writer.write_positions(docnum, positions)

            frequency += len(positions)
            count += 1

            # Every {interval} entries, write an index entry.

            if count % self.interval == 0:

                self.position_index_writer.write_positions(section_docnum, section_offset, self.interval)

                # Reset the position writer so that position readers accessing
                # a section start with the correct document number.

                self.position_writer.reset()

                section_offset = self.position_writer.tell()
                section_docnum = None

        # Finish writing an index entry for the remaining documents.

        if section_docnum is not None:
            self.position_index_writer.write_positions(section_docnum, section_offset, count % self.interval)

        return index_offset, frequency, count

    def close(self):

        "Close the position writer and the position index writer."

        self.position_writer.close()
        self.position_index_writer.close()
1.382 -
class PositionDictionaryReader:

    "Access to position dictionary entries through iterators."

    def __init__(self, position_reader, position_index_reader):

        """
        Initialise the reader with the given 'position_reader' and
        'position_index_reader'.
        """

        self.position_reader = position_reader
        self.position_index_reader = position_index_reader

    def read_term_positions(self, offset, doc_frequency):

        """
        Return a new iterator over position dictionary entries which has been
        sought to 'offset' with 'doc_frequency' documents available.
        """

        positions = PositionIterator(self.position_reader)
        index = PositionIndexIterator(self.position_index_reader)

        term_iterator = PositionDictionaryIterator(positions, index)
        term_iterator.seek(offset, doc_frequency)
        return term_iterator

    def close(self):

        "Close the position reader and the position index reader."

        self.position_reader.close()
        self.position_index_reader.close()
1.402 -
class PositionDictionaryIterator:

    "Iteration over position dictionary entries."

    def __init__(self, position_iterator, position_index_iterator):

        """
        Initialise the iterator with the given 'position_iterator' providing
        document records and 'position_index_iterator' providing index entries.
        """

        self.position_iterator = position_iterator
        self.position_index_iterator = position_index_iterator
        self.reset()

    def reset(self):

        "Forget any retained document record and any read-ahead index entry."

        # Remember the last values.

        self.found_docnum, self.found_positions = None, None

        # Maintain state for the next index entry, if read.

        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def seek(self, offset, doc_frequency):

        """
        Seek to 'offset' in the index file, limiting the number of documents
        available for reading to 'doc_frequency'.
        """

        self.reset()

        # Seek to the appropriate index entry.

        self.position_index_iterator.seek(offset, doc_frequency)

        # Initialise the current index entry and current position file iterator.

        self._next_section()
        self._init_section()

    # Sequence methods.

    def __len__(self):

        "Return the length reported by the position index iterator."

        return len(self.position_index_iterator)

    def sort(self):

        "Do nothing."

        pass

    # Iterator methods.

    def __iter__(self):

        "Return this object as its own iterator."

        return self

    def next(self):

        """
        Attempt to get the next document record from the section in the
        positions file.

        Return a record retained by 'from_document' if there is one; otherwise
        return the next record from the current section, moving on to following
        sections as each one is exhausted. StopIteration is raised when no
        further index entry is available.
        """

        # Return any visited but unrequested record.

        if self.found_docnum is not None:
            t = self.found_docnum, self.found_positions
            self.found_docnum, self.found_positions = None, None
            return t

        # Or search for the next record.

        while True:

            # Either return the next record.

            try:
                return self.position_iterator.next()

            # Or, where a section is finished, get the next section and try again.

            except StopIteration:

                # Although, where a single iterator is in use, the file reader
                # would be positioned appropriately, this is not guaranteed in a
                # multiple iterator situation.

                self._next_section()
                self._init_section()

    # Support the iterator protocol where __next__ is required instead of next.

    __next__ = next

    def from_document(self, docnum):

        """
        Attempt to navigate to a positions entry for the given 'docnum',
        returning the positions for 'docnum', or None otherwise.

        Where a record for a document number not less than 'docnum' is
        encountered, it is retained so that 'next' returns it first.
        """

        # Return any unrequested document positions.

        if docnum == self.found_docnum:
            return self.found_positions

        # Read ahead in the index until the next entry refers to a document
        # later than the desired document.

        try:
            if self.next_docnum is None:
                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()

            # Read until the next entry is after the desired document number,
            # or until the end of the results.

            while self.next_docnum <= docnum:
                self._next_read_section()
                if self.docnum < docnum:
                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
                else:
                    break

        # With the index exhausted, the current section is the last candidate.

        except StopIteration:
            pass

        # Navigate in the position file to the document.

        self._init_section()

        try:
            while True:
                found_docnum, found_positions = self.position_iterator.next()

                # Return the desired document positions or None (retaining the
                # positions for the document immediately after).

                if docnum <= found_docnum:
                    self.found_docnum, self.found_positions = found_docnum, found_positions
                    if docnum == found_docnum:
                        return found_positions
                    else:
                        return None

        # The section has no record at or after the desired document.

        except StopIteration:
            return None

    # Internal methods.

    def _next_section(self):

        """
        Attempt to get the next section in the index, using any index entry
        already read ahead before reading from the index itself.
        """

        if self.next_docnum is None:
            self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
        else:
            self._next_read_section()

    def _next_read_section(self):

        """
        Make the next index entry the current one without reading from the
        index.
        """

        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None

    def _init_section(self):

        "Initialise the iterator for the section in the position file."

        # Seek to the position entry.

        self.position_iterator.seek(self.pos_offset, self.section_count)
1.568 -
1.569 -# vim: tabstop=4 expandtab shiftwidth=4