1.1 --- a/iixr/positions.py	Sat Feb 12 01:23:58 2011 +0100
     1.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.3 @@ -1,566 +0,0 @@
     1.4 -#!/usr/bin/env python
     1.5 -
     1.6 -"""
     1.7 -Specific classes for storing position information.
     1.8 -
     1.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
    1.10 -
    1.11 -This program is free software; you can redistribute it and/or modify it under
    1.12 -the terms of the GNU General Public License as published by the Free Software
    1.13 -Foundation; either version 3 of the License, or (at your option) any later
    1.14 -version.
    1.15 -
    1.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
    1.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    1.18 -PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    1.19 -
    1.20 -You should have received a copy of the GNU General Public License along
    1.21 -with this program.  If not, see <http://www.gnu.org/licenses/>.
    1.22 -"""
    1.23 -
    1.24 -from iixr.data import *
    1.25 -from iixr.files import *
    1.26 -
    1.27 -class PositionWriter(FileWriter):
    1.28 -
    1.29 -    "Writing position information to files."
    1.30 -
    1.31 -    def begin(self, docnum_size, position_size):
    1.32 -        self.write_numbers((docnum_size, position_size))
    1.33 -        self.end_record()
    1.34 -        self.data_start = self.tell()
    1.35 -        self.docnum_size = docnum_size
    1.36 -        self.position_size = position_size
    1.37 -
    1.38 -    def reset(self):
    1.39 -        self.end_record()
    1.40 -        self.last_docnum = None
    1.41 -        self.subtractor = None
    1.42 -
    1.43 -    def write_positions(self, docnum, positions):
    1.44 -
    1.45 -        """
    1.46 -        Write for the document 'docnum' the given 'positions'.
    1.47 -        """
    1.48 -
    1.49 -        if not positions:
    1.50 -            return
    1.51 -
    1.52 -        # Make sure that the positions are sorted.
    1.53 -
    1.54 -        positions.sort()
    1.55 -
    1.56 -        # Calculate an ongoing delta.
    1.57 -
    1.58 -        if self.last_docnum is not None:
    1.59 -            if docnum < self.last_docnum:
    1.60 -                raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
    1.61 -
    1.62 -            docnum_seq = self.subtractor(docnum, self.last_docnum)
    1.63 -
    1.64 -        # Or preserve the document number and prepare for future deltas.
    1.65 -
    1.66 -        else:
    1.67 -            self.subtractor = get_subtractor(docnum)
    1.68 -            docnum_seq = docnum
    1.69 -
    1.70 -        self.write_sequence_value(docnum_seq, self.docnum_size)
    1.71 -        self.write_monotonic_sequence(positions, self.position_size)
    1.72 -
    1.73 -        self.last_docnum = docnum
    1.74 -
    1.75 -class PositionReader(FileReader):
    1.76 -
    1.77 -    "Reading position information within term-specific regions of a file."
    1.78 -
    1.79 -    def begin(self):
    1.80 -        self.begin_record()
    1.81 -        try:
    1.82 -            self.docnum_size, self.position_size = self.read_numbers(2)
    1.83 -        except EOFError:
    1.84 -            self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
    1.85 -        self.data_start = self.tell()
    1.86 -
    1.87 -    def reset(self):
    1.88 -        self.last_docnum = None
    1.89 -        self.adder = None
    1.90 -        self.begin_record()
    1.91 -
    1.92 -    def read_positions(self):
    1.93 -
    1.94 -        """
    1.95 -        Read positions, returning a document number and a list of positions.
    1.96 -        """
    1.97 -
    1.98 -        # Read the document number.
    1.99 -
   1.100 -        docnum = self.read_sequence_value(self.docnum_size)
   1.101 -
   1.102 -        # Calculate an ongoing delta.
   1.103 -
   1.104 -        if self.last_docnum is not None:
   1.105 -            self.last_docnum = self.adder(docnum, self.last_docnum)
   1.106 -
   1.107 -        # Or preserve the document number and prepare for future deltas.
   1.108 -
   1.109 -        else:
   1.110 -            self.adder = get_adder(docnum)
   1.111 -            self.last_docnum = docnum
   1.112 -
   1.113 -        positions = self.read_monotonic_sequence(self.position_size)
   1.114 -
   1.115 -        return self.last_docnum, positions
   1.116 -
   1.117 -class PositionIndexWriter(PositionWriter):
   1.118 -
   1.119 -    "Writing position index information to files."
   1.120 -
   1.121 -    def begin(self, docnum_size):
   1.122 -        PositionWriter.begin(self, docnum_size, 0)
   1.123 -
   1.124 -    def reset(self):
   1.125 -        PositionWriter.reset(self)
   1.126 -        self.last_pos_offset = 0
   1.127 -
   1.128 -    def write_positions(self, docnum, pos_offset, count):
   1.129 -
   1.130 -        """
   1.131 -        Write the given 'docnum, 'pos_offset' and document 'count' to the
   1.132 -        position index file.
   1.133 -        """
   1.134 -
   1.135 -        # Find the size of document number values.
   1.136 -
   1.137 -        if self.last_docnum is not None:
   1.138 -            docnum_seq = self.subtractor(docnum, self.last_docnum)
   1.139 -        else:
   1.140 -            self.subtractor = get_subtractor(docnum)
   1.141 -            docnum_seq = docnum
   1.142 -
   1.143 -        self.write_sequence_value(docnum_seq, self.docnum_size)
   1.144 -        self.write_number(pos_offset - self.last_pos_offset)
   1.145 -        self.write_number(count)
   1.146 -
   1.147 -        self.last_docnum = docnum
   1.148 -        self.last_pos_offset = pos_offset
   1.149 -
   1.150 -class PositionIndexReader(PositionReader):
   1.151 -
   1.152 -    "Reading position index information within term-specific regions of a file."
   1.153 -
   1.154 -    def reset(self):
   1.155 -        PositionReader.reset(self)
   1.156 -        self.last_pos_offset = 0
   1.157 -
   1.158 -    def read_positions(self):
   1.159 -
   1.160 -        """
   1.161 -        Read a document number, a position file offset for the position index
   1.162 -        file, and the number of documents in a section of that file.
   1.163 -        """
   1.164 -
   1.165 -        # Read the document number.
   1.166 -
   1.167 -        docnum = self.read_sequence_value(self.docnum_size)
   1.168 -
   1.169 -        if self.last_docnum is not None:
   1.170 -            self.last_docnum = self.adder(docnum, self.last_docnum)
   1.171 -        else:
   1.172 -            self.adder = get_adder(docnum)
   1.173 -            self.last_docnum = docnum
   1.174 -
   1.175 -        # Read the offset delta.
   1.176 -
   1.177 -        self.last_pos_offset += self.read_number()
   1.178 -
   1.179 -        # Read the document count.
   1.180 -
   1.181 -        count = self.read_number()
   1.182 -
   1.183 -        return self.last_docnum, self.last_pos_offset, count
   1.184 -
   1.185 -# Iterators for position-related files.
   1.186 -
   1.187 -class IteratorBase:
   1.188 -
   1.189 -    "Support for iterating over results."
   1.190 -
   1.191 -    def __init__(self, reader):
   1.192 -
   1.193 -        "Initialise the iterator using the given 'reader'."
   1.194 -
   1.195 -        self.reader = reader
   1.196 -        self.replenish(0) # no iteration initially permitted
   1.197 -
   1.198 -    def replenish(self, count):
   1.199 -
   1.200 -        "Replenish the iterator with 'count' results."
   1.201 -
   1.202 -        self.count = count
   1.203 -        self.read_documents = 0
   1.204 -
   1.205 -    def __len__(self):
   1.206 -
   1.207 -        "Return the total number of results."
   1.208 -
   1.209 -        return self.count
   1.210 -
   1.211 -    def sort(self):
   1.212 -        pass # Stored document positions are already sorted.
   1.213 -
   1.214 -    def __iter__(self):
   1.215 -        return self
   1.216 -
   1.217 -class PositionIterator(IteratorBase):
   1.218 -
   1.219 -    "Iterating over document positions."
   1.220 -
   1.221 -    def replenish(self, count):
   1.222 -        IteratorBase.replenish(self, count)
   1.223 -
   1.224 -        # Fill a cache of positions.
   1.225 -
   1.226 -        self.cache = []
   1.227 -        n = 0
   1.228 -
   1.229 -        while n < self.count:
   1.230 -            self.cache.append(self.reader.read_positions())
   1.231 -            n += 1
   1.232 -
   1.233 -    def seek(self, offset, count):
   1.234 -
   1.235 -        """
   1.236 -        Seek to 'offset' in the file, limiting the number of documents available
   1.237 -        for reading to 'count'.
   1.238 -        """
   1.239 -
   1.240 -        self.reader.seek(offset)
   1.241 -        self.replenish(count)
   1.242 -
   1.243 -    def next(self):
   1.244 -
   1.245 -        "Read positions for a single document."
   1.246 -
   1.247 -        if self.read_documents < self.count:
   1.248 -            positions = self.cache[self.read_documents]
   1.249 -            self.read_documents += 1
   1.250 -            return positions
   1.251 -        else:
   1.252 -            raise StopIteration
   1.253 -
   1.254 -class PositionIndexIterator(IteratorBase):
   1.255 -
   1.256 -    "Iterating over document positions."
   1.257 -
   1.258 -    def replenish(self, count):
   1.259 -        IteratorBase.replenish(self, count)
   1.260 -
   1.261 -        # Fill a cache of offsets.
   1.262 -
   1.263 -        self.cache = []
   1.264 -        self.current = 0
   1.265 -        n = 0
   1.266 -
   1.267 -        while n < self.count:
   1.268 -            docnum, pos_offset, section_count = t = self.reader.read_positions()
   1.269 -            self.cache.append(t)
   1.270 -            n += section_count
   1.271 -
   1.272 -    def seek(self, offset, doc_frequency):
   1.273 -
   1.274 -        """
   1.275 -        Seek to 'offset' in the file, limiting the number of documents available
   1.276 -        for reading to 'doc_frequency'.
   1.277 -        """
   1.278 -
   1.279 -        self.reader.seek(offset)
   1.280 -        self.replenish(doc_frequency)
   1.281 -
   1.282 -    def next(self):
   1.283 -
   1.284 -        "Read positions for a single document."
   1.285 -
   1.286 -        if self.current < len(self.cache):
   1.287 -            docnum, pos_offset, self.section_count = t = self.cache[self.current]
   1.288 -            self.current += 1
   1.289 -            return t
   1.290 -        else:
   1.291 -            raise StopIteration
   1.292 -
   1.293 -class PositionDictionaryWriter:
   1.294 -
   1.295 -    "Writing position dictionaries."
   1.296 -
   1.297 -    def __init__(self, position_writer, position_index_writer, interval):
   1.298 -        self.position_writer = position_writer
   1.299 -        self.position_index_writer = position_index_writer
   1.300 -        self.interval = interval
   1.301 -
   1.302 -    def write_term_positions(self, doc_positions):
   1.303 -
   1.304 -        """
   1.305 -        Write all 'doc_positions' - a collection of tuples of the form (document
   1.306 -        number, position list) - to the file.
   1.307 -
   1.308 -        Add some records to the index, making dictionary entries.
   1.309 -
   1.310 -        Return a tuple containing the offset of the written data, the frequency
   1.311 -        (number of positions), and document frequency (number of documents) for
   1.312 -        the term involved.
   1.313 -        """
   1.314 -
   1.315 -        # Write the positions.
   1.316 -
   1.317 -        frequency = 0
   1.318 -        count = 0
   1.319 -
   1.320 -        if doc_positions:
   1.321 -            doc_positions.sort()
   1.322 -
   1.323 -            # Look ahead at the first document record.
   1.324 -            # NOTE: Any iterator would need to support this.
   1.325 -
   1.326 -            first_docnum, first_positions = doc_positions[0]
   1.327 -            first_position = first_positions[0]
   1.328 -
   1.329 -            # Write out size details.
   1.330 -
   1.331 -            docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)
   1.332 -            self.position_writer.begin(docnum_size, position_size)
   1.333 -            self.position_index_writer.begin(docnum_size)
   1.334 -
   1.335 -            # Reset the writers.
   1.336 -
   1.337 -            self.position_writer.reset()
   1.338 -            self.position_index_writer.reset()
   1.339 -
   1.340 -            # Remember the first index entry offset.
   1.341 -
   1.342 -            index_offset = self.position_index_writer.tell()
   1.343 -
   1.344 -            # Retain the first record offset for a subsequent index entry.
   1.345 -
   1.346 -            first_offset = self.position_writer.tell()
   1.347 -
   1.348 -            for docnum, positions in doc_positions:
   1.349 -                if first_docnum is None:
   1.350 -                    first_docnum = docnum
   1.351 -
   1.352 -                self.position_writer.write_positions(docnum, positions)
   1.353 -
   1.354 -                frequency += len(positions)
   1.355 -                count += 1
   1.356 -
   1.357 -                # Every {interval} entries, write an index entry.
   1.358 -
   1.359 -                if count % self.interval == 0:
   1.360 -
   1.361 -                    self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
   1.362 -
   1.363 -                    # Reset the position writer so that position readers accessing
   1.364 -                    # a section start with the correct document number.
   1.365 -
   1.366 -                    self.position_writer.reset()
   1.367 -
   1.368 -                    first_offset = self.position_writer.tell()
   1.369 -                    first_docnum = None
   1.370 -
   1.371 -            # Finish writing an index entry for the remaining documents.
   1.372 -
   1.373 -            else:
   1.374 -                if first_docnum is not None:
   1.375 -                    self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
   1.376 -
   1.377 -        return index_offset, frequency, count
   1.378 -
   1.379 -    def close(self):
   1.380 -        self.position_writer.close()
   1.381 -        self.position_index_writer.close()
   1.382 -
   1.383 -class PositionDictionaryReader:
   1.384 -
   1.385 -    "Access to position dictionary entries through iterators."
   1.386 -
   1.387 -    def __init__(self, position_reader, position_index_reader):
   1.388 -        self.position_reader = position_reader
   1.389 -        self.position_index_reader = position_index_reader
   1.390 -
   1.391 -    def read_term_positions(self, offset, doc_frequency):
   1.392 -        iterator = PositionDictionaryIterator(
   1.393 -            PositionIterator(self.position_reader),
   1.394 -            PositionIndexIterator(self.position_index_reader)
   1.395 -            )
   1.396 -        iterator.seek(offset, doc_frequency)
   1.397 -        return iterator
   1.398 -
   1.399 -    def close(self):
   1.400 -        self.position_reader.close()
   1.401 -        self.position_index_reader.close()
   1.402 -
   1.403 -class PositionDictionaryIterator:
   1.404 -
   1.405 -    "Iteration over position dictionary entries."
   1.406 -
   1.407 -    def __init__(self, position_iterator, position_index_iterator):
   1.408 -        self.position_iterator = position_iterator
   1.409 -        self.position_index_iterator = position_index_iterator
   1.410 -        self.reset()
   1.411 -
   1.412 -    def reset(self):
   1.413 -
   1.414 -        # Remember the last values.
   1.415 -
   1.416 -        self.found_docnum, self.found_positions = None, None
   1.417 -
   1.418 -        # Maintain state for the next index entry, if read.
   1.419 -
   1.420 -        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
   1.421 -
   1.422 -    def seek(self, offset, doc_frequency):
   1.423 -
   1.424 -        """
   1.425 -        Seek to 'offset' in the index file, limiting the number of documents
   1.426 -        available for reading to 'doc_frequency'.
   1.427 -        """
   1.428 -
   1.429 -        self.reset()
   1.430 -
   1.431 -        # Seek to the appropriate index entry.
   1.432 -
   1.433 -        self.position_index_iterator.seek(offset, doc_frequency)
   1.434 -
   1.435 -        # Initialise the current index entry and current position file iterator.
   1.436 -
   1.437 -        self._next_section()
   1.438 -        self._init_section()
   1.439 -
   1.440 -    # Sequence methods.
   1.441 -
   1.442 -    def __len__(self):
   1.443 -        return len(self.position_index_iterator)
   1.444 -
   1.445 -    def sort(self):
   1.446 -        pass
   1.447 -
   1.448 -    # Iterator methods.
   1.449 -
   1.450 -    def __iter__(self):
   1.451 -        return self
   1.452 -
   1.453 -    def next(self):
   1.454 -
   1.455 -        """
   1.456 -        Attempt to get the next document record from the section in the
   1.457 -        positions file.
   1.458 -        """
   1.459 -
   1.460 -        # Return any visited but unrequested record.
   1.461 -
   1.462 -        if self.found_docnum is not None:
   1.463 -            t = self.found_docnum, self.found_positions
   1.464 -            self.found_docnum, self.found_positions = None, None
   1.465 -            return t
   1.466 -
   1.467 -        # Or search for the next record.
   1.468 -
   1.469 -        while 1:
   1.470 -
   1.471 -            # Either return the next record.
   1.472 -
   1.473 -            try:
   1.474 -                return self.position_iterator.next()
   1.475 -
   1.476 -            # Or, where a section is finished, get the next section and try again.
   1.477 -
   1.478 -            except StopIteration:
   1.479 -
   1.480 -                # Although, where a single iterator is in use, the file reader
   1.481 -                # would be positioned appropriately, this is not guaranteed in a
   1.482 -                # multiple iterator situation.
   1.483 -
   1.484 -                self._next_section()
   1.485 -                self._init_section()
   1.486 -
   1.487 -    def from_document(self, docnum):
   1.488 -
   1.489 -        """
   1.490 -        Attempt to navigate to a positions entry for the given 'docnum',
   1.491 -        returning the positions for 'docnum', or None otherwise.
   1.492 -        """
   1.493 -
   1.494 -        # Return any unrequested document positions.
   1.495 -
   1.496 -        if docnum == self.found_docnum:
   1.497 -            return self.found_positions
   1.498 -
   1.499 -        # Read ahead in the index until the next entry refers to a document
   1.500 -        # later than the desired document.
   1.501 -
   1.502 -        try:
   1.503 -            if self.next_docnum is None:
   1.504 -                self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
   1.505 -
   1.506 -            # Read until the next entry is after the desired document number,
   1.507 -            # or until the end of the results.
   1.508 -
   1.509 -            while self.next_docnum <= docnum:
   1.510 -                self._next_read_section()
   1.511 -                if self.docnum < docnum:
   1.512 -                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()
   1.513 -                else:
   1.514 -                    break
   1.515 -
   1.516 -        except StopIteration:
   1.517 -            pass
   1.518 -
   1.519 -        # Navigate in the position file to the document.
   1.520 -
   1.521 -        self._init_section()
   1.522 -
   1.523 -        try:
   1.524 -            while 1:
   1.525 -                found_docnum, found_positions = self.position_iterator.next()
   1.526 -
   1.527 -                # Return the desired document positions or None (retaining the
   1.528 -                # positions for the document immediately after).
   1.529 -
   1.530 -                if docnum <= found_docnum:
   1.531 -                    self.found_docnum, self.found_positions = found_docnum, found_positions
   1.532 -                    if docnum == found_docnum:
   1.533 -                        return found_positions
   1.534 -                    elif docnum < found_docnum:
   1.535 -                        return None
   1.536 -
   1.537 -        except StopIteration:
   1.538 -            return None
   1.539 -
   1.540 -    # Internal methods.
   1.541 -
   1.542 -    def _next_section(self):
   1.543 -
   1.544 -        "Attempt to get the next section in the index."
   1.545 -
   1.546 -        if self.next_docnum is None:
   1.547 -            self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()
   1.548 -        else:
   1.549 -            self._next_read_section()
   1.550 -
   1.551 -    def _next_read_section(self):
   1.552 -
   1.553 -        """
   1.554 -        Make the next index entry the current one without reading from the
   1.555 -        index.
   1.556 -        """
   1.557 -
   1.558 -        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
   1.559 -        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
   1.560 -
   1.561 -    def _init_section(self):
   1.562 -
   1.563 -        "Initialise the iterator for the section in the position file."
   1.564 -
   1.565 -        # Seek to the position entry.
   1.566 -
   1.567 -        self.position_iterator.seek(self.pos_offset, self.section_count)
   1.568 -
   1.569 -# vim: tabstop=4 expandtab shiftwidth=4