1.1 --- a/iixr.py Tue Sep 15 00:15:11 2009 +0200
1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
1.3 @@ -1,1876 +0,0 @@
1.4 -#!/usr/bin/env python
1.5 -
1.6 -"""
1.7 -A simple (and sane) text indexing library.
1.8 -
1.9 -Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 -
1.11 -This program is free software; you can redistribute it and/or modify it under
1.12 -the terms of the GNU General Public License as published by the Free Software
1.13 -Foundation; either version 3 of the License, or (at your option) any later
1.14 -version.
1.15 -
1.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 -
1.20 -You should have received a copy of the GNU General Public License along
1.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 -"""
1.23 -
1.24 -from os import dup, fdopen # independent iterator access to files
1.25 -from os import listdir, mkdir # index and partition discovery
1.26 -from os import remove, rename # partition manipulation
1.27 -from os.path import exists, join
1.28 -from os.path import commonprefix # to find common string prefixes
1.29 -from bisect import bisect_right # to find terms in the dictionary index
1.30 -import bz2, zlib # for field compression
1.31 -from itermerge import itermerge
1.32 -
1.33 -try:
1.34 - set
1.35 -except NameError:
1.36 - from sets import Set as set
1.37 -
1.38 -# Constants.
1.39 -
1.40 -TERM_INTERVAL = 100
1.41 -DOCUMENT_INTERVAL = 100
1.42 -FIELD_INTERVAL = 100
1.43 -FLUSH_INTERVAL = 10000
1.44 -
1.45 -WRITE_CACHE_SIZE = 100000
1.46 -READ_CACHE_SIZE = 10000
1.47 -READ_CACHE_RESIZE = 5000
1.48 -
1.49 -TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
1.50 -FIELD_FILENAMES = "fields", "fields_index"
1.51 -
1.52 -compressors = [("b", bz2.compress), ("z", zlib.compress)]
1.53 -decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
1.54 -
1.55 -# Utility functions.
1.56 -
1.57 -try:
1.58 - from vint import vint as _vint
1.59 -
1.60 - def vint(number):
1.61 -
1.62 - "Write 'number' as a variable-length integer."
1.63 -
1.64 - if number >= 0:
1.65 - return _vint(number)
1.66 - else:
1.67 - raise ValueError, "Number %r is negative." % number
1.68 -
1.69 -except ImportError:
1.70 -
1.71 - def vint(number):
1.72 -
1.73 - "Write 'number' as a variable-length integer."
1.74 -
1.75 - if number >= 0:
1.76 -
1.77 - # Special case: one byte containing a 7-bit number.
1.78 -
1.79 - if number < 128:
1.80 - return chr(number)
1.81 -
1.82 - # Write the number from least to most significant digits.
1.83 -
1.84 - bytes = []
1.85 -
1.86 - while number != 0:
1.87 - lsd = number & 127
1.88 - number = number >> 7
1.89 - if number != 0:
1.90 - lsd |= 128
1.91 - bytes.append(chr(lsd))
1.92 -
1.93 - return "".join(bytes)
1.94 -
1.95 - # Negative numbers are not supported.
1.96 -
1.97 - else:
1.98 - raise ValueError, "Number %r is negative." % number
1.99 -
1.100 -# Foundation classes.
1.101 -
1.102 -class File:
1.103 -
1.104 - "A basic file abstraction."
1.105 -
1.106 - def __init__(self, f):
1.107 - self.f = f
1.108 - self.reset()
1.109 -
1.110 - def reset(self):
1.111 -
1.112 - "To be used to reset the state of the reader or writer between records."
1.113 -
1.114 - pass
1.115 -
1.116 - def rewind(self):
1.117 - self.seek(0)
1.118 - self.reset()
1.119 -
1.120 - def seek(self, offset):
1.121 -
1.122 - "To be defined by readers."
1.123 -
1.124 - pass
1.125 -
1.126 - def flush(self):
1.127 -
1.128 - "To be defined by writers."
1.129 -
1.130 - pass
1.131 -
1.132 - def close(self):
1.133 - if self.f is not None:
1.134 - self.flush()
1.135 - self.f.close()
1.136 - self.f = None
1.137 -
1.138 -class FileWriter(File):
1.139 -
1.140 - "Writing basic data types to files."
1.141 -
1.142 - def __init__(self, f):
1.143 - File.__init__(self, f)
1.144 - self.cache = []
1.145 - self.cache_length = 0
1.146 -
1.147 - def write_number(self, number):
1.148 -
1.149 - "Write 'number' to the file using a variable length encoding."
1.150 -
1.151 - self.write(vint(number))
1.152 -
1.153 - def write_string(self, s, compress=0):
1.154 -
1.155 - """
1.156 - Write 's' to the file, recording its length and compressing the string
1.157 - if 'compress' is set to a true value.
1.158 - """
1.159 -
1.160 - # Convert Unicode objects to strings.
1.161 -
1.162 - if isinstance(s, unicode):
1.163 - s = s.encode("utf-8")
1.164 -
1.165 - # Compress the string if requested.
1.166 -
1.167 - if compress:
1.168 - for flag, fn in compressors:
1.169 - cs = fn(s)
1.170 -
1.171 - # Take the first string shorter than the original.
1.172 -
1.173 - if len(cs) < len(s):
1.174 - s = cs
1.175 - break
1.176 - else:
1.177 - flag = "-"
1.178 -
1.179 - else:
1.180 - flag = ""
1.181 -
1.182 - # Write the length of the data before the data itself.
1.183 -
1.184 - length = len(s)
1.185 - self.write(flag + vint(length) + s)
1.186 -
1.187 - # Cache-affected methods.
1.188 -
1.189 - def write(self, s):
1.190 - self.cache.append(s)
1.191 - self.cache_length += len(s)
1.192 - if self.cache_length >= WRITE_CACHE_SIZE:
1.193 - self.flush()
1.194 -
1.195 - def tell(self):
1.196 - return self.f.tell() + self.cache_length
1.197 -
1.198 - def flush(self):
1.199 - self.f.write("".join(self.cache))
1.200 - self.cache = []
1.201 - self.cache_length = 0
1.202 -
1.203 -class FileReader(File):
1.204 -
1.205 - "Reading basic data types from files."
1.206 -
1.207 - def __init__(self, f):
1.208 - File.__init__(self, f)
1.209 - self.reset_cache()
1.210 -
1.211 - def reset_cache(self):
1.212 - self.cache = ""
1.213 - self.cache_length = 0
1.214 - self.cache_start = 0
1.215 -
1.216 - def read_number(self):
1.217 -
1.218 - "Read a number from the file."
1.219 -
1.220 - # Read each byte, adding it to the number.
1.221 -
1.222 - shift = 0
1.223 - number = 0
1.224 - read = self.read
1.225 -
1.226 - try:
1.227 - csd = ord(read(1))
1.228 - while csd & 128:
1.229 - number += ((csd & 127) << shift)
1.230 - shift += 7
1.231 - csd = ord(read(1))
1.232 - else:
1.233 - number += (csd << shift)
1.234 - except TypeError:
1.235 - raise EOFError
1.236 -
1.237 - return number
1.238 -
1.239 - def read_string(self, decompress=0):
1.240 -
1.241 - """
1.242 - Read a string from the file, decompressing the stored data if
1.243 - 'decompress' is set to a true value.
1.244 - """
1.245 -
1.246 - # Decompress the data if requested.
1.247 -
1.248 - if decompress:
1.249 - flag = self.read(1)
1.250 - else:
1.251 - flag = "-"
1.252 -
1.253 - length = self.read_number()
1.254 - s = self.read(length)
1.255 -
1.256 - # Perform decompression if applicable.
1.257 -
1.258 - if flag != "-":
1.259 - fn = decompressors[flag]
1.260 - s = fn(s)
1.261 -
1.262 - # Convert strings to Unicode objects.
1.263 -
1.264 - return unicode(s, "utf-8")
1.265 -
1.266 - # Cache-affected methods.
1.267 -
1.268 - def read(self, n):
1.269 - needed = n - (self.cache_length - self.cache_start)
1.270 -
1.271 - # Read the needed number of characters, if possible.
1.272 -
1.273 - if needed > 0:
1.274 - s = self.f.read(max(needed, READ_CACHE_SIZE))
1.275 - self.cache += s
1.276 - self.cache_length += len(s)
1.277 -
1.278 - # Get the end of the requested block.
1.279 -
1.280 - next_start = self.cache_start + n
1.281 - s = self.cache[self.cache_start:next_start]
1.282 -
1.283 - # Reposition the pointer to the cache.
1.284 -
1.285 - self._seek_cache(len(s))
1.286 - return s
1.287 -
1.288 - def tell(self):
1.289 - return self.f.tell() - self.cache_length + self.cache_start
1.290 -
1.291 - def seek(self, offset):
1.292 - current = self.tell()
1.293 - self.f.seek(offset)
1.294 -
1.295 - # If seeking forward, attempt to navigate the cache.
1.296 -
1.297 - if offset >= current:
1.298 - self._seek_cache(offset - current)
1.299 - else:
1.300 - self.reset_cache()
1.301 -
1.302 - def _seek_cache(self, delta):
1.303 - next_start = self.cache_start + delta
1.304 -
1.305 - if next_start > 0 and next_start >= len(self.cache):
1.306 - self.reset_cache()
1.307 -
1.308 - # If the cache is too big, resize it.
1.309 -
1.310 - elif next_start > READ_CACHE_RESIZE:
1.311 - self.cache = self.cache[next_start:]
1.312 - self.cache_length = len(self.cache)
1.313 - self.cache_start = 0
1.314 -
1.315 - # Otherwise, just reference the next part of the cache.
1.316 -
1.317 - else:
1.318 - self.cache_start = next_start
1.319 -
1.320 -class FileOpener:
1.321 -
1.322 - "Opening files using their filenames."
1.323 -
1.324 - def __init__(self, filename):
1.325 - self.filename = filename
1.326 -
1.327 - def open(self, mode):
1.328 - return open(self.filename, mode)
1.329 -
1.330 - def close(self):
1.331 - pass
1.332 -
1.333 -# Specific classes for storing term and position information.
1.334 -
1.335 -class PositionWriter(FileWriter):
1.336 -
1.337 - "Writing position information to files."
1.338 -
1.339 - def reset(self):
1.340 - self.last_docnum = 0
1.341 -
1.342 - def write_positions(self, docnum, positions):
1.343 -
1.344 - """
1.345 - Write for the document 'docnum' the given 'positions'.
1.346 - Return the offset of the written record.
1.347 - """
1.348 -
1.349 - if docnum < self.last_docnum:
1.350 - raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
1.351 -
1.352 - # Record the offset of this record.
1.353 -
1.354 - offset = self.tell()
1.355 -
1.356 - # Make sure that the positions are sorted.
1.357 -
1.358 - positions.sort()
1.359 -
1.360 - # Write the position deltas.
1.361 -
1.362 - output = []
1.363 - last = 0
1.364 -
1.365 - for position in positions:
1.366 - output.append(vint(position - last))
1.367 - last = position
1.368 -
1.369 - # Write the document number delta.
1.370 - # Write the number of positions.
1.371 - # Then write the positions.
1.372 -
1.373 - self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))
1.374 -
1.375 - self.last_docnum = docnum
1.376 - return offset
1.377 -
1.378 -class PositionOpener(FileOpener):
1.379 -
1.380 - "Reading position information from files."
1.381 -
1.382 - def read_term_positions(self, offset, count):
1.383 -
1.384 - """
1.385 - Read all positions from 'offset', seeking to that position in the file
1.386 - before reading. The number of documents available for reading is limited
1.387 - to 'count'.
1.388 - """
1.389 -
1.390 - # Duplicate the file handle.
1.391 -
1.392 - f = self.open("rb")
1.393 - return PositionIterator(f, offset, count)
1.394 -
1.395 -class PositionIndexWriter(FileWriter):
1.396 -
1.397 - "Writing position index information to files."
1.398 -
1.399 - def reset(self):
1.400 - self.last_docnum = 0
1.401 - self.last_pos_offset = 0
1.402 -
1.403 - def write_positions(self, docnum, pos_offset, count):
1.404 -
1.405 - """
1.406 - Write the given 'docnum, 'pos_offset' and document 'count' to the
1.407 - position index file.
1.408 - """
1.409 -
1.410 - # Record the offset of this record.
1.411 -
1.412 - offset = self.tell()
1.413 - output = []
1.414 -
1.415 - # Write the document number delta.
1.416 -
1.417 - output.append(vint(docnum - self.last_docnum))
1.418 - self.last_docnum = docnum
1.419 -
1.420 - # Write the position file offset delta.
1.421 -
1.422 - output.append(vint(pos_offset - self.last_pos_offset))
1.423 - self.last_pos_offset = pos_offset
1.424 -
1.425 - # Write the document count.
1.426 -
1.427 - output.append(vint(count))
1.428 -
1.429 - # Actually write the data.
1.430 -
1.431 - self.write("".join(output))
1.432 -
1.433 - return offset
1.434 -
1.435 -class PositionIndexOpener(FileOpener):
1.436 -
1.437 - "Reading position index information from files."
1.438 -
1.439 - def read_term_positions(self, offset, doc_frequency):
1.440 -
1.441 - """
1.442 - Read all positions from 'offset', seeking to that position in the file
1.443 - before reading. The number of documents available for reading is limited
1.444 - to 'doc_frequency'.
1.445 - """
1.446 -
1.447 - # Duplicate the file handle.
1.448 -
1.449 - f = self.open("rb")
1.450 - return PositionIndexIterator(f, offset, doc_frequency)
1.451 -
1.452 -# Iterators for position-related files.
1.453 -
1.454 -class IteratorBase:
1.455 -
1.456 - def __init__(self, count):
1.457 - self.replenish(count)
1.458 -
1.459 - def replenish(self, count):
1.460 - self.count = count
1.461 - self.read_documents = 0
1.462 -
1.463 - def __len__(self):
1.464 - return self.count
1.465 -
1.466 - def sort(self):
1.467 - pass # Stored document positions are already sorted.
1.468 -
1.469 - def __iter__(self):
1.470 - return self
1.471 -
1.472 -class PositionIterator(FileReader, IteratorBase):
1.473 -
1.474 - "Iterating over document positions."
1.475 -
1.476 - def __init__(self, f, offset, count):
1.477 - FileReader.__init__(self, f)
1.478 - IteratorBase.__init__(self, count)
1.479 - self.seek(offset)
1.480 -
1.481 - def reset(self):
1.482 - self.last_docnum = 0
1.483 -
1.484 - def read_positions(self):
1.485 -
1.486 - "Read positions, returning a document number and a list of positions."
1.487 -
1.488 - # Read the document number delta and add it to the last number.
1.489 -
1.490 - self.last_docnum += self.read_number()
1.491 -
1.492 - # Read the number of positions.
1.493 -
1.494 - npositions = self.read_number()
1.495 -
1.496 - # Read the position deltas, adding each previous position to get the
1.497 - # appropriate collection of absolute positions.
1.498 -
1.499 - i = 0
1.500 - last = 0
1.501 - positions = []
1.502 -
1.503 - while i < npositions:
1.504 - last += self.read_number()
1.505 - positions.append(last)
1.506 - i += 1
1.507 -
1.508 - return self.last_docnum, positions
1.509 -
1.510 - def next(self):
1.511 -
1.512 - "Read positions for a single document."
1.513 -
1.514 - if self.read_documents < self.count:
1.515 - self.read_documents += 1
1.516 - return self.read_positions()
1.517 - else:
1.518 - raise StopIteration
1.519 -
1.520 -class PositionIndexIterator(FileReader, IteratorBase):
1.521 -
1.522 - "Iterating over document positions."
1.523 -
1.524 - def __init__(self, f, offset, count):
1.525 - FileReader.__init__(self, f)
1.526 - IteratorBase.__init__(self, count)
1.527 - self.seek(offset)
1.528 - self.section_count = 0
1.529 -
1.530 - def reset(self):
1.531 - self.last_docnum = 0
1.532 - self.last_pos_offset = 0
1.533 -
1.534 - def read_positions(self):
1.535 -
1.536 - """
1.537 - Read a document number, a position file offset for the position index
1.538 - file, and the number of documents in a section of that file.
1.539 - """
1.540 -
1.541 - # Read the document number delta.
1.542 -
1.543 - self.last_docnum += self.read_number()
1.544 -
1.545 - # Read the offset delta.
1.546 -
1.547 - self.last_pos_offset += self.read_number()
1.548 -
1.549 - # Read the document count.
1.550 -
1.551 - count = self.read_number()
1.552 -
1.553 - return self.last_docnum, self.last_pos_offset, count
1.554 -
1.555 - def next(self):
1.556 -
1.557 - "Read positions for a single document."
1.558 -
1.559 - self.read_documents += self.section_count
1.560 - if self.read_documents < self.count:
1.561 - docnum, pos_offset, self.section_count = t = self.read_positions()
1.562 - return t
1.563 - else:
1.564 - raise StopIteration
1.565 -
1.566 -class PositionDictionaryWriter:
1.567 -
1.568 - "Writing position dictionaries."
1.569 -
1.570 - def __init__(self, position_writer, position_index_writer, interval):
1.571 - self.position_writer = position_writer
1.572 - self.position_index_writer = position_index_writer
1.573 - self.interval = interval
1.574 -
1.575 - def write_term_positions(self, doc_positions):
1.576 -
1.577 - """
1.578 - Write all 'doc_positions' - a collection of tuples of the form (document
1.579 - number, position list) - to the file.
1.580 -
1.581 - Add some records to the index, making dictionary entries.
1.582 -
1.583 - Return a tuple containing the offset of the written data, the frequency
1.584 - (number of positions), and document frequency (number of documents) for
1.585 - the term involved.
1.586 - """
1.587 -
1.588 - # Reset the writers.
1.589 -
1.590 - self.position_writer.reset()
1.591 - self.position_index_writer.reset()
1.592 -
1.593 - index_offset = None
1.594 -
1.595 - # Write the positions.
1.596 -
1.597 - frequency = 0
1.598 - first_docnum = None
1.599 - first_offset = None
1.600 - count = 0
1.601 -
1.602 - doc_positions.sort()
1.603 -
1.604 - for docnum, positions in doc_positions:
1.605 - pos_offset = self.position_writer.write_positions(docnum, positions)
1.606 -
1.607 - # Retain the first record offset for a subsequent index entry.
1.608 -
1.609 - if first_offset is None:
1.610 - first_offset = pos_offset
1.611 - first_docnum = docnum
1.612 -
1.613 - frequency += len(positions)
1.614 - count += 1
1.615 -
1.616 - # Every {interval} entries, write an index entry.
1.617 -
1.618 - if count % self.interval == 0:
1.619 - io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
1.620 -
1.621 - # Remember the first index entry offset.
1.622 -
1.623 - if index_offset is None:
1.624 - index_offset = io
1.625 -
1.626 - first_offset = None
1.627 - first_docnum = None
1.628 -
1.629 - # Reset the position writer so that position readers accessing
1.630 - # a section start with the correct document number.
1.631 -
1.632 - self.position_writer.reset()
1.633 -
1.634 - # Finish writing an index entry for the remaining documents.
1.635 -
1.636 - else:
1.637 - if first_offset is not None:
1.638 - io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
1.639 -
1.640 - # Remember the first index entry offset.
1.641 -
1.642 - if index_offset is None:
1.643 - index_offset = io
1.644 -
1.645 - return index_offset, frequency, count
1.646 -
1.647 - def close(self):
1.648 - self.position_writer.close()
1.649 - self.position_index_writer.close()
1.650 -
1.651 -class PositionDictionaryReader:
1.652 -
1.653 - "Reading position dictionaries."
1.654 -
1.655 - def __init__(self, position_opener, position_index_opener):
1.656 - self.position_opener = position_opener
1.657 - self.position_index_opener = position_index_opener
1.658 -
1.659 - def read_term_positions(self, offset, doc_frequency):
1.660 -
1.661 - """
1.662 - Return an iterator for dictionary entries starting at 'offset' with the
1.663 - given 'doc_frequency'.
1.664 - """
1.665 -
1.666 - return PositionDictionaryIterator(self.position_opener,
1.667 - self.position_index_opener, offset, doc_frequency)
1.668 -
1.669 - def close(self):
1.670 - pass
1.671 -
1.672 -class PositionDictionaryIterator:
1.673 -
1.674 - "Iteration over position dictionary entries."
1.675 -
1.676 - def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
1.677 - self.position_opener = position_opener
1.678 - self.doc_frequency = doc_frequency
1.679 - self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
1.680 - self.iterator = None
1.681 -
1.682 - # Remember the last values.
1.683 -
1.684 - self.found_docnum, self.found_positions = None, None
1.685 -
1.686 - # Maintain state for the next index entry, if read.
1.687 -
1.688 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.689 -
1.690 - # Initialise the current index entry and current position file iterator.
1.691 -
1.692 - self._next_section()
1.693 - self._init_section()
1.694 -
1.695 - # Sequence methods.
1.696 -
1.697 - def __len__(self):
1.698 - return self.doc_frequency
1.699 -
1.700 - def sort(self):
1.701 - pass
1.702 -
1.703 - # Iterator methods.
1.704 -
1.705 - def __iter__(self):
1.706 - return self
1.707 -
1.708 - def next(self):
1.709 -
1.710 - """
1.711 - Attempt to get the next document record from the section in the
1.712 - positions file.
1.713 - """
1.714 -
1.715 - # Return any visited but unrequested record.
1.716 -
1.717 - if self.found_docnum is not None:
1.718 - t = self.found_docnum, self.found_positions
1.719 - self.found_docnum, self.found_positions = None, None
1.720 - return t
1.721 -
1.722 - # Or search for the next record.
1.723 -
1.724 - while 1:
1.725 -
1.726 - # Either return the next record.
1.727 -
1.728 - try:
1.729 - return self.iterator.next()
1.730 -
1.731 - # Or, where a section is finished, get the next section and try again.
1.732 -
1.733 - except StopIteration:
1.734 -
1.735 - # Where a section follows, update the index iterator, but keep
1.736 - # reading using the same file iterator (since the data should
1.737 - # just follow on from the last section).
1.738 -
1.739 - self._next_section()
1.740 - self.iterator.replenish(self.section_count)
1.741 -
1.742 - # Reset the state of the iterator to make sure that document
1.743 - # numbers are correct.
1.744 -
1.745 - self.iterator.reset()
1.746 -
1.747 - def from_document(self, docnum):
1.748 -
1.749 - """
1.750 - Attempt to navigate to a positions entry for the given 'docnum',
1.751 - returning the positions for 'docnum', or None otherwise.
1.752 - """
1.753 -
1.754 - # Return any unrequested document positions.
1.755 -
1.756 - if docnum == self.found_docnum:
1.757 - return self.found_positions
1.758 -
1.759 - # Read ahead in the index until the next entry refers to a document
1.760 - # later than the desired document.
1.761 -
1.762 - try:
1.763 - if self.next_docnum is None:
1.764 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.765 -
1.766 - # Read until the next entry is after the desired document number,
1.767 - # or until the end of the results.
1.768 -
1.769 - while self.next_docnum <= docnum:
1.770 - self._next_read_section()
1.771 - if self.docnum < docnum:
1.772 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.773 - else:
1.774 - break
1.775 -
1.776 - except StopIteration:
1.777 - pass
1.778 -
1.779 - # Navigate in the position file to the document.
1.780 -
1.781 - self._init_section()
1.782 -
1.783 - try:
1.784 - while 1:
1.785 - found_docnum, found_positions = self.iterator.next()
1.786 -
1.787 - # Return the desired document positions or None (retaining the
1.788 - # positions for the document immediately after).
1.789 -
1.790 - if docnum == found_docnum:
1.791 - return found_positions
1.792 - elif docnum < found_docnum:
1.793 - self.found_docnum, self.found_positions = found_docnum, found_positions
1.794 - return None
1.795 -
1.796 - except StopIteration:
1.797 - return None
1.798 -
1.799 - # Internal methods.
1.800 -
1.801 - def _next_section(self):
1.802 -
1.803 - "Attempt to get the next section in the index."
1.804 -
1.805 - if self.next_docnum is None:
1.806 - self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.807 - else:
1.808 - self._next_read_section()
1.809 -
1.810 - def _next_read_section(self):
1.811 -
1.812 - """
1.813 - Make the next index entry the current one without reading from the
1.814 - index.
1.815 - """
1.816 -
1.817 - self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
1.818 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.819 -
1.820 - def _init_section(self):
1.821 -
1.822 - "Initialise the iterator for the section in the position file."
1.823 -
1.824 - if self.iterator is not None:
1.825 - self.iterator.close()
1.826 - self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
1.827 -
1.828 - def close(self):
1.829 - if self.iterator is not None:
1.830 - self.iterator.close()
1.831 - self.iterator = None
1.832 - if self.index_iterator is not None:
1.833 - self.index_iterator.close()
1.834 - self.index_iterator = None
1.835 -
1.836 -class TermWriter(FileWriter):
1.837 -
1.838 - "Writing term information to files."
1.839 -
1.840 - def reset(self):
1.841 - self.last_term = ""
1.842 - self.last_offset = 0
1.843 -
1.844 - def write_term(self, term, offset, frequency, doc_frequency):
1.845 -
1.846 - """
1.847 - Write the given 'term', its position file 'offset', its 'frequency' and
1.848 - its 'doc_frequency' (number of documents in which it appears) to the
1.849 - term information file. Return the offset after the term information was
1.850 - written to the file.
1.851 - """
1.852 -
1.853 - # Write the prefix length and term suffix.
1.854 -
1.855 - common = len(commonprefix([self.last_term, term]))
1.856 - suffix = term[common:]
1.857 -
1.858 - self.write_number(common)
1.859 - self.write_string(suffix)
1.860 -
1.861 - # Write the offset delta.
1.862 -
1.863 - self.write_number(offset - self.last_offset)
1.864 -
1.865 - # Write the frequency.
1.866 -
1.867 - self.write_number(frequency)
1.868 -
1.869 - # Write the document frequency.
1.870 -
1.871 - self.write_number(doc_frequency)
1.872 -
1.873 - self.last_term = term
1.874 - self.last_offset = offset
1.875 -
1.876 - return self.tell()
1.877 -
1.878 -class TermReader(FileReader):
1.879 -
1.880 - "Reading term information from files."
1.881 -
1.882 - def reset(self):
1.883 - self.last_term = ""
1.884 - self.last_offset = 0
1.885 -
1.886 - def read_term(self):
1.887 -
1.888 - """
1.889 - Read a term, its position file offset, its frequency and its document
1.890 - frequency from the term information file.
1.891 - """
1.892 -
1.893 - # Read the prefix length and term suffix.
1.894 -
1.895 - common = self.read_number()
1.896 - suffix = self.read_string()
1.897 -
1.898 - self.last_term = self.last_term[:common] + suffix
1.899 -
1.900 - # Read the offset delta.
1.901 -
1.902 - self.last_offset += self.read_number()
1.903 -
1.904 - # Read the frequency.
1.905 -
1.906 - frequency = self.read_number()
1.907 -
1.908 - # Read the document frequency.
1.909 -
1.910 - doc_frequency = self.read_number()
1.911 -
1.912 - return self.last_term, self.last_offset, frequency, doc_frequency
1.913 -
1.914 - def go_to_term(self, term, offset, info_offset):
1.915 -
1.916 - """
1.917 - Seek past the entry for 'term' having 'offset' to 'info_offset'. This
1.918 - permits the scanning for later terms from the specified term.
1.919 - """
1.920 -
1.921 - self.seek(info_offset)
1.922 - self.last_term = term
1.923 - self.last_offset = offset
1.924 -
1.925 -class TermIndexWriter(TermWriter):
1.926 -
1.927 - "Writing term dictionary index details to files."
1.928 -
1.929 - def reset(self):
1.930 - TermWriter.reset(self)
1.931 - self.last_info_offset = 0
1.932 -
1.933 - def write_term(self, term, offset, frequency, doc_frequency, info_offset):
1.934 -
1.935 - """
1.936 - Write the given 'term', its position file 'offset', its 'frequency' and
1.937 - its 'doc_frequency' to the term dictionary index file, along with the
1.938 - 'info_offset' in the term information file.
1.939 - """
1.940 -
1.941 - TermWriter.write_term(self, term, offset, frequency, doc_frequency)
1.942 -
1.943 - # Write the information file offset delta.
1.944 -
1.945 - self.write_number(info_offset - self.last_info_offset)
1.946 - self.last_info_offset = info_offset
1.947 -
1.948 -class TermIndexReader(TermReader):
1.949 -
1.950 - "Reading term dictionary index details from files."
1.951 -
1.952 - def reset(self):
1.953 - TermReader.reset(self)
1.954 - self.last_info_offset = 0
1.955 -
1.956 - def read_term(self):
1.957 -
1.958 - """
1.959 - Read a term, its position file offset, its frequency, its document
1.960 - frequency and a term information file offset from the term dictionary
1.961 - index file.
1.962 - """
1.963 -
1.964 - term, offset, frequency, doc_frequency = TermReader.read_term(self)
1.965 -
1.966 - # Read the offset delta.
1.967 -
1.968 - self.last_info_offset += self.read_number()
1.969 -
1.970 - return term, offset, frequency, doc_frequency, self.last_info_offset
1.971 -
1.972 -class TermDictionaryWriter:
1.973 -
1.974 - "Writing term dictionaries."
1.975 -
1.976 - def __init__(self, info_writer, index_writer, position_dict_writer, interval):
1.977 - self.info_writer = info_writer
1.978 - self.index_writer = index_writer
1.979 - self.position_dict_writer = position_dict_writer
1.980 - self.interval = interval
1.981 - self.entry = 0
1.982 -
1.983 - def _write_term(self, term, offset, frequency, doc_frequency):
1.984 -
1.985 - """
1.986 - Write the given 'term', its position file 'offset', its 'frequency' and
1.987 - its 'doc_frequency' (number of documents in which it appears) to the
1.988 - term information file. Return the offset after the term information was
1.989 - written to the file.
1.990 - """
1.991 -
1.992 - info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
1.993 -
1.994 - if self.entry % self.interval == 0:
1.995 - self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
1.996 -
1.997 - self.entry += 1
1.998 -
1.999 - def write_term_positions(self, term, doc_positions):
1.1000 -
1.1001 - """
1.1002 - Write the given 'term' and the 'doc_positions' recording the documents
1.1003 - and positions at which the term is found.
1.1004 - """
1.1005 -
1.1006 - offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
1.1007 - self._write_term(term, offset, frequency, doc_frequency)
1.1008 -
1.1009 - def close(self):
1.1010 - self.info_writer.close()
1.1011 - self.index_writer.close()
1.1012 - self.position_dict_writer.close()
1.1013 -
1.1014 -class TermDictionaryReader:
1.1015 -
1.1016 - "Reading term dictionaries."
1.1017 -
1.1018 - def __init__(self, info_reader, index_reader, position_dict_reader):
1.1019 - self.info_reader = info_reader
1.1020 - self.index_reader = index_reader
1.1021 - self.position_dict_reader = position_dict_reader
1.1022 -
1.1023 - self.terms = []
1.1024 - try:
1.1025 - while 1:
1.1026 - self.terms.append(self.index_reader.read_term())
1.1027 - except EOFError:
1.1028 - pass
1.1029 -
1.1030 - # Large numbers for ordering purposes.
1.1031 -
1.1032 - if self.terms:
1.1033 - self.max_offset = self.terms[-1][1] + 1
1.1034 - else:
1.1035 - self.max_offset = None
1.1036 -
1.1037 - def _find_closest_entry(self, term):
1.1038 -
1.1039 - """
1.1040 - Find the offsets and frequencies of 'term' from the term dictionary or
1.1041 - the closest term starting with the value of 'term'.
1.1042 -
1.1043 - Return the closest index entry consisting of a term, the position file
1.1044 - offset, the term frequency, the document frequency, and the term details
1.1045 - file offset.
1.1046 - """
1.1047 -
1.1048 - i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
1.1049 -
1.1050 - # Get the entry position providing the term or one preceding it.
1.1051 - # If no entry precedes the requested term, return the very first entry
1.1052 - # as the closest.
1.1053 -
1.1054 - if i == -1:
1.1055 - return self.terms[0]
1.1056 - else:
1.1057 - return self.terms[i]
1.1058 -
1.1059 - def _find_closest_term(self, term):
1.1060 -
1.1061 - """
1.1062 - Find the offsets and frequencies of 'term' from the term dictionary or
1.1063 - the closest term starting with the value of 'term'.
1.1064 -
1.1065 - Return the closest term (or the term itself), the position file offset,
1.1066 - the term frequency, the document frequency, and the term details file
1.1067 - offset (or None if the reader is already positioned).
1.1068 - """
1.1069 -
1.1070 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
1.1071 -
1.1072 - # Where the term is found immediately, return the offset and
1.1073 - # frequencies. If the term does not appear, return the details of the
1.1074 - # closest entry.
1.1075 -
1.1076 - if term <= found_term:
1.1077 - return found_term, offset, frequency, doc_frequency, info_offset
1.1078 -
1.1079 - # Otherwise, seek past the index term's entry in the information file
1.1080 - # and scan for the desired term.
1.1081 -
1.1082 - else:
1.1083 - self.info_reader.go_to_term(found_term, offset, info_offset)
1.1084 - try:
1.1085 - while term > found_term:
1.1086 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.1087 - except EOFError:
1.1088 - pass
1.1089 -
1.1090 - return found_term, offset, frequency, doc_frequency, None
1.1091 -
1.1092 - def _find_term(self, term):
1.1093 -
1.1094 - """
1.1095 - Find the position file offset and frequency of 'term' from the term
1.1096 - dictionary.
1.1097 - """
1.1098 -
1.1099 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.1100 -
1.1101 - # If the term is found, return the offset and frequencies.
1.1102 -
1.1103 - if term == found_term:
1.1104 - return offset, frequency, doc_frequency
1.1105 - else:
1.1106 - return None
1.1107 -
1.1108 - def _get_positions(self, offset, doc_frequency):
1.1109 - return self.position_dict_reader.read_term_positions(offset, doc_frequency)
1.1110 -
1.1111 - # Iterator convenience methods.
1.1112 -
1.1113 - def __iter__(self):
1.1114 - self.rewind()
1.1115 - return self
1.1116 -
1.1117 - def next(self):
1.1118 - try:
1.1119 - return self.read_term()
1.1120 - except EOFError:
1.1121 - raise StopIteration
1.1122 -
1.1123 - # Sequential access methods.
1.1124 -
1.1125 - def rewind(self):
1.1126 - self.info_reader.rewind()
1.1127 -
1.1128 - def read_term(self):
1.1129 -
1.1130 - """
1.1131 - Return the next term, its frequency, its document frequency, and the
1.1132 - documents and positions at which the term is found.
1.1133 - """
1.1134 -
1.1135 - term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.1136 - positions = self._get_positions(offset, doc_frequency)
1.1137 - return term, frequency, doc_frequency, positions
1.1138 -
1.1139 - # Query methods.
1.1140 -
1.1141 - def find_terms(self, term):
1.1142 -
1.1143 - "Return all terms whose values start with the value of 'term'."
1.1144 -
1.1145 - terms = []
1.1146 -
1.1147 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.1148 -
1.1149 - # Position the reader, if necessary.
1.1150 -
1.1151 - if info_offset is not None:
1.1152 - self.info_reader.go_to_term(found_term, offset, info_offset)
1.1153 -
1.1154 - # Read and record terms.
1.1155 -
1.1156 - try:
1.1157 - # Add the found term if it starts with the specified term.
1.1158 -
1.1159 - while found_term.startswith(term):
1.1160 - terms.append(found_term)
1.1161 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.1162 -
1.1163 - except EOFError:
1.1164 - pass
1.1165 -
1.1166 - return terms
1.1167 -
1.1168 - def find_positions(self, term):
1.1169 -
1.1170 - "Return the documents and positions at which the given 'term' is found."
1.1171 -
1.1172 - t = self._find_term(term)
1.1173 - if t is None:
1.1174 - return None
1.1175 - else:
1.1176 - offset, frequency, doc_frequency = t
1.1177 - return self._get_positions(offset, doc_frequency)
1.1178 -
1.1179 - def get_frequency(self, term):
1.1180 -
1.1181 - "Return the frequency of the given 'term'."
1.1182 -
1.1183 - t = self._find_term(term)
1.1184 - if t is None:
1.1185 - return None
1.1186 - else:
1.1187 - offset, frequency, doc_frequency = t
1.1188 - return frequency
1.1189 -
1.1190 - def get_document_frequency(self, term):
1.1191 -
1.1192 - "Return the document frequency of the given 'term'."
1.1193 -
1.1194 - t = self._find_term(term)
1.1195 - if t is None:
1.1196 - return None
1.1197 - else:
1.1198 - offset, frequency, doc_frequency = t
1.1199 - return doc_frequency
1.1200 -
1.1201 - def close(self):
1.1202 - self.info_reader.close()
1.1203 - self.index_reader.close()
1.1204 - self.position_dict_reader.close()
1.1205 -
1.1206 -# Specific classes for storing document information.
1.1207 -
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):
        # Document numbers are stored as deltas from the previous record.
        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields': (integer, string)
        pairs representing field identifiers and values respectively.
        Return the offset at which the fields are stored.
        """

        offset = self.tell()

        # Record the document number as a delta, followed by the field count.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # Record each identifier along with its compressed value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return offset
1.1241 -
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):
        # Document numbers are stored as deltas from the previous record.
        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next field record, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Accumulate the document number delta, then obtain the field count.

        self.last_docnum += self.read_number()
        nfields = self.read_number()

        # Gather the identifier and decompressed value of each field.

        fields = []
        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning
        for later documents.
        """

        self.seek(offset)

        # The delta decoded after a seek is meaningless, so the decoded
        # document number is discarded in favour of the one supplied.

        _ignored, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields
1.1289 -
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):
        # Both document numbers and offsets are delta-encoded.
        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for
        the document are stored in the fields file.
        """

        # Delta-encode both values against the previous entry.

        self.write_number(docnum - self.last_docnum)
        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset
1.1312 -
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):
        # Both document numbers and offsets are delta-encoded.
        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        "Read and return a (document number, field file offset) pair."

        # Accumulate the stored deltas onto the running values.

        self.last_docnum += self.read_number()
        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset
1.1331 -
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the dictionary writer with a 'field_writer', a
        'field_index_writer' and an indexing 'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # Counter used to select every 'interval'-th entry for the index.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Index only every 'interval'-th document, starting with the first.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the underlying writers."

        self.field_writer.close()
        self.field_index_writer.close()
1.1356 -
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the dictionary reader with a 'field_reader' and a
        'field_index_reader', loading the entire field index into memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # The largest known offset: used in bisection probes so that an exact
        # document number match always sorts before the probe.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Reset the reader and return it as an iterator over field records."

        self.rewind()
        return self

    def next(self):

        "Return the next field record, raising StopIteration at the end."

        try:
            record = self.read_fields()
        except EOFError:
            raise StopIteration
        return record

    # Sequential access methods.

    def rewind(self):

        "Reposition the reader at the first document."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read and return the fields of the document with the given 'docnum',
        or None if the document is not present.
        """

        position = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # With no index entry at or before the document, it cannot be stored.

        if position == -1:
            return None

        found_docnum, offset = self.docs[position]

        # Start reading at the indexed document, then scan forward until the
        # requested document (or the end of the data) is reached.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # Only an exact match yields fields.

        if docnum == found_docnum:
            return fields
        return None

    def close(self):

        "Close the underlying readers."

        self.field_reader.close()
        self.field_index_reader.close()
1.1439 -
1.1440 -# Dictionary merging classes.
1.1441 -
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        "Initialise the merger with a 'writer' and a list of 'readers'."

        self.writer = writer
        self.readers = readers

    def close(self):

        "Close all readers, then the writer."

        for reader in self.readers:
            reader.close()
        self.writer.close()
1.1454 -
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the
        writer.
        """

        last_term = None
        pending = []

        # The readers yield term records in order, so identical terms from
        # different partitions are adjacent and can be combined into a single
        # output entry.

        for term, frequency, doc_frequency, positions in itermerge(self.readers):
            if term == last_term:
                pending.append(positions)
            else:
                self._flush(last_term, pending)
                last_term = term
                pending = [positions]

        # Emit the final term, if any records were seen at all.

        self._flush(last_term, pending)

    def _flush(self, term, pending):

        "Write the merged positions of 'term' if any are 'pending'."

        if pending:
            self.writer.write_term_positions(term, itermerge(pending))
1.1479 -
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        "Merge fields from the readers, sending them to the writer."

        # Records arrive ordered by document number and are passed through
        # unchanged.

        for docnum, fields in itermerge(self.readers):
            self.writer.write_fields(docnum, fields)
1.1492 -
1.1493 -# Utility functions.
1.1494 -
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.
    """

    info_writer = TermWriter(open(join(pathname, "terms-%s" % partition), "wb"))
    index_writer = TermIndexWriter(open(join(pathname, "terms_index-%s" % partition), "wb"))

    # Position records go through a dictionary writer combining the positions
    # file with its own index.

    positions_writer = PositionWriter(open(join(pathname, "positions-%s" % partition), "wb"))
    positions_index_writer = PositionIndexWriter(open(join(pathname, "positions_index-%s" % partition), "wb"))
    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
1.1518 -
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.
    """

    field_writer = FieldWriter(open(join(pathname, "fields-%s" % partition), "wb"))
    field_index_writer = FieldIndexWriter(open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
1.1534 -
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    info_reader = TermReader(open(join(pathname, "terms-%s" % partition), "rb"))
    index_reader = TermIndexReader(open(join(pathname, "terms_index-%s" % partition), "rb"))

    # Position data is accessed through openers constructed from pathnames
    # rather than already-open files.

    positions_opener = PositionOpener(join(pathname, "positions-%s" % partition))
    positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition))
    positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
1.1554 -
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    field_reader = FieldReader(open(join(pathname, "fields-%s" % partition), "rb"))
    field_index_reader = FieldIndexReader(open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
1.1569 -
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename the files under 'pathname' for each name in 'names', relabelling
    them from 'from_partition' to 'to_partition'.
    """

    for name in names:
        old_name = join(pathname, "%s-%s" % (name, from_partition))
        new_name = join(pathname, "%s-%s" % (name, to_partition))
        rename(old_name, new_name)
1.1573 -
def rename_term_files(pathname, from_partition, to_partition):

    "Relabel the term dictionary files from 'from_partition' to 'to_partition'."

    rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
1.1576 -
def rename_field_files(pathname, from_partition, to_partition):

    "Relabel the field dictionary files from 'from_partition' to 'to_partition'."

    rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
1.1579 -
def remove_files(pathname, names, partition):

    """
    Remove the files under 'pathname' labelled with the given 'partition' for
    each name in 'names'.
    """

    for name in names:
        remove(join(pathname, "%s-%s" % (name, partition)))
1.1583 -
def remove_term_files(pathname, partition):

    "Remove the term dictionary files labelled with the given 'partition'."

    remove_files(pathname, TERM_FILENAMES, partition)
1.1586 -
def remove_field_files(pathname, partition):

    "Remove the field dictionary files labelled with the given 'partition'."

    remove_files(pathname, FIELD_FILENAMES, partition)
1.1589 -
1.1590 -# High-level classes.
1.1591 -
class Document:

    "A container of document information."

    def __init__(self, docnum):

        "Initialise the document with the given 'docnum'."

        self.docnum = docnum
        self.fields = []    # list of (identifier, value) pairs
        self.terms = {}     # mapping from term to a list of positions

    def add_position(self, term, position):

        """
        Add a position entry for the given 'term', indicating the given
        'position'.
        """

        self.terms.setdefault(term, []).append(position)

    def add_field(self, identifier, value):

        "Add a field having the given 'identifier' and 'value'."

        # Convert the value to a text string. NOTE: unicode does not exist
        # under Python 3; fall back to str there, mirroring the set fallback
        # at the top of this module.

        try:
            text = unicode(value)
        except NameError:
            text = str(value)

        self.fields.append((identifier, text))

    def set_fields(self, fields):

        """
        Set the document's 'fields': a list of tuples each containing an
        integer identifier and a string value.
        """

        self.fields = fields
1.1624 -
class IndexWriter:

    """
    Building term information and writing it to the term and field
    dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer with an index 'pathname', an indexing 'interval'
        for terms, a 'doc_interval' for document position records, and a
        'flush_interval' giving the number of documents to accumulate before
        flushing (0 or None disables automatic flushing).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # Next partition number for each dictionary type.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # In-memory accumulators: term -> docnum -> positions, and
        # docnum -> fields.

        self.terms = {}
        self.docs = {}

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and
        flushing terms and fields if appropriate.
        """

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[doc.docnum] = positions

        self.docs[doc.docnum] = doc.fields

        self.doc_counter += 1
        if self.flush_interval and self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.flush_fields()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        # Write the terms in sorted order. NOTE: sorted() is used instead of
        # sorting a keys() list in place so that this also works under
        # Python 3, where keys() is a view without a sort method.

        dict_writer = self.get_term_writer()

        for term in sorted(self.terms):
            dict_writer.write_term_positions(term, self.terms[term].items())

        dict_writer.close()

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Write the documents in sorted order (see flush_terms regarding the
        # use of sorted()).

        field_dict_writer = self.get_field_writer()

        for docnum, fields in sorted(self.docs.items()):
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any remaining terms and fields."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
1.1720 -
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        "Open the merged dictionaries found under 'pathname'."

        self.dict_reader = get_term_reader(pathname, "merged")
        self.field_dict_reader = get_field_reader(pathname, "merged")

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        return self.dict_reader.find_terms(term)

    def find_positions(self, term):

        "Return the documents and positions at which 'term' is found."

        return self.dict_reader.find_positions(term)

    def get_frequency(self, term):

        "Return the frequency of the given 'term'."

        return self.dict_reader.get_frequency(term)

    def get_document_frequency(self, term):

        "Return the document frequency of the given 'term'."

        return self.dict_reader.get_document_frequency(term)

    def get_fields(self, docnum):

        "Return the fields of the document with the given 'docnum'."

        return self.field_dict_reader.get_fields(docnum)

    def close(self):

        "Close both dictionary readers."

        self.dict_reader.close()
        self.field_dict_reader.close()
1.1747 -
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index with the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        "Return a reader for the index."

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        "Return a reader for the index, raising OSError if the path is absent."

        if not exists(self.pathname):
            # NOTE: call syntax is used (not "raise OSError, ...") so that
            # this module stays syntactically valid under Python 3 too.
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        readers = []
        partitions = set()

        # Discover term dictionary partitions from their filenames.

        for filename in listdir(self.pathname):
            if filename.startswith("terms-"): # 6 character prefix
                partition = filename[6:]
                readers.append(get_term_reader(self.pathname, partition))
                partitions.add(partition)

        # With multiple partitions, merge them all into a new "merged"
        # partition, relabelling any previous merged partition so that it can
        # also participate in the merge.

        if len(readers) > 1:
            if "merged" in partitions:
                # NOTE(review): readers for "merged" were opened before this
                # rename; position files appear to be opened by pathname (see
                # get_term_reader), so confirm no late opens occur after the
                # rename.
                rename_term_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
            merger = TermDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove the obsolete partition files.

            for partition in partitions:
                remove_term_files(self.pathname, partition)

        # A single partition is merely relabelled.

        elif len(readers) == 1:
            partition = list(partitions)[0]
            if partition != "merged":
                rename_term_files(self.pathname, partition, "merged")

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        readers = []
        partitions = set()

        # Discover field dictionary partitions from their filenames.

        for filename in listdir(self.pathname):
            if filename.startswith("fields-"): # 7 character prefix
                partition = filename[7:]
                readers.append(get_field_reader(self.pathname, partition))
                partitions.add(partition)

        # With multiple partitions, merge them all into a new "merged"
        # partition, relabelling any previous merged partition so that it can
        # also participate in the merge.

        if len(readers) > 1:
            if "merged" in partitions:
                rename_field_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_field_writer(self.pathname, "merged", interval)
            merger = FieldDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove the obsolete partition files.

            for partition in partitions:
                remove_field_files(self.pathname, partition)

        # A single partition is merely relabelled.

        elif len(readers) == 1:
            partition = list(partitions)[0]
            if partition != "merged":
                rename_field_files(self.pathname, partition, "merged")

    def close(self):

        "Close any open reader and writer."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
1.1878 -
1.1879 -# vim: tabstop=4 expandtab shiftwidth=4