Made iixr a package with several submodules.

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/iixr/__init__.py	Tue Sep 15 00:15:11 2009 +0200
     1.3 @@ -0,0 +1,23 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +"""
     1.7 +A simple (and sane) text indexing library.
     1.8 +
     1.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    1.10 +
    1.11 +This program is free software; you can redistribute it and/or modify it under
    1.12 +the terms of the GNU General Public License as published by the Free Software
    1.13 +Foundation; either version 3 of the License, or (at your option) any later
    1.14 +version.
    1.15 +
    1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    1.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    1.19 +
    1.20 +You should have received a copy of the GNU General Public License along
    1.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    1.22 +"""
    1.23 +
    1.24 +from iixr.index import *
    1.25 +
    1.26 +# vim: tabstop=4 expandtab shiftwidth=4

     2.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     2.2 +++ b/iixr/data.py	Tue Sep 15 00:15:11 2009 +0200
     2.3 @@ -0,0 +1,64 @@
     2.4 +#!/usr/bin/env python
     2.5 +
     2.6 +"""
     2.7 +Variable-length integer functions.
     2.8 +
     2.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    2.10 +
    2.11 +This program is free software; you can redistribute it and/or modify it under
    2.12 +the terms of the GNU General Public License as published by the Free Software
    2.13 +Foundation; either version 3 of the License, or (at your option) any later
    2.14 +version.
    2.15 +
    2.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    2.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    2.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    2.19 +
    2.20 +You should have received a copy of the GNU General Public License along
    2.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    2.22 +"""
    2.23 +
    2.24 +try:
    2.25 +    from vint import vint as _vint
    2.26 +
    2.27 +    def vint(number):
    2.28 +
    2.29 +        "Return 'number' encoded as a variable-length integer."
    2.30 +
    2.31 +        if number >= 0:
    2.32 +            return _vint(number)
    2.33 +        else:
    2.34 +            raise ValueError, "Number %r is negative." % number
    2.35 +
    2.36 +except ImportError:
    2.37 +
    2.38 +    def vint(number):
    2.39 +
    2.40 +        "Return 'number' encoded as a variable-length integer."
    2.41 +
    2.42 +        if number >= 0:
    2.43 +
    2.44 +            # Special case: one byte containing a 7-bit number.
    2.45 +
    2.46 +            if number < 128:
    2.47 +                return chr(number)
    2.48 +
    2.49 +            # Write the number from least to most significant digits.
    2.50 +
    2.51 +            bytes = []
    2.52 +
    2.53 +            while number != 0:
    2.54 +                lsd = number & 127
    2.55 +                number = number >> 7
    2.56 +                if number != 0:
    2.57 +                    lsd |= 128
    2.58 +                bytes.append(chr(lsd))
    2.59 +
    2.60 +            return "".join(bytes)
    2.61 +
    2.62 +        # Negative numbers are not supported.
    2.63 +
    2.64 +        else:
    2.65 +            raise ValueError, "Number %r is negative." % number
    2.66 +
    2.67 +# vim: tabstop=4 expandtab shiftwidth=4

     3.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     3.2 +++ b/iixr/fields.py	Tue Sep 15 00:15:11 2009 +0200
     3.3 @@ -0,0 +1,256 @@
     3.4 +#!/usr/bin/env python
     3.5 +
     3.6 +"""
     3.7 +Specific classes for storing document information.
     3.8 +
     3.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    3.10 +
    3.11 +This program is free software; you can redistribute it and/or modify it under
    3.12 +the terms of the GNU General Public License as published by the Free Software
    3.13 +Foundation; either version 3 of the License, or (at your option) any later
    3.14 +version.
    3.15 +
    3.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    3.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    3.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    3.19 +
    3.20 +You should have received a copy of the GNU General Public License along
    3.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    3.22 +"""
    3.23 +
    3.24 +from iixr.files import *
    3.25 +from bisect import bisect_right  # to find terms in the dictionary index
    3.26 +
    3.27 +class FieldWriter(FileWriter):
    3.28 +
    3.29 +    "Writing field data to files."
    3.30 +
    3.31 +    def reset(self):
    3.32 +        self.last_docnum = 0
    3.33 +
    3.34 +    def write_fields(self, docnum, fields):
    3.35 +
    3.36 +        """
    3.37 +        Write for the given 'docnum', a list of 'fields' (integer, string pairs
    3.38 +        representing field identifiers and values respectively).
    3.39 +        Return the offset at which the fields are stored.
    3.40 +        """
    3.41 +
    3.42 +        offset = self.tell()
    3.43 +
    3.44 +        # Write the document number delta.
    3.45 +
    3.46 +        self.write_number(docnum - self.last_docnum)
    3.47 +
    3.48 +        # Write the number of fields.
    3.49 +
    3.50 +        self.write_number(len(fields))
    3.51 +
    3.52 +        # Write the fields themselves.
    3.53 +
    3.54 +        for i, field in fields:
    3.55 +            self.write_number(i)
    3.56 +            self.write_string(field, 1) # compress
    3.57 +
    3.58 +        self.last_docnum = docnum
    3.59 +        return offset
    3.60 +
    3.61 +class FieldReader(FileReader):
    3.62 +
    3.63 +    "Reading field data from files."
    3.64 +
    3.65 +    def reset(self):
    3.66 +        self.last_docnum = 0
    3.67 +
    3.68 +    def read_fields(self):
    3.69 +
    3.70 +        """
    3.71 +        Read fields from the file, returning a tuple containing the document
    3.72 +        number and a list of field (identifier, value) pairs.
    3.73 +        """
    3.74 +
    3.75 +        # Read the document number delta.
    3.76 +
    3.77 +        self.last_docnum += self.read_number()
    3.78 +
    3.79 +        # Read the number of fields.
    3.80 +
    3.81 +        nfields = self.read_number()
    3.82 +
    3.83 +        # Collect the fields.
    3.84 +
    3.85 +        fields = []
    3.86 +        i = 0
    3.87 +
    3.88 +        while i < nfields:
    3.89 +            identifier = self.read_number()
    3.90 +            value = self.read_string(1) # decompress
    3.91 +            fields.append((identifier, value))
    3.92 +            i += 1
    3.93 +
    3.94 +        return self.last_docnum, fields
    3.95 +
    3.96 +    def read_document_fields(self, docnum, offset):
    3.97 +
    3.98 +        """
    3.99 +        Read fields for 'docnum' at the given 'offset'. This permits the
   3.100 +        retrieval of details for the specified document, as well as scanning for
   3.101 +        later documents.
   3.102 +        """
   3.103 +
   3.104 +        self.seek(offset)
   3.105 +        bad_docnum, fields = self.read_fields()
   3.106 +        self.last_docnum = docnum
   3.107 +        return docnum, fields
   3.108 +
   3.109 +class FieldIndexWriter(FileWriter):
   3.110 +
   3.111 +    "Writing field index details to files."
   3.112 +
   3.113 +    def reset(self):
   3.114 +        self.last_docnum = 0
   3.115 +        self.last_offset = 0
   3.116 +
   3.117 +    def write_document(self, docnum, offset):
   3.118 +
   3.119 +        """
   3.120 +        Write for the given 'docnum', the 'offset' at which the fields for the
   3.121 +        document are stored in the fields file.
   3.122 +        """
   3.123 +
   3.124 +        # Write the document number and offset deltas.
   3.125 +
   3.126 +        self.write_number(docnum - self.last_docnum)
   3.127 +        self.write_number(offset - self.last_offset)
   3.128 +
   3.129 +        self.last_docnum = docnum
   3.130 +        self.last_offset = offset
   3.131 +
   3.132 +class FieldIndexReader(FileReader):
   3.133 +
   3.134 +    "Reading field index details from files."
   3.135 +
   3.136 +    def reset(self):
   3.137 +        self.last_docnum = 0
   3.138 +        self.last_offset = 0
   3.139 +
   3.140 +    def read_document(self):
   3.141 +
   3.142 +        "Read a document number and field file offset."
   3.143 +
   3.144 +        # Read the document number and offset deltas.
   3.145 +
   3.146 +        self.last_docnum += self.read_number()
   3.147 +        self.last_offset += self.read_number()
   3.148 +
   3.149 +        return self.last_docnum, self.last_offset
   3.150 +
   3.151 +class FieldDictionaryWriter:
   3.152 +
   3.153 +    "Writing field dictionary details."
   3.154 +
   3.155 +    def __init__(self, field_writer, field_index_writer, interval):
   3.156 +        self.field_writer = field_writer
   3.157 +        self.field_index_writer = field_index_writer
   3.158 +        self.interval = interval
   3.159 +        self.entry = 0
   3.160 +
   3.161 +    def write_fields(self, docnum, fields):
   3.162 +
   3.163 +        "Write details of the document with the given 'docnum' and 'fields'."
   3.164 +
   3.165 +        offset = self.field_writer.write_fields(docnum, fields)
   3.166 +
   3.167 +        if self.entry % self.interval == 0:
   3.168 +            self.field_index_writer.write_document(docnum, offset)
   3.169 +
   3.170 +        self.entry += 1
   3.171 +
   3.172 +    def close(self):
   3.173 +        self.field_writer.close()
   3.174 +        self.field_index_writer.close()
   3.175 +
   3.176 +class FieldDictionaryReader:
   3.177 +
   3.178 +    "Reading field dictionary details."
   3.179 +
   3.180 +    def __init__(self, field_reader, field_index_reader):
   3.181 +        self.field_reader = field_reader
   3.182 +        self.field_index_reader = field_index_reader
   3.183 +
   3.184 +        self.docs = []
   3.185 +        try:
   3.186 +            while 1:
   3.187 +                self.docs.append(self.field_index_reader.read_document())
   3.188 +        except EOFError:
   3.189 +            pass
   3.190 +
   3.191 +        # The last indexed offset, used to order search keys after index entries.
   3.192 +
   3.193 +        if self.docs:
   3.194 +            self.max_offset = self.docs[-1][1]
   3.195 +        else:
   3.196 +            self.max_offset = None
   3.197 +
   3.198 +    # Iterator convenience methods.
   3.199 +
   3.200 +    def __iter__(self):
   3.201 +        self.rewind()
   3.202 +        return self
   3.203 +
   3.204 +    def next(self):
   3.205 +        try:
   3.206 +            return self.read_fields()
   3.207 +        except EOFError:
   3.208 +            raise StopIteration
   3.209 +
   3.210 +    # Sequential access methods.
   3.211 +
   3.212 +    def rewind(self):
   3.213 +        self.field_reader.rewind()
   3.214 +
   3.215 +    def read_fields(self):
   3.216 +
   3.217 +        "Return the next document number and fields."
   3.218 +
   3.219 +        return self.field_reader.read_fields()
   3.220 +
   3.221 +    # Random access methods.
   3.222 +
   3.223 +    def get_fields(self, docnum):
   3.224 +
   3.225 +        "Read the fields of the document with the given 'docnum'."
   3.226 +
   3.227 +        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
   3.228 +
   3.229 +        # Get the entry position providing the document or one preceding it.
   3.230 +
   3.231 +        if i == -1:
   3.232 +            return None
   3.233 +
   3.234 +        found_docnum, offset = self.docs[i]
   3.235 +
   3.236 +        # Read from the fields file.
   3.237 +
   3.238 +        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
   3.239 +
   3.240 +        # Scan for the document, if necessary.
   3.241 +
   3.242 +        try:
   3.243 +            while docnum > found_docnum:
   3.244 +                found_docnum, fields = self.field_reader.read_fields()
   3.245 +        except EOFError:
   3.246 +            pass
   3.247 +
   3.248 +        # If the document is found, return the fields.
   3.249 +
   3.250 +        if docnum == found_docnum:
   3.251 +            return fields
   3.252 +        else:
   3.253 +            return None
   3.254 +
   3.255 +    def close(self):
   3.256 +        self.field_reader.close()
   3.257 +        self.field_index_reader.close()
   3.258 +
   3.259 +# vim: tabstop=4 expandtab shiftwidth=4

     4.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     4.2 +++ b/iixr/files.py	Tue Sep 15 00:15:11 2009 +0200
     4.3 @@ -0,0 +1,264 @@
     4.4 +#!/usr/bin/env python
     4.5 +
     4.6 +"""
     4.7 +Generic file access.
     4.8 +
     4.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    4.10 +
    4.11 +This program is free software; you can redistribute it and/or modify it under
    4.12 +the terms of the GNU General Public License as published by the Free Software
    4.13 +Foundation; either version 3 of the License, or (at your option) any later
    4.14 +version.
    4.15 +
    4.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    4.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    4.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    4.19 +
    4.20 +You should have received a copy of the GNU General Public License along
    4.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    4.22 +"""
    4.23 +
    4.24 +from iixr.data import vint
    4.25 +import bz2, zlib
    4.26 +
    4.27 +# Constants.
    4.28 +
    4.29 +WRITE_CACHE_SIZE  = 100000
    4.30 +READ_CACHE_SIZE   = 10000
    4.31 +READ_CACHE_RESIZE = 5000
    4.32 +
    4.33 +compressors = [("b", bz2.compress), ("z", zlib.compress)]
    4.34 +decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
    4.35 +
    4.36 +class File:
    4.37 +
    4.38 +    "A basic file abstraction."
    4.39 +
    4.40 +    def __init__(self, f):
    4.41 +        self.f = f
    4.42 +        self.reset()
    4.43 +
    4.44 +    def reset(self):
    4.45 +
    4.46 +        "To be used to reset the state of the reader or writer between records."
    4.47 +
    4.48 +        pass
    4.49 +
    4.50 +    def rewind(self):
    4.51 +        self.seek(0)
    4.52 +        self.reset()
    4.53 +
    4.54 +    def seek(self, offset):
    4.55 +
    4.56 +        "To be defined by readers."
    4.57 +
    4.58 +        pass
    4.59 +
    4.60 +    def flush(self):
    4.61 +
    4.62 +        "To be defined by writers."
    4.63 +
    4.64 +        pass
    4.65 +
    4.66 +    def close(self):
    4.67 +        if self.f is not None:
    4.68 +            self.flush()
    4.69 +            self.f.close()
    4.70 +            self.f = None
    4.71 +
    4.72 +class FileWriter(File):
    4.73 +
    4.74 +    "Writing basic data types to files."
    4.75 +
    4.76 +    def __init__(self, f):
    4.77 +        File.__init__(self, f)
    4.78 +        self.cache = []
    4.79 +        self.cache_length = 0
    4.80 +
    4.81 +    def write_number(self, number):
    4.82 +
    4.83 +        "Write 'number' to the file using a variable length encoding."
    4.84 +
    4.85 +        self.write(vint(number))
    4.86 +
    4.87 +    def write_string(self, s, compress=0):
    4.88 +
    4.89 +        """
    4.90 +        Write 's' to the file, recording its length and compressing the string
    4.91 +        if 'compress' is set to a true value.
    4.92 +        """
    4.93 +
    4.94 +        # Convert Unicode objects to strings.
    4.95 +
    4.96 +        if isinstance(s, unicode):
    4.97 +            s = s.encode("utf-8")
    4.98 +
    4.99 +        # Compress the string if requested.
   4.100 +
   4.101 +        if compress:
   4.102 +            for flag, fn in compressors:
   4.103 +                cs = fn(s)
   4.104 +
   4.105 +                # Take the first string shorter than the original.
   4.106 +
   4.107 +                if len(cs) < len(s):
   4.108 +                    s = cs
   4.109 +                    break
   4.110 +            else:
   4.111 +                flag = "-"
   4.112 +
   4.113 +        else:
   4.114 +            flag = ""
   4.115 +
   4.116 +        # Write the length of the data before the data itself.
   4.117 +
   4.118 +        length = len(s)
   4.119 +        self.write(flag + vint(length) + s)
   4.120 +
   4.121 +    # Cache-affected methods.
   4.122 +
   4.123 +    def write(self, s):
   4.124 +        self.cache.append(s)
   4.125 +        self.cache_length += len(s)
   4.126 +        if self.cache_length >= WRITE_CACHE_SIZE:
   4.127 +            self.flush()
   4.128 +
   4.129 +    def tell(self):
   4.130 +        return self.f.tell() + self.cache_length
   4.131 +
   4.132 +    def flush(self):
   4.133 +        self.f.write("".join(self.cache))
   4.134 +        self.cache = []
   4.135 +        self.cache_length = 0
   4.136 +
   4.137 +class FileReader(File):
   4.138 +
   4.139 +    "Reading basic data types from files."
   4.140 +
   4.141 +    def __init__(self, f):
   4.142 +        File.__init__(self, f)
   4.143 +        self.reset_cache()
   4.144 +
   4.145 +    def reset_cache(self):
   4.146 +        self.cache = ""
   4.147 +        self.cache_length = 0
   4.148 +        self.cache_start = 0
   4.149 +
   4.150 +    def read_number(self):
   4.151 +
   4.152 +        "Read a number from the file."
   4.153 +
   4.154 +        # Read each byte, adding it to the number.
   4.155 +
   4.156 +        shift = 0
   4.157 +        number = 0
   4.158 +        read = self.read
   4.159 +
   4.160 +        try:
   4.161 +            csd = ord(read(1))
   4.162 +            while csd & 128:
   4.163 +                number += ((csd & 127) << shift)
   4.164 +                shift += 7
   4.165 +                csd = ord(read(1))
   4.166 +            else:
   4.167 +                number += (csd << shift)
   4.168 +        except TypeError:
   4.169 +            raise EOFError
   4.170 +
   4.171 +        return number
   4.172 +
   4.173 +    def read_string(self, decompress=0):
   4.174 +
   4.175 +        """
   4.176 +        Read a string from the file, decompressing the stored data if
   4.177 +        'decompress' is set to a true value.
   4.178 +        """
   4.179 +
   4.180 +        # Read the compression flag if decompression is requested.
   4.181 +
   4.182 +        if decompress:
   4.183 +            flag = self.read(1)
   4.184 +        else:
   4.185 +            flag = "-"
   4.186 +
   4.187 +        length = self.read_number()
   4.188 +        s = self.read(length)
   4.189 +
   4.190 +        # Perform decompression if applicable.
   4.191 +
   4.192 +        if flag != "-":
   4.193 +            fn = decompressors[flag]
   4.194 +            s = fn(s)
   4.195 +
   4.196 +        # Convert strings to Unicode objects.
   4.197 +
   4.198 +        return unicode(s, "utf-8")
   4.199 +
   4.200 +    # Cache-affected methods.
   4.201 +
   4.202 +    def read(self, n):
   4.203 +        needed = n - (self.cache_length - self.cache_start)
   4.204 +
   4.205 +        # Read the needed number of characters, if possible.
   4.206 +
   4.207 +        if needed > 0:
   4.208 +            s = self.f.read(max(needed, READ_CACHE_SIZE))
   4.209 +            self.cache += s
   4.210 +            self.cache_length += len(s)
   4.211 +
   4.212 +        # Get the end of the requested block.
   4.213 +
   4.214 +        next_start = self.cache_start + n
   4.215 +        s = self.cache[self.cache_start:next_start]
   4.216 +
   4.217 +        # Reposition the pointer to the cache.
   4.218 +
   4.219 +        self._seek_cache(len(s))
   4.220 +        return s
   4.221 +
   4.222 +    def tell(self):
   4.223 +        return self.f.tell() - self.cache_length + self.cache_start
   4.224 +
   4.225 +    def seek(self, offset):
   4.226 +        current = self.tell()
   4.227 +        self.f.seek(offset)
   4.228 +
   4.229 +        # If seeking forward, attempt to navigate the cache.
   4.230 +
   4.231 +        if offset >= current:
   4.232 +            self._seek_cache(offset - current)
   4.233 +        else:
   4.234 +            self.reset_cache()
   4.235 +
   4.236 +    def _seek_cache(self, delta):
   4.237 +        next_start = self.cache_start + delta
   4.238 +
   4.239 +        if next_start > 0 and next_start >= len(self.cache):
   4.240 +            self.reset_cache()
   4.241 +
   4.242 +        # If the cache is too big, resize it.
   4.243 +
   4.244 +        elif next_start > READ_CACHE_RESIZE:
   4.245 +            self.cache = self.cache[next_start:]
   4.246 +            self.cache_length = len(self.cache)
   4.247 +            self.cache_start = 0
   4.248 +
   4.249 +        # Otherwise, just reference the next part of the cache.
   4.250 +
   4.251 +        else:
   4.252 +            self.cache_start = next_start
   4.253 +
   4.254 +class FileOpener:
   4.255 +
   4.256 +    "Opening files using their filenames."
   4.257 +
   4.258 +    def __init__(self, filename):
   4.259 +        self.filename = filename
   4.260 +
   4.261 +    def open(self, mode):
   4.262 +        return open(self.filename, mode)
   4.263 +
   4.264 +    def close(self):
   4.265 +        pass
   4.266 +
   4.267 +# vim: tabstop=4 expandtab shiftwidth=4

     5.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     5.2 +++ b/iixr/filesystem.py	Tue Sep 15 00:15:11 2009 +0200
     5.3 @@ -0,0 +1,129 @@
     5.4 +#!/usr/bin/env python
     5.5 +
     5.6 +"""
     5.7 +File access.
     5.8 +
     5.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    5.10 +
    5.11 +This program is free software; you can redistribute it and/or modify it under
    5.12 +the terms of the GNU General Public License as published by the Free Software
    5.13 +Foundation; either version 3 of the License, or (at your option) any later
    5.14 +version.
    5.15 +
    5.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    5.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    5.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    5.19 +
    5.20 +You should have received a copy of the GNU General Public License along
    5.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    5.22 +"""
    5.23 +
    5.24 +from iixr.fields import *
    5.25 +from iixr.terms import *
    5.26 +from iixr.positions import *
    5.27 +from os import remove, rename    # partition manipulation
    5.28 +from os.path import join
    5.29 +
    5.30 +# Constants.
    5.31 +
    5.32 +TERM_FILENAMES    = "terms", "terms_index", "positions", "positions_index"
    5.33 +FIELD_FILENAMES   = "fields", "fields_index"
    5.34 +
    5.35 +# Utility functions.
    5.36 +
    5.37 +def get_term_writer(pathname, partition, interval, doc_interval):
    5.38 +
    5.39 +    """
    5.40 +    Return a term dictionary writer using files under the given 'pathname'
    5.41 +    labelled according to the given 'partition', using the given indexing
    5.42 +    'interval' for terms and 'doc_interval' for document position records.
    5.43 +    """
    5.44 +
    5.45 +    tdf = open(join(pathname, "terms-%s" % partition), "wb")
    5.46 +    info_writer = TermWriter(tdf)
    5.47 +
    5.48 +    tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
    5.49 +    index_writer = TermIndexWriter(tdif)
    5.50 +
    5.51 +    tpf = open(join(pathname, "positions-%s" % partition), "wb")
    5.52 +    positions_writer = PositionWriter(tpf)
    5.53 +
    5.54 +    tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
    5.55 +    positions_index_writer = PositionIndexWriter(tpif)
    5.56 +
    5.57 +    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
    5.58 +
    5.59 +    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
    5.60 +
    5.61 +def get_field_writer(pathname, partition, interval):
    5.62 +
    5.63 +    """
    5.64 +    Return a field dictionary writer using files under the given 'pathname'
    5.65 +    labelled according to the given 'partition', using the given indexing
    5.66 +    'interval'.
    5.67 +    """
    5.68 +
    5.69 +    ff = open(join(pathname, "fields-%s" % partition), "wb")
    5.70 +    field_writer = FieldWriter(ff)
    5.71 +
    5.72 +    fif = open(join(pathname, "fields_index-%s" % partition), "wb")
    5.73 +    field_index_writer = FieldIndexWriter(fif)
    5.74 +
    5.75 +    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
    5.76 +
    5.77 +def get_term_reader(pathname, partition):
    5.78 +
    5.79 +    """
    5.80 +    Return a term dictionary reader using files under the given 'pathname'
    5.81 +    labelled according to the given 'partition'.
    5.82 +    """
    5.83 +
    5.84 +    tdf = open(join(pathname, "terms-%s" % partition), "rb")
    5.85 +    info_reader = TermReader(tdf)
    5.86 +
    5.87 +    tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
    5.88 +    index_reader = TermIndexReader(tdif)
    5.89 +
    5.90 +    positions_opener = PositionOpener(join(pathname, "positions-%s" % partition))
    5.91 +    positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition))
    5.92 +
    5.93 +    positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)
    5.94 +
    5.95 +    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
    5.96 +
    5.97 +def get_field_reader(pathname, partition):
    5.98 +
    5.99 +    """
   5.100 +    Return a field dictionary reader using files under the given 'pathname'
   5.101 +    labelled according to the given 'partition'.
   5.102 +    """
   5.103 +
   5.104 +    ff = open(join(pathname, "fields-%s" % partition), "rb")
   5.105 +    field_reader = FieldReader(ff)
   5.106 +
   5.107 +    fif = open(join(pathname, "fields_index-%s" % partition), "rb")
   5.108 +    field_index_reader = FieldIndexReader(fif)
   5.109 +
   5.110 +    return FieldDictionaryReader(field_reader, field_index_reader)
   5.111 +
   5.112 +def rename_files(pathname, names, from_partition, to_partition):
   5.113 +    for name in names:
   5.114 +        rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))
   5.115 +
   5.116 +def rename_term_files(pathname, from_partition, to_partition):
   5.117 +    rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
   5.118 +
   5.119 +def rename_field_files(pathname, from_partition, to_partition):
   5.120 +    rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
   5.121 +
   5.122 +def remove_files(pathname, names, partition):
   5.123 +    for name in names:
   5.124 +        remove(join(pathname, "%s-%s" % (name, partition)))
   5.125 +
   5.126 +def remove_term_files(pathname, partition):
   5.127 +    remove_files(pathname, TERM_FILENAMES, partition)
   5.128 +
   5.129 +def remove_field_files(pathname, partition):
   5.130 +    remove_files(pathname, FIELD_FILENAMES, partition)
   5.131 +
   5.132 +# vim: tabstop=4 expandtab shiftwidth=4

     6.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     6.2 +++ b/iixr/index.py	Tue Sep 15 00:15:11 2009 +0200
     6.3 @@ -0,0 +1,326 @@
     6.4 +#!/usr/bin/env python
     6.5 +
     6.6 +"""
     6.7 +High-level classes.
     6.8 +
     6.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    6.10 +
    6.11 +This program is free software; you can redistribute it and/or modify it under
    6.12 +the terms of the GNU General Public License as published by the Free Software
    6.13 +Foundation; either version 3 of the License, or (at your option) any later
    6.14 +version.
    6.15 +
    6.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    6.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    6.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    6.19 +
    6.20 +You should have received a copy of the GNU General Public License along
    6.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    6.22 +"""
    6.23 +
    6.24 +from iixr.filesystem import *
    6.25 +from os import listdir, mkdir    # index and partition discovery
    6.26 +from os.path import exists
    6.27 +
    6.28 +try:
    6.29 +    set
    6.30 +except NameError:
    6.31 +    from sets import Set as set
    6.32 +
    6.33 +# Constants.
    6.34 +
    6.35 +TERM_INTERVAL     = 100
    6.36 +DOCUMENT_INTERVAL = 100
    6.37 +FIELD_INTERVAL    = 100
    6.38 +FLUSH_INTERVAL    = 10000
    6.39 +
    6.40 +# High-level classes.
    6.41 +
    6.42 +class Document:
    6.43 +
    6.44 +    "A container of document information."
    6.45 +
    6.46 +    def __init__(self, docnum):
    6.47 +        self.docnum = docnum
    6.48 +        self.fields = []
    6.49 +        self.terms = {}
    6.50 +
    6.51 +    def add_position(self, term, position):
    6.52 +
    6.53 +        """
    6.54 +        Add a position entry for the given 'term', indicating the given
    6.55 +        'position'.
    6.56 +        """
    6.57 +
    6.58 +        self.terms.setdefault(term, []).append(position)
    6.59 +
    6.60 +    def add_field(self, identifier, value):
    6.61 +
    6.62 +        "Add a field having the given 'identifier' and 'value'."
    6.63 +
    6.64 +        self.fields.append((identifier, unicode(value))) # convert to string
    6.65 +
    6.66 +    def set_fields(self, fields):
    6.67 +
    6.68 +        """
    6.69 +        Set the document's 'fields': a list of tuples each containing an integer
    6.70 +        identifier and a string value.
    6.71 +        """
    6.72 +
    6.73 +        self.fields = fields
    6.74 +
    6.75 +class IndexWriter:
    6.76 +
    6.77 +    """
    6.78 +    Building term information and writing it to the term and field dictionaries.
    6.79 +    """
    6.80 +
    6.81 +    def __init__(self, pathname, interval, doc_interval, flush_interval):
    6.82 +        self.pathname = pathname
    6.83 +        self.interval = interval
    6.84 +        self.doc_interval = doc_interval
    6.85 +        self.flush_interval = flush_interval
    6.86 +
    6.87 +        self.dict_partition = 0
    6.88 +        self.field_dict_partition = 0
    6.89 +
    6.90 +        self.terms = {}
    6.91 +        self.docs = {}
    6.92 +
    6.93 +        self.doc_counter = 0
    6.94 +
    6.95 +    def add_document(self, doc):
    6.96 +
    6.97 +        """
    6.98 +        Add the given document 'doc', updating the document counter and flushing
    6.99 +        terms and fields if appropriate.
   6.100 +        """
   6.101 +
   6.102 +        for term, positions in doc.terms.items():
   6.103 +            self.terms.setdefault(term, {})[doc.docnum] = positions
   6.104 +
   6.105 +        self.docs[doc.docnum] = doc.fields
   6.106 +
   6.107 +        self.doc_counter += 1
   6.108 +        if self.flush_interval and self.doc_counter >= self.flush_interval:
   6.109 +            self.flush_terms()
   6.110 +            self.flush_fields()
   6.111 +            self.doc_counter = 0
   6.112 +
   6.113 +    def get_term_writer(self):
   6.114 +
   6.115 +        "Return a term dictionary writer for the current partition."
   6.116 +
   6.117 +        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
   6.118 +
   6.119 +    def get_field_writer(self):
   6.120 +
   6.121 +        "Return a field dictionary writer for the current partition."
   6.122 +
   6.123 +        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)
   6.124 +
   6.125 +    def flush_terms(self):
   6.126 +
   6.127 +        "Flush terms into the current term dictionary partition."
   6.128 +
   6.129 +        # Get the terms in order.
   6.130 +
   6.131 +        all_terms = self.terms
   6.132 +        terms = all_terms.keys()
   6.133 +        terms.sort()
   6.134 +
   6.135 +        dict_writer = self.get_term_writer()
   6.136 +
   6.137 +        for term in terms:
   6.138 +            doc_positions = all_terms[term].items()
   6.139 +            dict_writer.write_term_positions(term, doc_positions)
   6.140 +
   6.141 +        dict_writer.close()
   6.142 +
   6.143 +        self.terms = {}
   6.144 +        self.dict_partition += 1
   6.145 +
   6.146 +    def flush_fields(self):
   6.147 +
    6.148 +        "Flush fields into the current field dictionary partition."
   6.149 +
   6.150 +        # Get the documents in order.
   6.151 +
   6.152 +        docs = self.docs.items()
   6.153 +        docs.sort()
   6.154 +
   6.155 +        field_dict_writer = self.get_field_writer()
   6.156 +
   6.157 +        for docnum, fields in docs:
   6.158 +            field_dict_writer.write_fields(docnum, fields)
   6.159 +
   6.160 +        field_dict_writer.close()
   6.161 +
   6.162 +        self.docs = {}
   6.163 +        self.field_dict_partition += 1
   6.164 +
   6.165 +    def close(self):
   6.166 +        if self.terms:
   6.167 +            self.flush_terms()
   6.168 +        if self.docs:
   6.169 +            self.flush_fields()
   6.170 +
   6.171 +class IndexReader:
   6.172 +
   6.173 +    "Accessing the term and field dictionaries."
   6.174 +
   6.175 +    def __init__(self, pathname):
   6.176 +        self.dict_reader = get_term_reader(pathname, "merged")
   6.177 +        self.field_dict_reader = get_field_reader(pathname, "merged")
   6.178 +
   6.179 +    def find_terms(self, term):
   6.180 +        return self.dict_reader.find_terms(term)
   6.181 +
   6.182 +    def find_positions(self, term):
   6.183 +        return self.dict_reader.find_positions(term)
   6.184 +
   6.185 +    def get_frequency(self, term):
   6.186 +        return self.dict_reader.get_frequency(term)
   6.187 +
   6.188 +    def get_document_frequency(self, term):
   6.189 +        return self.dict_reader.get_document_frequency(term)
   6.190 +
   6.191 +    def get_fields(self, docnum):
   6.192 +        return self.field_dict_reader.get_fields(docnum)
   6.193 +
   6.194 +    def close(self):
   6.195 +        self.dict_reader.close()
   6.196 +        self.field_dict_reader.close()
   6.197 +
   6.198 +class Index:
   6.199 +
   6.200 +    "An inverted index solution encapsulating the various components."
   6.201 +
   6.202 +    def __init__(self, pathname):
   6.203 +        self.pathname = pathname
   6.204 +        self.reader = None
   6.205 +        self.writer = None
   6.206 +
   6.207 +    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):
   6.208 +
   6.209 +        """
   6.210 +        Return a writer, optionally using the given indexing 'interval',
   6.211 +        'doc_interval' and 'flush_interval'.
   6.212 +        """
   6.213 +
   6.214 +        if not exists(self.pathname):
   6.215 +            mkdir(self.pathname)
   6.216 +
   6.217 +        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
   6.218 +        return self.writer
   6.219 +
   6.220 +    def get_reader(self, partition=0):
   6.221 +
   6.222 +        "Return a reader for the index."
   6.223 +
   6.224 +        # Ensure that only one partition exists.
   6.225 +
   6.226 +        self.merge()
   6.227 +        return self._get_reader(partition)
   6.228 +
   6.229 +    def _get_reader(self, partition):
   6.230 +
   6.231 +        "Return a reader for the index."
   6.232 +
   6.233 +        if not exists(self.pathname):
   6.234 +            raise OSError, "Index path %r does not exist." % self.pathname
   6.235 +
   6.236 +        self.reader = IndexReader(self.pathname)
   6.237 +        return self.reader
   6.238 +
   6.239 +    def merge(self):
   6.240 +
   6.241 +        "Merge/optimise index partitions."
   6.242 +
   6.243 +        self.merge_terms()
   6.244 +        self.merge_fields()
   6.245 +
   6.246 +    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
   6.247 +
   6.248 +        """
   6.249 +        Merge term dictionaries using the given indexing 'interval' and
   6.250 +        'doc_interval'.
   6.251 +        """
   6.252 +
   6.253 +        readers = []
   6.254 +        partitions = set()
   6.255 +
   6.256 +        for filename in listdir(self.pathname):
   6.257 +            if filename.startswith("terms-"): # 6 character prefix
   6.258 +                partition = filename[6:]
   6.259 +                readers.append(get_term_reader(self.pathname, partition))
   6.260 +                partitions.add(partition)
   6.261 +
   6.262 +        # Write directly to a dictionary.
   6.263 +
   6.264 +        if len(readers) > 1:
   6.265 +            if "merged" in partitions:
   6.266 +                rename_term_files(self.pathname, "merged", "old-merged")
   6.267 +                partitions.remove("merged")
   6.268 +                partitions.add("old-merged")
   6.269 +
   6.270 +            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
   6.271 +            merger = TermDictionaryMerger(writer, readers)
   6.272 +            merger.merge()
   6.273 +            merger.close()
   6.274 +
   6.275 +            # Remove old files.
   6.276 +
   6.277 +            for partition in partitions:
   6.278 +                remove_term_files(self.pathname, partition)
   6.279 +
   6.280 +        elif len(readers) == 1:
   6.281 +            partition = list(partitions)[0]
   6.282 +            if partition != "merged":
   6.283 +                rename_term_files(self.pathname, partition, "merged")
   6.284 +
   6.285 +    def merge_fields(self, interval=FIELD_INTERVAL):
   6.286 +
   6.287 +        "Merge field dictionaries using the given indexing 'interval'."
   6.288 +
   6.289 +        readers = []
   6.290 +        partitions = set()
   6.291 +
   6.292 +        for filename in listdir(self.pathname):
   6.293 +            if filename.startswith("fields-"): # 7 character prefix
   6.294 +                partition = filename[7:]
   6.295 +                readers.append(get_field_reader(self.pathname, partition))
   6.296 +                partitions.add(partition)
   6.297 +
   6.298 +        # Write directly to a dictionary.
   6.299 +
   6.300 +        if len(readers) > 1:
   6.301 +            if "merged" in partitions:
   6.302 +                rename_field_files(self.pathname, "merged", "old-merged")
   6.303 +                partitions.remove("merged")
   6.304 +                partitions.add("old-merged")
   6.305 +
   6.306 +            writer = get_field_writer(self.pathname, "merged", interval)
   6.307 +            merger = FieldDictionaryMerger(writer, readers)
   6.308 +            merger.merge()
   6.309 +            merger.close()
   6.310 +
   6.311 +            # Remove old files.
   6.312 +
   6.313 +            for partition in partitions:
   6.314 +                remove_field_files(self.pathname, partition)
   6.315 +
   6.316 +        elif len(readers) == 1:
   6.317 +            partition = list(partitions)[0]
   6.318 +            if partition != "merged":
   6.319 +                rename_field_files(self.pathname, partition, "merged")
   6.320 +
   6.321 +    def close(self):
   6.322 +        if self.reader is not None:
   6.323 +            self.reader.close()
   6.324 +            self.reader = None
   6.325 +        if self.writer is not None:
   6.326 +            self.writer.close()
   6.327 +            self.writer = None
   6.328 +
   6.329 +# vim: tabstop=4 expandtab shiftwidth=4

     7.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     7.2 +++ b/iixr/merging.py	Tue Sep 15 00:15:11 2009 +0200
     7.3 @@ -0,0 +1,74 @@
     7.4 +#!/usr/bin/env python
     7.5 +
     7.6 +"""
     7.7 +Dictionary merging classes.
     7.8 +
     7.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    7.10 +
    7.11 +This program is free software; you can redistribute it and/or modify it under
    7.12 +the terms of the GNU General Public License as published by the Free Software
    7.13 +Foundation; either version 3 of the License, or (at your option) any later
    7.14 +version.
    7.15 +
    7.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    7.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    7.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    7.19 +
    7.20 +You should have received a copy of the GNU General Public License along
    7.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    7.22 +"""
    7.23 +
    7.24 +from itermerge import itermerge
    7.25 +
    7.26 +class Merger:
    7.27 +
    7.28 +    "Merge files."
    7.29 +
    7.30 +    def __init__(self, writer, readers):
    7.31 +        self.writer = writer
    7.32 +        self.readers = readers
    7.33 +
    7.34 +    def close(self):
    7.35 +        for reader in self.readers:
    7.36 +            reader.close()
    7.37 +        self.writer.close()
    7.38 +
    7.39 +class TermDictionaryMerger(Merger):
    7.40 +
    7.41 +    "Merge term and position files."
    7.42 +
    7.43 +    def merge(self):
    7.44 +
    7.45 +        """
    7.46 +        Merge terms and positions from the readers, sending them to the writer.
    7.47 +        """
    7.48 +
    7.49 +        last_term = None
    7.50 +        current_readers = []
    7.51 +
    7.52 +        for term, frequency, doc_frequency, positions in itermerge(self.readers):
    7.53 +            if term == last_term:
    7.54 +                current_readers.append(positions)
    7.55 +            else:
    7.56 +                if current_readers:
    7.57 +                    self.writer.write_term_positions(last_term, itermerge(current_readers))
    7.58 +                last_term = term
    7.59 +                current_readers = [positions]
    7.60 +        else:
    7.61 +            if current_readers:
    7.62 +                self.writer.write_term_positions(last_term, itermerge(current_readers))
    7.63 +
    7.64 +class FieldDictionaryMerger(Merger):
    7.65 +
    7.66 +    "Merge field files."
    7.67 +
    7.68 +    def merge(self):
    7.69 +
    7.70 +        """
    7.71 +        Merge fields from the readers, sending them to the writer.
    7.72 +        """
    7.73 +
    7.74 +        for docnum, fields in itermerge(self.readers):
    7.75 +            self.writer.write_fields(docnum, fields)
    7.76 +
    7.77 +# vim: tabstop=4 expandtab shiftwidth=4

     8.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     8.2 +++ b/iixr/positions.py	Tue Sep 15 00:15:11 2009 +0200
     8.3 @@ -0,0 +1,525 @@
     8.4 +#!/usr/bin/env python
     8.5 +
     8.6 +"""
     8.7 +Specific classes for storing position information.
     8.8 +
     8.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    8.10 +
    8.11 +This program is free software; you can redistribute it and/or modify it under
    8.12 +the terms of the GNU General Public License as published by the Free Software
    8.13 +Foundation; either version 3 of the License, or (at your option) any later
    8.14 +version.
    8.15 +
    8.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    8.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    8.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    8.19 +
    8.20 +You should have received a copy of the GNU General Public License along
    8.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    8.22 +"""
    8.23 +
    8.24 +from iixr.files import *
    8.25 +from iixr.data import vint
    8.26 +
    8.27 +class PositionWriter(FileWriter):
    8.28 +
    8.29 +    "Writing position information to files."
    8.30 +
    8.31 +    def reset(self):
    8.32 +        self.last_docnum = 0
    8.33 +
    8.34 +    def write_positions(self, docnum, positions):
    8.35 +
    8.36 +        """
    8.37 +        Write for the document 'docnum' the given 'positions'.
    8.38 +        Return the offset of the written record.
    8.39 +        """
    8.40 +
    8.41 +        if docnum < self.last_docnum:
    8.42 +            raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
    8.43 +
    8.44 +        # Record the offset of this record.
    8.45 +
    8.46 +        offset = self.tell()
    8.47 +
    8.48 +        # Make sure that the positions are sorted.
    8.49 +
    8.50 +        positions.sort()
    8.51 +
    8.52 +        # Write the position deltas.
    8.53 +
    8.54 +        output = []
    8.55 +        last = 0
    8.56 +
    8.57 +        for position in positions:
    8.58 +            output.append(vint(position - last))
    8.59 +            last = position
    8.60 +
    8.61 +        # Write the document number delta.
    8.62 +        # Write the number of positions.
    8.63 +        # Then write the positions.
    8.64 +
    8.65 +        self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))
    8.66 +
    8.67 +        self.last_docnum = docnum
    8.68 +        return offset
    8.69 +
    8.70 +class PositionOpener(FileOpener):
    8.71 +
    8.72 +    "Reading position information from files."
    8.73 +
    8.74 +    def read_term_positions(self, offset, count):
    8.75 +
    8.76 +        """
    8.77 +        Read all positions from 'offset', seeking to that position in the file
    8.78 +        before reading. The number of documents available for reading is limited
    8.79 +        to 'count'.
    8.80 +        """
    8.81 +
    8.82 +        # Duplicate the file handle.
    8.83 +
    8.84 +        f = self.open("rb")
    8.85 +        return PositionIterator(f, offset, count)
    8.86 +
    8.87 +class PositionIndexWriter(FileWriter):
    8.88 +
    8.89 +    "Writing position index information to files."
    8.90 +
    8.91 +    def reset(self):
    8.92 +        self.last_docnum = 0
    8.93 +        self.last_pos_offset = 0
    8.94 +
    8.95 +    def write_positions(self, docnum, pos_offset, count):
    8.96 +
    8.97 +        """
     8.98 +        Write the given 'docnum', 'pos_offset' and document 'count' to the
    8.99 +        position index file.
   8.100 +        """
   8.101 +
   8.102 +        # Record the offset of this record.
   8.103 +
   8.104 +        offset = self.tell()
   8.105 +        output = []
   8.106 +
   8.107 +        # Write the document number delta.
   8.108 +
   8.109 +        output.append(vint(docnum - self.last_docnum))
   8.110 +        self.last_docnum = docnum
   8.111 +
   8.112 +        # Write the position file offset delta.
   8.113 +
   8.114 +        output.append(vint(pos_offset - self.last_pos_offset))
   8.115 +        self.last_pos_offset = pos_offset
   8.116 +
   8.117 +        # Write the document count.
   8.118 +
   8.119 +        output.append(vint(count))
   8.120 +
   8.121 +        # Actually write the data.
   8.122 +
   8.123 +        self.write("".join(output))
   8.124 +
   8.125 +        return offset
   8.126 +
   8.127 +class PositionIndexOpener(FileOpener):
   8.128 +
   8.129 +    "Reading position index information from files."
   8.130 +
   8.131 +    def read_term_positions(self, offset, doc_frequency):
   8.132 +
   8.133 +        """
   8.134 +        Read all positions from 'offset', seeking to that position in the file
   8.135 +        before reading. The number of documents available for reading is limited
   8.136 +        to 'doc_frequency'.
   8.137 +        """
   8.138 +
   8.139 +        # Duplicate the file handle.
   8.140 +
   8.141 +        f = self.open("rb")
   8.142 +        return PositionIndexIterator(f, offset, doc_frequency)
   8.143 +
   8.144 +# Iterators for position-related files.
   8.145 +
   8.146 +class IteratorBase:
   8.147 +
   8.148 +    def __init__(self, count):
   8.149 +        self.replenish(count)
   8.150 +
   8.151 +    def replenish(self, count):
   8.152 +        self.count = count
   8.153 +        self.read_documents = 0
   8.154 +
   8.155 +    def __len__(self):
   8.156 +        return self.count
   8.157 +
   8.158 +    def sort(self):
   8.159 +        pass # Stored document positions are already sorted.
   8.160 +
   8.161 +    def __iter__(self):
   8.162 +        return self
   8.163 +
   8.164 +class PositionIterator(FileReader, IteratorBase):
   8.165 +
   8.166 +    "Iterating over document positions."
   8.167 +
   8.168 +    def __init__(self, f, offset, count):
   8.169 +        FileReader.__init__(self, f)
   8.170 +        IteratorBase.__init__(self, count)
   8.171 +        self.seek(offset)
   8.172 +
   8.173 +    def reset(self):
   8.174 +        self.last_docnum = 0
   8.175 +
   8.176 +    def read_positions(self):
   8.177 +
   8.178 +        "Read positions, returning a document number and a list of positions."
   8.179 +
   8.180 +        # Read the document number delta and add it to the last number.
   8.181 +
   8.182 +        self.last_docnum += self.read_number()
   8.183 +
   8.184 +        # Read the number of positions.
   8.185 +
   8.186 +        npositions = self.read_number()
   8.187 +
   8.188 +        # Read the position deltas, adding each previous position to get the
   8.189 +        # appropriate collection of absolute positions.
   8.190 +
   8.191 +        i = 0
   8.192 +        last = 0
   8.193 +        positions = []
   8.194 +
   8.195 +        while i < npositions:
   8.196 +            last += self.read_number()
   8.197 +            positions.append(last)
   8.198 +            i += 1
   8.199 +
   8.200 +        return self.last_docnum, positions
   8.201 +
   8.202 +    def next(self):
   8.203 +
   8.204 +        "Read positions for a single document."
   8.205 +
   8.206 +        if self.read_documents < self.count:
   8.207 +            self.read_documents += 1
   8.208 +            return self.read_positions()
   8.209 +        else:
   8.210 +            raise StopIteration
   8.211 +
   8.212 +class PositionIndexIterator(FileReader, IteratorBase):
   8.213 +
    8.214 +    "Iterating over position index entries."
   8.215 +
   8.216 +    def __init__(self, f, offset, count):
   8.217 +        FileReader.__init__(self, f)
   8.218 +        IteratorBase.__init__(self, count)
   8.219 +        self.seek(offset)
   8.220 +        self.section_count = 0
   8.221 +
   8.222 +    def reset(self):
   8.223 +        self.last_docnum = 0
   8.224 +        self.last_pos_offset = 0
   8.225 +
   8.226 +    def read_positions(self):
   8.227 +
   8.228 +        """
    8.229 +        Read a document number, an offset into the position file, and the
    8.230 +        number of documents in the section starting at that offset.
   8.231 +        """
   8.232 +
   8.233 +        # Read the document number delta.
   8.234 +
   8.235 +        self.last_docnum += self.read_number()
   8.236 +
   8.237 +        # Read the offset delta.
   8.238 +
   8.239 +        self.last_pos_offset += self.read_number()
   8.240 +
   8.241 +        # Read the document count.
   8.242 +
   8.243 +        count = self.read_number()
   8.244 +
   8.245 +        return self.last_docnum, self.last_pos_offset, count
   8.246 +
   8.247 +    def next(self):
   8.248 +
    8.249 +        "Read the next index entry, describing one section of documents."
   8.250 +
   8.251 +        self.read_documents += self.section_count
   8.252 +        if self.read_documents < self.count:
   8.253 +            docnum, pos_offset, self.section_count = t = self.read_positions()
   8.254 +            return t
   8.255 +        else:
   8.256 +            raise StopIteration
   8.257 +
   8.258 +class PositionDictionaryWriter:
   8.259 +
   8.260 +    "Writing position dictionaries."
   8.261 +
   8.262 +    def __init__(self, position_writer, position_index_writer, interval):
   8.263 +        self.position_writer = position_writer
   8.264 +        self.position_index_writer = position_index_writer
   8.265 +        self.interval = interval
   8.266 +
   8.267 +    def write_term_positions(self, doc_positions):
   8.268 +
   8.269 +        """
   8.270 +        Write all 'doc_positions' - a collection of tuples of the form (document
   8.271 +        number, position list) - to the file.
   8.272 +
   8.273 +        Add some records to the index, making dictionary entries.
   8.274 +
   8.275 +        Return a tuple containing the offset of the written data, the frequency
   8.276 +        (number of positions), and document frequency (number of documents) for
   8.277 +        the term involved.
   8.278 +        """
   8.279 +
   8.280 +        # Reset the writers.
   8.281 +
   8.282 +        self.position_writer.reset()
   8.283 +        self.position_index_writer.reset()
   8.284 +
   8.285 +        index_offset = None
   8.286 +
   8.287 +        # Write the positions.
   8.288 +
   8.289 +        frequency = 0
   8.290 +        first_docnum = None
   8.291 +        first_offset = None
   8.292 +        count = 0
   8.293 +
   8.294 +        doc_positions.sort()
   8.295 +
   8.296 +        for docnum, positions in doc_positions:
   8.297 +            pos_offset = self.position_writer.write_positions(docnum, positions)
   8.298 +
   8.299 +            # Retain the first record offset for a subsequent index entry.
   8.300 +
   8.301 +            if first_offset is None:
   8.302 +                first_offset = pos_offset
   8.303 +                first_docnum = docnum
   8.304 +
   8.305 +            frequency += len(positions)
   8.306 +            count += 1
   8.307 +
   8.308 +            # Every {interval} entries, write an index entry.
   8.309 +
   8.310 +            if count % self.interval == 0:
   8.311 +                io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
   8.312 +
   8.313 +                # Remember the first index entry offset.
   8.314 +
   8.315 +                if index_offset is None:
   8.316 +                    index_offset = io
   8.317 +
   8.318 +                first_offset = None
   8.319 +                first_docnum = None
   8.320 +
   8.321 +                # Reset the position writer so that position readers accessing
   8.322 +                # a section start with the correct document number.
   8.323 +
   8.324 +                self.position_writer.reset()
   8.325 +
   8.326 +        # Finish writing an index entry for the remaining documents.
   8.327 +
   8.328 +        else:
   8.329 +            if first_offset is not None:
   8.330 +                io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
   8.331 +
   8.332 +                # Remember the first index entry offset.
   8.333 +
   8.334 +                if index_offset is None:
   8.335 +                    index_offset = io
   8.336 +
   8.337 +        return index_offset, frequency, count
   8.338 +
   8.339 +    def close(self):
   8.340 +        self.position_writer.close()
   8.341 +        self.position_index_writer.close()
   8.342 +
   8.343 +class PositionDictionaryReader:
   8.344 +
   8.345 +    "Reading position dictionaries."
   8.346 +
   8.347 +    def __init__(self, position_opener, position_index_opener):
   8.348 +        self.position_opener = position_opener
   8.349 +        self.position_index_opener = position_index_opener
   8.350 +
   8.351 +    def read_term_positions(self, offset, doc_frequency):
   8.352 +
   8.353 +        """
   8.354 +        Return an iterator for dictionary entries starting at 'offset' with the
   8.355 +        given 'doc_frequency'.
   8.356 +        """
   8.357 +
   8.358 +        return PositionDictionaryIterator(self.position_opener,
   8.359 +            self.position_index_opener, offset, doc_frequency)
   8.360 +
   8.361 +    def close(self):
   8.362 +        pass
   8.363 +
   8.364 +class PositionDictionaryIterator:
   8.365 +
   8.366 +    "Iteration over position dictionary entries."
   8.367 +
   8.368 +    def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
   8.369 +        self.position_opener = position_opener
   8.370 +        self.doc_frequency = doc_frequency
   8.371 +        self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
   8.372 +        self.iterator = None
   8.373 +
   8.374 +        # Remember the last values.
   8.375 +
   8.376 +        self.found_docnum, self.found_positions = None, None
   8.377 +
   8.378 +        # Maintain state for the next index entry, if read.
   8.379 +
   8.380 +        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
   8.381 +
   8.382 +        # Initialise the current index entry and current position file iterator.
   8.383 +
   8.384 +        self._next_section()
   8.385 +        self._init_section()
   8.386 +
   8.387 +    # Sequence methods.
   8.388 +
   8.389 +    def __len__(self):
   8.390 +        return self.doc_frequency
   8.391 +
   8.392 +    def sort(self):
   8.393 +        pass
   8.394 +
   8.395 +    # Iterator methods.
   8.396 +
   8.397 +    def __iter__(self):
   8.398 +        return self
   8.399 +
   8.400 +    def next(self):
   8.401 +
   8.402 +        """
   8.403 +        Attempt to get the next document record from the section in the
   8.404 +        positions file.
   8.405 +        """
   8.406 +
   8.407 +        # Return any visited but unrequested record.
   8.408 +
   8.409 +        if self.found_docnum is not None:
   8.410 +            t = self.found_docnum, self.found_positions
   8.411 +            self.found_docnum, self.found_positions = None, None
   8.412 +            return t
   8.413 +
   8.414 +        # Or search for the next record.
   8.415 +
   8.416 +        while 1:
   8.417 +
   8.418 +            # Either return the next record.
   8.419 +
   8.420 +            try:
   8.421 +                return self.iterator.next()
   8.422 +
   8.423 +            # Or, where a section is finished, get the next section and try again.
   8.424 +
   8.425 +            except StopIteration:
   8.426 +
   8.427 +                # Where a section follows, update the index iterator, but keep
   8.428 +                # reading using the same file iterator (since the data should
   8.429 +                # just follow on from the last section).
   8.430 +
   8.431 +                self._next_section()
   8.432 +                self.iterator.replenish(self.section_count)
   8.433 +
   8.434 +                # Reset the state of the iterator to make sure that document
   8.435 +                # numbers are correct.
   8.436 +
   8.437 +                self.iterator.reset()
   8.438 +
   8.439 +    def from_document(self, docnum):
   8.440 +
   8.441 +        """
   8.442 +        Attempt to navigate to a positions entry for the given 'docnum',
   8.443 +        returning the positions for 'docnum', or None otherwise.
   8.444 +        """
   8.445 +
   8.446 +        # Return any unrequested document positions.
   8.447 +
   8.448 +        if docnum == self.found_docnum:
   8.449 +            return self.found_positions
   8.450 +
   8.451 +        # Read ahead in the index until the next entry refers to a document
   8.452 +        # later than the desired document.
   8.453 +
   8.454 +        try:
   8.455 +            if self.next_docnum is None:
   8.456 +                self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
   8.457 +
   8.458 +            # Read until the next entry is after the desired document number,
   8.459 +            # or until the end of the results.
   8.460 +
   8.461 +            while self.next_docnum <= docnum:
   8.462 +                self._next_read_section()
   8.463 +                if self.docnum < docnum:
   8.464 +                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
   8.465 +                else:
   8.466 +                    break
   8.467 +
   8.468 +        except StopIteration:
   8.469 +            pass
   8.470 +
   8.471 +        # Navigate in the position file to the document.
   8.472 +
   8.473 +        self._init_section()
   8.474 +
   8.475 +        try:
   8.476 +            while 1:
   8.477 +                found_docnum, found_positions = self.iterator.next()
   8.478 +
   8.479 +                # Return the desired document positions or None (retaining the
   8.480 +                # positions for the document immediately after).
   8.481 +
   8.482 +                if docnum == found_docnum:
   8.483 +                    return found_positions
   8.484 +                elif docnum < found_docnum:
   8.485 +                    self.found_docnum, self.found_positions = found_docnum, found_positions
   8.486 +                    return None
   8.487 +
   8.488 +        except StopIteration:
   8.489 +            return None
   8.490 +
   8.491 +    # Internal methods.
   8.492 +
   8.493 +    def _next_section(self):
   8.494 +
   8.495 +        "Attempt to get the next section in the index."
   8.496 +
   8.497 +        if self.next_docnum is None:
   8.498 +            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
   8.499 +        else:
   8.500 +            self._next_read_section()
   8.501 +
   8.502 +    def _next_read_section(self):
   8.503 +
   8.504 +        """
   8.505 +        Make the next index entry the current one without reading from the
   8.506 +        index.
   8.507 +        """
   8.508 +
   8.509 +        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
   8.510 +        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
   8.511 +
   8.512 +    def _init_section(self):
   8.513 +
   8.514 +        "Initialise the iterator for the section in the position file."
   8.515 +
   8.516 +        if self.iterator is not None:
   8.517 +            self.iterator.close()
   8.518 +        self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
   8.519 +
   8.520 +    def close(self):
   8.521 +        if self.iterator is not None:
   8.522 +            self.iterator.close()
   8.523 +            self.iterator = None
   8.524 +        if self.index_iterator is not None:
   8.525 +            self.index_iterator.close()
   8.526 +            self.index_iterator = None
   8.527 +
   8.528 +# vim: tabstop=4 expandtab shiftwidth=4

     9.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     9.2 +++ b/iixr/terms.py	Tue Sep 15 00:15:11 2009 +0200
     9.3 @@ -0,0 +1,395 @@
     9.4 +#!/usr/bin/env python
     9.5 +
     9.6 +"""
     9.7 +Specific classes for storing term information.
     9.8 +
     9.9 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
    9.10 +
    9.11 +This program is free software; you can redistribute it and/or modify it under
    9.12 +the terms of the GNU General Public License as published by the Free Software
    9.13 +Foundation; either version 3 of the License, or (at your option) any later
    9.14 +version.
    9.15 +
    9.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
    9.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    9.18 +PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    9.19 +
    9.20 +You should have received a copy of the GNU General Public License along
    9.21 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    9.22 +"""
    9.23 +
    9.24 +from iixr.files import *
    9.25 +from os.path import commonprefix # to find common string prefixes
    9.26 +from bisect import bisect_right  # to find terms in the dictionary index
    9.27 +
    9.28 +class TermWriter(FileWriter):
    9.29 +
    9.30 +    "Writing term information to files."
    9.31 +
    9.32 +    def reset(self):
    9.33 +        self.last_term = ""
    9.34 +        self.last_offset = 0
    9.35 +
    9.36 +    def write_term(self, term, offset, frequency, doc_frequency):
    9.37 +
    9.38 +        """
    9.39 +        Write the given 'term', its position file 'offset', its 'frequency' and
    9.40 +        its 'doc_frequency' (number of documents in which it appears) to the
    9.41 +        term information file. Return the offset after the term information was
    9.42 +        written to the file.
    9.43 +        """
    9.44 +
    9.45 +        # Write the prefix length and term suffix.
    9.46 +
    9.47 +        common = len(commonprefix([self.last_term, term]))
    9.48 +        suffix = term[common:]
    9.49 +
    9.50 +        self.write_number(common)
    9.51 +        self.write_string(suffix)
    9.52 +
    9.53 +        # Write the offset delta.
    9.54 +
    9.55 +        self.write_number(offset - self.last_offset)
    9.56 +
    9.57 +        # Write the frequency.
    9.58 +
    9.59 +        self.write_number(frequency)
    9.60 +
    9.61 +        # Write the document frequency.
    9.62 +
    9.63 +        self.write_number(doc_frequency)
    9.64 +
    9.65 +        self.last_term = term
    9.66 +        self.last_offset = offset
    9.67 +
    9.68 +        return self.tell()
    9.69 +
    9.70 +class TermReader(FileReader):
    9.71 +
    9.72 +    "Reading term information from files."
    9.73 +
    9.74 +    def reset(self):
    9.75 +        self.last_term = ""
    9.76 +        self.last_offset = 0
    9.77 +
    9.78 +    def read_term(self):
    9.79 +
    9.80 +        """
    9.81 +        Read a term, its position file offset, its frequency and its document
    9.82 +        frequency from the term information file.
    9.83 +        """
    9.84 +
    9.85 +        # Read the prefix length and term suffix.
    9.86 +
    9.87 +        common = self.read_number()
    9.88 +        suffix = self.read_string()
    9.89 +
    9.90 +        self.last_term = self.last_term[:common] + suffix
    9.91 +
    9.92 +        # Read the offset delta.
    9.93 +
    9.94 +        self.last_offset += self.read_number()
    9.95 +
    9.96 +        # Read the frequency.
    9.97 +
    9.98 +        frequency = self.read_number()
    9.99 +
   9.100 +        # Read the document frequency.
   9.101 +
   9.102 +        doc_frequency = self.read_number()
   9.103 +
   9.104 +        return self.last_term, self.last_offset, frequency, doc_frequency
   9.105 +
   9.106 +    def go_to_term(self, term, offset, info_offset):
   9.107 +
   9.108 +        """
   9.109 +        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
   9.110 +        permits the scanning for later terms from the specified term.
   9.111 +        """
   9.112 +
   9.113 +        self.seek(info_offset)
   9.114 +        self.last_term = term
   9.115 +        self.last_offset = offset
   9.116 +
   9.117 +class TermIndexWriter(TermWriter):
   9.118 +
   9.119 +    "Writing term dictionary index details to files."
   9.120 +
   9.121 +    def reset(self):
   9.122 +        TermWriter.reset(self)
   9.123 +        self.last_info_offset = 0
   9.124 +
   9.125 +    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
   9.126 +
   9.127 +        """
   9.128 +        Write the given 'term', its position file 'offset', its 'frequency' and
   9.129 +        its 'doc_frequency' to the term dictionary index file, along with the
   9.130 +        'info_offset' in the term information file.
   9.131 +        """
   9.132 +
   9.133 +        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
   9.134 +
   9.135 +        # Write the information file offset delta.
   9.136 +
   9.137 +        self.write_number(info_offset - self.last_info_offset)
   9.138 +        self.last_info_offset = info_offset
   9.139 +
   9.140 +class TermIndexReader(TermReader):
   9.141 +
   9.142 +    "Reading term dictionary index details from files."
   9.143 +
   9.144 +    def reset(self):
   9.145 +        TermReader.reset(self)
   9.146 +        self.last_info_offset = 0
   9.147 +
   9.148 +    def read_term(self):
   9.149 +
   9.150 +        """
   9.151 +        Read a term, its position file offset, its frequency, its document
   9.152 +        frequency and a term information file offset from the term dictionary
   9.153 +        index file.
   9.154 +        """
   9.155 +
   9.156 +        term, offset, frequency, doc_frequency = TermReader.read_term(self)
   9.157 +
   9.158 +        # Read the information file offset delta.
   9.159 +
   9.160 +        self.last_info_offset += self.read_number()
   9.161 +
   9.162 +        return term, offset, frequency, doc_frequency, self.last_info_offset
   9.163 +
   9.164 +class TermDictionaryWriter:
   9.165 +
   9.166 +    "Writing term dictionaries."
   9.167 +
   9.168 +    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
   9.169 +        self.info_writer = info_writer
   9.170 +        self.index_writer = index_writer
   9.171 +        self.position_dict_writer = position_dict_writer
   9.172 +        self.interval = interval
   9.173 +        self.entry = 0
   9.174 +
   9.175 +    def _write_term(self, term, offset, frequency, doc_frequency):
   9.176 +
   9.177 +        """
   9.178 +        Write the given 'term', its position file 'offset', its 'frequency' and
   9.179 +        its 'doc_frequency' (number of documents in which it appears) to the
   9.180 +        term information file. An entry is also written to the term dictionary
   9.181 +        index file for every 'interval' terms, starting with the first.
   9.182 +        """
   9.183 +
   9.184 +        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
   9.185 +
   9.186 +        if self.entry % self.interval == 0:
   9.187 +            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
   9.188 +
   9.189 +        self.entry += 1
   9.190 +
   9.191 +    def write_term_positions(self, term, doc_positions):
   9.192 +
   9.193 +        """
   9.194 +        Write the given 'term' and the 'doc_positions' recording the documents
   9.195 +        and positions at which the term is found.
   9.196 +        """
   9.197 +
   9.198 +        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
   9.199 +        self._write_term(term, offset, frequency, doc_frequency)
   9.200 +
   9.201 +    def close(self):
   9.202 +        self.info_writer.close()
   9.203 +        self.index_writer.close()
   9.204 +        self.position_dict_writer.close()
   9.205 +
   9.206 +class TermDictionaryReader:
   9.207 +
   9.208 +    "Reading term dictionaries."
   9.209 +
   9.210 +    def __init__(self, info_reader, index_reader, position_dict_reader):
   9.211 +        self.info_reader = info_reader
   9.212 +        self.index_reader = index_reader
   9.213 +        self.position_dict_reader = position_dict_reader
   9.214 +
   9.215 +        self.terms = []
   9.216 +        try:
   9.217 +            while 1:
   9.218 +                self.terms.append(self.index_reader.read_term())
   9.219 +        except EOFError:
   9.220 +            pass
   9.221 +
   9.222 +        # Large numbers for ordering purposes.
   9.223 +
   9.224 +        if self.terms:
   9.225 +            self.max_offset = self.terms[-1][1] + 1
   9.226 +        else:
   9.227 +            self.max_offset = None
   9.228 +
   9.229 +    def _find_closest_entry(self, term):
   9.230 +
   9.231 +        """
   9.232 +        Find the entry in the term dictionary index for 'term' or, if absent,
   9.233 +        the closest entry preceding 'term' (or the first entry if none does).
   9.234 +
   9.235 +        Return the closest index entry consisting of a term, the position file
   9.236 +        offset, the term frequency, the document frequency, and the term details
   9.237 +        file offset.
   9.238 +        """
   9.239 +
   9.240 +        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
   9.241 +
   9.242 +        # Get the entry position providing the term or one preceding it.
   9.243 +        # If no entry precedes the requested term, return the very first entry
   9.244 +        # as the closest.
   9.245 +
   9.246 +        if i == -1:
   9.247 +            return self.terms[0]
   9.248 +        else:
   9.249 +            return self.terms[i]
   9.250 +
   9.251 +    def _find_closest_term(self, term):
   9.252 +
   9.253 +        """
   9.254 +        Find the offsets and frequencies of 'term' from the term dictionary or
   9.255 +        the closest term starting with the value of 'term'.
   9.256 +
   9.257 +        Return the closest term (or the term itself), the position file offset,
   9.258 +        the term frequency, the document frequency, and the term details file
   9.259 +        offset (or None if the reader is already positioned).
   9.260 +        """
   9.261 +
   9.262 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
   9.263 +
   9.264 +        # Where the term is found immediately, return the offset and
   9.265 +        # frequencies. If the term does not appear, return the details of the
   9.266 +        # closest entry.
   9.267 +
   9.268 +        if term <= found_term:
   9.269 +            return found_term, offset, frequency, doc_frequency, info_offset
   9.270 +
   9.271 +        # Otherwise, seek past the index term's entry in the information file
   9.272 +        # and scan for the desired term.
   9.273 +
   9.274 +        else:
   9.275 +            self.info_reader.go_to_term(found_term, offset, info_offset)
   9.276 +            try:
   9.277 +                while term > found_term:
   9.278 +                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   9.279 +            except EOFError:
   9.280 +                pass
   9.281 +
   9.282 +            return found_term, offset, frequency, doc_frequency, None
   9.283 +
   9.284 +    def _find_term(self, term):
   9.285 +
   9.286 +        """
   9.287 +        Return the position file offset, frequency and document frequency of
   9.288 +        'term' from the term dictionary, or None if the term is not found.
   9.289 +        """
   9.290 +
   9.291 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
   9.292 +
   9.293 +        # If the term is found, return the offset and frequencies.
   9.294 +
   9.295 +        if term == found_term:
   9.296 +            return offset, frequency, doc_frequency
   9.297 +        else:
   9.298 +            return None
   9.299 +
   9.300 +    def _get_positions(self, offset, doc_frequency):
   9.301 +        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
   9.302 +
   9.303 +    # Iterator convenience methods.
   9.304 +
   9.305 +    def __iter__(self):
   9.306 +        self.rewind()
   9.307 +        return self
   9.308 +
   9.309 +    def next(self):
   9.310 +        try:
   9.311 +            return self.read_term()
   9.312 +        except EOFError:
   9.313 +            raise StopIteration
   9.314 +
   9.315 +    # Sequential access methods.
   9.316 +
   9.317 +    def rewind(self):
   9.318 +        self.info_reader.rewind()
   9.319 +
   9.320 +    def read_term(self):
   9.321 +
   9.322 +        """
   9.323 +        Return the next term, its frequency, its document frequency, and the
   9.324 +        documents and positions at which the term is found.
   9.325 +        """
   9.326 +
   9.327 +        term, offset, frequency, doc_frequency = self.info_reader.read_term()
   9.328 +        positions = self._get_positions(offset, doc_frequency)
   9.329 +        return term, frequency, doc_frequency, positions
   9.330 +
   9.331 +    # Query methods.
   9.332 +
   9.333 +    def find_terms(self, term):
   9.334 +
   9.335 +        "Return all terms whose values start with the value of 'term'."
   9.336 +
   9.337 +        terms = []
   9.338 +
   9.339 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
   9.340 +
   9.341 +        # Position the reader, if necessary.
   9.342 +
   9.343 +        if info_offset is not None:
   9.344 +            self.info_reader.go_to_term(found_term, offset, info_offset)
   9.345 +
   9.346 +        # Read and record terms.
   9.347 +
   9.348 +        try:
   9.349 +            # Add the found term if it starts with the specified term.
   9.350 +
   9.351 +            while found_term.startswith(term):
   9.352 +                terms.append(found_term)
   9.353 +                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   9.354 +
   9.355 +        except EOFError:
   9.356 +            pass
   9.357 +
   9.358 +        return terms
   9.359 +
   9.360 +    def find_positions(self, term):
   9.361 +
   9.362 +        "Return the documents and positions at which the given 'term' is found."
   9.363 +
   9.364 +        t = self._find_term(term)
   9.365 +        if t is None:
   9.366 +            return None
   9.367 +        else:
   9.368 +            offset, frequency, doc_frequency = t
   9.369 +            return self._get_positions(offset, doc_frequency)
   9.370 +
   9.371 +    def get_frequency(self, term):
   9.372 +
   9.373 +        "Return the frequency of the given 'term'."
   9.374 +
   9.375 +        t = self._find_term(term)
   9.376 +        if t is None:
   9.377 +            return None
   9.378 +        else:
   9.379 +            offset, frequency, doc_frequency = t
   9.380 +            return frequency
   9.381 +
   9.382 +    def get_document_frequency(self, term):
   9.383 +
   9.384 +        "Return the document frequency of the given 'term'."
   9.385 +
   9.386 +        t = self._find_term(term)
   9.387 +        if t is None:
   9.388 +            return None
   9.389 +        else:
   9.390 +            offset, frequency, doc_frequency = t
   9.391 +            return doc_frequency
   9.392 +
   9.393 +    def close(self):
   9.394 +        self.info_reader.close()
   9.395 +        self.index_reader.close()
   9.396 +        self.position_dict_reader.close()
   9.397 +
   9.398 +# vim: tabstop=4 expandtab shiftwidth=4

    10.1 --- a/setup.py	Mon Sep 14 21:23:32 2009 +0200
    10.2 +++ b/setup.py	Tue Sep 15 00:15:11 2009 +0200
    10.3 @@ -11,6 +11,7 @@
    10.4      author_email = "paul@boddie.org.uk",
    10.5      url          = "http://www.boddie.org.uk/python/iixr.html",
    10.6      version      = "0.1",
    10.7 -    py_modules   = ["iixr", "itermerge"],
    10.8 +    py_modules   = ["itermerge"],
    10.9 +    packages     = ["iixr"],
   10.10      ext_modules  = [vint],
   10.11      )

    11.1 --- a/test.py	Mon Sep 14 21:23:32 2009 +0200
    11.2 +++ b/test.py	Tue Sep 15 00:15:11 2009 +0200
    11.3 @@ -1,6 +1,10 @@
    11.4  #!/usr/bin/env python
    11.5  
    11.6 -import iixr
    11.7 +from iixr.files import *
    11.8 +from iixr.fields import *
    11.9 +from iixr.terms import *
   11.10 +from iixr.positions import *
   11.11 +from iixr.index import *
   11.12  import os
   11.13  
   11.14  # Remove old test files.
   11.15 @@ -23,13 +27,13 @@
   11.16  numbers = [12345678, 0, 1, 127, 128, 255, 256]
   11.17  
   11.18  f = open("test", "wb")
   11.19 -w = iixr.FileWriter(f)
   11.20 +w = FileWriter(f)
   11.21  for number in numbers:
   11.22      w.write_number(number)
   11.23  w.close()
   11.24  
   11.25  f = open("test", "rb")
   11.26 -r = iixr.FileReader(f)
   11.27 +r = FileReader(f)
   11.28  for number in numbers:
   11.29      n = r.read_number()
   11.30      print number == n, number, n
   11.31 @@ -52,7 +56,7 @@
   11.32      ]
   11.33  
   11.34  f = open("testP", "wb")
   11.35 -w = iixr.PositionWriter(f)
   11.36 +w = PositionWriter(f)
   11.37  for doc_positions in all_doc_positions:
   11.38      for docnum, positions in doc_positions:
   11.39          w.write_positions(docnum, positions)
   11.40 @@ -60,7 +64,7 @@
   11.41  w.close()
   11.42  
   11.43  f = open("testP", "rb")
   11.44 -r = iixr.PositionIterator(f, 0, None)
   11.45 +r = PositionIterator(f, 0, None)
   11.46  for doc_positions in all_doc_positions:
   11.47      for docnum, positions in doc_positions:
   11.48          d, p = r.read_positions()
   11.49 @@ -84,7 +88,7 @@
   11.50  
   11.51  offsets = []
   11.52  f = open("testPI", "wb")
   11.53 -w = iixr.PositionIndexWriter(f)
   11.54 +w = PositionIndexWriter(f)
   11.55  for term_positions in indexed_positions:
   11.56      offset = None
   11.57      doc_frequency = 0
   11.58 @@ -97,7 +101,7 @@
   11.59      offsets.append((offset, doc_frequency))
   11.60  w.close()
   11.61  
   11.62 -r = iixr.PositionIndexOpener("testPI")
   11.63 +r = PositionIndexOpener("testPI")
   11.64  offsets.reverse()
   11.65  indexed_positions.reverse()
   11.66  for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
   11.67 @@ -111,19 +115,19 @@
   11.68  # Test position dictionaries.
   11.69  
   11.70  f = open("testP", "wb")
   11.71 -w = iixr.PositionWriter(f)
   11.72 +w = PositionWriter(f)
   11.73  f2 = open("testPI", "wb")
   11.74 -w2 = iixr.PositionIndexWriter(f2)
   11.75 -wd = iixr.PositionDictionaryWriter(w, w2, 2)
   11.76 +w2 = PositionIndexWriter(f2)
   11.77 +wd = PositionDictionaryWriter(w, w2, 2)
   11.78  offsets = []
   11.79  for doc_positions in all_doc_positions:
   11.80      offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
   11.81      offsets.append((offset, doc_frequency))
   11.82  wd.close()
   11.83  
   11.84 -r = iixr.PositionOpener("testP")
   11.85 -r2 = iixr.PositionIndexOpener("testPI")
   11.86 -rd = iixr.PositionDictionaryReader(r, r2)
   11.87 +r = PositionOpener("testP")
   11.88 +r2 = PositionIndexOpener("testPI")
   11.89 +rd = PositionDictionaryReader(r, r2)
   11.90  offsets.reverse()
   11.91  all_doc_positions.reverse()
   11.92  for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
   11.93 @@ -144,13 +148,13 @@
   11.94      ]
   11.95  
   11.96  f = open("testF", "wb")
   11.97 -w = iixr.FieldWriter(f)
   11.98 +w = FieldWriter(f)
   11.99  for docnum, fields in doc_fields:
  11.100      w.write_fields(docnum, list(enumerate(fields)))
  11.101  w.close()
  11.102  
  11.103  f = open("testF", "rb")
  11.104 -r = iixr.FieldReader(f)
  11.105 +r = FieldReader(f)
  11.106  for docnum, fields in doc_fields:
  11.107      dn, df = r.read_fields()
  11.108      print docnum == dn, docnum, dn
  11.109 @@ -166,13 +170,13 @@
  11.110      ]
  11.111  
  11.112  f = open("testFI", "wb")
  11.113 -w = iixr.FieldIndexWriter(f)
  11.114 +w = FieldIndexWriter(f)
  11.115  for docnum, offset in indexed_docs:
  11.116      w.write_document(docnum, offset)
  11.117  w.close()
  11.118  
  11.119  f = open("testFI", "rb")
  11.120 -r = iixr.FieldIndexReader(f)
  11.121 +r = FieldIndexReader(f)
  11.122  for docnum, offset in indexed_docs:
  11.123      dn, o = r.read_document()
  11.124      print docnum == dn, docnum, dn
  11.125 @@ -182,19 +186,19 @@
  11.126  # Test field dictionaries.
  11.127  
  11.128  f = open("testF", "wb")
  11.129 -w = iixr.FieldWriter(f)
  11.130 +w = FieldWriter(f)
  11.131  f2 = open("testFI", "wb")
  11.132 -w2 = iixr.FieldIndexWriter(f2)
  11.133 -wd = iixr.FieldDictionaryWriter(w, w2, 3)
  11.134 +w2 = FieldIndexWriter(f2)
  11.135 +wd = FieldDictionaryWriter(w, w2, 3)
  11.136  for docnum, fields in doc_fields:
  11.137      wd.write_fields(docnum, list(enumerate(fields)))
  11.138  wd.close()
  11.139  
  11.140  f = open("testF", "rb")
  11.141 -r = iixr.FieldReader(f)
  11.142 +r = FieldReader(f)
  11.143  f2 = open("testFI", "rb")
  11.144 -r2 = iixr.FieldIndexReader(f2)
  11.145 -rd = iixr.FieldDictionaryReader(r, r2)
  11.146 +r2 = FieldIndexReader(f2)
  11.147 +rd = FieldDictionaryReader(r, r2)
  11.148  doc_fields_reversed = doc_fields[:]
  11.149  doc_fields_reversed.reverse()
  11.150  for docnum, fields in doc_fields_reversed:
  11.151 @@ -226,13 +230,13 @@
  11.152      ]
  11.153  
  11.154  f = open("test", "wb")
  11.155 -w = iixr.TermWriter(f)
  11.156 +w = TermWriter(f)
  11.157  for term, offset, frequency, doc_frequency in terms:
  11.158      w.write_term(term, offset, frequency, doc_frequency)
  11.159  w.close()
  11.160  
  11.161  f = open("test", "rb")
  11.162 -r = iixr.TermReader(f)
  11.163 +r = TermReader(f)
  11.164  for term, offset, frequency, doc_frequency in terms:
  11.165      t, o, fr, df = r.read_term()
  11.166      print term == t, term, t
  11.167 @@ -254,13 +258,13 @@
  11.168      ]
  11.169  
  11.170  f = open("test", "wb")
  11.171 -w = iixr.TermIndexWriter(f)
  11.172 +w = TermIndexWriter(f)
  11.173  for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
  11.174      w.write_term(term, offset, frequency, doc_frequency, info_offset)
  11.175  w.close()
  11.176  
  11.177  f = open("test", "rb")
  11.178 -r = iixr.TermIndexReader(f)
  11.179 +r = TermIndexReader(f)
  11.180  for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
  11.181      t, o, fr, df, i = r.read_term()
  11.182      print term == t, term, t
  11.183 @@ -273,27 +277,27 @@
  11.184  # Test dictionaries with only term data.
  11.185  
  11.186  f = open("test", "wb")
  11.187 -w = iixr.TermWriter(f)
  11.188 +w = TermWriter(f)
  11.189  f2 = open("testI", "wb")
  11.190 -w2 = iixr.TermIndexWriter(f2)
  11.191 +w2 = TermIndexWriter(f2)
  11.192  f3 = open("testP", "wb")
  11.193 -w3 = iixr.PositionWriter(f3)
  11.194 +w3 = PositionWriter(f3)
  11.195  f4 = open("testPI", "wb")
  11.196 -w4 = iixr.PositionIndexWriter(f4)
  11.197 -wp = iixr.PositionDictionaryWriter(w3, w4, 2)
  11.198 -wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
  11.199 +w4 = PositionIndexWriter(f4)
  11.200 +wp = PositionDictionaryWriter(w3, w4, 2)
  11.201 +wd = TermDictionaryWriter(w, w2, wp, 3)
  11.202  for term, offset, frequency, doc_frequency in terms:
  11.203      wd._write_term(term, offset, frequency, doc_frequency)
  11.204  wd.close()
  11.205  
  11.206  f = open("test", "rb")
  11.207 -r = iixr.TermReader(f)
  11.208 +r = TermReader(f)
  11.209  f2 = open("testI", "rb")
  11.210 -r2 = iixr.TermIndexReader(f2)
  11.211 -r3 = iixr.PositionOpener("testP")
  11.212 -r4 = iixr.PositionIndexOpener("testPI")
  11.213 -rp = iixr.PositionDictionaryReader(r3, r4)
  11.214 -rd = iixr.TermDictionaryReader(r, r2, rp)
  11.215 +r2 = TermIndexReader(f2)
  11.216 +r3 = PositionOpener("testP")
  11.217 +r4 = PositionIndexOpener("testPI")
  11.218 +rp = PositionDictionaryReader(r3, r4)
  11.219 +rd = TermDictionaryReader(r, r2, rp)
  11.220  terms_reversed = terms[:]
  11.221  terms_reversed.reverse()
  11.222  for term, offset, frequency, doc_frequency in terms_reversed:
  11.223 @@ -335,27 +339,27 @@
  11.224      ]
  11.225  
  11.226  f = open("test", "wb")
  11.227 -w = iixr.TermWriter(f)
  11.228 +w = TermWriter(f)
  11.229  f2 = open("testI", "wb")
  11.230 -w2 = iixr.TermIndexWriter(f2)
  11.231 +w2 = TermIndexWriter(f2)
  11.232  f3 = open("testP", "wb")
  11.233 -w3 = iixr.PositionWriter(f3)
  11.234 +w3 = PositionWriter(f3)
  11.235  f4 = open("testPI", "wb")
  11.236 -w4 = iixr.PositionIndexWriter(f4)
  11.237 -wp = iixr.PositionDictionaryWriter(w3, w4, 2)
  11.238 -wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
  11.239 +w4 = PositionIndexWriter(f4)
  11.240 +wp = PositionDictionaryWriter(w3, w4, 2)
  11.241 +wd = TermDictionaryWriter(w, w2, wp, 3)
  11.242  for term, doc_positions in terms_with_positions:
  11.243      wd.write_term_positions(term, doc_positions)
  11.244  wd.close()
  11.245  
  11.246  f = open("test", "rb")
  11.247 -r = iixr.TermReader(f)
  11.248 +r = TermReader(f)
  11.249  f2 = open("testI", "rb")
  11.250 -r2 = iixr.TermIndexReader(f2)
  11.251 -r3 = iixr.PositionOpener("testP")
  11.252 -r4 = iixr.PositionIndexOpener("testPI")
  11.253 -rp = iixr.PositionDictionaryReader(r3, r4)
  11.254 -rd = iixr.TermDictionaryReader(r, r2, rp)
  11.255 +r2 = TermIndexReader(f2)
  11.256 +r3 = PositionOpener("testP")
  11.257 +r4 = PositionIndexOpener("testPI")
  11.258 +rp = PositionDictionaryReader(r3, r4)
  11.259 +rd = TermDictionaryReader(r, r2, rp)
  11.260  terms_reversed = terms_with_positions[:]
  11.261  terms_reversed.reverse()
  11.262  for term, doc_positions in terms_reversed:
  11.263 @@ -407,10 +411,10 @@
  11.264      ("shells", 37, None)
  11.265      ]
  11.266  
  11.267 -index = iixr.Index("test_index")
  11.268 +index = Index("test_index")
  11.269  wi = index.get_writer(3, 2, 6)
  11.270  for docnum, text in docs:
  11.271 -    doc = iixr.Document(docnum)
  11.272 +    doc = Document(docnum)
  11.273      for position, term in enumerate(text.split()):
  11.274          doc.add_position(term, position)
  11.275      doc.add_field(123, text)
2009-09-15	Paul Boddie	raw files shortlog changelog graph	Made iixr a package with several submodules.
			iixr/__init__.py (file) iixr/data.py (file) iixr/fields.py (file) iixr/files.py (file) iixr/filesystem.py (file) iixr/index.py (file) iixr/merging.py (file) iixr/positions.py (file) iixr/terms.py (file) setup.py (file) test.py (file)