# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1252966511 -7200
# Node ID fad9698e2c46a12bb0e924adb588fb75874b0bb6
# Parent  a0f37b0ef3502b8fc3b5715ce1f11f4242036516
Made iixr a package with several submodules.

diff -r a0f37b0ef350 -r fad9698e2c46 iixr/__init__.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/__init__.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,23 @@
+#!/usr/bin/env python
+
+"""
+A simple (and sane) text indexing library.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from iixr.index import *
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/data.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/data.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,64 @@
+#!/usr/bin/env python
+
+"""
+Variable-length integer functions.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+try:
+    from vint import vint as _vint
+
+    def vint(number):
+
+        "Write 'number' as a variable-length integer."
+
+        if number >= 0:
+            return _vint(number)
+        else:
+            raise ValueError, "Number %r is negative." % number
+
+except ImportError:
+
+    def vint(number):
+
+        "Write 'number' as a variable-length integer."
+
+        if number >= 0:
+
+            # Special case: one byte containing a 7-bit number.
+
+            if number < 128:
+                return chr(number)
+
+            # Write the number from least to most significant digits.
+
+            bytes = []
+
+            while number != 0:
+                lsd = number & 127
+                number = number >> 7
+                if number != 0:
+                    lsd |= 128
+                bytes.append(chr(lsd))
+
+            return "".join(bytes)
+
+        # Negative numbers are not supported.
+
+        else:
+            raise ValueError, "Number %r is negative." % number
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/fields.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/fields.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,256 @@
+#!/usr/bin/env python
+
+"""
+Specific classes for storing document information.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from iixr.files import *
+from bisect import bisect_right  # to find terms in the dictionary index
+
+class FieldWriter(FileWriter):
+
+    "Writing field data to files."
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def write_fields(self, docnum, fields):
+
+        """
+        Write for the given 'docnum', a list of 'fields' (integer, string pairs
+        representing field identifiers and values respectively).
+        Return the offset at which the fields are stored.
+        """
+
+        offset = self.tell()
+
+        # Write the document number delta.
+
+        self.write_number(docnum - self.last_docnum)
+
+        # Write the number of fields.
+
+        self.write_number(len(fields))
+
+        # Write the fields themselves.
+
+        for i, field in fields:
+            self.write_number(i)
+            self.write_string(field, 1) # compress
+
+        self.last_docnum = docnum
+        return offset
+
+class FieldReader(FileReader):
+
+    "Reading field data from files."
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def read_fields(self):
+
+        """
+        Read fields from the file, returning a tuple containing the document
+        number and a list of field (identifier, value) pairs.
+        """
+
+        # Read the document number delta, accumulating the document number.
+
+        self.last_docnum += self.read_number()
+
+        # Read the number of fields.
+
+        nfields = self.read_number()
+
+        # Collect the fields, each an identifier and a compressed string.
+
+        fields = []
+        i = 0
+
+        while i < nfields:
+            identifier = self.read_number()
+            value = self.read_string(1) # decompress
+            fields.append((identifier, value))
+            i += 1
+
+        return self.last_docnum, fields
+
+    def read_document_fields(self, docnum, offset):
+
+        """
+        Read fields for 'docnum' at the given 'offset'. This permits the
+        retrieval of details for the specified document, as well as scanning for
+        later documents.
+        """
+
+        self.seek(offset)
+        bad_docnum, fields = self.read_fields()
+        self.last_docnum = docnum
+        return docnum, fields
+
+class FieldIndexWriter(FileWriter):
+
+    "Writing field index details to files."
+
+    def reset(self):
+        self.last_docnum = 0
+        self.last_offset = 0
+
+    def write_document(self, docnum, offset):
+
+        """
+        Write for the given 'docnum', the 'offset' at which the fields for the
+        document are stored in the fields file.
+        """
+
+        # Write the document number and offset deltas.
+
+        self.write_number(docnum - self.last_docnum)
+        self.write_number(offset - self.last_offset)
+
+        self.last_docnum = docnum
+        self.last_offset = offset
+
+class FieldIndexReader(FileReader):
+
+    "Reading field index details from files."
+
+    def reset(self):
+        self.last_docnum = 0
+        self.last_offset = 0
+
+    def read_document(self):
+
+        "Read a document number and field file offset."
+
+        # Read the document number and offset deltas, accumulating both.
+
+        self.last_docnum += self.read_number()
+        self.last_offset += self.read_number()
+
+        return self.last_docnum, self.last_offset
+
+class FieldDictionaryWriter:
+
+    "Writing field dictionary details."
+
+    def __init__(self, field_writer, field_index_writer, interval):
+        self.field_writer = field_writer
+        self.field_index_writer = field_index_writer
+        self.interval = interval
+        self.entry = 0
+
+    def write_fields(self, docnum, fields):
+
+        "Write details of the document with the given 'docnum' and 'fields'."
+
+        offset = self.field_writer.write_fields(docnum, fields)
+
+        if self.entry % self.interval == 0:
+            self.field_index_writer.write_document(docnum, offset)
+
+        self.entry += 1
+
+    def close(self):
+        self.field_writer.close()
+        self.field_index_writer.close()
+
+class FieldDictionaryReader:
+
+    "Reading field dictionary details."
+
+    def __init__(self, field_reader, field_index_reader):
+        self.field_reader = field_reader
+        self.field_index_reader = field_index_reader
+
+        self.docs = []
+        try:
+            while 1:
+                self.docs.append(self.field_index_reader.read_document())
+        except EOFError:
+            pass
+
+        # Large numbers for ordering purposes.
+
+        if self.docs:
+            self.max_offset = self.docs[-1][1]
+        else:
+            self.max_offset = None
+
+    # Iterator convenience methods.
+
+    def __iter__(self):
+        self.rewind()
+        return self
+
+    def next(self):
+        try:
+            return self.read_fields()
+        except EOFError:
+            raise StopIteration
+
+    # Sequential access methods.
+
+    def rewind(self):
+        self.field_reader.rewind()
+
+    def read_fields(self):
+
+        "Return the next document number and fields."
+
+        return self.field_reader.read_fields()
+
+    # Random access methods.
+
+    def get_fields(self, docnum):
+
+        "Return the fields of the document with the given 'docnum', or None."
+
+        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
+
+        # The index entry is for the document itself or for one preceding it.
+
+        if i == -1:
+            return None
+
+        found_docnum, offset = self.docs[i]
+
+        # Read from the fields file.
+
+        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
+
+        # Scan forward for the document, if necessary, stopping at end of file.
+
+        try:
+            while docnum > found_docnum:
+                found_docnum, fields = self.field_reader.read_fields()
+        except EOFError:
+            pass
+
+        # If the document is found, return the fields.
+
+        if docnum == found_docnum:
+            return fields
+        else:
+            return None
+
+    def close(self):
+        self.field_reader.close()
+        self.field_index_reader.close()
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/files.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/files.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,264 @@
+#!/usr/bin/env python
+
+"""
+Generic file access.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from iixr.data import vint
+import bz2, zlib
+
+# Constants.
+
+WRITE_CACHE_SIZE  = 100000
+READ_CACHE_SIZE   = 10000
+READ_CACHE_RESIZE = 5000
+
+compressors = [("b", bz2.compress), ("z", zlib.compress)]
+decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
+
+class File:
+
+    "A basic file abstraction."
+
+    def __init__(self, f):
+        self.f = f
+        self.reset()
+
+    def reset(self):
+
+        "To be used to reset the state of the reader or writer between records."
+
+        pass
+
+    def rewind(self):
+        self.seek(0)
+        self.reset()
+
+    def seek(self, offset):
+
+        "To be defined by readers."
+
+        pass
+
+    def flush(self):
+
+        "To be defined by writers."
+
+        pass
+
+    def close(self):
+        if self.f is not None:
+            self.flush()
+            self.f.close()
+            self.f = None
+
+class FileWriter(File):
+
+    "Writing basic data types to files."
+
+    def __init__(self, f):
+        File.__init__(self, f)
+        self.cache = []
+        self.cache_length = 0
+
+    def write_number(self, number):
+
+        "Write 'number' to the file using a variable length encoding."
+
+        self.write(vint(number))
+
+    def write_string(self, s, compress=0):
+
+        """
+        Write 's' to the file, recording its length and compressing the string
+        if 'compress' is set to a true value.
+        """
+
+        # Convert Unicode objects to strings.
+
+        if isinstance(s, unicode):
+            s = s.encode("utf-8")
+
+        # Compress the string if requested.
+
+        if compress:
+            for flag, fn in compressors:
+                cs = fn(s)
+
+                # Take the first string shorter than the original.
+
+                if len(cs) < len(s):
+                    s = cs
+                    break
+            else:
+                flag = "-"
+
+        else:
+            flag = ""
+
+        # Write the length of the data before the data itself.
+
+        length = len(s)
+        self.write(flag + vint(length) + s)
+
+    # Cache-affected methods.
+
+    def write(self, s):
+        self.cache.append(s)
+        self.cache_length += len(s)
+        if self.cache_length >= WRITE_CACHE_SIZE:
+            self.flush()
+
+    def tell(self):
+        return self.f.tell() + self.cache_length
+
+    def flush(self):
+        self.f.write("".join(self.cache))
+        self.cache = []
+        self.cache_length = 0
+
+class FileReader(File):
+
+    "Reading basic data types from files."
+
+    def __init__(self, f):
+        File.__init__(self, f)
+        self.reset_cache()
+
+    def reset_cache(self):
+        self.cache = ""
+        self.cache_length = 0
+        self.cache_start = 0
+
+    def read_number(self):
+
+        "Read a number from the file."
+
+        # Read each byte, adding it to the number.
+
+        shift = 0
+        number = 0
+        read = self.read
+
+        try:
+            csd = ord(read(1))
+            while csd & 128:
+                number += ((csd & 127) << shift)
+                shift += 7
+                csd = ord(read(1))
+            else:
+                number += (csd << shift)
+        except TypeError:
+            raise EOFError
+
+        return number
+
+    def read_string(self, decompress=0):
+
+        """
+        Read a string from the file, decompressing the stored data if
+        'decompress' is set to a true value.
+        """
+
+        # Decompress the data if requested.
+
+        if decompress:
+            flag = self.read(1)
+        else:
+            flag = "-"
+
+        length = self.read_number()
+        s = self.read(length)
+
+        # Perform decompression if applicable.
+
+        if flag != "-":
+            fn = decompressors[flag]
+            s = fn(s)
+
+        # Convert strings to Unicode objects.
+
+        return unicode(s, "utf-8")
+
+    # Cache-affected methods.
+
+    def read(self, n):
+        needed = n - (self.cache_length - self.cache_start)
+
+        # Read the needed number of characters, if possible.
+
+        if needed > 0:
+            s = self.f.read(max(needed, READ_CACHE_SIZE))
+            self.cache += s
+            self.cache_length += len(s)
+
+        # Get the end of the requested block.
+
+        next_start = self.cache_start + n
+        s = self.cache[self.cache_start:next_start]
+
+        # Reposition the pointer to the cache.
+
+        self._seek_cache(len(s))
+        return s
+
+    def tell(self):
+        return self.f.tell() - self.cache_length + self.cache_start
+
+    def seek(self, offset):
+        "Move to 'offset', navigating the cache if it holds the target."
+        delta = offset - self.tell()
+
+        # A cached target leaves the file alone: it must stay at the cache end.
+        if 0 <= delta < self.cache_length - self.cache_start:
+            self._seek_cache(delta)
+        else:
+            self.f.seek(offset)
+            self.reset_cache()
+
+    def _seek_cache(self, delta):
+        next_start = self.cache_start + delta
+
+        if next_start > 0 and next_start >= len(self.cache):
+            self.reset_cache()
+
+        # If the cache is too big, resize it.
+
+        elif next_start > READ_CACHE_RESIZE:
+            self.cache = self.cache[next_start:]
+            self.cache_length = len(self.cache)
+            self.cache_start = 0
+
+        # Otherwise, just reference the next part of the cache.
+
+        else:
+            self.cache_start = next_start
+
+class FileOpener:
+
+    "Opening files using their filenames."
+
+    def __init__(self, filename):
+        self.filename = filename
+
+    def open(self, mode):
+        return open(self.filename, mode)
+
+    def close(self):
+        pass
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/filesystem.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/filesystem.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,129 @@
+#!/usr/bin/env python
+
+"""
+File access.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from iixr.fields import *
+from iixr.terms import *
+from iixr.positions import *
+from os import remove, rename    # partition manipulation
+from os.path import join
+
+# Constants.
+
+TERM_FILENAMES    = "terms", "terms_index", "positions", "positions_index"
+FIELD_FILENAMES   = "fields", "fields_index"
+
+# Utility functions.
+
+def get_term_writer(pathname, partition, interval, doc_interval):
+
+    """
+    Return a term dictionary writer using files under the given 'pathname'
+    labelled according to the given 'partition', using the given indexing
+    'interval' for terms and 'doc_interval' for document position records.
+    """
+
+    tdf = open(join(pathname, "terms-%s" % partition), "wb")
+    info_writer = TermWriter(tdf)
+
+    tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
+    index_writer = TermIndexWriter(tdif)
+
+    tpf = open(join(pathname, "positions-%s" % partition), "wb")
+    positions_writer = PositionWriter(tpf)
+
+    tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
+    positions_index_writer = PositionIndexWriter(tpif)
+
+    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
+
+    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
+
+def get_field_writer(pathname, partition, interval):
+
+    """
+    Return a field dictionary writer using files under the given 'pathname'
+    labelled according to the given 'partition', using the given indexing
+    'interval'.
+    """
+
+    ff = open(join(pathname, "fields-%s" % partition), "wb")
+    field_writer = FieldWriter(ff)
+
+    fif = open(join(pathname, "fields_index-%s" % partition), "wb")
+    field_index_writer = FieldIndexWriter(fif)
+
+    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
+
+def get_term_reader(pathname, partition):
+
+    """
+    Return a term dictionary reader using files under the given 'pathname'
+    labelled according to the given 'partition'.
+    """
+
+    tdf = open(join(pathname, "terms-%s" % partition), "rb")
+    info_reader = TermReader(tdf)
+
+    tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
+    index_reader = TermIndexReader(tdif)
+
+    positions_opener = PositionOpener(join(pathname, "positions-%s" % partition))
+    positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition))
+
+    positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)
+
+    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
+
+def get_field_reader(pathname, partition):
+
+    """
+    Return a field dictionary reader using files under the given 'pathname'
+    labelled according to the given 'partition'.
+    """
+
+    ff = open(join(pathname, "fields-%s" % partition), "rb")
+    field_reader = FieldReader(ff)
+
+    fif = open(join(pathname, "fields_index-%s" % partition), "rb")
+    field_index_reader = FieldIndexReader(fif)
+
+    return FieldDictionaryReader(field_reader, field_index_reader)
+
+def rename_files(pathname, names, from_partition, to_partition):
+    for name in names:
+        rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))
+
+def rename_term_files(pathname, from_partition, to_partition):
+    rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
+
+def rename_field_files(pathname, from_partition, to_partition):
+    rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
+
+def remove_files(pathname, names, partition):
+    for name in names:
+        remove(join(pathname, "%s-%s" % (name, partition)))
+
+def remove_term_files(pathname, partition):
+    remove_files(pathname, TERM_FILENAMES, partition)
+
+def remove_field_files(pathname, partition):
+    remove_files(pathname, FIELD_FILENAMES, partition)
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/index.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/index.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,326 @@
+#!/usr/bin/env python
+
+"""
+High-level classes.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from iixr.filesystem import *
+from os import listdir, mkdir    # index and partition discovery
+from os.path import exists
+
+try:
+    set
+except NameError:
+    from sets import Set as set
+
+# Constants.
+
+TERM_INTERVAL     = 100
+DOCUMENT_INTERVAL = 100
+FIELD_INTERVAL    = 100
+FLUSH_INTERVAL    = 10000
+
+# High-level classes.
+
+class Document:
+
+    "A container of document information."
+
+    def __init__(self, docnum):
+        self.docnum = docnum
+        self.fields = []
+        self.terms = {}
+
+    def add_position(self, term, position):
+
+        """
+        Add a position entry for the given 'term', indicating the given
+        'position'.
+        """
+
+        self.terms.setdefault(term, []).append(position)
+
+    def add_field(self, identifier, value):
+
+        "Add a field having the given 'identifier' and 'value'."
+
+        self.fields.append((identifier, unicode(value))) # convert to string
+
+    def set_fields(self, fields):
+
+        """
+        Set the document's 'fields': a list of tuples each containing an integer
+        identifier and a string value.
+        """
+
+        self.fields = fields
+
+class IndexWriter:
+
+    """
+    Building term information and writing it to the term and field dictionaries.
+    """
+
+    def __init__(self, pathname, interval, doc_interval, flush_interval):
+        self.pathname = pathname
+        self.interval = interval
+        self.doc_interval = doc_interval
+        self.flush_interval = flush_interval
+
+        self.dict_partition = 0
+        self.field_dict_partition = 0
+
+        self.terms = {}
+        self.docs = {}
+
+        self.doc_counter = 0
+
+    def add_document(self, doc):
+
+        """
+        Add the given document 'doc', updating the document counter and flushing
+        terms and fields if appropriate.
+        """
+
+        for term, positions in doc.terms.items():
+            self.terms.setdefault(term, {})[doc.docnum] = positions
+
+        self.docs[doc.docnum] = doc.fields
+
+        self.doc_counter += 1
+        if self.flush_interval and self.doc_counter >= self.flush_interval:
+            self.flush_terms()
+            self.flush_fields()
+            self.doc_counter = 0
+
+    def get_term_writer(self):
+
+        "Return a term dictionary writer for the current partition."
+
+        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
+
+    def get_field_writer(self):
+
+        "Return a field dictionary writer for the current partition."
+
+        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)
+
+    def flush_terms(self):
+
+        "Flush terms into the current term dictionary partition."
+
+        # Get the terms in order.
+
+        all_terms = self.terms
+        terms = all_terms.keys()
+        terms.sort()
+
+        dict_writer = self.get_term_writer()
+
+        for term in terms:
+            doc_positions = all_terms[term].items()
+            dict_writer.write_term_positions(term, doc_positions)
+
+        dict_writer.close()
+
+        self.terms = {}
+        self.dict_partition += 1
+
+    def flush_fields(self):
+
+        "Flush fields into the current field dictionary partition."
+
+        # Get the documents in order of document number.
+
+        docs = self.docs.items()
+        docs.sort()
+
+        field_dict_writer = self.get_field_writer()
+
+        for docnum, fields in docs:
+            field_dict_writer.write_fields(docnum, fields)
+
+        field_dict_writer.close()
+
+        self.docs = {}
+        self.field_dict_partition += 1
+
+    def close(self):
+        if self.terms:
+            self.flush_terms()
+        if self.docs:
+            self.flush_fields()
+
+class IndexReader:
+
+    "Accessing the term and field dictionaries."
+
+    def __init__(self, pathname):
+        self.dict_reader = get_term_reader(pathname, "merged")
+        self.field_dict_reader = get_field_reader(pathname, "merged")
+
+    def find_terms(self, term):
+        return self.dict_reader.find_terms(term)
+
+    def find_positions(self, term):
+        return self.dict_reader.find_positions(term)
+
+    def get_frequency(self, term):
+        return self.dict_reader.get_frequency(term)
+
+    def get_document_frequency(self, term):
+        return self.dict_reader.get_document_frequency(term)
+
+    def get_fields(self, docnum):
+        return self.field_dict_reader.get_fields(docnum)
+
+    def close(self):
+        self.dict_reader.close()
+        self.field_dict_reader.close()
+
+class Index:
+
+    "An inverted index solution encapsulating the various components."
+
+    def __init__(self, pathname):
+        self.pathname = pathname
+        self.reader = None
+        self.writer = None
+
+    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):
+
+        """
+        Return a writer, optionally using the given indexing 'interval',
+        'doc_interval' and 'flush_interval'.
+        """
+
+        if not exists(self.pathname):
+            mkdir(self.pathname)
+
+        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
+        return self.writer
+
+    def get_reader(self, partition=0):
+
+        "Merge all partitions and return a reader ('partition' is not used)."
+
+        # Ensure that only one partition exists.
+
+        self.merge()
+        return self._get_reader(partition)
+
+    def _get_reader(self, partition):
+
+        "Return a reader for the merged index ('partition' is ignored)."
+
+        if not exists(self.pathname):
+            raise OSError, "Index path %r does not exist." % self.pathname
+
+        self.reader = IndexReader(self.pathname)
+        return self.reader
+
+    def merge(self):
+
+        "Merge/optimise index partitions."
+
+        self.merge_terms()
+        self.merge_fields()
+
+    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
+
+        """
+        Merge term dictionaries using the given indexing 'interval' and
+        'doc_interval'. Readers are only opened when a merge is needed.
+        """
+
+        partitions = set()
+
+        for filename in listdir(self.pathname):
+            if filename.startswith("terms-"): # 6 character prefix
+                partitions.add(filename[6:])
+
+        # Write directly to a dictionary.
+
+        if len(partitions) > 1:
+            if "merged" in partitions:
+                rename_term_files(self.pathname, "merged", "old-merged")
+                partitions.remove("merged")
+                partitions.add("old-merged")
+
+            # Open readers after renaming so that they use the final filenames.
+
+            readers = [get_term_reader(self.pathname, p) for p in partitions]
+            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
+            merger = TermDictionaryMerger(writer, readers)
+            merger.merge()
+            merger.close()
+
+            # Remove old files.
+
+            for partition in partitions:
+                remove_term_files(self.pathname, partition)
+
+        elif len(partitions) == 1:
+            partition = list(partitions)[0]
+            if partition != "merged":
+                rename_term_files(self.pathname, partition, "merged")
+
+    def merge_fields(self, interval=FIELD_INTERVAL):
+
+        "Merge field dictionaries using the given indexing 'interval'."
+
+        partitions = set()
+
+        for filename in listdir(self.pathname):
+            if filename.startswith("fields-"): # 7 character prefix
+                partitions.add(filename[7:])
+
+        # Write directly to a dictionary.
+
+        if len(partitions) > 1:
+            if "merged" in partitions:
+                rename_field_files(self.pathname, "merged", "old-merged")
+                partitions.remove("merged")
+                partitions.add("old-merged")
+
+            # Open readers only when merging; the merger closes them afterwards.
+
+            readers = [get_field_reader(self.pathname, p) for p in partitions]
+            writer = get_field_writer(self.pathname, "merged", interval)
+            merger = FieldDictionaryMerger(writer, readers)
+            merger.merge()
+            merger.close()
+
+            # Remove old files.
+
+            for partition in partitions:
+                remove_field_files(self.pathname, partition)
+
+        elif len(partitions) == 1:
+            partition = list(partitions)[0]
+            if partition != "merged":
+                rename_field_files(self.pathname, partition, "merged")
+
+    def close(self):
+        if self.reader is not None:
+            self.reader.close()
+            self.reader = None
+        if self.writer is not None:
+            self.writer.close()
+            self.writer = None
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/merging.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/merging.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,74 @@
+#!/usr/bin/env python
+
+"""
+Dictionary merging classes.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from itermerge import itermerge
+
+class Merger:
+
+    "Merge files."
+
+    def __init__(self, writer, readers):
+        self.writer = writer
+        self.readers = readers
+
+    def close(self):
+        for reader in self.readers:
+            reader.close()
+        self.writer.close()
+
+class TermDictionaryMerger(Merger):
+
+    "Merge term and position files."
+
+    def merge(self):
+
+        """
+        Merge terms and positions from the readers, sending them to the writer.
+        """
+
+        last_term = None
+        current_readers = []
+
+        for term, frequency, doc_frequency, positions in itermerge(self.readers):
+            if term == last_term:
+                current_readers.append(positions)
+            else:
+                if current_readers:
+                    self.writer.write_term_positions(last_term, itermerge(current_readers))
+                last_term = term
+                current_readers = [positions]
+        else:
+            if current_readers:
+                self.writer.write_term_positions(last_term, itermerge(current_readers))
+
+class FieldDictionaryMerger(Merger):
+
+    "Merge field files."
+
+    def merge(self):
+
+        """
+        Merge fields from the readers, sending them to the writer.
+        """
+
+        for docnum, fields in itermerge(self.readers):
+            self.writer.write_fields(docnum, fields)
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/positions.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/positions.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,525 @@
+#!/usr/bin/env python
+
+"""
+Specific classes for storing position information.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from iixr.files import *
+from iixr.data import vint
+
+class PositionWriter(FileWriter):
+
+    "Writing position information to files."
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def write_positions(self, docnum, positions):
+
+        """
+        Write for the document 'docnum' the given 'positions'.
+        Return the offset of the written record.
+        """
+
+        if docnum < self.last_docnum:
+            raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
+
+        # Record the offset of this record.
+
+        offset = self.tell()
+
+        # Make sure that the positions are sorted.
+
+        positions.sort()
+
+        # Write the position deltas.
+
+        output = []
+        last = 0
+
+        for position in positions:
+            output.append(vint(position - last))
+            last = position
+
+        # Write the document number delta.
+        # Write the number of positions.
+        # Then write the positions.
+
+        self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))
+
+        self.last_docnum = docnum
+        return offset
+
+class PositionOpener(FileOpener):
+
+    "Reading position information from files."
+
+    def read_term_positions(self, offset, count):
+
+        """
+        Read all positions from 'offset', seeking to that position in the file
+        before reading. The number of documents available for reading is limited
+        to 'count'.
+        """
+
+        # Duplicate the file handle.
+
+        f = self.open("rb")
+        return PositionIterator(f, offset, count)
+
+class PositionIndexWriter(FileWriter):
+
+    "Writing position index information to files."
+
+    def reset(self):
+        self.last_docnum = 0
+        self.last_pos_offset = 0
+
+    def write_positions(self, docnum, pos_offset, count):
+
+        """
+        Write the given 'docnum', 'pos_offset' and document 'count' to the
+        position index file, returning the offset of the written record.
+        """
+
+        # Record the offset of this record.
+
+        offset = self.tell()
+        output = []
+
+        # Write the document number delta.
+
+        output.append(vint(docnum - self.last_docnum))
+        self.last_docnum = docnum
+
+        # Write the position file offset delta.
+
+        output.append(vint(pos_offset - self.last_pos_offset))
+        self.last_pos_offset = pos_offset
+
+        # Write the document count.
+
+        output.append(vint(count))
+
+        # Actually write the data.
+
+        self.write("".join(output))
+
+        return offset
+
+class PositionIndexOpener(FileOpener):
+
+    "Reading position index information from files."
+
+    def read_term_positions(self, offset, doc_frequency):
+
+        """
+        Read all positions from 'offset', seeking to that position in the file
+        before reading. The number of documents available for reading is limited
+        to 'doc_frequency'.
+        """
+
+        # Duplicate the file handle.
+
+        f = self.open("rb")
+        return PositionIndexIterator(f, offset, doc_frequency)
+
+# Iterators for position-related files.
+
+class IteratorBase:
+
+    def __init__(self, count):
+        self.replenish(count)
+
+    def replenish(self, count):
+        self.count = count
+        self.read_documents = 0
+
+    def __len__(self):
+        return self.count
+
+    def sort(self):
+        pass # Stored document positions are already sorted.
+
+    def __iter__(self):
+        return self
+
+class PositionIterator(FileReader, IteratorBase):
+
+    "Iterating over document positions."
+
+    def __init__(self, f, offset, count):
+        FileReader.__init__(self, f)
+        IteratorBase.__init__(self, count)
+        self.seek(offset)
+
+    def reset(self):
+        self.last_docnum = 0
+
+    def read_positions(self):
+
+        "Read positions, returning a document number and a list of positions."
+
+        # Read the document number delta and add it to the last number.
+
+        self.last_docnum += self.read_number()
+
+        # Read the number of positions.
+
+        npositions = self.read_number()
+
+        # Read the position deltas, adding each previous position to get the
+        # appropriate collection of absolute positions.
+
+        i = 0
+        last = 0
+        positions = []
+
+        while i < npositions:
+            last += self.read_number()
+            positions.append(last)
+            i += 1
+
+        return self.last_docnum, positions
+
+    def next(self):
+
+        "Read positions for a single document."
+
+        if self.read_documents < self.count:
+            self.read_documents += 1
+            return self.read_positions()
+        else:
+            raise StopIteration
+
+class PositionIndexIterator(FileReader, IteratorBase):
+
+    "Iterating over position index entries."
+
+    def __init__(self, f, offset, count):
+        FileReader.__init__(self, f)
+        IteratorBase.__init__(self, count)
+        self.seek(offset)
+        self.section_count = 0
+
+    def reset(self):
+        self.last_docnum = 0
+        self.last_pos_offset = 0
+
+    def read_positions(self):
+
+        """
+        Read from the position index file a document number, a position file
+        offset, and the number of documents in the section found at that offset.
+        """
+
+        # Read the document number delta.
+
+        self.last_docnum += self.read_number()
+
+        # Read the offset delta.
+
+        self.last_pos_offset += self.read_number()
+
+        # Read the document count.
+
+        count = self.read_number()
+
+        return self.last_docnum, self.last_pos_offset, count
+
+    def next(self):
+
+        "Read the index entry for the next section of documents."
+
+        self.read_documents += self.section_count
+        if self.read_documents < self.count:
+            docnum, pos_offset, self.section_count = t = self.read_positions()
+            return t
+        else:
+            raise StopIteration
+
+class PositionDictionaryWriter:
+
+    "Writing position dictionaries."
+
+    def __init__(self, position_writer, position_index_writer, interval):
+        self.position_writer = position_writer
+        self.position_index_writer = position_index_writer
+        self.interval = interval
+
+    def write_term_positions(self, doc_positions):
+
+        """
+        Write all 'doc_positions' - a collection of tuples of the form (document
+        number, position list) - to the file.
+
+        Add some records to the index, making dictionary entries.
+
+        Return a tuple containing the offset of the first index entry written
+        (or None if nothing was written), the frequency (number of positions),
+        and the document frequency (number of documents) for the term involved.
+        """
+
+        # Reset the writers.
+
+        self.position_writer.reset()
+        self.position_index_writer.reset()
+
+        index_offset = None
+
+        # Write the positions.
+
+        frequency = 0
+        first_docnum = None
+        first_offset = None
+        count = 0
+
+        doc_positions.sort()
+
+        for docnum, positions in doc_positions:
+            pos_offset = self.position_writer.write_positions(docnum, positions)
+
+            # Retain the first record offset for a subsequent index entry.
+
+            if first_offset is None:
+                first_offset = pos_offset
+                first_docnum = docnum
+
+            frequency += len(positions)
+            count += 1
+
+            # Every {interval} entries, write an index entry.
+
+            if count % self.interval == 0:
+                io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
+
+                # Remember the first index entry offset.
+
+                if index_offset is None:
+                    index_offset = io
+
+                first_offset = None
+                first_docnum = None
+
+                # Reset the position writer so that position readers accessing
+                # a section start with the correct document number.
+
+                self.position_writer.reset()
+
+        # Finish writing an index entry for the remaining documents.
+
+        else:
+            if first_offset is not None:
+                io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
+
+                # Remember the first index entry offset.
+
+                if index_offset is None:
+                    index_offset = io
+
+        return index_offset, frequency, count
+
+    def close(self):
+        self.position_writer.close()
+        self.position_index_writer.close()
+
+class PositionDictionaryReader:
+
+    "Reading position dictionaries."
+
+    def __init__(self, position_opener, position_index_opener):
+        self.position_opener = position_opener
+        self.position_index_opener = position_index_opener
+
+    def read_term_positions(self, offset, doc_frequency):
+
+        """
+        Return an iterator for dictionary entries starting at 'offset' with the
+        given 'doc_frequency'.
+        """
+
+        return PositionDictionaryIterator(self.position_opener,
+            self.position_index_opener, offset, doc_frequency)
+
+    def close(self):
+        pass
+
+class PositionDictionaryIterator:
+
+    "Iteration over position dictionary entries."
+
+    def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
+        self.position_opener = position_opener
+        self.doc_frequency = doc_frequency
+        self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
+        self.iterator = None
+
+        # Remember the last values.
+
+        self.found_docnum, self.found_positions = None, None
+
+        # Maintain state for the next index entry, if read.
+
+        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
+
+        # Initialise the current index entry and current position file iterator.
+
+        self._next_section()
+        self._init_section()
+
+    # Sequence methods.
+
+    def __len__(self):
+        return self.doc_frequency
+
+    def sort(self):
+        pass
+
+    # Iterator methods.
+
+    def __iter__(self):
+        return self
+
+    def next(self):
+
+        """
+        Attempt to get the next document record from the section in the
+        positions file.
+        """
+
+        # Return any visited but unrequested record.
+
+        if self.found_docnum is not None:
+            t = self.found_docnum, self.found_positions
+            self.found_docnum, self.found_positions = None, None
+            return t
+
+        # Or search for the next record.
+
+        while 1:
+
+            # Either return the next record.
+
+            try:
+                return self.iterator.next()
+
+            # Or, where a section is finished, get the next section and try again.
+
+            except StopIteration:
+
+                # Where a section follows, update the index iterator, but keep
+                # reading using the same file iterator (since the data should
+                # just follow on from the last section).
+
+                self._next_section()
+                self.iterator.replenish(self.section_count)
+
+                # Reset the state of the iterator to make sure that document
+                # numbers are correct.
+
+                self.iterator.reset()
+
+    def from_document(self, docnum):
+
+        """
+        Attempt to navigate to a positions entry for the given 'docnum',
+        returning the positions for 'docnum', or None otherwise.
+        """
+
+        # Return any unrequested document positions.
+
+        if docnum == self.found_docnum:
+            return self.found_positions
+
+        # Read ahead in the index until the next entry refers to a document
+        # later than the desired document.
+
+        try:
+            if self.next_docnum is None:
+                self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
+
+            # Read until the next entry is after the desired document number,
+            # or until the end of the results.
+
+            while self.next_docnum <= docnum:
+                self._next_read_section()
+                if self.docnum < docnum:
+                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
+                else:
+                    break
+
+        except StopIteration:
+            pass
+
+        # Navigate in the position file to the document.
+
+        self._init_section()
+
+        try:
+            while 1:
+                found_docnum, found_positions = self.iterator.next()
+
+                # Return the desired document positions or None (retaining the
+                # positions for the document immediately after).
+
+                if docnum == found_docnum:
+                    return found_positions
+                elif docnum < found_docnum:
+                    self.found_docnum, self.found_positions = found_docnum, found_positions
+                    return None
+
+        except StopIteration:
+            return None
+
+    # Internal methods.
+
+    def _next_section(self):
+
+        "Attempt to get the next section in the index."
+
+        if self.next_docnum is None:
+            self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
+        else:
+            self._next_read_section()
+
+    def _next_read_section(self):
+
+        """
+        Make the next index entry the current one without reading from the
+        index.
+        """
+
+        self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
+        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
+
+    def _init_section(self):
+
+        "Initialise the iterator for the section in the position file."
+
+        if self.iterator is not None:
+            self.iterator.close()
+        self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
+
+    def close(self):
+        if self.iterator is not None:
+            self.iterator.close()
+            self.iterator = None
+        if self.index_iterator is not None:
+            self.index_iterator.close()
+            self.index_iterator = None
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 iixr/terms.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/terms.py	Tue Sep 15 00:15:11 2009 +0200
@@ -0,0 +1,395 @@
+#!/usr/bin/env python
+
+"""
+Specific classes for storing term information.
+
+Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE.  See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program.  If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from iixr.files import *
+from os.path import commonprefix # to find common string prefixes
+from bisect import bisect_right  # to find terms in the dictionary index
+
+class TermWriter(FileWriter):
+
+    "Writing term information to files."
+
+    def reset(self):
+        self.last_term = ""
+        self.last_offset = 0
+
+    def write_term(self, term, offset, frequency, doc_frequency):
+
+        """
+        Write the given 'term', its position file 'offset', its 'frequency' and
+        its 'doc_frequency' (number of documents in which it appears) to the
+        term information file. Return the offset after the term information was
+        written to the file.
+        """
+
+        # Write the prefix length and term suffix.
+
+        common = len(commonprefix([self.last_term, term]))
+        suffix = term[common:]
+
+        self.write_number(common)
+        self.write_string(suffix)
+
+        # Write the offset delta.
+
+        self.write_number(offset - self.last_offset)
+
+        # Write the frequency.
+
+        self.write_number(frequency)
+
+        # Write the document frequency.
+
+        self.write_number(doc_frequency)
+
+        self.last_term = term
+        self.last_offset = offset
+
+        return self.tell()
+
+class TermReader(FileReader):
+
+    "Reading term information from files."
+
+    def reset(self):
+        self.last_term = ""
+        self.last_offset = 0
+
+    def read_term(self):
+
+        """
+        Read a term, its position file offset, its frequency and its document
+        frequency from the term information file.
+        """
+
+        # Read the prefix length and term suffix.
+
+        common = self.read_number()
+        suffix = self.read_string()
+
+        self.last_term = self.last_term[:common] + suffix
+
+        # Read the offset delta.
+
+        self.last_offset += self.read_number()
+
+        # Read the frequency.
+
+        frequency = self.read_number()
+
+        # Read the document frequency.
+
+        doc_frequency = self.read_number()
+
+        return self.last_term, self.last_offset, frequency, doc_frequency
+
+    def go_to_term(self, term, offset, info_offset):
+
+        """
+        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
+        permits the scanning for later terms from the specified term.
+        """
+
+        self.seek(info_offset)
+        self.last_term = term
+        self.last_offset = offset
+
+class TermIndexWriter(TermWriter):
+
+    "Writing term dictionary index details to files."
+
+    def reset(self):
+        TermWriter.reset(self)
+        self.last_info_offset = 0
+
+    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
+
+        """
+        Write the given 'term', its position file 'offset', its 'frequency' and
+        its 'doc_frequency' to the term dictionary index file, along with the
+        'info_offset' in the term information file.
+        """
+
+        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
+
+        # Write the information file offset delta.
+
+        self.write_number(info_offset - self.last_info_offset)
+        self.last_info_offset = info_offset
+
+class TermIndexReader(TermReader):
+
+    "Reading term dictionary index details from files."
+
+    def reset(self):
+        TermReader.reset(self)
+        self.last_info_offset = 0
+
+    def read_term(self):
+
+        """
+        Read a term, its position file offset, its frequency, its document
+        frequency and a term information file offset from the term dictionary
+        index file.
+        """
+
+        term, offset, frequency, doc_frequency = TermReader.read_term(self)
+
+        # Read the information file offset delta.
+
+        self.last_info_offset += self.read_number()
+
+        return term, offset, frequency, doc_frequency, self.last_info_offset
+
+class TermDictionaryWriter:
+
+    "Writing term dictionaries."
+
+    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
+        self.info_writer = info_writer
+        self.index_writer = index_writer
+        self.position_dict_writer = position_dict_writer
+        self.interval = interval
+        self.entry = 0
+
+    def _write_term(self, term, offset, frequency, doc_frequency):
+
+        """
+        Write the given 'term', its position file 'offset', its 'frequency' and
+        its 'doc_frequency' (number of documents in which it appears) to the
+        term information file, also writing an entry to the term index file
+        for every 'interval' terms. Nothing is returned.
+        """
+
+        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
+
+        if self.entry % self.interval == 0:
+            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
+
+        self.entry += 1
+
+    def write_term_positions(self, term, doc_positions):
+
+        """
+        Write the given 'term' and the 'doc_positions' recording the documents
+        and positions at which the term is found.
+        """
+
+        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
+        self._write_term(term, offset, frequency, doc_frequency)
+
+    def close(self):
+        self.info_writer.close()
+        self.index_writer.close()
+        self.position_dict_writer.close()
+
+class TermDictionaryReader:
+
+    "Reading term dictionaries."
+
+    def __init__(self, info_reader, index_reader, position_dict_reader):
+        self.info_reader = info_reader
+        self.index_reader = index_reader
+        self.position_dict_reader = position_dict_reader
+
+        self.terms = []
+        try:
+            while 1:
+                self.terms.append(self.index_reader.read_term())
+        except EOFError:
+            pass
+
+        # An offset beyond that of the last index entry, for ordering purposes.
+
+        if self.terms:
+            self.max_offset = self.terms[-1][1] + 1
+        else:
+            self.max_offset = None
+
+    def _find_closest_entry(self, term):
+
+        """
+        Find the index entry for 'term' in the term dictionary index or,
+        failing that, the closest index entry preceding 'term'.
+
+        Return the closest index entry consisting of a term, the position file
+        offset, the term frequency, the document frequency, and the term details
+        file offset.
+        """
+
+        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
+
+        # Get the entry position providing the term or one preceding it.
+        # If no entry precedes the requested term, return the very first entry
+        # as the closest.
+
+        if i == -1:
+            return self.terms[0]
+        else:
+            return self.terms[i]
+
+    def _find_closest_term(self, term):
+
+        """
+        Find the offsets and frequencies of 'term' from the term dictionary or
+        the closest term starting with the value of 'term'.
+
+        Return the closest term (or the term itself), the position file offset,
+        the term frequency, the document frequency, and the term details file
+        offset (or None if the reader is already positioned).
+        """
+
+        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
+
+        # Where the term is found immediately, return the offset and
+        # frequencies. If the term does not appear, return the details of the
+        # closest entry.
+
+        if term <= found_term:
+            return found_term, offset, frequency, doc_frequency, info_offset
+
+        # Otherwise, seek past the index term's entry in the information file
+        # and scan for the desired term.
+
+        else:
+            self.info_reader.go_to_term(found_term, offset, info_offset)
+            try:
+                while term > found_term:
+                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
+            except EOFError:
+                pass
+
+            return found_term, offset, frequency, doc_frequency, None
+
+    def _find_term(self, term):
+
+        """
+        Find the position file offset, frequency and document frequency of
+        'term' from the term dictionary, returning None if it is not present.
+        """
+
+        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
+
+        # If the term is found, return the offset and frequencies.
+
+        if term == found_term:
+            return offset, frequency, doc_frequency
+        else:
+            return None
+
+    def _get_positions(self, offset, doc_frequency):
+        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
+
+    # Iterator convenience methods.
+
+    def __iter__(self):
+        self.rewind()
+        return self
+
+    def next(self):
+        try:
+            return self.read_term()
+        except EOFError:
+            raise StopIteration
+
+    # Sequential access methods.
+
+    def rewind(self):
+        self.info_reader.rewind()
+
+    def read_term(self):
+
+        """
+        Return the next term, its frequency, its document frequency, and the
+        documents and positions at which the term is found.
+        """
+
+        term, offset, frequency, doc_frequency = self.info_reader.read_term()
+        positions = self._get_positions(offset, doc_frequency)
+        return term, frequency, doc_frequency, positions
+
+    # Query methods.
+
+    def find_terms(self, term):
+
+        "Return all terms whose values start with the value of 'term'."
+
+        terms = []
+
+        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
+
+        # Position the reader, if necessary.
+
+        if info_offset is not None:
+            self.info_reader.go_to_term(found_term, offset, info_offset)
+
+        # Read and record terms.
+
+        try:
+            # Add the found term if it starts with the specified term.
+
+            while found_term.startswith(term):
+                terms.append(found_term)
+                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
+
+        except EOFError:
+            pass
+
+        return terms
+
+    def find_positions(self, term):
+
+        "Return the documents and positions for the given 'term', or None if absent."
+
+        t = self._find_term(term)
+        if t is None:
+            return None
+        else:
+            offset, frequency, doc_frequency = t
+            return self._get_positions(offset, doc_frequency)
+
+    def get_frequency(self, term):
+
+        "Return the frequency of the given 'term', or None if it is not found."
+
+        t = self._find_term(term)
+        if t is None:
+            return None
+        else:
+            offset, frequency, doc_frequency = t
+            return frequency
+
+    def get_document_frequency(self, term):
+
+        "Return the document frequency of the given 'term', or None if not found."
+
+        t = self._find_term(term)
+        if t is None:
+            return None
+        else:
+            offset, frequency, doc_frequency = t
+            return doc_frequency
+
+    def close(self):
+        self.info_reader.close()
+        self.index_reader.close()
+        self.position_dict_reader.close()
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r a0f37b0ef350 -r fad9698e2c46 setup.py
--- a/setup.py	Mon Sep 14 21:23:32 2009 +0200
+++ b/setup.py	Tue Sep 15 00:15:11 2009 +0200
@@ -11,6 +11,7 @@
     author_email = "paul@boddie.org.uk",
     url          = "http://www.boddie.org.uk/python/iixr.html",
     version      = "0.1",
-    py_modules   = ["iixr", "itermerge"],
+    py_modules   = ["itermerge"],
+    packages     = ["iixr"],
     ext_modules  = [vint],
     )
diff -r a0f37b0ef350 -r fad9698e2c46 test.py
--- a/test.py	Mon Sep 14 21:23:32 2009 +0200
+++ b/test.py	Tue Sep 15 00:15:11 2009 +0200
@@ -1,6 +1,10 @@
 #!/usr/bin/env python
 
-import iixr
+from iixr.files import *
+from iixr.fields import *
+from iixr.terms import *
+from iixr.positions import *
+from iixr.index import *
 import os
 
 # Remove old test files.
@@ -23,13 +27,13 @@
 numbers = [12345678, 0, 1, 127, 128, 255, 256]
 
 f = open("test", "wb")
-w = iixr.FileWriter(f)
+w = FileWriter(f)
 for number in numbers:
     w.write_number(number)
 w.close()
 
 f = open("test", "rb")
-r = iixr.FileReader(f)
+r = FileReader(f)
 for number in numbers:
     n = r.read_number()
     print number == n, number, n
@@ -52,7 +56,7 @@
     ]
 
 f = open("testP", "wb")
-w = iixr.PositionWriter(f)
+w = PositionWriter(f)
 for doc_positions in all_doc_positions:
     for docnum, positions in doc_positions:
         w.write_positions(docnum, positions)
@@ -60,7 +64,7 @@
 w.close()
 
 f = open("testP", "rb")
-r = iixr.PositionIterator(f, 0, None)
+r = PositionIterator(f, 0, None)
 for doc_positions in all_doc_positions:
     for docnum, positions in doc_positions:
         d, p = r.read_positions()
@@ -84,7 +88,7 @@
 
 offsets = []
 f = open("testPI", "wb")
-w = iixr.PositionIndexWriter(f)
+w = PositionIndexWriter(f)
 for term_positions in indexed_positions:
     offset = None
     doc_frequency = 0
@@ -97,7 +101,7 @@
     offsets.append((offset, doc_frequency))
 w.close()
 
-r = iixr.PositionIndexOpener("testPI")
+r = PositionIndexOpener("testPI")
 offsets.reverse()
 indexed_positions.reverse()
 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
@@ -111,19 +115,19 @@
 # Test position dictionaries.
 
 f = open("testP", "wb")
-w = iixr.PositionWriter(f)
+w = PositionWriter(f)
 f2 = open("testPI", "wb")
-w2 = iixr.PositionIndexWriter(f2)
-wd = iixr.PositionDictionaryWriter(w, w2, 2)
+w2 = PositionIndexWriter(f2)
+wd = PositionDictionaryWriter(w, w2, 2)
 offsets = []
 for doc_positions in all_doc_positions:
     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
     offsets.append((offset, doc_frequency))
 wd.close()
 
-r = iixr.PositionOpener("testP")
-r2 = iixr.PositionIndexOpener("testPI")
-rd = iixr.PositionDictionaryReader(r, r2)
+r = PositionOpener("testP")
+r2 = PositionIndexOpener("testPI")
+rd = PositionDictionaryReader(r, r2)
 offsets.reverse()
 all_doc_positions.reverse()
 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
@@ -144,13 +148,13 @@
     ]
 
 f = open("testF", "wb")
-w = iixr.FieldWriter(f)
+w = FieldWriter(f)
 for docnum, fields in doc_fields:
     w.write_fields(docnum, list(enumerate(fields)))
 w.close()
 
 f = open("testF", "rb")
-r = iixr.FieldReader(f)
+r = FieldReader(f)
 for docnum, fields in doc_fields:
     dn, df = r.read_fields()
     print docnum == dn, docnum, dn
@@ -166,13 +170,13 @@
     ]
 
 f = open("testFI", "wb")
-w = iixr.FieldIndexWriter(f)
+w = FieldIndexWriter(f)
 for docnum, offset in indexed_docs:
     w.write_document(docnum, offset)
 w.close()
 
 f = open("testFI", "rb")
-r = iixr.FieldIndexReader(f)
+r = FieldIndexReader(f)
 for docnum, offset in indexed_docs:
     dn, o = r.read_document()
     print docnum == dn, docnum, dn
@@ -182,19 +186,19 @@
 # Test field dictionaries.
 
 f = open("testF", "wb")
-w = iixr.FieldWriter(f)
+w = FieldWriter(f)
 f2 = open("testFI", "wb")
-w2 = iixr.FieldIndexWriter(f2)
-wd = iixr.FieldDictionaryWriter(w, w2, 3)
+w2 = FieldIndexWriter(f2)
+wd = FieldDictionaryWriter(w, w2, 3)
 for docnum, fields in doc_fields:
     wd.write_fields(docnum, list(enumerate(fields)))
 wd.close()
 
 f = open("testF", "rb")
-r = iixr.FieldReader(f)
+r = FieldReader(f)
 f2 = open("testFI", "rb")
-r2 = iixr.FieldIndexReader(f2)
-rd = iixr.FieldDictionaryReader(r, r2)
+r2 = FieldIndexReader(f2)
+rd = FieldDictionaryReader(r, r2)
 doc_fields_reversed = doc_fields[:]
 doc_fields_reversed.reverse()
 for docnum, fields in doc_fields_reversed:
@@ -226,13 +230,13 @@
     ]
 
 f = open("test", "wb")
-w = iixr.TermWriter(f)
+w = TermWriter(f)
 for term, offset, frequency, doc_frequency in terms:
     w.write_term(term, offset, frequency, doc_frequency)
 w.close()
 
 f = open("test", "rb")
-r = iixr.TermReader(f)
+r = TermReader(f)
 for term, offset, frequency, doc_frequency in terms:
     t, o, fr, df = r.read_term()
     print term == t, term, t
@@ -254,13 +258,13 @@
     ]
 
 f = open("test", "wb")
-w = iixr.TermIndexWriter(f)
+w = TermIndexWriter(f)
 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
     w.write_term(term, offset, frequency, doc_frequency, info_offset)
 w.close()
 
 f = open("test", "rb")
-r = iixr.TermIndexReader(f)
+r = TermIndexReader(f)
 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
     t, o, fr, df, i = r.read_term()
     print term == t, term, t
@@ -273,27 +277,27 @@
 # Test dictionaries with only term data.
 
 f = open("test", "wb")
-w = iixr.TermWriter(f)
+w = TermWriter(f)
 f2 = open("testI", "wb")
-w2 = iixr.TermIndexWriter(f2)
+w2 = TermIndexWriter(f2)
 f3 = open("testP", "wb")
-w3 = iixr.PositionWriter(f3)
+w3 = PositionWriter(f3)
 f4 = open("testPI", "wb")
-w4 = iixr.PositionIndexWriter(f4)
-wp = iixr.PositionDictionaryWriter(w3, w4, 2)
-wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
+w4 = PositionIndexWriter(f4)
+wp = PositionDictionaryWriter(w3, w4, 2)
+wd = TermDictionaryWriter(w, w2, wp, 3)
 for term, offset, frequency, doc_frequency in terms:
     wd._write_term(term, offset, frequency, doc_frequency)
 wd.close()
 
 f = open("test", "rb")
-r = iixr.TermReader(f)
+r = TermReader(f)
 f2 = open("testI", "rb")
-r2 = iixr.TermIndexReader(f2)
-r3 = iixr.PositionOpener("testP")
-r4 = iixr.PositionIndexOpener("testPI")
-rp = iixr.PositionDictionaryReader(r3, r4)
-rd = iixr.TermDictionaryReader(r, r2, rp)
+r2 = TermIndexReader(f2)
+r3 = PositionOpener("testP")
+r4 = PositionIndexOpener("testPI")
+rp = PositionDictionaryReader(r3, r4)
+rd = TermDictionaryReader(r, r2, rp)
 terms_reversed = terms[:]
 terms_reversed.reverse()
 for term, offset, frequency, doc_frequency in terms_reversed:
@@ -335,27 +339,27 @@
     ]
 
 f = open("test", "wb")
-w = iixr.TermWriter(f)
+w = TermWriter(f)
 f2 = open("testI", "wb")
-w2 = iixr.TermIndexWriter(f2)
+w2 = TermIndexWriter(f2)
 f3 = open("testP", "wb")
-w3 = iixr.PositionWriter(f3)
+w3 = PositionWriter(f3)
 f4 = open("testPI", "wb")
-w4 = iixr.PositionIndexWriter(f4)
-wp = iixr.PositionDictionaryWriter(w3, w4, 2)
-wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
+w4 = PositionIndexWriter(f4)
+wp = PositionDictionaryWriter(w3, w4, 2)
+wd = TermDictionaryWriter(w, w2, wp, 3)
 for term, doc_positions in terms_with_positions:
     wd.write_term_positions(term, doc_positions)
 wd.close()
 
 f = open("test", "rb")
-r = iixr.TermReader(f)
+r = TermReader(f)
 f2 = open("testI", "rb")
-r2 = iixr.TermIndexReader(f2)
-r3 = iixr.PositionOpener("testP")
-r4 = iixr.PositionIndexOpener("testPI")
-rp = iixr.PositionDictionaryReader(r3, r4)
-rd = iixr.TermDictionaryReader(r, r2, rp)
+r2 = TermIndexReader(f2)
+r3 = PositionOpener("testP")
+r4 = PositionIndexOpener("testPI")
+rp = PositionDictionaryReader(r3, r4)
+rd = TermDictionaryReader(r, r2, rp)
 terms_reversed = terms_with_positions[:]
 terms_reversed.reverse()
 for term, doc_positions in terms_reversed:
@@ -407,10 +411,10 @@
     ("shells", 37, None)
     ]
 
-index = iixr.Index("test_index")
+index = Index("test_index")
 wi = index.get_writer(3, 2, 6)
 for docnum, text in docs:
-    doc = iixr.Document(docnum)
+    doc = Document(docnum)
     for position, term in enumerate(text.split()):
         doc.add_position(term, position)
     doc.add_field(123, text)