1.1 --- a/iixr/fields.py	Sat Feb 12 01:23:58 2011 +0100
     1.2 +++ /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.3 @@ -1,345 +0,0 @@
     1.4 -#!/usr/bin/env python
     1.5 -
     1.6 -"""
     1.7 -Specific classes for storing document information.
     1.8 -
     1.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
    1.10 -
    1.11 -This program is free software; you can redistribute it and/or modify it under
    1.12 -the terms of the GNU General Public License as published by the Free Software
    1.13 -Foundation; either version 3 of the License, or (at your option) any later
    1.14 -version.
    1.15 -
    1.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
    1.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
    1.18 -PARTICULAR PURPOSE.  See the GNU General Public License for more details.
    1.19 -
    1.20 -You should have received a copy of the GNU General Public License along
    1.21 -with this program.  If not, see <http://www.gnu.org/licenses/>.
    1.22 -"""
    1.23 -
    1.24 -from iixr.data import *
    1.25 -from iixr.files import *
    1.26 -from bisect import bisect_right  # to find terms in the dictionary index
    1.27 -
# Number of documents at which FieldDictionaryReader.get_fields evicts a
# cached entry before storing another one.
DOCUMENT_CACHE_LIMIT = 10000
    1.29 -
    1.30 -class FieldWriter(FileWriter):
    1.31 -
    1.32 -    "Writing field data to files."
    1.33 -
    1.34 -    def begin(self, docnum_size):
    1.35 -        self.write_number(docnum_size)
    1.36 -        self.end_record()
    1.37 -        self.docnum_size = docnum_size
    1.38 -        self.data_start = self.tell()
    1.39 -
    1.40 -    def reset(self):
    1.41 -        self.end_record()
    1.42 -        self.last_docnum = None
    1.43 -        self.subtractor = None
    1.44 -
    1.45 -    def write_fields(self, docnum, fields):
    1.46 -
    1.47 -        """
    1.48 -        Write for the given 'docnum', a list of 'fields' (integer, string pairs
    1.49 -        representing field identifiers and values respectively).
    1.50 -        """
    1.51 -
    1.52 -        # Find the size of document number values.
    1.53 -
    1.54 -        if self.last_docnum is not None:
    1.55 -            docnum_seq = self.subtractor(docnum, self.last_docnum)
    1.56 -        else:
    1.57 -            self.subtractor = get_subtractor(docnum)
    1.58 -            docnum_seq = docnum
    1.59 -
    1.60 -        # Write the document number.
    1.61 -
    1.62 -        self.write_sequence_value(docnum_seq, self.docnum_size)
    1.63 -
    1.64 -        # Write the number of fields.
    1.65 -
    1.66 -        self.write_number(len(fields))
    1.67 -
    1.68 -        # Write the fields themselves.
    1.69 -
    1.70 -        for i, field in fields:
    1.71 -            self.write_number(i)
    1.72 -            self.write_string(field, 1) # compress
    1.73 -
    1.74 -        self.last_docnum = docnum
    1.75 -
    1.76 -class FieldReader(FileReader):
    1.77 -
    1.78 -    "Reading field data from files."
    1.79 -
    1.80 -    def begin(self):
    1.81 -        self.begin_record()
    1.82 -        try:
    1.83 -            self.docnum_size = self.read_number()
    1.84 -        except EOFError:
    1.85 -            self.docnum_size = 0 # NOTE: No fields!
    1.86 -        self.data_start = self.tell()
    1.87 -
    1.88 -    def reset(self):
    1.89 -        self.last_docnum = None
    1.90 -        self.adder = None
    1.91 -        self.begin_record()
    1.92 -
    1.93 -    def read_fields(self):
    1.94 -
    1.95 -        """
    1.96 -        Read fields from the file, returning a tuple containing the document
    1.97 -        number and a list of field (identifier, value) pairs.
    1.98 -        """
    1.99 -
   1.100 -        # Read the document number.
   1.101 -
   1.102 -        docnum = self.read_sequence_value(self.docnum_size)
   1.103 -
   1.104 -        if self.last_docnum is not None:
   1.105 -            self.last_docnum = self.adder(docnum, self.last_docnum)
   1.106 -        else:
   1.107 -            self.adder = get_adder(docnum)
   1.108 -            self.last_docnum = docnum
   1.109 -
   1.110 -        # Read the number of fields.
   1.111 -
   1.112 -        nfields = self.read_number()
   1.113 -
   1.114 -        # Collect the fields.
   1.115 -
   1.116 -        fields = []
   1.117 -        i = 0
   1.118 -
   1.119 -        while i < nfields:
   1.120 -            identifier = self.read_number()
   1.121 -            value = self.read_string(1) # decompress
   1.122 -            fields.append((identifier, value))
   1.123 -            i += 1
   1.124 -
   1.125 -        return self.last_docnum, fields
   1.126 -
   1.127 -    def read_document_fields(self, docnum, offset):
   1.128 -
   1.129 -        """
   1.130 -        Read fields for 'docnum' at the given 'offset'. This permits the
   1.131 -        retrieval of details for the specified document, as well as scanning for
   1.132 -        later documents.
   1.133 -        """
   1.134 -
   1.135 -        self.seek(offset)
   1.136 -        bad_docnum, fields = self.read_fields()
   1.137 -        self.last_docnum = docnum
   1.138 -        return docnum, fields
   1.139 -
   1.140 -class FieldIndexWriter(FieldWriter):
   1.141 -
   1.142 -    "Writing field index details to files."
   1.143 -
   1.144 -    def reset(self):
   1.145 -        FieldWriter.reset(self)
   1.146 -        self.last_offset = 0
   1.147 -
   1.148 -    def write_document(self, docnum, offset):
   1.149 -
   1.150 -        """
   1.151 -        Write for the given 'docnum', the 'offset' at which the fields for the
   1.152 -        document are stored in the fields file.
   1.153 -        """
   1.154 -
   1.155 -        # Find the size of document number values.
   1.156 -
   1.157 -        if self.last_docnum is not None:
   1.158 -            docnum_seq = self.subtractor(docnum, self.last_docnum)
   1.159 -        else:
   1.160 -            self.subtractor = get_subtractor(docnum)
   1.161 -            docnum_seq = docnum
   1.162 -
   1.163 -        # Write the document number.
   1.164 -
   1.165 -        self.write_sequence_value(docnum_seq, self.docnum_size)
   1.166 -
   1.167 -        # Write the offset delta.
   1.168 -
   1.169 -        self.write_number(offset - self.last_offset)
   1.170 -
   1.171 -        self.last_docnum = docnum
   1.172 -        self.last_offset = offset
   1.173 -
   1.174 -class FieldIndexReader(FieldReader):
   1.175 -
   1.176 -    "Reading field index details from files."
   1.177 -
   1.178 -    def reset(self):
   1.179 -        FieldReader.reset(self)
   1.180 -        self.last_offset = 0
   1.181 -
   1.182 -    def read_document(self):
   1.183 -
   1.184 -        "Read a document number and field file offset."
   1.185 -
   1.186 -        # Read the document number.
   1.187 -
   1.188 -        docnum = self.read_sequence_value(self.docnum_size)
   1.189 -
   1.190 -        if self.last_docnum is not None:
   1.191 -            self.last_docnum = self.adder(docnum, self.last_docnum)
   1.192 -        else:
   1.193 -            self.adder = get_adder(docnum)
   1.194 -            self.last_docnum = docnum
   1.195 -
   1.196 -        # Read the offset.
   1.197 -
   1.198 -        self.last_offset += self.read_number()
   1.199 -
   1.200 -        return self.last_docnum, self.last_offset
   1.201 -
   1.202 -class FieldDictionaryWriter:
   1.203 -
   1.204 -    "Writing field dictionary details."
   1.205 -
   1.206 -    def __init__(self, field_writer, field_index_writer, interval):
   1.207 -        self.field_writer = field_writer
   1.208 -        self.field_index_writer = field_index_writer
   1.209 -        self.interval = interval
   1.210 -        self.entry = 0
   1.211 -
   1.212 -    def write_fields(self, docnum, fields):
   1.213 -
   1.214 -        "Write details of the given 'docnum' and 'fields'."
   1.215 -
   1.216 -        if self.entry == 0:
   1.217 -            docnum_size = sizeof(docnum)
   1.218 -            self.field_writer.begin(docnum_size)
   1.219 -            self.field_index_writer.begin(docnum_size)
   1.220 -            self.field_index_writer.reset()
   1.221 -
   1.222 -        if self.entry % self.interval == 0:
   1.223 -            self.field_writer.reset()
   1.224 -            offset = self.field_writer.tell()
   1.225 -            self.field_writer.write_fields(docnum, fields)
   1.226 -            self.field_index_writer.write_document(docnum, offset)
   1.227 -        else:
   1.228 -            self.field_writer.write_fields(docnum, fields)
   1.229 -
   1.230 -        self.entry += 1
   1.231 -
   1.232 -    def close(self):
   1.233 -        self.field_writer.close()
   1.234 -        self.field_index_writer.close()
   1.235 -
   1.236 -class FieldDictionaryReader:
   1.237 -
   1.238 -    "Reading field dictionary details."
   1.239 -
   1.240 -    def __init__(self, field_reader, field_index_reader):
   1.241 -        self.field_reader = field_reader
   1.242 -        self.field_index_reader = field_index_reader
   1.243 -
   1.244 -        self.field_reader.reset()
   1.245 -        self.field_index_reader.reset()
   1.246 -
   1.247 -        self.cache = {}
   1.248 -
   1.249 -        self.entry = 0
   1.250 -        self.docs = []
   1.251 -        try:
   1.252 -            while 1:
   1.253 -                self.docs.append(self.field_index_reader.read_document())
   1.254 -        except EOFError:
   1.255 -            pass
   1.256 -
   1.257 -        # Large numbers for ordering purposes.
   1.258 -
   1.259 -        if self.docs:
   1.260 -            self.max_offset = self.docs[-1][1]
   1.261 -        else:
   1.262 -            self.max_offset = None
   1.263 -
   1.264 -    # Iterator convenience methods.
   1.265 -
   1.266 -    def __iter__(self):
   1.267 -        self.rewind()
   1.268 -        return self
   1.269 -
   1.270 -    def next(self):
   1.271 -        try:
   1.272 -            return self.read_fields()
   1.273 -        except EOFError:
   1.274 -            raise StopIteration
   1.275 -
   1.276 -    # Sequential access methods.
   1.277 -
   1.278 -    def rewind(self):
   1.279 -        self.field_reader.rewind()
   1.280 -
   1.281 -    def read_fields(self):
   1.282 -
   1.283 -        "Return the next document number and fields."
   1.284 -
   1.285 -        try:
   1.286 -            return self.field_reader.read_fields()
   1.287 -        except EOFError:
   1.288 -            self.entry += 1
   1.289 -            try:
   1.290 -                found_docnum, offset = self.docs[self.entry]
   1.291 -            except IndexError:
   1.292 -                raise EOFError
   1.293 -            else:
   1.294 -                self.field_reader.reset()
   1.295 -                return self.field_reader.read_fields()
   1.296 -
   1.297 -    # Random access methods.
   1.298 -
   1.299 -    def get_fields(self, docnum):
   1.300 -
   1.301 -        "Read the fields of the document with the given 'docnum'."
   1.302 -
   1.303 -        if self.cache.has_key(docnum):
   1.304 -            return self.cache[docnum]
   1.305 -
   1.306 -        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
   1.307 -
   1.308 -        # Get the entry position providing the term or one preceding it.
   1.309 -
   1.310 -        if i == -1:
   1.311 -            return None
   1.312 -
   1.313 -        found_docnum, offset = self.docs[i]
   1.314 -
   1.315 -        # Read from the fields file.
   1.316 -
   1.317 -        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
   1.318 -
   1.319 -        # Scan for the document, if necessary.
   1.320 -
   1.321 -        try:
   1.322 -            while docnum > found_docnum:
   1.323 -                found_docnum, fields = self.field_reader.read_fields()
   1.324 -        except EOFError:
   1.325 -            pass
   1.326 -
   1.327 -        # If the document is found, return the fields.
   1.328 -
   1.329 -        if docnum == found_docnum:
   1.330 -
   1.331 -            # Store the fields in the cache, removing entries if the limit has
   1.332 -            # been reached.
   1.333 -
   1.334 -            keys = self.cache.keys()
   1.335 -
   1.336 -            if len(keys) == DOCUMENT_CACHE_LIMIT:
   1.337 -                del self.cache[keys[0]]
   1.338 -
   1.339 -            self.cache[docnum] = fields
   1.340 -            return fields
   1.341 -        else:
   1.342 -            return None
   1.343 -
   1.344 -    def close(self):
   1.345 -        self.field_reader.close()
   1.346 -        self.field_index_reader.close()
   1.347 -
   1.348 -# vim: tabstop=4 expandtab shiftwidth=4