1.1 --- a/iixr/fields.py Sat Feb 12 01:23:58 2011 +0100
1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
1.3 @@ -1,345 +0,0 @@
1.4 -#!/usr/bin/env python
1.5 -
1.6 -"""
1.7 -Specific classes for storing document information.
1.8 -
1.9 -Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
1.10 -
1.11 -This program is free software; you can redistribute it and/or modify it under
1.12 -the terms of the GNU General Public License as published by the Free Software
1.13 -Foundation; either version 3 of the License, or (at your option) any later
1.14 -version.
1.15 -
1.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 -
1.20 -You should have received a copy of the GNU General Public License along
1.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 -"""
1.23 -
1.24 -from iixr.data import *
1.25 -from iixr.files import *
1.26 -from bisect import bisect_right # to find terms in the dictionary index
1.27 -
# Upper bound on the number of documents whose fields are retained in the
# cache maintained by FieldDictionaryReader.get_fields.

DOCUMENT_CACHE_LIMIT = 10000
1.29 -
class FieldWriter(FileWriter):

    "Writing field data to files."

    def begin(self, docnum_size):

        """
        Write 'docnum_size' as a header record, then remember both the size and
        the file position at which the field data itself starts.
        """

        self.write_number(docnum_size)
        self.end_record()
        self.docnum_size = docnum_size
        self.data_start = self.tell()

    def reset(self):

        """
        End the current record and forget the previous document number, so that
        the next document number is written as given rather than as a
        difference.
        """

        self.end_record()
        self.last_docnum = self.subtractor = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        """

        previous = self.last_docnum

        # The first document number after a reset is stored as given and is
        # used to select the subtractor; later ones are stored as the result
        # of applying that subtractor to the current and previous numbers.

        if previous is None:
            self.subtractor = get_subtractor(docnum)
            encoded = docnum
        else:
            encoded = self.subtractor(docnum, previous)

        self.write_sequence_value(encoded, self.docnum_size)

        # The field count precedes the (identifier, value) pairs.

        self.write_number(len(fields))

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
1.75 -
class FieldReader(FileReader):

    "Reading field data from files."

    def begin(self):

        """
        Read the header record holding the document number size, then remember
        the file position at which the field data itself starts.
        """

        self.begin_record()

        try:
            size = self.read_number()
        except EOFError:
            size = 0 # NOTE: No fields!

        self.docnum_size = size
        self.data_start = self.tell()

    def reset(self):

        """
        Forget the previous document number, so that the next one read is taken
        as given, and begin a new record.
        """

        self.last_docnum = self.adder = None
        self.begin_record()

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        stored = self.read_sequence_value(self.docnum_size)

        # The first value after a reset is the document number itself and is
        # used to select the adder; later values are combined with the
        # previous document number using that adder.

        if self.last_docnum is None:
            self.adder = get_adder(stored)
            self.last_docnum = stored
        else:
            self.last_docnum = self.adder(stored, self.last_docnum)

        # The field count precedes the (identifier, value) pairs; each
        # identifier is read before its value.

        count = self.read_number()
        fields = [(self.read_number(), self.read_string(1)) # decompress
                  for _ in range(count)]

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        """

        self.seek(offset)

        # The document number produced by the read is discarded: the caller's
        # 'docnum' is installed instead so that subsequent sequential reads
        # continue from it.

        fields = self.read_fields()[1]
        self.last_docnum = docnum
        return docnum, fields
1.139 -
class FieldIndexWriter(FieldWriter):

    "Writing field index details to files."

    def reset(self):

        "Reset the inherited writer state and restart offset deltas from zero."

        FieldWriter.reset(self)
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        previous = self.last_docnum

        # The first document number after a reset is stored as given and is
        # used to select the subtractor; later ones are stored as the result
        # of applying that subtractor to the current and previous numbers.

        if previous is None:
            self.subtractor = get_subtractor(docnum)
            encoded = docnum
        else:
            encoded = self.subtractor(docnum, previous)

        self.write_sequence_value(encoded, self.docnum_size)

        # Offsets are stored relative to the previously written offset.

        offset_delta = offset - self.last_offset
        self.write_number(offset_delta)

        self.last_docnum, self.last_offset = docnum, offset
1.173 -
class FieldIndexReader(FieldReader):

    "Reading field index details from files."

    def reset(self):

        "Reset the inherited reader state and restart offsets from zero."

        FieldReader.reset(self)
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset."

        stored = self.read_sequence_value(self.docnum_size)

        # The first value after a reset is the document number itself and is
        # used to select the adder; later values are combined with the
        # previous document number using that adder.

        if self.last_docnum is None:
            self.adder = get_adder(stored)
            self.last_docnum = stored
        else:
            self.last_docnum = self.adder(stored, self.last_docnum)

        # Offsets are stored as deltas, so accumulate them.

        self.last_offset = self.last_offset + self.read_number()

        return self.last_docnum, self.last_offset
1.201 -
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with a 'field_writer' receiving the fields of
        every document, a 'field_index_writer' receiving the position of every
        'interval'th document, and the 'interval' itself.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the given 'docnum' and 'fields'."

        # The first document decides the document number size recorded in the
        # headers of both files.

        if self.entry == 0:
            docnum_size = sizeof(docnum)
            self.field_writer.begin(docnum_size)
            self.field_index_writer.begin(docnum_size)
            self.field_index_writer.reset()

        # Every 'interval'th document starts a new record in the fields file
        # and has the position of that record noted in the index.

        if self.entry % self.interval == 0:
            self.field_writer.reset()
            offset = self.field_writer.tell()
            self.field_writer.write_fields(docnum, fields)
            self.field_index_writer.write_document(docnum, offset)
        else:
            self.field_writer.write_fields(docnum, fields)

        self.entry += 1

    def close(self):

        """
        Close both underlying writers. The index writer is closed even if
        closing the field writer raises an exception, so that a failure in the
        first does not leave the second open.
        """

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()
1.235 -
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with a 'field_reader' providing the fields of
        documents and a 'field_index_reader' providing (document number,
        offset) entries. All index entries are read immediately and kept in
        memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.field_reader.reset()
        self.field_index_reader.reset()

        # Fields of recently requested documents, keyed by document number.

        self.cache = {}

        # The position in 'docs' of the record being read sequentially.

        self.entry = 0

        # Load the (document number, offset) index entries until the index
        # reader signals the end of its data.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes: pairing a document number with
        # the greatest offset makes a search key sort after any index entry
        # having the same document number.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind and return this reader as an iterator over documents."

        self.rewind()
        return self

    def next(self):

        """
        Return the next (document number, fields) tuple, raising StopIteration
        when no more documents are available.
        """

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol of Python 3 as well as that of Python 2.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Restart sequential reading from the first document."

        self.field_reader.rewind()

        # Without resetting the record position, a second pass would consider
        # the index exhausted as soon as the first record had been read.

        self.entry = 0

    def read_fields(self):

        "Return the next document number and fields."

        try:
            return self.field_reader.read_fields()
        except EOFError:

            # Move on to the next indexed record, if there is one.

            self.entry += 1
            if self.entry >= len(self.docs):
                raise EOFError

            self.field_reader.reset()
            return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning None
        if no such document is found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Get the entry position providing the document or one preceding it.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached. The entry to remove is obtained without copying all of
        # the keys.

        if self.cache and len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        """
        Close both underlying readers. The index reader is closed even if
        closing the field reader raises an exception.
        """

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()
1.347 -
1.348 -# vim: tabstop=4 expandtab shiftwidth=4