1.1 --- a/iixr/data.py Thu Feb 03 01:26:35 2011 +0100
1.2 +++ b/iixr/data.py Mon Feb 07 02:05:38 2011 +0100
1.3 @@ -1,7 +1,7 @@
1.4 #!/usr/bin/env python
1.5
1.6 """
1.7 -Variable-length integer functions.
1.8 +Data representation functions.
1.9
1.10 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
1.11
1.12 @@ -19,9 +19,76 @@
1.13 """
1.14
1.15 from array import array
1.16 +import operator
1.17
1.18 -vint_cache = {}
1.19 -vint_bytes_cache = {}
1.20 +# High-level representations.
1.21 +
1.22 +def convert_sequence(values, op):
1.23 + if values:
1.24 + new_values = list(values)
1.25 + last = new_values[0]
1.26 + i = 1
1.27 + length = len(new_values)
1.28 + while i < length:
1.29 + current = new_values[i]
1.30 + new_values[i] = op(new_values[i], last)
1.31 + last = current
1.32 + i += 1
1.33 +
1.34 +def add_seq_monotonic(x, y):
1.35 + return op_seq_monotonic(x, y, operator.add)
1.36 +
1.37 +def sub_seq_monotonic(x, y):
1.38 + return op_seq_monotonic(x, y, operator.sub)
1.39 +
1.40 +def op_seq_monotonic(x, y, op):
1.41 + return tuple([op(a, b) for a, b in zip(x, y)])
1.42 +
1.43 +def add_seq(x, y):
1.44 + length = min(len(x), len(y))
1.45 + seq = list(x)[:length]
1.46 + i = 0
1.47 + while i < length:
1.48 + if x[i] != 0:
1.49 + seq[i] = x[i] + y[i]
1.50 + break
1.51 + seq[i] = y[i]
1.52 + i += 1
1.53 + return tuple(seq)
1.54 +
1.55 +def sub_seq(x, y):
1.56 + length = min(len(x), len(y))
1.57 + seq = list(x)[:length]
1.58 + i = 0
1.59 + while i < length:
1.60 + replacement = x[i] - y[i]
1.61 + if replacement != 0:
1.62 + seq[i] = replacement
1.63 + break
1.64 + seq[i] = 0
1.65 + i += 1
1.66 + return tuple(seq)
1.67 +
1.68 +def is_sequence(value):
1.69 + return isinstance(value, (list, tuple))
1.70 +
1.71 +def get_monotonic_adder(value):
1.72 + return is_sequence(value) and add_seq_monotonic or operator.add
1.73 +
1.74 +def get_monotonic_subtractor(value):
1.75 + return is_sequence(value) and sub_seq_monotonic or operator.sub
1.76 +
1.77 +def get_adder(value):
1.78 + return is_sequence(value) and add_seq or operator.add
1.79 +
1.80 +def get_subtractor(value):
1.81 + return is_sequence(value) and sub_seq or operator.sub
1.82 +
1.83 +# Low-level representations.
1.84 +# Variable-length integer functions.
1.85 +
1.86 +vint_cache = []
1.87 +vint_bytes_cache = []
1.88
1.89 def vint(number):
1.90
1.91 @@ -29,7 +96,7 @@
1.92
1.93 try:
1.94 return vint_cache[number]
1.95 - except KeyError:
1.96 + except IndexError:
1.97 if number >= 0:
1.98 bytes = array('B')
1.99 _vint_to_array(number, bytes)
1.100 @@ -46,7 +113,7 @@
1.101
1.102 try:
1.103 bytes += vint_bytes_cache[number]
1.104 - except KeyError:
1.105 + except IndexError:
1.106 if number >= 0:
1.107 _vint_to_array(number, bytes)
1.108
1.109 @@ -75,6 +142,28 @@
1.110 number += bytes.pop() & 127
1.111 return number
1.112
1.113 +def vint_from_array_start(bytes, start):
1.114 +
1.115 + """
1.116 + Read a variable-length integer from 'bytes', starting at 'start', and
1.117 + returning a tuple containing a number and the first position after the
1.118 + number.
1.119 + """
1.120 +
1.121 + number = 0
1.122 + length = len(bytes)
1.123 + digit = 0
1.124 + while start < length:
1.125 + x = bytes[start]
1.126 + number += (x & 127) << digit
1.127 + digit += 7
1.128 + start += 1
1.129 + if not (x & 128):
1.130 + break
1.131 + return number, start
1.132 +
1.133 +# String serialisation.
1.134 +
1.135 def string_to_array(s, bytes):
1.136
1.137 "Write the given string 's' to 'bytes'."
1.138 @@ -82,10 +171,45 @@
1.139 vint_to_array(len(s), bytes)
1.140 bytes.fromstring(s.encode("utf-8"))
1.141
1.142 +# Sequence serialisation.
1.143 +
1.144 +def sequence_to_array(value, bytes):
1.145 +
1.146 + "Write the given sequence 'value' to 'bytes'."
1.147 +
1.148 + size = is_sequence(value) and len(value) or 0
1.149 + vint_to_array(size, bytes)
1.150 + if size:
1.151 + for a in value:
1.152 + vint_to_array(a, bytes)
1.153 + else:
1.154 + vint_to_array(value, bytes)
1.155 +
1.156 +def sequence_from_array(bytes, start=0):
1.157 +
1.158 + """
1.159 + Read a sequence from 'bytes', returning the sequence and the first position
1.160 + after the sequence.
1.161 + """
1.162 +
1.163 + size, start = vint_from_array_start(bytes, start)
1.164 + if size:
1.165 + j = 0
1.166 + value = []
1.167 + while j < size:
1.168 + v, start = vint_from_array_start(bytes, start)
1.169 + value.append(v)
1.170 + j += 1
1.171 + return tuple(value), start
1.172 + else:
1.173 + return vint_from_array_start(bytes, start)
1.174 +
1.175 +# Variable-length integer cache initialisation.
1.176 +
1.177 for i in xrange(0, 65536):
1.178 bytes = array('B')
1.179 _vint_to_array(i, bytes)
1.180 - vint_bytes_cache[i] = bytes
1.181 - vint_cache[i] = bytes.tostring()
1.182 + vint_bytes_cache.append(bytes)
1.183 + vint_cache.append(bytes.tostring())
1.184
1.185 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/iixr/fields.py Thu Feb 03 01:26:35 2011 +0100
2.2 +++ b/iixr/fields.py Mon Feb 07 02:05:38 2011 +0100
2.3 @@ -3,7 +3,7 @@
2.4 """
2.5 Specific classes for storing document information.
2.6
2.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
2.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
2.9
2.10 This program is free software; you can redistribute it and/or modify it under
2.11 the terms of the GNU General Public License as published by the Free Software
2.12 @@ -18,6 +18,7 @@
2.13 with this program. If not, see <http://www.gnu.org/licenses/>.
2.14 """
2.15
2.16 +from iixr.data import *
2.17 from iixr.files import *
2.18 from bisect import bisect_right # to find terms in the dictionary index
2.19
2.20 @@ -29,7 +30,7 @@
2.21
2.22 def reset(self):
2.23 self.last_docnum = None
2.24 - self.docnum_size = None
2.25 + self.subtractor = None
2.26
2.27 def write_fields(self, docnum, fields):
2.28
2.29 @@ -40,15 +41,17 @@
2.30
2.31 # Find the size of document number values.
2.32
2.33 - if self.docnum_size is None:
2.34 - self.docnum_size = self.get_value_size(docnum)
2.35 - self.last_docnum = self.get_initial_value(self.docnum_size)
2.36 + if self.last_docnum is not None:
2.37 + docnum_seq = self.subtractor(docnum, self.last_docnum)
2.38 + else:
2.39 + self.subtractor = get_subtractor(docnum)
2.40 + docnum_seq = docnum
2.41
2.42 - # Write the number of values per document number.
2.43 - # Write the document number delta.
2.44 + self.begin_record()
2.45
2.46 - self.write_number(self.docnum_size)
2.47 - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
2.48 + # Write the document number.
2.49 +
2.50 + self.write_sequence_value(docnum_seq)
2.51
2.52 # Write the number of fields.
2.53
2.54 @@ -60,12 +63,17 @@
2.55 self.write_number(i)
2.56 self.write_string(field, 1) # compress
2.57
2.58 + self.end_record()
2.59 +
2.60 + self.last_docnum = docnum
2.61 +
2.62 class FieldReader(FileReader):
2.63
2.64 "Reading field data from files."
2.65
2.66 def reset(self):
2.67 self.last_docnum = None
2.68 + self.adder = None
2.69
2.70 def read_fields(self):
2.71
2.72 @@ -74,16 +82,17 @@
2.73 number and a list of field (identifier, value) pairs.
2.74 """
2.75
2.76 - # Read the number of values per document number.
2.77 + self.begin_record()
2.78
2.79 - docnum_size = self.read_number()
2.80 + # Read the document number.
2.81 +
2.82 + docnum = self.read_sequence_value()
2.83
2.84 - if self.last_docnum is None:
2.85 - self.last_docnum = self.get_initial_value(docnum_size)
2.86 -
2.87 - # Read the document number delta and add it to the last number.
2.88 -
2.89 - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
2.90 + if self.last_docnum is not None:
2.91 + self.last_docnum = self.adder(docnum, self.last_docnum)
2.92 + else:
2.93 + self.adder = get_adder(docnum)
2.94 + self.last_docnum = docnum
2.95
2.96 # Read the number of fields.
2.97
2.98 @@ -100,6 +109,8 @@
2.99 fields.append((identifier, value))
2.100 i += 1
2.101
2.102 + self.end_record()
2.103 +
2.104 return self.last_docnum, fields
2.105
2.106 def read_document_fields(self, docnum, offset):
2.107 @@ -121,7 +132,7 @@
2.108
2.109 def reset(self):
2.110 self.last_docnum = None
2.111 - self.docnum_size = None
2.112 + self.subtractor = None
2.113 self.last_offset = 0
2.114
2.115 def write_document(self, docnum, offset):
2.116 @@ -133,19 +144,24 @@
2.117
2.118 # Find the size of document number values.
2.119
2.120 - if self.docnum_size is None:
2.121 - self.docnum_size = self.get_value_size(docnum)
2.122 - self.last_docnum = self.get_initial_value(self.docnum_size)
2.123 + if self.last_docnum is not None:
2.124 + docnum_seq = self.subtractor(docnum, self.last_docnum)
2.125 + else:
2.126 + self.subtractor = get_subtractor(docnum)
2.127 + docnum_seq = docnum
2.128
2.129 - # Write the number of values per document number.
2.130 - # Write the document number delta.
2.131 + self.begin_record()
2.132
2.133 - self.write_number(self.docnum_size)
2.134 - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
2.135 + # Write the document number.
2.136 +
2.137 + self.write_sequence_value(docnum_seq)
2.138
2.139 # Write the offset delta.
2.140
2.141 self.write_number(offset - self.last_offset)
2.142 + self.end_record()
2.143 +
2.144 + self.last_docnum = docnum
2.145 self.last_offset = offset
2.146
2.147 class FieldIndexReader(FileReader):
2.148 @@ -154,26 +170,29 @@
2.149
2.150 def reset(self):
2.151 self.last_docnum = None
2.152 + self.adder = None
2.153 self.last_offset = 0
2.154
2.155 def read_document(self):
2.156
2.157 "Read a document number and field file offset."
2.158
2.159 - # Read the number of values per document number.
2.160 + self.begin_record()
2.161
2.162 - docnum_size = self.read_number()
2.163 + # Read the document number.
2.164 +
2.165 + docnum = self.read_sequence_value()
2.166
2.167 - if self.last_docnum is None:
2.168 - self.last_docnum = self.get_initial_value(docnum_size)
2.169 -
2.170 - # Read the document number delta and add it to the last number.
2.171 -
2.172 - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
2.173 + if self.last_docnum is not None:
2.174 + self.last_docnum = self.adder(docnum, self.last_docnum)
2.175 + else:
2.176 + self.adder = get_adder(docnum)
2.177 + self.last_docnum = docnum
2.178
2.179 # Read the offset.
2.180
2.181 self.last_offset += self.read_number()
2.182 + self.end_record()
2.183
2.184 return self.last_docnum, self.last_offset
2.185
3.1 --- a/iixr/files.py Thu Feb 03 01:26:35 2011 +0100
3.2 +++ b/iixr/files.py Mon Feb 07 02:05:38 2011 +0100
3.3 @@ -18,7 +18,7 @@
3.4 with this program. If not, see <http://www.gnu.org/licenses/>.
3.5 """
3.6
3.7 -from iixr.data import vint, vint_to_array, vint_from_array
3.8 +from iixr.data import *
3.9 from array import array
3.10 import zlib
3.11
3.12 @@ -30,7 +30,8 @@
3.13
3.14 def __init__(self, f):
3.15 self.f = f
3.16 - self.data = array('B')
3.17 + self.data = array('B') # master buffer
3.18 + self.record = array('B') # record buffer
3.19 self.reset()
3.20
3.21 def reset(self):
3.22 @@ -47,29 +48,11 @@
3.23 self.f.seek(0)
3.24 self.reset()
3.25
3.26 - def flush(self):
3.27 - if self.f is not None:
3.28 - self.data.tofile(self.f)
3.29 - self.data = array('B')
3.30 -
3.31 def close(self):
3.32 if self.f is not None:
3.33 - self.data.tofile(self.f)
3.34 self.f.close()
3.35 self.f = None
3.36
3.37 - def get_value_size(self, value):
3.38 - if isinstance(value, (list, tuple)):
3.39 - return len(value)
3.40 - else:
3.41 - return 0
3.42 -
3.43 - def get_initial_value(self, size):
3.44 - if size:
3.45 - return [0] * size
3.46 - else:
3.47 - return 0
3.48 -
3.49 class FileWriter(File):
3.50
3.51 "Writing basic data types to files."
3.52 @@ -77,18 +60,26 @@
3.53 def tell(self):
3.54 return self.f.tell() + len(self.data)
3.55
3.56 + def begin_record(self):
3.57 + pass
3.58 +
3.59 + def end_record(self):
3.60 + vint_to_array(len(self.record), self.data)
3.61 + self.data += self.record
3.62 + self.record = array('B')
3.63 +
3.64 def write_number(self, number):
3.65
3.66 "Write 'number' to the file using a variable length encoding."
3.67
3.68 - vint_to_array(number, self.data)
3.69 + vint_to_array(number, self.record)
3.70
3.71 def write_numbers(self, numbers):
3.72
3.73 "Write 'numbers' to the file using a variable length encoding."
3.74
3.75 for number in numbers:
3.76 - vint_to_array(number, self.data)
3.77 + vint_to_array(number, self.record)
3.78
3.79 def write_string(self, s, compress=0):
3.80
3.81 @@ -121,120 +112,122 @@
3.82 # Write the length of the data before the data itself.
3.83
3.84 length = len(s)
3.85 - self.data.fromstring("".join([flag, vint(length), s]))
3.86 + self.record.fromstring("".join([flag, vint(length), s]))
3.87 +
3.88 + def write_sequence_value(self, value):
3.89 + sequence_to_array(value, self.record)
3.90 +
3.91 + def write_sequence_values(self, values):
3.92 + vint_to_array(len(values), self.record)
3.93 + for value in values:
3.94 + self.write_sequence_value(value)
3.95
3.96 - def write_sequence(self, value, last, size, monotonic=1):
3.97 - if size:
3.98 - emit_delta = 1
3.99 - for v, l in map(None, value, last)[:size]:
3.100 - if v is None:
3.101 - v = l
3.102 - if monotonic or emit_delta:
3.103 - v_out = v - l
3.104 - if emit_delta and v_out != 0:
3.105 - emit_delta = 0
3.106 - else:
3.107 - v_out = v + 1
3.108 - vint_to_array(v_out, self.data)
3.109 - else:
3.110 - vint_to_array(value - last, self.data)
3.111 + def write_delta_sequence(self, values):
3.112 + convert_sequence(values, get_subtractor(values[0]))
3.113 + self.write_sequence_values(values)
3.114 +
3.115 + def write_monotonic_sequence(self, values):
3.116 + convert_sequence(values, get_monotonic_subtractor(values[0]))
3.117 + self.write_sequence_values(values)
3.118
3.119 - return value
3.120 + def flush(self):
3.121 + if self.f is not None:
3.122 + self.data.tofile(self.f)
3.123 + self.data = array('B')
3.124 +
3.125 + def close(self):
3.126 + self.flush()
3.127 + File.close(self)
3.128
3.129 class FileReader(File):
3.130
3.131 "Reading basic data types from files."
3.132
3.133 - def read_number(self):
3.134 + def begin_record(self):
3.135 + size = self.read_number_from_file()
3.136 + self.record.fromfile(self.f, size)
3.137 + self.start = 0
3.138 +
3.139 + def end_record(self):
3.140 + self.record = array('B')
3.141 +
3.142 + def read_number_from_file(self):
3.143
3.144 "Read a number from the file."
3.145
3.146 # Read each byte, adding it to the number.
3.147
3.148 f = self.f
3.149 - a = self.data
3.150 + a = array('B')
3.151 fromfile = a.fromfile
3.152
3.153 - try:
3.154 - fromfile(f, 1)
3.155 - csd = a[-1]
3.156 - if csd < 128:
3.157 - return csd
3.158 - else:
3.159 - while csd & 128:
3.160 - fromfile(f, 1)
3.161 - csd = a[-1]
3.162 - return vint_from_array(self.data)
3.163 - finally:
3.164 - self.data = array('B')
3.165 + fromfile(f, 1)
3.166 + csd = a[-1]
3.167 + if csd < 128:
3.168 + return csd
3.169 + else:
3.170 + while csd & 128:
3.171 + fromfile(f, 1)
3.172 + csd = a[-1]
3.173 + return vint_from_array(a)
3.174 +
3.175 + def read_number(self):
3.176 +
3.177 + "Read a number from the current record."
3.178 +
3.179 + n, self.start = vint_from_array_start(self.record, self.start)
3.180 + return n
3.181
3.182 def read_string(self, decompress=0):
3.183
3.184 """
3.185 - Read a string from the file, decompressing the stored data if
3.186 + Read a string from the current record, decompressing the stored data if
3.187 'decompress' is set to a true value.
3.188 """
3.189
3.190 # Decompress the data if requested.
3.191
3.192 if decompress:
3.193 - flag = self.f.read(1)
3.194 + flag = chr(self.record[self.start])
3.195 + self.start += 1
3.196 else:
3.197 flag = "-"
3.198
3.199 length = self.read_number()
3.200 + start = self.start
3.201 + self.start += length
3.202 + s = self.record[start:self.start].tostring()
3.203
3.204 - try:
3.205 - self.data.fromfile(self.f, length)
3.206 - s = self.data.tostring()
3.207 -
3.208 - # Perform decompression if applicable.
3.209 + # Perform decompression if applicable.
3.210
3.211 - if flag == "z":
3.212 - s = zlib.decompress(s)
3.213 + if flag == "z":
3.214 + s = zlib.decompress(s)
3.215
3.216 - # Convert strings to Unicode objects.
3.217 + # Convert strings to Unicode objects.
3.218
3.219 - return unicode(s, "utf-8")
3.220 + return unicode(s, "utf-8")
3.221
3.222 - finally:
3.223 - self.data = array('B')
3.224 + def read_sequence_value(self):
3.225 + value, self.start = sequence_from_array(self.record, self.start)
3.226 + return value
3.227
3.228 - def read_sequence(self, last, size, monotonic=1):
3.229 - if size:
3.230 - value = []
3.231 - if monotonic:
3.232 - for v in last:
3.233 - v_in = self.read_number()
3.234 - value.append(v + v_in)
3.235 - else:
3.236 - i = 0
3.237 - n = len(last)
3.238 - value = list(last)
3.239 -
3.240 - # Traverse a copy of the last value.
3.241 -
3.242 - while i < n:
3.243 - v_in = self.read_number()
3.244 + def read_sequences(self):
3.245 + values = []
3.246 + length = self.read_number()
3.247 + i = 0
3.248 + while i < length:
3.249 + values.append(self.read_sequence_value())
3.250 + i += 1
3.251 + return values
3.252
3.253 - # While zeros are read, retain the last value elements.
3.254 - # Otherwise, add the delta...
3.255 -
3.256 - if v_in != 0:
3.257 - value[i] += v_in
3.258 - i += 1
3.259 -
3.260 - # Then set absolute values for the remaining elements.
3.261 + def read_delta_sequence(self):
3.262 + values = self.read_sequences()
3.263 + convert_sequence(values, get_adder(values[0]))
3.264 + return values
3.265
3.266 - while i < n:
3.267 - value[i] = self.read_number() - 1
3.268 - i += 1
3.269 - break
3.270 -
3.271 - i += 1
3.272 -
3.273 - return tuple(value)
3.274 - else:
3.275 - return last + self.read_number()
3.276 + def read_monotonic_sequence(self):
3.277 + values = self.read_sequences()
3.278 + convert_sequence(values, get_monotonic_adder(values[0]))
3.279 + return values
3.280
3.281 # vim: tabstop=4 expandtab shiftwidth=4
4.1 --- a/iixr/positions.py Thu Feb 03 01:26:35 2011 +0100
4.2 +++ b/iixr/positions.py Mon Feb 07 02:05:38 2011 +0100
4.3 @@ -3,7 +3,7 @@
4.4 """
4.5 Specific classes for storing position information.
4.6
4.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
4.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
4.9
4.10 This program is free software; you can redistribute it and/or modify it under
4.11 the terms of the GNU General Public License as published by the Free Software
4.12 @@ -18,8 +18,8 @@
4.13 with this program. If not, see <http://www.gnu.org/licenses/>.
4.14 """
4.15
4.16 +from iixr.data import *
4.17 from iixr.files import *
4.18 -from iixr.data import vint, vint_to_array
4.19
4.20 class PositionWriter(FileWriter):
4.21
4.22 @@ -27,7 +27,7 @@
4.23
4.24 def reset(self):
4.25 self.last_docnum = None
4.26 - self.docnum_size = None
4.27 + self.subtractor = None
4.28
4.29 def write_positions(self, docnum, positions):
4.30
4.31 @@ -35,39 +35,31 @@
4.32 Write for the document 'docnum' the given 'positions'.
4.33 """
4.34
4.35 - # Find the size of document number values.
4.36 -
4.37 - if self.docnum_size is None:
4.38 - self.docnum_size = self.get_value_size(docnum)
4.39 - self.last_docnum = self.get_initial_value(self.docnum_size)
4.40 -
4.41 - if docnum < self.last_docnum:
4.42 - raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
4.43 + if not positions:
4.44 + return
4.45
4.46 # Make sure that the positions are sorted.
4.47
4.48 positions.sort()
4.49
4.50 - # Find the size of position values.
4.51 -
4.52 - size = self.get_value_size(positions[0])
4.53 + # Calculate an ongoing delta.
4.54
4.55 - # Write the number of values per document number.
4.56 - # Write the document number delta.
4.57 - # Write the number of positions.
4.58 - # Write the number of values per position.
4.59 + if self.last_docnum is not None:
4.60 + if docnum < self.last_docnum:
4.61 + raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
4.62 +
4.63 + docnum_seq = self.subtractor(docnum, self.last_docnum)
4.64
4.65 - self.write_number(self.docnum_size)
4.66 - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
4.67 - self.write_number(len(positions))
4.68 - self.write_number(size)
4.69 + # Or preserve the document number and prepare for future deltas.
4.70
4.71 - # Write the position deltas.
4.72 + else:
4.73 + self.subtractor = get_subtractor(docnum)
4.74 + docnum_seq = docnum
4.75
4.76 - last = self.get_initial_value(size)
4.77 -
4.78 - for position in positions:
4.79 - last = self.write_sequence(position, last, size)
4.80 + self.begin_record()
4.81 + self.write_sequence_value(docnum_seq)
4.82 + self.write_monotonic_sequence(positions)
4.83 + self.end_record()
4.84
4.85 self.last_docnum = docnum
4.86
4.87 @@ -77,6 +69,7 @@
4.88
4.89 def reset(self):
4.90 self.last_docnum = None
4.91 + self.adder = None
4.92
4.93 def read_positions(self):
4.94
4.95 @@ -84,38 +77,25 @@
4.96 Read positions, returning a document number and a list of positions.
4.97 """
4.98
4.99 - # Read the number of values per document number.
4.100 + self.begin_record()
4.101
4.102 - docnum_size = self.read_number()
4.103 -
4.104 - if self.last_docnum is None:
4.105 - self.last_docnum = self.get_initial_value(docnum_size)
4.106 + # Read the document number.
4.107
4.108 - # Read the document number delta and add it to the last number.
4.109 -
4.110 - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
4.111 + docnum = self.read_sequence_value()
4.112
4.113 - # Read the number of positions.
4.114 -
4.115 - npositions = self.read_number()
4.116 + # Calculate an ongoing delta.
4.117
4.118 - # Read the number of values per position.
4.119 -
4.120 - size = self.read_number()
4.121 + if self.last_docnum is not None:
4.122 + self.last_docnum = self.adder(docnum, self.last_docnum)
4.123
4.124 - # Read the position deltas, adding each previous position to get the
4.125 - # appropriate collection of absolute positions.
4.126 -
4.127 - i = 0
4.128 + # Or preserve the document number and prepare for future deltas.
4.129
4.130 - last = self.get_initial_value(size)
4.131 -
4.132 - positions = []
4.133 + else:
4.134 + self.adder = get_adder(docnum)
4.135 + self.last_docnum = docnum
4.136
4.137 - while i < npositions:
4.138 - last = self.read_sequence(last, size)
4.139 - positions.append(last)
4.140 - i += 1
4.141 + positions = self.read_monotonic_sequence()
4.142 + self.end_record()
4.143
4.144 return self.last_docnum, positions
4.145
4.146 @@ -125,7 +105,7 @@
4.147
4.148 def reset(self):
4.149 self.last_docnum = None
4.150 - self.docnum_size = None
4.151 + self.subtractor = None
4.152 self.last_pos_offset = 0
4.153
4.154 def write_positions(self, docnum, pos_offset, count):
4.155 @@ -137,20 +117,19 @@
4.156
4.157 # Find the size of document number values.
4.158
4.159 - if self.docnum_size is None:
4.160 - self.docnum_size = self.get_value_size(docnum)
4.161 - self.last_docnum = self.get_initial_value(self.docnum_size)
4.162 + if self.last_docnum is not None:
4.163 + docnum_seq = self.subtractor(docnum, self.last_docnum)
4.164 + else:
4.165 + self.subtractor = get_subtractor(docnum)
4.166 + docnum_seq = docnum
4.167
4.168 - # Write the number of values per document number.
4.169 - # Write the document number delta.
4.170 - # Write the position file offset delta.
4.171 - # Write the document count.
4.172 -
4.173 - self.write_number(self.docnum_size)
4.174 - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
4.175 + self.begin_record()
4.176 + self.write_sequence_value(docnum_seq)
4.177 self.write_number(pos_offset - self.last_pos_offset)
4.178 self.write_number(count)
4.179 + self.end_record()
4.180
4.181 + self.last_docnum = docnum
4.182 self.last_pos_offset = pos_offset
4.183
4.184 class PositionIndexReader(FileReader):
4.185 @@ -159,6 +138,7 @@
4.186
4.187 def reset(self):
4.188 self.last_docnum = None
4.189 + self.adder = None
4.190 self.last_pos_offset = 0
4.191
4.192 def read_positions(self):
4.193 @@ -168,16 +148,17 @@
4.194 file, and the number of documents in a section of that file.
4.195 """
4.196
4.197 - # Read the number of values per document number.
4.198 + self.begin_record()
4.199
4.200 - docnum_size = self.read_number()
4.201 + # Read the document number.
4.202 +
4.203 + docnum = self.read_sequence_value()
4.204
4.205 - if self.last_docnum is None:
4.206 - self.last_docnum = self.get_initial_value(docnum_size)
4.207 -
4.208 - # Read the document number delta and add it to the last number.
4.209 -
4.210 - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
4.211 + if self.last_docnum is not None:
4.212 + self.last_docnum = self.adder(docnum, self.last_docnum)
4.213 + else:
4.214 + self.adder = get_adder(docnum)
4.215 + self.last_docnum = docnum
4.216
4.217 # Read the offset delta.
4.218
4.219 @@ -186,6 +167,7 @@
4.220 # Read the document count.
4.221
4.222 count = self.read_number()
4.223 + self.end_record()
4.224
4.225 return self.last_docnum, self.last_pos_offset, count
4.226
5.1 --- a/iixr/terms.py Thu Feb 03 01:26:35 2011 +0100
5.2 +++ b/iixr/terms.py Mon Feb 07 02:05:38 2011 +0100
5.3 @@ -3,7 +3,7 @@
5.4 """
5.5 Specific classes for storing term information.
5.6
5.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
5.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
5.9
5.10 This program is free software; you can redistribute it and/or modify it under
5.11 the terms of the GNU General Public License as published by the Free Software
5.12 @@ -40,6 +40,14 @@
5.13 term information file.
5.14 """
5.15
5.16 + self.begin_record()
5.17 + self._write_term(term, offset, frequency, doc_frequency)
5.18 + self.end_record()
5.19 +
5.20 + def _write_term(self, term, offset, frequency, doc_frequency):
5.21 +
5.22 + "Performs the term writing for 'write_term'."
5.23 +
5.24 if term <= self.last_term:
5.25 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
5.26
5.27 @@ -79,6 +87,16 @@
5.28 frequency from the term information file.
5.29 """
5.30
5.31 + self.begin_record()
5.32 + try:
5.33 + return self._read_term()
5.34 + finally:
5.35 + self.end_record()
5.36 +
5.37 + def _read_term(self):
5.38 +
5.39 + "Performs the term reading for 'read_term'."
5.40 +
5.41 # Read the prefix length and term suffix.
5.42
5.43 common = self.read_number()
5.44 @@ -127,11 +145,14 @@
5.45 'info_offset' in the term information file.
5.46 """
5.47
5.48 - TermWriter.write_term(self, term, offset, frequency, doc_frequency)
5.49 + self.begin_record()
5.50 + TermWriter._write_term(self, term, offset, frequency, doc_frequency)
5.51
5.52 # Write the information file offset delta.
5.53
5.54 self.write_number(info_offset - self.last_info_offset)
5.55 + self.end_record()
5.56 +
5.57 self.last_info_offset = info_offset
5.58
5.59 class TermIndexReader(TermReader):
5.60 @@ -150,11 +171,13 @@
5.61 index file.
5.62 """
5.63
5.64 - term, offset, frequency, doc_frequency = TermReader.read_term(self)
5.65 + self.begin_record()
5.66 + term, offset, frequency, doc_frequency = TermReader._read_term(self)
5.67
5.68 # Read the offset delta.
5.69
5.70 self.last_info_offset += self.read_number()
5.71 + self.end_record()
5.72
5.73 return term, offset, frequency, doc_frequency, self.last_info_offset
5.74
6.1 --- a/test.py Thu Feb 03 01:26:35 2011 +0100
6.2 +++ b/test.py Mon Feb 07 02:05:38 2011 +0100
6.3 @@ -32,49 +32,53 @@
6.4
6.5 f = open("test", "wb")
6.6 w = FileWriter(f)
6.7 +w.begin_record()
6.8 for number in numbers:
6.9 w.write_number(number)
6.10 +w.end_record()
6.11 w.close()
6.12
6.13 f = open("test", "rb")
6.14 r = FileReader(f)
6.15 +r.begin_record()
6.16 for number in numbers:
6.17 n = r.read_number()
6.18 print number == n, number, n
6.19 +r.end_record()
6.20 r.close()
6.21
6.22 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)]
6.23
6.24 f = open("testMS", "wb")
6.25 w = FileWriter(f)
6.26 -last = w.get_initial_value(2)
6.27 -for t in tuples:
6.28 - last = w.write_sequence(t, last, 2)
6.29 +w.begin_record()
6.30 +w.write_monotonic_sequence(tuples)
6.31 +w.end_record()
6.32 w.close()
6.33
6.34 f = open("testMS", "rb")
6.35 r = FileReader(f)
6.36 -last = r.get_initial_value(2)
6.37 -for t in tuples:
6.38 - last = t2 = r.read_sequence(last, 2)
6.39 +r.begin_record()
6.40 +for t, t2 in zip(r.read_monotonic_sequence(), tuples):
6.41 print t == t2, t, t2
6.42 +r.end_record()
6.43 r.close()
6.44
6.45 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)]
6.46
6.47 f = open("testNMS", "wb")
6.48 w = FileWriter(f)
6.49 -last = w.get_initial_value(2)
6.50 -for t in tuples2:
6.51 - last = w.write_sequence(t, last, 2, monotonic=0)
6.52 +w.begin_record()
6.53 +w.write_delta_sequence(tuples2)
6.54 +w.end_record()
6.55 w.close()
6.56
6.57 f = open("testNMS", "rb")
6.58 r = FileReader(f)
6.59 -last = r.get_initial_value(2)
6.60 -for t in tuples2:
6.61 - last = t2 = r.read_sequence(last, 2, monotonic=0)
6.62 +r.begin_record()
6.63 +for t, t2 in zip(r.read_delta_sequence(), tuples2):
6.64 print t == t2, t, t2
6.65 +r.end_record()
6.66 r.close()
6.67
6.68 print "- Test positions."
6.69 @@ -138,8 +142,8 @@
6.70 for doc_positions in all_doc_positions_seq:
6.71 for docnum, positions in doc_positions:
6.72 d, p = r.read_positions()
6.73 - print tuple(docnum) == tuple(d), docnum, d
6.74 - print tuple(positions) == tuple(p), positions, p
6.75 + print docnum == d, docnum, d
6.76 + print positions == p, positions, p
6.77 r.reset()
6.78 r.close()
6.79