1.1 --- a/iixr/fields.py Fri Jan 28 01:36:25 2011 +0100
1.2 +++ b/iixr/fields.py Thu Feb 03 01:26:35 2011 +0100
1.3 @@ -47,14 +47,12 @@
1.4 # Write the number of values per document number.
1.5 # Write the document number delta.
1.6
1.7 - output = array('B')
1.8 - vint_to_array(self.docnum_size, output)
1.9 - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.10 + self.write_number(self.docnum_size)
1.11 + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.12
1.13 # Write the number of fields.
1.14
1.15 - vint_to_array(len(fields), output)
1.16 - output.tofile(self.f)
1.17 + self.write_number(len(fields))
1.18
1.19 # Write the fields themselves.
1.20
1.21 @@ -142,10 +140,8 @@
1.22 # Write the number of values per document number.
1.23 # Write the document number delta.
1.24
1.25 - output = array('B')
1.26 - vint_to_array(self.docnum_size, output)
1.27 - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.28 - output.tofile(self.f)
1.29 + self.write_number(self.docnum_size)
1.30 + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.31
1.32 # Write the offset delta.
1.33
1.34 @@ -196,7 +192,7 @@
1.35 "Write details of the document with the given 'docnum' and 'fields'."
1.36
1.37 if self.entry % self.interval == 0:
1.38 - offset = self.field_writer.f.tell()
1.39 + offset = self.field_writer.tell()
1.40 self.field_writer.write_fields(docnum, fields)
1.41 self.field_index_writer.write_document(docnum, offset)
1.42 else:
2.1 --- a/iixr/files.py Fri Jan 28 01:36:25 2011 +0100
2.2 +++ b/iixr/files.py Thu Feb 03 01:26:35 2011 +0100
2.3 @@ -30,6 +30,7 @@
2.4
2.5 def __init__(self, f):
2.6 self.f = f
2.7 + self.data = array('B')
2.8 self.reset()
2.9
2.10 def reset(self):
2.11 @@ -46,8 +47,14 @@
2.12 self.f.seek(0)
2.13 self.reset()
2.14
2.15 + def flush(self):
2.16 + if self.f is not None:
2.17 + self.data.tofile(self.f)
2.18 + self.data = array('B')
2.19 +
2.20 def close(self):
2.21 if self.f is not None:
2.22 + self.data.tofile(self.f)
2.23 self.f.close()
2.24 self.f = None
2.25
2.26 @@ -67,23 +74,21 @@
2.27
2.28 "Writing basic data types to files."
2.29
2.30 - def __init__(self, f):
2.31 - File.__init__(self, f)
2.32 + def tell(self):
2.33 + return self.f.tell() + len(self.data)
2.34
2.35 def write_number(self, number):
2.36
2.37 "Write 'number' to the file using a variable length encoding."
2.38
2.39 - self.f.write(vint(number))
2.40 + vint_to_array(number, self.data)
2.41
2.42 def write_numbers(self, numbers):
2.43
2.44 "Write 'numbers' to the file using a variable length encoding."
2.45
2.46 - output = array('B')
2.47 for number in numbers:
2.48 - vint_to_array(number, output)
2.49 - output.tofile(self.f)
2.50 + vint_to_array(number, self.data)
2.51
2.52 def write_string(self, s, compress=0):
2.53
2.54 @@ -116,9 +121,9 @@
2.55 # Write the length of the data before the data itself.
2.56
2.57 length = len(s)
2.58 - self.f.write("".join([flag, vint(length), s]))
2.59 + self.data.fromstring("".join([flag, vint(length), s]))
2.60
2.61 - def write_sequence(self, output, value, last, size, monotonic=1):
2.62 + def write_sequence(self, value, last, size, monotonic=1):
2.63 if size:
2.64 emit_delta = 1
2.65 for v, l in map(None, value, last)[:size]:
2.66 @@ -130,9 +135,9 @@
2.67 emit_delta = 0
2.68 else:
2.69 v_out = v + 1
2.70 - vint_to_array(v_out, output)
2.71 + vint_to_array(v_out, self.data)
2.72 else:
2.73 - vint_to_array(value - last, output)
2.74 + vint_to_array(value - last, self.data)
2.75
2.76 return value
2.77
2.78 @@ -140,28 +145,28 @@
2.79
2.80 "Reading basic data types from files."
2.81
2.82 - def __init__(self, f):
2.83 - File.__init__(self, f)
2.84 -
2.85 def read_number(self):
2.86
2.87 "Read a number from the file."
2.88
2.89 # Read each byte, adding it to the number.
2.90
2.91 - a = array('B')
2.92 + f = self.f
2.93 + a = self.data
2.94 fromfile = a.fromfile
2.95 - f = self.f
2.96
2.97 - fromfile(f, 1)
2.98 - csd = a[-1]
2.99 - if csd < 128:
2.100 - return csd
2.101 - else:
2.102 - while csd & 128:
2.103 - fromfile(f, 1)
2.104 - csd = a[-1]
2.105 - return vint_from_array(a)
2.106 + try:
2.107 + fromfile(f, 1)
2.108 + csd = a[-1]
2.109 + if csd < 128:
2.110 + return csd
2.111 + else:
2.112 + while csd & 128:
2.113 + fromfile(f, 1)
2.114 + csd = a[-1]
2.115 + return vint_from_array(self.data)
2.116 + finally:
2.117 + self.data = array('B')
2.118
2.119 def read_string(self, decompress=0):
2.120
2.121 @@ -170,26 +175,30 @@
2.122 'decompress' is set to a true value.
2.123 """
2.124
2.125 - read = self.f.read
2.126 -
2.127 # Decompress the data if requested.
2.128
2.129 if decompress:
2.130 - flag = read(1)
2.131 + flag = self.f.read(1)
2.132 else:
2.133 flag = "-"
2.134
2.135 length = self.read_number()
2.136 - s = read(length)
2.137
2.138 - # Perform decompression if applicable.
2.139 + try:
2.140 + self.data.fromfile(self.f, length)
2.141 + s = self.data.tostring()
2.142 +
2.143 + # Perform decompression if applicable.
2.144
2.145 - if flag == "z":
2.146 - s = zlib.decompress(s)
2.147 + if flag == "z":
2.148 + s = zlib.decompress(s)
2.149 +
2.150 + # Convert strings to Unicode objects.
2.151
2.152 - # Convert strings to Unicode objects.
2.153 + return unicode(s, "utf-8")
2.154
2.155 - return unicode(s, "utf-8")
2.156 + finally:
2.157 + self.data = array('B')
2.158
2.159 def read_sequence(self, last, size, monotonic=1):
2.160 if size:
3.1 --- a/iixr/positions.py Fri Jan 28 01:36:25 2011 +0100
3.2 +++ b/iixr/positions.py Thu Feb 03 01:26:35 2011 +0100
3.3 @@ -20,7 +20,6 @@
3.4
3.5 from iixr.files import *
3.6 from iixr.data import vint, vint_to_array
3.7 -from array import array
3.8
3.9 class PositionWriter(FileWriter):
3.10
3.11 @@ -58,20 +57,17 @@
3.12 # Write the number of positions.
3.13 # Write the number of values per position.
3.14
3.15 - output = array('B')
3.16 - vint_to_array(self.docnum_size, output)
3.17 - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
3.18 - vint_to_array(len(positions), output)
3.19 - vint_to_array(size, output)
3.20 + self.write_number(self.docnum_size)
3.21 + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
3.22 + self.write_number(len(positions))
3.23 + self.write_number(size)
3.24
3.25 # Write the position deltas.
3.26
3.27 last = self.get_initial_value(size)
3.28
3.29 for position in positions:
3.30 - last = self.write_sequence(output, position, last, size)
3.31 -
3.32 - output.tofile(self.f)
3.33 + last = self.write_sequence(position, last, size)
3.34
3.35 self.last_docnum = docnum
3.36
3.37 @@ -150,15 +146,10 @@
3.38 # Write the position file offset delta.
3.39 # Write the document count.
3.40
3.41 - output = array('B')
3.42 - vint_to_array(self.docnum_size, output)
3.43 - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
3.44 - vint_to_array(pos_offset - self.last_pos_offset, output)
3.45 - vint_to_array(count, output)
3.46 -
3.47 - # Actually write the data.
3.48 -
3.49 - output.tofile(self.f)
3.50 + self.write_number(self.docnum_size)
3.51 + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)
3.52 + self.write_number(pos_offset - self.last_pos_offset)
3.53 + self.write_number(count)
3.54
3.55 self.last_pos_offset = pos_offset
3.56
3.57 @@ -335,7 +326,7 @@
3.58
3.59 # Remember the first index entry offset.
3.60
3.61 - index_offset = self.position_index_writer.f.tell()
3.62 + index_offset = self.position_index_writer.tell()
3.63
3.64 # Write the positions.
3.65
3.66 @@ -346,7 +337,7 @@
3.67
3.68 # Retain the first record offset for a subsequent index entry.
3.69
3.70 - first_offset = self.position_writer.f.tell()
3.71 + first_offset = self.position_writer.tell()
3.72 first_docnum = None
3.73
3.74 doc_positions.sort()
3.75 @@ -366,7 +357,7 @@
3.76
3.77 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
3.78
3.79 - first_offset = self.position_writer.f.tell()
3.80 + first_offset = self.position_writer.tell()
3.81 first_docnum = None
3.82
3.83 # Reset the position writer so that position readers accessing
4.1 --- a/iixr/terms.py Fri Jan 28 01:36:25 2011 +0100
4.2 +++ b/iixr/terms.py Thu Feb 03 01:26:35 2011 +0100
4.3 @@ -181,7 +181,7 @@
4.4 self.info_writer.write_term(term, offset, frequency, doc_frequency)
4.5
4.6 if self.entry % self.interval == 0:
4.7 - info_offset = self.info_writer.f.tell()
4.8 + info_offset = self.info_writer.tell()
4.9 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
4.10
4.11 self.entry += 1
5.1 --- a/test.py Fri Jan 28 01:36:25 2011 +0100
5.2 +++ b/test.py Thu Feb 03 01:26:35 2011 +0100
5.3 @@ -5,7 +5,6 @@
5.4 from iixr.terms import *
5.5 from iixr.positions import *
5.6 from iixr.index import *
5.7 -from array import array
5.8 import os, sys
5.9
5.10 # Remove old test files.
5.11 @@ -48,11 +47,9 @@
5.12
5.13 f = open("testMS", "wb")
5.14 w = FileWriter(f)
5.15 -b = array("B")
5.16 last = w.get_initial_value(2)
5.17 for t in tuples:
5.18 - last = w.write_sequence(b, t, last, 2)
5.19 -b.tofile(w.f)
5.20 + last = w.write_sequence(t, last, 2)
5.21 w.close()
5.22
5.23 f = open("testMS", "rb")
5.24 @@ -67,11 +64,9 @@
5.25
5.26 f = open("testNMS", "wb")
5.27 w = FileWriter(f)
5.28 -b = array("B")
5.29 last = w.get_initial_value(2)
5.30 for t in tuples2:
5.31 - last = w.write_sequence(b, t, last, 2, monotonic=0)
5.32 -b.tofile(w.f)
5.33 + last = w.write_sequence(t, last, 2, monotonic=0)
5.34 w.close()
5.35
5.36 f = open("testNMS", "rb")
5.37 @@ -170,7 +165,7 @@
5.38 w.reset()
5.39 for docnum, pos_offset, count in term_positions:
5.40 if offset is None:
5.41 - offset = w.f.tell()
5.42 + offset = w.tell()
5.43 w.write_positions(docnum, pos_offset, count)
5.44 doc_frequency += count
5.45 offsets.append((offset, doc_frequency))