1.1 --- a/iixr/fields.py Sat Nov 20 23:56:16 2010 +0100
1.2 +++ b/iixr/fields.py Sun Nov 21 20:44:43 2010 +0100
1.3 @@ -28,7 +28,8 @@
1.4 "Writing field data to files."
1.5
1.6 def reset(self):
1.7 - self.last_docnum = 0
1.8 + self.last_docnum = None
1.9 + self.docnum_size = None
1.10
1.11 def write_fields(self, docnum, fields):
1.12
1.13 @@ -37,13 +38,23 @@
1.14 representing field identifiers and values respectively).
1.15 """
1.16
1.17 + # Find the size of document number values.
1.18 +
1.19 + if self.docnum_size is None:
1.20 + self.docnum_size = self.get_value_size(docnum)
1.21 + self.last_docnum = self.get_initial_value(self.docnum_size)
1.22 +
1.23 + # Write the number of values per document number.
1.24 # Write the document number delta.
1.25
1.26 - self.write_number(docnum - self.last_docnum)
1.27 + output = array('B')
1.28 + vint_to_array(self.docnum_size, output)
1.29 + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.30
1.31 # Write the number of fields.
1.32
1.33 - self.write_number(len(fields))
1.34 + vint_to_array(len(fields), output)
1.35 + output.tofile(self.f)
1.36
1.37 # Write the fields themselves.
1.38
1.39 @@ -51,14 +62,12 @@
1.40 self.write_number(i)
1.41 self.write_string(field, 1) # compress
1.42
1.43 - self.last_docnum = docnum
1.44 -
1.45 class FieldReader(FileReader):
1.46
1.47 "Reading field data from files."
1.48
1.49 def reset(self):
1.50 - self.last_docnum = 0
1.51 + self.last_docnum = None
1.52
1.53 def read_fields(self):
1.54
1.55 @@ -67,9 +76,16 @@
1.56 number and a list of field (identifier, value) pairs.
1.57 """
1.58
1.59 - # Read the document number.
1.60 + # Read the number of values per document number.
1.61 +
1.62 + docnum_size = self.read_number()
1.63
1.64 - self.last_docnum += self.read_number()
1.65 + if self.last_docnum is None:
1.66 + self.last_docnum = self.get_initial_value(docnum_size)
1.67 +
1.68 + # Read the document number delta and add it to the last number.
1.69 +
1.70 + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
1.71
1.72 # Read the number of fields.
1.73
1.74 @@ -106,7 +122,8 @@
1.75 "Writing field index details to files."
1.76
1.77 def reset(self):
1.78 - self.last_docnum = 0
1.79 + self.last_docnum = None
1.80 + self.docnum_size = None
1.81 self.last_offset = 0
1.82
1.83 def write_document(self, docnum, offset):
1.84 @@ -116,12 +133,23 @@
1.85 document are stored in the fields file.
1.86 """
1.87
1.88 - # Write the document number and offset deltas.
1.89 + # Find the size of document number values.
1.90 +
1.91 + if self.docnum_size is None:
1.92 + self.docnum_size = self.get_value_size(docnum)
1.93 + self.last_docnum = self.get_initial_value(self.docnum_size)
1.94 +
1.95 + # Write the number of values per document number.
1.96 + # Write the document number delta.
1.97
1.98 - self.write_number(docnum - self.last_docnum)
1.99 + output = array('B')
1.100 + vint_to_array(self.docnum_size, output)
1.101 + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
1.102 + output.tofile(self.f)
1.103 +
1.104 + # Write the offset delta.
1.105 +
1.106 self.write_number(offset - self.last_offset)
1.107 -
1.108 - self.last_docnum = docnum
1.109 self.last_offset = offset
1.110
1.111 class FieldIndexReader(FileReader):
1.112 @@ -129,16 +157,26 @@
1.113 "Reading field index details from files."
1.114
1.115 def reset(self):
1.116 - self.last_docnum = 0
1.117 + self.last_docnum = None
1.118 self.last_offset = 0
1.119
1.120 def read_document(self):
1.121
1.122 "Read a document number and field file offset."
1.123
1.124 - # Read the document number delta and offset.
1.125 + # Read the number of values per document number.
1.126 +
1.127 + docnum_size = self.read_number()
1.128 +
1.129 + if self.last_docnum is None:
1.130 + self.last_docnum = self.get_initial_value(docnum_size)
1.131
1.132 - self.last_docnum += self.read_number()
1.133 + # Read the document number delta and add it to the last number.
1.134 +
1.135 + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
1.136 +
1.137 + # Read the offset delta and add it to the last offset.
1.138 +
1.139 self.last_offset += self.read_number()
1.140
1.141 return self.last_docnum, self.last_offset
2.1 --- a/iixr/files.py Sat Nov 20 23:56:16 2010 +0100
2.2 +++ b/iixr/files.py Sun Nov 21 20:44:43 2010 +0100
2.3 @@ -18,7 +18,7 @@
2.4 with this program. If not, see <http://www.gnu.org/licenses/>.
2.5 """
2.6
2.7 -from iixr.data import vint
2.8 +from iixr.data import vint, vint_to_array
2.9 from array import array
2.10 import zlib
2.11
2.12 @@ -51,6 +51,18 @@
2.13 self.f.close()
2.14 self.f = None
2.15
2.16 +    def get_value_size(self, value):
2.17 +        "Return the number of components in 'value', or 0 if it is not a list or tuple."
2.18 +        if not isinstance(value, (list, tuple)):
2.19 +            return 0
2.20 +        return len(value)
2.21 +
2.22 +    def get_initial_value(self, size):
2.23 +        "Return a list of 'size' zeros, or the plain number 0 when 'size' is zero."
2.24 +        if not size:
2.25 +            return 0
2.26 +        return [0] * size
2.27 +
2.28 class FileWriter(File):
2.29
2.30 "Writing basic data types to files."
2.31 @@ -97,6 +109,24 @@
2.32 length = len(s)
2.33 self.f.write("".join([flag, vint(length), s]))
2.34
2.35 +    def write_sequence(self, output, value, last, size, monotonic=1):
2.36 +        "Encode 'value' relative to 'last' into the 'output' array, returning 'value'."
2.37 +        if not size:
2.38 +            vint_to_array(value - last, output)
2.39 +            return value
2.40 +        in_delta_run = 1
2.41 +        for current, previous in map(None, value, last)[:size]:
2.42 +            if current is None:
2.43 +                current = previous      # a missing component repeats the last one
2.44 +            if not (monotonic or in_delta_run):
2.45 +                encoded = current + 1   # absolute component, stored offset by one
2.46 +            else:
2.47 +                encoded = current - previous
2.48 +                if encoded != 0:
2.49 +                    in_delta_run = 0    # only matters when not monotonic
2.50 +            vint_to_array(encoded, output)
2.51 +        return value
2.52 +
2.53 class FileReader(File):
2.54
2.55 "Reading basic data types from files."
2.56 @@ -152,4 +182,20 @@
2.57
2.58 return unicode(s, "utf-8")
2.59
2.60 +    def read_sequence(self, last, size, monotonic=1):
2.61 +        "Read a value encoded relative to 'last'; a tuple is returned when 'size' is non-zero."
2.62 +        if not size:
2.63 +            return last + self.read_number()
2.64 +        in_delta_run = 1
2.65 +        components = []
2.66 +        for previous in last:
2.67 +            number = self.read_number()
2.68 +            if not (monotonic or in_delta_run):
2.69 +                components.append(number - 1)   # absolute component, stored offset by one
2.70 +                continue
2.71 +            components.append(previous + number)
2.72 +            if number != 0:
2.73 +                in_delta_run = 0                # only matters when not monotonic
2.74 +        return tuple(components)
2.75 +
2.76 # vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/iixr/positions.py Sat Nov 20 23:56:16 2010 +0100
3.2 +++ b/iixr/positions.py Sun Nov 21 20:44:43 2010 +0100
3.3 @@ -27,7 +27,8 @@
3.4 "Writing position information to files."
3.5
3.6 def reset(self):
3.7 - self.last_docnum = 0
3.8 + self.last_docnum = None
3.9 + self.docnum_size = None
3.10
3.11 def write_positions(self, docnum, positions):
3.12
3.13 @@ -35,6 +36,12 @@
3.14 Write for the document 'docnum' the given 'positions'.
3.15 """
3.16
3.17 + # Find the size of document number values.
3.18 +
3.19 + if self.docnum_size is None:
3.20 + self.docnum_size = self.get_value_size(docnum)
3.21 + self.last_docnum = self.get_initial_value(self.docnum_size)
3.22 +
3.23 if docnum < self.last_docnum:
3.24 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
3.25
3.26 @@ -42,20 +49,27 @@
3.27
3.28 positions.sort()
3.29
3.30 + # Find the size of position values.
3.31 +
3.32 + size = self.get_value_size(positions[0])
3.33 +
3.34 + # Write the number of values per document number.
3.35 # Write the document number delta.
3.36 # Write the number of positions.
3.37 + # Write the number of values per position.
3.38
3.39 output = array('B')
3.40 - vint_to_array(docnum - self.last_docnum, output)
3.41 + vint_to_array(self.docnum_size, output)
3.42 + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
3.43 vint_to_array(len(positions), output)
3.44 + vint_to_array(size, output)
3.45
3.46 # Write the position deltas.
3.47
3.48 - last = 0
3.49 + last = self.get_initial_value(size)
3.50
3.51 for position in positions:
3.52 - vint_to_array(position - last, output)
3.53 - last = position
3.54 + last = self.write_sequence(output, position, last, size)
3.55
3.56 output.tofile(self.f)
3.57
3.58 @@ -66,29 +80,44 @@
3.59 "Reading position information within term-specific regions of a file."
3.60
3.61 def reset(self):
3.62 - self.last_docnum = 0
3.63 + self.last_docnum = None
3.64
3.65 def read_positions(self):
3.66
3.67 - "Read positions, returning a document number and a list of positions."
3.68 + """
3.69 + Read positions, returning a document number and a list of positions.
3.70 + """
3.71 +
3.72 + # Read the number of values per document number.
3.73 +
3.74 + docnum_size = self.read_number()
3.75 +
3.76 + if self.last_docnum is None:
3.77 + self.last_docnum = self.get_initial_value(docnum_size)
3.78
3.79 # Read the document number delta and add it to the last number.
3.80
3.81 - self.last_docnum += self.read_number()
3.82 + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
3.83
3.84 # Read the number of positions.
3.85
3.86 npositions = self.read_number()
3.87
3.88 + # Read the number of values per position.
3.89 +
3.90 + size = self.read_number()
3.91 +
3.92 # Read the position deltas, adding each previous position to get the
3.93 # appropriate collection of absolute positions.
3.94
3.95 i = 0
3.96 - last = 0
3.97 +
3.98 + last = self.get_initial_value(size)
3.99 +
3.100 positions = []
3.101
3.102 while i < npositions:
3.103 - last += self.read_number()
3.104 + last = self.read_sequence(last, size)
3.105 positions.append(last)
3.106 i += 1
3.107
3.108 @@ -99,7 +128,8 @@
3.109 "Writing position index information to files."
3.110
3.111 def reset(self):
3.112 - self.last_docnum = 0
3.113 + self.last_docnum = None
3.114 + self.docnum_size = None
3.115 self.last_pos_offset = 0
3.116
3.117 def write_positions(self, docnum, pos_offset, count):
3.118 @@ -109,12 +139,20 @@
3.119 position index file.
3.120 """
3.121
3.122 + # Find the size of document number values.
3.123 +
3.124 + if self.docnum_size is None:
3.125 + self.docnum_size = self.get_value_size(docnum)
3.126 + self.last_docnum = self.get_initial_value(self.docnum_size)
3.127 +
3.128 + # Write the number of values per document number.
3.129 # Write the document number delta.
3.130 # Write the position file offset delta.
3.131 # Write the document count.
3.132
3.133 output = array('B')
3.134 - vint_to_array(docnum - self.last_docnum, output)
3.135 + vint_to_array(self.docnum_size, output)
3.136 + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
3.137 vint_to_array(pos_offset - self.last_pos_offset, output)
3.138 vint_to_array(count, output)
3.139
3.140 @@ -123,14 +161,13 @@
3.141 output.tofile(self.f)
3.142
3.143 self.last_pos_offset = pos_offset
3.144 - self.last_docnum = docnum
3.145
3.146 class PositionIndexReader(FileReader):
3.147
3.148 "Reading position index information within term-specific regions of a file."
3.149
3.150 def reset(self):
3.151 - self.last_docnum = 0
3.152 + self.last_docnum = None
3.153 self.last_pos_offset = 0
3.154
3.155 def read_positions(self):
3.156 @@ -140,9 +177,16 @@
3.157 file, and the number of documents in a section of that file.
3.158 """
3.159
3.160 - # Read the document number delta.
3.161 + # Read the number of values per document number.
3.162 +
3.163 + docnum_size = self.read_number()
3.164
3.165 - self.last_docnum += self.read_number()
3.166 + if self.last_docnum is None:
3.167 + self.last_docnum = self.get_initial_value(docnum_size)
3.168 +
3.169 + # Read the document number delta and add it to the last number.
3.170 +
3.171 + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)
3.172
3.173 # Read the offset delta.
3.174
4.1 --- a/iixr/terms.py Sat Nov 20 23:56:16 2010 +0100
4.2 +++ b/iixr/terms.py Sun Nov 21 20:44:43 2010 +0100
4.3 @@ -336,7 +336,7 @@
4.4
4.5 "Return a list of all terms."
4.6
4.7 - return self.terms
4.8 + return iter(self)
4.9
4.10 def find_terms(self, term):
4.11
5.1 --- a/test.py Sat Nov 20 23:56:16 2010 +0100
5.2 +++ b/test.py Sun Nov 21 20:44:43 2010 +0100
5.3 @@ -5,11 +5,12 @@
5.4 from iixr.terms import *
5.5 from iixr.positions import *
5.6 from iixr.index import *
5.7 +from array import array
5.8 import os, sys
5.9
5.10 # Remove old test files.
5.11
5.12 -for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
5.13 +for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):
5.14 try:
5.15 os.remove(filename)
5.16 except OSError:
5.17 @@ -43,6 +44,44 @@
5.18 print number == n, number, n
5.19 r.close()
5.20
5.21 +tuples = [(0, 0), (1, 3), (2, 5), (3, 9)]
5.22 +
5.23 +f = open("testMS", "wb")
5.24 +w = FileWriter(f)
5.25 +b = array("B")
5.26 +last = w.get_initial_value(2)
5.27 +for t in tuples:
5.28 + last = w.write_sequence(b, t, last, 2)
5.29 +b.tofile(w.f)
5.30 +w.close()
5.31 +
5.32 +f = open("testMS", "rb")
5.33 +r = FileReader(f)
5.34 +last = r.get_initial_value(2)
5.35 +for t in tuples:
5.36 + last = t2 = r.read_sequence(last, 2)
5.37 + print t == t2, t, t2
5.38 +r.close()
5.39 +
5.40 +tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)]
5.41 +
5.42 +f = open("testNMS", "wb")
5.43 +w = FileWriter(f)
5.44 +b = array("B")
5.45 +last = w.get_initial_value(2)
5.46 +for t in tuples2:
5.47 + last = w.write_sequence(b, t, last, 2, monotonic=0)
5.48 +b.tofile(w.f)
5.49 +w.close()
5.50 +
5.51 +f = open("testNMS", "rb")
5.52 +r = FileReader(f)
5.53 +last = r.get_initial_value(2)
5.54 +for t in tuples2:
5.55 + last = t2 = r.read_sequence(last, 2, monotonic=0)
5.56 + print t == t2, t, t2
5.57 +r.close()
5.58 +
5.59 print "- Test positions."
5.60
5.61 all_doc_positions = [
5.62 @@ -77,6 +116,38 @@
5.63 r.reset()
5.64 r.close()
5.65
5.66 +all_doc_positions_seq = [
5.67 + [
5.68 + ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]),
5.69 + ((124, 1), [(0, 0), (100, 350)]),
5.70 + ((124, 2), [(11, 38), (99, 379), (199, 720)]),
5.71 + ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)])
5.72 + ],
5.73 + [
5.74 + ((78, 1), [(9, 19)]),
5.75 + ((196, 0), [(10, 27), (11, 29)]),
5.76 + ((196, 1), [(17, 46), (21, 52), (30, 60)])
5.77 + ]
5.78 + ]
5.79 +
5.80 +f = open("testP2", "wb")
5.81 +w = PositionWriter(f)
5.82 +for doc_positions in all_doc_positions_seq:
5.83 + for docnum, positions in doc_positions:
5.84 + w.write_positions(docnum, positions)
5.85 + w.reset()
5.86 +w.close()
5.87 +
5.88 +f = open("testP2", "rb")
5.89 +r = PositionReader(f)
5.90 +for doc_positions in all_doc_positions_seq:
5.91 + for docnum, positions in doc_positions:
5.92 + d, p = r.read_positions()
5.93 + print tuple(docnum) == tuple(d), docnum, d
5.94 + print tuple(positions) == tuple(p), positions, p
5.95 + r.reset()
5.96 +r.close()
5.97 +
5.98 print "- Test position index files."
5.99
5.100 indexed_positions = [