# HG changeset patch # User Paul Boddie # Date 1290368683 -3600 # Node ID d308dc25f5a2b0330d08bcb147e51eb328ee2139 # Parent 6dd92daca068f82954f6de43ea902f474a324fae Introduced support for specifying sequences for document numbers and positions, with the latter being "monotonic" sequences whose elements contain items that are always greater than or equal to the items in the same position in each preceding element of the sequence. Fixed the get_terms method of the term dictionary reader to refer to the iterator over term information (and not the list of terms provided by the term index). Expanded the tests to cover sequences as document numbers and positions. diff -r 6dd92daca068 -r d308dc25f5a2 iixr/fields.py --- a/iixr/fields.py Sat Nov 20 23:56:16 2010 +0100 +++ b/iixr/fields.py Sun Nov 21 20:44:43 2010 +0100 @@ -28,7 +28,8 @@ "Writing field data to files." def reset(self): - self.last_docnum = 0 + self.last_docnum = None + self.docnum_size = None def write_fields(self, docnum, fields): @@ -37,13 +38,23 @@ representing field identifiers and values respectively). """ + # Find the size of document number values. + + if self.docnum_size is None: + self.docnum_size = self.get_value_size(docnum) + self.last_docnum = self.get_initial_value(self.docnum_size) + + # Write the number of values per document number. # Write the document number delta. - self.write_number(docnum - self.last_docnum) + output = array('B') + vint_to_array(self.docnum_size, output) + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) # Write the number of fields. - self.write_number(len(fields)) + vint_to_array(len(fields), output) + output.tofile(self.f) # Write the fields themselves. @@ -51,14 +62,12 @@ self.write_number(i) self.write_string(field, 1) # compress - self.last_docnum = docnum - class FieldReader(FileReader): "Reading field data from files." 
def reset(self): - self.last_docnum = 0 + self.last_docnum = None def read_fields(self): @@ -67,9 +76,16 @@ number and a list of field (identifier, value) pairs. """ - # Read the document number. + # Read the number of values per document number. + + docnum_size = self.read_number() - self.last_docnum += self.read_number() + if self.last_docnum is None: + self.last_docnum = self.get_initial_value(docnum_size) + + # Read the document number delta and add it to the last number. + + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) # Read the number of fields. @@ -106,7 +122,8 @@ "Writing field index details to files." def reset(self): - self.last_docnum = 0 + self.last_docnum = None + self.docnum_size = None self.last_offset = 0 def write_document(self, docnum, offset): @@ -116,12 +133,23 @@ document are stored in the fields file. """ - # Write the document number and offset deltas. + # Find the size of document number values. + + if self.docnum_size is None: + self.docnum_size = self.get_value_size(docnum) + self.last_docnum = self.get_initial_value(self.docnum_size) + + # Write the number of values per document number. + # Write the document number delta. - self.write_number(docnum - self.last_docnum) + output = array('B') + vint_to_array(self.docnum_size, output) + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) + output.tofile(self.f) + + # Write the offset delta. + self.write_number(offset - self.last_offset) - - self.last_docnum = docnum self.last_offset = offset class FieldIndexReader(FileReader): @@ -129,16 +157,26 @@ "Reading field index details from files." def reset(self): - self.last_docnum = 0 + self.last_docnum = None self.last_offset = 0 def read_document(self): "Read a document number and field file offset." - # Read the document number delta and offset. + # Read the number of values per document number. 
+ + docnum_size = self.read_number() + + if self.last_docnum is None: + self.last_docnum = self.get_initial_value(docnum_size) - self.last_docnum += self.read_number() + # Read the document number delta and add it to the last number. + + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) + + # Read the offset. + self.last_offset += self.read_number() return self.last_docnum, self.last_offset diff -r 6dd92daca068 -r d308dc25f5a2 iixr/files.py --- a/iixr/files.py Sat Nov 20 23:56:16 2010 +0100 +++ b/iixr/files.py Sun Nov 21 20:44:43 2010 +0100 @@ -18,7 +18,7 @@ with this program. If not, see <http://www.gnu.org/licenses/>. """ -from iixr.data import vint +from iixr.data import vint, vint_to_array from array import array import zlib @@ -51,6 +51,18 @@ self.f.close() self.f = None + def get_value_size(self, value): + if isinstance(value, (list, tuple)): + return len(value) + else: + return 0 + + def get_initial_value(self, size): + if size: + return [0] * size + else: + return 0 + class FileWriter(File): "Writing basic data types to files." @@ -97,6 +109,24 @@ length = len(s) self.f.write("".join([flag, vint(length), s])) + def write_sequence(self, output, value, last, size, monotonic=1): + if size: + emit_delta = 1 + for v, l in map(None, value, last)[:size]: + if v is None: + v = l + if monotonic or emit_delta: + v_out = v - l + if emit_delta and v_out != 0: + emit_delta = 0 + else: + v_out = v + 1 + vint_to_array(v_out, output) + else: + vint_to_array(value - last, output) + + return value + class FileReader(File): "Reading basic data types from files." 
@@ -152,4 +182,20 @@ return unicode(s, "utf-8") + def read_sequence(self, last, size, monotonic=1): + if size: + expect_delta = 1 + value = [] + for v in last: + v_in = self.read_number() + if monotonic or expect_delta: + value.append(v + v_in) + if expect_delta and v_in != 0: + expect_delta = 0 + else: + value.append(v_in - 1) + return tuple(value) + else: + return last + self.read_number() + # vim: tabstop=4 expandtab shiftwidth=4 diff -r 6dd92daca068 -r d308dc25f5a2 iixr/positions.py --- a/iixr/positions.py Sat Nov 20 23:56:16 2010 +0100 +++ b/iixr/positions.py Sun Nov 21 20:44:43 2010 +0100 @@ -27,7 +27,8 @@ "Writing position information to files." def reset(self): - self.last_docnum = 0 + self.last_docnum = None + self.docnum_size = None def write_positions(self, docnum, positions): @@ -35,6 +36,12 @@ Write for the document 'docnum' the given 'positions'. """ + # Find the size of document number values. + + if self.docnum_size is None: + self.docnum_size = self.get_value_size(docnum) + self.last_docnum = self.get_initial_value(self.docnum_size) + if docnum < self.last_docnum: raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) @@ -42,20 +49,27 @@ positions.sort() + # Find the size of position values. + + size = self.get_value_size(positions[0]) + + # Write the number of values per document number. # Write the document number delta. # Write the number of positions. + # Write the number of values per position. output = array('B') - vint_to_array(docnum - self.last_docnum, output) + vint_to_array(self.docnum_size, output) + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) vint_to_array(len(positions), output) + vint_to_array(size, output) # Write the position deltas. 
- last = 0 + last = self.get_initial_value(size) for position in positions: - vint_to_array(position - last, output) - last = position + last = self.write_sequence(output, position, last, size) output.tofile(self.f) @@ -66,29 +80,44 @@ "Reading position information within term-specific regions of a file." def reset(self): - self.last_docnum = 0 + self.last_docnum = None def read_positions(self): - "Read positions, returning a document number and a list of positions." + """ + Read positions, returning a document number and a list of positions. + """ + + # Read the number of values per document number. + + docnum_size = self.read_number() + + if self.last_docnum is None: + self.last_docnum = self.get_initial_value(docnum_size) # Read the document number delta and add it to the last number. - self.last_docnum += self.read_number() + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) # Read the number of positions. npositions = self.read_number() + # Read the number of values per position. + + size = self.read_number() + # Read the position deltas, adding each previous position to get the # appropriate collection of absolute positions. i = 0 - last = 0 + + last = self.get_initial_value(size) + positions = [] while i < npositions: - last += self.read_number() + last = self.read_sequence(last, size) positions.append(last) i += 1 @@ -99,7 +128,8 @@ "Writing position index information to files." def reset(self): - self.last_docnum = 0 + self.last_docnum = None + self.docnum_size = None self.last_pos_offset = 0 def write_positions(self, docnum, pos_offset, count): @@ -109,12 +139,20 @@ position index file. """ + # Find the size of document number values. + + if self.docnum_size is None: + self.docnum_size = self.get_value_size(docnum) + self.last_docnum = self.get_initial_value(self.docnum_size) + + # Write the number of values per document number. # Write the document number delta. # Write the position file offset delta. 
# Write the document count. output = array('B') - vint_to_array(docnum - self.last_docnum, output) + vint_to_array(self.docnum_size, output) + self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) vint_to_array(pos_offset - self.last_pos_offset, output) vint_to_array(count, output) @@ -123,14 +161,13 @@ output.tofile(self.f) self.last_pos_offset = pos_offset - self.last_docnum = docnum class PositionIndexReader(FileReader): "Reading position index information within term-specific regions of a file." def reset(self): - self.last_docnum = 0 + self.last_docnum = None self.last_pos_offset = 0 def read_positions(self): @@ -140,9 +177,16 @@ file, and the number of documents in a section of that file. """ - # Read the document number delta. + # Read the number of values per document number. + + docnum_size = self.read_number() - self.last_docnum += self.read_number() + if self.last_docnum is None: + self.last_docnum = self.get_initial_value(docnum_size) + + # Read the document number delta and add it to the last number. + + self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) # Read the offset delta. diff -r 6dd92daca068 -r d308dc25f5a2 iixr/terms.py --- a/iixr/terms.py Sat Nov 20 23:56:16 2010 +0100 +++ b/iixr/terms.py Sun Nov 21 20:44:43 2010 +0100 @@ -336,7 +336,7 @@ "Return a list of all terms." - return self.terms + return iter(self) def find_terms(self, term): diff -r 6dd92daca068 -r d308dc25f5a2 test.py --- a/test.py Sat Nov 20 23:56:16 2010 +0100 +++ b/test.py Sun Nov 21 20:44:43 2010 +0100 @@ -5,11 +5,12 @@ from iixr.terms import * from iixr.positions import * from iixr.index import * +from array import array import os, sys # Remove old test files. 
-for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"): +for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"): try: os.remove(filename) except OSError: @@ -43,6 +44,44 @@ print number == n, number, n r.close() +tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] + +f = open("testMS", "wb") +w = FileWriter(f) +b = array("B") +last = w.get_initial_value(2) +for t in tuples: + last = w.write_sequence(b, t, last, 2) +b.tofile(w.f) +w.close() + +f = open("testMS", "rb") +r = FileReader(f) +last = r.get_initial_value(2) +for t in tuples: + last = t2 = r.read_sequence(last, 2) + print t == t2, t, t2 +r.close() + +tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] + +f = open("testNMS", "wb") +w = FileWriter(f) +b = array("B") +last = w.get_initial_value(2) +for t in tuples2: + last = w.write_sequence(b, t, last, 2, monotonic=0) +b.tofile(w.f) +w.close() + +f = open("testNMS", "rb") +r = FileReader(f) +last = r.get_initial_value(2) +for t in tuples2: + last = t2 = r.read_sequence(last, 2, monotonic=0) + print t == t2, t, t2 +r.close() + print "- Test positions." 
all_doc_positions = [ @@ -77,6 +116,38 @@ r.reset() r.close() +all_doc_positions_seq = [ + [ + ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), + ((124, 1), [(0, 0), (100, 350)]), + ((124, 2), [(11, 38), (99, 379), (199, 720)]), + ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) + ], + [ + ((78, 1), [(9, 19)]), + ((196, 0), [(10, 27), (11, 29)]), + ((196, 1), [(17, 46), (21, 52), (30, 60)]) + ] + ] + +f = open("testP2", "wb") +w = PositionWriter(f) +for doc_positions in all_doc_positions_seq: + for docnum, positions in doc_positions: + w.write_positions(docnum, positions) + w.reset() +w.close() + +f = open("testP2", "rb") +r = PositionReader(f) +for doc_positions in all_doc_positions_seq: + for docnum, positions in doc_positions: + d, p = r.read_positions() + print tuple(docnum) == tuple(d), docnum, d + print tuple(positions) == tuple(p), positions, p + r.reset() +r.close() + print "- Test position index files." indexed_positions = [