1.1 --- a/iixr.py Wed Aug 26 23:36:02 2009 +0200
1.2 +++ b/iixr.py Thu Aug 27 00:02:50 2009 +0200
1.3 @@ -413,7 +413,7 @@
1.4 self.max_offset = self.terms[-1][1]
1.5 self.max_info_offset = self.terms[-1][2]
1.6
1.7 - def find(self, term):
1.8 + def find_term(self, term):
1.9
1.10 "Find the position file offset of 'term' from the term dictionary."
1.11
1.12 @@ -453,7 +453,7 @@
1.13
1.14 "Return the documents and positions at which the given 'term' is found."
1.15
1.16 - offset = self.find(term)
1.17 + offset = self.find_term(term)
1.18 if offset is None:
1.19 return None
1.20 else:
1.21 @@ -464,4 +464,45 @@
1.22 self.index_reader.close()
1.23 self.position_reader.close()
1.24
class IndexWriter:

    "Building term information and writing it to the term dictionary."

    def __init__(self, dict_writer):

        """
        Initialise the writer with the given 'dict_writer', which receives the
        accumulated term information when this writer is closed.
        """

        self.dict_writer = dict_writer

        # Mapping of the form {term: {docnum: [position, ...]}}.

        self.terms = {}

    def add_position(self, term, docnum, position):

        """
        Add a position entry for the given 'term' in the document with the given
        'docnum', indicating the given 'position'.

        Positions are kept in the order in which they are added.
        """

        # Use setdefault instead of dict.has_key, which does not exist in
        # Python 3, creating the nested containers on first use.

        doc_positions = self.terms.setdefault(term, {})
        doc_positions.setdefault(docnum, []).append(position)

    def close(self):

        """
        Write each term, in sorted order, to the dictionary writer together
        with a list of (docnum, positions) pairs sorted by docnum, and then
        close the dictionary writer.
        """

        # Use sorted() instead of sorting the result of items() in place, since
        # items() only returns a sortable list in Python 2. The keys are unique,
        # so the ordering never depends on comparing the associated values.

        for term, doc_positions in sorted(self.terms.items()):
            ordered_positions = sorted(doc_positions.items())
            self.dict_writer.write_term_positions(term, ordered_positions)

        self.dict_writer.close()
1.65 +
1.66 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/test.py Wed Aug 26 23:36:02 2009 +0200
2.2 +++ b/test.py Thu Aug 27 00:02:50 2009 +0200
2.3 @@ -132,10 +132,10 @@
2.4 terms_reversed = terms[:]
2.5 terms_reversed.reverse()
2.6 for term, offset in terms_reversed:
2.7 - o = rd.find(term)
2.8 + o = rd.find_term(term)
2.9 print offset == o, offset, o
2.10 for term in ("dog", "dingo"):
2.11 - o = rd.find(term)
2.12 + o = rd.find_term(term)
2.13 print o is None, o
2.14 rd.close()
2.15
2.16 @@ -176,4 +176,45 @@
2.17 print dp is None, dp
2.18 rd.close()
2.19
2.20 +docs = [
2.21 + (1, "The cat sat on the mat"),
2.22 + (2, "Every good boy deserves football"),
2.23 + (13, "One good turn deserves another"),
2.24 + (14, "Every man for himself"),
2.25 + (25, "Red sky at night shepherd's delight"),
2.26 + (36, "She sells sea shells on the sea shore")
2.27 + ]
2.28 +
2.29 +doc_tests = [
2.30 + ("Every", [(2, [0]), (14, [0])]),
2.31 + ("good", [(2, [1]), (13, [1])]),
2.32 + ("deserves", [(2, [3]), (13, [3])]),
2.33 + ("sea", [(36, [2, 6])])
2.34 + ]
2.35 +
2.36 +f = open("test", "wb")
2.37 +w = iixr.TermWriter(f)
2.38 +f2 = open("testI", "wb")
2.39 +w2 = iixr.TermIndexWriter(f2)
2.40 +f3 = open("testP", "wb")
2.41 +w3 = iixr.PositionWriter(f3)
2.42 +wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
2.43 +wi = iixr.IndexWriter(wd)
2.44 +for docnum, text in docs:
2.45 + for position, term in enumerate(text.split()):
2.46 + wi.add_position(term, docnum, position)
2.47 +wi.close()
2.48 +
2.49 +f = open("test", "rb")
2.50 +r = iixr.TermReader(f)
2.51 +f2 = open("testI", "rb")
2.52 +r2 = iixr.TermIndexReader(f2)
2.53 +f3 = open("testP", "rb")
2.54 +r3 = iixr.PositionReader(f3)
2.55 +rd = iixr.TermDictionaryReader(r, r2, r3)
2.56 +for term, doc_positions in doc_tests:
2.57 + dp = rd.find_positions(term)
2.58 + print doc_positions == dp, doc_positions, dp
2.59 +rd.close()
2.60 +
2.61 # vim: tabstop=4 expandtab shiftwidth=4