# HG changeset patch
# User Paul Boddie
# Date 1290468172 -3600
# Node ID 7e79dd580a629c70c12fac3f39304098745ccfe0
# Parent  f1cbbf5ef885fac3039b6076cb7eb52ee3626d4d
Added support for phrase searching where document positions are specified
using sequences of values, with the first value in each sequence being the
token index/position.
Added more tests of document numbers and position values being specified
using sequences.

diff -r f1cbbf5ef885 -r 7e79dd580a62 iixr/phrases.py
--- a/iixr/phrases.py	Mon Nov 22 23:50:03 2010 +0100
+++ b/iixr/phrases.py	Tue Nov 23 00:22:52 2010 +0100
@@ -168,6 +168,15 @@
         raise StopIteration

     def is_phrase_position(self, last, last_token, current, current_token):
-        return current - last <= 1 and current_token > last_token
+        if current_token <= last_token:
+            return 0
+
+        # NOTE: For position sequences, assume that the first value is the token
+        # NOTE: index/position.
+
+        if isinstance(last, (list, tuple)):
+            return current[0] - last[0] <= 1
+        else:
+            return current - last <= 1

 # vim: tabstop=4 expandtab shiftwidth=4
diff -r f1cbbf5ef885 -r 7e79dd580a62 test.py
--- a/test.py	Mon Nov 22 23:50:03 2010 +0100
+++ b/test.py	Tue Nov 23 00:22:52 2010 +0100
@@ -17,7 +17,7 @@
     pass

 try:
-    for dirname in ("test_index", "test_index2", "test_index3"):
+    for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):
         for filename in os.listdir(dirname):
             os.remove(os.path.join(dirname, filename))
         os.rmdir(dirname)
@@ -534,6 +534,78 @@

 index.close()

+docs2 = [
+    ((1, 0), "The cat sat on the mat"),
+    ((1, 2), "Every good boy deserves football"),
+    ((13, 1), "One good turn deserves another"),
+    ((14, 0), "Every man for himself"),
+    ((14, 25), "Red sky at night shepherd's delight"),
+    ((36, 12), "She sells sea shells on the sea shore")
+    ]
+
+doc_tests2 = [
+    ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
+    ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
+    ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
+    ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
+    ]
+
+position_tests2 = [
+    ("Every", (14, 0), [(0, 0)]),
+    ("sea", (36, 12), [(2, 10), (6, 28)]),
+    ("shells", (1, 0), None),
+    ("shells", (37, 0), None)
+    ]
+
+phrase_tests2 = [
+    (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
+    (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
+    (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
+    ]
+
+index = Index("test_indexT", 3, 2, 3, 6)
+wi = index.get_writer()
+for docnum, text in docs2:
+    doc = Document(docnum)
+    offset = 0
+    for position, term in enumerate(text.split()):
+        doc.add_position(term, (position, offset))
+        offset += len(term) + 1 # assume one space after the term
+    doc.add_field(123, text)
+    wi.add_document(doc)
+wi.close()
+
+rd = index.get_reader()
+
+print "- (Test searching.)"
+
+for term, frequency, doc_positions in doc_tests2:
+    dp = list(rd.find_positions(term))
+    print doc_positions == dp, doc_positions, dp
+    fr = rd.get_frequency(term)
+    print frequency == fr, frequency, fr
+
+print "- (Test fields.)"
+
+for docnum, text in docs2:
+    df = dict(rd.get_fields(docnum))
+    print df[123] == text, text, df[123]
+
+print "- (Test navigation.)"
+
+for term, docnum, positions in position_tests2:
+    dp = rd.find_positions(term)
+    pos = dp.from_document(docnum)
+    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
+
+print "- (Test phrases.)"
+
+for terms, results in phrase_tests2:
+    res = list(rd.find_common_positions(terms))
+    print results == res, results, res
+
+index.close()
+
 print "- Test index updates."

 index = Index("test_index")
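
For reference, here is a minimal standalone sketch of the adjacency check that
the patch introduces in is_phrase_position. It is not part of the patch: the
logic and the example values are taken from the diff and the new test data
above, but it is framed as a free function (and returns False rather than 0)
purely for illustration. Positions may be plain integers or sequences whose
first value is the token index/position; two positions continue a phrase when
the query tokens appear in increasing order and the token positions are at
most one apart.

# Standalone illustration of the adjacency test sketched above; it only
# mirrors the logic shown in the diff and is not the iixr implementation.

def is_phrase_position(last, last_token, current, current_token):
    # Query tokens must appear in strictly increasing order within the phrase.
    if current_token <= last_token:
        return False

    # For position sequences, the first value is taken to be the token
    # index/position; otherwise the position is a plain integer.
    if isinstance(last, (list, tuple)):
        return current[0] - last[0] <= 1
    else:
        return current - last <= 1

# Plain integer positions: token at position 2 directly follows position 1.
assert is_phrase_position(1, 0, 2, 1)

# Sequence positions of the form (token index, offset), as in the new tests:
# "sea" at (6, 28) followed by "shore" at (7, 32).
assert is_phrase_position((6, 28), 0, (7, 32), 1)

# Out-of-order query tokens are rejected regardless of the position values.
assert not is_phrase_position(5, 1, 6, 0)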