Added support for phrase searching where document positions are specified as sequences of values, with the first value in each sequence being the token index/position. Added more tests in which document numbers and position values are specified as sequences.

     1.1 --- a/iixr/phrases.py	Mon Nov 22 23:50:03 2010 +0100
     1.2 +++ b/iixr/phrases.py	Tue Nov 23 00:22:52 2010 +0100
     1.3 @@ -168,6 +168,15 @@
     1.4              raise StopIteration
     1.5  
     1.6      def is_phrase_position(self, last, last_token, current, current_token):
     1.7 -        return current - last <= 1 and current_token > last_token
     1.8 +        if current_token <= last_token:
     1.9 +            return 0
    1.10 +
    1.11 +        # NOTE: For position sequences, assume that the first value is the token
    1.12 +        # NOTE: index/position.
    1.13 +
    1.14 +        if isinstance(last, (list, tuple)):
    1.15 +            return current[0] - last[0] <= 1
    1.16 +        else:
    1.17 +            return current - last <= 1
    1.18  
    1.19  # vim: tabstop=4 expandtab shiftwidth=4

     2.1 --- a/test.py	Mon Nov 22 23:50:03 2010 +0100
     2.2 +++ b/test.py	Tue Nov 23 00:22:52 2010 +0100
     2.3 @@ -17,7 +17,7 @@
     2.4          pass
     2.5  
     2.6  try:
     2.7 -    for dirname in ("test_index", "test_index2", "test_index3"):
     2.8 +    for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):
     2.9          for filename in os.listdir(dirname):
    2.10              os.remove(os.path.join(dirname, filename))
    2.11          os.rmdir(dirname)
    2.12 @@ -534,6 +534,78 @@
    2.13  
    2.14  index.close()
    2.15  
    2.16 +docs2 = [
    2.17 +    ((1, 0), "The cat sat on the mat"),
    2.18 +    ((1, 2), "Every good boy deserves football"),
    2.19 +    ((13, 1), "One good turn deserves another"),
    2.20 +    ((14, 0), "Every man for himself"),
    2.21 +    ((14, 25), "Red sky at night shepherd's delight"),
    2.22 +    ((36, 12), "She sells sea shells on the sea shore")
    2.23 +    ]
    2.24 +
    2.25 +doc_tests2 = [
    2.26 +    ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
    2.27 +    ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
    2.28 +    ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
    2.29 +    ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
    2.30 +    ]
    2.31 +
    2.32 +position_tests2 = [
    2.33 +    ("Every", (14, 0), [(0, 0)]),
    2.34 +    ("sea", (36, 12), [(2, 10), (6, 28)]),
    2.35 +    ("shells", (1, 0), None),
    2.36 +    ("shells", (37, 0), None)
    2.37 +    ]
    2.38 +
    2.39 +phrase_tests2 = [
    2.40 +    (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
    2.41 +    (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
    2.42 +    (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
    2.43 +    ]
    2.44 +
    2.45 +index = Index("test_indexT", 3, 2, 3, 6)
    2.46 +wi = index.get_writer()
    2.47 +for docnum, text in docs2:
    2.48 +    doc = Document(docnum)
    2.49 +    offset = 0
    2.50 +    for position, term in enumerate(text.split()):
    2.51 +        doc.add_position(term, (position, offset))
    2.52 +        offset += len(term) + 1 # assume one space after the term
    2.53 +    doc.add_field(123, text)
    2.54 +    wi.add_document(doc)
    2.55 +wi.close()
    2.56 +
    2.57 +rd = index.get_reader()
    2.58 +
    2.59 +print "- (Test searching.)"
    2.60 +
    2.61 +for term, frequency, doc_positions in doc_tests2:
    2.62 +    dp = list(rd.find_positions(term))
    2.63 +    print doc_positions == dp, doc_positions, dp
    2.64 +    fr = rd.get_frequency(term)
    2.65 +    print frequency == fr, frequency, fr
    2.66 +
    2.67 +print "- (Test fields.)"
    2.68 +
    2.69 +for docnum, text in docs2:
    2.70 +    df = dict(rd.get_fields(docnum))
    2.71 +    print df[123] == text, text, df[123]
    2.72 +
    2.73 +print "- (Test navigation.)"
    2.74 +
    2.75 +for term, docnum, positions in position_tests2:
    2.76 +    dp = rd.find_positions(term)
    2.77 +    pos = dp.from_document(docnum)
    2.78 +    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
    2.79 +
    2.80 +print "- (Test phrases.)"
    2.81 +
    2.82 +for terms, results in phrase_tests2:
    2.83 +    res = list(rd.find_common_positions(terms))
    2.84 +    print results == res, results, res
    2.85 +
    2.86 +index.close()
    2.87 +
    2.88  print "- Test index updates."
    2.89  
    2.90  index = Index("test_index")
2010-11-23	Paul Boddie	raw files shortlog changelog graph	Added support for phrase searching where document positions are specified as sequences of values, with the first value in each sequence being the token index/position. Added more tests in which document numbers and position values are specified as sequences.
			iixr/phrases.py (file) test.py (file)