1.1 --- a/iixr/phrases.py Tue Sep 22 20:32:24 2009 +0200
1.2 +++ b/iixr/phrases.py Sun Sep 27 23:03:19 2009 +0200
1.3 @@ -22,12 +22,12 @@
1.4 from itermerge import itermerge
1.5 from bisect import insort_right
1.6
1.7 -class PhraseIterator(itermerge):
1.8 +class CommonIterator(itermerge):
1.9
1.10 - "Iteration over many terms."
1.11 -
1.12 - def __init__(self, sequences):
1.13 - itermerge.__init__(self, sequences)
1.14 + """
1.15 + Iteration over many terms, driving the search for term co-occurrences using
1.16 + the least frequent term.
1.17 + """
1.18
1.19 def _add_seq(self, sequence, i):
1.20
1.21 @@ -36,17 +36,35 @@
1.22 insort_right(self.iters, (len(sequence), i, iter(sequence)))
1.23
1.24 def next(self):
1.25 +
1.26 + """
1.27 + Return a tuple containing a document identifier and a list of position
1.28 + lists for all terms.
1.29 + """
1.30 +
1.31 if self.iters:
1.32 while 1:
1.33 +
1.34 + # Get the next document for the lowest frequency term.
1.35 +
1.36 freq, i, it = self.iters[0]
1.37 doc, positions = it.next()
1.38 values = [(i, positions)]
1.39 +
1.40 + # Attempt to find the other terms in this document.
1.41 +
1.42 for freq, i, it in self.iters[1:]:
1.43 positions = it.from_document(doc)
1.44 - if positions is None:
1.45 +
1.46 +                    # Record this term's positions, keeping the values ordered by term index.
1.47 +
1.48 + if positions is not None:
1.49 + insort_right(values, (i, positions))
1.50 +
1.51 + # Otherwise, reject this document.
1.52 +
1.53 + else:
1.54 break
1.55 - else:
1.56 - insort_right(values, (i, positions))
1.57 else:
1.58 return doc, [positions for (i, positions) in values]
1.59 else:
1.60 @@ -58,4 +76,98 @@
1.61 it.close()
1.62 self.iters = []
1.63
1.64 +class PhraseIterator(CommonIterator):
1.65 +
1.66 + "Phrase iteration using the phrase filter."
1.67 +
1.68 + def __init__(self, sequences):
1.69 + CommonIterator.__init__(self, sequences)
1.70 + self.current_doc = None
1.71 + self.current_positions = None
1.72 +
1.73 + def next(self):
1.74 +
1.75 + """
1.76 + Return a tuple containing a document identifier and a list of term
1.77 + positions.
1.78 + """
1.79 +
1.80 + while 1:
1.81 + if self.current_doc is None:
1.82 + self.current_doc, all_positions = CommonIterator.next(self)
1.83 +
1.84 + # Handle incomplete phrases.
1.85 +
1.86 + try:
1.87 + self.current_positions = PhraseFilter(all_positions)
1.88 + except StopIteration:
1.89 + self.current_doc = None
1.90 + continue
1.91 +
1.92 + # Return new phrases.
1.93 +
1.94 + try:
1.95 + return self.current_doc, self.current_positions.next()
1.96 + except StopIteration:
1.97 + self.current_doc = None
1.98 +
1.99 +class PhraseFilter(itermerge):
1.100 +
1.101 + "Filter phrase suggestions according to position information."
1.102 +
1.103 + def _add_seq(self, sequence, i):
1.104 +
1.105 + "Store the details of the given 'sequence' at position 'i'."
1.106 +
1.107 + it = iter(sequence)
1.108 + self._add_next(it.next, i)
1.109 +
1.110 + def _add_next(self, next, i):
1.111 +
1.112 + """
1.113 + Store the current value for an iterator, alongside the means of
1.114 + getting the next value - the 'next' method - together with the
1.115 + iterator's position 'i'.
1.116 + """
1.117 +
1.118 +        # Deliberately let StopIteration propagate if the sequence is already empty.
1.119 +
1.120 + insort_right(self.iters, (next(), i, next))
1.121 +
1.122 + def next(self):
1.123 + if self.iters:
1.124 + while 1:
1.125 + current, first_token, next = self.iters[0]
1.126 + values = [current]
1.127 + last = current
1.128 + last_token = first_token
1.129 +
1.130 + # Find a sequence of positions providing a phrase.
1.131 +
1.132 + for current, current_token, _next in self.iters[1:]:
1.133 + if not self.is_phrase_position(last, last_token, current, current_token):
1.134 + break
1.135 + values.append(current)
1.136 + last = current
1.137 + last_token = current_token
1.138 + else:
1.139 + del self.iters[0]
1.140 +
1.141 +                    # If this sequence is exhausted, clear the iterators so the next call stops.
1.142 +
1.143 + try:
1.144 + self._add_next(next, first_token)
1.145 + except StopIteration:
1.146 + self.iters = []
1.147 +
1.148 + return values
1.149 +
1.150 + del self.iters[0]
1.151 + self._add_next(next, first_token)
1.152 + else:
1.153 + raise StopIteration
1.154 +
1.155 + def is_phrase_position(self, last, last_token, current, current_token):
1.156 + return current - last <= 1 and current_token > last_token
1.157 +
1.158 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/test.py Tue Sep 22 20:32:24 2009 +0200
2.2 +++ b/test.py Sun Sep 27 23:03:19 2009 +0200
2.3 @@ -416,9 +416,9 @@
2.4 ]
2.5
2.6 phrase_tests = [
2.7 - (["good", "boy"], [(2, [[1], [2]])]),
2.8 - (["good", "deserves"], [(2, [[1], [3]]), (13, [[1], [3]])]),
2.9 - (["sea", "shore"], [(36, [[2, 6], [7]])])
2.10 + (["good", "boy"], [(2, [1, 2])]),
2.11 + (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
2.12 + (["sea", "shore"], [(36, [6, 7])])
2.13 ]
2.14
2.15 index = Index("test_index")