# HG changeset patch
# User Paul Boddie
# Date 1254085399 -7200
# Node ID de111fdce60f8bfba452256fc4e10493c6493c16
# Parent 563be18fc4e26ebe91d7167c9652197636a9ee3d
Added proper phrase searching.

diff -r 563be18fc4e2 -r de111fdce60f iixr/phrases.py
--- a/iixr/phrases.py	Tue Sep 22 20:32:24 2009 +0200
+++ b/iixr/phrases.py	Sun Sep 27 23:03:19 2009 +0200
@@ -22,12 +22,12 @@
 from itermerge import itermerge
 from bisect import insort_right
 
-class PhraseIterator(itermerge):
+class CommonIterator(itermerge):
 
-    "Iteration over many terms."
-
-    def __init__(self, sequences):
-        itermerge.__init__(self, sequences)
+    """
+    Iteration over many terms, driving the search for term co-occurrences using
+    the least frequent term.
+    """
 
     def _add_seq(self, sequence, i):
 
@@ -36,17 +36,35 @@
         insort_right(self.iters, (len(sequence), i, iter(sequence)))
 
     def next(self):
+
+        """
+        Return a tuple containing a document identifier and a list of position
+        lists for all terms.
+        """
+
         if self.iters:
             while 1:
+
+                # Get the next document for the lowest frequency term.
+
                 freq, i, it = self.iters[0]
                 doc, positions = it.next()
                 values = [(i, positions)]
+
+                # Attempt to find the other terms in this document.
+
                 for freq, i, it in self.iters[1:]:
                     positions = it.from_document(doc)
-                    if positions is None:
+
+                    # Insert position details if appropriate.
+
+                    if positions is not None:
+                        insort_right(values, (i, positions))
+
+                    # Otherwise, reject this document.
+
+                    else:
                         break
-                    else:
-                        insort_right(values, (i, positions))
                 else:
                     return doc, [positions for (i, positions) in values]
         else:
@@ -58,4 +76,98 @@
             it.close()
         self.iters = []
 
+class PhraseIterator(CommonIterator):
+
+    "Phrase iteration using the phrase filter."
+
+    def __init__(self, sequences):
+        CommonIterator.__init__(self, sequences)
+        self.current_doc = None
+        self.current_positions = None
+
+    def next(self):
+
+        """
+        Return a tuple containing a document identifier and a list of term
+        positions.
+        """
+
+        while 1:
+            if self.current_doc is None:
+                self.current_doc, all_positions = CommonIterator.next(self)
+
+                # Handle incomplete phrases.
+
+                try:
+                    self.current_positions = PhraseFilter(all_positions)
+                except StopIteration:
+                    self.current_doc = None
+                    continue
+
+            # Return new phrases.
+
+            try:
+                return self.current_doc, self.current_positions.next()
+            except StopIteration:
+                self.current_doc = None
+
+class PhraseFilter(itermerge):
+
+    "Filter phrase suggestions according to position information."
+
+    def _add_seq(self, sequence, i):
+
+        "Store the details of the given 'sequence' at position 'i'."
+
+        it = iter(sequence)
+        self._add_next(it.next, i)
+
+    def _add_next(self, next, i):
+
+        """
+        Store the current value for an iterator, alongside the means of
+        getting the next value - the 'next' method - together with the
+        iterator's position 'i'.
+        """
+
+        # Allow StopIteration to be raised.
+
+        insort_right(self.iters, (next(), i, next))
+
+    def next(self):
+        if self.iters:
+            while 1:
+                current, first_token, next = self.iters[0]
+                values = [current]
+                last = current
+                last_token = first_token
+
+                # Find a sequence of positions providing a phrase.
+
+                for current, current_token, _next in self.iters[1:]:
+                    if not self.is_phrase_position(last, last_token, current, current_token):
+                        break
+                    values.append(current)
+                    last = current
+                    last_token = current_token
+                else:
+                    del self.iters[0]
+
+                    # Handle future end of iteration.
+
+                    try:
+                        self._add_next(next, first_token)
+                    except StopIteration:
+                        self.iters = []
+
+                    return values
+
+                del self.iters[0]
+                self._add_next(next, first_token)
+        else:
+            raise StopIteration
+
+    def is_phrase_position(self, last, last_token, current, current_token):
+        return current - last <= 1 and current_token > last_token
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 563be18fc4e2 -r de111fdce60f test.py
--- a/test.py	Tue Sep 22 20:32:24 2009 +0200
+++ b/test.py	Sun Sep 27 23:03:19 2009 +0200
@@ -416,9 +416,9 @@
     ]
 
 phrase_tests = [
-    (["good", "boy"], [(2, [[1], [2]])]),
-    (["good", "deserves"], [(2, [[1], [3]]), (13, [[1], [3]])]),
-    (["sea", "shore"], [(36, [[2, 6], [7]])])
+    (["good", "boy"], [(2, [1, 2])]),
+    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
+    (["sea", "shore"], [(36, [6, 7])])
     ]
 
 index = Index("test_index")
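The phrase matching added above comes down to the test in
PhraseFilter.is_phrase_position: successive positions must be adjacent
(current - last <= 1) and the terms must appear in query order
(current_token > last_token). Below is a minimal, self-contained sketch of
that rule, assuming plain in-memory position lists per term rather than the
streaming itermerge/from_document machinery used in iixr; find_phrases is a
hypothetical helper written only for illustration, not part of the library.

from bisect import insort_right

def find_phrases(position_lists):

    """
    Given one position list per query term for a single document, return the
    position sequences where the terms occur together as a phrase.
    """

    # Merge (position, term index) pairs so that candidates are visited in
    # document order, echoing the patch's use of insort_right.

    merged = []
    for token, positions in enumerate(position_lists):
        for position in positions:
            insort_right(merged, (position, token))

    phrases = []
    for start, (position, token) in enumerate(merged):

        # Only occurrences of the first query term can start a phrase.

        if token != 0:
            continue

        # Extend the candidate with adjacent, correctly ordered positions.

        values = [position]
        last, last_token = position, token
        for current, current_token in merged[start + 1:]:
            if current - last <= 1 and current_token > last_token:
                values.append(current)
                last, last_token = current, current_token
                if len(values) == len(position_lists):
                    phrases.append(values)
                    break
            elif current - last > 1:
                break

    return phrases

# From the updated phrase_tests: in document 36, "sea" occurs at positions
# 2 and 6 and "shore" at 7, giving the single phrase match [6, 7].

print(find_phrases([[2, 6], [7]]))    # [[6, 7]]
print(find_phrases([[1], [2]]))       # [[1, 2]], the "good boy" case

The real PhraseIterator additionally streams documents through CommonIterator,
which orders the terms by frequency so that the least frequent term drives the
search; the sketch only illustrates the per-document position filtering step.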