# HG changeset patch
# User Paul Boddie
# Date 1254085399 -7200
# Node ID de111fdce60f8bfba452256fc4e10493c6493c16
# Parent 563be18fc4e26ebe91d7167c9652197636a9ee3d
Added proper phrase searching.

diff -r 563be18fc4e2 -r de111fdce60f iixr/phrases.py
--- a/iixr/phrases.py	Tue Sep 22 20:32:24 2009 +0200
+++ b/iixr/phrases.py	Sun Sep 27 23:03:19 2009 +0200
@@ -22,12 +22,12 @@
 from itermerge import itermerge
 from bisect import insort_right
 
-class PhraseIterator(itermerge):
+class CommonIterator(itermerge):
 
-    "Iteration over many terms."
-
-    def __init__(self, sequences):
-        itermerge.__init__(self, sequences)
+    """
+    Iteration over many terms, driving the search for term co-occurrences using
+    the least frequent term.
+    """
 
     def _add_seq(self, sequence, i):
 
@@ -36,17 +36,35 @@
         insort_right(self.iters, (len(sequence), i, iter(sequence)))
 
     def next(self):
+
+        """
+        Return a tuple containing a document identifier and a list of position
+        lists for all terms.
+        """
+
         if self.iters:
             while 1:
+
+                # Get the next document for the lowest frequency term.
+
                 freq, i, it = self.iters[0]
                 doc, positions = it.next()
                 values = [(i, positions)]
+
+                # Attempt to find the other terms in this document.
+
                 for freq, i, it in self.iters[1:]:
                     positions = it.from_document(doc)
-                    if positions is None:
+
+                    # Insert position details if appropriate.
+
+                    if positions is not None:
+                        insort_right(values, (i, positions))
+
+                    # Otherwise, reject this document.
+
+                    else:
                         break
-                    else:
-                        insort_right(values, (i, positions))
                 else:
                     return doc, [positions for (i, positions) in values]
         else:
@@ -58,4 +76,98 @@
             it.close()
         self.iters = []
 
+class PhraseIterator(CommonIterator):
+
+    "Phrase iteration using the phrase filter."
+
+    def __init__(self, sequences):
+        CommonIterator.__init__(self, sequences)
+        self.current_doc = None
+        self.current_positions = None
+
+    def next(self):
+
+        """
+        Return a tuple containing a document identifier and a list of term
+        positions.
+        """
+
+        while 1:
+            if self.current_doc is None:
+                self.current_doc, all_positions = CommonIterator.next(self)
+
+                # Handle incomplete phrases.
+
+                try:
+                    self.current_positions = PhraseFilter(all_positions)
+                except StopIteration:
+                    self.current_doc = None
+                    continue
+
+            # Return new phrases.
+
+            try:
+                return self.current_doc, self.current_positions.next()
+            except StopIteration:
+                self.current_doc = None
+
+class PhraseFilter(itermerge):
+
+    "Filter phrase suggestions according to position information."
+
+    def _add_seq(self, sequence, i):
+
+        "Store the details of the given 'sequence' at position 'i'."
+
+        it = iter(sequence)
+        self._add_next(it.next, i)
+
+    def _add_next(self, next, i):
+
+        """
+        Store the current value for an iterator, alongside the means of
+        getting the next value - the 'next' method - together with the
+        iterator's position 'i'.
+        """
+
+        # Allow StopIteration to be raised.
+
+        insort_right(self.iters, (next(), i, next))
+
+    def next(self):
+        if self.iters:
+            while 1:
+                current, first_token, next = self.iters[0]
+                values = [current]
+                last = current
+                last_token = first_token
+
+                # Find a sequence of positions providing a phrase.
+
+                for current, current_token, _next in self.iters[1:]:
+                    if not self.is_phrase_position(last, last_token, current, current_token):
+                        break
+                    values.append(current)
+                    last = current
+                    last_token = current_token
+                else:
+                    del self.iters[0]
+
+                    # Handle future end of iteration.
+
+                    try:
+                        self._add_next(next, first_token)
+                    except StopIteration:
+                        self.iters = []
+
+                    return values
+
+                del self.iters[0]
+                self._add_next(next, first_token)
+        else:
+            raise StopIteration
+
+    def is_phrase_position(self, last, last_token, current, current_token):
+        return current - last <= 1 and current_token > last_token
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 563be18fc4e2 -r de111fdce60f test.py
--- a/test.py	Tue Sep 22 20:32:24 2009 +0200
+++ b/test.py	Sun Sep 27 23:03:19 2009 +0200
@@ -416,9 +416,9 @@
     ]
 
 phrase_tests = [
-    (["good", "boy"], [(2, [[1], [2]])]),
-    (["good", "deserves"], [(2, [[1], [3]]), (13, [[1], [3]])]),
-    (["sea", "shore"], [(36, [[2, 6], [7]])])
+    (["good", "boy"], [(2, [1, 2])]),
+    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
+    (["sea", "shore"], [(36, [6, 7])])
     ]
 
 index = Index("test_index")
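The phrase matching added above comes down to the test in
PhraseFilter.is_phrase_position: successive positions must be adjacent
(current - last <= 1) and the terms must appear in query order
(current_token > last_token). Below is a minimal, self-contained sketch of
that rule, assuming plain in-memory position lists per term rather than the
streaming itermerge/from_document machinery used in iixr; find_phrases is a
hypothetical helper written only for illustration, not part of the library.

from bisect import insort_right

def find_phrases(position_lists):

    """
    Given one position list per query term for a single document, return the
    position sequences where the terms occur together as a phrase.
    """

    # Merge (position, term index) pairs so that candidates are visited in
    # document order, echoing the patch's use of insort_right.

    merged = []
    for token, positions in enumerate(position_lists):
        for position in positions:
            insort_right(merged, (position, token))

    phrases = []
    for start, (position, token) in enumerate(merged):

        # Only occurrences of the first query term can start a phrase.

        if token != 0:
            continue

        # Extend the candidate with adjacent, correctly ordered positions.

        values = [position]
        last, last_token = position, token
        for current, current_token in merged[start + 1:]:
            if current - last <= 1 and current_token > last_token:
                values.append(current)
                last, last_token = current, current_token
                if len(values) == len(position_lists):
                    phrases.append(values)
                    break
            elif current - last > 1:
                break

    return phrases

# From the updated phrase_tests: in document 36, "sea" occurs at positions
# 2 and 6 and "shore" at 7, giving the single phrase match [6, 7].

print(find_phrases([[2, 6], [7]]))    # [[6, 7]]
print(find_phrases([[1], [2]]))       # [[1, 2]], the "good boy" case

The real PhraseIterator additionally streams documents through CommonIterator,
which orders the terms by frequency so that the least frequent term drives the
search; the sketch only illustrates the per-document position filtering step.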