1.1 --- a/iixr/phrases.py Tue Sep 22 20:32:24 2009 +0200
1.2 +++ b/iixr/phrases.py Sun Sep 27 23:03:19 2009 +0200
1.3 @@ -22,12 +22,12 @@
1.4 from itermerge import itermerge
1.5 from bisect import insort_right
1.6
1.7 -class PhraseIterator(itermerge):
1.8 +class CommonIterator(itermerge):
1.9
1.10 - "Iteration over many terms."
1.11 -
1.12 - def __init__(self, sequences):
1.13 - itermerge.__init__(self, sequences)
1.14 + """
1.15 + Iteration over many terms, driving the search for term co-occurrences using
1.16 + the least frequent term.
1.17 + """
1.18
1.19 def _add_seq(self, sequence, i):
1.20
1.21 @@ -36,17 +36,35 @@
1.22 insort_right(self.iters, (len(sequence), i, iter(sequence)))
1.23
1.24 def next(self):
1.25 +
1.26 + """
1.27 + Return a tuple containing a document identifier and a list of position
1.28 + lists for all terms.
1.29 + """
1.30 +
1.31 if self.iters:
1.32 while 1:
1.33 +
1.34 + # Get the next document for the lowest frequency term.
1.35 +
1.36 freq, i, it = self.iters[0]
1.37 doc, positions = it.next()
1.38 values = [(i, positions)]
1.39 +
1.40 + # Attempt to find the other terms in this document.
1.41 +
1.42 for freq, i, it in self.iters[1:]:
1.43 positions = it.from_document(doc)
1.44 - if positions is None:
1.45 +
1.46 +                    # Record this term's positions, keeping the values ordered by term index.
1.47 +
1.48 + if positions is not None:
1.49 + insort_right(values, (i, positions))
1.50 +
1.51 + # Otherwise, reject this document.
1.52 +
1.53 + else:
1.54 break
1.55 - else:
1.56 - insort_right(values, (i, positions))
1.57 else:
1.58 return doc, [positions for (i, positions) in values]
1.59 else:
1.60 @@ -58,4 +76,98 @@
1.61 it.close()
1.62 self.iters = []
1.63
1.64 +class PhraseIterator(CommonIterator):
1.65 +
1.66 + "Phrase iteration using the phrase filter."
1.67 +
1.68 + def __init__(self, sequences):
1.69 + CommonIterator.__init__(self, sequences)
1.70 + self.current_doc = None
1.71 + self.current_positions = None
1.72 +
1.73 + def next(self):
1.74 +
1.75 + """
1.76 + Return a tuple containing a document identifier and a list of term
1.77 + positions.
1.78 + """
1.79 +
1.80 + while 1:
1.81 + if self.current_doc is None:
1.82 + self.current_doc, all_positions = CommonIterator.next(self)
1.83 +
1.84 + # Handle incomplete phrases.
1.85 +
1.86 + try:
1.87 + self.current_positions = PhraseFilter(all_positions)
1.88 + except StopIteration:
1.89 + self.current_doc = None
1.90 + continue
1.91 +
1.92 + # Return new phrases.
1.93 +
1.94 + try:
1.95 + return self.current_doc, self.current_positions.next()
1.96 + except StopIteration:
1.97 + self.current_doc = None
1.98 +
1.99 +class PhraseFilter(itermerge):
1.100 +
1.101 + "Filter phrase suggestions according to position information."
1.102 +
1.103 + def _add_seq(self, sequence, i):
1.104 +
1.105 + "Store the details of the given 'sequence' at position 'i'."
1.106 +
1.107 + it = iter(sequence)
1.108 + self._add_next(it.next, i)
1.109 +
1.110 + def _add_next(self, next, i):
1.111 +
1.112 + """
1.113 + Store the current value for an iterator, alongside the means of
1.114 + getting the next value - the 'next' method - together with the
1.115 + iterator's position 'i'.
1.116 + """
1.117 +
1.118 +        # Deliberately let StopIteration propagate if the sequence is already empty.
1.119 +
1.120 + insort_right(self.iters, (next(), i, next))
1.121 +
1.122 + def next(self):
1.123 + if self.iters:
1.124 + while 1:
1.125 + current, first_token, next = self.iters[0]
1.126 + values = [current]
1.127 + last = current
1.128 + last_token = first_token
1.129 +
1.130 + # Find a sequence of positions providing a phrase.
1.131 +
1.132 + for current, current_token, _next in self.iters[1:]:
1.133 + if not self.is_phrase_position(last, last_token, current, current_token):
1.134 + break
1.135 + values.append(current)
1.136 + last = current
1.137 + last_token = current_token
1.138 + else:
1.139 + del self.iters[0]
1.140 +
1.141 +                    # If this sequence is exhausted, clear the iterators so the next call stops.
1.142 +
1.143 + try:
1.144 + self._add_next(next, first_token)
1.145 + except StopIteration:
1.146 + self.iters = []
1.147 +
1.148 + return values
1.149 +
1.150 + del self.iters[0]
1.151 + self._add_next(next, first_token)
1.152 + else:
1.153 + raise StopIteration
1.154 +
1.155 + def is_phrase_position(self, last, last_token, current, current_token):
1.156 + return current - last <= 1 and current_token > last_token
1.157 +
1.158 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/test.py Tue Sep 22 20:32:24 2009 +0200
2.2 +++ b/test.py Sun Sep 27 23:03:19 2009 +0200
2.3 @@ -416,9 +416,9 @@
2.4 ]
2.5
2.6 phrase_tests = [
2.7 - (["good", "boy"], [(2, [[1], [2]])]),
2.8 - (["good", "deserves"], [(2, [[1], [3]]), (13, [[1], [3]])]),
2.9 - (["sea", "shore"], [(36, [[2, 6], [7]])])
2.10 + (["good", "boy"], [(2, [1, 2])]),
2.11 + (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
2.12 + (["sea", "shore"], [(36, [6, 7])])
2.13 ]
2.14
2.15 index = Index("test_index")