1.1 --- a/iixr/index.py Sat Sep 19 21:42:55 2009 +0200
1.2 +++ b/iixr/index.py Tue Sep 22 01:08:13 2009 +0200
1.3 @@ -180,6 +180,9 @@
1.4 def find_positions(self, term):
1.5 return self.dict_reader.find_positions(term)
1.6
1.7 + def find_common_positions(self, term):
1.8 + return self.dict_reader.find_common_positions(term)
1.9 +
1.10 def get_frequency(self, term):
1.11 return self.dict_reader.get_frequency(term)
1.12
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/iixr/phrases.py Tue Sep 22 01:08:13 2009 +0200
2.3 @@ -0,0 +1,56 @@
2.4 +#!/usr/bin/env python
2.5 +
2.6 +"""
2.7 +Phrase iterators providing navigation over common positions for a number of
2.8 +different terms.
2.9 +
2.10 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
2.11 +
2.12 +This program is free software; you can redistribute it and/or modify it under
2.13 +the terms of the GNU General Public License as published by the Free Software
2.14 +Foundation; either version 3 of the License, or (at your option) any later
2.15 +version.
2.16 +
2.17 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
2.18 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
2.19 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
2.20 +
2.21 +You should have received a copy of the GNU General Public License along
2.22 +with this program. If not, see <http://www.gnu.org/licenses/>.
2.23 +"""
2.24 +
2.25 +from itermerge import itermerge
2.26 +from bisect import insort_right
2.27 +
2.28 +class PhraseIterator(itermerge):
2.29 +
2.30 + "Iteration over many terms."
2.31 +
2.32 + def __init__(self, sequences):
2.33 + itermerge.__init__(self, sequences)
2.34 +
2.35 + def _add_iter(self, iterator, i):
2.36 +
2.37 + "Store the details of the given 'iterator' at position 'i'."
2.38 +
2.39 + insort_right(self.iters, (len(iterator), i, iterator))
2.40 +
2.41 + def next(self):
2.42 + if self.iters:
2.43 + freq, i, it = self.iters[0]
2.44 + while 1:
2.45 + doc, positions = it.next()
2.46 + values = [(i, positions)]
2.47 +                for other_freq, other_i, other_it in self.iters[1:]:
2.48 +                    other_positions = other_it.from_document(doc)
2.49 +                    if other_positions is None:
2.50 +                        break
2.51 +                    else:
2.52 +                        values.append((other_i, other_positions))
2.53 +                else:
2.54 +                    values.sort()
2.55 +                    return doc, [p for (j, p) in values]
2.56 + else:
2.57 + raise StopIteration
2.58 +
2.59 +# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/iixr/positions.py Sat Sep 19 21:42:55 2009 +0200
3.2 +++ b/iixr/positions.py Tue Sep 22 01:08:13 2009 +0200
3.3 @@ -238,7 +238,7 @@
3.4 docnum, pos_offset, self.section_count = t = self.read_positions()
3.5 return t
3.6 else:
3.7 - assert self.read_documents == self.count
3.8 + #assert self.read_documents == self.count # not upheld by from_document
3.9 raise StopIteration
3.10
3.11 class PositionDictionaryWriter:
4.1 --- a/iixr/terms.py Sat Sep 19 21:42:55 2009 +0200
4.2 +++ b/iixr/terms.py Tue Sep 22 01:08:13 2009 +0200
4.3 @@ -20,6 +20,7 @@
4.4
4.5 from iixr.files import *
4.6 from iixr.positions import *
4.7 +from iixr.phrases import PhraseIterator
4.8 from os.path import commonprefix # to find common string prefixes
4.9 from bisect import bisect_right # to find terms in the dictionary index
4.10
4.11 @@ -376,6 +377,15 @@
4.12 offset, frequency, doc_frequency = t
4.13 return self._get_positions(offset, doc_frequency)
4.14
4.15 + def find_common_positions(self, terms):
4.16 +
4.17 + """
4.18 + Return the documents and positions at which all the given 'terms' are
4.19 + found, where only common documents are returned.
4.20 + """
4.21 +
4.22 + return PhraseIterator([self.find_positions(term) for term in terms])
4.23 +
4.24 def get_frequency(self, term):
4.25
4.26 "Return the frequency of the given 'term'."
5.1 --- a/itermerge.py Sat Sep 19 21:42:55 2009 +0200
5.2 +++ b/itermerge.py Tue Sep 22 01:08:13 2009 +0200
5.3 @@ -32,10 +32,16 @@
5.4
5.5 # Prepare the underlying iterators.
5.6
5.7 - for seq in sequences:
5.8 + for i, seq in enumerate(sequences):
5.9 it = iter(seq)
5.10 - next = it.next
5.11 - self._add_next(next)
5.12 + self._add_iter(it, i)
5.13 +
5.14 + def _add_iter(self, iterator, i):
5.15 +
5.16 + "Store the details of the given 'iterator' at position 'i'."
5.17 +
5.18 + next = iterator.next
5.19 + self._add_next(next)
5.20
5.21 def sort(self):
5.22 pass # The output should be sorted.
6.1 --- a/test.py Sat Sep 19 21:42:55 2009 +0200
6.2 +++ b/test.py Tue Sep 22 01:08:13 2009 +0200
6.3 @@ -415,6 +415,12 @@
6.4 ("shells", 37, None)
6.5 ]
6.6
6.7 +phrase_tests = [
6.8 + (["good", "boy"], [(2, [[1], [2]])]),
6.9 + (["good", "deserves"], [(2, [[1], [3]]), (13, [[1], [3]])]),
6.10 + (["sea", "shore"], [(36, [[2, 6], [7]])])
6.11 + ]
6.12 +
6.13 index = Index("test_index")
6.14 wi = index.get_writer(3, 2, 6)
6.15 for docnum, text in docs:
6.16 @@ -426,18 +432,34 @@
6.17 wi.close()
6.18
6.19 rd = index.get_reader()
6.20 +
6.21 +# (Test searching.)
6.22 +
6.23 for term, frequency, doc_positions in doc_tests:
6.24 dp = list(rd.find_positions(term))
6.25 print doc_positions == dp, doc_positions, dp
6.26 fr = rd.get_frequency(term)
6.27 print frequency == fr, frequency, fr
6.28 +
6.29 +# (Test fields.)
6.30 +
6.31 for docnum, text in docs:
6.32 df = dict(rd.get_fields(docnum))
6.33 print df[123] == text, text, df[123]
6.34 +
6.35 +# (Test navigation.)
6.36 +
6.37 for term, docnum, positions in position_tests:
6.38 dp = rd.find_positions(term)
6.39 pos = dp.from_document(docnum)
6.40 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
6.41 +
6.42 +# (Test phrases.)
6.43 +
6.44 +for terms, results in phrase_tests:
6.45 + res = list(rd.find_common_positions(terms))
6.46 + print results == res, results, res
6.47 +
6.48 index.close()
6.49
6.50 # Test index updates.