1.1 --- a/iixr/index.py Sat Sep 19 21:42:55 2009 +0200
1.2 +++ b/iixr/index.py Tue Sep 22 01:08:13 2009 +0200
1.3 @@ -180,6 +180,9 @@
1.4 def find_positions(self, term):
1.5 return self.dict_reader.find_positions(term)
1.6
1.7 + def find_common_positions(self, term):
1.8 + return self.dict_reader.find_common_positions(term)
1.9 +
1.10 def get_frequency(self, term):
1.11 return self.dict_reader.get_frequency(term)
1.12
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/iixr/phrases.py Tue Sep 22 01:08:13 2009 +0200
2.3 @@ -0,0 +1,56 @@
2.4 +#!/usr/bin/env python
2.5 +
2.6 +"""
2.7 +Phrase iterators providing navigation over common positions for a number of
2.8 +different terms.
2.9 +
2.10 +Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
2.11 +
2.12 +This program is free software; you can redistribute it and/or modify it under
2.13 +the terms of the GNU General Public License as published by the Free Software
2.14 +Foundation; either version 3 of the License, or (at your option) any later
2.15 +version.
2.16 +
2.17 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
2.18 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
2.19 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
2.20 +
2.21 +You should have received a copy of the GNU General Public License along
2.22 +with this program. If not, see <http://www.gnu.org/licenses/>.
2.23 +"""
2.24 +
2.25 +from itermerge import itermerge
2.26 +from bisect import insort_right
2.27 +
2.28 +class PhraseIterator(itermerge):
2.29 +
2.30 + "Iteration over many terms."
2.31 +
2.32 + def __init__(self, sequences):
2.33 + itermerge.__init__(self, sequences)
2.34 +
2.35 + def _add_iter(self, iterator, i):
2.36 +
2.37 + "Store the details of the given 'iterator' at position 'i'."
2.38 +
2.39 + insort_right(self.iters, (len(iterator), i, iterator))
2.40 +
2.41 + def next(self):
2.42 + if self.iters:
2.43 + freq, i, it = self.iters[0]
2.44 + while 1:
2.45 + doc, positions = it.next()
2.46 + values = [(i, positions)]
2.47 +                for other_freq, other_i, other_it in self.iters[1:]:
2.48 +                    other_positions = other_it.from_document(doc)
2.49 +                    if other_positions is None:
2.50 +                        break
2.51 +                    else:
2.52 +                        values.append((other_i, other_positions))
2.53 +                else:
2.54 +                    values.sort()
2.55 +                    return doc, [p for (j, p) in values]
2.56 + else:
2.57 + raise StopIteration
2.58 +
2.59 +# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/iixr/positions.py Sat Sep 19 21:42:55 2009 +0200
3.2 +++ b/iixr/positions.py Tue Sep 22 01:08:13 2009 +0200
3.3 @@ -238,7 +238,7 @@
3.4 docnum, pos_offset, self.section_count = t = self.read_positions()
3.5 return t
3.6 else:
3.7 - assert self.read_documents == self.count
3.8 + #assert self.read_documents == self.count # not upheld by from_document
3.9 raise StopIteration
3.10
3.11 class PositionDictionaryWriter:
4.1 --- a/iixr/terms.py Sat Sep 19 21:42:55 2009 +0200
4.2 +++ b/iixr/terms.py Tue Sep 22 01:08:13 2009 +0200
4.3 @@ -20,6 +20,7 @@
4.4
4.5 from iixr.files import *
4.6 from iixr.positions import *
4.7 +from iixr.phrases import PhraseIterator
4.8 from os.path import commonprefix # to find common string prefixes
4.9 from bisect import bisect_right # to find terms in the dictionary index
4.10
4.11 @@ -376,6 +377,15 @@
4.12 offset, frequency, doc_frequency = t
4.13 return self._get_positions(offset, doc_frequency)
4.14
4.15 + def find_common_positions(self, terms):
4.16 +
4.17 + """
4.18 + Return the documents and positions at which all the given 'terms' are
4.19 + found, where only common documents are returned.
4.20 + """
4.21 +
4.22 + return PhraseIterator([self.find_positions(term) for term in terms])
4.23 +
4.24 def get_frequency(self, term):
4.25
4.26 "Return the frequency of the given 'term'."
5.1 --- a/itermerge.py Sat Sep 19 21:42:55 2009 +0200
5.2 +++ b/itermerge.py Tue Sep 22 01:08:13 2009 +0200
5.3 @@ -32,10 +32,16 @@
5.4
5.5 # Prepare the underlying iterators.
5.6
5.7 - for seq in sequences:
5.8 + for i, seq in enumerate(sequences):
5.9 it = iter(seq)
5.10 - next = it.next
5.11 - self._add_next(next)
5.12 + self._add_iter(it, i)
5.13 +
5.14 + def _add_iter(self, iterator, i):
5.15 +
5.16 + "Store the details of the given 'iterator' at position 'i'."
5.17 +
5.18 + next = iterator.next
5.19 + self._add_next(next)
5.20
5.21 def sort(self):
5.22 pass # The output should be sorted.
6.1 --- a/test.py Sat Sep 19 21:42:55 2009 +0200
6.2 +++ b/test.py Tue Sep 22 01:08:13 2009 +0200
6.3 @@ -415,6 +415,12 @@
6.4 ("shells", 37, None)
6.5 ]
6.6
6.7 +phrase_tests = [
6.8 + (["good", "boy"], [(2, [[1], [2]])]),
6.9 + (["good", "deserves"], [(2, [[1], [3]]), (13, [[1], [3]])]),
6.10 + (["sea", "shore"], [(36, [[2, 6], [7]])])
6.11 + ]
6.12 +
6.13 index = Index("test_index")
6.14 wi = index.get_writer(3, 2, 6)
6.15 for docnum, text in docs:
6.16 @@ -426,18 +432,34 @@
6.17 wi.close()
6.18
6.19 rd = index.get_reader()
6.20 +
6.21 +# (Test searching.)
6.22 +
6.23 for term, frequency, doc_positions in doc_tests:
6.24 dp = list(rd.find_positions(term))
6.25 print doc_positions == dp, doc_positions, dp
6.26 fr = rd.get_frequency(term)
6.27 print frequency == fr, frequency, fr
6.28 +
6.29 +# (Test fields.)
6.30 +
6.31 for docnum, text in docs:
6.32 df = dict(rd.get_fields(docnum))
6.33 print df[123] == text, text, df[123]
6.34 +
6.35 +# (Test navigation.)
6.36 +
6.37 for term, docnum, positions in position_tests:
6.38 dp = rd.find_positions(term)
6.39 pos = dp.from_document(docnum)
6.40 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
6.41 +
6.42 +# (Test phrases.)
6.43 +
6.44 +for terms, results in phrase_tests:
6.45 + res = list(rd.find_common_positions(terms))
6.46 + print results == res, results, res
6.47 +
6.48 index.close()
6.49
6.50 # Test index updates.