# HG changeset patch
# User Paul Boddie
# Date 1253574493 -7200
# Node ID 6313e687d8682b352dfd55c644a0d209f35c4e5f
# Parent  eff15120152bff94b1d13c03d7a55124d0753206
Added elementary phrase searching support.

diff -r eff15120152b -r 6313e687d868 iixr/index.py
--- a/iixr/index.py	Sat Sep 19 21:42:55 2009 +0200
+++ b/iixr/index.py	Tue Sep 22 01:08:13 2009 +0200
@@ -180,6 +180,9 @@
     def find_positions(self, term):
         return self.dict_reader.find_positions(term)
 
+    def find_common_positions(self, term):
+        return self.dict_reader.find_common_positions(term)
+
     def get_frequency(self, term):
         return self.dict_reader.get_frequency(term)
 
diff -r eff15120152b -r 6313e687d868 iixr/phrases.py
--- /dev/null	Thu Jan 01 00:00:00 1970 +0000
+++ b/iixr/phrases.py	Tue Sep 22 01:08:13 2009 +0200
@@ -0,0 +1,56 @@
+#!/usr/bin/env python
+
+"""
+Phrase iterators providing navigation over common positions for a number of
+different terms.
+
+Copyright (C) 2009 Paul Boddie
+
+This program is free software; you can redistribute it and/or modify it under
+the terms of the GNU General Public License as published by the Free Software
+Foundation; either version 3 of the License, or (at your option) any later
+version.
+
+This program is distributed in the hope that it will be useful, but WITHOUT ANY
+WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
+PARTICULAR PURPOSE. See the GNU General Public License for more details.
+
+You should have received a copy of the GNU General Public License along
+with this program. If not, see <http://www.gnu.org/licenses/>.
+"""
+
+from itermerge import itermerge
+from bisect import insort_right
+
+class PhraseIterator(itermerge):
+
+    "Iteration over many terms."
+
+    def __init__(self, sequences):
+        itermerge.__init__(self, sequences)
+
+    def _add_iter(self, iterator, i):
+
+        "Store the details of the given 'iterator' at position 'i'."
+
+        insort_right(self.iters, (len(iterator), i, iterator))
+
+    def next(self):
+        if self.iters:
+            freq, i, it = self.iters[0]
+            while 1:
+                doc, positions = it.next()
+                values = [(i, positions)]
+                for freq, i, it in self.iters[1:]:
+                    positions = it.from_document(doc)
+                    if positions is None:
+                        break
+                    else:
+                        values.append((i, positions))
+                else:
+                    values.sort()
+                    return doc, [positions for (i, positions) in values]
+        else:
+            raise StopIteration
+
+# vim: tabstop=4 expandtab shiftwidth=4
diff -r eff15120152b -r 6313e687d868 iixr/positions.py
--- a/iixr/positions.py	Sat Sep 19 21:42:55 2009 +0200
+++ b/iixr/positions.py	Tue Sep 22 01:08:13 2009 +0200
@@ -238,7 +238,7 @@
             docnum, pos_offset, self.section_count = t = self.read_positions()
             return t
         else:
-            assert self.read_documents == self.count
+            #assert self.read_documents == self.count # not upheld by from_document
             raise StopIteration
 
 class PositionDictionaryWriter:
diff -r eff15120152b -r 6313e687d868 iixr/terms.py
--- a/iixr/terms.py	Sat Sep 19 21:42:55 2009 +0200
+++ b/iixr/terms.py	Tue Sep 22 01:08:13 2009 +0200
@@ -20,6 +20,7 @@
 
 from iixr.files import *
 from iixr.positions import *
+from iixr.phrases import PhraseIterator
 from os.path import commonprefix # to find common string prefixes
 from bisect import bisect_right # to find terms in the dictionary index
 
@@ -376,6 +377,15 @@
         offset, frequency, doc_frequency = t
         return self._get_positions(offset, doc_frequency)
 
+    def find_common_positions(self, terms):
+
+        """
+        Return the documents and positions at which all the given 'terms' are
+        found, where only common documents are returned.
+        """
+
+        return PhraseIterator([self.find_positions(term) for term in terms])
+
     def get_frequency(self, term):
 
         "Return the frequency of the given 'term'."
diff -r eff15120152b -r 6313e687d868 itermerge.py
--- a/itermerge.py	Sat Sep 19 21:42:55 2009 +0200
+++ b/itermerge.py	Tue Sep 22 01:08:13 2009 +0200
@@ -32,10 +32,16 @@
 
         # Prepare the underlying iterators.
 
-        for seq in sequences:
+        for i, seq in enumerate(sequences):
             it = iter(seq)
-            next = it.next
-            self._add_next(next)
+            self._add_iter(it, i)
+
+    def _add_iter(self, iterator, i):
+
+        "Store the details of the given 'iterator' at position 'i'."
+
+        next = iterator.next
+        self._add_next(next)
 
     def sort(self):
         pass # The output should be sorted.
diff -r eff15120152b -r 6313e687d868 test.py
--- a/test.py	Sat Sep 19 21:42:55 2009 +0200
+++ b/test.py	Tue Sep 22 01:08:13 2009 +0200
@@ -415,6 +415,12 @@
     ("shells", 37, None)
     ]
 
+phrase_tests = [
+    (["good", "boy"], [(2, [[1], [2]])]),
+    (["good", "deserves"], [(2, [[1], [3]]), (13, [[1], [3]])]),
+    (["sea", "shore"], [(36, [[2, 6], [7]])])
+    ]
+
 index = Index("test_index")
 wi = index.get_writer(3, 2, 6)
 for docnum, text in docs:
@@ -426,18 +432,34 @@
 wi.close()
 
 rd = index.get_reader()
+
+# (Test searching.)
+
 for term, frequency, doc_positions in doc_tests:
     dp = list(rd.find_positions(term))
     print doc_positions == dp, doc_positions, dp
     fr = rd.get_frequency(term)
     print frequency == fr, frequency, fr
+
+# (Test fields.)
+
 for docnum, text in docs:
     df = dict(rd.get_fields(docnum))
     print df[123] == text, text, df[123]
+
+# (Test navigation.)
+
 for term, docnum, positions in position_tests:
     dp = rd.find_positions(term)
     pos = dp.from_document(docnum)
     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
+
+# (Test phrases.)
+
+for terms, results in phrase_tests:
+    res = list(rd.find_common_positions(terms))
+    print results == res, results, res
+
 index.close()
 
 # Test index updates.