# HG changeset patch
# User Paul Boddie
# Date 1253054718 -7200
# Node ID 4ce5005d3fc786eb3c48ffe6daad13387ab42591
# Parent d8b4b6f1a5d94f184d0c18db624692b8ae1e1f90
Added iterator reuse for sequential term dictionary access, along with iterator
reuse within position dictionary iteration.
A special class, ResetPositionDictionaryIterator, permits the deferred
initialisation of iterators, thus preventing their premature reuse and the
subsequent loss of results waiting to be read.
Fixed imports.
Tidied PositionIndexIterator initialisation by moving section_count
initialisation into the reset method.

diff -r d8b4b6f1a5d9 -r 4ce5005d3fc7 iixr/index.py
--- a/iixr/index.py	Tue Sep 15 00:32:56 2009 +0200
+++ b/iixr/index.py	Wed Sep 16 00:45:18 2009 +0200
@@ -19,6 +19,7 @@
 """
 
 from iixr.filesystem import *
+from iixr.merging import *
 from os import listdir, mkdir # index and partition discovery
 from os.path import exists
 
diff -r d8b4b6f1a5d9 -r 4ce5005d3fc7 iixr/positions.py
--- a/iixr/positions.py	Tue Sep 15 00:32:56 2009 +0200
+++ b/iixr/positions.py	Wed Sep 16 00:45:18 2009 +0200
@@ -214,11 +214,11 @@
         FileReader.__init__(self, f)
         IteratorBase.__init__(self, count)
         self.seek(offset)
-        self.section_count = 0
 
     def reset(self):
         self.last_docnum = 0
         self.last_pos_offset = 0
+        self.section_count = 0
 
     def read_positions(self):
 
@@ -250,6 +250,7 @@
             docnum, pos_offset, self.section_count = t = self.read_positions()
             return t
         else:
+            assert self.read_documents == self.count
             raise StopIteration
 
 class PositionDictionaryWriter:
@@ -364,14 +365,34 @@
 
     def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
         self.position_opener = position_opener
+        self.position_index_opener = position_index_opener
         self.doc_frequency = doc_frequency
-        self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
+
+        self.index_iterator = None
         self.iterator = None
 
+        # Initialise the iterators.
+
+        self.reset(offset, doc_frequency)
+
+    def reset(self, offset, doc_frequency):
+
         # Remember the last values.
 
         self.found_docnum, self.found_positions = None, None
 
+        # Attempt to reuse the index iterator.
+
+        if self.index_iterator is not None:
+            self.index_iterator.replenish(doc_frequency)
+            self.index_iterator.seek(offset)
+            self.index_iterator.reset()
+
+        # Or make a new index iterator.
+
+        else:
+            self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency)
+
         # Maintain state for the next index entry, if read.
 
         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
@@ -510,9 +531,17 @@
 
         "Initialise the iterator for the section in the position file."
 
+        # Attempt to reuse any correctly positioned iterator.
+
         if self.iterator is not None:
-            self.iterator.close()
-        self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
+            self.iterator.replenish(self.section_count)
+            self.iterator.seek(self.pos_offset)
+            self.iterator.reset()
+
+        # Otherwise, obtain a new iterator.
+
+        else:
+            self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
 
     def close(self):
         if self.iterator is not None:
@@ -522,4 +551,20 @@
             self.index_iterator.close()
             self.index_iterator = None
 
+class ResetPositionDictionaryIterator:
+
+    """
+    A helper class which permits the reuse of iterators without modifying their
+    state.
+    """
+
+    def __init__(self, iterator, offset, doc_frequency):
+        self.iterator = iterator
+        self.offset = offset
+        self.doc_frequency = doc_frequency
+
+    def __iter__(self):
+        self.iterator.reset(self.offset, self.doc_frequency)
+        return iter(self.iterator)
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r d8b4b6f1a5d9 -r 4ce5005d3fc7 iixr/terms.py
--- a/iixr/terms.py	Tue Sep 15 00:32:56 2009 +0200
+++ b/iixr/terms.py	Wed Sep 16 00:45:18 2009 +0200
@@ -19,6 +19,7 @@
 """
 
 from iixr.files import *
+from iixr.positions import *
 from os.path import commonprefix # to find common string prefixes
 from bisect import bisect_right # to find terms in the dictionary index
 
@@ -208,6 +209,7 @@
         self.info_reader = info_reader
         self.index_reader = index_reader
         self.position_dict_reader = position_dict_reader
+        self.position_dict_iterator = None # for sequential/iterator access
 
         self.terms = []
         try:
@@ -322,8 +324,13 @@
         """
 
         term, offset, frequency, doc_frequency = self.info_reader.read_term()
-        positions = self._get_positions(offset, doc_frequency)
-        return term, frequency, doc_frequency, positions
+
+        # For sequential access, attempt to reuse any iterator.
+
+        if self.position_dict_iterator is None:
+            self.position_dict_iterator = self._get_positions(offset, doc_frequency)
+
+        return term, frequency, doc_frequency, ResetPositionDictionaryIterator(self.position_dict_iterator, offset, doc_frequency)
 
     # Query methods.
 