# HG changeset patch # User Paul Boddie # Date 1297724990 -3600 # Node ID 892f694ba76f6f04d54972d9acf9dd34f13f21c7 # Parent a0cec1caf0f0bb214ac78bc48fc542389b403fde Fixed the itermerge optimisation for single element collections of iterators. Removed the common iterator support in the terms module since the features are really only relevant for the combined iterator. Fixed the seeking and state resetting logic in the combined iterator, retaining the last returned values only in this class. Introduced usage of the combined iterator as index reader only where a single pair of files is present. diff -r a0cec1caf0f0 -r 892f694ba76f iixr/index.py --- a/iixr/index.py Mon Feb 14 03:06:16 2011 +0100 +++ b/iixr/index.py Tue Feb 15 00:09:50 2011 +0100 @@ -27,7 +27,7 @@ # Constants. FLUSH_INTERVAL = 10000 -INDEX_INTERVAL = 1000 +INDEX_INTERVAL = 100 OPEN_PARTITIONS = 20 # High-level classes. @@ -163,7 +163,12 @@ except IOError: readers = self._get_readers(get_term_reader) - self.reader = MultipleReader(readers) + # NOTE: Multiple reading of combined readers is not supported. + + if len(readers) == 1: + self.reader = readers[0] + else: + self.reader = MultipleReader(readers) return self.reader diff -r a0cec1caf0f0 -r 892f694ba76f iixr/terms.py --- a/iixr/terms.py Mon Feb 14 03:06:16 2011 +0100 +++ b/iixr/terms.py Tue Feb 15 00:09:50 2011 +0100 @@ -317,62 +317,30 @@ self.last_offset += self.read_number() return self.last_term, self.last_offset -# Iterator support classes. - -class Iterator: - - "Common iterator support." - - def __init__(self): - - "Cache the last term and positions." +# External reading classes. - self.last_term_returned = None - self.last_positions_returned = None +class TermIterator(TermReader): - def go_to_term(self, term): - if term == self.last_term_returned: - return self.last_term_returned, self.last_positions_returned - - t, dp = self.next() - while t < term: - t, dp = self.next() - self.last_term_returned, self.last_positions_returned = t, dp - return t, dp + "An iterator over terms and positions read from a file." def __iter__(self): return self def next(self): - self.last_term_returned, self.last_positions_returned = t = self._next() - return t - -# External reading classes. - -class TermIterator(TermReader, Iterator): - - "An iterator over terms and positions read from a file." - - def __init__(self, f): - TermReader.__init__(self, f) - Iterator.__init__(self) - - def _next(self): try: self.begin_record() return self.read_term() except EOFError: raise StopIteration -class TermDataIterator(TermReader, Iterator): +class TermDataIterator(TermReader): "An iterator over terms and unprocessed document positions data." - def __init__(self, f): - TermReader.__init__(self, f) - Iterator.__init__(self) + def __iter__(self): + return self - def _next(self): + def next(self): try: self.begin_record() return self.read_term_plus_remaining() @@ -393,24 +361,28 @@ except EOFError: raise StopIteration -class CombinedIterator(Iterator): +class CombinedIterator: "An iterator providing index and information file access." def __init__(self, reader, index_reader): - Iterator.__init__(self) self.reader = reader self.index_reader = index_reader self.records = list(index_reader) self.terms = [t for t, dp in self.records] + # Cache the last term and positions. + + self.last_term_returned = None + self.last_positions_returned = None + def go_to_term(self, term): """ Return the 'term' and positions or nearest following term and positions. """ - if self.last_term_returned == term: + if self.last_term_returned is not None and self.last_term_returned == term: return self.last_term_returned, self.last_positions_returned # Get the record providing a term less than or equal to the requested @@ -425,10 +397,17 @@ # Seek to the corresponding record in the information file. # Only do this if the term is more quickly reached by seeking. - if term <= t or self.last_term_returned is None or term <= self.last_term_returned or \ - self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned: + if term <= t or ( + self.last_term_returned is None or term < self.last_term_returned or + self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned + ): self.reader.seek(offset) + + # Skip the term information, overwrite the reader's state. + + self.reader.begin_record() + self.reader.read_term_only() self.reader.last_term = t # Where the found term is equal or greater, just read the positions for @@ -436,12 +415,7 @@ if t >= term: - # Skip the term information, overwrite the reader's state, and get - # the positions. - - self.reader.begin_record() - self.reader.read_term_only() - self.reader.last_term = t + # Get the positions. self.last_term_returned, self.last_positions_returned = t, self.reader.read_positions() return self.last_term_returned, self.last_positions_returned @@ -451,17 +425,19 @@ else: - # Overwrite the reader's state, then scan for the term. + # Scan for the term. - t, dp = self.reader.next() while t < term: - t, dp = self.reader.next() + t, dp = self.next() # remembers the term and positions - self.last_term_returned, self.last_positions_returned = t, dp return t, dp - def _next(self): - return self.reader.next() + def __iter__(self): + return self + + def next(self): + self.last_term_returned, self.last_positions_returned = record = self.reader.next() + return record def close(self): if self.reader is not None: @@ -501,23 +477,32 @@ return 0, 0 def go_to_term(self, term): + + # Refresh the iterators to provide sought values. + self.iters = [] for reader in self.readers: try: insort_right(self.iters, (reader.go_to_term(term), reader.next)) except StopIteration: pass + self.next_value = None return self.next() def next(self): + + # Return any prematurely read value. + if self.next_value is not None: term, positions = self.next_value + + # Otherwise, get the next value. + else: term, positions = itermerge.next(self) - # Look at the next item to see if it is has positions for the current - # term. + # Look at the next item to see if it has positions for the current term. try: t, p = itermerge.next(self) diff -r a0cec1caf0f0 -r 892f694ba76f itermerge.py --- a/itermerge.py Mon Feb 14 03:06:16 2011 +0100 +++ b/itermerge.py Tue Feb 15 00:09:50 2011 +0100 @@ -78,7 +78,10 @@ del self.iters[0] self._add_next(next) else: - self.iters[0] = next(), next + try: + self.iters[0] = next(), next + except StopIteration: + self.iters = [] return value else: raise StopIteration