1.1 --- a/iixr/index.py Tue Sep 15 00:32:56 2009 +0200
1.2 +++ b/iixr/index.py Wed Sep 16 00:45:18 2009 +0200
1.3 @@ -19,6 +19,7 @@
1.4 """
1.5
1.6 from iixr.filesystem import *
1.7 +from iixr.merging import *
1.8 from os import listdir, mkdir # index and partition discovery
1.9 from os.path import exists
1.10
2.1 --- a/iixr/positions.py Tue Sep 15 00:32:56 2009 +0200
2.2 +++ b/iixr/positions.py Wed Sep 16 00:45:18 2009 +0200
2.3 @@ -214,11 +214,11 @@
2.4 FileReader.__init__(self, f)
2.5 IteratorBase.__init__(self, count)
2.6 self.seek(offset)
2.7 - self.section_count = 0
2.8
2.9 def reset(self):
2.10 self.last_docnum = 0
2.11 self.last_pos_offset = 0
2.12 + self.section_count = 0
2.13
2.14 def read_positions(self):
2.15
2.16 @@ -250,6 +250,7 @@
2.17 docnum, pos_offset, self.section_count = t = self.read_positions()
2.18 return t
2.19 else:
2.20 + assert self.read_documents == self.count
2.21 raise StopIteration
2.22
2.23 class PositionDictionaryWriter:
2.24 @@ -364,14 +365,34 @@
2.25
2.26 def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
2.27 self.position_opener = position_opener
2.28 + self.position_index_opener = position_index_opener
2.29 self.doc_frequency = doc_frequency
2.30 - self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
2.31 +
2.32 + self.index_iterator = None
2.33 self.iterator = None
2.34
2.35 + # Initialise the iterators.
2.36 +
2.37 + self.reset(offset, doc_frequency)
2.38 +
2.39 + def reset(self, offset, doc_frequency):
2.40 +
2.41 # Remember the last values.
2.42
2.43 self.found_docnum, self.found_positions = None, None
2.44
2.45 + # Attempt to reuse the index iterator.
2.46 +
2.47 + if self.index_iterator is not None:
2.48 + self.index_iterator.replenish(doc_frequency)
2.49 + self.index_iterator.seek(offset)
2.50 + self.index_iterator.reset()
2.51 +
2.52 + # Or make a new index iterator.
2.53 +
2.54 + else:
2.55 + self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency)
2.56 +
2.57 # Maintain state for the next index entry, if read.
2.58
2.59 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
2.60 @@ -510,9 +531,17 @@
2.61
2.62 "Initialise the iterator for the section in the position file."
2.63
2.64 + # Attempt to reuse any existing iterator, repositioning it for this section.
2.65 +
2.66 if self.iterator is not None:
2.67 - self.iterator.close()
2.68 - self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
2.69 + self.iterator.replenish(self.section_count)
2.70 + self.iterator.seek(self.pos_offset)
2.71 + self.iterator.reset()
2.72 +
2.73 + # Otherwise, obtain a new iterator.
2.74 +
2.75 + else:
2.76 + self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
2.77
2.78 def close(self):
2.79 if self.iterator is not None:
2.80 @@ -522,4 +551,20 @@
2.81 self.index_iterator.close()
2.82 self.index_iterator = None
2.83
2.84 +class ResetPositionDictionaryIterator:
2.85 +
2.86 + """
2.87 + A helper class which permits the reuse of an iterator by deferring its reset
2.88 + until iteration begins, leaving its state unmodified before then.
2.89 + """
2.90 +
2.91 + def __init__(self, iterator, offset, doc_frequency):
2.92 + self.iterator = iterator
2.93 + self.offset = offset
2.94 + self.doc_frequency = doc_frequency
2.95 +
2.96 + def __iter__(self):
2.97 + self.iterator.reset(self.offset, self.doc_frequency)
2.98 + return iter(self.iterator)
2.99 +
2.100 # vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/iixr/terms.py Tue Sep 15 00:32:56 2009 +0200
3.2 +++ b/iixr/terms.py Wed Sep 16 00:45:18 2009 +0200
3.3 @@ -19,6 +19,7 @@
3.4 """
3.5
3.6 from iixr.files import *
3.7 +from iixr.positions import *
3.8 from os.path import commonprefix # to find common string prefixes
3.9 from bisect import bisect_right # to find terms in the dictionary index
3.10
3.11 @@ -208,6 +209,7 @@
3.12 self.info_reader = info_reader
3.13 self.index_reader = index_reader
3.14 self.position_dict_reader = position_dict_reader
3.15 + self.position_dict_iterator = None # for sequential/iterator access
3.16
3.17 self.terms = []
3.18 try:
3.19 @@ -322,8 +324,13 @@
3.20 """
3.21
3.22 term, offset, frequency, doc_frequency = self.info_reader.read_term()
3.23 - positions = self._get_positions(offset, doc_frequency)
3.24 - return term, frequency, doc_frequency, positions
3.25 +
3.26 + # For sequential access, attempt to reuse any iterator.
3.27 +
3.28 + if self.position_dict_iterator is None:
3.29 + self.position_dict_iterator = self._get_positions(offset, doc_frequency)
3.30 +
3.31 + return term, frequency, doc_frequency, ResetPositionDictionaryIterator(self.position_dict_iterator, offset, doc_frequency)
3.32
3.33 # Query methods.
3.34