1.1 --- a/iixr/terms.py Mon Feb 14 03:06:16 2011 +0100
1.2 +++ b/iixr/terms.py Tue Feb 15 00:09:50 2011 +0100
1.3 @@ -317,62 +317,30 @@
1.4 self.last_offset += self.read_number()
1.5 return self.last_term, self.last_offset
1.6
1.7 -# Iterator support classes.
1.8 -
1.9 -class Iterator:
1.10 -
1.11 - "Common iterator support."
1.12 -
1.13 - def __init__(self):
1.14 -
1.15 - "Cache the last term and positions."
1.16 +# External reading classes.
1.17
1.18 - self.last_term_returned = None
1.19 - self.last_positions_returned = None
1.20 +class TermIterator(TermReader):
1.21
1.22 - def go_to_term(self, term):
1.23 - if term == self.last_term_returned:
1.24 - return self.last_term_returned, self.last_positions_returned
1.25 -
1.26 - t, dp = self.next()
1.27 - while t < term:
1.28 - t, dp = self.next()
1.29 - self.last_term_returned, self.last_positions_returned = t, dp
1.30 - return t, dp
1.31 + "An iterator over terms and positions read from a file."
1.32
1.33 def __iter__(self):
1.34 return self
1.35
1.36 def next(self):
1.37 - self.last_term_returned, self.last_positions_returned = t = self._next()
1.38 - return t
1.39 -
1.40 -# External reading classes.
1.41 -
1.42 -class TermIterator(TermReader, Iterator):
1.43 -
1.44 - "An iterator over terms and positions read from a file."
1.45 -
1.46 - def __init__(self, f):
1.47 - TermReader.__init__(self, f)
1.48 - Iterator.__init__(self)
1.49 -
1.50 - def _next(self):
1.51 try:
1.52 self.begin_record()
1.53 return self.read_term()
1.54 except EOFError:
1.55 raise StopIteration
1.56
1.57 -class TermDataIterator(TermReader, Iterator):
1.58 +class TermDataIterator(TermReader):
1.59
1.60 "An iterator over terms and unprocessed document positions data."
1.61
1.62 - def __init__(self, f):
1.63 - TermReader.__init__(self, f)
1.64 - Iterator.__init__(self)
1.65 + def __iter__(self):
1.66 + return self
1.67
1.68 - def _next(self):
1.69 + def next(self):
1.70 try:
1.71 self.begin_record()
1.72 return self.read_term_plus_remaining()
1.73 @@ -393,24 +361,28 @@
1.74 except EOFError:
1.75 raise StopIteration
1.76
1.77 -class CombinedIterator(Iterator):
1.78 +class CombinedIterator:
1.79
1.80 "An iterator providing index and information file access."
1.81
1.82 def __init__(self, reader, index_reader):
1.83 - Iterator.__init__(self)
1.84 self.reader = reader
1.85 self.index_reader = index_reader
1.86 self.records = list(index_reader)
1.87 self.terms = [t for t, dp in self.records]
1.88
1.89 + # Cache the last term and positions.
1.90 +
1.91 + self.last_term_returned = None
1.92 + self.last_positions_returned = None
1.93 +
1.94 def go_to_term(self, term):
1.95
1.96 """
1.97 Return the 'term' and positions or nearest following term and positions.
1.98 """
1.99
1.100 - if self.last_term_returned == term:
1.101 + if self.last_term_returned is not None and self.last_term_returned == term:
1.102 return self.last_term_returned, self.last_positions_returned
1.103
1.104 # Get the record providing a term less than or equal to the requested
1.105 @@ -425,10 +397,17 @@
1.106 # Seek to the corresponding record in the information file.
1.107 # Only do this if the term is more quickly reached by seeking.
1.108
1.109 - if term <= t or self.last_term_returned is None or term <= self.last_term_returned or \
1.110 - self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned:
1.111 + if term <= t or (
1.112 + self.last_term_returned is None or term < self.last_term_returned or
1.113 + self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned
1.114 + ):
1.115
1.116 self.reader.seek(offset)
1.117 +
1.118 + # Skip the term information, overwrite the reader's state.
1.119 +
1.120 + self.reader.begin_record()
1.121 + self.reader.read_term_only()
1.122 self.reader.last_term = t
1.123
1.124 # Where the found term is equal or greater, just read the positions for
1.125 @@ -436,12 +415,7 @@
1.126
1.127 if t >= term:
1.128
1.129 - # Skip the term information, overwrite the reader's state, and get
1.130 - # the positions.
1.131 -
1.132 - self.reader.begin_record()
1.133 - self.reader.read_term_only()
1.134 - self.reader.last_term = t
1.135 + # Get the positions.
1.136
1.137 self.last_term_returned, self.last_positions_returned = t, self.reader.read_positions()
1.138 return self.last_term_returned, self.last_positions_returned
1.139 @@ -451,17 +425,19 @@
1.140
1.141 else:
1.142
1.143 - # Overwrite the reader's state, then scan for the term.
1.144 + # Scan for the term.
1.145
1.146 - t, dp = self.reader.next()
1.147 while t < term:
1.148 - t, dp = self.reader.next()
1.149 + t, dp = self.next() # remembers the term and positions
1.150
1.151 - self.last_term_returned, self.last_positions_returned = t, dp
1.152 return t, dp
1.153
1.154 - def _next(self):
1.155 - return self.reader.next()
1.156 + def __iter__(self):
1.157 + return self
1.158 +
1.159 + def next(self):
1.160 + self.last_term_returned, self.last_positions_returned = record = self.reader.next()
1.161 + return record
1.162
1.163 def close(self):
1.164 if self.reader is not None:
1.165 @@ -501,23 +477,32 @@
1.166 return 0, 0
1.167
1.168 def go_to_term(self, term):
1.169 +
1.170 + # Refresh the iterators to provide sought values.
1.171 +
1.172 self.iters = []
1.173 for reader in self.readers:
1.174 try:
1.175 insort_right(self.iters, (reader.go_to_term(term), reader.next))
1.176 except StopIteration:
1.177 pass
1.178 +
1.179 self.next_value = None
1.180 return self.next()
1.181
1.182 def next(self):
1.183 +
1.184 + # Return any prematurely read value.
1.185 +
1.186 if self.next_value is not None:
1.187 term, positions = self.next_value
1.188 +
1.189 + # Otherwise, get the next value.
1.190 +
1.191 else:
1.192 term, positions = itermerge.next(self)
1.193
1.194 - # Look at the next item to see if it is has positions for the current
1.195 - # term.
1.196 + # Look at the next item to see if it has positions for the current term.
1.197
1.198 try:
1.199 t, p = itermerge.next(self)