1.1 --- a/iixr/index.py Mon Feb 14 03:06:16 2011 +0100
1.2 +++ b/iixr/index.py Tue Feb 15 00:09:50 2011 +0100
1.3 @@ -27,7 +27,7 @@
1.4 # Constants.
1.5
1.6 FLUSH_INTERVAL = 10000
1.7 -INDEX_INTERVAL = 1000
1.8 +INDEX_INTERVAL = 100
1.9 OPEN_PARTITIONS = 20
1.10
1.11 # High-level classes.
1.12 @@ -163,7 +163,12 @@
1.13 except IOError:
1.14 readers = self._get_readers(get_term_reader)
1.15
1.16 - self.reader = MultipleReader(readers)
1.17 + # NOTE: Multiple reading of combined readers is not supported.
1.18 +
1.19 + if len(readers) == 1:
1.20 + self.reader = readers[0]
1.21 + else:
1.22 + self.reader = MultipleReader(readers)
1.23
1.24 return self.reader
1.25
2.1 --- a/iixr/terms.py Mon Feb 14 03:06:16 2011 +0100
2.2 +++ b/iixr/terms.py Tue Feb 15 00:09:50 2011 +0100
2.3 @@ -317,62 +317,30 @@
2.4 self.last_offset += self.read_number()
2.5 return self.last_term, self.last_offset
2.6
2.7 -# Iterator support classes.
2.8 -
2.9 -class Iterator:
2.10 -
2.11 - "Common iterator support."
2.12 -
2.13 - def __init__(self):
2.14 -
2.15 - "Cache the last term and positions."
2.16 +# External reading classes.
2.17
2.18 - self.last_term_returned = None
2.19 - self.last_positions_returned = None
2.20 +class TermIterator(TermReader):
2.21
2.22 - def go_to_term(self, term):
2.23 - if term == self.last_term_returned:
2.24 - return self.last_term_returned, self.last_positions_returned
2.25 -
2.26 - t, dp = self.next()
2.27 - while t < term:
2.28 - t, dp = self.next()
2.29 - self.last_term_returned, self.last_positions_returned = t, dp
2.30 - return t, dp
2.31 + "An iterator over terms and positions read from a file."
2.32
2.33 def __iter__(self):
2.34 return self
2.35
2.36 def next(self):
2.37 - self.last_term_returned, self.last_positions_returned = t = self._next()
2.38 - return t
2.39 -
2.40 -# External reading classes.
2.41 -
2.42 -class TermIterator(TermReader, Iterator):
2.43 -
2.44 - "An iterator over terms and positions read from a file."
2.45 -
2.46 - def __init__(self, f):
2.47 - TermReader.__init__(self, f)
2.48 - Iterator.__init__(self)
2.49 -
2.50 - def _next(self):
2.51 try:
2.52 self.begin_record()
2.53 return self.read_term()
2.54 except EOFError:
2.55 raise StopIteration
2.56
2.57 -class TermDataIterator(TermReader, Iterator):
2.58 +class TermDataIterator(TermReader):
2.59
2.60 "An iterator over terms and unprocessed document positions data."
2.61
2.62 - def __init__(self, f):
2.63 - TermReader.__init__(self, f)
2.64 - Iterator.__init__(self)
2.65 + def __iter__(self):
2.66 + return self
2.67
2.68 - def _next(self):
2.69 + def next(self):
2.70 try:
2.71 self.begin_record()
2.72 return self.read_term_plus_remaining()
2.73 @@ -393,24 +361,28 @@
2.74 except EOFError:
2.75 raise StopIteration
2.76
2.77 -class CombinedIterator(Iterator):
2.78 +class CombinedIterator:
2.79
2.80 "An iterator providing index and information file access."
2.81
2.82 def __init__(self, reader, index_reader):
2.83 - Iterator.__init__(self)
2.84 self.reader = reader
2.85 self.index_reader = index_reader
2.86 self.records = list(index_reader)
2.87 self.terms = [t for t, dp in self.records]
2.88
2.89 + # Cache the last term and positions.
2.90 +
2.91 + self.last_term_returned = None
2.92 + self.last_positions_returned = None
2.93 +
2.94 def go_to_term(self, term):
2.95
2.96 """
2.97 Return the 'term' and positions or nearest following term and positions.
2.98 """
2.99
2.100 - if self.last_term_returned == term:
2.101 + if self.last_term_returned is not None and self.last_term_returned == term:
2.102 return self.last_term_returned, self.last_positions_returned
2.103
2.104 # Get the record providing a term less than or equal to the requested
2.105 @@ -425,10 +397,17 @@
2.106 # Seek to the corresponding record in the information file.
2.107 # Only do this if the term is more quickly reached by seeking.
2.108
2.109 - if term <= t or self.last_term_returned is None or term <= self.last_term_returned or \
2.110 - self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned:
2.111 + if term <= t or (
2.112 + self.last_term_returned is None or term < self.last_term_returned or
2.113 + self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned
2.114 + ):
2.115
2.116 self.reader.seek(offset)
2.117 +
2.118 + # Skip the term information, overwrite the reader's state.
2.119 +
2.120 + self.reader.begin_record()
2.121 + self.reader.read_term_only()
2.122 self.reader.last_term = t
2.123
2.124 # Where the found term is equal or greater, just read the positions for
2.125 @@ -436,12 +415,7 @@
2.126
2.127 if t >= term:
2.128
2.129 - # Skip the term information, overwrite the reader's state, and get
2.130 - # the positions.
2.131 -
2.132 - self.reader.begin_record()
2.133 - self.reader.read_term_only()
2.134 - self.reader.last_term = t
2.135 + # Get the positions.
2.136
2.137 self.last_term_returned, self.last_positions_returned = t, self.reader.read_positions()
2.138 return self.last_term_returned, self.last_positions_returned
2.139 @@ -451,17 +425,19 @@
2.140
2.141 else:
2.142
2.143 - # Overwrite the reader's state, then scan for the term.
2.144 + # Scan for the term.
2.145
2.146 - t, dp = self.reader.next()
2.147 while t < term:
2.148 - t, dp = self.reader.next()
2.149 + t, dp = self.next() # remembers the term and positions
2.150
2.151 - self.last_term_returned, self.last_positions_returned = t, dp
2.152 return t, dp
2.153
2.154 - def _next(self):
2.155 - return self.reader.next()
2.156 + def __iter__(self):
2.157 + return self
2.158 +
2.159 + def next(self):
2.160 + self.last_term_returned, self.last_positions_returned = record = self.reader.next()
2.161 + return record
2.162
2.163 def close(self):
2.164 if self.reader is not None:
2.165 @@ -501,23 +477,32 @@
2.166 return 0, 0
2.167
2.168 def go_to_term(self, term):
2.169 +
2.170 + # Refresh the iterators to provide sought values.
2.171 +
2.172 self.iters = []
2.173 for reader in self.readers:
2.174 try:
2.175 insort_right(self.iters, (reader.go_to_term(term), reader.next))
2.176 except StopIteration:
2.177 pass
2.178 +
2.179 self.next_value = None
2.180 return self.next()
2.181
2.182 def next(self):
2.183 +
2.184 + # Return any prematurely read value.
2.185 +
2.186 if self.next_value is not None:
2.187 term, positions = self.next_value
2.188 +
2.189 + # Otherwise, get the next value.
2.190 +
2.191 else:
2.192 term, positions = itermerge.next(self)
2.193
2.194 - # Look at the next item to see if it is has positions for the current
2.195 - # term.
2.196 + # Look at the next item to see if it has positions for the current term.
2.197
2.198 try:
2.199 t, p = itermerge.next(self)
3.1 --- a/itermerge.py Mon Feb 14 03:06:16 2011 +0100
3.2 +++ b/itermerge.py Tue Feb 15 00:09:50 2011 +0100
3.3 @@ -78,7 +78,10 @@
3.4 del self.iters[0]
3.5 self._add_next(next)
3.6 else:
3.7 - self.iters[0] = next(), next
3.8 + try:
3.9 + self.iters[0] = next(), next
3.10 + except StopIteration:
3.11 + self.iters = []
3.12 return value
3.13 else:
3.14 raise StopIteration