iixr

Changeset

100:892f694ba76f
2011-02-15 Paul Boddie [raw | files | shortlog | changelog | graph]

Fixed the itermerge optimisation for single element collections of iterators. Removed the common iterator support in the terms module since the features are really only relevant for the combined iterator. Fixed the seeking and state resetting logic in the combined iterator, retaining the last returned values only in this class. Introduced usage of the combined iterator as index reader only where a single pair of files is present.
iixr/index.py (file) iixr/terms.py (file) itermerge.py (file)
     1.1 --- a/iixr/index.py	Mon Feb 14 03:06:16 2011 +0100
     1.2 +++ b/iixr/index.py	Tue Feb 15 00:09:50 2011 +0100
     1.3 @@ -27,7 +27,7 @@
     1.4  # Constants.
     1.5  
     1.6  FLUSH_INTERVAL    = 10000
     1.7 -INDEX_INTERVAL    = 1000
     1.8 +INDEX_INTERVAL    = 100
     1.9  OPEN_PARTITIONS   = 20
    1.10  
    1.11  # High-level classes.
    1.12 @@ -163,7 +163,12 @@
    1.13              except IOError:
    1.14                  readers = self._get_readers(get_term_reader)
    1.15  
    1.16 -            self.reader = MultipleReader(readers)
    1.17 +            # NOTE: Multiple reading of combined readers is not supported.
    1.18 +
    1.19 +            if len(readers) == 1:
    1.20 +                self.reader = readers[0]
    1.21 +            else:
    1.22 +                self.reader = MultipleReader(readers)
    1.23  
    1.24          return self.reader
    1.25  
     2.1 --- a/iixr/terms.py	Mon Feb 14 03:06:16 2011 +0100
     2.2 +++ b/iixr/terms.py	Tue Feb 15 00:09:50 2011 +0100
     2.3 @@ -317,62 +317,30 @@
     2.4          self.last_offset += self.read_number()
     2.5          return self.last_term, self.last_offset
     2.6  
     2.7 -# Iterator support classes.
     2.8 -
     2.9 -class Iterator:
    2.10 -
    2.11 -    "Common iterator support."
    2.12 -
    2.13 -    def __init__(self):
    2.14 -
    2.15 -        "Cache the last term and positions."
    2.16 +# External reading classes.
    2.17  
    2.18 -        self.last_term_returned = None
    2.19 -        self.last_positions_returned = None
    2.20 +class TermIterator(TermReader):
    2.21  
    2.22 -    def go_to_term(self, term):
    2.23 -        if term == self.last_term_returned:
    2.24 -            return self.last_term_returned, self.last_positions_returned
    2.25 -
    2.26 -        t, dp = self.next()
    2.27 -        while t < term:
    2.28 -            t, dp = self.next()
    2.29 -        self.last_term_returned, self.last_positions_returned = t, dp
    2.30 -        return t, dp
    2.31 +    "An iterator over terms and positions read from a file."
    2.32  
    2.33      def __iter__(self):
    2.34          return self
    2.35  
    2.36      def next(self):
    2.37 -        self.last_term_returned, self.last_positions_returned = t = self._next()
    2.38 -        return t
    2.39 -
    2.40 -# External reading classes.
    2.41 -
    2.42 -class TermIterator(TermReader, Iterator):
    2.43 -
    2.44 -    "An iterator over terms and positions read from a file."
    2.45 -
    2.46 -    def __init__(self, f):
    2.47 -        TermReader.__init__(self, f)
    2.48 -        Iterator.__init__(self)
    2.49 -
    2.50 -    def _next(self):
    2.51          try:
    2.52              self.begin_record()
    2.53              return self.read_term()
    2.54          except EOFError:
    2.55              raise StopIteration
    2.56  
    2.57 -class TermDataIterator(TermReader, Iterator):
    2.58 +class TermDataIterator(TermReader):
    2.59  
    2.60      "An iterator over terms and unprocessed document positions data."
    2.61  
    2.62 -    def __init__(self, f):
    2.63 -        TermReader.__init__(self, f)
    2.64 -        Iterator.__init__(self)
    2.65 +    def __iter__(self):
    2.66 +        return self
    2.67  
    2.68 -    def _next(self):
    2.69 +    def next(self):
    2.70          try:
    2.71              self.begin_record()
    2.72              return self.read_term_plus_remaining()
    2.73 @@ -393,24 +361,28 @@
    2.74          except EOFError:
    2.75              raise StopIteration
    2.76  
    2.77 -class CombinedIterator(Iterator):
    2.78 +class CombinedIterator:
    2.79  
    2.80      "An iterator providing index and information file access."
    2.81  
    2.82      def __init__(self, reader, index_reader):
    2.83 -        Iterator.__init__(self)
    2.84          self.reader = reader
    2.85          self.index_reader = index_reader
    2.86          self.records = list(index_reader)
    2.87          self.terms = [t for t, dp in self.records]
    2.88  
    2.89 +        # Cache the last term and positions.
    2.90 +
    2.91 +        self.last_term_returned = None
    2.92 +        self.last_positions_returned = None
    2.93 +
    2.94      def go_to_term(self, term):
    2.95  
    2.96          """
    2.97          Return the 'term' and positions or nearest following term and positions.
    2.98          """
    2.99  
   2.100 -        if self.last_term_returned == term:
   2.101 +        if self.last_term_returned is not None and self.last_term_returned == term:
   2.102              return self.last_term_returned, self.last_positions_returned
   2.103  
   2.104          # Get the record providing a term less than or equal to the requested
   2.105 @@ -425,10 +397,17 @@
   2.106          # Seek to the corresponding record in the information file.
   2.107          # Only do this if the term is more quickly reached by seeking. 
   2.108  
   2.109 -        if term <= t or self.last_term_returned is None or term <= self.last_term_returned or \
   2.110 -            self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned:
   2.111 +        if term <= t or (
   2.112 +            self.last_term_returned is None or term < self.last_term_returned or
   2.113 +            self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned
   2.114 +            ):
   2.115  
   2.116              self.reader.seek(offset)
   2.117 +
   2.118 +            # Skip the term information, overwrite the reader's state.
   2.119 +
   2.120 +            self.reader.begin_record()
   2.121 +            self.reader.read_term_only()
   2.122              self.reader.last_term = t
   2.123  
   2.124          # Where the found term is equal or greater, just read the positions for
   2.125 @@ -436,12 +415,7 @@
   2.126  
   2.127          if t >= term:
   2.128  
   2.129 -            # Skip the term information, overwrite the reader's state, and get
   2.130 -            # the positions.
   2.131 -
   2.132 -            self.reader.begin_record()
   2.133 -            self.reader.read_term_only()
   2.134 -            self.reader.last_term = t
   2.135 +            # Get the positions.
   2.136  
   2.137              self.last_term_returned, self.last_positions_returned = t, self.reader.read_positions()
   2.138              return self.last_term_returned, self.last_positions_returned
   2.139 @@ -451,17 +425,19 @@
   2.140  
   2.141          else:
   2.142  
   2.143 -            # Overwrite the reader's state, then scan for the term.
   2.144 +            # Scan for the term.
   2.145  
   2.146 -            t, dp = self.reader.next()
   2.147              while t < term:
   2.148 -                t, dp = self.reader.next()
   2.149 +                t, dp = self.next() # remembers the term and positions
   2.150  
   2.151 -            self.last_term_returned, self.last_positions_returned = t, dp
   2.152              return t, dp
   2.153  
   2.154 -    def _next(self):
   2.155 -        return self.reader.next()
   2.156 +    def __iter__(self):
   2.157 +        return self
   2.158 +
   2.159 +    def next(self):
   2.160 +        self.last_term_returned, self.last_positions_returned = record = self.reader.next()
   2.161 +        return record
   2.162  
   2.163      def close(self):
   2.164          if self.reader is not None:
   2.165 @@ -501,23 +477,32 @@
   2.166              return 0, 0
   2.167  
   2.168      def go_to_term(self, term):
   2.169 +
   2.170 +        # Refresh the iterators to provide sought values.
   2.171 +
   2.172          self.iters = []
   2.173          for reader in self.readers:
   2.174              try:
   2.175                  insort_right(self.iters, (reader.go_to_term(term), reader.next))
   2.176              except StopIteration:
   2.177                  pass
   2.178 +
   2.179          self.next_value = None
   2.180          return self.next()
   2.181  
   2.182      def next(self):
   2.183 +
   2.184 +        # Return any prematurely read value.
   2.185 +
   2.186          if self.next_value is not None:
   2.187              term, positions = self.next_value
   2.188 +
   2.189 +        # Otherwise, get the next value.
   2.190 +
   2.191          else:
   2.192              term, positions = itermerge.next(self)
   2.193  
   2.194 -        # Look at the next item to see if it is has positions for the current
   2.195 -        # term.
   2.196 +        # Look at the next item to see if it has positions for the current term.
   2.197  
   2.198          try:
   2.199              t, p = itermerge.next(self)
     3.1 --- a/itermerge.py	Mon Feb 14 03:06:16 2011 +0100
     3.2 +++ b/itermerge.py	Tue Feb 15 00:09:50 2011 +0100
     3.3 @@ -78,7 +78,10 @@
     3.4                  del self.iters[0]
     3.5                  self._add_next(next)
     3.6              else:
     3.7 -                self.iters[0] = next(), next
     3.8 +                try:
     3.9 +                    self.iters[0] = next(), next
    3.10 +                except StopIteration:
    3.11 +                    self.iters = []
    3.12              return value
    3.13          else:
    3.14              raise StopIteration