Attempted to introduce position dictionaries with extra term record details providing document frequency information. Attempted to introduce file descriptor duplication in order to support concurrent iterators.

     1.1 --- a/iixr.py	Mon Aug 31 21:02:30 2009 +0200
     1.2 +++ b/iixr.py	Wed Sep 02 01:30:42 2009 +0200
     1.3 @@ -18,6 +18,7 @@
     1.4  with this program.  If not, see <http://www.gnu.org/licenses/>.
     1.5  """
     1.6  
     1.7 +from os import dup, fdopen       # independent iterator access to files
     1.8  from os import listdir, mkdir    # index and partition discovery
     1.9  from os import remove, rename    # partition manipulation
    1.10  from os.path import exists, join
    1.11 @@ -194,11 +195,18 @@
    1.12  
    1.13      def write_positions(self, docnum, positions):
    1.14  
    1.15 -        "Write for the document 'docnum' the given 'positions'."
    1.16 +        """
    1.17 +        Write for the document 'docnum' the given 'positions'.
    1.18 +        Return the offset of the written record.
    1.19 +        """
    1.20  
    1.21          if docnum < self.last_docnum:
    1.22              raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
    1.23  
    1.24 +        # Record the offset of this record.
    1.25 +
    1.26 +        offset = self.f.tell()
    1.27 +
    1.28          # Write the document number delta.
    1.29  
    1.30          self.write_number(docnum - self.last_docnum)
    1.31 @@ -221,34 +229,7 @@
    1.32  
    1.33          self.last_docnum = docnum
    1.34  
    1.35 -    def write_term_positions(self, doc_positions):
    1.36 -
    1.37 -        """
    1.38 -        Write all 'doc_positions' - a collection of tuples of the form (document
    1.39 -        number, position list) - to the file, returning a tuple containing the
    1.40 -        offset at which they were stored together with the frequency (number of
    1.41 -        positions) for the term involved.
    1.42 -        """
    1.43 -
    1.44 -        # Reset the writer and record the current file offset.
    1.45 -
    1.46 -        self.reset()
    1.47 -        offset = self.f.tell()
    1.48 -
    1.49 -        # Write the number of documents.
    1.50 -
    1.51 -        self.write_number(len(doc_positions))
    1.52 -        doc_positions.sort()
    1.53 -
    1.54 -        # Write the positions.
    1.55 -
    1.56 -        frequency = 0
    1.57 -
    1.58 -        for docnum, positions in doc_positions:
    1.59 -            self.write_positions(docnum, positions)
    1.60 -            frequency += len(positions)
    1.61 -
    1.62 -        return offset, frequency
    1.63 +        return offset
    1.64  
    1.65  class PositionReader(FileReader):
    1.66  
    1.67 @@ -283,54 +264,295 @@
    1.68  
    1.69          return self.last_docnum, positions
    1.70  
    1.71 -    def read_term_positions(self, offset):
    1.72 +    def read_term_positions(self, offset, count):
    1.73  
    1.74          """
    1.75          Read all positions from 'offset', seeking to that position in the file
    1.76 -        before reading.
    1.77 +        before reading. The number of documents available for reading is limited
    1.78 +        to 'count'.
    1.79          """
    1.80  
    1.81          self.reset()
    1.82 -        self.f.seek(offset)
    1.83 +
    1.84 +        # Duplicate the file handle.
    1.85 +
    1.86 +        f = fdopen(dup(self.f.fileno()), "rb")
    1.87 +        f.seek(offset)
    1.88 +        return PositionIterator(f, count)
    1.89 +
    1.90 +class IteratorBase:
    1.91 +
    1.92 +    def __init__(self, count):
    1.93 +        self.replenish(count)
    1.94  
    1.95 -        # Could duplicate the file handle using...
    1.96 -        # fdopen(dup(self.f.fileno()), "rb")
    1.97 +    def replenish(self, count):
    1.98 +        self.count = count
    1.99 +        self.read_documents = 0
   1.100 +
   1.101 +    def __len__(self):
   1.102 +        return self.count
   1.103  
   1.104 -        return PositionIterator(self.f)
   1.105 +    def sort(self):
   1.106 +        pass # Stored document positions are already sorted.
   1.107  
   1.108 -class PositionIterator(PositionReader):
   1.109 +    def __iter__(self):
   1.110 +        return self
   1.111 +
   1.112 +class PositionIterator(PositionReader, IteratorBase):
   1.113  
   1.114      "Iterating over document positions."
   1.115  
   1.116 -    def __init__(self, f):
   1.117 +    def __init__(self, f, count):
   1.118          PositionReader.__init__(self, f)
   1.119 +        IteratorBase.__init__(self, count)
   1.120 +
   1.121 +    def next(self):
   1.122 +
   1.123 +        "Read positions for a single document."
   1.124 +
   1.125 +        if self.read_documents < self.count:
   1.126 +            self.read_documents += 1
   1.127 +            return self.read_positions()
   1.128 +        else:
   1.129 +            raise StopIteration
   1.130 +
   1.131 +class PositionIndexWriter(FileWriter):
   1.132 +
   1.133 +    "Writing position index information to files."
   1.134 +
   1.135 +    def reset(self):
   1.136 +        self.last_docnum = 0
   1.137 +        self.last_pos_offset = 0
   1.138 +
   1.139 +    def write_positions(self, docnum, pos_offset, count):
   1.140 +
   1.141 +        """
   1.142 +        Write the given 'docnum', 'pos_offset' and document 'count' to the
   1.143 +        position index file.
   1.144 +        """
   1.145 +
   1.146 +        # Record the offset of this record.
   1.147 +
   1.148 +        offset = self.f.tell()
   1.149 +
   1.150 +        # Write the document number delta.
   1.151 +
   1.152 +        self.write_number(docnum - self.last_docnum)
   1.153 +        self.last_docnum = docnum
   1.154 +
   1.155 +        # Write the position file offset delta.
   1.156 +
   1.157 +        self.write_number(pos_offset - self.last_pos_offset)
   1.158 +        self.last_pos_offset = pos_offset
   1.159 +
   1.160 +        # Write the document count.
   1.161 +
   1.162 +        self.write_number(count)
   1.163 +
   1.164 +        return offset
   1.165 +
   1.166 +class PositionIndexReader(FileReader):
   1.167 +
   1.168 +    "Reading position index information from files."
   1.169  
   1.170 -        # Read the number of documents.
   1.171 +    def reset(self):
   1.172 +        self.last_docnum = 0
   1.173 +        self.last_pos_offset = 0
   1.174 +
   1.175 +    def read_positions(self):
   1.176 +
   1.177 +        """
   1.178 +        Read a document number, a position file offset for the position index
   1.179 +        file, and the number of documents in a section of that file.
   1.180 +        """
   1.181 +
   1.182 +        # Read the document number delta.
   1.183 +
   1.184 +        self.last_docnum += self.read_number()
   1.185 +
   1.186 +        # Read the offset delta.
   1.187 +
   1.188 +        self.last_pos_offset += self.read_number()
   1.189 +
   1.190 +        # Read the document count.
   1.191 +
   1.192 +        count = self.read_number()
   1.193 +
   1.194 +        return self.last_docnum, self.last_pos_offset, count
   1.195 +
   1.196 +    def read_term_positions(self, offset, doc_frequency):
   1.197  
   1.198 -        self.ndocuments = self.read_number()
   1.199 -        self.read_documents = 0
   1.200 +        """
   1.201 +        Read all positions from 'offset', seeking to that position in the file
   1.202 +        before reading. The number of documents available for reading is limited
   1.203 +        to 'doc_frequency'.
   1.204 +        """
   1.205 +
   1.206 +        # NOTE: This is almost a duplication of PositionReader.read_term_positions.
   1.207 +
   1.208 +        self.reset()
   1.209 +
   1.210 +        # Duplicate the file handle.
   1.211 +
   1.212 +        f = fdopen(dup(self.f.fileno()), "rb")
   1.213 +        f.seek(offset)
   1.214 +        return PositionIndexIterator(f, doc_frequency)
   1.215 +
   1.216 +class PositionIndexIterator(PositionIndexReader, IteratorBase):
   1.217 +
   1.218 +    "Iterating over position index entries."
   1.219 +
   1.220 +    def __init__(self, f, count):
   1.221 +        PositionIndexReader.__init__(self, f)
   1.222 +        IteratorBase.__init__(self, count)
   1.223 +        self.section_count = 0
   1.224 +
   1.225 +    def next(self):
   1.226 +
   1.227 +        "Read the next index entry: (docnum, position file offset, section count)."
   1.228  
   1.229 -    def __len__(self):
   1.230 -        return self.ndocuments
   1.231 +        self.read_documents += self.section_count
   1.232 +        if self.read_documents < self.count:
   1.233 +            docnum, pos_offset, self.section_count = t = self.read_positions()
   1.234 +            return t
   1.235 +        else:
   1.236 +            raise StopIteration
   1.237 +
   1.238 +class PositionDictionaryWriter:
   1.239 +
   1.240 +    "Writing position dictionaries."
   1.241 +
   1.242 +    def __init__(self, position_writer, position_index_writer, interval):
   1.243 +        self.position_writer = position_writer
   1.244 +        self.position_index_writer = position_index_writer
   1.245 +        self.interval = interval
   1.246 +
   1.247 +    def write_term_positions(self, doc_positions):
   1.248 +
   1.249 +        """
   1.250 +        Write all 'doc_positions' - a collection of tuples of the form (document
   1.251 +        number, position list) - to the file.
   1.252 +
   1.253 +        Add some records to the index, making dictionary entries.
   1.254 +
   1.255 +        Return a tuple containing the offset of the written data, the frequency
   1.256 +        (number of positions), and document frequency (number of documents) for
   1.257 +        the term involved.
   1.258 +        """
   1.259 +
   1.260 +        # Reset the writer.
   1.261 +
   1.262 +        self.position_writer.reset()
   1.263 +        index_offset = None
   1.264 +
   1.265 +        # Write the positions.
   1.266 +
   1.267 +        frequency = 0
   1.268 +        first_offset = None
   1.269 +        count = 0
   1.270 +
   1.271 +        doc_positions.sort()
   1.272 +
   1.273 +        for docnum, positions in doc_positions:
   1.274 +            pos_offset = self.position_writer.write_positions(docnum, positions)
   1.275 +
   1.276 +            # Retain the first record offset for a subsequent index entry.
   1.277 +
   1.278 +            if first_offset is None:
   1.279 +                first_offset = pos_offset
   1.280 +
   1.281 +            frequency += len(positions)
   1.282 +
   1.283 +            # Every {interval} entries, write an index entry.
   1.284 +
   1.285 +            if count == self.interval:
   1.286 +                io = self.position_index_writer.write_positions(docnum, first_offset, self.interval)
   1.287  
   1.288 -    def sort(self):
   1.289 +                # Remember the first index entry offset.
   1.290 +
   1.291 +                if index_offset is None:
   1.292 +                    index_offset = io
   1.293 +
   1.294 +                first_offset = None
   1.295 +                count = 0
   1.296 +
   1.297 +            count += 1
   1.298 +
   1.299 +        # Finish writing an index entry for the remaining documents.
   1.300 +
   1.301 +        else:
   1.302 +            if first_offset is not None:
   1.303 +                io = self.position_index_writer.write_positions(docnum, first_offset, count)
   1.304 +
   1.305 +                # Remember the first index entry offset.
   1.306 +
   1.307 +                if index_offset is None:
   1.308 +                    index_offset = io
   1.309 +
   1.310 +        return index_offset, frequency, len(doc_positions)
   1.311 +
   1.312 +    def close(self):
   1.313 +        self.position_writer.close()
   1.314 +        self.position_index_writer.close()
   1.315 +
   1.316 +class PositionDictionaryReader:
   1.317  
   1.318 -        "Stored document positions are already sorted."
   1.319 +    "Reading position dictionaries."
   1.320 +
   1.321 +    def __init__(self, position_reader, position_index_reader):
   1.322 +        self.position_reader = position_reader
   1.323 +        self.position_index_reader = position_index_reader
   1.324 +
   1.325 +    def read_term_positions(self, offset, doc_frequency):
   1.326 +
   1.327 +        """
   1.328 +        Return an iterator for dictionary entries starting at 'offset' with the
   1.329 +        given 'doc_frequency'.
   1.330 +        """
   1.331  
   1.332 -        pass
   1.333 +        return PositionDictionaryIterator(self.position_reader,
   1.334 +            self.position_index_reader, offset, doc_frequency)
   1.335 +
   1.336 +    def close(self):
   1.337 +        self.position_reader.close()
   1.338 +        self.position_index_reader.close()
   1.339 +
   1.340 +class PositionDictionaryIterator:
   1.341 +
   1.342 +    "Iteration over position dictionary entries."
   1.343 +
   1.344 +    def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
   1.345 +        self.position_reader = position_reader
   1.346 +
   1.347 +        self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
   1.348 +        self.next_section()
   1.349 +        self.init_section()
   1.350  
   1.351      def __iter__(self):
   1.352          return self
   1.353  
   1.354      def next(self):
   1.355  
   1.356 -        "Read positions for a single document."
   1.357 +        # Attempt to get the next document record from the section in the positions file.
   1.358 +
   1.359 +        while 1:
   1.360 +
   1.361 +            # Either return the next record.
   1.362 +
   1.363 +            try:
   1.364 +                return self.iterator.next()
   1.365  
   1.366 -        if self.read_documents < self.ndocuments:
   1.367 -            self.read_documents += 1
   1.368 -            return self.read_positions()
   1.369 -        else:
   1.370 -            raise StopIteration
   1.371 +            # Or, where a section is finished, get the next section and try again.
   1.372 +
   1.373 +            except StopIteration:
   1.374 +                self.next_section()
   1.375 +                self.iterator.replenish(self.section_count)
   1.376 +
   1.377 +    def next_section(self):
   1.378 +        self.docnum, self.pos_offset, self.section_count = self.index_iterator.read_positions()
   1.379 +
   1.380 +    def init_section(self):
   1.381 +        self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
   1.382  
   1.383  class TermWriter(FileWriter):
   1.384  
   1.385 @@ -340,12 +562,13 @@
   1.386          self.last_term = ""
   1.387          self.last_offset = 0
   1.388  
   1.389 -    def write_term(self, term, offset, frequency):
   1.390 +    def write_term(self, term, offset, frequency, doc_frequency):
   1.391  
   1.392          """
   1.393 -        Write the given 'term', its position file 'offset', and its 'frequency'
   1.394 -        to the term information file. Return the offset after the term
   1.395 -        information was written to the file.
   1.396 +        Write the given 'term', its position file 'offset', its 'frequency' and
   1.397 +        its 'doc_frequency' (number of documents in which it appears) to the
   1.398 +        term information file. Return the offset after the term information was
   1.399 +        written to the file.
   1.400          """
   1.401  
   1.402          # Write the prefix length and term suffix.
   1.403 @@ -364,6 +587,10 @@
   1.404  
   1.405          self.write_number(frequency)
   1.406  
   1.407 +        # Write the document frequency.
   1.408 +
   1.409 +        self.write_number(doc_frequency)
   1.410 +
   1.411          self.last_term = term
   1.412          self.last_offset = offset
   1.413  
   1.414 @@ -380,8 +607,8 @@
   1.415      def read_term(self):
   1.416  
   1.417          """
   1.418 -        Read a term, its position file offset, and its frequency from the term
   1.419 -        information file.
   1.420 +        Read a term, its position file offset, its frequency and its document
   1.421 +        frequency from the term information file.
   1.422          """
   1.423  
   1.424          # Read the prefix length and term suffix.
   1.425 @@ -399,7 +626,11 @@
   1.426  
   1.427          frequency = self.read_number()
   1.428  
   1.429 -        return self.last_term, self.last_offset, frequency
   1.430 +        # Read the document frequency.
   1.431 +
   1.432 +        doc_frequency = self.read_number()
   1.433 +
   1.434 +        return self.last_term, self.last_offset, frequency, doc_frequency
   1.435  
   1.436      def go_to_term(self, term, offset, info_offset):
   1.437  
   1.438 @@ -420,15 +651,15 @@
   1.439          TermWriter.reset(self)
   1.440          self.last_info_offset = 0
   1.441  
   1.442 -    def write_term(self, term, offset, frequency, info_offset):
   1.443 +    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
   1.444  
   1.445          """
   1.446 -        Write the given 'term', its position file 'offset', and its 'frequency'
   1.447 -        to the term dictionary index file, along with the 'info_offset' in the
   1.448 -        term information file.
   1.449 +        Write the given 'term', its position file 'offset', its 'frequency' and
   1.450 +        its 'doc_frequency' to the term dictionary index file, along with the
   1.451 +        'info_offset' in the term information file.
   1.452          """
   1.453  
   1.454 -        TermWriter.write_term(self, term, offset, frequency)
   1.455 +        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
   1.456  
   1.457          # Write the information file offset delta.
   1.458  
   1.459 @@ -446,41 +677,43 @@
   1.460      def read_term(self):
   1.461  
   1.462          """
   1.463 -        Read a term, its position file offset, its frequency, and its term
   1.464 -        information file offset from the term dictionary index file.
   1.465 +        Read a term, its position file offset, its frequency, its document
   1.466 +        frequency and a term information file offset from the term dictionary
   1.467 +        index file.
   1.468          """
   1.469  
   1.470 -        term, offset, frequency = TermReader.read_term(self)
   1.471 +        term, offset, frequency, doc_frequency = TermReader.read_term(self)
   1.472  
   1.473          # Read the offset delta.
   1.474  
   1.475          self.last_info_offset += self.read_number()
   1.476  
   1.477 -        return term, offset, frequency, self.last_info_offset
   1.478 +        return term, offset, frequency, doc_frequency, self.last_info_offset
   1.479  
   1.480  class TermDictionaryWriter:
   1.481  
   1.482      "Writing term dictionaries."
   1.483  
   1.484 -    def __init__(self, info_writer, index_writer, position_writer, interval):
   1.485 +    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
   1.486          self.info_writer = info_writer
   1.487          self.index_writer = index_writer
   1.488 -        self.position_writer = position_writer
   1.489 +        self.position_dict_writer = position_dict_writer
   1.490          self.interval = interval
   1.491          self.entry = 0
   1.492  
   1.493 -    def _write_term(self, term, offset, frequency):
   1.494 +    def _write_term(self, term, offset, frequency, doc_frequency):
   1.495  
   1.496          """
   1.497 -        Write the given 'term', its position file 'offset', and its 'frequency'
   1.498 -        to the term information file and optionally to the index, making a
   1.499 -        dictionary entry.
   1.500 +        Write the given 'term', its position file 'offset', its 'frequency' and
   1.501 +        its 'doc_frequency' (number of documents in which it appears) to the
   1.502 +        term information file and, every 'interval' entries, also to the index,
   1.503 +        making a dictionary entry.
   1.504          """
   1.505  
   1.506 -        info_offset = self.info_writer.write_term(term, offset, frequency)
   1.507 +        info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
   1.508  
   1.509          if self.entry % self.interval == 0:
   1.510 -            self.index_writer.write_term(term, offset, frequency, info_offset)
   1.511 +            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
   1.512  
   1.513          self.entry += 1
   1.514  
   1.515 @@ -491,13 +724,13 @@
   1.516          and positions at which the term is found.
   1.517          """
   1.518  
   1.519 -        offset, frequency = self.position_writer.write_term_positions(doc_positions)
   1.520 -        self._write_term(term, offset, frequency)
   1.521 +        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
   1.522 +        self._write_term(term, offset, frequency, doc_frequency)
   1.523  
   1.524      def close(self):
   1.525          self.info_writer.close()
   1.526          self.index_writer.close()
   1.527 -        self.position_writer.close()
   1.528 +        self.position_dict_writer.close()
   1.529  
   1.530  class TermDictionaryReader:
   1.531  
   1.532 @@ -533,12 +766,13 @@
   1.533          if i == -1:
   1.534              return None
   1.535  
   1.536 -        found_term, offset, frequency, info_offset = self.terms[i]
   1.537 +        found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]
   1.538  
   1.539 -        # Where the term is found immediately, return the offset.
   1.540 +        # Where the term is found immediately, return the offset and
   1.541 +        # frequencies.
   1.542  
   1.543          if term == found_term:
   1.544 -            return offset, frequency
   1.545 +            return offset, frequency, doc_frequency
   1.546  
   1.547          # Otherwise, seek past the index term's entry in the information file
   1.548          # and scan for the desired term.
   1.549 @@ -547,33 +781,33 @@
   1.550              self.info_reader.go_to_term(found_term, offset, info_offset)
   1.551              try:
   1.552                  while term > found_term:
   1.553 -                    found_term, offset, frequency = self.info_reader.read_term()
   1.554 +                    found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   1.555              except EOFError:
   1.556                  pass
   1.557  
   1.558 -            # If the term is found, return the offset and frequency.
   1.559 +            # If the term is found, return the offset and frequencies.
   1.560  
   1.561              if term == found_term:
   1.562 -                return offset, frequency
   1.563 +                return offset, frequency, doc_frequency
   1.564              else:
   1.565                  return None
   1.566  
   1.567      def rewind(self):
   1.568          self.info_reader.rewind()
   1.569  
   1.570 -    def _get_positions(self, offset):
   1.571 -        return self.position_reader.read_term_positions(offset)
   1.572 +    def _get_positions(self, offset, doc_frequency):
   1.573 +        return self.position_reader.read_term_positions(offset, doc_frequency)
   1.574  
   1.575      def read_term(self):
   1.576  
   1.577          """
   1.578 -        Return the next term, its frequency and the documents and positions at
   1.579 -        which the term is found.
   1.580 +        Return the next term, its frequency, its document frequency, and the
   1.581 +        documents and positions at which the term is found.
   1.582          """
   1.583  
   1.584 -        term, offset, frequency = self.info_reader.read_term()
   1.585 -        positions = self._get_positions(offset)
   1.586 -        return term, frequency, positions
   1.587 +        term, offset, frequency, doc_frequency = self.info_reader.read_term()
   1.588 +        positions = self._get_positions(offset, doc_frequency)
   1.589 +        return term, frequency, doc_frequency, positions
   1.590  
   1.591      def find_positions(self, term):
   1.592  
   1.593 @@ -583,8 +817,8 @@
   1.594          if t is None:
   1.595              return None
   1.596          else:
   1.597 -            offset, frequency = t
   1.598 -            return self._get_positions(offset)
   1.599 +            offset, frequency, doc_frequency = t
   1.600 +            return self._get_positions(offset, doc_frequency)
   1.601  
   1.602      def get_frequency(self, term):
   1.603  
   1.604 @@ -594,9 +828,20 @@
   1.605          if t is None:
   1.606              return None
   1.607          else:
   1.608 -            offset, frequency = t
   1.609 +            offset, frequency, doc_frequency = t
   1.610              return frequency
   1.611  
   1.612 +    def get_document_frequency(self, term):
   1.613 +
   1.614 +        "Return the document frequency of the given 'term'."
   1.615 +
   1.616 +        t = self._find_term(term)
   1.617 +        if t is None:
   1.618 +            return None
   1.619 +        else:
   1.620 +            offset, frequency, doc_frequency = t
   1.621 +            return doc_frequency
   1.622 +
   1.623      def close(self):
   1.624          self.info_reader.close()
   1.625          self.index_reader.close()
   1.626 @@ -850,7 +1095,7 @@
   1.627              reader.rewind()
   1.628  
   1.629              try:
   1.630 -                term, frequency, positions = reader.read_term()
   1.631 +                term, frequency, doc_frequency, positions = reader.read_term()
   1.632                  insort_right(entries, (term, positions, partition))
   1.633              except EOFError:
   1.634                  pass
   1.635 @@ -889,7 +1134,7 @@
   1.636  
   1.637              for partition in to_update:
   1.638                  try:
   1.639 -                    term, frequency, positions = self.readers[partition].read_term()
   1.640 +                    term, frequency, doc_frequency, positions = self.readers[partition].read_term()
   1.641                      insort_right(entries, (term, positions, partition))
   1.642                  except EOFError:
   1.643                      pass
   1.644 @@ -975,12 +1220,12 @@
   1.645  
   1.646  # Utility functions.
   1.647  
   1.648 -def get_term_writer(pathname, partition, interval):
   1.649 +def get_term_writer(pathname, partition, interval, doc_interval):
   1.650  
   1.651      """
   1.652      Return a term dictionary writer using files under the given 'pathname'
   1.653      labelled according to the given 'partition', using the given indexing
   1.654 -    'interval'.
   1.655 +    'interval' for terms and 'doc_interval' for document position records.
   1.656      """
   1.657  
   1.658      tdf = open(join(pathname, "terms-%s" % partition), "wb")
   1.659 @@ -992,7 +1237,12 @@
   1.660      tpf = open(join(pathname, "positions-%s" % partition), "wb")
   1.661      positions_writer = PositionWriter(tpf)
   1.662  
   1.663 -    return TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)
   1.664 +    tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
   1.665 +    positions_index_writer = PositionIndexWriter(tpif)
   1.666 +
   1.667 +    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
   1.668 +
   1.669 +    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
   1.670  
   1.671  def get_field_writer(pathname, partition, interval):
   1.672  
   1.673 @@ -1026,7 +1276,12 @@
   1.674      tpf = open(join(pathname, "positions-%s" % partition), "rb")
   1.675      positions_reader = PositionReader(tpf)
   1.676  
   1.677 -    return TermDictionaryReader(info_reader, index_reader, positions_reader)
   1.678 +    tpif = open(join(pathname, "positions_index-%s" % partition), "rb")
   1.679 +    positions_index_reader = PositionIndexReader(tpif)
   1.680 +
   1.681 +    positions_dict_reader = PositionDictionaryReader(positions_reader, positions_index_reader)
   1.682 +
   1.683 +    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
   1.684  
   1.685  def get_field_reader(pathname, partition):
   1.686  

     2.1 --- a/test.py	Mon Aug 31 21:02:30 2009 +0200
     2.2 +++ b/test.py	Wed Sep 02 01:30:42 2009 +0200
     2.3 @@ -38,15 +38,18 @@
     2.4  all_doc_positions = [
     2.5      [
     2.6          (123, [1, 3, 5, 15, 25]),
     2.7 -        (124, [0, 100])
     2.8 +        (124, [0, 100]),
     2.9 +        (125, [11, 99, 199]),
    2.10 +        (130, [77, 78, 80, 82, 89])
    2.11      ],
    2.12      [
    2.13          (78, [9]),
    2.14 -        (196, [10, 11])
    2.15 +        (196, [10, 11]),
    2.16 +        (197, [17, 21, 30])
    2.17      ]
    2.18      ]
    2.19  
    2.20 -f = open("test", "wb")
    2.21 +f = open("testP", "wb")
    2.22  w = iixr.PositionWriter(f)
    2.23  for doc_positions in all_doc_positions:
    2.24      for docnum, positions in doc_positions:
    2.25 @@ -54,7 +57,7 @@
    2.26      w.reset()
    2.27  w.close()
    2.28  
    2.29 -f = open("test", "rb")
    2.30 +f = open("testP", "rb")
    2.31  r = iixr.PositionReader(f)
    2.32  for doc_positions in all_doc_positions:
    2.33      for docnum, positions in doc_positions:
    2.34 @@ -64,20 +67,68 @@
    2.35      r.reset()
    2.36  r.close()
    2.37  
    2.38 -f = open("test", "wb")
    2.39 +# Test position index files.
    2.40 +
    2.41 +indexed_positions = [
    2.42 +    [
    2.43 +        (1234, 0, 100),
    2.44 +        (2345, 700, 100),
    2.45 +        (3456, 1900, 50)
    2.46 +    ],
    2.47 +    [
    2.48 +        (4567, 2800, 20)
    2.49 +    ]
    2.50 +    ]
    2.51 +
    2.52 +offsets = []
    2.53 +f = open("testPI", "wb")
    2.54 +w = iixr.PositionIndexWriter(f)
    2.55 +for term_positions in indexed_positions:
    2.56 +    offset = None
    2.57 +    doc_frequency = 0
    2.58 +    w.reset()
    2.59 +    for docnum, pos_offset, count in term_positions:
    2.60 +        io = w.write_positions(docnum, pos_offset, count)
    2.61 +        if offset is None:
    2.62 +            offset = io
    2.63 +        doc_frequency += count
    2.64 +    offsets.append((offset, doc_frequency))
    2.65 +w.close()
    2.66 +
    2.67 +f = open("testPI", "rb")
    2.68 +r = iixr.PositionIndexReader(f)
    2.69 +offsets.reverse()
    2.70 +indexed_positions.reverse()
    2.71 +for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
    2.72 +    found_positions = r.read_term_positions(offset, doc_frequency)
    2.73 +    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
    2.74 +        print docnum == dn, docnum, dn
    2.75 +        print pos_offset == po, pos_offset, po
    2.76 +        print count == c, count, c
    2.77 +r.close()
    2.78 +
    2.79 +# Test position dictionaries.
    2.80 +
    2.81 +f = open("testP", "wb")
    2.82  w = iixr.PositionWriter(f)
    2.83 +f2 = open("testPI", "wb")
    2.84 +w2 = iixr.PositionIndexWriter(f2)
    2.85 +wd = iixr.PositionDictionaryWriter(w, w2, 2)
    2.86  offsets = []
    2.87  for doc_positions in all_doc_positions:
    2.88 -    offset, frequency = w.write_term_positions(doc_positions)
    2.89 -    offsets.append(offset)
    2.90 +    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
    2.91 +    offsets.append((offset, doc_frequency))
    2.92  w.close()
    2.93  
    2.94 -f = open("test", "rb")
    2.95 +f = open("testP", "rb")
    2.96  r = iixr.PositionReader(f)
    2.97 +f2 = open("testPI", "rb")
    2.98 +r2 = iixr.PositionIndexReader(f2)
    2.99 +rd = iixr.PositionDictionaryReader(r, r2)
   2.100  offsets.reverse()
   2.101  all_doc_positions.reverse()
   2.102 -for offset, doc_positions in zip(offsets, all_doc_positions):
   2.103 -    dp = list(r.read_term_positions(offset))
   2.104 +for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
   2.105 +    dp = list(rd.read_term_positions(offset, doc_frequency))
   2.106      print doc_positions == dp, doc_positions, dp
   2.107  r.close()
   2.108  
   2.109 @@ -166,55 +217,57 @@
   2.110  # Test terms.
   2.111  
   2.112  terms = [
   2.113 -    # term       offset      frequency
   2.114 -    ("aardvark",  100000123,  1),
   2.115 -    ("anteater",  100000456,  2),
   2.116 -    ("badger",    100000789, 13),
   2.117 -    ("bull",     1000001234, 59),
   2.118 -    ("bulldog",  1000002345, 99),
   2.119 -    ("cat",      1000003456, 89)
   2.120 +    # term       offset      frequency  doc_frequency
   2.121 +    ("aardvark",  100000123,  1,         1),
   2.122 +    ("anteater",  100000456,  2,         1),
   2.123 +    ("badger",    100000789, 13,         7),
   2.124 +    ("bull",     1000001234, 59,        17),
   2.125 +    ("bulldog",  1000002345, 99,        80),
   2.126 +    ("cat",      1000003456, 89,        28)
   2.127      ]
   2.128  
   2.129  f = open("test", "wb")
   2.130  w = iixr.TermWriter(f)
   2.131 -for term, offset, frequency in terms:
   2.132 -    w.write_term(term, offset, frequency)
   2.133 +for term, offset, frequency, doc_frequency in terms:
   2.134 +    w.write_term(term, offset, frequency, doc_frequency)
   2.135  w.close()
   2.136  
   2.137  f = open("test", "rb")
   2.138  r = iixr.TermReader(f)
   2.139 -for term, offset, frequency in terms:
   2.140 -    t, o, fr = r.read_term()
   2.141 +for term, offset, frequency, doc_frequency in terms:
   2.142 +    t, o, fr, df = r.read_term()
   2.143      print term == t, term, t
   2.144      print offset == o, offset, o
   2.145      print frequency == fr, frequency, fr
   2.146 +    print doc_frequency == df, doc_frequency, df
   2.147  r.close()
   2.148  
   2.149  # Test terms in index files.
   2.150  
   2.151  indexed_terms = [
   2.152 -    # term       offset      frequency  info_offset
   2.153 -    ("aardvark",  100000123,  1,        200000321),
   2.154 -    ("anteater",  100000456,  2,        200000654),
   2.155 -    ("badger",    100000789, 13,        200000987),
   2.156 -    ("bull",     1000001234, 59,        200004321),
   2.157 -    ("bulldog",  1000002345, 99,        200005432),
   2.158 -    ("cat",      1000003456, 89,        200006543)
   2.159 +    # term       offset      frequency  doc_frequency   info_offset
   2.160 +    ("aardvark",  100000123,  1,         1,             200000321),
   2.161 +    ("anteater",  100000456,  2,         1,             200000654),
   2.162 +    ("badger",    100000789, 13,         7,             200000987),
   2.163 +    ("bull",     1000001234, 59,        17,             200004321),
   2.164 +    ("bulldog",  1000002345, 99,        80,             200005432),
   2.165 +    ("cat",      1000003456, 89,        28,             200006543)
   2.166      ]
   2.167  
   2.168  f = open("test", "wb")
   2.169  w = iixr.TermIndexWriter(f)
   2.170 -for term, offset, frequency, info_offset in indexed_terms:
   2.171 -    w.write_term(term, offset, frequency, info_offset)
   2.172 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
   2.173 +    w.write_term(term, offset, frequency, doc_frequency, info_offset)
   2.174  w.close()
   2.175  
   2.176  f = open("test", "rb")
   2.177  r = iixr.TermIndexReader(f)
   2.178 -for term, offset, frequency, info_offset in indexed_terms:
   2.179 -    t, o, fr, i = r.read_term()
   2.180 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
   2.181 +    t, o, fr, df, i = r.read_term()
   2.182      print term == t, term, t
   2.183      print offset == o, offset, o
   2.184      print frequency == fr, frequency, fr
   2.185 +    print doc_frequency == df, doc_frequency, df
   2.186      print info_offset == i, info_offset, i
   2.187  r.close()
   2.188  
   2.189 @@ -224,26 +277,23 @@
   2.190  w = iixr.TermWriter(f)
   2.191  f2 = open("testI", "wb")
   2.192  w2 = iixr.TermIndexWriter(f2)
   2.193 -f3 = open("testP", "wb")
   2.194 -w3 = iixr.PositionWriter(f3)
   2.195 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
   2.196 -for term, offset, frequency in terms:
   2.197 -    wd._write_term(term, offset, frequency)
   2.198 +wd = iixr.TermDictionaryWriter(w, w2, None, 3)
   2.199 +for term, offset, frequency, doc_frequency in terms:
   2.200 +    wd._write_term(term, offset, frequency, doc_frequency)
   2.201  wd.close()
   2.202  
   2.203  f = open("test", "rb")
   2.204  r = iixr.TermReader(f)
   2.205  f2 = open("testI", "rb")
   2.206  r2 = iixr.TermIndexReader(f2)
   2.207 -f3 = open("testP", "rb")
   2.208 -r3 = iixr.PositionReader(f3)
   2.209 -rd = iixr.TermDictionaryReader(r, r2, r3)
   2.210 +rd = iixr.TermDictionaryReader(r, r2, None)
   2.211  terms_reversed = terms[:]
   2.212  terms_reversed.reverse()
   2.213 -for term, offset, frequency in terms_reversed:
   2.214 -    o, fr = rd._find_term(term)
   2.215 +for term, offset, frequency, doc_frequency in terms_reversed:
   2.216 +    o, fr, df = rd._find_term(term)
   2.217      print offset == o, offset, o
   2.218      print frequency == fr, frequency, fr
   2.219 +    print doc_frequency == df, doc_frequency, df
   2.220  for term in ("dog", "dingo"):
   2.221      t = rd._find_term(term)
   2.222      print t is None, t
   2.223 @@ -255,7 +305,7 @@
   2.224      ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),
   2.225      ("anteater",  [(1, [43, 44])]),
   2.226      ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
   2.227 -    ("bull",      [(6, [128]), (16, [12])]),
   2.228 +    ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
   2.229      ("bulldog",   [(43, [17, 19, 256, 512])]),
   2.230      ("cat",       [(123, [12, 145, 196]), (1200, [113])])
   2.231      ]
   2.232 @@ -266,7 +316,10 @@
   2.233  w2 = iixr.TermIndexWriter(f2)
   2.234  f3 = open("testP", "wb")
   2.235  w3 = iixr.PositionWriter(f3)
   2.236 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
   2.237 +f4 = open("testPI", "wb")
   2.238 +w4 = iixr.PositionIndexWriter(f4)
   2.239 +wp = iixr.PositionDictionaryWriter(w3, w4, 2)
   2.240 +wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
   2.241  for term, doc_positions in terms_with_positions:
   2.242      wd.write_term_positions(term, doc_positions)
   2.243  wd.close()
   2.244 @@ -277,7 +330,10 @@
   2.245  r2 = iixr.TermIndexReader(f2)
   2.246  f3 = open("testP", "rb")
   2.247  r3 = iixr.PositionReader(f3)
   2.248 -rd = iixr.TermDictionaryReader(r, r2, r3)
   2.249 +f4 = open("testPI", "rb")
   2.250 +r4 = iixr.PositionIndexReader(f4)
   2.251 +rp = iixr.PositionDictionaryReader(r3, r4)
   2.252 +rd = iixr.TermDictionaryReader(r, r2, rp)
   2.253  terms_reversed = terms_with_positions[:]
   2.254  terms_reversed.reverse()
   2.255  for term, doc_positions in terms_reversed:
   2.256 @@ -291,7 +347,7 @@
   2.257  
   2.258  rd.rewind()
   2.259  for term, doc_positions in terms_with_positions:
   2.260 -    t, fr, dp = rd.read_term()
   2.261 +    t, fr, df, dp = rd.read_term()
   2.262      dp = list(dp)
   2.263      print term == t, term, t
   2.264      print doc_positions == dp, doc_positions, dp
2009-09-02	Paul Boddie	Attempted to introduce position dictionaries with extra term record details providing document frequency information. Attempted to introduce file descriptor duplication in order to support concurrent iterators.
			Files changed: iixr.py, test.py