1.1 --- a/iixr/terms.py	Sat Feb 12 01:23:58 2011 +0100
     1.2 +++ b/iixr/terms.py	Sun Feb 13 02:49:55 2011 +0100
     1.3 @@ -18,29 +18,87 @@
     1.4  with this program.  If not, see <http://www.gnu.org/licenses/>.
     1.5  """
     1.6  
     1.7 +from iixr.data import *
     1.8  from iixr.files import *
     1.9 -from iixr.positions import *
    1.10  from iixr.phrases import PhraseIterator
    1.11  from os.path import commonprefix # to find common string prefixes
    1.12 -from bisect import bisect_right  # to find terms in the dictionary index
    1.13  
    1.14  class TermWriter(FileWriter):
    1.15  
    1.16      "Writing term information to files."
    1.17  
    1.18 -    def reset(self):
    1.19 +    def begin(self, docnum_size, position_size):
    1.20 +
    1.21 +        "Begin writing to the file."
    1.22 +
    1.23 +        self.write_numbers((docnum_size, position_size))
    1.24          self.end_record()
    1.25 +
    1.26 +        self.data_start = self.tell()
    1.27 +        self.docnum_size = docnum_size
    1.28 +        self.position_size = position_size
    1.29 +        self.subtractor = get_subtractor(docnum_size)
    1.30          self.last_term = ""
    1.31 -        self.last_offset = 0
    1.32  
    1.33 -    def write_term(self, term, offset, frequency, doc_frequency):
    1.34 +    def write_terms(self, terms):
    1.35  
    1.36          """
    1.37 -        Write the given 'term', its position file 'offset', its 'frequency' and
    1.38 -        its 'doc_frequency' (number of documents in which it appears) to the
    1.39 -        term information file.
    1.40 +        Write the 'terms' to the term information file, with each term's details
    1.41 +        stored in a separate record.
    1.42          """
    1.43  
    1.44 +        if hasattr(terms, "items"):
    1.45 +            terms = terms.items()
    1.46 +            terms.sort()
    1.47 +
    1.48 +        for term, doc_positions in terms:
    1.49 +            if not doc_positions:
    1.50 +                continue
    1.51 +
    1.52 +            if hasattr(doc_positions, "items"):
    1.53 +                doc_positions = doc_positions.items()
    1.54 +
    1.55 +            docnum, positions = doc_positions[0]
    1.56 +
    1.57 +            if not positions:
    1.58 +                continue
    1.59 +
    1.60 +            # Start the writing, if appropriate.
    1.61 +
    1.62 +            if self.data_start is None:
    1.63 +                self.begin(sizeof(docnum), sizeof(positions[0]))
    1.64 +
    1.65 +            # Write each term and document positions.
    1.66 +
    1.67 +            self.write_term(term, doc_positions)
    1.68 +            self.end_record()
    1.69 +
    1.70 +    # Methods requiring an open record.
    1.71 +
    1.72 +    def write_term(self, term, doc_positions):
    1.73 +
    1.74 +        """
    1.75 +        Write the given 'term', its document frequency (number of documents in
    1.76 +        which it appears), and 'doc_positions' to the term information file.
    1.77 +        """
    1.78 +
    1.79 +        self.write_term_only(term)
    1.80 +
    1.81 +        # Write the document frequency and the term positions.
    1.82 +
    1.83 +        self.write_positions(doc_positions)
    1.84 +
    1.85 +    def write_term_plus_remaining(self, term, data):
    1.86 +
    1.87 +        "Write the given 'term' and the document position 'data'."
    1.88 +
    1.89 +        self.write_term_only(term)
    1.90 +        self.write_remaining(data)
    1.91 +
    1.92 +    def write_term_only(self, term):
    1.93 +
    1.94 +        "Write only the given 'term'."
    1.95 +
    1.96          if term <= self.last_term:
    1.97              raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
    1.98  
    1.99 @@ -52,430 +110,173 @@
   1.100          self.write_number(common)
   1.101          self.write_string(suffix)
   1.102  
   1.103 -        # Write the offset delta.
   1.104 -        # Write the frequency.
   1.105 +        self.last_term = term
   1.106 +
   1.107 +    def write_positions(self, doc_positions):
   1.108 +
   1.109 +        "Write the given 'doc_positions' to the file."
   1.110 +
   1.111 +        # Make sure that the positions are sorted.
   1.112 +
   1.113 +        doc_positions.sort()
   1.114 +
   1.115          # Write the document frequency.
   1.116  
   1.117 -        self.write_numbers((
   1.118 -            offset - self.last_offset,
   1.119 -            frequency,
   1.120 -            doc_frequency
   1.121 -            ))
   1.122 +        self.write_number(len(doc_positions))
   1.123 +
   1.124 +        last_docnum = None
   1.125 +
   1.126 +        for docnum, positions in doc_positions:
   1.127 +
   1.128 +            # Store the first document number as it is.
   1.129 +
   1.130 +            if last_docnum is None:
   1.131 +                docnum_seq = docnum
   1.132 +
   1.133 +            # Reject out-of-order documents.
   1.134 +
   1.135 +            elif docnum < last_docnum:
   1.136 +                raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
   1.137  
   1.138 -        self.last_term = term
   1.139 -        self.last_offset = offset
   1.140 +            # Calculate an ongoing delta.
   1.141 +
   1.142 +            else:
   1.143 +                docnum_seq = self.subtractor(docnum, last_docnum)
   1.144 +
   1.145 +            # Write the document number and positions.
   1.146 +
   1.147 +            self.write_sequence_value(docnum_seq, self.docnum_size)
   1.148 +            self.write_monotonic_sequence(positions, self.position_size)
   1.149 +
   1.150 +            last_docnum = docnum
   1.151 +
   1.152 +        # Write a terminating byte to indicate that no more document pages
   1.153 +        # exist.
   1.154 +
   1.155 +        self.write_byte(0)
   1.156  
   1.157  class TermReader(FileReader):
   1.158  
   1.159      "Reading term information from files."
   1.160  
   1.161 -    def reset(self):
   1.162 +    def begin(self):
   1.163 +
   1.164 +        "Begin reading from the file."
   1.165 +
   1.166 +        self.begin_record()
   1.167 +        try:
   1.168 +            self.docnum_size, self.position_size = self.read_numbers(2)
   1.169 +        except EOFError:
   1.170 +            self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
   1.171 +
   1.172 +        self.data_start = self.tell()
   1.173 +        self.adder = get_adder(self.docnum_size)
   1.174          self.last_term = ""
   1.175 -        self.last_offset = 0
   1.176 -        self.begin_record()
   1.177 +
   1.178 +    def get_sizes(self):
   1.179 +        return self.docnum_size, self.position_size
   1.180 +
   1.181 +    # Methods requiring an open record.
   1.182  
   1.183      def read_term(self):
   1.184  
   1.185 +        "Read a term and its document positions from the term information file."
   1.186 +
   1.187 +        # Read the term.
   1.188 +
   1.189 +        self.read_term_only()
   1.190 +
   1.191 +        # Read the document frequency and the term positions.
   1.192 +
   1.193 +        positions = self.read_positions()
   1.194 +
   1.195 +        return self.last_term, positions
   1.196 +
   1.197 +    def read_term_plus_remaining(self):
   1.198 +
   1.199          """
   1.200 -        Read a term, its position file offset, its frequency and its document
   1.201 -        frequency from the term information file.
   1.202 +        Read a term and the unprocessed document position data.
   1.203          """
   1.204  
   1.205 +        self.read_term_only()
   1.206 +        return self.last_term, self.read_remaining()
   1.207 +
   1.208 +    def read_term_only(self):
   1.209 +
   1.210 +        "Read a term only."
   1.211 +
   1.212          # Read the prefix length and term suffix.
   1.213  
   1.214          common = self.read_number()
   1.215          suffix = self.read_string()
   1.216  
   1.217          self.last_term = self.last_term[:common] + suffix
   1.218 -
   1.219 -        # Read the offset delta.
   1.220 -
   1.221 -        self.last_offset += self.read_number()
   1.222 -
   1.223 -        # Read the frequency.
   1.224 -
   1.225 -        frequency = self.read_number()
   1.226 -
   1.227 -        # Read the document frequency.
   1.228 -
   1.229 -        doc_frequency = self.read_number()
   1.230 +        return self.last_term
   1.231  
   1.232 -        return self.last_term, self.last_offset, frequency, doc_frequency
   1.233 -
   1.234 -    def go_to_term(self, term, offset, info_offset):
   1.235 -
   1.236 -        """
   1.237 -        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
   1.238 -        permits the scanning for later terms from the specified term.
   1.239 -        """
   1.240 -
   1.241 -        self.seek(info_offset)
   1.242 -        self.last_term = term
   1.243 -        self.last_offset = offset
   1.244 -
   1.245 -class TermIndexWriter(TermWriter):
   1.246 +    def read_positions(self):
   1.247  
   1.248 -    "Writing term dictionary index details to files."
   1.249 -
   1.250 -    def reset(self):
   1.251 -        TermWriter.reset(self)
   1.252 -        self.last_info_offset = 0
   1.253 -
   1.254 -    def write_term(self, term, offset, frequency, doc_frequency, info_offset):
   1.255 -
   1.256 -        """
   1.257 -        Write the given 'term', its position file 'offset', its 'frequency' and
   1.258 -        its 'doc_frequency' to the term dictionary index file, along with the
   1.259 -        'info_offset' in the term information file.
   1.260 -        """
   1.261 +        "Read document positions from the term information file."
   1.262  
   1.263 -        TermWriter.write_term(self, term, offset, frequency, doc_frequency)
   1.264 -
   1.265 -        # Write the information file offset delta.
   1.266 -
   1.267 -        self.write_number(info_offset - self.last_info_offset)
   1.268 -
   1.269 -        self.last_info_offset = info_offset
   1.270 +        doc_positions = []
   1.271  
   1.272 -class TermIndexReader(TermReader):
   1.273 -
   1.274 -    "Reading term dictionary index details from files."
   1.275 -
   1.276 -    def reset(self):
   1.277 -        TermReader.reset(self)
   1.278 -        self.last_info_offset = 0
   1.279 +        while 1:
   1.280  
   1.281 -    def read_term(self):
   1.282 -
   1.283 -        """
   1.284 -        Read a term, its position file offset, its frequency, its document
   1.285 -        frequency and a term information file offset from the term dictionary
   1.286 -        index file.
   1.287 -        """
   1.288 -
   1.289 -        term, offset, frequency, doc_frequency = TermReader.read_term(self)
   1.290 -
   1.291 -        # Read the offset delta.
   1.292 -
   1.293 -        self.last_info_offset += self.read_number()
   1.294 +            # Read the document frequency.
   1.295  
   1.296 -        return term, offset, frequency, doc_frequency, self.last_info_offset
   1.297 -
   1.298 -class TermDictionaryWriter:
   1.299 -
   1.300 -    "Writing term dictionaries."
   1.301 -
   1.302 -    def __init__(self, info_writer, index_writer, position_dict_writer, interval):
   1.303 -        self.info_writer = info_writer
   1.304 -        self.index_writer = index_writer
   1.305 -        self.position_dict_writer = position_dict_writer
   1.306 -        self.interval = interval
   1.307 -        self.entry = 0
   1.308 -
   1.309 -        self.index_writer.reset()
   1.310 +            npositions = self.read_number()
   1.311  
   1.312 -    def _write_term(self, term, offset, frequency, doc_frequency):
   1.313 -
   1.314 -        """
   1.315 -        Write the given 'term', its position file 'offset', its 'frequency' and
   1.316 -        its 'doc_frequency' (number of documents in which it appears) to the
   1.317 -        term information file. Return the offset before the term information was
   1.318 -        written to the file.
   1.319 -        """
   1.320 -
   1.321 -        if self.entry % self.interval == 0:
   1.322 -            self.info_writer.reset()
   1.323 -            info_offset = self.info_writer.tell()
   1.324 -            self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
   1.325 +            last_docnum = None
   1.326 +            i = 0
   1.327 +            while i < npositions:
   1.328  
   1.329 -        self.info_writer.write_term(term, offset, frequency, doc_frequency)
   1.330 -        self.entry += 1
   1.331 -
   1.332 -    def write_term_positions(self, term, doc_positions):
   1.333 -
   1.334 -        """
   1.335 -        Write the given 'term' and the 'doc_positions' recording the documents
   1.336 -        and positions at which the term is found.
   1.337 -        """
   1.338 -
   1.339 -        offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
   1.340 -
   1.341 -        if not frequency or not doc_frequency:
   1.342 -            raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions)
   1.343 -
   1.344 -        self._write_term(term, offset, frequency, doc_frequency)
   1.345 +                # Read the document number.
   1.346  
   1.347 -    def close(self):
   1.348 -        self.info_writer.close()
   1.349 -        self.index_writer.close()
   1.350 -        self.position_dict_writer.close()
   1.351 -
   1.352 -class TermDictionaryReader:
   1.353 -
   1.354 -    "Reading term dictionaries."
   1.355 +                docnum = self.read_sequence_value(self.docnum_size)
   1.356 +                if last_docnum is not None:
   1.357 +                    docnum = self.adder(docnum, last_docnum)
   1.358  
   1.359 -    def __init__(self, info_reader, index_reader, position_dict_reader):
   1.360 -        self.info_reader = info_reader
   1.361 -        self.index_reader = index_reader
   1.362 -        self.position_dict_reader = position_dict_reader
   1.363 -
   1.364 -        self.info_reader.reset()
   1.365 -        self.index_reader.reset()
   1.366 -
   1.367 -        self.entry = 0
   1.368 -        self.terms = []
   1.369 -        try:
   1.370 -            while 1:
   1.371 -                self.terms.append(self.index_reader.read_term())
   1.372 -        except EOFError:
   1.373 -            pass
   1.374 -
   1.375 -        # Large numbers for ordering purposes.
   1.376 +                # Read the positions.
   1.377  
   1.378 -        if self.terms:
   1.379 -            self.max_offset = self.terms[-1][1] + 1
   1.380 -        else:
   1.381 -            self.max_offset = None
   1.382 -
   1.383 -    def _find_closest_entry(self, term):
   1.384 -
   1.385 -        """
   1.386 -        Find the offsets and frequencies of 'term' from the term dictionary or
   1.387 -        the closest term starting with the value of 'term'.
   1.388 -
   1.389 -        Return the closest index entry consisting of a term, the position file
   1.390 -        offset, the term frequency, the document frequency, and the term details
   1.391 -        file offset.
   1.392 -        """
   1.393 +                positions = self.read_monotonic_sequence(self.position_size)
   1.394 +                doc_positions.append((docnum, positions))
   1.395  
   1.396 -        i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
   1.397 -
   1.398 -        # Get the entry position providing the term or one preceding it.
   1.399 -        # If no entry precedes the requested term, return the very first entry
   1.400 -        # as the closest.
   1.401 -
   1.402 -        if i == -1:
   1.403 -            self.entry = 0
   1.404 -            return self.terms[0]
   1.405 -        else:
   1.406 -            self.entry = i
   1.407 -            return self.terms[i]
   1.408 -
   1.409 -    def _find_closest_term(self, term):
   1.410 -
   1.411 -        """
   1.412 -        Find the offsets and frequencies of 'term' from the term dictionary or
   1.413 -        the closest term starting with the value of 'term'.
   1.414 +                last_docnum = docnum
   1.415 +                i += 1
   1.416  
   1.417 -        Return the closest term (or the term itself), the position file offset,
   1.418 -        the term frequency, the document frequency, and the term details file
   1.419 -        offset (or None if the reader is already positioned).
   1.420 -        """
   1.421 -
   1.422 -        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
   1.423 -
   1.424 -        # Where the term is found immediately, return the offset and
   1.425 -        # frequencies. If the term does not appear, return the details of the
   1.426 -        # closest entry.
   1.427 -
   1.428 -        if term <= found_term:
   1.429 -            return found_term, offset, frequency, doc_frequency, info_offset
   1.430 +            # Read a terminating byte to discover whether more document pages
   1.431 +            # exist.
   1.432  
   1.433 -        # Otherwise, seek past the index term's entry in the information file
   1.434 -        # and scan for the desired term.
   1.435 -
   1.436 -        else:
   1.437 -            # Reset the term and offset for the new page.
   1.438 -            self.info_reader.go_to_term("", 0, info_offset)
   1.439 -            try:
   1.440 -                while term > found_term:
   1.441 -                    found_term, offset, frequency, doc_frequency = self._read_term()
   1.442 -            except EOFError:
   1.443 -                pass
   1.444 -
   1.445 -            return found_term, offset, frequency, doc_frequency, None
   1.446 -
   1.447 -    def _find_term(self, term):
   1.448 +            if not self.read_byte():
   1.449 +                break
   1.450  
   1.451 -        """
   1.452 -        Find the position file offset and frequency of 'term' from the term
   1.453 -        dictionary.
   1.454 -        """
   1.455 -
   1.456 -        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
   1.457 -
   1.458 -        # If the term is found, return the offset and frequencies.
   1.459 -
   1.460 -        if term == found_term:
   1.461 -            return offset, frequency, doc_frequency
   1.462 -        else:
   1.463 -            return None
   1.464 -
   1.465 -    def _get_term_and_positions(self, term, offset, frequency, doc_frequency):
   1.466 +        return doc_positions
   1.467  
   1.468 -        """
   1.469 -        Return the term plus positions details using the given 'term', 'offset',
   1.470 -        'frequency' and 'doc_frequency'.
   1.471 -        """
   1.472 -
   1.473 -        return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
   1.474 -
   1.475 -    def _get_positions(self, offset, doc_frequency):
   1.476 +class TermIterator(TermReader):
   1.477  
   1.478 -        """
   1.479 -        Obtain positions from the position index 'offset' expecting a number of
   1.480 -        documents equal to the given 'doc_frequency'.
   1.481 -        """
   1.482 -
   1.483 -        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
   1.484 -
   1.485 -    # Iterator convenience methods.
   1.486 +    "An iterator over terms and positions read from a file."
   1.487  
   1.488      def __iter__(self):
   1.489 -        self.rewind()
   1.490          return self
   1.491  
   1.492      def next(self):
   1.493          try:
   1.494 +            self.begin_record()
   1.495              return self.read_term()
   1.496          except EOFError:
   1.497              raise StopIteration
   1.498  
   1.499 -    # Sequential access methods.
   1.500 -
   1.501 -    def rewind(self):
   1.502 -        self.entry = 0
   1.503 -        self.info_reader.rewind()
   1.504 -
   1.505 -    def read_term(self):
   1.506 -
   1.507 -        """
   1.508 -        Return the next term, its frequency, its document frequency, and the
   1.509 -        documents and positions at which the term is found.
   1.510 -        """
   1.511 -
   1.512 -        return self._get_term_and_positions(*self._read_term())
   1.513 -
   1.514 -    def _read_term(self):
   1.515 -
   1.516 -        try:
   1.517 -            term, offset, frequency, doc_frequency = self.info_reader.read_term()
   1.518 -        except EOFError:
   1.519 -            self.entry += 1
   1.520 -            try:
   1.521 -                term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
   1.522 -            except IndexError:
   1.523 -                raise EOFError
   1.524 -            else:
   1.525 -                # Reset the term and offset for the new page.
   1.526 -
   1.527 -                self.info_reader.go_to_term("", 0, info_offset)
   1.528 -
   1.529 -                # Skip the term in the information file.
   1.530 -
   1.531 -                self.info_reader.read_term()
   1.532 +class TermDataIterator(TermReader):
   1.533  
   1.534 -        return term, offset, frequency, doc_frequency
   1.535 -
   1.536 -    def go_to_term(self, term):
   1.537 -
   1.538 -        """
   1.539 -        Navigate to 'term' in the dictionary, returning the details from its
   1.540 -        entry. The returned details can be augmented with position information
   1.541 -        when presented to the _get_term_and_positions method.
   1.542 -        """
   1.543 -
   1.544 -        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
   1.545 -
   1.546 -        # Position the reader, if necessary.
   1.547 -
   1.548 -        if info_offset is not None:
   1.549 +    "An iterator over terms and unprocessed document positions data."
   1.550  
   1.551 -            # Reset the term and offset for the new page.
   1.552 -
   1.553 -            self.info_reader.go_to_term("", 0, info_offset)
   1.554 -
   1.555 -            # Skip the term in the information file.
   1.556 -
   1.557 -            self.info_reader.read_term()
   1.558 -
   1.559 -        return found_term, offset, frequency, doc_frequency
   1.560 -
   1.561 -    # Query methods.
   1.562 -
   1.563 -    def get_terms(self):
   1.564 -
   1.565 -        "Return a list of all terms."
   1.566 -
   1.567 -        return iter(self)
   1.568 +    def __iter__(self):
   1.569 +        return self
   1.570  
   1.571 -    def find_terms(self, term):
   1.572 -
   1.573 -        "Return all terms whose values start with the value of 'term'."
   1.574 -
   1.575 -        terms = []
   1.576 -
   1.577 -        found_term, offset, frequency, doc_frequency = self.go_to_term(term)
   1.578 -
   1.579 -        # Read and record terms.
   1.580 -
   1.581 +    def next(self):
   1.582          try:
   1.583 -            # Add the found term if it starts with the specified term.
   1.584 -
   1.585 -            while found_term.startswith(term):
   1.586 -                terms.append(found_term)
   1.587 -                found_term, offset, frequency, doc_frequency = self._read_term()
   1.588 -
   1.589 +            self.begin_record()
   1.590 +            return self.read_term_plus_remaining()
   1.591          except EOFError:
   1.592 -            pass
   1.593 -
   1.594 -        return terms
   1.595 -
   1.596 -    def find_positions(self, term):
   1.597 -
   1.598 -        "Return the documents and positions at which the given 'term' is found."
   1.599 -
   1.600 -        t = self._find_term(term)
   1.601 -        if t is None:
   1.602 -            return []
   1.603 -        else:
   1.604 -            offset, frequency, doc_frequency = t
   1.605 -            return self._get_positions(offset, doc_frequency)
   1.606 -
   1.607 -    def find_common_positions(self, terms):
   1.608 -
   1.609 -        """
   1.610 -        Return the documents and positions at which all the given 'terms' are
   1.611 -        found, where only common documents are returned.
   1.612 -        """
   1.613 -
   1.614 -        return PhraseIterator([self.find_positions(term) for term in terms])
   1.615 -
   1.616 -    def get_frequency(self, term):
   1.617 -
   1.618 -        "Return the frequency of the given 'term'."
   1.619 -
   1.620 -        t = self._find_term(term)
   1.621 -        if t is None:
   1.622 -            return None
   1.623 -        else:
   1.624 -            offset, frequency, doc_frequency = t
   1.625 -            return frequency
   1.626 -
   1.627 -    def get_document_frequency(self, term):
   1.628 -
   1.629 -        "Return the document frequency of the given 'term'."
   1.630 -
   1.631 -        t = self._find_term(term)
   1.632 -        if t is None:
   1.633 -            return None
   1.634 -        else:
   1.635 -            offset, frequency, doc_frequency = t
   1.636 -            return doc_frequency
   1.637 -
   1.638 -    def close(self):
   1.639 -        self.info_reader.close()
   1.640 -        self.index_reader.close()
   1.641 -        self.position_dict_reader.close()
   1.642 +            raise StopIteration
   1.643  
   1.644  # vim: tabstop=4 expandtab shiftwidth=4