Added term retrieval by prefix. Added a conversion of field values to Unicode in the add_field method.

     1.1 --- a/iixr.py	Sat Sep 05 18:10:50 2009 +0200
     1.2 +++ b/iixr.py	Sun Sep 06 02:01:00 2009 +0200
     1.3 @@ -735,7 +735,7 @@
     1.4  
     1.5          """
     1.6          Read a term, its position file offset, its frequency and its document
     1.7 -        frequence from the term information file.
     1.8 +        frequency from the term information file.
     1.9          """
    1.10  
    1.11          # Read the prefix length and term suffix.
    1.12 @@ -879,27 +879,47 @@
    1.13  
    1.14          self.max_offset = self.terms[-1][1] + 1
    1.15  
    1.16 -    def _find_term(self, term):
    1.17 +    def _find_closest_entry(self, term):
    1.18  
    1.19          """
    1.20 -        Find the position file offset and frequency of 'term' from the term
    1.21 -        dictionary.
    1.22 +        Find the index entry for 'term' in the term dictionary or, if 'term' is
    1.23 +        not itself indexed, the entry that most closely precedes it.
    1.24 +
    1.25 +        Return the closest index entry consisting of a term, the position file
    1.26 +        offset, the term frequency, the document frequency, and the term details
    1.27 +        file offset.
    1.28          """
    1.29  
    1.30          i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
    1.31  
    1.32          # Get the entry position providing the term or one preceding it.
    1.33 +        # If no entry precedes the requested term, return the very first entry
    1.34 +        # as the closest.
    1.35  
    1.36          if i == -1:
    1.37 -            return None
    1.38 -
    1.39 -        found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]
    1.40 +            return self.terms[0]
    1.41 +        else:
    1.42 +            return self.terms[i]
    1.43 +
    1.44 +    def _find_closest_term(self, term):
    1.45 +
    1.46 +        """
    1.47 +        Find the offsets and frequencies of 'term' from the term dictionary or
    1.48 +        the closest term starting with the value of 'term'.
    1.49 +
    1.50 +        Return the closest term (or the term itself), the position file offset,
    1.51 +        the term frequency, the document frequency, and the term details file
    1.52 +        offset (or None if the reader is already positioned).
    1.53 +        """
    1.54 +
    1.55 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
    1.56  
    1.57          # Where the term is found immediately, return the offset and
    1.58 -        # frequencies.
    1.59 -
    1.60 -        if term == found_term:
    1.61 -            return offset, frequency, doc_frequency
    1.62 +        # frequencies. If the term does not appear, return the details of the
    1.63 +        # closest entry.
    1.64 +
    1.65 +        if term <= found_term:
    1.66 +            return found_term, offset, frequency, doc_frequency, info_offset
    1.67  
    1.68          # Otherwise, seek past the index term's entry in the information file
    1.69          # and scan for the desired term.
    1.70 @@ -912,19 +932,32 @@
    1.71              except EOFError:
    1.72                  pass
    1.73  
    1.74 -            # If the term is found, return the offset and frequencies.
    1.75 -
    1.76 -            if term == found_term:
    1.77 -                return offset, frequency, doc_frequency
    1.78 -            else:
    1.79 -                return None
    1.80 +            return found_term, offset, frequency, doc_frequency, None
    1.81 +
    1.82 +    def _find_term(self, term):
    1.83 +
    1.84 +        """
    1.85 +        Find the position file offset and frequency of 'term' from the term
    1.86 +        dictionary.
    1.87 +        """
    1.88 +
    1.89 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
    1.90 +
    1.91 +        # If the term is found, return the offset and frequencies.
    1.92 +
    1.93 +        if term == found_term:
    1.94 +            return offset, frequency, doc_frequency
    1.95 +        else:
    1.96 +            return None
    1.97 +
    1.98 +    def _get_positions(self, offset, doc_frequency):
    1.99 +        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
   1.100 +
   1.101 +    # Sequential access methods.
   1.102  
   1.103      def rewind(self):
   1.104          self.info_reader.rewind()
   1.105  
   1.106 -    def _get_positions(self, offset, doc_frequency):
   1.107 -        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
   1.108 -
   1.109      def read_term(self):
   1.110  
   1.111          """
   1.112 @@ -936,6 +969,35 @@
   1.113          positions = self._get_positions(offset, doc_frequency)
   1.114          return term, frequency, doc_frequency, positions
   1.115  
   1.116 +    # Query methods.
   1.117 +
   1.118 +    def find_terms(self, term):
   1.119 +
   1.120 +        "Return all terms whose values start with the value of 'term'."
   1.121 +
   1.122 +        terms = []
   1.123 +
   1.124 +        found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
   1.125 +
   1.126 +        # Position the reader, if necessary.
   1.127 +
   1.128 +        if info_offset is not None:
   1.129 +            self.info_reader.go_to_term(found_term, offset, info_offset)
   1.130 +
   1.131 +        # Read and record terms.
   1.132 +
   1.133 +        try:
   1.134 +            # Add the found term if it starts with the specified term.
   1.135 +
   1.136 +            while found_term.startswith(term):
   1.137 +                terms.append(found_term)
   1.138 +                found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
   1.139 +
   1.140 +        except EOFError:
   1.141 +            pass
   1.142 +
   1.143 +        return terms
   1.144 +
   1.145      def find_positions(self, term):
   1.146  
   1.147          "Return the documents and positions at which the given 'term' is found."
   1.148 @@ -1504,7 +1566,7 @@
   1.149          else:
   1.150              doc_fields = self.docs[docnum]
   1.151  
   1.152 -        doc_fields.append((identifier, value))
   1.153 +        doc_fields.append((identifier, unicode(value))) # convert to string
   1.154  
   1.155          self.field_counter += 1
   1.156          if self.flush_interval and self.field_counter >= self.flush_interval:

     2.1 --- a/test.py	Sat Sep 05 18:10:50 2009 +0200
     2.2 +++ b/test.py	Sun Sep 06 02:01:00 2009 +0200
     2.3 @@ -201,8 +201,8 @@
     2.4  doc_fields_reversed = doc_fields[:]
     2.5  doc_fields_reversed.reverse()
     2.6  for docnum, fields in doc_fields_reversed:
     2.7 -    df = rd.get_fields(docnum)
     2.8 -    print list(enumerate(fields)) == df, list(enumerate(fields)), df
     2.9 +    df = dict(rd.get_fields(docnum))
    2.10 +    print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
    2.11  for docnum in (13579, 246810):
    2.12      df = rd.get_fields(docnum)
    2.13      print df is None, df
    2.14 @@ -309,6 +309,13 @@
    2.15  for term in ("dog", "dingo"):
    2.16      t = rd._find_term(term)
    2.17      print t is None, t
    2.18 +
    2.19 +# (Test term prefix searching.)
    2.20 +
    2.21 +print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
    2.22 +print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
    2.23 +print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
    2.24 +print rd.find_terms("d") == [], rd.find_terms("d"), []
    2.25  rd.close()
    2.26  
    2.27  # Test dictionaries with term and position data.
    2.28 @@ -361,7 +368,7 @@
    2.29  for term, doc_positions in terms_reversed:
    2.30      dp = list(rd.find_positions(term))
    2.31      print doc_positions == dp, doc_positions, dp
    2.32 -for term in ("dog", "dingo"):
    2.33 +for term in ("aaa", "dog", "dingo"):
    2.34      dp = rd.find_positions(term)
    2.35      print dp is None, dp
    2.36  
    2.37 @@ -422,8 +429,8 @@
    2.38      fr = rd.get_frequency(term)
    2.39      print frequency == fr, frequency, fr
    2.40  for docnum, text in docs:
    2.41 -    df = rd.get_fields(docnum)
    2.42 -    print (123, text) == df[0], (123, text), df[0]
    2.43 +    df = dict(rd.get_fields(docnum))
    2.44 +    print df[123] == text, text, df[123]
    2.45  for term, docnum, positions in position_tests:
    2.46      dp = rd.find_positions(term)
    2.47      pos = dp.from_document(docnum)
2009-09-06	Paul Boddie	raw files shortlog changelog graph	Added term retrieval by prefix. Added a conversion of field values to Unicode in the add_field method.
			iixr.py (file) test.py (file)