Added term frequency information to the term dictionary.

     1.1 --- a/iixr.py	Sat Aug 29 21:15:47 2009 +0200
     1.2 +++ b/iixr.py	Sat Aug 29 22:12:25 2009 +0200
     1.3 @@ -215,8 +215,9 @@
     1.4  
     1.5          """
     1.6          Write all 'doc_positions' - a collection of tuples of the form (document
     1.7 -        number, position list) - to the file, returning the offset at which they
     1.8 -        were stored.
     1.9 +        number, position list) - to the file, returning a tuple containing the
    1.10 +        offset at which they were stored together with the frequency (number of
    1.11 +        positions) for the term involved.
    1.12          """
    1.13  
    1.14          # Reset the writer and record the current file offset.
    1.15 @@ -230,10 +231,13 @@
    1.16  
    1.17          # Write the positions.
    1.18  
    1.19 +        frequency = 0
    1.20 +
    1.21          for docnum, positions in doc_positions:
    1.22              self.write_positions(docnum, positions)
    1.23 +            frequency += len(positions)
    1.24  
    1.25 -        return offset
    1.26 +        return offset, frequency
    1.27  
    1.28  class PositionReader(FileReader):
    1.29  
    1.30 @@ -301,12 +305,12 @@
    1.31          self.last_term = ""
    1.32          self.last_offset = 0
    1.33  
    1.34 -    def write_term(self, term, offset):
    1.35 +    def write_term(self, term, offset, frequency):
    1.36  
    1.37          """
    1.38 -        Write the given 'term' and its position file 'offset' to the term
    1.39 -        information file. Return the offset after the term information was
    1.40 -        written to the file.
    1.41 +        Write the given 'term', its position file 'offset', and its 'frequency'
    1.42 +        to the term information file. Return the offset after the term
    1.43 +        information was written to the file.
    1.44          """
    1.45  
    1.46          # Too long terms are not currently supported.
    1.47 @@ -326,6 +330,10 @@
    1.48  
    1.49          self.write_number(offset - self.last_offset)
    1.50  
    1.51 +        # Write the frequency.
    1.52 +
    1.53 +        self.write_number(frequency)
    1.54 +
    1.55          self.last_term = term
    1.56          self.last_offset = offset
    1.57  
    1.58 @@ -342,7 +350,8 @@
    1.59      def read_term(self):
    1.60  
    1.61          """
    1.62 -        Read a term and its position file offset from the term information file.
    1.63 +        Read a term, its position file offset, and its frequency from the term
    1.64 +        information file.
    1.65          """
    1.66  
    1.67          # Read the prefix length and term suffix.
    1.68 @@ -356,7 +365,11 @@
    1.69  
    1.70          self.last_offset += self.read_number()
    1.71  
    1.72 -        return self.last_term, self.last_offset
    1.73 +        # Read the frequency.
    1.74 +
    1.75 +        frequency = self.read_number()
    1.76 +
    1.77 +        return self.last_term, self.last_offset, frequency
    1.78  
    1.79      def go_to_term(self, term, offset, info_offset):
    1.80  
    1.81 @@ -377,15 +390,15 @@
    1.82          TermWriter.reset(self)
    1.83          self.last_info_offset = 0
    1.84  
    1.85 -    def write_term(self, term, offset, info_offset):
    1.86 +    def write_term(self, term, offset, frequency, info_offset):
    1.87  
    1.88          """
    1.89 -        Write the given 'term' and its position file 'offset' to the term
    1.90 -        dictionary index file, along with the 'info_offset' in the term
    1.91 -        information file.
    1.92 +        Write the given 'term', its position file 'offset', and its 'frequency'
    1.93 +        to the term dictionary index file, along with the 'info_offset' in the
    1.94 +        term information file.
    1.95          """
    1.96  
    1.97 -        TermWriter.write_term(self, term, offset)
    1.98 +        TermWriter.write_term(self, term, offset, frequency)
    1.99  
   1.100          # Write the information file offset delta.
   1.101  
   1.102 @@ -403,17 +416,17 @@
   1.103      def read_term(self):
   1.104  
   1.105          """
   1.106 -        Read a term, its position file offset, and its term information file
   1.107 -        offset from the term dictionary index file.
   1.108 +        Read a term, its position file offset, its frequency, and its term
   1.109 +        information file offset from the term dictionary index file.
   1.110          """
   1.111  
   1.112 -        term, offset = TermReader.read_term(self)
   1.113 +        term, offset, frequency = TermReader.read_term(self)
   1.114  
   1.115          # Read the offset delta.
   1.116  
   1.117          self.last_info_offset += self.read_number()
   1.118  
   1.119 -        return term, offset, self.last_info_offset
   1.120 +        return term, offset, frequency, self.last_info_offset
   1.121  
   1.122  class TermDictionaryWriter:
   1.123  
   1.124 @@ -426,17 +439,18 @@
   1.125          self.interval = interval
   1.126          self.entry = 0
   1.127  
   1.128 -    def _write_term(self, term, offset):
   1.129 +    def _write_term(self, term, offset, frequency):
   1.130  
   1.131          """
   1.132 -        Write the given 'term' and its position file 'offset' to the term
   1.133 -        information file and optionally to the index, making a dictionary entry.
   1.134 +        Write the given 'term', its position file 'offset', and its 'frequency'
   1.135 +        to the term information file and optionally to the index, making a
   1.136 +        dictionary entry.
   1.137          """
   1.138  
   1.139 -        info_offset = self.info_writer.write_term(term, offset)
   1.140 +        info_offset = self.info_writer.write_term(term, offset, frequency)
   1.141  
   1.142          if self.entry % self.interval == 0:
   1.143 -            self.index_writer.write_term(term, offset, info_offset)
   1.144 +            self.index_writer.write_term(term, offset, frequency, info_offset)
   1.145  
   1.146          self.entry += 1
   1.147  
   1.148 @@ -447,8 +461,8 @@
   1.149          and positions at which the term is found.
   1.150          """
   1.151  
   1.152 -        offset = self.position_writer.write_all_positions(doc_positions)
   1.153 -        self._write_term(term, offset)
   1.154 +        offset, frequency = self.position_writer.write_all_positions(doc_positions)
   1.155 +        self._write_term(term, offset, frequency)
   1.156  
   1.157      def close(self):
   1.158          self.info_writer.close()
   1.159 @@ -478,7 +492,10 @@
   1.160  
   1.161      def _find_term(self, term):
   1.162  
   1.163 -        "Find the position file offset of 'term' from the term dictionary."
   1.164 +        """
   1.165 +        Find the position file offset and frequency of 'term' from the term
   1.166 +        dictionary.
   1.167 +        """
   1.168  
   1.169          i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1
   1.170  
   1.171 @@ -487,12 +504,12 @@
   1.172          if i == -1:
   1.173              return None
   1.174  
   1.175 -        found_term, offset, info_offset = self.terms[i]
   1.176 +        found_term, offset, frequency, info_offset = self.terms[i]
   1.177  
   1.178          # Where the term is found immediately, return the offset.
   1.179  
   1.180          if term == found_term:
   1.181 -            return offset
   1.182 +            return offset, frequency
   1.183  
   1.184          # Otherwise, seek past the index term's entry in the information file
   1.185          # and scan for the desired term.
   1.186 @@ -501,14 +518,14 @@
   1.187              self.info_reader.go_to_term(found_term, offset, info_offset)
   1.188              try:
   1.189                  while term > found_term:
   1.190 -                    found_term, offset = self.info_reader.read_term()
   1.191 +                    found_term, offset, frequency = self.info_reader.read_term()
   1.192              except EOFError:
   1.193                  pass
   1.194  
   1.195 -            # If the term is found, return the offset.
   1.196 +            # If the term is found, return the offset and frequency.
   1.197  
   1.198              if term == found_term:
   1.199 -                return offset
   1.200 +                return offset, frequency
   1.201              else:
   1.202                  return None
   1.203  
   1.204 @@ -516,12 +533,24 @@
   1.205  
   1.206          "Return the documents and positions at which the given 'term' is found."
   1.207  
   1.208 -        offset = self._find_term(term)
   1.209 -        if offset is None:
   1.210 +        t = self._find_term(term)
   1.211 +        if t is None:
   1.212              return None
   1.213          else:
   1.214 +            offset, frequency = t
   1.215              return self.position_reader.read_all_positions(offset)
   1.216  
   1.217 +    def get_frequency(self, term):
   1.218 +
   1.219 +        "Return the frequency of the given 'term'."
   1.220 +
   1.221 +        t = self._find_term(term)
   1.222 +        if t is None:
   1.223 +            return None
   1.224 +        else:
   1.225 +            offset, frequency = t
   1.226 +            return frequency
   1.227 +
   1.228      def close(self):
   1.229          self.info_reader.close()
   1.230          self.index_reader.close()
   1.231 @@ -810,6 +839,9 @@
   1.232      def find_positions(self, term):
   1.233          return self.dict_reader.find_positions(term)
   1.234  
   1.235 +    def get_frequency(self, term):
   1.236 +        return self.dict_reader.get_frequency(term)
   1.237 +
   1.238      def get_fields(self, docnum):
   1.239          return self.field_dict_reader.read_fields(docnum)
   1.240  

     2.1 --- a/test.py	Sat Aug 29 21:15:47 2009 +0200
     2.2 +++ b/test.py	Sat Aug 29 22:12:25 2009 +0200
     2.3 @@ -54,9 +54,8 @@
     2.4  w = iixr.PositionWriter(f)
     2.5  offsets = []
     2.6  for doc_positions in all_doc_positions:
     2.7 -    offsets.append(
     2.8 -        w.write_all_positions(doc_positions)
     2.9 -        )
    2.10 +    offset, frequency = w.write_all_positions(doc_positions)
    2.11 +    offsets.append(offset)
    2.12  w.close()
    2.13  
    2.14  f = open("test", "rb")
    2.15 @@ -145,51 +144,55 @@
    2.16  # Test terms.
    2.17  
    2.18  terms = [
    2.19 -    ("aardvark",  100000123),
    2.20 -    ("anteater",  100000456),
    2.21 -    ("badger",    100000789),
    2.22 -    ("bull",     1000001234),
    2.23 -    ("bulldog",  1000002345),
    2.24 -    ("cat",      1000003456)
    2.25 +    # term       offset      frequency
    2.26 +    ("aardvark",  100000123,  1),
    2.27 +    ("anteater",  100000456,  2),
    2.28 +    ("badger",    100000789, 13),
    2.29 +    ("bull",     1000001234, 59),
    2.30 +    ("bulldog",  1000002345, 99),
    2.31 +    ("cat",      1000003456, 89)
    2.32      ]
    2.33  
    2.34  f = open("test", "wb")
    2.35  w = iixr.TermWriter(f)
    2.36 -for term, offset in terms:
    2.37 -    w.write_term(term, offset)
    2.38 +for term, offset, frequency in terms:
    2.39 +    w.write_term(term, offset, frequency)
    2.40  w.close()
    2.41  
    2.42  f = open("test", "rb")
    2.43  r = iixr.TermReader(f)
    2.44 -for term, offset in terms:
    2.45 -    t, o = r.read_term()
    2.46 +for term, offset, frequency in terms:
    2.47 +    t, o, fr = r.read_term()
    2.48      print term == t, term, t
    2.49      print offset == o, offset, o
    2.50 +    print frequency == fr, frequency, fr
    2.51  r.close()
    2.52  
    2.53  # Test terms in index files.
    2.54  
    2.55  indexed_terms = [
    2.56 -    ("aardvark",  100000123, 200000321),
    2.57 -    ("anteater",  100000456, 200000654),
    2.58 -    ("badger",    100000789, 200000987),
    2.59 -    ("bull",     1000001234, 200004321),
    2.60 -    ("bulldog",  1000002345, 200005432),
    2.61 -    ("cat",      1000003456, 200006543)
    2.62 +    # term       offset      frequency  info_offset
    2.63 +    ("aardvark",  100000123,  1,        200000321),
    2.64 +    ("anteater",  100000456,  2,        200000654),
    2.65 +    ("badger",    100000789, 13,        200000987),
    2.66 +    ("bull",     1000001234, 59,        200004321),
    2.67 +    ("bulldog",  1000002345, 99,        200005432),
    2.68 +    ("cat",      1000003456, 89,        200006543)
    2.69      ]
    2.70  
    2.71  f = open("test", "wb")
    2.72  w = iixr.TermIndexWriter(f)
    2.73 -for term, offset, info_offset in indexed_terms:
    2.74 -    w.write_term(term, offset, info_offset)
    2.75 +for term, offset, frequency, info_offset in indexed_terms:
    2.76 +    w.write_term(term, offset, frequency, info_offset)
    2.77  w.close()
    2.78  
    2.79  f = open("test", "rb")
    2.80  r = iixr.TermIndexReader(f)
    2.81 -for term, offset, info_offset in indexed_terms:
    2.82 -    t, o, i = r.read_term()
    2.83 +for term, offset, frequency, info_offset in indexed_terms:
    2.84 +    t, o, fr, i = r.read_term()
    2.85      print term == t, term, t
    2.86      print offset == o, offset, o
    2.87 +    print frequency == fr, frequency, fr
    2.88      print info_offset == i, info_offset, i
    2.89  r.close()
    2.90  
    2.91 @@ -202,8 +205,8 @@
    2.92  f3 = open("testP", "wb")
    2.93  w3 = iixr.PositionWriter(f3)
    2.94  wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
    2.95 -for term, offset in terms:
    2.96 -    wd._write_term(term, offset)
    2.97 +for term, offset, frequency in terms:
    2.98 +    wd._write_term(term, offset, frequency)
    2.99  wd.close()
   2.100  
   2.101  f = open("test", "rb")
   2.102 @@ -215,12 +218,13 @@
   2.103  rd = iixr.TermDictionaryReader(r, r2, r3)
   2.104  terms_reversed = terms[:]
   2.105  terms_reversed.reverse()
   2.106 -for term, offset in terms_reversed:
   2.107 -    o = rd._find_term(term)
   2.108 +for term, offset, frequency in terms_reversed:
   2.109 +    o, fr = rd._find_term(term)
   2.110      print offset == o, offset, o
   2.111 +    print frequency == fr, frequency, fr
   2.112  for term in ("dog", "dingo"):
   2.113 -    o = rd._find_term(term)
   2.114 -    print o is None, o
   2.115 +    t = rd._find_term(term)
   2.116 +    print t is None, t
   2.117  rd.close()
   2.118  
   2.119  # Test dictionaries with term and position data.
   2.120 @@ -274,10 +278,10 @@
   2.121      ]
   2.122  
   2.123  doc_tests = [
   2.124 -    ("Every", [(2, [0]), (14, [0])]),
   2.125 -    ("good", [(2, [1]), (13, [1])]),
   2.126 -    ("deserves", [(2, [3]), (13, [3])]),
   2.127 -    ("sea", [(36, [2, 6])])
   2.128 +    ("Every", 2, [(2, [0]), (14, [0])]),
   2.129 +    ("good", 2, [(2, [1]), (13, [1])]),
   2.130 +    ("deserves", 2, [(2, [3]), (13, [3])]),
   2.131 +    ("sea", 2, [(36, [2, 6])])
   2.132      ]
   2.133  
   2.134  index = iixr.Index("test_index")
   2.135 @@ -289,9 +293,11 @@
   2.136  wi.close()
   2.137  
   2.138  rd = index.get_reader()
   2.139 -for term, doc_positions in doc_tests:
   2.140 +for term, frequency, doc_positions in doc_tests:
   2.141      dp = rd.find_positions(term)
   2.142      print doc_positions == dp, doc_positions, dp
   2.143 +    fr = rd.get_frequency(term)
   2.144 +    print frequency == fr, frequency, fr
   2.145  for docnum, text in docs:
   2.146      df = rd.get_fields(docnum)
   2.147      print text == df[0], text, df[0]
Changeset: 2009-08-29, Paul Boddie - Added term frequency information to the term dictionary.
Files changed: iixr.py, test.py