1.1 --- a/iixr.py Sat Aug 29 21:15:47 2009 +0200
1.2 +++ b/iixr.py Sat Aug 29 22:12:25 2009 +0200
1.3 @@ -215,8 +215,9 @@
1.4
1.5 """
1.6 Write all 'doc_positions' - a collection of tuples of the form (document
1.7 - number, position list) - to the file, returning the offset at which they
1.8 - were stored.
1.9 + number, position list) - to the file, returning a tuple containing the
1.10 + offset at which they were stored together with the frequency (number of
1.11 + positions) for the term involved.
1.12 """
1.13
1.14 # Reset the writer and record the current file offset.
1.15 @@ -230,10 +231,13 @@
1.16
1.17 # Write the positions.
1.18
1.19 + frequency = 0
1.20 +
1.21 for docnum, positions in doc_positions:
1.22 self.write_positions(docnum, positions)
1.23 + frequency += len(positions)
1.24
1.25 - return offset
1.26 + return offset, frequency
1.27
1.28 class PositionReader(FileReader):
1.29
1.30 @@ -301,12 +305,12 @@
1.31 self.last_term = ""
1.32 self.last_offset = 0
1.33
1.34 - def write_term(self, term, offset):
1.35 + def write_term(self, term, offset, frequency):
1.36
1.37 """
1.38 - Write the given 'term' and its position file 'offset' to the term
1.39 - information file. Return the offset after the term information was
1.40 - written to the file.
1.41 + Write the given 'term', its position file 'offset', and its 'frequency'
1.42 + to the term information file. Return the offset after the term
1.43 + information was written to the file.
1.44 """
1.45
1.46 # Too long terms are not currently supported.
1.47 @@ -326,6 +330,10 @@
1.48
1.49 self.write_number(offset - self.last_offset)
1.50
1.51 + # Write the frequency.
1.52 +
1.53 + self.write_number(frequency)
1.54 +
1.55 self.last_term = term
1.56 self.last_offset = offset
1.57
1.58 @@ -342,7 +350,8 @@
1.59 def read_term(self):
1.60
1.61 """
1.62 - Read a term and its position file offset from the term information file.
1.63 + Read a term, its position file offset, and its frequency from the term
1.64 + information file.
1.65 """
1.66
1.67 # Read the prefix length and term suffix.
1.68 @@ -356,7 +365,11 @@
1.69
1.70 self.last_offset += self.read_number()
1.71
1.72 - return self.last_term, self.last_offset
1.73 + # Read the frequency.
1.74 +
1.75 + frequency = self.read_number()
1.76 +
1.77 + return self.last_term, self.last_offset, frequency
1.78
1.79 def go_to_term(self, term, offset, info_offset):
1.80
1.81 @@ -377,15 +390,15 @@
1.82 TermWriter.reset(self)
1.83 self.last_info_offset = 0
1.84
1.85 - def write_term(self, term, offset, info_offset):
1.86 + def write_term(self, term, offset, frequency, info_offset):
1.87
1.88 """
1.89 - Write the given 'term' and its position file 'offset' to the term
1.90 - dictionary index file, along with the 'info_offset' in the term
1.91 - information file.
1.92 + Write the given 'term', its position file 'offset', and its 'frequency'
1.93 + to the term dictionary index file, along with the 'info_offset' in the
1.94 + term information file.
1.95 """
1.96
1.97 - TermWriter.write_term(self, term, offset)
1.98 + TermWriter.write_term(self, term, offset, frequency)
1.99
1.100 # Write the information file offset delta.
1.101
1.102 @@ -403,17 +416,17 @@
1.103 def read_term(self):
1.104
1.105 """
1.106 - Read a term, its position file offset, and its term information file
1.107 - offset from the term dictionary index file.
1.108 + Read a term, its position file offset, its frequency, and its term
1.109 + information file offset from the term dictionary index file.
1.110 """
1.111
1.112 - term, offset = TermReader.read_term(self)
1.113 + term, offset, frequency = TermReader.read_term(self)
1.114
1.115 # Read the offset delta.
1.116
1.117 self.last_info_offset += self.read_number()
1.118
1.119 - return term, offset, self.last_info_offset
1.120 + return term, offset, frequency, self.last_info_offset
1.121
1.122 class TermDictionaryWriter:
1.123
1.124 @@ -426,17 +439,18 @@
1.125 self.interval = interval
1.126 self.entry = 0
1.127
1.128 - def _write_term(self, term, offset):
1.129 + def _write_term(self, term, offset, frequency):
1.130
1.131 """
1.132 - Write the given 'term' and its position file 'offset' to the term
1.133 - information file and optionally to the index, making a dictionary entry.
1.134 + Write the given 'term', its position file 'offset', and its 'frequency'
1.135 + to the term information file and optionally to the index, making a
1.136 + dictionary entry.
1.137 """
1.138
1.139 - info_offset = self.info_writer.write_term(term, offset)
1.140 + info_offset = self.info_writer.write_term(term, offset, frequency)
1.141
1.142 if self.entry % self.interval == 0:
1.143 - self.index_writer.write_term(term, offset, info_offset)
1.144 + self.index_writer.write_term(term, offset, frequency, info_offset)
1.145
1.146 self.entry += 1
1.147
1.148 @@ -447,8 +461,8 @@
1.149 and positions at which the term is found.
1.150 """
1.151
1.152 - offset = self.position_writer.write_all_positions(doc_positions)
1.153 - self._write_term(term, offset)
1.154 + offset, frequency = self.position_writer.write_all_positions(doc_positions)
1.155 + self._write_term(term, offset, frequency)
1.156
1.157 def close(self):
1.158 self.info_writer.close()
1.159 @@ -478,7 +492,10 @@
1.160
1.161 def _find_term(self, term):
1.162
1.163 - "Find the position file offset of 'term' from the term dictionary."
1.164 + """
1.165 + Find the position file offset and frequency of 'term' from the term
1.166 + dictionary, or return None if 'term' is not present.
1.167 + """
1.168
1.169 i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1
1.170
1.171 @@ -487,12 +504,12 @@
1.172 if i == -1:
1.173 return None
1.174
1.175 - found_term, offset, info_offset = self.terms[i]
1.176 + found_term, offset, frequency, info_offset = self.terms[i]
1.177
1.178 # Where the term is found immediately, return the offset.
1.179
1.180 if term == found_term:
1.181 - return offset
1.182 + return offset, frequency
1.183
1.184 # Otherwise, seek past the index term's entry in the information file
1.185 # and scan for the desired term.
1.186 @@ -501,14 +518,14 @@
1.187 self.info_reader.go_to_term(found_term, offset, info_offset)
1.188 try:
1.189 while term > found_term:
1.190 - found_term, offset = self.info_reader.read_term()
1.191 + found_term, offset, frequency = self.info_reader.read_term()
1.192 except EOFError:
1.193 pass
1.194
1.195 - # If the term is found, return the offset.
1.196 + # If the term is found, return the offset and frequency.
1.197
1.198 if term == found_term:
1.199 - return offset
1.200 + return offset, frequency
1.201 else:
1.202 return None
1.203
1.204 @@ -516,12 +533,24 @@
1.205
1.206 "Return the documents and positions at which the given 'term' is found."
1.207
1.208 - offset = self._find_term(term)
1.209 - if offset is None:
1.210 + t = self._find_term(term)
1.211 + if t is None:
1.212 return None
1.213 else:
1.214 + offset, frequency = t
1.215 return self.position_reader.read_all_positions(offset)
1.216
1.217 + def get_frequency(self, term):
1.218 +
1.219 + "Return the frequency of the given 'term', or None if it is not found."
1.220 +
1.221 + t = self._find_term(term)
1.222 + if t is None:
1.223 + return None
1.224 + else:
1.225 + offset, frequency = t
1.226 + return frequency
1.227 +
1.228 def close(self):
1.229 self.info_reader.close()
1.230 self.index_reader.close()
1.231 @@ -810,6 +839,9 @@
1.232 def find_positions(self, term):
1.233 return self.dict_reader.find_positions(term)
1.234
1.235 + def get_frequency(self, term):
1.236 + return self.dict_reader.get_frequency(term)
1.237 +
1.238 def get_fields(self, docnum):
1.239 return self.field_dict_reader.read_fields(docnum)
1.240
2.1 --- a/test.py Sat Aug 29 21:15:47 2009 +0200
2.2 +++ b/test.py Sat Aug 29 22:12:25 2009 +0200
2.3 @@ -54,9 +54,8 @@
2.4 w = iixr.PositionWriter(f)
2.5 offsets = []
2.6 for doc_positions in all_doc_positions:
2.7 - offsets.append(
2.8 - w.write_all_positions(doc_positions)
2.9 - )
2.10 + offset, frequency = w.write_all_positions(doc_positions)
2.11 + offsets.append(offset)
2.12 w.close()
2.13
2.14 f = open("test", "rb")
2.15 @@ -145,51 +144,55 @@
2.16 # Test terms.
2.17
2.18 terms = [
2.19 - ("aardvark", 100000123),
2.20 - ("anteater", 100000456),
2.21 - ("badger", 100000789),
2.22 - ("bull", 1000001234),
2.23 - ("bulldog", 1000002345),
2.24 - ("cat", 1000003456)
2.25 + # term offset frequency
2.26 + ("aardvark", 100000123, 1),
2.27 + ("anteater", 100000456, 2),
2.28 + ("badger", 100000789, 13),
2.29 + ("bull", 1000001234, 59),
2.30 + ("bulldog", 1000002345, 99),
2.31 + ("cat", 1000003456, 89)
2.32 ]
2.33
2.34 f = open("test", "wb")
2.35 w = iixr.TermWriter(f)
2.36 -for term, offset in terms:
2.37 - w.write_term(term, offset)
2.38 +for term, offset, frequency in terms:
2.39 + w.write_term(term, offset, frequency)
2.40 w.close()
2.41
2.42 f = open("test", "rb")
2.43 r = iixr.TermReader(f)
2.44 -for term, offset in terms:
2.45 - t, o = r.read_term()
2.46 +for term, offset, frequency in terms:
2.47 + t, o, fr = r.read_term()
2.48 print term == t, term, t
2.49 print offset == o, offset, o
2.50 + print frequency == fr, frequency, fr
2.51 r.close()
2.52
2.53 # Test terms in index files.
2.54
2.55 indexed_terms = [
2.56 - ("aardvark", 100000123, 200000321),
2.57 - ("anteater", 100000456, 200000654),
2.58 - ("badger", 100000789, 200000987),
2.59 - ("bull", 1000001234, 200004321),
2.60 - ("bulldog", 1000002345, 200005432),
2.61 - ("cat", 1000003456, 200006543)
2.62 + # term offset frequency info_offset
2.63 + ("aardvark", 100000123, 1, 200000321),
2.64 + ("anteater", 100000456, 2, 200000654),
2.65 + ("badger", 100000789, 13, 200000987),
2.66 + ("bull", 1000001234, 59, 200004321),
2.67 + ("bulldog", 1000002345, 99, 200005432),
2.68 + ("cat", 1000003456, 89, 200006543)
2.69 ]
2.70
2.71 f = open("test", "wb")
2.72 w = iixr.TermIndexWriter(f)
2.73 -for term, offset, info_offset in indexed_terms:
2.74 - w.write_term(term, offset, info_offset)
2.75 +for term, offset, frequency, info_offset in indexed_terms:
2.76 + w.write_term(term, offset, frequency, info_offset)
2.77 w.close()
2.78
2.79 f = open("test", "rb")
2.80 r = iixr.TermIndexReader(f)
2.81 -for term, offset, info_offset in indexed_terms:
2.82 - t, o, i = r.read_term()
2.83 +for term, offset, frequency, info_offset in indexed_terms:
2.84 + t, o, fr, i = r.read_term()
2.85 print term == t, term, t
2.86 print offset == o, offset, o
2.87 + print frequency == fr, frequency, fr
2.88 print info_offset == i, info_offset, i
2.89 r.close()
2.90
2.91 @@ -202,8 +205,8 @@
2.92 f3 = open("testP", "wb")
2.93 w3 = iixr.PositionWriter(f3)
2.94 wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
2.95 -for term, offset in terms:
2.96 - wd._write_term(term, offset)
2.97 +for term, offset, frequency in terms:
2.98 + wd._write_term(term, offset, frequency)
2.99 wd.close()
2.100
2.101 f = open("test", "rb")
2.102 @@ -215,12 +218,13 @@
2.103 rd = iixr.TermDictionaryReader(r, r2, r3)
2.104 terms_reversed = terms[:]
2.105 terms_reversed.reverse()
2.106 -for term, offset in terms_reversed:
2.107 - o = rd._find_term(term)
2.108 +for term, offset, frequency in terms_reversed:
2.109 + o, fr = rd._find_term(term)
2.110 print offset == o, offset, o
2.111 + print frequency == fr, frequency, fr
2.112 for term in ("dog", "dingo"):
2.113 - o = rd._find_term(term)
2.114 - print o is None, o
2.115 + t = rd._find_term(term)
2.116 + print t is None, t
2.117 rd.close()
2.118
2.119 # Test dictionaries with term and position data.
2.120 @@ -274,10 +278,10 @@
2.121 ]
2.122
2.123 doc_tests = [
2.124 - ("Every", [(2, [0]), (14, [0])]),
2.125 - ("good", [(2, [1]), (13, [1])]),
2.126 - ("deserves", [(2, [3]), (13, [3])]),
2.127 - ("sea", [(36, [2, 6])])
2.128 + ("Every", 2, [(2, [0]), (14, [0])]),
2.129 + ("good", 2, [(2, [1]), (13, [1])]),
2.130 + ("deserves", 2, [(2, [3]), (13, [3])]),
2.131 + ("sea", 2, [(36, [2, 6])])
2.132 ]
2.133
2.134 index = iixr.Index("test_index")
2.135 @@ -289,9 +293,11 @@
2.136 wi.close()
2.137
2.138 rd = index.get_reader()
2.139 -for term, doc_positions in doc_tests:
2.140 +for term, frequency, doc_positions in doc_tests:
2.141 dp = rd.find_positions(term)
2.142 print doc_positions == dp, doc_positions, dp
2.143 + fr = rd.get_frequency(term)
2.144 + print frequency == fr, frequency, fr
2.145 for docnum, text in docs:
2.146 df = rd.get_fields(docnum)
2.147 print text == df[0], text, df[0]