1.1 --- a/iixr.py Sat Sep 05 18:10:50 2009 +0200
1.2 +++ b/iixr.py Sun Sep 06 02:01:00 2009 +0200
1.3 @@ -735,7 +735,7 @@
1.4
1.5 """
1.6 Read a term, its position file offset, its frequency and its document
1.7 - frequence from the term information file.
1.8 + frequency from the term information file.
1.9 """
1.10
1.11 # Read the prefix length and term suffix.
1.12 @@ -879,27 +879,47 @@
1.13
1.14 self.max_offset = self.terms[-1][1] + 1
1.15
1.16 - def _find_term(self, term):
1.17 + def _find_closest_entry(self, term):
1.18
1.19 """
1.20 - Find the position file offset and frequency of 'term' from the term
1.21 - dictionary.
1.22 + Find the index entry for 'term' in the term dictionary or, if absent,
1.23 + the nearest preceding entry (or the first entry if none precedes it).
1.24 +
1.25 + Return the closest index entry consisting of a term, the position file
1.26 + offset, the term frequency, the document frequency, and the term details
1.27 + file offset.
1.28 """
1.29
1.30 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
1.31
1.32 # Get the entry position providing the term or one preceding it.
1.33 + # If no entry precedes the requested term, return the very first entry
1.34 + # as the closest.
1.35
1.36 if i == -1:
1.37 - return None
1.38 -
1.39 - found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]
1.40 + return self.terms[0]
1.41 + else:
1.42 + return self.terms[i]
1.43 +
1.44 + def _find_closest_term(self, term):
1.45 +
1.46 + """
1.47 + Find the offsets and frequencies of 'term' from the term dictionary or
1.48 + the closest term starting with the value of 'term'.
1.49 +
1.50 + Return the closest term (or the term itself), the position file offset,
1.51 + the term frequency, the document frequency, and the term details file
1.52 + offset (or None if the reader is already positioned).
1.53 + """
1.54 +
1.55 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
1.56
1.57 # Where the term is found immediately, return the offset and
1.58 - # frequencies.
1.59 -
1.60 - if term == found_term:
1.61 - return offset, frequency, doc_frequency
1.62 + # frequencies. If the term sorts before the entry found, return the
1.63 + # details of that entry as the closest one.
1.64 +
1.65 + if term <= found_term:
1.66 + return found_term, offset, frequency, doc_frequency, info_offset
1.67
1.68 # Otherwise, seek past the index term's entry in the information file
1.69 # and scan for the desired term.
1.70 @@ -912,19 +932,32 @@
1.71 except EOFError:
1.72 pass
1.73
1.74 - # If the term is found, return the offset and frequencies.
1.75 -
1.76 - if term == found_term:
1.77 - return offset, frequency, doc_frequency
1.78 - else:
1.79 - return None
1.80 + return found_term, offset, frequency, doc_frequency, None
1.81 +
1.82 + def _find_term(self, term):
1.83 +
1.84 + """
1.85 + Find the position file offset and frequency of 'term' from the term
1.86 + dictionary.
1.87 + """
1.88 +
1.89 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.90 +
1.91 + # If the term is found, return the offset and frequencies.
1.92 +
1.93 + if term == found_term:
1.94 + return offset, frequency, doc_frequency
1.95 + else:
1.96 + return None
1.97 +
1.98 + def _get_positions(self, offset, doc_frequency):
1.99 + return self.position_dict_reader.read_term_positions(offset, doc_frequency)
1.100 +
1.101 + # Sequential access methods.
1.102
1.103 def rewind(self):
1.104 self.info_reader.rewind()
1.105
1.106 - def _get_positions(self, offset, doc_frequency):
1.107 - return self.position_dict_reader.read_term_positions(offset, doc_frequency)
1.108 -
1.109 def read_term(self):
1.110
1.111 """
1.112 @@ -936,6 +969,35 @@
1.113 positions = self._get_positions(offset, doc_frequency)
1.114 return term, frequency, doc_frequency, positions
1.115
1.116 + # Query methods.
1.117 +
1.118 + def find_terms(self, term):
1.119 +
1.120 + "Return all terms whose values start with the value of 'term'."
1.121 +
1.122 + terms = []
1.123 +
1.124 + found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.125 +
1.126 + # Position the reader, if necessary.
1.127 +
1.128 + if info_offset is not None:
1.129 + self.info_reader.go_to_term(found_term, offset, info_offset)
1.130 +
1.131 + # Read and record terms.
1.132 +
1.133 + try:
1.134 + # Add the found term if it starts with the specified term.
1.135 +
1.136 + while found_term.startswith(term):
1.137 + terms.append(found_term)
1.138 + found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.139 +
1.140 + except EOFError:
1.141 + pass
1.142 +
1.143 + return terms
1.144 +
1.145 def find_positions(self, term):
1.146
1.147 "Return the documents and positions at which the given 'term' is found."
1.148 @@ -1504,7 +1566,7 @@
1.149 else:
1.150 doc_fields = self.docs[docnum]
1.151
1.152 - doc_fields.append((identifier, value))
1.153 + doc_fields.append((identifier, unicode(value))) # convert to string
1.154
1.155 self.field_counter += 1
1.156 if self.flush_interval and self.field_counter >= self.flush_interval:
2.1 --- a/test.py Sat Sep 05 18:10:50 2009 +0200
2.2 +++ b/test.py Sun Sep 06 02:01:00 2009 +0200
2.3 @@ -201,8 +201,8 @@
2.4 doc_fields_reversed = doc_fields[:]
2.5 doc_fields_reversed.reverse()
2.6 for docnum, fields in doc_fields_reversed:
2.7 - df = rd.get_fields(docnum)
2.8 - print list(enumerate(fields)) == df, list(enumerate(fields)), df
2.9 + df = dict(rd.get_fields(docnum))
2.10 + print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
2.11 for docnum in (13579, 246810):
2.12 df = rd.get_fields(docnum)
2.13 print df is None, df
2.14 @@ -309,6 +309,13 @@
2.15 for term in ("dog", "dingo"):
2.16 t = rd._find_term(term)
2.17 print t is None, t
2.18 +
2.19 +# (Test term prefix searching.)
2.20 +
2.21 +print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
2.22 +print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
2.23 +print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
2.24 +print rd.find_terms("d") == [], rd.find_terms("d"), []
2.25 rd.close()
2.26
2.27 # Test dictionaries with term and position data.
2.28 @@ -361,7 +368,7 @@
2.29 for term, doc_positions in terms_reversed:
2.30 dp = list(rd.find_positions(term))
2.31 print doc_positions == dp, doc_positions, dp
2.32 -for term in ("dog", "dingo"):
2.33 +for term in ("aaa", "dog", "dingo"):
2.34 dp = rd.find_positions(term)
2.35 print dp is None, dp
2.36
2.37 @@ -422,8 +429,8 @@
2.38 fr = rd.get_frequency(term)
2.39 print frequency == fr, frequency, fr
2.40 for docnum, text in docs:
2.41 - df = rd.get_fields(docnum)
2.42 - print (123, text) == df[0], (123, text), df[0]
2.43 + df = dict(rd.get_fields(docnum))
2.44 + print df[123] == text, text, df[123]
2.45 for term, docnum, positions in position_tests:
2.46 dp = rd.find_positions(term)
2.47 pos = dp.from_document(docnum)