1.1 --- a/iixr.py Thu Sep 03 01:09:06 2009 +0200
1.2 +++ b/iixr.py Thu Sep 03 22:10:43 2009 +0200
1.3 @@ -282,8 +282,6 @@
1.4 to 'count'.
1.5 """
1.6
1.7 - self.reset()
1.8 -
1.9 # Duplicate the file handle.
1.10
1.11 f = fdopen(dup(self.f.fileno()), "rb")
1.12 @@ -362,10 +360,6 @@
1.13 to 'doc_frequency'.
1.14 """
1.15
1.16 - # NOTE: This is almost a duplication of PositionReader.read_term_positions.
1.17 -
1.18 - self.reset()
1.19 -
1.20 # Duplicate the file handle.
1.21
1.22 f = fdopen(dup(self.f.fileno()), "rb")
1.23 @@ -494,6 +488,11 @@
1.24 first_docnum = None
1.25 count = 0
1.26
1.27 + # Reset the position writer so that position readers accessing
1.28 + # a section start with the correct document number.
1.29 +
1.30 + self.position_writer.reset()
1.31 +
1.32 # Finish writing an index entry for the remaining documents.
1.33
1.34 else:
1.35 @@ -542,6 +541,10 @@
1.36 self.doc_frequency = doc_frequency
1.37 self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
1.38
1.39 + # Hold any record visited but not yet returned (initially none).
1.40 +
1.41 + self.found_docnum, self.found_positions = None, None
1.42 +
1.43 # Maintain state for the next index entry, if read.
1.44
1.45 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.46 @@ -567,6 +570,15 @@
1.47 positions file.
1.48 """
1.49
1.50 + # Return any visited but unrequested record.
1.51 +
1.52 + if self.found_docnum is not None:
1.53 + t = self.found_docnum, self.found_positions
1.54 + self.found_docnum, self.found_positions = None, None
1.55 + return t
1.56 +
1.57 + # Or search for the next record.
1.58 +
1.59 while 1:
1.60
1.61 # Either return the next record.
1.62 @@ -585,13 +597,23 @@
1.63 self._next_section()
1.64 self.iterator.replenish(self.section_count)
1.65
1.66 - def __getitem__(self, docnum):
1.67 + # Reset the state of the iterator to make sure that document
1.68 + # numbers are correct.
1.69 +
1.70 + self.iterator.reset()
1.71 +
1.72 + def from_document(self, docnum):
1.73
1.74 """
1.75 Attempt to navigate to a positions entry for the given 'docnum',
1.76 - returning the positions, if present, or None otherwise.
1.77 + returning the positions for 'docnum', or None otherwise.
1.78 """
1.79
1.80 + # Return any unrequested document positions.
1.81 +
1.82 + if docnum == self.found_docnum:
1.83 + return self.found_positions
1.84 +
1.85 # Read ahead in the index until the next entry refers to a document
1.86 # later than the desired document.
1.87
1.88 @@ -599,9 +621,15 @@
1.89 if self.next_docnum is None:
1.90 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.91
1.92 - while self.next_docnum < docnum:
1.93 + # Read until the next entry is after the desired document number,
1.94 + # or until the end of the results.
1.95 +
1.96 + while self.next_docnum <= docnum:
1.97 self._next_read_section()
1.98 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.99 + if self.docnum < docnum:
1.100 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.101 + else:
1.102 + break
1.103
1.104 except StopIteration:
1.105 pass
1.106 @@ -612,11 +640,17 @@
1.107
1.108 try:
1.109 while 1:
1.110 - found_docnum, positions = self.iterator.next()
1.111 + found_docnum, found_positions = self.iterator.next()
1.112 +
1.113 + # Return the positions for the desired document; on overshooting it,
1.114 + # remember the record just read (local found_docnum) for later use.
1.115 +
1.116 if docnum == found_docnum:
1.117 - return positions
1.118 - elif docnum < found_docnum:
1.119 + return found_positions
1.120 + elif docnum < found_docnum:
1.121 + self.found_docnum, self.found_positions = found_docnum, found_positions
1.122 return None
1.123 +
1.124 except StopIteration:
1.125 return None
1.126
1.127 @@ -630,7 +664,6 @@
1.128 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.129 else:
1.130 self._next_read_section()
1.131 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.132
1.133 def _next_read_section(self):
1.134
1.135 @@ -640,6 +673,7 @@
1.136 """
1.137
1.138 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
1.139 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.140
1.141 def _init_section(self):
1.142
1.143 @@ -829,10 +863,10 @@
1.144
1.145 "Reading term dictionaries."
1.146
1.147 - def __init__(self, info_reader, index_reader, position_reader):
1.148 + def __init__(self, info_reader, index_reader, position_dict_reader):
1.149 self.info_reader = info_reader
1.150 self.index_reader = index_reader
1.151 - self.position_reader = position_reader
1.152 + self.position_dict_reader = position_dict_reader
1.153
1.154 self.terms = []
1.155 try:
1.156 @@ -889,7 +923,7 @@
1.157 self.info_reader.rewind()
1.158
1.159 def _get_positions(self, offset, doc_frequency):
1.160 - return self.position_reader.read_term_positions(offset, doc_frequency)
1.161 + return self.position_dict_reader.read_term_positions(offset, doc_frequency)
1.162
1.163 def read_term(self):
1.164
1.165 @@ -938,7 +972,7 @@
1.166 def close(self):
1.167 self.info_reader.close()
1.168 self.index_reader.close()
1.169 - self.position_reader.close()
1.170 + self.position_dict_reader.close()
1.171
1.172 # Specific classes for storing document information.
1.173
1.174 @@ -1548,6 +1582,9 @@
1.175 def get_frequency(self, term):
1.176 return self.dict_reader.get_frequency(term)
1.177
1.178 + def get_document_frequency(self, term):
1.179 + return self.dict_reader.get_document_frequency(term)
1.180 +
1.181 def get_fields(self, docnum):
1.182 return self.field_dict_reader.get_fields(docnum)
1.183
2.1 --- a/test.py Thu Sep 03 01:09:06 2009 +0200
2.2 +++ b/test.py Thu Sep 03 22:10:43 2009 +0200
2.3 @@ -322,6 +322,16 @@
2.4 ("cat", [(123, [12, 145, 196]), (1200, [113])])
2.5 ]
2.6
2.7 +position_dict_tests = [
2.8 + ("badger", 19, [55, 1333]),
2.9 + ("badger", 20, None),
2.10 + ("bull", 6, [128]),
2.11 + ("bull", 26, [1, 3, 5, 7, 9]),
2.12 + ("cat", 111, None),
2.13 + ("cat", 123, [12, 145, 196]),
2.14 + ("cat", 1234, None)
2.15 + ]
2.16 +
2.17 f = open("test", "wb")
2.18 w = iixr.TermWriter(f)
2.19 f2 = open("testI", "wb")
2.20 @@ -355,6 +365,13 @@
2.21 dp = rd.find_positions(term)
2.22 print dp is None, dp
2.23
2.24 +# (Test iterators.)
2.25 +
2.26 +for term, docnum, positions in position_dict_tests:
2.27 + dp = rd.find_positions(term)
2.28 + pos = dp.from_document(docnum)
2.29 + print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
2.30 +
2.31 # (Test sequential access.)
2.32
2.33 rd.rewind()
2.34 @@ -386,7 +403,8 @@
2.35 position_tests = [
2.36 ("Every", 14, [0]),
2.37 ("sea", 36, [2, 6]),
2.38 - ("shells", 1, None)
2.39 + ("shells", 1, None),
2.40 + ("shells", 37, None)
2.41 ]
2.42
2.43 index = iixr.Index("test_index")
2.44 @@ -408,8 +426,8 @@
2.45 print (123, text) == df[0], (123, text), df[0]
2.46 for term, docnum, positions in position_tests:
2.47 dp = rd.find_positions(term)
2.48 - pos = dp[docnum]
2.49 - print positions is None and positions is pos or positions == list(pos), positions, pos
2.50 + pos = dp.from_document(docnum)
2.51 + print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
2.52 index.close()
2.53
2.54 # vim: tabstop=4 expandtab shiftwidth=4