1.1 --- a/iixr/files.py Sat Oct 03 03:03:32 2009 +0200
1.2 +++ b/iixr/files.py Fri Jan 08 00:44:59 2010 +0100
1.3 @@ -148,17 +148,4 @@
1.4
1.5 return unicode(s, "utf-8")
1.6
1.7 -class FileOpener:
1.8 -
1.9 - "Opening files using their filenames."
1.10 -
1.11 - def __init__(self, filename):
1.12 - self.filename = filename
1.13 -
1.14 - def open(self, mode):
1.15 - return open(self.filename, mode)
1.16 -
1.17 - def close(self):
1.18 - pass
1.19 -
1.20 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/iixr/filesystem.py Sat Oct 03 03:03:32 2009 +0200
2.2 +++ b/iixr/filesystem.py Fri Jan 08 00:44:59 2010 +0100
2.3 @@ -85,12 +85,15 @@
2.4 tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
2.5 index_reader = TermIndexReader(tdif)
2.6
2.7 - positions_opener = PositionOpener(join(pathname, "positions-%s" % partition))
2.8 - positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition))
2.9 + pf = open(join(pathname, "positions-%s" % partition), "rb")
2.10 + position_reader = PositionReader(pf)
2.11
2.12 - positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)
2.13 + pif = open(join(pathname, "positions_index-%s" % partition), "rb")
2.14 + position_index_reader = PositionIndexReader(pif)
2.15
2.16 - return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
2.17 + position_dict_reader = PositionDictionaryReader(position_reader, position_index_reader)
2.18 +
2.19 + return TermDictionaryReader(info_reader, index_reader, position_dict_reader)
2.20
2.21 def get_field_reader(pathname, partition):
2.22
3.1 --- a/iixr/positions.py Sat Oct 03 03:03:32 2009 +0200
3.2 +++ b/iixr/positions.py Fri Jan 08 00:44:59 2010 +0100
3.3 @@ -61,21 +61,6 @@
3.4
3.5 self.last_docnum = docnum
3.6
3.7 -class PositionOpener(FileOpener):
3.8 -
3.9 - "Reading position information from files."
3.10 -
3.11 - def read_term_positions(self, offset, count):
3.12 -
3.13 - """
3.14 - Read all positions from 'offset', seeking to that position in the file
3.15 - before reading. The number of documents available for reading is limited
3.16 - to 'count'.
3.17 - """
3.18 -
3.19 - f = self.open("rb")
3.20 - return PositionIterator(f, offset, count)
3.21 -
3.22 class PositionIndexWriter(FileWriter):
3.23
3.24 "Writing position index information to files."
3.25 @@ -107,21 +92,6 @@
3.26 self.last_pos_offset = pos_offset
3.27 self.last_docnum = docnum
3.28
3.29 -class PositionIndexOpener(FileOpener):
3.30 -
3.31 - "Reading position index information from files."
3.32 -
3.33 - def read_term_positions(self, offset, doc_frequency):
3.34 -
3.35 - """
3.36 - Read all positions from 'offset', seeking to that position in the file
3.37 - before reading. The number of documents available for reading is limited
3.38 - to 'doc_frequency'.
3.39 - """
3.40 -
3.41 - f = self.open("rb")
3.42 - return PositionIndexIterator(f, offset, doc_frequency)
3.43 -
3.44 # Iterators for position-related files.
3.45
3.46 class IteratorBase:
3.47 @@ -142,18 +112,29 @@
3.48 def __iter__(self):
3.49 return self
3.50
3.51 -class PositionIterator(FileReader, IteratorBase):
3.52 +class PositionReader(FileReader, IteratorBase):
3.53
3.54 "Iterating over document positions."
3.55
3.56 - def __init__(self, f, offset, count):
3.57 + def __init__(self, f):
3.58 FileReader.__init__(self, f)
3.59 - IteratorBase.__init__(self, count)
3.60 - self.f.seek(offset)
3.61 + IteratorBase.__init__(self, 0) # no iteration initially permitted
3.62 + self.reset()
3.63
3.64 def reset(self):
3.65 self.last_docnum = 0
3.66
3.67 + def seek(self, offset, count):
3.68 +
3.69 + """
3.70 + Seek to 'offset' in the file, limiting the number of documents available
3.71 + for reading to 'count'.
3.72 + """
3.73 +
3.74 + self.f.seek(offset)
3.75 + self.replenish(count)
3.76 + self.reset()
3.77 +
3.78 def read_positions(self):
3.79
3.80 "Read positions, returning a document number and a list of positions."
3.81 @@ -190,20 +171,31 @@
3.82 else:
3.83 raise StopIteration
3.84
3.85 -class PositionIndexIterator(FileReader, IteratorBase):
3.86 +class PositionIndexReader(FileReader, IteratorBase):
3.87
3.88 "Iterating over document positions."
3.89
3.90 - def __init__(self, f, offset, count):
3.91 + def __init__(self, f):
3.92 FileReader.__init__(self, f)
3.93 - IteratorBase.__init__(self, count)
3.94 - self.f.seek(offset)
3.95 + IteratorBase.__init__(self, 0) # no iteration initially permitted
3.96 + self.reset()
3.97
3.98 def reset(self):
3.99 self.last_docnum = 0
3.100 self.last_pos_offset = 0
3.101 self.section_count = 0
3.102
3.103 + def seek(self, offset, doc_frequency):
3.104 +
3.105 + """
3.106 + Seek to 'offset' in the file, limiting the number of documents available
3.107 + for reading to 'doc_frequency'.
3.108 + """
3.109 +
3.110 + self.f.seek(offset)
3.111 + self.replenish(doc_frequency)
3.112 + self.reset()
3.113 +
3.114 def read_positions(self):
3.115
3.116 """
3.117 @@ -319,65 +311,37 @@
3.118
3.119 class PositionDictionaryReader:
3.120
3.121 - "Reading position dictionaries."
3.122 -
3.123 - def __init__(self, position_opener, position_index_opener):
3.124 - self.position_opener = position_opener
3.125 - self.position_index_opener = position_index_opener
3.126 -
3.127 - def read_term_positions(self, offset, doc_frequency):
3.128 -
3.129 - """
3.130 - Return an iterator for dictionary entries starting at 'offset' with the
3.131 - given 'doc_frequency'.
3.132 - """
3.133 -
3.134 - return PositionDictionaryIterator(self.position_opener,
3.135 - self.position_index_opener, offset, doc_frequency)
3.136 -
3.137 - def close(self):
3.138 - pass
3.139 -
3.140 -class PositionDictionaryIterator:
3.141 -
3.142 "Iteration over position dictionary entries."
3.143
3.144 - def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
3.145 - self.position_opener = position_opener
3.146 - self.position_index_opener = position_index_opener
3.147 - self.doc_frequency = doc_frequency
3.148 + def __init__(self, position_reader, position_index_reader):
3.149 + self.position_reader = position_reader
3.150 + self.position_index_reader = position_index_reader
3.151 + self.reset()
3.152
3.153 - self.index_iterator = None
3.154 - self.iterator = None
3.155 -
3.156 - # Initialise the iterators.
3.157 -
3.158 - self.reset(offset, doc_frequency)
3.159 -
3.160 - def reset(self, offset, doc_frequency):
3.161 + def reset(self):
3.162
3.163 # Remember the last values.
3.164
3.165 self.found_docnum, self.found_positions = None, None
3.166
3.167 - # Attempt to reuse the index iterator.
3.168 -
3.169 - if self.index_iterator is not None:
3.170 - ii = self.index_iterator
3.171 - ii.replenish(doc_frequency)
3.172 - ii.f.seek(offset)
3.173 - ii.reset()
3.174 -
3.175 - # Or make a new index iterator.
3.176 -
3.177 - else:
3.178 - self.index_iterator = self.position_index_opener.read_term_positions(offset, doc_frequency)
3.179 -
3.180 # Maintain state for the next index entry, if read.
3.181
3.182 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
3.183
3.184 - # Initialise the current index entry and current position file iterator.
3.185 + def seek(self, offset, doc_frequency):
3.186 +
3.187 + """
3.188 + Seek to 'offset' in the index file, limiting the number of documents
3.189 + available for reading to 'doc_frequency'.
3.190 + """
3.191 +
3.192 + self.reset()
3.193 +
3.194 + # Seek to the appropriate index entry.
3.195 +
3.196 + self.position_index_reader.seek(offset, doc_frequency)
3.197 +
3.198 + # Initialise the current index entry and current position file reader.
3.199
3.200 self._next_section()
3.201 self._init_section()
3.202 @@ -385,7 +349,7 @@
3.203 # Sequence methods.
3.204
3.205 def __len__(self):
3.206 - return self.doc_frequency
3.207 + return len(self.position_index_reader)
3.208
3.209 def sort(self):
3.210 pass
3.211 @@ -416,23 +380,23 @@
3.212 # Either return the next record.
3.213
3.214 try:
3.215 - return self.iterator.next()
3.216 + return self.position_reader.next()
3.217
3.218 # Or, where a section is finished, get the next section and try again.
3.219
3.220 except StopIteration:
3.221
3.222 - # Where a section follows, update the index iterator, but keep
3.223 - # reading using the same file iterator (since the data should
3.224 - # just follow on from the last section).
3.225 + # Where a section follows, update the index reader, but keep
3.226 + # reading using the same file reader (since the data should just
3.227 + # follow on from the last section).
3.228
3.229 self._next_section()
3.230 - self.iterator.replenish(self.section_count)
3.231 + self.position_reader.replenish(self.section_count)
3.232
3.233 - # Reset the state of the iterator to make sure that document
3.234 + # Reset the state of the reader to make sure that document
3.235 # numbers are correct.
3.236
3.237 - self.iterator.reset()
3.238 + self.position_reader.reset()
3.239
3.240 def from_document(self, docnum):
3.241
3.242 @@ -451,7 +415,7 @@
3.243
3.244 try:
3.245 if self.next_docnum is None:
3.246 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
3.247 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_reader.next()
3.248
3.249 # Read until the next entry is after the desired document number,
3.250 # or until the end of the results.
3.251 @@ -459,7 +423,7 @@
3.252 while self.next_docnum <= docnum:
3.253 self._next_read_section()
3.254 if self.docnum < docnum:
3.255 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
3.256 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_reader.next()
3.257 else:
3.258 break
3.259
3.260 @@ -472,7 +436,7 @@
3.261
3.262 try:
3.263 while 1:
3.264 - found_docnum, found_positions = self.iterator.next()
3.265 + found_docnum, found_positions = self.position_reader.next()
3.266
3.267 # Return the desired document positions or None (retaining the
3.268 # positions for the document immediately after).
3.269 @@ -493,7 +457,7 @@
3.270 "Attempt to get the next section in the index."
3.271
3.272 if self.next_docnum is None:
3.273 - self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
3.274 + self.docnum, self.pos_offset, self.section_count = self.position_index_reader.next()
3.275 else:
3.276 self._next_read_section()
3.277
3.278 @@ -509,43 +473,14 @@
3.279
3.280 def _init_section(self):
3.281
3.282 - "Initialise the iterator for the section in the position file."
3.283 -
3.284 - # Attempt to reuse any correctly positioned iterator.
3.285 + "Initialise the reader for the section in the position file."
3.286
3.287 - if self.iterator is not None:
3.288 - i = self.iterator
3.289 - i.replenish(self.section_count)
3.290 - i.f.seek(self.pos_offset)
3.291 - i.reset()
3.292 + # Seek to the position entry.
3.293
3.294 - # Otherwise, obtain a new iterator.
3.295 -
3.296 - else:
3.297 - self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
3.298 + self.position_reader.seek(self.pos_offset, self.section_count)
3.299
3.300 def close(self):
3.301 - if self.iterator is not None:
3.302 - self.iterator.close()
3.303 - self.iterator = None
3.304 - if self.index_iterator is not None:
3.305 - self.index_iterator.close()
3.306 - self.index_iterator = None
3.307 -
3.308 -class ResetPositionDictionaryIterator:
3.309 -
3.310 - """
3.311 - A helper class which permits the reuse of iterators without modifying their
3.312 - state.
3.313 - """
3.314 -
3.315 - def __init__(self, iterator, offset, doc_frequency):
3.316 - self.iterator = iterator
3.317 - self.offset = offset
3.318 - self.doc_frequency = doc_frequency
3.319 -
3.320 - def __iter__(self):
3.321 - self.iterator.reset(self.offset, self.doc_frequency)
3.322 - return iter(self.iterator)
3.323 + self.position_reader.close()
3.324 + self.position_index_reader.close()
3.325
3.326 # vim: tabstop=4 expandtab shiftwidth=4
4.1 --- a/iixr/terms.py Sat Oct 03 03:03:32 2009 +0200
4.2 +++ b/iixr/terms.py Fri Jan 08 00:44:59 2010 +0100
4.3 @@ -208,7 +208,6 @@
4.4 self.info_reader = info_reader
4.5 self.index_reader = index_reader
4.6 self.position_dict_reader = position_dict_reader
4.7 - self.position_dict_iterator = None # for sequential/iterator access
4.8
4.9 self.terms = []
4.10 try:
4.11 @@ -302,7 +301,8 @@
4.12 documents equal to the given 'doc_frequency'.
4.13 """
4.14
4.15 - return self.position_dict_reader.read_term_positions(offset, doc_frequency)
4.16 + self.position_dict_reader.seek(offset, doc_frequency)
4.17 + return self.position_dict_reader
4.18
4.19 # Iterator convenience methods.
4.20
4.21 @@ -330,12 +330,8 @@
4.22
4.23 term, offset, frequency, doc_frequency = self.info_reader.read_term()
4.24
4.25 - # For sequential access, attempt to reuse any iterator.
4.26 -
4.27 - if self.position_dict_iterator is None:
4.28 - self.position_dict_iterator = self._get_positions(offset, doc_frequency)
4.29 -
4.30 - return term, frequency, doc_frequency, ResetPositionDictionaryIterator(self.position_dict_iterator, offset, doc_frequency)
4.31 + self.position_dict_reader.seek(offset, doc_frequency)
4.32 + return term, frequency, doc_frequency, self.position_dict_reader
4.33
4.34 # Query methods.
4.35
4.36 @@ -412,8 +408,5 @@
4.37 self.info_reader.close()
4.38 self.index_reader.close()
4.39 self.position_dict_reader.close()
4.40 - if self.position_dict_iterator is not None:
4.41 - self.position_dict_iterator.close()
4.42 - self.position_dict_iterator = None
4.43
4.44 # vim: tabstop=4 expandtab shiftwidth=4
5.1 --- a/test.py Sat Oct 03 03:03:32 2009 +0200
5.2 +++ b/test.py Fri Jan 08 00:44:59 2010 +0100
5.3 @@ -68,7 +68,7 @@
5.4 w.close()
5.5
5.6 f = open("testP", "rb")
5.7 -r = PositionIterator(f, 0, None)
5.8 +r = PositionReader(f)
5.9 for doc_positions in all_doc_positions:
5.10 for docnum, positions in doc_positions:
5.11 d, p = r.read_positions()
5.12 @@ -105,12 +105,12 @@
5.13 offsets.append((offset, doc_frequency))
5.14 w.close()
5.15
5.16 -r = PositionIndexOpener("testPI")
5.17 +r = PositionIndexReader(open("testPI", "rb"))
5.18 offsets.reverse()
5.19 indexed_positions.reverse()
5.20 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
5.21 - found_positions = r.read_term_positions(offset, doc_frequency)
5.22 - for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
5.23 + r.seek(offset, doc_frequency)
5.24 + for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
5.25 print docnum == dn, docnum, dn
5.26 print pos_offset == po, pos_offset, po
5.27 print count == c, count, c
5.28 @@ -129,13 +129,14 @@
5.29 offsets.append((offset, doc_frequency))
5.30 wd.close()
5.31
5.32 -r = PositionOpener("testP")
5.33 -r2 = PositionIndexOpener("testPI")
5.34 +r = PositionReader(open("testP", "rb"))
5.35 +r2 = PositionIndexReader(open("testPI", "rb"))
5.36 rd = PositionDictionaryReader(r, r2)
5.37 offsets.reverse()
5.38 all_doc_positions.reverse()
5.39 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
5.40 - dp = list(rd.read_term_positions(offset, doc_frequency))
5.41 + rd.seek(offset, doc_frequency)
5.42 + dp = list(rd)
5.43 print doc_positions == dp, doc_positions, dp
5.44 rd.close()
5.45
5.46 @@ -298,8 +299,8 @@
5.47 r = TermReader(f)
5.48 f2 = open("testI", "rb")
5.49 r2 = TermIndexReader(f2)
5.50 -r3 = PositionOpener("testP")
5.51 -r4 = PositionIndexOpener("testPI")
5.52 +r3 = PositionReader(open("testP", "rb"))
5.53 +r4 = PositionIndexReader(open("testPI", "rb"))
5.54 rp = PositionDictionaryReader(r3, r4)
5.55 rd = TermDictionaryReader(r, r2, rp)
5.56 terms_reversed = terms[:]
5.57 @@ -360,8 +361,8 @@
5.58 r = TermReader(f)
5.59 f2 = open("testI", "rb")
5.60 r2 = TermIndexReader(f2)
5.61 -r3 = PositionOpener("testP")
5.62 -r4 = PositionIndexOpener("testPI")
5.63 +r3 = PositionReader(open("testP", "rb"))
5.64 +r4 = PositionIndexReader(open("testPI", "rb"))
5.65 rp = PositionDictionaryReader(r3, r4)
5.66 rd = TermDictionaryReader(r, r2, rp)
5.67 terms_reversed = terms_with_positions[:]