1.1 --- a/iixr.py Wed Sep 02 22:25:29 2009 +0200
1.2 +++ b/iixr.py Thu Sep 03 01:09:06 2009 +0200
1.3 @@ -27,6 +27,11 @@
1.4 from bisect import insort_right # to maintain a sorted list of data for merging
1.5 import bz2, zlib # for field compression
1.6
1.7 +try:
1.8 + set
1.9 +except NameError:
1.10 + from sets import Set as set
1.11 +
1.12 # Constants.
1.13
1.14 TERM_INTERVAL = 100
1.15 @@ -285,42 +290,6 @@
1.16 f.seek(offset)
1.17 return PositionIterator(f, count)
1.18
1.19 -class IteratorBase:
1.20 -
1.21 - def __init__(self, count):
1.22 - self.replenish(count)
1.23 -
1.24 - def replenish(self, count):
1.25 - self.count = count
1.26 - self.read_documents = 0
1.27 -
1.28 - def __len__(self):
1.29 - return self.count
1.30 -
1.31 - def sort(self):
1.32 - pass # Stored document positions are already sorted.
1.33 -
1.34 - def __iter__(self):
1.35 - return self
1.36 -
1.37 -class PositionIterator(PositionReader, IteratorBase):
1.38 -
1.39 - "Iterating over document positions."
1.40 -
1.41 - def __init__(self, f, count):
1.42 - PositionReader.__init__(self, f)
1.43 - IteratorBase.__init__(self, count)
1.44 -
1.45 - def next(self):
1.46 -
1.47 - "Read positions for a single document."
1.48 -
1.49 - if self.read_documents < self.count:
1.50 - self.read_documents += 1
1.51 - return self.read_positions()
1.52 - else:
1.53 - raise StopIteration
1.54 -
1.55 class PositionIndexWriter(FileWriter):
1.56
1.57 "Writing position index information to files."
1.58 @@ -403,6 +372,44 @@
1.59 f.seek(offset)
1.60 return PositionIndexIterator(f, doc_frequency)
1.61
1.62 +# Iterators for position-related files.
1.63 +
1.64 +class IteratorBase:
1.65 +
1.66 + def __init__(self, count):
1.67 + self.replenish(count)
1.68 +
1.69 + def replenish(self, count):
1.70 + self.count = count
1.71 + self.read_documents = 0
1.72 +
1.73 + def __len__(self):
1.74 + return self.count
1.75 +
1.76 + def sort(self):
1.77 + pass # Stored document positions are already sorted.
1.78 +
1.79 + def __iter__(self):
1.80 + return self
1.81 +
1.82 +class PositionIterator(PositionReader, IteratorBase):
1.83 +
1.84 + "Iterating over document positions."
1.85 +
1.86 + def __init__(self, f, count):
1.87 + PositionReader.__init__(self, f)
1.88 + IteratorBase.__init__(self, count)
1.89 +
1.90 + def next(self):
1.91 +
1.92 + "Read positions for a single document."
1.93 +
1.94 + if self.read_documents < self.count:
1.95 + self.read_documents += 1
1.96 + return self.read_positions()
1.97 + else:
1.98 + raise StopIteration
1.99 +
1.100 class PositionIndexIterator(PositionIndexReader, IteratorBase):
1.101
1.102 "Iterating over document positions."
1.103 @@ -533,10 +540,16 @@
1.104 def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
1.105 self.position_reader = position_reader
1.106 self.doc_frequency = doc_frequency
1.107 + self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
1.108
1.109 - self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
1.110 - self.next_section()
1.111 - self.init_section()
1.112 + # Maintain state for the next index entry, if read.
1.113 +
1.114 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.115 +
1.116 + # Initialise the current index entry and current position file iterator.
1.117 +
1.118 + self._next_section()
1.119 + self._init_section()
1.120
1.121 def __len__(self):
1.122 return self.doc_frequency
1.123 @@ -549,7 +562,10 @@
1.124
1.125 def next(self):
1.126
1.127 - # Attempt to get the next document record from the section in the positions file.
1.128 + """
1.129 + Attempt to get the next document record from the section in the
1.130 + positions file.
1.131 + """
1.132
1.133 while 1:
1.134
1.135 @@ -566,13 +582,69 @@
1.136 # reading using the same file iterator (since the data should
1.137 # just follow on from the last section).
1.138
1.139 - self.next_section()
1.140 + self._next_section()
1.141 self.iterator.replenish(self.section_count)
1.142
1.143 - def next_section(self):
1.144 - self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.145 + def __getitem__(self, docnum):
1.146 +
1.147 + """
1.148 + Attempt to navigate to a positions entry for the given 'docnum',
1.149 + returning the positions, if present, or None otherwise.
1.150 + """
1.151 +
1.152 + # Read ahead in the index until the next entry refers to a document
1.153 + # later than the desired document.
1.154 +
1.155 + try:
1.156 + if self.next_docnum is None:
1.157 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.158 +
1.159 + while self.next_docnum <= docnum:
1.160 + self._next_read_section()
1.161 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.162 +
1.163 + except StopIteration:
1.164 + pass
1.165 +
1.166 + # Navigate in the position file to the document.
1.167 +
1.168 + self._init_section()
1.169
1.170 - def init_section(self):
1.171 + try:
1.172 + while 1:
1.173 + found_docnum, positions = self.iterator.next()
1.174 + if docnum == found_docnum:
1.175 + return positions
1.176 + elif docnum < found_docnum:
1.177 + return None
1.178 + except StopIteration:
1.179 + return None
1.180 +
1.181 + # Internal methods.
1.182 +
1.183 + def _next_section(self):
1.184 +
1.185 + "Attempt to get the next section in the index."
1.186 +
1.187 + if self.next_docnum is None:
1.188 + self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.189 + else:
1.190 + self._next_read_section()
1.191 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.192 +
1.193 + def _next_read_section(self):
1.194 +
1.195 + """
1.196 + Make the next index entry the current one without reading from the
1.197 + index.
1.198 + """
1.199 +
1.200 + self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
1.201 +
1.202 + def _init_section(self):
1.203 +
1.204 + "Initialise the iterator for the section in the position file."
1.205 +
1.206 self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
1.207
1.208 class TermWriter(FileWriter):
1.209 @@ -1534,17 +1606,22 @@
1.210 """
1.211
1.212 readers = []
1.213 - partitions = []
1.214 + partitions = set()
1.215
1.216 for filename in listdir(self.pathname):
1.217 if filename.startswith("terms-"): # 6 character prefix
1.218 partition = filename[6:]
1.219 readers.append(get_term_reader(self.pathname, partition))
1.220 - partitions.append(partition)
1.221 + partitions.add(partition)
1.222
1.223 # Write directly to a dictionary.
1.224
1.225 if len(readers) > 1:
1.226 + if "merged" in partitions:
1.227 + rename_term_files(self.pathname, "merged", "old-merged")
1.228 + partitions.remove("merged")
1.229 + partitions.add("old-merged")
1.230 +
1.231 writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
1.232 merger = TermDictionaryMerger(writer, readers)
1.233 merger.merge()
1.234 @@ -1555,25 +1632,32 @@
1.235 for partition in partitions:
1.236 remove_term_files(self.pathname, partition)
1.237
1.238 - elif len(readers) == 1 and partitions[0] != "merged":
1.239 - rename_term_files(self.pathname, partitions[0], "merged")
1.240 + elif len(readers) == 1:
1.241 + partition = list(partitions)[0]
1.242 + if partition != "merged":
1.243 + rename_term_files(self.pathname, partition, "merged")
1.244
1.245 def merge_fields(self, interval=FIELD_INTERVAL):
1.246
1.247 "Merge field dictionaries using the given indexing 'interval'."
1.248
1.249 readers = []
1.250 - partitions = []
1.251 + partitions = set()
1.252
1.253 for filename in listdir(self.pathname):
1.254 if filename.startswith("fields-"): # 7 character prefix
1.255 partition = filename[7:]
1.256 readers.append(get_field_reader(self.pathname, partition))
1.257 - partitions.append(partition)
1.258 + partitions.add(partition)
1.259
1.260 # Write directly to a dictionary.
1.261
1.262 if len(readers) > 1:
1.263 + if "merged" in partitions:
1.264 + rename_field_files(self.pathname, "merged", "old-merged")
1.265 + partitions.remove("merged")
1.266 + partitions.add("old-merged")
1.267 +
1.268 writer = get_field_writer(self.pathname, "merged", interval)
1.269 merger = FieldDictionaryMerger(writer, readers)
1.270 merger.merge()
1.271 @@ -1584,8 +1668,10 @@
1.272 for partition in partitions:
1.273 remove_field_files(self.pathname, partition)
1.274
1.275 - elif len(readers) == 1 and partitions[0] != "merged":
1.276 - rename_field_files(self.pathname, partitions[0], "merged")
1.277 + elif len(readers) == 1:
1.278 + partition = list(partitions)[0]
1.279 + if partition != "merged":
1.280 + rename_field_files(self.pathname, partition, "merged")
1.281
1.282 def close(self):
1.283 if self.reader is not None:
2.1 --- a/test.py Wed Sep 02 22:25:29 2009 +0200
2.2 +++ b/test.py Thu Sep 03 01:09:06 2009 +0200
2.3 @@ -383,6 +383,12 @@
2.4 ("sea", 2, [(36, [2, 6])])
2.5 ]
2.6
2.7 +position_tests = [
2.8 + ("Every", 14, [0]),
2.9 + ("sea", 36, [2, 6]),
2.10 + ("shells", 1, None)
2.11 + ]
2.12 +
2.13 index = iixr.Index("test_index")
2.14 wi = index.get_writer(3, 2, 6)
2.15 for docnum, text in docs:
2.16 @@ -400,6 +406,10 @@
2.17 for docnum, text in docs:
2.18 df = rd.get_fields(docnum)
2.19 print (123, text) == df[0], (123, text), df[0]
2.20 +for term, docnum, positions in position_tests:
2.21 + dp = rd.find_positions(term)
2.22 + pos = dp[docnum]
2.23 +print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
2.24 index.close()
2.25
2.26 # vim: tabstop=4 expandtab shiftwidth=4