1.1 --- a/iixr.py Sun Aug 30 21:29:10 2009 +0200
1.2 +++ b/iixr.py Sun Aug 30 23:10:32 2009 +0200
1.3 @@ -19,6 +19,7 @@
1.4 """
1.5
1.6 from os import listdir, mkdir # index and partition discovery
1.7 +from os import remove, rename # partition manipulation
1.8 from os.path import exists, join
1.9 from os.path import commonprefix # to find common string prefixes
1.10 from bisect import bisect_right # to find terms in the dictionary index
1.11 @@ -491,8 +492,7 @@
1.12
1.13 # Large numbers for ordering purposes.
1.14
1.15 - self.max_offset = self.terms[-1][1]
1.16 - self.max_info_offset = self.terms[-1][2]
1.17 + self.max_offset = self.terms[-1][1] + 1
1.18
1.19 def _find_term(self, term):
1.20
1.21 @@ -501,7 +501,7 @@
1.22 dictionary.
1.23 """
1.24
1.25 - i = bisect_right(self.terms, (term, self.max_offset, self.max_info_offset)) - 1
1.26 + i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
1.27
1.28 # Get the entry position providing the term or one preceding it.
1.29
1.30 @@ -848,7 +848,7 @@
1.31 # For such entries, merge the positions.
1.32
1.33 if other_term == term:
1.34 - self.merge_positions(doc_positions, other_doc_positions)
1.35 + doc_positions = self.merge_positions(doc_positions, other_doc_positions)
1.36 to_update.append(other_partition)
1.37 i += 1
1.38 else:
1.39 @@ -864,7 +864,7 @@
1.40
1.41 for partition in to_update:
1.42 try:
1.43 - term, frequency, positions = self_readers[partition].read_term()
1.44 + term, frequency, positions = self.readers[partition].read_term()
1.45 insort_right(entries, (term, positions, partition))
1.46 except EOFError:
1.47 pass
1.48 @@ -881,12 +881,10 @@
1.49 for docnum, positions in other_doc_positions:
1.50 if doc_position_dict.has_key(docnum):
1.51 doc_position_dict[docnum] += positions
1.52 - doc_position_dict[docnum].sort()
1.53 else:
1.54 doc_position_dict[docnum] = positions
1.55
1.56 - doc_positions = doc_position_dict.items()
1.57 - return doc_positions
1.58 + return doc_position_dict.items()
1.59
1.60 class FieldDictionaryMerger(Merger):
1.61
1.62 @@ -945,7 +943,7 @@
1.63
1.64 for partition in to_update:
1.65 try:
1.66 - docnum, fields = self_readers[partition].read_fields()
1.67 + docnum, fields = self.readers[partition].read_fields()
1.68 insort_right(entries, (docnum, fields, partition))
1.69 except EOFError:
1.70 pass
1.71 @@ -963,7 +961,7 @@
1.72 tdf = open(join(pathname, "terms-%s" % partition), "wb")
1.73 info_writer = TermWriter(tdf)
1.74
1.75 - tdif = open(join(pathname, "index-%s" % partition), "wb")
1.76 + tdif = open(join(pathname, "terms_index-%s" % partition), "wb")
1.77 index_writer = TermIndexWriter(tdif)
1.78
1.79 tpf = open(join(pathname, "positions-%s" % partition), "wb")
1.80 @@ -987,6 +985,59 @@
1.81
1.82 return FieldDictionaryWriter(field_writer, field_index_writer, interval)
1.83
1.84 +def get_term_reader(pathname, partition):
1.85 +
1.86 + """
1.87 + Return a term dictionary reader using files under the given 'pathname'
1.88 + labelled according to the given 'partition'.
1.89 + """
1.90 +
1.91 + tdf = open(join(pathname, "terms-%s" % partition), "rb")
1.92 + info_reader = TermReader(tdf)
1.93 +
1.94 + tdif = open(join(pathname, "terms_index-%s" % partition), "rb")
1.95 + index_reader = TermIndexReader(tdif)
1.96 +
1.97 + tpf = open(join(pathname, "positions-%s" % partition), "rb")
1.98 + positions_reader = PositionReader(tpf)
1.99 +
1.100 + return TermDictionaryReader(info_reader, index_reader, positions_reader)
1.101 +
1.102 +def get_field_reader(pathname, partition):
1.103 +
1.104 + """
1.105 + Return a field dictionary reader using files under the given 'pathname'
1.106 + labelled according to the given 'partition'.
1.107 + """
1.108 +
1.109 + ff = open(join(pathname, "fields-%s" % partition), "rb")
1.110 + field_reader = FieldReader(ff)
1.111 +
1.112 + fif = open(join(pathname, "fields_index-%s" % partition), "rb")
1.113 + field_index_reader = FieldIndexReader(fif)
1.114 +
1.115 + return FieldDictionaryReader(field_reader, field_index_reader)
1.116 +
1.117 +def rename_files(pathname, names, from_partition, to_partition):
1.118 + for name in names:
1.119 + rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))
1.120 +
1.121 +def rename_term_files(pathname, from_partition, to_partition):
1.122 + rename_files(pathname, ("terms", "terms_index", "positions"), from_partition, to_partition)
1.123 +
1.124 +def rename_field_files(pathname, from_partition, to_partition):
1.125 + rename_files(pathname, ("fields", "fields_index"), from_partition, to_partition)
1.126 +
1.127 +def remove_files(pathname, names, partition):
1.128 + for name in names:
1.129 + remove(join(pathname, "%s-%s" % (name, partition)))
1.130 +
1.131 +def remove_term_files(pathname, partition):
1.132 + remove_files(pathname, ("terms", "terms_index", "positions"), partition)
1.133 +
1.134 +def remove_field_files(pathname, partition):
1.135 + remove_files(pathname, ("fields", "fields_index"), partition)
1.136 +
1.137 # High-level classes.
1.138
1.139 class IndexWriter:
1.140 @@ -1112,31 +1163,9 @@
1.141
1.142 "Accessing the term and field dictionaries."
1.143
1.144 - def __init__(self, pathname, partition=0):
1.145 - self.pathname = pathname
1.146 - self.dict_reader = self.get_term_reader(partition)
1.147 - self.field_dict_reader = self.get_field_reader(partition)
1.148 -
1.149 - def get_term_reader(self, partition):
1.150 - tdf = open(join(self.pathname, "terms-%s" % partition), "rb")
1.151 - info_reader = TermReader(tdf)
1.152 -
1.153 - tdif = open(join(self.pathname, "index-%s" % partition), "rb")
1.154 - index_reader = TermIndexReader(tdif)
1.155 -
1.156 - tpf = open(join(self.pathname, "positions-%s" % partition), "rb")
1.157 - positions_reader = PositionReader(tpf)
1.158 -
1.159 - return TermDictionaryReader(info_reader, index_reader, positions_reader)
1.160 -
1.161 - def get_field_reader(self, partition):
1.162 - ff = open(join(self.pathname, "fields-%s" % partition), "rb")
1.163 - field_reader = FieldReader(ff)
1.164 -
1.165 - fif = open(join(self.pathname, "fields_index-%s" % partition), "rb")
1.166 - field_index_reader = FieldIndexReader(fif)
1.167 -
1.168 - return FieldDictionaryReader(field_reader, field_index_reader)
1.169 + def __init__(self, pathname):
1.170 + self.dict_reader = get_term_reader(pathname, "merged")
1.171 + self.field_dict_reader = get_field_reader(pathname, "merged")
1.172
1.173 def find_positions(self, term):
1.174 return self.dict_reader.find_positions(term)
1.175 @@ -1177,10 +1206,21 @@
1.176
1.177 "Return a reader for the index."
1.178
1.179 + # Ensure that only one partition exists.
1.180 +
1.181 + self.merge_terms()
1.182 + self.merge_fields()
1.183 +
1.184 + return self._get_reader(partition)
1.185 +
1.186 + def _get_reader(self, partition):
1.187 +
1.188 + "Return a reader for the index."
1.189 +
1.190 if not exists(self.pathname):
1.191 raise OSError, "Index path %r does not exist." % self.pathname
1.192
1.193 - self.reader = IndexReader(self.pathname, partition)
1.194 + self.reader = IndexReader(self.pathname)
1.195 return self.reader
1.196
1.197 def merge_terms(self, interval=INTERVAL):
1.198 @@ -1188,17 +1228,58 @@
1.199 "Merge term dictionaries using the given indexing 'interval'."
1.200
1.201 readers = []
1.202 + partitions = []
1.203
1.204 - for filename in os.listdir(self.pathname):
1.205 + for filename in listdir(self.pathname):
1.206 if filename.startswith("terms-"): # 6 character prefix
1.207 - partition = int(filename[6:])
1.208 - readers.append(self.get_reader(partition))
1.209 + partition = filename[6:]
1.210 + readers.append(get_term_reader(self.pathname, partition))
1.211 + partitions.append(partition)
1.212 +
1.213 + # Write directly to a dictionary.
1.214 +
1.215 + if len(readers) > 1:
1.216 + writer = get_term_writer(self.pathname, "merged", interval)
1.217 + merger = TermDictionaryMerger(writer, readers)
1.218 + merger.merge()
1.219 + merger.close()
1.220 +
1.221 + # Remove old files.
1.222 +
1.223 + for partition in partitions:
1.224 + remove_term_files(self.pathname, partition)
1.225 +
1.226 + elif len(readers) == 1 and partitions[0] != "merged":
1.227 + rename_term_files(self.pathname, partitions[0], "merged")
1.228 +
1.229 + def merge_fields(self, interval=INTERVAL):
1.230
1.231 - writer = get_writer(self.pathname, "new", interval)
1.232 + "Merge field dictionaries using the given indexing 'interval'."
1.233 +
1.234 + readers = []
1.235 + partitions = []
1.236 +
1.237 + for filename in listdir(self.pathname):
1.238 + if filename.startswith("fields-"): # 7 character prefix
1.239 + partition = filename[7:]
1.240 + readers.append(get_field_reader(self.pathname, partition))
1.241 + partitions.append(partition)
1.242 +
1.243 + # Write directly to a dictionary.
1.244
1.245 - merger = TermDictionaryMerger(writer, readers)
1.246 - merger.merge()
1.247 - merger.close()
1.248 + if len(readers) > 1:
1.249 + writer = get_field_writer(self.pathname, "merged", interval)
1.250 + merger = FieldDictionaryMerger(writer, readers)
1.251 + merger.merge()
1.252 + merger.close()
1.253 +
1.254 + # Remove old files.
1.255 +
1.256 + for partition in partitions:
1.257 + remove_field_files(self.pathname, partition)
1.258 +
1.259 + elif len(readers) == 1 and partitions[0] != "merged":
1.260 + rename_field_files(self.pathname, partitions[0], "merged")
1.261
1.262 def close(self):
1.263 if self.reader is not None: