1.1 --- a/iixr.py Wed Sep 02 01:30:42 2009 +0200
1.2 +++ b/iixr.py Wed Sep 02 22:25:29 2009 +0200
1.3 @@ -29,8 +29,13 @@
1.4
1.5 # Constants.
1.6
1.7 -INTERVAL = 100
1.8 -FLUSH_INTERVAL = 1000000
1.9 +TERM_INTERVAL = 100
1.10 +DOCUMENT_INTERVAL = 100
1.11 +FIELD_INTERVAL = 100
1.12 +FLUSH_INTERVAL = 1000000
1.13 +
1.14 +TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
1.15 +FIELD_FILENAMES = "fields", "fields_index"
1.16
1.17 compressors = [("b", bz2.compress), ("z", zlib.compress)]
1.18 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
1.19 @@ -440,14 +445,17 @@
1.20 the term involved.
1.21 """
1.22
1.23 - # Reset the writer.
1.24 + # Reset the writers.
1.25
1.26 self.position_writer.reset()
1.27 + self.position_index_writer.reset()
1.28 +
1.29 index_offset = None
1.30
1.31 # Write the positions.
1.32
1.33 frequency = 0
1.34 + first_docnum = None
1.35 first_offset = None
1.36 count = 0
1.37
1.38 @@ -460,13 +468,15 @@
1.39
1.40 if first_offset is None:
1.41 first_offset = pos_offset
1.42 + first_docnum = docnum
1.43
1.44 frequency += len(positions)
1.45 + count += 1
1.46
1.47 # Every {interval} entries, write an index entry.
1.48
1.49 if count == self.interval:
1.50 - io = self.position_index_writer.write_positions(docnum, first_offset, self.interval)
1.51 + io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
1.52
1.53 # Remember the first index entry offset.
1.54
1.55 @@ -474,15 +484,14 @@
1.56 index_offset = io
1.57
1.58 first_offset = None
1.59 + first_docnum = None
1.60 count = 0
1.61
1.62 - count += 1
1.63 -
1.64 # Finish writing an index entry for the remaining documents.
1.65
1.66 else:
1.67 if first_offset is not None:
1.68 - io = self.position_index_writer.write_positions(docnum, first_offset, count)
1.69 + io = self.position_index_writer.write_positions(first_docnum, first_offset, count)
1.70
1.71 # Remember the first index entry offset.
1.72
1.73 @@ -523,11 +532,18 @@
1.74
1.75 def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
1.76 self.position_reader = position_reader
1.77 + self.doc_frequency = doc_frequency
1.78
1.79 self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
1.80 self.next_section()
1.81 self.init_section()
1.82
1.83 + def __len__(self):
1.84 + return self.doc_frequency
1.85 +
1.86 + def sort(self):
1.87 + pass
1.88 +
1.89 def __iter__(self):
1.90 return self
1.91
1.92 @@ -545,11 +561,16 @@
1.93 # Or, where a section is finished, get the next section and try again.
1.94
1.95 except StopIteration:
1.96 +
1.97 + # Where a section follows, update the index iterator, but keep
1.98 + # reading using the same file iterator (since the data should
1.99 + # just follow on from the last section).
1.100 +
1.101 self.next_section()
1.102 self.iterator.replenish(self.section_count)
1.103
1.104 def next_section(self):
1.105 - self.docnum, self.pos_offset, self.section_count = self.index_iterator.read_positions()
1.106 + self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.107
1.108 def init_section(self):
1.109 self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
1.110 @@ -1303,20 +1324,20 @@
1.111 rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))
1.112
1.113 def rename_term_files(pathname, from_partition, to_partition):
1.114 - rename_files(pathname, ("terms", "terms_index", "positions"), from_partition, to_partition)
1.115 + rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
1.116
1.117 def rename_field_files(pathname, from_partition, to_partition):
1.118 - rename_files(pathname, ("fields", "fields_index"), from_partition, to_partition)
1.119 + rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
1.120
1.121 def remove_files(pathname, names, partition):
1.122 for name in names:
1.123 remove(join(pathname, "%s-%s" % (name, partition)))
1.124
1.125 def remove_term_files(pathname, partition):
1.126 - remove_files(pathname, ("terms", "terms_index", "positions"), partition)
1.127 + remove_files(pathname, TERM_FILENAMES, partition)
1.128
1.129 def remove_field_files(pathname, partition):
1.130 - remove_files(pathname, ("fields", "fields_index"), partition)
1.131 + remove_files(pathname, FIELD_FILENAMES, partition)
1.132
1.133 # High-level classes.
1.134
1.135 @@ -1326,9 +1347,10 @@
1.136 Building term information and writing it to the term and field dictionaries.
1.137 """
1.138
1.139 - def __init__(self, pathname, interval, flush_interval):
1.140 + def __init__(self, pathname, interval, doc_interval, flush_interval):
1.141 self.pathname = pathname
1.142 self.interval = interval
1.143 + self.doc_interval = doc_interval
1.144 self.flush_interval = flush_interval
1.145
1.146 self.dict_partition = 0
1.147 @@ -1387,7 +1409,7 @@
1.148
1.149 "Return a term dictionary writer for the current partition."
1.150
1.151 - return get_term_writer(self.pathname, self.dict_partition, self.interval)
1.152 + return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)
1.153
1.154 def get_field_writer(self):
1.155
1.156 @@ -1470,17 +1492,17 @@
1.157 self.reader = None
1.158 self.writer = None
1.159
1.160 - def get_writer(self, interval=INTERVAL, flush_interval=FLUSH_INTERVAL):
1.161 + def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):
1.162
1.163 """
1.164 - Return a writer, optionally using the given indexing 'interval' and
1.165 - 'flush_interval'.
1.166 + Return a writer, optionally using the given indexing 'interval',
1.167 + 'doc_interval' and 'flush_interval'.
1.168 """
1.169
1.170 if not exists(self.pathname):
1.171 mkdir(self.pathname)
1.172
1.173 - self.writer = IndexWriter(self.pathname, interval, flush_interval)
1.174 + self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
1.175 return self.writer
1.176
1.177 def get_reader(self, partition=0):
1.178 @@ -1504,9 +1526,12 @@
1.179 self.reader = IndexReader(self.pathname)
1.180 return self.reader
1.181
1.182 - def merge_terms(self, interval=INTERVAL):
1.183 + def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
1.184
1.185 - "Merge term dictionaries using the given indexing 'interval'."
1.186 + """
1.187 + Merge term dictionaries using the given indexing 'interval' and
1.188 + 'doc_interval'.
1.189 + """
1.190
1.191 readers = []
1.192 partitions = []
1.193 @@ -1520,7 +1545,7 @@
1.194 # Write directly to a dictionary.
1.195
1.196 if len(readers) > 1:
1.197 - writer = get_term_writer(self.pathname, "merged", interval)
1.198 + writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
1.199 merger = TermDictionaryMerger(writer, readers)
1.200 merger.merge()
1.201 merger.close()
1.202 @@ -1533,7 +1558,7 @@
1.203 elif len(readers) == 1 and partitions[0] != "merged":
1.204 rename_term_files(self.pathname, partitions[0], "merged")
1.205
1.206 - def merge_fields(self, interval=INTERVAL):
1.207 + def merge_fields(self, interval=FIELD_INTERVAL):
1.208
1.209 "Merge field dictionaries using the given indexing 'interval'."
1.210
2.1 --- a/test.py Wed Sep 02 01:30:42 2009 +0200
2.2 +++ b/test.py Wed Sep 02 22:25:29 2009 +0200
2.3 @@ -12,7 +12,9 @@
2.4 pass
2.5
2.6 try:
2.7 - os.removedirs("test_index")
2.8 + for filename in os.listdir("test_index"):
2.9 + os.remove(os.path.join("test_index", filename))
2.10 + os.rmdir("test_index")
2.11 except OSError:
2.12 pass
2.13
2.14 @@ -118,7 +120,7 @@
2.15 for doc_positions in all_doc_positions:
2.16 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
2.17 offsets.append((offset, doc_frequency))
2.18 -w.close()
2.19 +wd.close()
2.20
2.21 f = open("testP", "rb")
2.22 r = iixr.PositionReader(f)
2.23 @@ -130,7 +132,7 @@
2.24 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
2.25 dp = list(rd.read_term_positions(offset, doc_frequency))
2.26 print doc_positions == dp, doc_positions, dp
2.27 -r.close()
2.28 +rd.close()
2.29
2.30 # Test fields.
2.31
2.32 @@ -277,7 +279,12 @@
2.33 w = iixr.TermWriter(f)
2.34 f2 = open("testI", "wb")
2.35 w2 = iixr.TermIndexWriter(f2)
2.36 -wd = iixr.TermDictionaryWriter(w, w2, None, 3)
2.37 +f3 = open("testP", "wb")
2.38 +w3 = iixr.PositionWriter(f3)
2.39 +f4 = open("testPI", "wb")
2.40 +w4 = iixr.PositionIndexWriter(f4)
2.41 +wp = iixr.PositionDictionaryWriter(w3, w4, 2)
2.42 +wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
2.43 for term, offset, frequency, doc_frequency in terms:
2.44 wd._write_term(term, offset, frequency, doc_frequency)
2.45 wd.close()
2.46 @@ -286,7 +293,12 @@
2.47 r = iixr.TermReader(f)
2.48 f2 = open("testI", "rb")
2.49 r2 = iixr.TermIndexReader(f2)
2.50 -rd = iixr.TermDictionaryReader(r, r2, None)
2.51 +f3 = open("testP", "rb")
2.52 +r3 = iixr.PositionReader(f3)
2.53 +f4 = open("testPI", "rb")
2.54 +r4 = iixr.PositionIndexReader(f4)
2.55 +rp = iixr.PositionDictionaryReader(r3, r4)
2.56 +rd = iixr.TermDictionaryReader(r, r2, rp)
2.57 terms_reversed = terms[:]
2.58 terms_reversed.reverse()
2.59 for term, offset, frequency, doc_frequency in terms_reversed:
2.60 @@ -318,7 +330,7 @@
2.61 w3 = iixr.PositionWriter(f3)
2.62 f4 = open("testPI", "wb")
2.63 w4 = iixr.PositionIndexWriter(f4)
2.64 -wp = iixr.PositionDictionaryWriter(r3, r4, 2)
2.65 +wp = iixr.PositionDictionaryWriter(w3, w4, 2)
2.66 wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
2.67 for term, doc_positions in terms_with_positions:
2.68 wd.write_term_positions(term, doc_positions)
2.69 @@ -372,7 +384,7 @@
2.70 ]
2.71
2.72 index = iixr.Index("test_index")
2.73 -wi = index.get_writer(3, 6)
2.74 +wi = index.get_writer(3, 2, 6)
2.75 for docnum, text in docs:
2.76 for position, term in enumerate(text.split()):
2.77 wi.add_position(term, docnum, position)