# HG changeset patch
# User Paul Boddie
# Date 1253388992 -7200
# Node ID 495d396e86c55a0a33ae9735a18b7d017c85e448
# Parent 2bed39249624aa5c7e747d7e42367752d4adc77a
Added measures for the closure of position iterators.
Added index updating using the contents of other indexes.
Restricted top-level imports.

diff -r 2bed39249624 -r 495d396e86c5 iixr/__init__.py
--- a/iixr/__init__.py	Sat Sep 19 01:43:35 2009 +0200
+++ b/iixr/__init__.py	Sat Sep 19 21:36:32 2009 +0200
@@ -18,6 +18,6 @@
 with this program. If not, see .
 """
 
-from iixr.index import *
+from iixr.index import Document, Index
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 2bed39249624 -r 495d396e86c5 iixr/filesystem.py
--- a/iixr/filesystem.py	Sat Sep 19 01:43:35 2009 +0200
+++ b/iixr/filesystem.py	Sat Sep 19 21:36:32 2009 +0200
@@ -21,7 +21,8 @@
 from iixr.fields import *
 from iixr.terms import *
 from iixr.positions import *
-from os import remove, rename # partition manipulation
+from os import remove, rename   # partition manipulation
+from shutil import copy         # index updating
 from os.path import join
 
 # Constants.
@@ -106,6 +107,8 @@
 
     return FieldDictionaryReader(field_reader, field_index_reader)
 
+# Renaming.
+
 def rename_files(pathname, names, from_partition, to_partition):
     for name in names:
         rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))
@@ -116,6 +119,8 @@
 def rename_field_files(pathname, from_partition, to_partition):
     rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
 
+# Removal/deletion.
+
 def remove_files(pathname, names, partition):
     for name in names:
         remove(join(pathname, "%s-%s" % (name, partition)))
@@ -126,4 +131,17 @@
 def remove_field_files(pathname, partition):
     remove_files(pathname, FIELD_FILENAMES, partition)
 
+# Copying.
+
+def copy_files(source, names, partition, destination, suffix):
+    for name in names:
+        filename = "%s-%s" % (name, partition)
+        copy(join(source, filename), join(destination, filename + suffix))
+
+def copy_term_files(source, partition, destination, suffix):
+    copy_files(source, TERM_FILENAMES, partition, destination, suffix)
+
+def copy_field_files(source, partition, destination, suffix):
+    copy_files(source, FIELD_FILENAMES, partition, destination, suffix)
+
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 2bed39249624 -r 495d396e86c5 iixr/index.py
--- a/iixr/index.py	Sat Sep 19 01:43:35 2009 +0200
+++ b/iixr/index.py	Sat Sep 19 21:36:32 2009 +0200
@@ -234,14 +234,41 @@
         self.reader = IndexReader(self.pathname)
         return self.reader
 
+    def _get_partitions(self, prefix):
+
+        """
+        Return a set of partition identifiers using 'prefix' to identify
+        relevant files.
+        """
+
+        prefix_length = len(prefix)
+
+        partitions = set()
+        for filename in listdir(self.pathname):
+            if filename.startswith(prefix):
+                partitions.add(filename[prefix_length:])
+        return partitions
+
+    def get_term_partitions(self):
+
+        "Return a set of term partition identifiers."
+
+        return self._get_partitions("terms-")
+
+    def get_field_partitions(self):
+
+        "Return a set of field partition identifiers."
+
+        return self._get_partitions("fields-")
+
     def merge(self):
 
         "Merge/optimise index partitions."
 
-        self.merge_terms()
-        self.merge_fields()
+        self._merge_terms()
+        self._merge_fields()
 
-    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
+    def _merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
 
         """
         Merge term dictionaries using the given indexing 'interval' and
@@ -249,13 +276,10 @@
         """
 
         readers = []
-        partitions = set()
+        partitions = self.get_term_partitions()
 
-        for filename in listdir(self.pathname):
-            if filename.startswith("terms-"): # 6 character prefix
-                partition = filename[6:]
-                readers.append(get_term_reader(self.pathname, partition))
-                partitions.add(partition)
+        for partition in partitions:
+            readers.append(get_term_reader(self.pathname, partition))
 
         # Write directly to a dictionary.
 
@@ -280,18 +304,15 @@
             if partition != "merged":
                 rename_term_files(self.pathname, partition, "merged")
 
-    def merge_fields(self, interval=FIELD_INTERVAL):
+    def _merge_fields(self, interval=FIELD_INTERVAL):
 
         "Merge field dictionaries using the given indexing 'interval'."
 
         readers = []
-        partitions = set()
+        partitions = self.get_field_partitions()
 
-        for filename in listdir(self.pathname):
-            if filename.startswith("fields-"): # 7 character prefix
-                partition = filename[7:]
-                readers.append(get_field_reader(self.pathname, partition))
-                partitions.add(partition)
+        for partition in partitions:
+            readers.append(get_field_reader(self.pathname, partition))
 
         # Write directly to a dictionary.
 
@@ -316,6 +337,18 @@
             if partition != "merged":
                 rename_field_files(self.pathname, partition, "merged")
 
+    def update(self, other_indexes):
+
+        "Copy the content of the 'other_indexes' into this index and merge."
+
+        for i, index in enumerate(other_indexes):
+            for partition in index.get_term_partitions():
+                copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i)
+            for partition in index.get_field_partitions():
+                copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i)
+
+        self.merge()
+
     def close(self):
         if self.reader is not None:
             self.reader.close()
diff -r 2bed39249624 -r 495d396e86c5 iixr/positions.py
--- a/iixr/positions.py	Sat Sep 19 01:43:35 2009 +0200
+++ b/iixr/positions.py	Sat Sep 19 21:36:32 2009 +0200
@@ -328,6 +328,7 @@
     def __init__(self, position_opener, position_index_opener):
         self.position_opener = position_opener
         self.position_index_opener = position_index_opener
+        self.position_dict_iterators = []
 
     def read_term_positions(self, offset, doc_frequency):
 
@@ -336,11 +337,15 @@
         given 'doc_frequency'.
         """
 
-        return PositionDictionaryIterator(self.position_opener,
+        it = PositionDictionaryIterator(self.position_opener,
             self.position_index_opener, offset, doc_frequency)
+        self.position_dict_iterators.append(it)
+        return it
 
     def close(self):
-        pass
+        for it in self.position_dict_iterators:
+            it.close()
+        self.position_dict_iterators = []
 
 class PositionDictionaryIterator:
 
diff -r 2bed39249624 -r 495d396e86c5 iixr/terms.py
--- a/iixr/terms.py	Sat Sep 19 01:43:35 2009 +0200
+++ b/iixr/terms.py	Sat Sep 19 21:36:32 2009 +0200
@@ -295,6 +295,12 @@
             return None
 
     def _get_positions(self, offset, doc_frequency):
+
+        """
+        Obtain positions from the position index 'offset' expecting a number of
+        documents equal to the given 'doc_frequency'.
+        """
+
         return self.position_dict_reader.read_term_positions(offset, doc_frequency)
 
     # Iterator convenience methods.
@@ -396,5 +402,8 @@
         self.info_reader.close()
         self.index_reader.close()
         self.position_dict_reader.close()
+        if self.position_dict_iterator is not None:
+            self.position_dict_iterator.close()
+            self.position_dict_iterator = None
 
 # vim: tabstop=4 expandtab shiftwidth=4
diff -r 2bed39249624 -r 495d396e86c5 test.py
--- a/test.py	Sat Sep 19 01:43:35 2009 +0200
+++ b/test.py	Sat Sep 19 21:36:32 2009 +0200
@@ -436,4 +436,39 @@
     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
 index.close()
 
+# Test index updates.
+
+index = Index("test_index")
+
+index2 = Index("test_index2")
+wi = index2.get_writer(3, 2, 6)
+for docnum, text in docs:
+
+    # Add the same documents but with different numbers.
+
+    doc = Document(docnum + 100)
+    for position, term in enumerate(text.split()):
+        doc.add_position(term, position)
+    doc.add_field(123, text)
+    wi.add_document(doc)
+wi.close()
+
+index2.update([index])
+index.close()
+
+rd = index2.get_reader()
+for term, frequency, doc_positions in doc_tests:
+
+    # Add the extra documents to the expected result.
+
+    for docnum, positions in doc_positions[:]:
+        doc_positions.append((docnum + 100, positions))
+    frequency *= 2
+
+    dp = list(rd.find_positions(term))
+    print doc_positions == dp, doc_positions, dp
+    fr = rd.get_frequency(term)
+    print frequency == fr, frequency, fr
+index2.close()
+
 # vim: tabstop=4 expandtab shiftwidth=4