1.1 --- a/iixr/__init__.py Sat Sep 19 01:43:35 2009 +0200
1.2 +++ b/iixr/__init__.py Sat Sep 19 21:36:32 2009 +0200
1.3 @@ -18,6 +18,6 @@
1.4 with this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 -from iixr.index import *
1.8 +from iixr.index import Document, Index
1.9
1.10 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/iixr/filesystem.py Sat Sep 19 01:43:35 2009 +0200
2.2 +++ b/iixr/filesystem.py Sat Sep 19 21:36:32 2009 +0200
2.3 @@ -21,7 +21,8 @@
2.4 from iixr.fields import *
2.5 from iixr.terms import *
2.6 from iixr.positions import *
2.7 -from os import remove, rename # partition manipulation
2.8 +from os import remove, rename # partition manipulation
2.9 +from shutil import copy # index updating
2.10 from os.path import join
2.11
2.12 # Constants.
2.13 @@ -106,6 +107,8 @@
2.14
2.15 return FieldDictionaryReader(field_reader, field_index_reader)
2.16
2.17 +# Renaming.
2.18 +
2.19 def rename_files(pathname, names, from_partition, to_partition):
2.20 for name in names:
2.21 rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))
2.22 @@ -116,6 +119,8 @@
2.23 def rename_field_files(pathname, from_partition, to_partition):
2.24 rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
2.25
2.26 +# Removal/deletion.
2.27 +
2.28 def remove_files(pathname, names, partition):
2.29 for name in names:
2.30 remove(join(pathname, "%s-%s" % (name, partition)))
2.31 @@ -126,4 +131,17 @@
2.32 def remove_field_files(pathname, partition):
2.33 remove_files(pathname, FIELD_FILENAMES, partition)
2.34
2.35 +# Copying.
2.36 +
2.37 +def copy_files(source, names, partition, destination, suffix):
2.38 + for name in names:
2.39 + filename = "%s-%s" % (name, partition)
2.40 + copy(join(source, filename), join(destination, filename + suffix))
2.41 +
2.42 +def copy_term_files(source, partition, destination, suffix):
2.43 + copy_files(source, TERM_FILENAMES, partition, destination, suffix)
2.44 +
2.45 +def copy_field_files(source, partition, destination, suffix):
2.46 + copy_files(source, FIELD_FILENAMES, partition, destination, suffix)
2.47 +
2.48 # vim: tabstop=4 expandtab shiftwidth=4
3.1 --- a/iixr/index.py Sat Sep 19 01:43:35 2009 +0200
3.2 +++ b/iixr/index.py Sat Sep 19 21:36:32 2009 +0200
3.3 @@ -234,14 +234,41 @@
3.4 self.reader = IndexReader(self.pathname)
3.5 return self.reader
3.6
3.7 + def _get_partitions(self, prefix):
3.8 +
3.9 + """
3.10 + Return a set of partition identifiers using 'prefix' to identify
3.11 + relevant files.
3.12 + """
3.13 +
3.14 + prefix_length = len(prefix)
3.15 +
3.16 + partitions = set()
3.17 + for filename in listdir(self.pathname):
3.18 + if filename.startswith(prefix):
3.19 + partitions.add(filename[prefix_length:])
3.20 + return partitions
3.21 +
3.22 + def get_term_partitions(self):
3.23 +
3.24 + "Return a set of term partition identifiers."
3.25 +
3.26 + return self._get_partitions("terms-")
3.27 +
3.28 + def get_field_partitions(self):
3.29 +
3.30 + "Return a set of field partition identifiers."
3.31 +
3.32 + return self._get_partitions("fields-")
3.33 +
3.34 def merge(self):
3.35
3.36 "Merge/optimise index partitions."
3.37
3.38 - self.merge_terms()
3.39 - self.merge_fields()
3.40 + self._merge_terms()
3.41 + self._merge_fields()
3.42
3.43 - def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
3.44 + def _merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
3.45
3.46 """
3.47 Merge term dictionaries using the given indexing 'interval' and
3.48 @@ -249,13 +276,10 @@
3.49 """
3.50
3.51 readers = []
3.52 - partitions = set()
3.53 + partitions = self.get_term_partitions()
3.54
3.55 - for filename in listdir(self.pathname):
3.56 - if filename.startswith("terms-"): # 6 character prefix
3.57 - partition = filename[6:]
3.58 - readers.append(get_term_reader(self.pathname, partition))
3.59 - partitions.add(partition)
3.60 + for partition in partitions:
3.61 + readers.append(get_term_reader(self.pathname, partition))
3.62
3.63 # Write directly to a dictionary.
3.64
3.65 @@ -280,18 +304,15 @@
3.66 if partition != "merged":
3.67 rename_term_files(self.pathname, partition, "merged")
3.68
3.69 - def merge_fields(self, interval=FIELD_INTERVAL):
3.70 + def _merge_fields(self, interval=FIELD_INTERVAL):
3.71
3.72 "Merge field dictionaries using the given indexing 'interval'."
3.73
3.74 readers = []
3.75 - partitions = set()
3.76 + partitions = self.get_field_partitions()
3.77
3.78 - for filename in listdir(self.pathname):
3.79 - if filename.startswith("fields-"): # 7 character prefix
3.80 - partition = filename[7:]
3.81 - readers.append(get_field_reader(self.pathname, partition))
3.82 - partitions.add(partition)
3.83 + for partition in partitions:
3.84 + readers.append(get_field_reader(self.pathname, partition))
3.85
3.86 # Write directly to a dictionary.
3.87
3.88 @@ -316,6 +337,18 @@
3.89 if partition != "merged":
3.90 rename_field_files(self.pathname, partition, "merged")
3.91
3.92 + def update(self, other_indexes):
3.93 +
3.94 + "Copy the content of the 'other_indexes' into this index and merge."
3.95 +
3.96 + for i, index in enumerate(other_indexes):
3.97 + for partition in index.get_term_partitions():
3.98 + copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i)
3.99 + for partition in index.get_field_partitions():
3.100 + copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i)
3.101 +
3.102 + self.merge()
3.103 +
3.104 def close(self):
3.105 if self.reader is not None:
3.106 self.reader.close()
4.1 --- a/iixr/positions.py Sat Sep 19 01:43:35 2009 +0200
4.2 +++ b/iixr/positions.py Sat Sep 19 21:36:32 2009 +0200
4.3 @@ -328,6 +328,7 @@
4.4 def __init__(self, position_opener, position_index_opener):
4.5 self.position_opener = position_opener
4.6 self.position_index_opener = position_index_opener
4.7 + self.position_dict_iterators = []
4.8
4.9 def read_term_positions(self, offset, doc_frequency):
4.10
4.11 @@ -336,11 +337,15 @@
4.12 given 'doc_frequency'.
4.13 """
4.14
4.15 - return PositionDictionaryIterator(self.position_opener,
4.16 + it = PositionDictionaryIterator(self.position_opener,
4.17 self.position_index_opener, offset, doc_frequency)
4.18 + self.position_dict_iterators.append(it)
4.19 + return it
4.20
4.21 def close(self):
4.22 - pass
4.23 + for it in self.position_dict_iterators:
4.24 + it.close()
4.25 + self.position_dict_iterators = []
4.26
4.27 class PositionDictionaryIterator:
4.28
5.1 --- a/iixr/terms.py Sat Sep 19 01:43:35 2009 +0200
5.2 +++ b/iixr/terms.py Sat Sep 19 21:36:32 2009 +0200
5.3 @@ -295,6 +295,12 @@
5.4 return None
5.5
5.6 def _get_positions(self, offset, doc_frequency):
5.7 +
5.8 + """
5.9 + Obtain positions from the position index 'offset' expecting a number of
5.10 + documents equal to the given 'doc_frequency'.
5.11 + """
5.12 +
5.13 return self.position_dict_reader.read_term_positions(offset, doc_frequency)
5.14
5.15 # Iterator convenience methods.
5.16 @@ -396,5 +402,8 @@
5.17 self.info_reader.close()
5.18 self.index_reader.close()
5.19 self.position_dict_reader.close()
5.20 + if self.position_dict_iterator is not None:
5.21 + self.position_dict_iterator.close()
5.22 + self.position_dict_iterator = None
5.23
5.24 # vim: tabstop=4 expandtab shiftwidth=4
6.1 --- a/test.py Sat Sep 19 01:43:35 2009 +0200
6.2 +++ b/test.py Sat Sep 19 21:36:32 2009 +0200
6.3 @@ -436,4 +436,39 @@
6.4 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
6.5 index.close()
6.6
6.7 +# Test index updates.
6.8 +
6.9 +index = Index("test_index")
6.10 +
6.11 +index2 = Index("test_index2")
6.12 +wi = index2.get_writer(3, 2, 6)
6.13 +for docnum, text in docs:
6.14 +
6.15 + # Add the same documents but with different numbers.
6.16 +
6.17 + doc = Document(docnum + 100)
6.18 + for position, term in enumerate(text.split()):
6.19 + doc.add_position(term, position)
6.20 + doc.add_field(123, text)
6.21 + wi.add_document(doc)
6.22 +wi.close()
6.23 +
6.24 +index2.update([index])
6.25 +index.close()
6.26 +
6.27 +rd = index2.get_reader()
6.28 +for term, frequency, doc_positions in doc_tests:
6.29 +
6.30 + # Add the extra documents to the expected result.
6.31 +
6.32 + for docnum, positions in doc_positions[:]:
6.33 + doc_positions.append((docnum + 100, positions))
6.34 + frequency *= 2
6.35 +
6.36 + dp = list(rd.find_positions(term))
6.37 + print doc_positions == dp, doc_positions, dp
6.38 + fr = rd.get_frequency(term)
6.39 + print frequency == fr, frequency, fr
6.40 +index2.close()
6.41 +
6.42 # vim: tabstop=4 expandtab shiftwidth=4