Added measures for the closure of position iterators. Added index updating using the contents of other indexes. Restricted top-level imports.

     1.1 --- a/iixr/__init__.py	Sat Sep 19 01:43:35 2009 +0200
     1.2 +++ b/iixr/__init__.py	Sat Sep 19 21:36:32 2009 +0200
     1.3 @@ -18,6 +18,6 @@
     1.4  with this program.  If not, see <http://www.gnu.org/licenses/>.
     1.5  """
     1.6  
     1.7 -from iixr.index import *
     1.8 +from iixr.index import Document, Index
     1.9  
    1.10  # vim: tabstop=4 expandtab shiftwidth=4

     2.1 --- a/iixr/filesystem.py	Sat Sep 19 01:43:35 2009 +0200
     2.2 +++ b/iixr/filesystem.py	Sat Sep 19 21:36:32 2009 +0200
     2.3 @@ -21,7 +21,8 @@
     2.4  from iixr.fields import *
     2.5  from iixr.terms import *
     2.6  from iixr.positions import *
     2.7 -from os import remove, rename    # partition manipulation
     2.8 +from os import remove, rename   # partition manipulation
     2.9 +from shutil import copy         # index updating
    2.10  from os.path import join
    2.11  
    2.12  # Constants.
    2.13 @@ -106,6 +107,8 @@
    2.14  
    2.15      return FieldDictionaryReader(field_reader, field_index_reader)
    2.16  
    2.17 +# Renaming.
    2.18 +
    2.19  def rename_files(pathname, names, from_partition, to_partition):
    2.20      for name in names:
    2.21          rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))
    2.22 @@ -116,6 +119,8 @@
    2.23  def rename_field_files(pathname, from_partition, to_partition):
    2.24      rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
    2.25  
    2.26 +# Removal/deletion.
    2.27 +
    2.28  def remove_files(pathname, names, partition):
    2.29      for name in names:
    2.30          remove(join(pathname, "%s-%s" % (name, partition)))
    2.31 @@ -126,4 +131,17 @@
    2.32  def remove_field_files(pathname, partition):
    2.33      remove_files(pathname, FIELD_FILENAMES, partition)
    2.34  
    2.35 +# Copying.
    2.36 +
    2.37 +def copy_files(source, names, partition, destination, suffix):
    2.38 +    for name in names:
    2.39 +        filename = "%s-%s" % (name, partition)
    2.40 +        copy(join(source, filename), join(destination, filename + suffix))
    2.41 +
    2.42 +def copy_term_files(source, partition, destination, suffix):
    2.43 +    copy_files(source, TERM_FILENAMES, partition, destination, suffix)
    2.44 +
    2.45 +def copy_field_files(source, partition, destination, suffix):
    2.46 +    copy_files(source, FIELD_FILENAMES, partition, destination, suffix)
    2.47 +
    2.48  # vim: tabstop=4 expandtab shiftwidth=4

     3.1 --- a/iixr/index.py	Sat Sep 19 01:43:35 2009 +0200
     3.2 +++ b/iixr/index.py	Sat Sep 19 21:36:32 2009 +0200
     3.3 @@ -234,14 +234,41 @@
     3.4          self.reader = IndexReader(self.pathname)
     3.5          return self.reader
     3.6  
     3.7 +    def _get_partitions(self, prefix):
     3.8 +
     3.9 +        """
    3.10 +        Return a set of partition identifiers using 'prefix' to identify
    3.11 +        relevant files.
    3.12 +        """
    3.13 +
    3.14 +        prefix_length = len(prefix)
    3.15 +
    3.16 +        partitions = set()
    3.17 +        for filename in listdir(self.pathname):
    3.18 +            if filename.startswith(prefix):
    3.19 +                partitions.add(filename[prefix_length:])
    3.20 +        return partitions
    3.21 +
    3.22 +    def get_term_partitions(self):
    3.23 +
    3.24 +        "Return a set of term partition identifiers."
    3.25 +
    3.26 +        return self._get_partitions("terms-")
    3.27 +
    3.28 +    def get_field_partitions(self):
    3.29 +
    3.30 +        "Return a set of field partition identifiers."
    3.31 +
    3.32 +        return self._get_partitions("fields-")
    3.33 +
    3.34      def merge(self):
    3.35  
    3.36          "Merge/optimise index partitions."
    3.37  
    3.38 -        self.merge_terms()
    3.39 -        self.merge_fields()
    3.40 +        self._merge_terms()
    3.41 +        self._merge_fields()
    3.42  
    3.43 -    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
    3.44 +    def _merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):
    3.45  
    3.46          """
    3.47          Merge term dictionaries using the given indexing 'interval' and
    3.48 @@ -249,13 +276,10 @@
    3.49          """
    3.50  
    3.51          readers = []
    3.52 -        partitions = set()
    3.53 +        partitions = self.get_term_partitions()
    3.54  
    3.55 -        for filename in listdir(self.pathname):
    3.56 -            if filename.startswith("terms-"): # 6 character prefix
    3.57 -                partition = filename[6:]
    3.58 -                readers.append(get_term_reader(self.pathname, partition))
    3.59 -                partitions.add(partition)
    3.60 +        for partition in partitions:
    3.61 +            readers.append(get_term_reader(self.pathname, partition))
    3.62  
    3.63          # Write directly to a dictionary.
    3.64  
    3.65 @@ -280,18 +304,15 @@
    3.66              if partition != "merged":
    3.67                  rename_term_files(self.pathname, partition, "merged")
    3.68  
    3.69 -    def merge_fields(self, interval=FIELD_INTERVAL):
    3.70 +    def _merge_fields(self, interval=FIELD_INTERVAL):
    3.71  
    3.72          "Merge field dictionaries using the given indexing 'interval'."
    3.73  
    3.74          readers = []
    3.75 -        partitions = set()
    3.76 +        partitions = self.get_field_partitions()
    3.77  
    3.78 -        for filename in listdir(self.pathname):
    3.79 -            if filename.startswith("fields-"): # 7 character prefix
    3.80 -                partition = filename[7:]
    3.81 -                readers.append(get_field_reader(self.pathname, partition))
    3.82 -                partitions.add(partition)
    3.83 +        for partition in partitions:
    3.84 +            readers.append(get_field_reader(self.pathname, partition))
    3.85  
    3.86          # Write directly to a dictionary.
    3.87  
    3.88 @@ -316,6 +337,18 @@
    3.89              if partition != "merged":
    3.90                  rename_field_files(self.pathname, partition, "merged")
    3.91  
    3.92 +    def update(self, other_indexes):
    3.93 +
    3.94 +        "Copy the content of the 'other_indexes' into this index and merge."
    3.95 +
    3.96 +        for i, index in enumerate(other_indexes):
    3.97 +            for partition in index.get_term_partitions():
    3.98 +                copy_term_files(index.pathname, partition, self.pathname, "-added-%d" % i)
    3.99 +            for partition in index.get_field_partitions():
   3.100 +                copy_field_files(index.pathname, partition, self.pathname, "-added-%d" % i)
   3.101 +
   3.102 +        self.merge()
   3.103 +
   3.104      def close(self):
   3.105          if self.reader is not None:
   3.106              self.reader.close()

     4.1 --- a/iixr/positions.py	Sat Sep 19 01:43:35 2009 +0200
     4.2 +++ b/iixr/positions.py	Sat Sep 19 21:36:32 2009 +0200
     4.3 @@ -328,6 +328,7 @@
     4.4      def __init__(self, position_opener, position_index_opener):
     4.5          self.position_opener = position_opener
     4.6          self.position_index_opener = position_index_opener
     4.7 +        self.position_dict_iterators = []
     4.8  
     4.9      def read_term_positions(self, offset, doc_frequency):
    4.10  
    4.11 @@ -336,11 +337,15 @@
    4.12          given 'doc_frequency'.
    4.13          """
    4.14  
    4.15 -        return PositionDictionaryIterator(self.position_opener,
    4.16 +        it = PositionDictionaryIterator(self.position_opener,
    4.17              self.position_index_opener, offset, doc_frequency)
    4.18 +        self.position_dict_iterators.append(it)
    4.19 +        return it
    4.20  
    4.21      def close(self):
    4.22 -        pass
    4.23 +        for it in self.position_dict_iterators:
    4.24 +            it.close()
    4.25 +        self.position_dict_iterators = []
    4.26  
    4.27  class PositionDictionaryIterator:
    4.28  

     5.1 --- a/iixr/terms.py	Sat Sep 19 01:43:35 2009 +0200
     5.2 +++ b/iixr/terms.py	Sat Sep 19 21:36:32 2009 +0200
     5.3 @@ -295,6 +295,12 @@
     5.4              return None
     5.5  
     5.6      def _get_positions(self, offset, doc_frequency):
     5.7 +
     5.8 +        """
     5.9 +        Obtain positions from the position index 'offset' expecting a number of
    5.10 +        documents equal to the given 'doc_frequency'.
    5.11 +        """
    5.12 +
    5.13          return self.position_dict_reader.read_term_positions(offset, doc_frequency)
    5.14  
    5.15      # Iterator convenience methods.
    5.16 @@ -396,5 +402,8 @@
    5.17          self.info_reader.close()
    5.18          self.index_reader.close()
    5.19          self.position_dict_reader.close()
    5.20 +        if self.position_dict_iterator is not None:
    5.21 +            self.position_dict_iterator.close()
    5.22 +            self.position_dict_iterator = None
    5.23  
    5.24  # vim: tabstop=4 expandtab shiftwidth=4

     6.1 --- a/test.py	Sat Sep 19 01:43:35 2009 +0200
     6.2 +++ b/test.py	Sat Sep 19 21:36:32 2009 +0200
     6.3 @@ -436,4 +436,39 @@
     6.4      print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
     6.5  index.close()
     6.6  
     6.7 +# Test index updates.
     6.8 +
     6.9 +index = Index("test_index")
    6.10 +
    6.11 +index2 = Index("test_index2")
    6.12 +wi = index2.get_writer(3, 2, 6)
    6.13 +for docnum, text in docs:
    6.14 +
    6.15 +    # Add the same documents but with different numbers.
    6.16 +
    6.17 +    doc = Document(docnum + 100)
    6.18 +    for position, term in enumerate(text.split()):
    6.19 +        doc.add_position(term, position)
    6.20 +    doc.add_field(123, text)
    6.21 +    wi.add_document(doc)
    6.22 +wi.close()
    6.23 +
    6.24 +index2.update([index])
    6.25 +index.close()
    6.26 +
    6.27 +rd = index2.get_reader()
    6.28 +for term, frequency, doc_positions in doc_tests:
    6.29 +
    6.30 +    # Add the extra documents to the expected result.
    6.31 +
    6.32 +    for docnum, positions in doc_positions[:]:
    6.33 +        doc_positions.append((docnum + 100, positions))
    6.34 +    frequency *= 2
    6.35 +
    6.36 +    dp = list(rd.find_positions(term))
    6.37 +    print doc_positions == dp, doc_positions, dp
    6.38 +    fr = rd.get_frequency(term)
    6.39 +    print frequency == fr, frequency, fr
    6.40 +index2.close()
    6.41 +
    6.42  # vim: tabstop=4 expandtab shiftwidth=4
Changeset 2009-09-19, Paul Boddie: Added measures for the closure of position iterators. Added index updating using the contents of other indexes. Restricted top-level imports.
Files changed: iixr/__init__.py, iixr/filesystem.py, iixr/index.py, iixr/positions.py, iixr/terms.py, test.py