# HG changeset patch
# User Paul Boddie
# Date 1290466203 -3600
# Node ID f1cbbf5ef885fac3039b6076cb7eb52ee3626d4d
# Parent  8d35240236b2a411c31a389881b4176687b236f6
Made partition discovery more widely available, adding code to find the next
partition number to use, thus avoiding overwriting index data when opening a
writer on an existing index.
Made sure that term and field dictionaries are always written out: this might
not occur if the underlying writers have been obtained from an index writer and
then used to write data directly.

diff -r 8d35240236b2 -r f1cbbf5ef885 iixr/filesystem.py
--- a/iixr/filesystem.py	Mon Nov 22 23:45:44 2010 +0100
+++ b/iixr/filesystem.py	Mon Nov 22 23:50:03 2010 +0100
@@ -21,10 +21,15 @@
 from iixr.fields import *
 from iixr.terms import *
 from iixr.positions import *
-from os import remove, rename   # partition manipulation
-from shutil import copy         # index updating
+from os import listdir, remove, rename  # partition manipulation
+from shutil import copy                 # index updating
 from os.path import join
 
+try:
+    set
+except NameError:
+    from sets import Set as set
+
 # Constants.
 
 TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
@@ -32,6 +37,42 @@
 
 # Utility functions.
 
+def get_partitions(pathname, prefix):
+
+    """
+    Return a set of partition names for partitions residing at the given
+    'pathname' having the given 'prefix'.
+    """
+
+    prefix_length = len(prefix)
+
+    partitions = set()
+    for filename in listdir(pathname):
+        if filename.startswith(prefix):
+            partitions.add(filename[prefix_length:])
+    return partitions
+
+def get_term_partitions(pathname):
+
+    """
+    Return a set of term partition identifiers for partitions residing at the
+    given 'pathname'.
+    """
+
+    return get_partitions(pathname, "terms-")
+
+def get_field_partitions(pathname):
+
+    """
+    Return a set of field partition identifiers for partitions residing at the
+    given 'pathname'.
+    """
+
+    return get_partitions(pathname, "fields-")
+
+def get_next_partition(partitions):
+    return max([int(n) for n in partitions if n.isdigit()] or [-1]) + 1
+
 def get_term_writer(pathname, partition, interval, doc_interval):
 
     """
diff -r 8d35240236b2 -r f1cbbf5ef885 iixr/index.py
--- a/iixr/index.py	Mon Nov 22 23:45:44 2010 +0100
+++ b/iixr/index.py	Mon Nov 22 23:50:03 2010 +0100
@@ -21,14 +21,9 @@
 from iixr.filesystem import *
 from iixr.merging import *
 from itertools import islice
-from os import listdir, mkdir   # index and partition discovery
+from os import mkdir            # index discovery
 from os.path import exists
 
-try:
-    set
-except NameError:
-    from sets import Set as set
-
 # Constants.
 
 TERM_INTERVAL = 100
@@ -98,8 +93,8 @@
         self.field_interval = field_interval
         self.flush_interval = flush_interval
 
-        self.dict_partition = 0
-        self.field_dict_partition = 0
+        self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
+        self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
 
         self.terms = {}
         self.docs = []
@@ -176,9 +171,9 @@
         self.field_dict_partition += 1
 
     def close(self):
-        if self.terms:
+        if self.terms or not get_term_partitions(self.pathname):
             self.flush_terms()
-        if self.docs:
+        if self.docs or not get_field_partitions(self.pathname):
             self.flush_fields()
 
 class IndexReader:
@@ -265,32 +260,17 @@
             self.reader = IndexReader(self.pathname)
         return self.reader
 
-    def _get_partitions(self, prefix):
-
-        """
-        Return a set of partition identifiers using 'prefix' to identify
-        relevant files.
-        """
-
-        prefix_length = len(prefix)
-
-        partitions = set()
-        for filename in listdir(self.pathname):
-            if filename.startswith(prefix):
-                partitions.add(filename[prefix_length:])
-        return partitions
-
     def get_term_partitions(self):
 
         "Return a set of term partition identifiers."
 
-        return self._get_partitions("terms-")
+        return get_term_partitions(self.pathname)
 
     def get_field_partitions(self):
 
         "Return a set of field partition identifiers."
 
-        return self._get_partitions("fields-")
+        return get_field_partitions(self.pathname)
 
     def merge(self):
 