1.1 --- a/iixr/filesystem.py Mon Nov 22 23:45:44 2010 +0100
1.2 +++ b/iixr/filesystem.py Mon Nov 22 23:50:03 2010 +0100
1.3 @@ -21,10 +21,15 @@
1.4 from iixr.fields import *
1.5 from iixr.terms import *
1.6 from iixr.positions import *
1.7 -from os import remove, rename # partition manipulation
1.8 -from shutil import copy # index updating
1.9 +from os import listdir, remove, rename # partition manipulation
1.10 +from shutil import copy # index updating
1.11 from os.path import join
1.12
1.13 +try:
1.14 + set
1.15 +except NameError:
1.16 + from sets import Set as set
1.17 +
1.18 # Constants.
1.19
1.20 TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
1.21 @@ -32,6 +37,42 @@
1.22
1.23 # Utility functions.
1.24
1.25 +def get_partitions(pathname, prefix):
1.26 +
1.27 + """
1.28 + Return a set of partition identifiers taken from the names of the files
1.29 + residing at the given 'pathname' which start with the given 'prefix'.
1.30 + """
1.31 +
1.32 + prefix_length = len(prefix)
1.33 +
1.34 + partitions = set()
1.35 + for filename in listdir(pathname):
1.36 + if filename.startswith(prefix):
1.37 + partitions.add(filename[prefix_length:])
1.38 + return partitions
1.39 +
1.40 +def get_term_partitions(pathname):
1.41 +
1.42 + """
1.43 + Return a set of term partition identifiers for partitions residing at the
1.44 + given 'pathname'.
1.45 + """
1.46 +
1.47 + return get_partitions(pathname, "terms-")
1.48 +
1.49 +def get_field_partitions(pathname):
1.50 +
1.51 + """
1.52 + Return a set of field partition identifiers for partitions residing at the
1.53 + given 'pathname'.
1.54 + """
1.55 +
1.56 + return get_partitions(pathname, "fields-")
1.57 +
1.58 +def get_next_partition(partitions):
1.59 + return max([int(n) for n in partitions if n.isdigit()] or [-1]) + 1
1.60 +
1.61 def get_term_writer(pathname, partition, interval, doc_interval):
1.62
1.63 """
2.1 --- a/iixr/index.py Mon Nov 22 23:45:44 2010 +0100
2.2 +++ b/iixr/index.py Mon Nov 22 23:50:03 2010 +0100
2.3 @@ -21,14 +21,9 @@
2.4 from iixr.filesystem import *
2.5 from iixr.merging import *
2.6 from itertools import islice
2.7 -from os import listdir, mkdir # index and partition discovery
2.8 +from os import mkdir # index discovery
2.9 from os.path import exists
2.10
2.11 -try:
2.12 - set
2.13 -except NameError:
2.14 - from sets import Set as set
2.15 -
2.16 # Constants.
2.17
2.18 TERM_INTERVAL = 100
2.19 @@ -98,8 +93,8 @@
2.20 self.field_interval = field_interval
2.21 self.flush_interval = flush_interval
2.22
2.23 - self.dict_partition = 0
2.24 - self.field_dict_partition = 0
2.25 + self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
2.26 + self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
2.27
2.28 self.terms = {}
2.29 self.docs = []
2.30 @@ -176,9 +171,9 @@
2.31 self.field_dict_partition += 1
2.32
2.33 def close(self):
2.34 - if self.terms:
2.35 + if self.terms or not get_term_partitions(self.pathname):
2.36 self.flush_terms()
2.37 - if self.docs:
2.38 + if self.docs or not get_field_partitions(self.pathname):
2.39 self.flush_fields()
2.40
2.41 class IndexReader:
2.42 @@ -265,32 +260,17 @@
2.43 self.reader = IndexReader(self.pathname)
2.44 return self.reader
2.45
2.46 - def _get_partitions(self, prefix):
2.47 -
2.48 - """
2.49 - Return a set of partition identifiers using 'prefix' to identify
2.50 - relevant files.
2.51 - """
2.52 -
2.53 - prefix_length = len(prefix)
2.54 -
2.55 - partitions = set()
2.56 - for filename in listdir(self.pathname):
2.57 - if filename.startswith(prefix):
2.58 - partitions.add(filename[prefix_length:])
2.59 - return partitions
2.60 -
2.61 def get_term_partitions(self):
2.62
2.63 "Return a set of term partition identifiers."
2.64
2.65 - return self._get_partitions("terms-")
2.66 + return get_term_partitions(self.pathname)
2.67
2.68 def get_field_partitions(self):
2.69
2.70 "Return a set of field partition identifiers."
2.71
2.72 - return self._get_partitions("fields-")
2.73 + return get_field_partitions(self.pathname)
2.74
2.75 def merge(self):
2.76