1.1 --- a/iixr.py Wed Sep 02 22:25:29 2009 +0200
1.2 +++ b/iixr.py Thu Sep 03 01:09:06 2009 +0200
1.3 @@ -27,6 +27,11 @@
1.4 from bisect import insort_right # to maintain a sorted list of data for merging
1.5 import bz2, zlib # for field compression
1.6
1.7 +try:
1.8 + set
1.9 +except NameError:
1.10 + from sets import Set as set
1.11 +
1.12 # Constants.
1.13
1.14 TERM_INTERVAL = 100
1.15 @@ -285,42 +290,6 @@
1.16 f.seek(offset)
1.17 return PositionIterator(f, count)
1.18
1.19 -class IteratorBase:
1.20 -
1.21 - def __init__(self, count):
1.22 - self.replenish(count)
1.23 -
1.24 - def replenish(self, count):
1.25 - self.count = count
1.26 - self.read_documents = 0
1.27 -
1.28 - def __len__(self):
1.29 - return self.count
1.30 -
1.31 - def sort(self):
1.32 - pass # Stored document positions are already sorted.
1.33 -
1.34 - def __iter__(self):
1.35 - return self
1.36 -
1.37 -class PositionIterator(PositionReader, IteratorBase):
1.38 -
1.39 - "Iterating over document positions."
1.40 -
1.41 - def __init__(self, f, count):
1.42 - PositionReader.__init__(self, f)
1.43 - IteratorBase.__init__(self, count)
1.44 -
1.45 - def next(self):
1.46 -
1.47 - "Read positions for a single document."
1.48 -
1.49 - if self.read_documents < self.count:
1.50 - self.read_documents += 1
1.51 - return self.read_positions()
1.52 - else:
1.53 - raise StopIteration
1.54 -
1.55 class PositionIndexWriter(FileWriter):
1.56
1.57 "Writing position index information to files."
1.58 @@ -403,6 +372,44 @@
1.59 f.seek(offset)
1.60 return PositionIndexIterator(f, doc_frequency)
1.61
1.62 +# Iterators for position-related files.
1.63 +
1.64 +class IteratorBase:
1.65 +
1.66 + def __init__(self, count):
1.67 + self.replenish(count)
1.68 +
1.69 + def replenish(self, count):
1.70 + self.count = count
1.71 + self.read_documents = 0
1.72 +
1.73 + def __len__(self):
1.74 + return self.count
1.75 +
1.76 + def sort(self):
1.77 + pass # Stored document positions are already sorted.
1.78 +
1.79 + def __iter__(self):
1.80 + return self
1.81 +
1.82 +class PositionIterator(PositionReader, IteratorBase):
1.83 +
1.84 + "Iterating over document positions."
1.85 +
1.86 + def __init__(self, f, count):
1.87 + PositionReader.__init__(self, f)
1.88 + IteratorBase.__init__(self, count)
1.89 +
1.90 + def next(self):
1.91 +
1.92 + "Read positions for a single document."
1.93 +
1.94 + if self.read_documents < self.count:
1.95 + self.read_documents += 1
1.96 + return self.read_positions()
1.97 + else:
1.98 + raise StopIteration
1.99 +
1.100 class PositionIndexIterator(PositionIndexReader, IteratorBase):
1.101
1.102 "Iterating over document positions."
1.103 @@ -533,10 +540,16 @@
1.104 def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
1.105 self.position_reader = position_reader
1.106 self.doc_frequency = doc_frequency
1.107 + self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
1.108
1.109 - self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
1.110 - self.next_section()
1.111 - self.init_section()
1.112 + # Maintain state for the next index entry, if read.
1.113 +
1.114 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.115 +
1.116 + # Initialise the current index entry and current position file iterator.
1.117 +
1.118 + self._next_section()
1.119 + self._init_section()
1.120
1.121 def __len__(self):
1.122 return self.doc_frequency
1.123 @@ -549,7 +562,10 @@
1.124
1.125 def next(self):
1.126
1.127 - # Attempt to get the next document record from the section in the positions file.
1.128 + """
1.129 + Attempt to get the next document record from the section in the
1.130 + positions file.
1.131 + """
1.132
1.133 while 1:
1.134
1.135 @@ -566,13 +582,69 @@
1.136 # reading using the same file iterator (since the data should
1.137 # just follow on from the last section).
1.138
1.139 - self.next_section()
1.140 + self._next_section()
1.141 self.iterator.replenish(self.section_count)
1.142
1.143 - def next_section(self):
1.144 - self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.145 + def __getitem__(self, docnum):
1.146 +
1.147 + """
1.148 + Attempt to navigate to a positions entry for the given 'docnum',
1.149 + returning the positions, if present, or None otherwise.
1.150 + """
1.151 +
1.152 + # Read ahead in the index until the next entry refers to a document
1.153 + # later than the desired document.
1.154 +
1.155 + try:
1.156 + if self.next_docnum is None:
1.157 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.158 +
1.159 + while self.next_docnum <= docnum:
1.160 + self._next_read_section()
1.161 + self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.162 +
1.163 + except StopIteration:
1.164 + pass
1.165 +
1.166 + # Navigate in the position file to the document.
1.167 +
1.168 + self._init_section()
1.169
1.170 - def init_section(self):
1.171 + try:
1.172 + while 1:
1.173 + found_docnum, positions = self.iterator.next()
1.174 + if docnum == found_docnum:
1.175 + return positions
1.176 + elif docnum < found_docnum:
1.177 + return None
1.178 + except StopIteration:
1.179 + return None
1.180 +
1.181 + # Internal methods.
1.182 +
1.183 + def _next_section(self):
1.184 +
1.185 + "Attempt to get the next section in the index."
1.186 +
1.187 + if self.next_docnum is None:
1.188 + self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.189 + else:
1.190 + self._next_read_section()
1.191 + self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.192 +
1.193 + def _next_read_section(self):
1.194 +
1.195 + """
1.196 + Make the next index entry the current one without reading from the
1.197 + index.
1.198 + """
1.199 +
1.200 + self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
1.201 +
1.202 + def _init_section(self):
1.203 +
1.204 + "Initialise the iterator for the section in the position file."
1.205 +
1.206 self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
1.207
1.208 class TermWriter(FileWriter):
1.209 @@ -1534,17 +1606,22 @@
1.210 """
1.211
1.212 readers = []
1.213 - partitions = []
1.214 + partitions = set()
1.215
1.216 for filename in listdir(self.pathname):
1.217 if filename.startswith("terms-"): # 6 character prefix
1.218 partition = filename[6:]
1.219 readers.append(get_term_reader(self.pathname, partition))
1.220 - partitions.append(partition)
1.221 + partitions.add(partition)
1.222
1.223 # Write directly to a dictionary.
1.224
1.225 if len(readers) > 1:
1.226 + if "merged" in partitions:
1.227 + rename_term_files(self.pathname, "merged", "old-merged")
1.228 + partitions.remove("merged")
1.229 + partitions.add("old-merged")
1.230 +
1.231 writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
1.232 merger = TermDictionaryMerger(writer, readers)
1.233 merger.merge()
1.234 @@ -1555,25 +1632,32 @@
1.235 for partition in partitions:
1.236 remove_term_files(self.pathname, partition)
1.237
1.238 - elif len(readers) == 1 and partitions[0] != "merged":
1.239 - rename_term_files(self.pathname, partitions[0], "merged")
1.240 + elif len(readers) == 1:
1.241 + partition = list(partitions)[0]
1.242 + if partition != "merged":
1.243 + rename_term_files(self.pathname, partition, "merged")
1.244
1.245 def merge_fields(self, interval=FIELD_INTERVAL):
1.246
1.247 "Merge field dictionaries using the given indexing 'interval'."
1.248
1.249 readers = []
1.250 - partitions = []
1.251 + partitions = set()
1.252
1.253 for filename in listdir(self.pathname):
1.254 if filename.startswith("fields-"): # 7 character prefix
1.255 partition = filename[7:]
1.256 readers.append(get_field_reader(self.pathname, partition))
1.257 - partitions.append(partition)
1.258 + partitions.add(partition)
1.259
1.260 # Write directly to a dictionary.
1.261
1.262 if len(readers) > 1:
1.263 + if "merged" in partitions:
1.264 + rename_field_files(self.pathname, "merged", "old-merged")
1.265 + partitions.remove("merged")
1.266 + partitions.add("old-merged")
1.267 +
1.268 writer = get_field_writer(self.pathname, "merged", interval)
1.269 merger = FieldDictionaryMerger(writer, readers)
1.270 merger.merge()
1.271 @@ -1584,8 +1668,10 @@
1.272 for partition in partitions:
1.273 remove_field_files(self.pathname, partition)
1.274
1.275 - elif len(readers) == 1 and partitions[0] != "merged":
1.276 - rename_field_files(self.pathname, partitions[0], "merged")
1.277 + elif len(readers) == 1:
1.278 + partition = list(partitions)[0]
1.279 + if partition != "merged":
1.280 + rename_field_files(self.pathname, partition, "merged")
1.281
1.282 def close(self):
1.283 if self.reader is not None:
2.1 --- a/test.py Wed Sep 02 22:25:29 2009 +0200
2.2 +++ b/test.py Thu Sep 03 01:09:06 2009 +0200
2.3 @@ -383,6 +383,12 @@
2.4 ("sea", 2, [(36, [2, 6])])
2.5 ]
2.6
2.7 +position_tests = [
2.8 + ("Every", 14, [0]),
2.9 + ("sea", 36, [2, 6]),
2.10 + ("shells", 1, None)
2.11 + ]
2.12 +
2.13 index = iixr.Index("test_index")
2.14 wi = index.get_writer(3, 2, 6)
2.15 for docnum, text in docs:
2.16 @@ -400,6 +406,10 @@
2.17 for docnum, text in docs:
2.18 df = rd.get_fields(docnum)
2.19 print (123, text) == df[0], (123, text), df[0]
2.20 +for term, docnum, positions in position_tests:
2.21 + dp = rd.find_positions(term)
2.22 + pos = dp[docnum]
2.23 +print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
2.24 index.close()
2.25
2.26 # vim: tabstop=4 expandtab shiftwidth=4