Fixed position writing to restart document numbering for each section of the position file. Fixed position dictionary iteration to reset the position iterator so that the document numbering of a newly encountered section is properly interpreted. Fixed position dictionary iteration across index entries, ensuring that a document at the start of a section, whose number is recorded in the index entry itself, is handled correctly. Removed unnecessary reader reset operations where iterators will be created with reset state anyway. Added a document frequency method to IndexReader. Added result caching to the position dictionary iterator in order to preserve record data for documents which were visited unintentionally.

     1.1 --- a/iixr.py	Thu Sep 03 01:09:06 2009 +0200
     1.2 +++ b/iixr.py	Thu Sep 03 22:10:43 2009 +0200
     1.3 @@ -282,8 +282,6 @@
     1.4          to 'count'.
     1.5          """
     1.6  
     1.7 -        self.reset()
     1.8 -
     1.9          # Duplicate the file handle.
    1.10  
    1.11          f = fdopen(dup(self.f.fileno()), "rb")
    1.12 @@ -362,10 +360,6 @@
    1.13          to 'doc_frequency'.
    1.14          """
    1.15  
    1.16 -        # NOTE: This is almost a duplication of PositionReader.read_term_positions.
    1.17 -
    1.18 -        self.reset()
    1.19 -
    1.20          # Duplicate the file handle.
    1.21  
    1.22          f = fdopen(dup(self.f.fileno()), "rb")
    1.23 @@ -494,6 +488,11 @@
    1.24                  first_docnum = None
    1.25                  count = 0
    1.26  
    1.27 +                # Reset the position writer so that position readers accessing
    1.28 +                # a section start with the correct document number.
    1.29 +
    1.30 +                self.position_writer.reset()
    1.31 +
    1.32          # Finish writing an index entry for the remaining documents.
    1.33  
    1.34          else:
    1.35 @@ -542,6 +541,10 @@
    1.36          self.doc_frequency = doc_frequency
    1.37          self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
    1.38  
    1.39 +        # Remember the last values.
    1.40 +
    1.41 +        self.found_docnum, self.found_positions = None, None
    1.42 +
    1.43          # Maintain state for the next index entry, if read.
    1.44  
    1.45          self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
    1.46 @@ -567,6 +570,15 @@
    1.47          positions file.
    1.48          """
    1.49  
    1.50 +        # Return any visited but unrequested record.
    1.51 +
    1.52 +        if self.found_docnum is not None:
    1.53 +            t = self.found_docnum, self.found_positions
    1.54 +            self.found_docnum, self.found_positions = None, None
    1.55 +            return t
    1.56 +
    1.57 +        # Or search for the next record.
    1.58 +
    1.59          while 1:
    1.60  
    1.61              # Either return the next record.
    1.62 @@ -585,13 +597,23 @@
    1.63                  self._next_section()
    1.64                  self.iterator.replenish(self.section_count)
    1.65  
    1.66 -    def __getitem__(self, docnum):
    1.67 +                # Reset the state of the iterator to make sure that document
    1.68 +                # numbers are correct.
    1.69 +
    1.70 +                self.iterator.reset()
    1.71 +
    1.72 +    def from_document(self, docnum):
    1.73  
    1.74          """
    1.75          Attempt to navigate to a positions entry for the given 'docnum',
    1.76 -        returning the positions, if present, or None otherwise.
    1.77 +        returning the positions for 'docnum', or None otherwise.
    1.78          """
    1.79  
    1.80 +        # Return any unrequested document positions.
    1.81 +
    1.82 +        if docnum == self.found_docnum:
    1.83 +            return self.found_positions
    1.84 +
    1.85          # Read ahead in the index until the next entry refers to a document
    1.86          # later than the desired document.
    1.87  
    1.88 @@ -599,9 +621,15 @@
    1.89              if self.next_docnum is None:
    1.90                  self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
    1.91  
    1.92 -            while self.next_docnum < docnum:
    1.93 +            # Read until the next entry is after the desired document number,
    1.94 +            # or until the end of the results.
    1.95 +
    1.96 +            while self.next_docnum <= docnum:
    1.97                  self._next_read_section()
    1.98 -                self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
    1.99 +                if self.docnum < docnum:
   1.100 +                    self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
   1.101 +                else:
   1.102 +                    break
   1.103  
   1.104          except StopIteration:
   1.105              pass
   1.106 @@ -612,11 +640,17 @@
   1.107  
   1.108          try:
   1.109              while 1:
   1.110 -                found_docnum, positions = self.iterator.next()
   1.111 +                found_docnum, found_positions = self.iterator.next()
   1.112 +
   1.113 +                # Return the desired document positions or those immediately
   1.114 +                # after.
   1.115 +
   1.116                  if docnum == found_docnum:
   1.117 -                    return positions
   1.118 -                elif docnum < found_docnum:
   1.119 +                    return found_positions
   1.120 +                elif docnum < self.found_docnum:
   1.121 +                    self.found_docnum, self.found_positions = found_docnum, found_positions
   1.122                      return None
   1.123 +
   1.124          except StopIteration:
   1.125              return None
   1.126  
   1.127 @@ -630,7 +664,6 @@
   1.128              self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
   1.129          else:
   1.130              self._next_read_section()
   1.131 -            self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
   1.132  
   1.133      def _next_read_section(self):
   1.134  
   1.135 @@ -640,6 +673,7 @@
   1.136          """
   1.137  
   1.138          self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
   1.139 +        self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
   1.140  
   1.141      def _init_section(self):
   1.142  
   1.143 @@ -829,10 +863,10 @@
   1.144  
   1.145      "Reading term dictionaries."
   1.146  
   1.147 -    def __init__(self, info_reader, index_reader, position_reader):
   1.148 +    def __init__(self, info_reader, index_reader, position_dict_reader):
   1.149          self.info_reader = info_reader
   1.150          self.index_reader = index_reader
   1.151 -        self.position_reader = position_reader
   1.152 +        self.position_dict_reader = position_dict_reader
   1.153  
   1.154          self.terms = []
   1.155          try:
   1.156 @@ -889,7 +923,7 @@
   1.157          self.info_reader.rewind()
   1.158  
   1.159      def _get_positions(self, offset, doc_frequency):
   1.160 -        return self.position_reader.read_term_positions(offset, doc_frequency)
   1.161 +        return self.position_dict_reader.read_term_positions(offset, doc_frequency)
   1.162  
   1.163      def read_term(self):
   1.164  
   1.165 @@ -938,7 +972,7 @@
   1.166      def close(self):
   1.167          self.info_reader.close()
   1.168          self.index_reader.close()
   1.169 -        self.position_reader.close()
   1.170 +        self.position_dict_reader.close()
   1.171  
   1.172  # Specific classes for storing document information.
   1.173  
   1.174 @@ -1548,6 +1582,9 @@
   1.175      def get_frequency(self, term):
   1.176          return self.dict_reader.get_frequency(term)
   1.177  
   1.178 +    def get_document_frequency(self, term):
   1.179 +        return self.dict_reader.get_document_frequency(term)
   1.180 +
   1.181      def get_fields(self, docnum):
   1.182          return self.field_dict_reader.get_fields(docnum)
   1.183  

     2.1 --- a/test.py	Thu Sep 03 01:09:06 2009 +0200
     2.2 +++ b/test.py	Thu Sep 03 22:10:43 2009 +0200
     2.3 @@ -322,6 +322,16 @@
     2.4      ("cat",       [(123, [12, 145, 196]), (1200, [113])])
     2.5      ]
     2.6  
     2.7 +position_dict_tests = [
     2.8 +    ("badger", 19, [55, 1333]),
     2.9 +    ("badger", 20, None),
    2.10 +    ("bull", 6, [128]),
    2.11 +    ("bull", 26, [1, 3, 5, 7, 9]),
    2.12 +    ("cat", 111, None),
    2.13 +    ("cat", 123, [12, 145, 196]),
    2.14 +    ("cat", 1234, None)
    2.15 +    ]
    2.16 +
    2.17  f = open("test", "wb")
    2.18  w = iixr.TermWriter(f)
    2.19  f2 = open("testI", "wb")
    2.20 @@ -355,6 +365,13 @@
    2.21      dp = rd.find_positions(term)
    2.22      print dp is None, dp
    2.23  
    2.24 +# (Test iterators.)
    2.25 +
    2.26 +for term, docnum, positions in position_dict_tests:
    2.27 +    dp = rd.find_positions(term)
    2.28 +    pos = dp.from_document(docnum)
    2.29 +    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
    2.30 +
    2.31  # (Test sequential access.)
    2.32  
    2.33  rd.rewind()
    2.34 @@ -386,7 +403,8 @@
    2.35  position_tests = [
    2.36      ("Every", 14, [0]),
    2.37      ("sea", 36, [2, 6]),
    2.38 -    ("shells", 1, None)
    2.39 +    ("shells", 1, None),
    2.40 +    ("shells", 37, None)
    2.41      ]
    2.42  
    2.43  index = iixr.Index("test_index")
    2.44 @@ -408,8 +426,8 @@
    2.45      print (123, text) == df[0], (123, text), df[0]
    2.46  for term, docnum, positions in position_tests:
    2.47      dp = rd.find_positions(term)
    2.48 -    pos = dp[docnum]
    2.49 -    print positions is None and positions is pos or positions == list(pos), positions, pos
    2.50 +    pos = dp.from_document(docnum)
    2.51 +    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
    2.52  index.close()
    2.53  
    2.54  # vim: tabstop=4 expandtab shiftwidth=4
2009-09-03	Paul Boddie	Fixed position writing to restart document numbering for each section of the position file. Fixed position dictionary iteration to reset the position iterator so that the document numbering of a newly encountered section is properly interpreted. Fixed position dictionary iteration across index entries, ensuring that entries at the start of sections, recorded in index entries themselves, are handled correctly. Removed unnecessary reader reset operations where iterators will be created with reset state anyway. Added a document frequency method to IndexReader. Added result caching to the position dictionary iterator in order to preserve record data for documents which were visited unintentionally.
			iixr.py (file) test.py (file)