# HG changeset patch # User Paul Boddie # Date 1297120107 -3600 # Node ID fc0e9882717b26d5f9e39fe6def1c055dcbe7bfb # Parent 1f3986bca1a3733f1047a32b45de700ac4473283 Moved the record handling into reset methods in order to have records encompass entire "pages" of stored data, rather than individual entries. Changed the term dictionary index to refer to the start of each "page" of term dictionary entries rather than the second entry. This is done so that the entire "page" or record can be loaded when such a "page" is requested, although it effectively prohibits direct traversal of the term dictionary without having to refer to the term dictionary index. Introduced a test for array exhaustion when reading variable-length integers from a particular starting position. diff -r 1f3986bca1a3 -r fc0e9882717b iixr/data.py --- a/iixr/data.py Mon Feb 07 02:05:38 2011 +0100 +++ b/iixr/data.py Tue Feb 08 00:08:27 2011 +0100 @@ -150,8 +150,11 @@ number. """ + length = len(bytes) + if start == length: + raise EOFError + number = 0 - length = len(bytes) digit = 0 while start < length: x = bytes[start] diff -r 1f3986bca1a3 -r fc0e9882717b iixr/fields.py --- a/iixr/fields.py Mon Feb 07 02:05:38 2011 +0100 +++ b/iixr/fields.py Tue Feb 08 00:08:27 2011 +0100 @@ -29,6 +29,7 @@ "Writing field data to files." def reset(self): + self.end_record() self.last_docnum = None self.subtractor = None @@ -47,8 +48,6 @@ self.subtractor = get_subtractor(docnum) docnum_seq = docnum - self.begin_record() - # Write the document number. self.write_sequence_value(docnum_seq) @@ -63,8 +62,6 @@ self.write_number(i) self.write_string(field, 1) # compress - self.end_record() - self.last_docnum = docnum class FieldReader(FileReader): @@ -74,6 +71,7 @@ def reset(self): self.last_docnum = None self.adder = None + self.begin_record() def read_fields(self): @@ -82,8 +80,6 @@ number and a list of field (identifier, value) pairs. """ - self.begin_record() - # Read the document number. docnum = self.read_sequence_value() @@ -109,8 +105,6 @@ fields.append((identifier, value)) i += 1 - self.end_record() - return self.last_docnum, fields def read_document_fields(self, docnum, offset): @@ -131,6 +125,7 @@ "Writing field index details to files." def reset(self): + self.end_record() self.last_docnum = None self.subtractor = None self.last_offset = 0 @@ -150,8 +145,6 @@ self.subtractor = get_subtractor(docnum) docnum_seq = docnum - self.begin_record() - # Write the document number. self.write_sequence_value(docnum_seq) @@ -159,7 +152,6 @@ # Write the offset delta. self.write_number(offset - self.last_offset) - self.end_record() self.last_docnum = docnum self.last_offset = offset @@ -172,13 +164,12 @@ self.last_docnum = None self.adder = None self.last_offset = 0 + self.begin_record() def read_document(self): "Read a document number and field file offset." - self.begin_record() - # Read the document number. docnum = self.read_sequence_value() @@ -192,7 +183,6 @@ # Read the offset. self.last_offset += self.read_number() - self.end_record() return self.last_docnum, self.last_offset @@ -211,6 +201,7 @@ "Write details of the document with the given 'docnum' and 'fields'." if self.entry % self.interval == 0: + self.field_writer.reset() offset = self.field_writer.tell() self.field_writer.write_fields(docnum, fields) self.field_index_writer.write_document(docnum, offset) @@ -230,6 +221,7 @@ def __init__(self, field_reader, field_index_reader): self.field_reader = field_reader self.field_index_reader = field_index_reader + self.entry = 0 self.cache = {} self.docs = [] @@ -267,7 +259,17 @@ "Return the next document number and fields." - return self.field_reader.read_fields() + try: + return self.field_reader.read_fields() + except EOFError: + self.entry += 1 + try: + found_docnum, offset = self.docs[self.entry] + except IndexError: + raise EOFError + else: + self.field_reader.reset() + return self.field_reader.read_fields() # Random access methods. diff -r 1f3986bca1a3 -r fc0e9882717b iixr/files.py --- a/iixr/files.py Mon Feb 07 02:05:38 2011 +0100 +++ b/iixr/files.py Tue Feb 08 00:08:27 2011 +0100 @@ -58,15 +58,17 @@ "Writing basic data types to files." def tell(self): + # NOTE: Will not be accurate within the current record. return self.f.tell() + len(self.data) def begin_record(self): pass def end_record(self): - vint_to_array(len(self.record), self.data) - self.data += self.record - self.record = array('B') + if self.record: + vint_to_array(len(self.record), self.data) + self.data += self.record + self.record = array('B') def write_number(self, number): @@ -132,6 +134,7 @@ def flush(self): if self.f is not None: + self.end_record() self.data.tofile(self.f) self.data = array('B') @@ -144,12 +147,16 @@ "Reading basic data types from files." def begin_record(self): - size = self.read_number_from_file() - self.record.fromfile(self.f, size) + self.record = array('B') self.start = 0 + try: + size = self.read_number_from_file() + self.record.fromfile(self.f, size) + except EOFError: + pass def end_record(self): - self.record = array('B') + pass def read_number_from_file(self): diff -r 1f3986bca1a3 -r fc0e9882717b iixr/positions.py --- a/iixr/positions.py Mon Feb 07 02:05:38 2011 +0100 +++ b/iixr/positions.py Tue Feb 08 00:08:27 2011 +0100 @@ -26,6 +26,7 @@ "Writing position information to files." def reset(self): + self.end_record() self.last_docnum = None self.subtractor = None @@ -56,10 +57,8 @@ self.subtractor = get_subtractor(docnum) docnum_seq = docnum - self.begin_record() self.write_sequence_value(docnum_seq) self.write_monotonic_sequence(positions) - self.end_record() self.last_docnum = docnum @@ -70,6 +69,7 @@ def reset(self): self.last_docnum = None self.adder = None + self.begin_record() def read_positions(self): @@ -77,8 +77,6 @@ Read positions, returning a document number and a list of positions. """ - self.begin_record() - # Read the document number. docnum = self.read_sequence_value() @@ -95,7 +93,6 @@ self.last_docnum = docnum positions = self.read_monotonic_sequence() - self.end_record() return self.last_docnum, positions @@ -104,6 +101,7 @@ "Writing position index information to files." def reset(self): + self.end_record() self.last_docnum = None self.subtractor = None self.last_pos_offset = 0 @@ -123,11 +121,9 @@ self.subtractor = get_subtractor(docnum) docnum_seq = docnum - self.begin_record() self.write_sequence_value(docnum_seq) self.write_number(pos_offset - self.last_pos_offset) self.write_number(count) - self.end_record() self.last_docnum = docnum self.last_pos_offset = pos_offset @@ -140,6 +136,7 @@ self.last_docnum = None self.adder = None self.last_pos_offset = 0 + self.begin_record() def read_positions(self): @@ -148,8 +145,6 @@ file, and the number of documents in a section of that file. """ - self.begin_record() - # Read the document number. docnum = self.read_sequence_value() @@ -167,7 +162,6 @@ # Read the document count. count = self.read_number() - self.end_record() return self.last_docnum, self.last_pos_offset, count @@ -339,14 +333,14 @@ self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) - first_offset = self.position_writer.tell() - first_docnum = None - # Reset the position writer so that position readers accessing # a section start with the correct document number. self.position_writer.reset() + first_offset = self.position_writer.tell() + first_docnum = None + # Finish writing an index entry for the remaining documents. else: diff -r 1f3986bca1a3 -r fc0e9882717b iixr/terms.py --- a/iixr/terms.py Mon Feb 07 02:05:38 2011 +0100 +++ b/iixr/terms.py Tue Feb 08 00:08:27 2011 +0100 @@ -29,6 +29,7 @@ "Writing term information to files." def reset(self): + self.end_record() self.last_term = "" self.last_offset = 0 @@ -40,14 +41,6 @@ term information file. """ - self.begin_record() - self._write_term(term, offset, frequency, doc_frequency) - self.end_record() - - def _write_term(self, term, offset, frequency, doc_frequency): - - "Performs the term writing for 'write_term'." - if term <= self.last_term: raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) @@ -79,6 +72,7 @@ def reset(self): self.last_term = "" self.last_offset = 0 + self.begin_record() def read_term(self): @@ -87,16 +81,6 @@ frequency from the term information file. """ - self.begin_record() - try: - return self._read_term() - finally: - self.end_record() - - def _read_term(self): - - "Performs the term reading for 'read_term'." - # Read the prefix length and term suffix. common = self.read_number() @@ -145,13 +129,11 @@ 'info_offset' in the term information file. """ - self.begin_record() - TermWriter._write_term(self, term, offset, frequency, doc_frequency) + TermWriter.write_term(self, term, offset, frequency, doc_frequency) # Write the information file offset delta. self.write_number(info_offset - self.last_info_offset) - self.end_record() self.last_info_offset = info_offset @@ -171,13 +153,11 @@ index file. """ - self.begin_record() - term, offset, frequency, doc_frequency = TermReader._read_term(self) + term, offset, frequency, doc_frequency = TermReader.read_term(self) # Read the offset delta. self.last_info_offset += self.read_number() - self.end_record() return term, offset, frequency, doc_frequency, self.last_info_offset @@ -197,16 +177,16 @@ """ Write the given 'term', its position file 'offset', its 'frequency' and its 'doc_frequency' (number of documents in which it appears) to the - term information file. Return the offset after the term information was + term information file. Return the offset before the term information was written to the file. """ - self.info_writer.write_term(term, offset, frequency, doc_frequency) - if self.entry % self.interval == 0: + self.info_writer.reset() info_offset = self.info_writer.tell() self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) + self.info_writer.write_term(term, offset, frequency, doc_frequency) self.entry += 1 def write_term_positions(self, term, doc_positions): @@ -236,6 +216,7 @@ self.info_reader = info_reader self.index_reader = index_reader self.position_dict_reader = position_dict_reader + self.entry = 0 self.terms = [] try: @@ -269,8 +250,10 @@ # as the closest. if i == -1: + self.entry = 0 return self.terms[0] else: + self.entry = i return self.terms[i] def _find_closest_term(self, term): @@ -297,10 +280,11 @@ # and scan for the desired term. else: - self.info_reader.go_to_term(found_term, offset, info_offset) + # Reset the term and offset for the new page. + self.info_reader.go_to_term("", 0, info_offset) try: while term > found_term: - found_term, offset, frequency, doc_frequency = self.info_reader.read_term() + found_term, offset, frequency, doc_frequency = self._read_term() except EOFError: pass @@ -355,6 +339,7 @@ # Sequential access methods. def rewind(self): + self.entry = 0 self.info_reader.rewind() def read_term(self): @@ -364,8 +349,28 @@ documents and positions at which the term is found. """ - term, offset, frequency, doc_frequency = self.info_reader.read_term() - return self._get_term_and_positions(term, offset, frequency, doc_frequency) + return self._get_term_and_positions(*self._read_term()) + + def _read_term(self): + + try: + term, offset, frequency, doc_frequency = self.info_reader.read_term() + except EOFError: + self.entry += 1 + try: + term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry] + except IndexError: + raise EOFError + else: + # Reset the term and offset for the new page. + + self.info_reader.go_to_term("", 0, info_offset) + + # Skip the term in the information file. + + self.info_reader.read_term() + + return term, offset, frequency, doc_frequency def go_to_term(self, term): @@ -380,7 +385,14 @@ # Position the reader, if necessary. if info_offset is not None: - self.info_reader.go_to_term(found_term, offset, info_offset) + + # Reset the term and offset for the new page. + + self.info_reader.go_to_term("", 0, info_offset) + + # Skip the term in the information file. + + self.info_reader.read_term() return found_term, offset, frequency, doc_frequency @@ -407,7 +419,7 @@ while found_term.startswith(term): terms.append(found_term) - found_term, offset, frequency, doc_frequency = self.info_reader.read_term() + found_term, offset, frequency, doc_frequency = self._read_term() except EOFError: pass