# HG changeset patch # User Paul Boddie # Date 1253300855 -7200 # Node ID 7189d6ef001f984dd34703904850fa6e221252f6 # Parent 8d0f465630d2fd6f0ee511e962d876551dedac25 Introduced various optimisations: increasing the vint cache and introducing array usage instead of lists, removing unnecessary tell operations, restructuring position writing to work with explicit tell invocations, and restructuring number reading slightly. diff -r 8d0f465630d2 -r 7189d6ef001f iixr/data.py --- a/iixr/data.py Fri Sep 18 01:39:08 2009 +0200 +++ b/iixr/data.py Fri Sep 18 21:07:35 2009 +0200 @@ -18,6 +18,8 @@ with this program. If not, see <http://www.gnu.org/licenses/>. """ +from array import array + vint_cache = {} def vint(number): @@ -31,23 +33,22 @@ # Write the number from least to most significant digits. - bytes = [] + bytes = array('B') while number > 127: - lsd = number & 127 | 128 - bytes.append(chr(lsd)) + bytes.append(number & 127 | 128) number = number >> 7 else: - bytes.append(chr(number)) + bytes.append(number) - return "".join(bytes) + return bytes.tostring() # Negative numbers are not supported. else: raise ValueError, "Number %r is negative." % number -for i in xrange(0, 1024): +for i in xrange(0, 65536): vint_cache[i] = vint(i) # vim: tabstop=4 expandtab shiftwidth=4 diff -r 8d0f465630d2 -r 7189d6ef001f iixr/fields.py --- a/iixr/fields.py Fri Sep 18 01:39:08 2009 +0200 +++ b/iixr/fields.py Fri Sep 18 21:07:35 2009 +0200 @@ -33,11 +33,8 @@ """ Write for the given 'docnum', a list of 'fields' (integer, string pairs representing field identifiers and values respectively). - Return the offset at which the fields are stored. """ - offset = self.f.tell() - # Write the document number delta. self.write_number(docnum - self.last_docnum) @@ -53,7 +50,6 @@ self.write_string(field, 1) # compress self.last_docnum = docnum - return offset class FieldReader(FileReader): @@ -159,10 +155,12 @@ "Write details of the document with the given 'docnum' and 'fields'." 
- offset = self.field_writer.write_fields(docnum, fields) - if self.entry % self.interval == 0: + offset = self.field_writer.f.tell() + self.field_writer.write_fields(docnum, fields) self.field_index_writer.write_document(docnum, offset) + else: + self.field_writer.write_fields(docnum, fields) self.entry += 1 diff -r 8d0f465630d2 -r 7189d6ef001f iixr/files.py --- a/iixr/files.py Fri Sep 18 01:39:08 2009 +0200 +++ b/iixr/files.py Fri Sep 18 21:07:35 2009 +0200 @@ -105,23 +105,25 @@ # Read each byte, adding it to the number. - shift = 0 - number = 0 read = self.f.read - try: - csd = ord(read(1)) - while csd & 128: - number += ((csd & 127) << shift) - shift += 7 - csd = ord(read(1)) + c = read(1) + if c: + csd = ord(c) + if csd < 128: + return csd else: - number += (csd << shift) - except TypeError: + shift = 0 + number = 0 + while csd & 128: + number += ((csd & 127) << shift) + shift += 7 + csd = ord(read(1)) + else: + return number + (csd << shift) + else: raise EOFError - return number - def read_string(self, decompress=0): """ diff -r 8d0f465630d2 -r 7189d6ef001f iixr/positions.py --- a/iixr/positions.py Fri Sep 18 01:39:08 2009 +0200 +++ b/iixr/positions.py Fri Sep 18 21:07:35 2009 +0200 @@ -32,16 +32,11 @@ """ Write for the document 'docnum' the given 'positions'. - Return the offset of the written record. """ if docnum < self.last_docnum: raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) - # Record the offset of this record. - - offset = self.f.tell() - # Make sure that the positions are sorted. positions.sort() @@ -63,7 +58,6 @@ self.f.write("".join([vint(x) for x in output])) self.last_docnum = docnum - return offset class PositionOpener(FileOpener): @@ -97,10 +91,6 @@ position index file. """ - # Record the offset of this record. - - offset = self.f.tell() - # Write the document number delta. # Write the position file offset delta. # Write the document count. 
@@ -113,7 +103,6 @@ self.last_pos_offset = pos_offset self.last_docnum = docnum - return offset class PositionIndexOpener(FileOpener): @@ -274,57 +263,52 @@ self.position_writer.reset() self.position_index_writer.reset() - index_offset = None + # Remember the first index entry offset. + + index_offset = self.position_index_writer.f.tell() # Write the positions. frequency = 0 - first_docnum = None - first_offset = None count = 0 - doc_positions.sort() - - for docnum, positions in doc_positions: - pos_offset = self.position_writer.write_positions(docnum, positions) + if doc_positions: # Retain the first record offset for a subsequent index entry. - if first_offset is None: - first_offset = pos_offset - first_docnum = docnum + first_offset = self.position_writer.f.tell() + first_docnum = None - frequency += len(positions) - count += 1 + doc_positions.sort() - # Every {interval} entries, write an index entry. - - if count % self.interval == 0: - io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) + for docnum, positions in doc_positions: + if first_docnum is None: + first_docnum = docnum - # Remember the first index entry offset. + self.position_writer.write_positions(docnum, positions) - if index_offset is None: - index_offset = io + frequency += len(positions) + count += 1 + + # Every {interval} entries, write an index entry. - first_offset = None - first_docnum = None + if count % self.interval == 0: - # Reset the position writer so that position readers accessing - # a section start with the correct document number. + self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) - self.position_writer.reset() + first_offset = self.position_writer.f.tell() + first_docnum = None - # Finish writing an index entry for the remaining documents. + # Reset the position writer so that position readers accessing + # a section start with the correct document number. 
+ + self.position_writer.reset() - else: - if first_offset is not None: - io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) + # Finish writing an index entry for the remaining documents. - # Remember the first index entry offset. - - if index_offset is None: - index_offset = io + else: + if first_docnum is not None: + self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) return index_offset, frequency, count diff -r 8d0f465630d2 -r 7189d6ef001f iixr/terms.py --- a/iixr/terms.py Fri Sep 18 01:39:08 2009 +0200 +++ b/iixr/terms.py Fri Sep 18 21:07:35 2009 +0200 @@ -36,8 +36,7 @@ """ Write the given 'term', its position file 'offset', its 'frequency' and its 'doc_frequency' (number of documents in which it appears) to the - term information file. Return the offset after the term information was - written to the file. + term information file. """ # Write the prefix length and term suffix. @@ -63,8 +62,6 @@ self.last_term = term self.last_offset = offset - return self.f.tell() - class TermReader(FileReader): "Reading term information from files." @@ -179,9 +176,10 @@ written to the file. """ - info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency) + self.info_writer.write_term(term, offset, frequency, doc_frequency) if self.entry % self.interval == 0: + info_offset = self.info_writer.f.tell() self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) self.entry += 1 diff -r 8d0f465630d2 -r 7189d6ef001f test.py --- a/test.py Fri Sep 18 01:39:08 2009 +0200 +++ b/test.py Fri Sep 18 21:07:35 2009 +0200 @@ -94,9 +94,9 @@ doc_frequency = 0 w.reset() for docnum, pos_offset, count in term_positions: - io = w.write_positions(docnum, pos_offset, count) if offset is None: - offset = io + offset = w.f.tell() + w.write_positions(docnum, pos_offset, count) doc_frequency += count offsets.append((offset, doc_frequency)) w.close()