# HG changeset patch # User Paul Boddie # Date 1296692795 -3600 # Node ID 4c35f0aa339cef62072023de65a56ae04c584106 # Parent b50ba4291c5c6e1e40590f1860a97b1b7f6dd8f3 Changed the files to have an internal array for reading and writing data. diff -r b50ba4291c5c -r 4c35f0aa339c iixr/fields.py --- a/iixr/fields.py Fri Jan 28 01:36:25 2011 +0100 +++ b/iixr/fields.py Thu Feb 03 01:26:35 2011 +0100 @@ -47,14 +47,12 @@ # Write the number of values per document number. # Write the document number delta. - output = array('B') - vint_to_array(self.docnum_size, output) - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) + self.write_number(self.docnum_size) + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) # Write the number of fields. - vint_to_array(len(fields), output) - output.tofile(self.f) + self.write_number(len(fields)) # Write the fields themselves. @@ -142,10 +140,8 @@ # Write the number of values per document number. # Write the document number delta. - output = array('B') - vint_to_array(self.docnum_size, output) - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) - output.tofile(self.f) + self.write_number(self.docnum_size) + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) # Write the offset delta. @@ -196,7 +192,7 @@ "Write details of the document with the given 'docnum' and 'fields'." if self.entry % self.interval == 0: - offset = self.field_writer.f.tell() + offset = self.field_writer.tell() self.field_writer.write_fields(docnum, fields) self.field_index_writer.write_document(docnum, offset) else: diff -r b50ba4291c5c -r 4c35f0aa339c iixr/files.py --- a/iixr/files.py Fri Jan 28 01:36:25 2011 +0100 +++ b/iixr/files.py Thu Feb 03 01:26:35 2011 +0100 @@ -30,6 +30,7 @@ def __init__(self, f): self.f = f + self.data = array('B') self.reset() def reset(self): @@ -46,8 +47,14 @@ self.f.seek(0) self.reset() + def flush(self): + if self.f is not None: + self.data.tofile(self.f) + self.data = array('B') + def close(self): if self.f is not None: + self.data.tofile(self.f) self.f.close() self.f = None @@ -67,23 +74,21 @@ "Writing basic data types to files." - def __init__(self, f): - File.__init__(self, f) + def tell(self): + return self.f.tell() + len(self.data) def write_number(self, number): "Write 'number' to the file using a variable length encoding." - self.f.write(vint(number)) + vint_to_array(number, self.data) def write_numbers(self, numbers): "Write 'numbers' to the file using a variable length encoding." - output = array('B') for number in numbers: - vint_to_array(number, output) - output.tofile(self.f) + vint_to_array(number, self.data) def write_string(self, s, compress=0): @@ -116,9 +121,9 @@ # Write the length of the data before the data itself. length = len(s) - self.f.write("".join([flag, vint(length), s])) + self.data.fromstring("".join([flag, vint(length), s])) - def write_sequence(self, output, value, last, size, monotonic=1): + def write_sequence(self, value, last, size, monotonic=1): if size: emit_delta = 1 for v, l in map(None, value, last)[:size]: @@ -130,9 +135,9 @@ emit_delta = 0 else: v_out = v + 1 - vint_to_array(v_out, output) + vint_to_array(v_out, self.data) else: - vint_to_array(value - last, output) + vint_to_array(value - last, self.data) return value @@ -140,28 +145,28 @@ "Reading basic data types from files." - def __init__(self, f): - File.__init__(self, f) - def read_number(self): "Read a number from the file." # Read each byte, adding it to the number. - a = array('B') + f = self.f + a = self.data fromfile = a.fromfile - f = self.f - fromfile(f, 1) - csd = a[-1] - if csd < 128: - return csd - else: - while csd & 128: - fromfile(f, 1) - csd = a[-1] - return vint_from_array(a) + try: + fromfile(f, 1) + csd = a[-1] + if csd < 128: + return csd + else: + while csd & 128: + fromfile(f, 1) + csd = a[-1] + return vint_from_array(self.data) + finally: + self.data = array('B') def read_string(self, decompress=0): @@ -170,26 +175,30 @@ 'decompress' is set to a true value. """ - read = self.f.read - # Decompress the data if requested. if decompress: - flag = read(1) + flag = self.f.read(1) else: flag = "-" length = self.read_number() - s = read(length) - # Perform decompression if applicable. + try: + self.data.fromfile(self.f, length) + s = self.data.tostring() + + # Perform decompression if applicable. - if flag == "z": - s = zlib.decompress(s) + if flag == "z": + s = zlib.decompress(s) + + # Convert strings to Unicode objects. - # Convert strings to Unicode objects. + return unicode(s, "utf-8") - return unicode(s, "utf-8") + finally: + self.data = array('B') def read_sequence(self, last, size, monotonic=1): if size: diff -r b50ba4291c5c -r 4c35f0aa339c iixr/positions.py --- a/iixr/positions.py Fri Jan 28 01:36:25 2011 +0100 +++ b/iixr/positions.py Thu Feb 03 01:26:35 2011 +0100 @@ -20,7 +20,6 @@ from iixr.files import * from iixr.data import vint, vint_to_array -from array import array class PositionWriter(FileWriter): @@ -58,20 +57,17 @@ # Write the number of positions. # Write the number of values per position. - output = array('B') - vint_to_array(self.docnum_size, output) - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) - vint_to_array(len(positions), output) - vint_to_array(size, output) + self.write_number(self.docnum_size) + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) + self.write_number(len(positions)) + self.write_number(size) # Write the position deltas. last = self.get_initial_value(size) for position in positions: - last = self.write_sequence(output, position, last, size) - - output.tofile(self.f) + last = self.write_sequence(position, last, size) self.last_docnum = docnum @@ -150,15 +146,10 @@ # Write the position file offset delta. # Write the document count. - output = array('B') - vint_to_array(self.docnum_size, output) - self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0) - vint_to_array(pos_offset - self.last_pos_offset, output) - vint_to_array(count, output) - - # Actually write the data. - - output.tofile(self.f) + self.write_number(self.docnum_size) + self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) + self.write_number(pos_offset - self.last_pos_offset) + self.write_number(count) self.last_pos_offset = pos_offset @@ -335,7 +326,7 @@ # Remember the first index entry offset. - index_offset = self.position_index_writer.f.tell() + index_offset = self.position_index_writer.tell() # Write the positions. @@ -346,7 +337,7 @@ # Retain the first record offset for a subsequent index entry. - first_offset = self.position_writer.f.tell() + first_offset = self.position_writer.tell() first_docnum = None doc_positions.sort() @@ -366,7 +357,7 @@ self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) - first_offset = self.position_writer.f.tell() + first_offset = self.position_writer.tell() first_docnum = None # Reset the position writer so that position readers accessing diff -r b50ba4291c5c -r 4c35f0aa339c iixr/terms.py --- a/iixr/terms.py Fri Jan 28 01:36:25 2011 +0100 +++ b/iixr/terms.py Thu Feb 03 01:26:35 2011 +0100 @@ -181,7 +181,7 @@ self.info_writer.write_term(term, offset, frequency, doc_frequency) if self.entry % self.interval == 0: - info_offset = self.info_writer.f.tell() + info_offset = self.info_writer.tell() self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) self.entry += 1 diff -r b50ba4291c5c -r 4c35f0aa339c test.py --- a/test.py Fri Jan 28 01:36:25 2011 +0100 +++ b/test.py Thu Feb 03 01:26:35 2011 +0100 @@ -5,7 +5,6 @@ from iixr.terms import * from iixr.positions import * from iixr.index import * -from array import array import os, sys # Remove old test files. @@ -48,11 +47,9 @@ f = open("testMS", "wb") w = FileWriter(f) -b = array("B") last = w.get_initial_value(2) for t in tuples: - last = w.write_sequence(b, t, last, 2) -b.tofile(w.f) + last = w.write_sequence(t, last, 2) w.close() f = open("testMS", "rb") @@ -67,11 +64,9 @@ f = open("testNMS", "wb") w = FileWriter(f) -b = array("B") last = w.get_initial_value(2) for t in tuples2: - last = w.write_sequence(b, t, last, 2, monotonic=0) -b.tofile(w.f) + last = w.write_sequence(t, last, 2, monotonic=0) w.close() f = open("testNMS", "rb") @@ -170,7 +165,7 @@ w.reset() for docnum, pos_offset, count in term_positions: if offset is None: - offset = w.f.tell() + offset = w.tell() w.write_positions(docnum, pos_offset, count) doc_frequency += count offsets.append((offset, doc_frequency))