# HG changeset patch # User Paul Boddie # Date 1297040738 -3600 # Node ID 1f3986bca1a3733f1047a32b45de700ac4473283 # Parent 4c35f0aa339cef62072023de65a56ae04c584106 Introduced record-oriented reading and writing of files where an array is populated in a single read from a file or flushed to a buffer in a single write operation. Moved various data representation operations into the data module, removing explicit object size concerns from the higher-level modules, replacing them with usage of adder and subtractor functions where appropriate. Made the vint caches lists instead of dictionaries. Enforced tuples as the input representation of serialised sequence values. diff -r 4c35f0aa339c -r 1f3986bca1a3 iixr/data.py --- a/iixr/data.py Thu Feb 03 01:26:35 2011 +0100 +++ b/iixr/data.py Mon Feb 07 02:05:38 2011 +0100 @@ -1,7 +1,7 @@ #!/usr/bin/env python """ -Variable-length integer functions. +Data representation functions. Copyright (C) 2009, 2010, 2011 Paul Boddie @@ -19,9 +19,76 @@ """ from array import array +import operator -vint_cache = {} -vint_bytes_cache = {} +# High-level representations. + +def convert_sequence(values, op): + if values: + new_values = list(values) + last = new_values[0] + i = 1 + length = len(new_values) + while i < length: + current = new_values[i] + new_values[i] = op(new_values[i], last) + last = current + i += 1 + +def add_seq_monotonic(x, y): + return op_seq_monotonic(x, y, operator.add) + +def sub_seq_monotonic(x, y): + return op_seq_monotonic(x, y, operator.sub) + +def op_seq_monotonic(x, y, op): + return tuple([op(a, b) for a, b in zip(x, y)]) + +def add_seq(x, y): + length = min(len(x), len(y)) + seq = list(x)[:length] + i = 0 + while i < length: + if x[i] != 0: + seq[i] = x[i] + y[i] + break + seq[i] = y[i] + i += 1 + return tuple(seq) + +def sub_seq(x, y): + length = min(len(x), len(y)) + seq = list(x)[:length] + i = 0 + while i < length: + replacement = x[i] - y[i] + if replacement != 0: + seq[i] = replacement + break + seq[i] = 0 + i += 1 + return tuple(seq) + +def is_sequence(value): + return isinstance(value, (list, tuple)) + +def get_monotonic_adder(value): + return is_sequence(value) and add_seq_monotonic or operator.add + +def get_monotonic_subtractor(value): + return is_sequence(value) and sub_seq_monotonic or operator.sub + +def get_adder(value): + return is_sequence(value) and add_seq or operator.add + +def get_subtractor(value): + return is_sequence(value) and sub_seq or operator.sub + +# Low-level representations. +# Variable-length integer functions. + +vint_cache = [] +vint_bytes_cache = [] def vint(number): @@ -29,7 +96,7 @@ try: return vint_cache[number] - except KeyError: + except IndexError: if number >= 0: bytes = array('B') _vint_to_array(number, bytes) @@ -46,7 +113,7 @@ try: bytes += vint_bytes_cache[number] - except KeyError: + except IndexError: if number >= 0: _vint_to_array(number, bytes) @@ -75,6 +142,28 @@ number += bytes.pop() & 127 return number +def vint_from_array_start(bytes, start): + + """ + Read a variable-length integer from 'bytes', starting at 'start', and + returning a tuple containing a number and the first position after the + number. + """ + + number = 0 + length = len(bytes) + digit = 0 + while start < length: + x = bytes[start] + number += (x & 127) << digit + digit += 7 + start += 1 + if not (x & 128): + break + return number, start + +# String serialisation. + def string_to_array(s, bytes): "Write the given string 's' to 'bytes'." @@ -82,10 +171,45 @@ vint_to_array(len(s), bytes) bytes.fromstring(s.encode("utf-8")) +# Sequence serialisation. + +def sequence_to_array(value, bytes): + + "Write the given sequence 'value' to 'bytes'." + + size = is_sequence(value) and len(value) or 0 + vint_to_array(size, bytes) + if size: + for a in value: + vint_to_array(a, bytes) + else: + vint_to_array(value, bytes) + +def sequence_from_array(bytes, start=0): + + """ + Read a sequence from 'bytes', returning the sequence and the first position + after the sequence. + """ + + size, start = vint_from_array_start(bytes, start) + if size: + j = 0 + value = [] + while j < size: + v, start = vint_from_array_start(bytes, start) + value.append(v) + j += 1 + return tuple(value), start + else: + return vint_from_array_start(bytes, start) + +# Variable-length integer cache initialisation. + for i in xrange(0, 65536): bytes = array('B') _vint_to_array(i, bytes) - vint_bytes_cache[i] = bytes - vint_cache[i] = bytes.tostring() + vint_bytes_cache.append(bytes) + vint_cache.append(bytes.tostring()) # vim: tabstop=4 expandtab shiftwidth=4 diff -r 4c35f0aa339c -r 1f3986bca1a3 iixr/fields.py --- a/iixr/fields.py Thu Feb 03 01:26:35 2011 +0100 +++ b/iixr/fields.py Mon Feb 07 02:05:38 2011 +0100 @@ -3,7 +3,7 @@ """ Specific classes for storing document information. -Copyright (C) 2009, 2010 Paul Boddie +Copyright (C) 2009, 2010, 2011 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -18,6 +18,7 @@ with this program. If not, see . """ +from iixr.data import * from iixr.files import * from bisect import bisect_right # to find terms in the dictionary index @@ -29,7 +30,7 @@ def reset(self): self.last_docnum = None - self.docnum_size = None + self.subtractor = None def write_fields(self, docnum, fields): @@ -40,15 +41,17 @@ # Find the size of document number values. - if self.docnum_size is None: - self.docnum_size = self.get_value_size(docnum) - self.last_docnum = self.get_initial_value(self.docnum_size) + if self.last_docnum is not None: + docnum_seq = self.subtractor(docnum, self.last_docnum) + else: + self.subtractor = get_subtractor(docnum) + docnum_seq = docnum - # Write the number of values per document number. - # Write the document number delta. + self.begin_record() - self.write_number(self.docnum_size) - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) + # Write the document number. + + self.write_sequence_value(docnum_seq) # Write the number of fields. @@ -60,12 +63,17 @@ self.write_number(i) self.write_string(field, 1) # compress + self.end_record() + + self.last_docnum = docnum + class FieldReader(FileReader): "Reading field data from files." def reset(self): self.last_docnum = None + self.adder = None def read_fields(self): @@ -74,16 +82,17 @@ number and a list of field (identifier, value) pairs. """ - # Read the number of values per document number. + self.begin_record() - docnum_size = self.read_number() + # Read the document number. + + docnum = self.read_sequence_value() - if self.last_docnum is None: - self.last_docnum = self.get_initial_value(docnum_size) - - # Read the document number delta and add it to the last number. - - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) + if self.last_docnum is not None: + self.last_docnum = self.adder(docnum, self.last_docnum) + else: + self.adder = get_adder(docnum) + self.last_docnum = docnum # Read the number of fields. @@ -100,6 +109,8 @@ fields.append((identifier, value)) i += 1 + self.end_record() + return self.last_docnum, fields def read_document_fields(self, docnum, offset): @@ -121,7 +132,7 @@ def reset(self): self.last_docnum = None - self.docnum_size = None + self.subtractor = None self.last_offset = 0 def write_document(self, docnum, offset): @@ -133,19 +144,24 @@ # Find the size of document number values. - if self.docnum_size is None: - self.docnum_size = self.get_value_size(docnum) - self.last_docnum = self.get_initial_value(self.docnum_size) + if self.last_docnum is not None: + docnum_seq = self.subtractor(docnum, self.last_docnum) + else: + self.subtractor = get_subtractor(docnum) + docnum_seq = docnum - # Write the number of values per document number. - # Write the document number delta. + self.begin_record() - self.write_number(self.docnum_size) - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) + # Write the document number. + + self.write_sequence_value(docnum_seq) # Write the offset delta. self.write_number(offset - self.last_offset) + self.end_record() + + self.last_docnum = docnum self.last_offset = offset class FieldIndexReader(FileReader): @@ -154,26 +170,29 @@ def reset(self): self.last_docnum = None + self.adder = None self.last_offset = 0 def read_document(self): "Read a document number and field file offset." - # Read the number of values per document number. + self.begin_record() - docnum_size = self.read_number() + # Read the document number. + + docnum = self.read_sequence_value() - if self.last_docnum is None: - self.last_docnum = self.get_initial_value(docnum_size) - - # Read the document number delta and add it to the last number. - - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) + if self.last_docnum is not None: + self.last_docnum = self.adder(docnum, self.last_docnum) + else: + self.adder = get_adder(docnum) + self.last_docnum = docnum # Read the offset. self.last_offset += self.read_number() + self.end_record() return self.last_docnum, self.last_offset diff -r 4c35f0aa339c -r 1f3986bca1a3 iixr/files.py --- a/iixr/files.py Thu Feb 03 01:26:35 2011 +0100 +++ b/iixr/files.py Mon Feb 07 02:05:38 2011 +0100 @@ -18,7 +18,7 @@ with this program. If not, see . """ -from iixr.data import vint, vint_to_array, vint_from_array +from iixr.data import * from array import array import zlib @@ -30,7 +30,8 @@ def __init__(self, f): self.f = f - self.data = array('B') + self.data = array('B') # master buffer + self.record = array('B') # record buffer self.reset() def reset(self): @@ -47,29 +48,11 @@ self.f.seek(0) self.reset() - def flush(self): - if self.f is not None: - self.data.tofile(self.f) - self.data = array('B') - def close(self): if self.f is not None: - self.data.tofile(self.f) self.f.close() self.f = None - def get_value_size(self, value): - if isinstance(value, (list, tuple)): - return len(value) - else: - return 0 - - def get_initial_value(self, size): - if size: - return [0] * size - else: - return 0 - class FileWriter(File): "Writing basic data types to files." @@ -77,18 +60,26 @@ def tell(self): return self.f.tell() + len(self.data) + def begin_record(self): + pass + + def end_record(self): + vint_to_array(len(self.record), self.data) + self.data += self.record + self.record = array('B') + def write_number(self, number): "Write 'number' to the file using a variable length encoding." - vint_to_array(number, self.data) + vint_to_array(number, self.record) def write_numbers(self, numbers): "Write 'numbers' to the file using a variable length encoding." for number in numbers: - vint_to_array(number, self.data) + vint_to_array(number, self.record) def write_string(self, s, compress=0): @@ -121,120 +112,122 @@ # Write the length of the data before the data itself. length = len(s) - self.data.fromstring("".join([flag, vint(length), s])) + self.record.fromstring("".join([flag, vint(length), s])) + + def write_sequence_value(self, value): + sequence_to_array(value, self.record) + + def write_sequence_values(self, values): + vint_to_array(len(values), self.record) + for value in values: + self.write_sequence_value(value) - def write_sequence(self, value, last, size, monotonic=1): - if size: - emit_delta = 1 - for v, l in map(None, value, last)[:size]: - if v is None: - v = l - if monotonic or emit_delta: - v_out = v - l - if emit_delta and v_out != 0: - emit_delta = 0 - else: - v_out = v + 1 - vint_to_array(v_out, self.data) - else: - vint_to_array(value - last, self.data) + def write_delta_sequence(self, values): + convert_sequence(values, get_subtractor(values[0])) + self.write_sequence_values(values) + + def write_monotonic_sequence(self, values): + convert_sequence(values, get_monotonic_subtractor(values[0])) + self.write_sequence_values(values) - return value + def flush(self): + if self.f is not None: + self.data.tofile(self.f) + self.data = array('B') + + def close(self): + self.flush() + File.close(self) class FileReader(File): "Reading basic data types from files." - def read_number(self): + def begin_record(self): + size = self.read_number_from_file() + self.record.fromfile(self.f, size) + self.start = 0 + + def end_record(self): + self.record = array('B') + + def read_number_from_file(self): "Read a number from the file." # Read each byte, adding it to the number. f = self.f - a = self.data + a = array('B') fromfile = a.fromfile - try: - fromfile(f, 1) - csd = a[-1] - if csd < 128: - return csd - else: - while csd & 128: - fromfile(f, 1) - csd = a[-1] - return vint_from_array(self.data) - finally: - self.data = array('B') + fromfile(f, 1) + csd = a[-1] + if csd < 128: + return csd + else: + while csd & 128: + fromfile(f, 1) + csd = a[-1] + return vint_from_array(a) + + def read_number(self): + + "Read a number from the current record." + + n, self.start = vint_from_array_start(self.record, self.start) + return n def read_string(self, decompress=0): """ - Read a string from the file, decompressing the stored data if + Read a string from the current record, decompressing the stored data if 'decompress' is set to a true value. """ # Decompress the data if requested. if decompress: - flag = self.f.read(1) + flag = chr(self.record[self.start]) + self.start += 1 else: flag = "-" length = self.read_number() + start = self.start + self.start += length + s = self.record[start:self.start].tostring() - try: - self.data.fromfile(self.f, length) - s = self.data.tostring() - - # Perform decompression if applicable. + # Perform decompression if applicable. - if flag == "z": - s = zlib.decompress(s) + if flag == "z": + s = zlib.decompress(s) - # Convert strings to Unicode objects. + # Convert strings to Unicode objects. - return unicode(s, "utf-8") + return unicode(s, "utf-8") - finally: - self.data = array('B') + def read_sequence_value(self): + value, self.start = sequence_from_array(self.record, self.start) + return value - def read_sequence(self, last, size, monotonic=1): - if size: - value = [] - if monotonic: - for v in last: - v_in = self.read_number() - value.append(v + v_in) - else: - i = 0 - n = len(last) - value = list(last) - - # Traverse a copy of the last value. - - while i < n: - v_in = self.read_number() + def read_sequences(self): + values = [] + length = self.read_number() + i = 0 + while i < length: + values.append(self.read_sequence_value()) + i += 1 + return values - # While zeros are read, retain the last value elements. - # Otherwise, add the delta... - - if v_in != 0: - value[i] += v_in - i += 1 - - # Then set absolute values for the remaining elements. + def read_delta_sequence(self): + values = self.read_sequences() + convert_sequence(values, get_adder(values[0])) + return values - while i < n: - value[i] = self.read_number() - 1 - i += 1 - break - - i += 1 - - return tuple(value) - else: - return last + self.read_number() + def read_monotonic_sequence(self): + values = self.read_sequences() + convert_sequence(values, get_monotonic_adder(values[0])) + return values # vim: tabstop=4 expandtab shiftwidth=4 diff -r 4c35f0aa339c -r 1f3986bca1a3 iixr/positions.py --- a/iixr/positions.py Thu Feb 03 01:26:35 2011 +0100 +++ b/iixr/positions.py Mon Feb 07 02:05:38 2011 +0100 @@ -3,7 +3,7 @@ """ Specific classes for storing position information. -Copyright (C) 2009, 2010 Paul Boddie +Copyright (C) 2009, 2010, 2011 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -18,8 +18,8 @@ with this program. If not, see . """ +from iixr.data import * from iixr.files import * -from iixr.data import vint, vint_to_array class PositionWriter(FileWriter): @@ -27,7 +27,7 @@ def reset(self): self.last_docnum = None - self.docnum_size = None + self.subtractor = None def write_positions(self, docnum, positions): @@ -35,39 +35,31 @@ Write for the document 'docnum' the given 'positions'. """ - # Find the size of document number values. - - if self.docnum_size is None: - self.docnum_size = self.get_value_size(docnum) - self.last_docnum = self.get_initial_value(self.docnum_size) - - if docnum < self.last_docnum: - raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) + if not positions: + return # Make sure that the positions are sorted. positions.sort() - # Find the size of position values. - - size = self.get_value_size(positions[0]) + # Calculate an ongoing delta. - # Write the number of values per document number. - # Write the document number delta. - # Write the number of positions. - # Write the number of values per position. + if self.last_docnum is not None: + if docnum < self.last_docnum: + raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) + + docnum_seq = self.subtractor(docnum, self.last_docnum) - self.write_number(self.docnum_size) - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) - self.write_number(len(positions)) - self.write_number(size) + # Or preserve the document number and prepare for future deltas. - # Write the position deltas. + else: + self.subtractor = get_subtractor(docnum) + docnum_seq = docnum - last = self.get_initial_value(size) - - for position in positions: - last = self.write_sequence(position, last, size) + self.begin_record() + self.write_sequence_value(docnum_seq) + self.write_monotonic_sequence(positions) + self.end_record() self.last_docnum = docnum @@ -77,6 +69,7 @@ def reset(self): self.last_docnum = None + self.adder = None def read_positions(self): @@ -84,38 +77,25 @@ Read positions, returning a document number and a list of positions. """ - # Read the number of values per document number. + self.begin_record() - docnum_size = self.read_number() - - if self.last_docnum is None: - self.last_docnum = self.get_initial_value(docnum_size) + # Read the document number. - # Read the document number delta and add it to the last number. - - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) + docnum = self.read_sequence_value() - # Read the number of positions. - - npositions = self.read_number() + # Calculate an ongoing delta. - # Read the number of values per position. - - size = self.read_number() + if self.last_docnum is not None: + self.last_docnum = self.adder(docnum, self.last_docnum) - # Read the position deltas, adding each previous position to get the - # appropriate collection of absolute positions. - - i = 0 + # Or preserve the document number and prepare for future deltas. - last = self.get_initial_value(size) - - positions = [] + else: + self.adder = get_adder(docnum) + self.last_docnum = docnum - while i < npositions: - last = self.read_sequence(last, size) - positions.append(last) - i += 1 + positions = self.read_monotonic_sequence() + self.end_record() return self.last_docnum, positions @@ -125,7 +105,7 @@ def reset(self): self.last_docnum = None - self.docnum_size = None + self.subtractor = None self.last_pos_offset = 0 def write_positions(self, docnum, pos_offset, count): @@ -137,20 +117,19 @@ # Find the size of document number values. - if self.docnum_size is None: - self.docnum_size = self.get_value_size(docnum) - self.last_docnum = self.get_initial_value(self.docnum_size) + if self.last_docnum is not None: + docnum_seq = self.subtractor(docnum, self.last_docnum) + else: + self.subtractor = get_subtractor(docnum) + docnum_seq = docnum - # Write the number of values per document number. - # Write the document number delta. - # Write the position file offset delta. - # Write the document count. - - self.write_number(self.docnum_size) - self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0) + self.begin_record() + self.write_sequence_value(docnum_seq) self.write_number(pos_offset - self.last_pos_offset) self.write_number(count) + self.end_record() + self.last_docnum = docnum self.last_pos_offset = pos_offset class PositionIndexReader(FileReader): @@ -159,6 +138,7 @@ def reset(self): self.last_docnum = None + self.adder = None self.last_pos_offset = 0 def read_positions(self): @@ -168,16 +148,17 @@ file, and the number of documents in a section of that file. """ - # Read the number of values per document number. + self.begin_record() - docnum_size = self.read_number() + # Read the document number. + + docnum = self.read_sequence_value() - if self.last_docnum is None: - self.last_docnum = self.get_initial_value(docnum_size) - - # Read the document number delta and add it to the last number. - - self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0) + if self.last_docnum is not None: + self.last_docnum = self.adder(docnum, self.last_docnum) + else: + self.adder = get_adder(docnum) + self.last_docnum = docnum # Read the offset delta. @@ -186,6 +167,7 @@ # Read the document count. count = self.read_number() + self.end_record() return self.last_docnum, self.last_pos_offset, count diff -r 4c35f0aa339c -r 1f3986bca1a3 iixr/terms.py --- a/iixr/terms.py Thu Feb 03 01:26:35 2011 +0100 +++ b/iixr/terms.py Mon Feb 07 02:05:38 2011 +0100 @@ -3,7 +3,7 @@ """ Specific classes for storing term information. -Copyright (C) 2009, 2010 Paul Boddie +Copyright (C) 2009, 2010, 2011 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -40,6 +40,14 @@ term information file. """ + self.begin_record() + self._write_term(term, offset, frequency, doc_frequency) + self.end_record() + + def _write_term(self, term, offset, frequency, doc_frequency): + + "Performs the term writing for 'write_term'." + if term <= self.last_term: raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) @@ -79,6 +87,16 @@ frequency from the term information file. """ + self.begin_record() + try: + return self._read_term() + finally: + self.end_record() + + def _read_term(self): + + "Performs the term reading for 'read_term'." + # Read the prefix length and term suffix. common = self.read_number() @@ -127,11 +145,14 @@ 'info_offset' in the term information file. """ - TermWriter.write_term(self, term, offset, frequency, doc_frequency) + self.begin_record() + TermWriter._write_term(self, term, offset, frequency, doc_frequency) # Write the information file offset delta. self.write_number(info_offset - self.last_info_offset) + self.end_record() + self.last_info_offset = info_offset class TermIndexReader(TermReader): @@ -150,11 +171,13 @@ index file. """ - term, offset, frequency, doc_frequency = TermReader.read_term(self) + self.begin_record() + term, offset, frequency, doc_frequency = TermReader._read_term(self) # Read the offset delta. self.last_info_offset += self.read_number() + self.end_record() return term, offset, frequency, doc_frequency, self.last_info_offset diff -r 4c35f0aa339c -r 1f3986bca1a3 test.py --- a/test.py Thu Feb 03 01:26:35 2011 +0100 +++ b/test.py Mon Feb 07 02:05:38 2011 +0100 @@ -32,49 +32,53 @@ f = open("test", "wb") w = FileWriter(f) +w.begin_record() for number in numbers: w.write_number(number) +w.end_record() w.close() f = open("test", "rb") r = FileReader(f) +r.begin_record() for number in numbers: n = r.read_number() print number == n, number, n +r.end_record() r.close() tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] f = open("testMS", "wb") w = FileWriter(f) -last = w.get_initial_value(2) -for t in tuples: - last = w.write_sequence(t, last, 2) +w.begin_record() +w.write_monotonic_sequence(tuples) +w.end_record() w.close() f = open("testMS", "rb") r = FileReader(f) -last = r.get_initial_value(2) -for t in tuples: - last = t2 = r.read_sequence(last, 2) +r.begin_record() +for t, t2 in zip(r.read_monotonic_sequence(), tuples): print t == t2, t, t2 +r.end_record() r.close() tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] f = open("testNMS", "wb") w = FileWriter(f) -last = w.get_initial_value(2) -for t in tuples2: - last = w.write_sequence(t, last, 2, monotonic=0) +w.begin_record() +w.write_delta_sequence(tuples2) +w.end_record() w.close() f = open("testNMS", "rb") r = FileReader(f) -last = r.get_initial_value(2) -for t in tuples2: - last = t2 = r.read_sequence(last, 2, monotonic=0) +r.begin_record() +for t, t2 in zip(r.read_delta_sequence(), tuples2): print t == t2, t, t2 +r.end_record() r.close() print "- Test positions." @@ -138,8 +142,8 @@ for doc_positions in all_doc_positions_seq: for docnum, positions in doc_positions: d, p = r.read_positions() - print tuple(docnum) == tuple(d), docnum, d - print tuple(positions) == tuple(p), positions, p + print docnum == d, docnum, d + print positions == p, positions, p r.reset() r.close()