# HG changeset patch # User Paul Boddie # Date 1254531812 -7200 # Node ID 89465c390a4687b2fb173e5b0361c9e5652db8df # Parent 353e83aab58d83db26600c12db8da608a8fb0769 Added a document cache, used when reading fields. Optimised read_number slightly using arrays. diff -r 353e83aab58d -r 89465c390a46 iixr/fields.py --- a/iixr/fields.py Fri Oct 02 00:22:10 2009 +0200 +++ b/iixr/fields.py Sat Oct 03 03:03:32 2009 +0200 @@ -21,6 +21,8 @@ from iixr.files import * from bisect import bisect_right # to find terms in the dictionary index +DOCUMENT_CACHE_LIMIT = 10000 + class FieldWriter(FileWriter): "Writing field data to files." @@ -176,6 +178,7 @@ self.field_reader = field_reader self.field_index_reader = field_index_reader + self.cache = {} self.docs = [] try: while 1: @@ -219,6 +222,9 @@ "Read the fields of the document with the given 'docnum'." + if self.cache.has_key(docnum): + return self.cache[docnum] + i = bisect_right(self.docs, (docnum, self.max_offset)) - 1 # Get the entry position providing the term or one preceding it. @@ -243,6 +249,16 @@ # If the document is found, return the fields. if docnum == found_docnum: + + # Store the fields in the cache, removing entries if the limit has + # been reached. + + keys = self.cache.keys() + + if len(keys) == DOCUMENT_CACHE_LIMIT: + del self.cache[keys[0]] + + self.cache[docnum] = fields return fields else: return None diff -r 353e83aab58d -r 89465c390a46 iixr/files.py --- a/iixr/files.py Fri Oct 02 00:22:10 2009 +0200 +++ b/iixr/files.py Sat Oct 03 03:03:32 2009 +0200 @@ -19,6 +19,7 @@ """ from iixr.data import vint +from array import array import zlib # Constants. @@ -105,24 +106,19 @@ # Read each byte, adding it to the number. - read = self.f.read + a = array('B') + fromfile = a.fromfile + f = self.f - c = read(1) - if c: - csd = ord(c) - if csd < 128: - return csd - else: - shift = 0 - number = 0 - while csd & 128: - number += ((csd & 127) << shift) - shift += 7 - csd = ord(read(1)) - else: - return number + (csd << shift) + fromfile(f, 1) + csd = a[-1] + if csd < 128: + return csd else: - raise EOFError + while csd & 128: + fromfile(f, 1) + csd = a[-1] + return sum([((csd & 127) << (number * 7)) for (number, csd) in enumerate(a)]) def read_string(self, decompress=0):