# HG changeset patch # User Paul Boddie # Date 1252711939 -7200 # Node ID 628c5c3881099d567d1232ed5bcc6a04df3682b2 # Parent f0f1799c9f4cb262d1bcf9fd4895f138214c8750 Moved cache-affected writing methods into the FileWriter class. Fixed cache flushing in FileWriter to use the cache length, not the number of list elements. Introduced caching into the FileReader class. Introduced a seek method into FileReader in order to work with the caching, altering iterator construction. diff -r f0f1799c9f4c -r 628c5c388109 iixr.py --- a/iixr.py Sat Sep 12 00:31:31 2009 +0200 +++ b/iixr.py Sat Sep 12 01:32:19 2009 +0200 @@ -99,8 +99,6 @@ def __init__(self, f): self.f = f self.reset() - self.cache = [] - self.cache_length = 0 def reset(self): @@ -109,22 +107,20 @@ pass def rewind(self): - self.f.seek(0) + self.seek(0) self.reset() - def write(self, s): - self.cache.append(s) - self.cache_length += len(s) - if len(self.cache) >= 1000: - self.flush() - - def tell(self): - return self.f.tell() + self.cache_length + def seek(self, offset): + + "To be defined by readers." + + pass def flush(self): - self.f.write("".join(self.cache)) - self.cache = [] - self.cache_length = 0 + + "To be defined by writers." + + pass def close(self): if self.f is not None: @@ -136,6 +132,11 @@ "Writing basic data types to files." + def __init__(self, f): + File.__init__(self, f) + self.cache = [] + self.cache_length = 0 + def write_number(self, number): "Write 'number' to the file using a variable length encoding." @@ -176,10 +177,31 @@ length = len(s) self.write(flag + vint(length) + s) + # Cache-affected methods. + + def write(self, s): + self.cache.append(s) + self.cache_length += len(s) + if self.cache_length >= 1000: + self.flush() + + def tell(self): + return self.f.tell() + self.cache_length + + def flush(self): + self.f.write("".join(self.cache)) + self.cache = [] + self.cache_length = 0 + class FileReader(File): "Reading basic data types from files." + def __init__(self, f): + File.__init__(self, f) + self.cache = "" + self.cache_length = 0 + def read_number(self): "Read a number from the file." @@ -188,7 +210,7 @@ shift = 0 number = 0 - read = self.f.read + read = self.read try: csd = ord(read(1)) @@ -213,12 +235,12 @@ # Decompress the data if requested. if decompress: - flag = self.f.read(1) + flag = self.read(1) else: flag = "-" length = self.read_number() - s = self.f.read(length) + s = self.read(length) # Perform decompression if applicable. @@ -230,6 +252,28 @@ return unicode(s, "utf-8") + # Cache-affected methods. + + def read(self, n): + needed = n - self.cache_length + if needed > 0: + s = self.f.read(max(needed, 1000)) + self.cache += s + self.cache_length += len(s) + + s = self.cache[:n] + self.cache = self.cache[n:] + self.cache_length -= len(s) + return s + + def tell(self): + return self.f.tell() - self.cache_length + + def seek(self, offset): + self.f.seek(offset) + self.cache = "" + self.cache_length = 0 + class FileOpener: "Opening files using their filenames." @@ -303,8 +347,7 @@ # Duplicate the file handle. f = self.open("rb") - f.seek(offset) - return PositionIterator(f, count) + return PositionIterator(f, offset, count) class PositionIndexWriter(FileWriter): @@ -361,8 +404,7 @@ # Duplicate the file handle. f = self.open("rb") - f.seek(offset) - return PositionIndexIterator(f, doc_frequency) + return PositionIndexIterator(f, offset, doc_frequency) # Iterators for position-related files. @@ -388,9 +430,10 @@ "Iterating over document positions." - def __init__(self, f, count): + def __init__(self, f, offset, count): FileReader.__init__(self, f) IteratorBase.__init__(self, count) + self.seek(offset) def reset(self): self.last_docnum = 0 @@ -435,9 +478,10 @@ "Iterating over document positions." - def __init__(self, f, count): + def __init__(self, f, offset, count): FileReader.__init__(self, f) IteratorBase.__init__(self, count) + self.seek(offset) self.section_count = 0 def reset(self): @@ -831,7 +875,7 @@ permits the scanning for later terms from the specified term. """ - self.f.seek(info_offset) + self.seek(info_offset) self.last_term = term self.last_offset = offset @@ -1195,7 +1239,7 @@ later documents. """ - self.f.seek(offset) + self.seek(offset) bad_docnum, fields = self.read_fields() self.last_docnum = docnum return docnum, fields diff -r f0f1799c9f4c -r 628c5c388109 test.py --- a/test.py Sat Sep 12 00:31:31 2009 +0200 +++ b/test.py Sat Sep 12 01:32:19 2009 +0200 @@ -60,7 +60,7 @@ w.close() f = open("testP", "rb") -r = iixr.PositionIterator(f, None) +r = iixr.PositionIterator(f, 0, None) for doc_positions in all_doc_positions: for docnum, positions in doc_positions: d, p = r.read_positions()