# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1254531812 -7200
# Node ID 89465c390a4687b2fb173e5b0361c9e5652db8df
# Parent  353e83aab58d83db26600c12db8da608a8fb0769
Added a document cache, used when reading fields.
Optimised read_number slightly using arrays.

diff -r 353e83aab58d -r 89465c390a46 iixr/fields.py
--- a/iixr/fields.py	Fri Oct 02 00:22:10 2009 +0200
+++ b/iixr/fields.py	Sat Oct 03 03:03:32 2009 +0200
@@ -21,6 +21,8 @@
 from iixr.files import *
 from bisect import bisect_right  # to find terms in the dictionary index
 
+DOCUMENT_CACHE_LIMIT = 10000
+
 class FieldWriter(FileWriter):
 
     "Writing field data to files."
@@ -176,6 +178,7 @@
         self.field_reader = field_reader
         self.field_index_reader = field_index_reader
 
+        self.cache = {}
         self.docs = []
         try:
             while 1:
@@ -219,6 +222,9 @@
 
         "Read the fields of the document with the given 'docnum'."
 
+        if self.cache.has_key(docnum):
+            return self.cache[docnum]
+
         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
 
         # Get the entry position providing the term or one preceding it.
@@ -243,6 +249,16 @@
         # If the document is found, return the fields.
 
         if docnum == found_docnum:
+
+            # Store the fields in the cache, removing entries if the limit has
+            # been reached.
+
+            keys = self.cache.keys()
+
+            if len(keys) == DOCUMENT_CACHE_LIMIT:
+                del self.cache[keys[0]]
+
+            self.cache[docnum] = fields
             return fields
         else:
             return None
diff -r 353e83aab58d -r 89465c390a46 iixr/files.py
--- a/iixr/files.py	Fri Oct 02 00:22:10 2009 +0200
+++ b/iixr/files.py	Sat Oct 03 03:03:32 2009 +0200
@@ -19,6 +19,7 @@
 """
 
 from iixr.data import vint
+from array import array
 import zlib
 
 # Constants.
@@ -105,24 +106,19 @@
 
         # Read each byte, adding it to the number.
 
-        read = self.f.read
+        a = array('B')
+        fromfile = a.fromfile
+        f = self.f
 
-        c = read(1)
-        if c:
-            csd = ord(c)
-            if csd < 128:
-                return csd
-            else:
-                shift = 0
-                number = 0
-                while csd & 128:
-                    number += ((csd & 127) << shift)
-                    shift += 7
-                    csd = ord(read(1))
-                else:
-                    return number + (csd << shift)
+        fromfile(f, 1)
+        csd = a[-1]
+        if csd < 128:
+            return csd
         else:
-            raise EOFError
+            while csd & 128:
+                fromfile(f, 1)
+                csd = a[-1]
+            return sum([((csd & 127) << (number * 7)) for (number, csd) in enumerate(a)])
 
     def read_string(self, decompress=0):