1.1 --- a/iixr.py Sat Sep 12 00:31:31 2009 +0200
1.2 +++ b/iixr.py Sat Sep 12 01:32:19 2009 +0200
1.3 @@ -99,8 +99,6 @@
1.4 def __init__(self, f):
1.5 self.f = f
1.6 self.reset()
1.7 - self.cache = []
1.8 - self.cache_length = 0
1.9
1.10 def reset(self):
1.11
1.12 @@ -109,22 +107,20 @@
1.13 pass
1.14
1.15 def rewind(self):
1.16 - self.f.seek(0)
1.17 + self.seek(0)
1.18 self.reset()
1.19
1.20 - def write(self, s):
1.21 - self.cache.append(s)
1.22 - self.cache_length += len(s)
1.23 - if len(self.cache) >= 1000:
1.24 - self.flush()
1.25 -
1.26 - def tell(self):
1.27 - return self.f.tell() + self.cache_length
1.28 + def seek(self, offset):
1.29 +
1.30 + "To be defined by readers."
1.31 +
1.32 + pass
1.33
1.34 def flush(self):
1.35 - self.f.write("".join(self.cache))
1.36 - self.cache = []
1.37 - self.cache_length = 0
1.38 +
1.39 + "To be defined by writers."
1.40 +
1.41 + pass
1.42
1.43 def close(self):
1.44 if self.f is not None:
1.45 @@ -136,6 +132,11 @@
1.46
1.47 "Writing basic data types to files."
1.48
1.49 + def __init__(self, f):
1.50 + File.__init__(self, f)
1.51 + self.cache = []
1.52 + self.cache_length = 0
1.53 +
1.54 def write_number(self, number):
1.55
1.56 "Write 'number' to the file using a variable length encoding."
1.57 @@ -176,10 +177,31 @@
1.58 length = len(s)
1.59 self.write(flag + vint(length) + s)
1.60
1.61 + # Cache-affected methods.
1.62 +
1.63 + def write(self, s):
1.64 + self.cache.append(s)
1.65 + self.cache_length += len(s)
1.66 + if self.cache_length >= 1000:
1.67 + self.flush()
1.68 +
1.69 + def tell(self):
1.70 + return self.f.tell() + self.cache_length
1.71 +
1.72 + def flush(self):
1.73 + self.f.write("".join(self.cache))
1.74 + self.cache = []
1.75 + self.cache_length = 0
1.76 +
1.77 class FileReader(File):
1.78
1.79 "Reading basic data types from files."
1.80
1.81 + def __init__(self, f):
1.82 + File.__init__(self, f)
1.83 + self.cache = ""
1.84 + self.cache_length = 0
1.85 +
1.86 def read_number(self):
1.87
1.88 "Read a number from the file."
1.89 @@ -188,7 +210,7 @@
1.90
1.91 shift = 0
1.92 number = 0
1.93 - read = self.f.read
1.94 + read = self.read
1.95
1.96 try:
1.97 csd = ord(read(1))
1.98 @@ -213,12 +235,12 @@
1.99 # Decompress the data if requested.
1.100
1.101 if decompress:
1.102 - flag = self.f.read(1)
1.103 + flag = self.read(1)
1.104 else:
1.105 flag = "-"
1.106
1.107 length = self.read_number()
1.108 - s = self.f.read(length)
1.109 + s = self.read(length)
1.110
1.111 # Perform decompression if applicable.
1.112
1.113 @@ -230,6 +252,28 @@
1.114
1.115 return unicode(s, "utf-8")
1.116
1.117 + # Cache-affected methods.
1.118 +
1.119 + def read(self, n):
1.120 + needed = n - self.cache_length
1.121 + if needed > 0:
1.122 + s = self.f.read(max(needed, 1000))
1.123 + self.cache += s
1.124 + self.cache_length += len(s)
1.125 +
1.126 + s = self.cache[:n]
1.127 + self.cache = self.cache[n:]
1.128 + self.cache_length -= len(s)
1.129 + return s
1.130 +
1.131 + def tell(self):
1.132 + return self.f.tell() - self.cache_length
1.133 +
1.134 + def seek(self, offset):
1.135 + self.f.seek(offset)
1.136 + self.cache = ""
1.137 + self.cache_length = 0
1.138 +
1.139 class FileOpener:
1.140
1.141 "Opening files using their filenames."
1.142 @@ -303,8 +347,7 @@
1.143 # Duplicate the file handle.
1.144
1.145 f = self.open("rb")
1.146 - f.seek(offset)
1.147 - return PositionIterator(f, count)
1.148 + return PositionIterator(f, offset, count)
1.149
1.150 class PositionIndexWriter(FileWriter):
1.151
1.152 @@ -361,8 +404,7 @@
1.153 # Duplicate the file handle.
1.154
1.155 f = self.open("rb")
1.156 - f.seek(offset)
1.157 - return PositionIndexIterator(f, doc_frequency)
1.158 + return PositionIndexIterator(f, offset, doc_frequency)
1.159
1.160 # Iterators for position-related files.
1.161
1.162 @@ -388,9 +430,10 @@
1.163
1.164 "Iterating over document positions."
1.165
1.166 - def __init__(self, f, count):
1.167 + def __init__(self, f, offset, count):
1.168 FileReader.__init__(self, f)
1.169 IteratorBase.__init__(self, count)
1.170 + self.seek(offset)
1.171
1.172 def reset(self):
1.173 self.last_docnum = 0
1.174 @@ -435,9 +478,10 @@
1.175
1.176 "Iterating over document positions."
1.177
1.178 - def __init__(self, f, count):
1.179 + def __init__(self, f, offset, count):
1.180 FileReader.__init__(self, f)
1.181 IteratorBase.__init__(self, count)
1.182 + self.seek(offset)
1.183 self.section_count = 0
1.184
1.185 def reset(self):
1.186 @@ -831,7 +875,7 @@
1.187 permits the scanning for later terms from the specified term.
1.188 """
1.189
1.190 - self.f.seek(info_offset)
1.191 + self.seek(info_offset)
1.192 self.last_term = term
1.193 self.last_offset = offset
1.194
1.195 @@ -1195,7 +1239,7 @@
1.196 later documents.
1.197 """
1.198
1.199 - self.f.seek(offset)
1.200 + self.seek(offset)
1.201 bad_docnum, fields = self.read_fields()
1.202 self.last_docnum = docnum
1.203 return docnum, fields
2.1 --- a/test.py Sat Sep 12 00:31:31 2009 +0200
2.2 +++ b/test.py Sat Sep 12 01:32:19 2009 +0200
2.3 @@ -60,7 +60,7 @@
2.4 w.close()
2.5
2.6 f = open("testP", "rb")
2.7 -r = iixr.PositionIterator(f, None)
2.8 +r = iixr.PositionIterator(f, 0, None)
2.9 for doc_positions in all_doc_positions:
2.10 for docnum, positions in doc_positions:
2.11 d, p = r.read_positions()