1.1 --- a/iixr.py Tue Aug 25 22:44:15 2009 +0200
1.2 +++ b/iixr.py Tue Aug 25 23:53:20 2009 +0200
1.3 @@ -19,6 +19,7 @@
1.4 """
1.5
1.6 from os.path import commonprefix # to find common string prefixes
1.7 +from bisect import bisect_right # to find terms in the dictionary index
1.8
1.9 # Foundation classes.
1.10
1.11 @@ -104,7 +105,7 @@
1.12
1.13 "Read a number from the file."
1.14
1.15 - nbytes = ord(self.f.read(1))
1.16 + nbytes = self.read_unsigned_byte()
1.17
1.18 # Read each byte, adding it to the number.
1.19
1.20 @@ -126,7 +127,11 @@
1.21
1.22 "Read a number from the file, consuming a single byte."
1.23
1.24 - return ord(self.f.read(1))
1.25 + s = self.f.read(1)
1.26 + if not s:
1.27 + raise EOFError
1.28 +
1.29 + return ord(s)
1.30
1.31 def read_string(self):
1.32
1.33 @@ -263,7 +268,8 @@
1.34
1.35 """
1.36 Write the given 'term' and its position file 'offset' to the term
1.37 - information file.
1.38 + information file. Return the offset after the term information was
1.39 + written to the file.
1.40 """
1.41
1.42 # Too long terms are not currently supported.
1.43 @@ -286,6 +292,8 @@
1.44 self.last_term = term
1.45 self.last_offset = offset
1.46
1.47 + return self.f.tell()
1.48 +
1.49 class TermReader(FileReader):
1.50
1.51 "Reading term information from files."
1.52 @@ -313,4 +321,146 @@
1.53
1.54 return self.last_term, self.last_offset
1.55
1.56 + def go_to_term(self, term, offset, info_offset):
1.57 +
1.58 + "Seek past the entry for 'term' having 'offset' to 'info_offset'."
1.59 +
1.60 + self.f.seek(info_offset)
1.61 + self.last_term = term
1.62 + self.last_offset = offset
1.63 +
class TermIndexWriter(TermWriter):

    "Writing term dictionary index details to files."

    def reset(self):

        "Reset the inherited writer state and the remembered information offset."

        TermWriter.reset(self)
        self.last_info_offset = 0

    def write_term(self, term, offset, info_offset):

        """
        Add an index entry for 'term': the term and its position file 'offset'
        are written as by TermWriter.write_term, followed by 'info_offset', the
        location in the term information file, stored as a difference from the
        previously written 'info_offset'.
        """

        # The term and position file offset are handled by the base class.

        TermWriter.write_term(self, term, offset)

        # Only the change in the information file offset is stored.

        delta = info_offset - self.last_info_offset
        self.write_number(delta)

        # Remember the absolute value for the next entry's difference.

        self.last_info_offset = info_offset
1.86 +
class TermIndexReader(TermReader):

    "Reading term dictionary index details from files."

    def reset(self):

        "Reset the inherited reader state and the accumulated information offset."

        TermReader.reset(self)
        self.last_info_offset = 0

    def read_term(self):

        """
        Read the next index entry, returning a tuple of the form
        (term, position file offset, term information file offset).
        """

        # The term and position file offset are handled by the base class.

        term, offset = TermReader.read_term(self)

        # The information file offset is stored as a difference from the
        # previous entry, so accumulate it to recover the absolute value.

        delta = self.read_number()
        self.last_info_offset = self.last_info_offset + delta

        return term, offset, self.last_info_offset
1.109 +
class TermDictionaryWriter:

    "Writing term dictionaries."

    def __init__(self, info_writer, index_writer, interval):

        """
        Initialise the writer with the given 'info_writer', which receives
        every term, and the given 'index_writer', which receives one term in
        every 'interval' terms, starting with the first term written.

        A ValueError is raised if 'interval' is zero, since no entry could
        then be selected for the index.
        """

        # Fail immediately instead of with a ZeroDivisionError upon the first
        # call to write_term.

        if interval == 0:
            raise ValueError("interval must be non-zero")

        self.info_writer = info_writer
        self.index_writer = index_writer
        self.interval = interval

        # The number of terms written so far.

        self.entry = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file and, for one term in every 'interval' terms, to the
        index as well, making a dictionary entry.
        """

        # The information writer reports the file offset found after the entry,
        # and this is what the index records for the term.

        info_offset = self.info_writer.write_term(term, offset)

        if self.entry % self.interval == 0:
            self.index_writer.write_term(term, offset, info_offset)

        self.entry += 1

    def close(self):

        """
        Close both the information writer and the index writer. The index
        writer is closed even if closing the information writer raises an
        exception, which is then propagated.
        """

        try:
            self.info_writer.close()
        finally:
            self.index_writer.close()
1.137 +
class TermDictionaryReader:

    "Reading term dictionaries."

    def __init__(self, info_reader, index_reader):

        """
        Initialise the reader with the given 'info_reader' for the term
        information file and 'index_reader' for the dictionary index. All
        index entries are read immediately, until the index reader raises
        EOFError, and are kept in the 'terms' attribute as tuples of the form
        (term, position file offset, information file offset).
        """

        self.info_reader = info_reader
        self.index_reader = index_reader

        # Load the complete index.

        self.terms = []
        try:
            while True:
                self.terms.append(self.index_reader.read_term())
        except EOFError:
            pass

        # Keep the bare terms separately so that searching compares terms only
        # and never depends on the ordering of the offsets.

        self._index_terms = [entry[0] for entry in self.terms]

        # The offsets of the final index entry remain available as attributes
        # for existing users; an empty index yields zero for both rather than
        # causing an IndexError.

        if self.terms:
            self.max_offset = self.terms[-1][1]
            self.max_info_offset = self.terms[-1][2]
        else:
            self.max_offset = 0
            self.max_info_offset = 0

    def find(self, term):

        """
        Find the position file offset of 'term' from the term dictionary,
        returning None if the term is not present.
        """

        # Get the entry position providing the term or one preceding it.

        i = bisect_right(self._index_terms, term) - 1

        # The term sorts before every indexed term (or the index is empty).

        if i == -1:
            return None

        found_term, offset, info_offset = self.terms[i]

        # Where the term is found immediately, return the offset.

        if term == found_term:
            return offset

        # Otherwise, seek past the index term's entry in the information file
        # and scan for the desired term.

        self.info_reader.go_to_term(found_term, offset, info_offset)
        try:
            while term > found_term:
                found_term, offset = self.info_reader.read_term()
        except EOFError:

            # Every remaining term was smaller than the desired term.

            return None

        # The scan stopped at the term itself or at a later term.

        if term == found_term:
            return offset
        return None

    def close(self):

        """
        Close both the information reader and the index reader. The index
        reader is closed even if closing the information reader raises an
        exception, which is then propagated.
        """

        try:
            self.info_reader.close()
        finally:
            self.index_reader.close()
1.197 +
1.198 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/test.py Tue Aug 25 22:44:15 2009 +0200
2.2 +++ b/test.py Tue Aug 25 23:53:20 2009 +0200
2.3 @@ -10,7 +10,7 @@
2.4 w.write_number(number)
2.5 w.close()
2.6
2.7 -f = open("test", "r")
2.8 +f = open("test", "rb")
2.9 r = iixr.FileReader(f)
2.10 for number in numbers:
2.11 n = r.read_number()
2.12 @@ -36,7 +36,7 @@
2.13 w.reset()
2.14 w.close()
2.15
2.16 -f = open("test", "r")
2.17 +f = open("test", "rb")
2.18 r = iixr.PositionReader(f)
2.19 for doc_positions in all_doc_positions:
2.20 for docnum, positions in doc_positions:
2.21 @@ -55,7 +55,7 @@
2.22 )
2.23 w.close()
2.24
2.25 -f = open("test", "r")
2.26 +f = open("test", "rb")
2.27 r = iixr.PositionReader(f)
2.28 offsets.reverse()
2.29 all_doc_positions.reverse()
2.30 @@ -79,7 +79,7 @@
2.31 w.write_term(term, offset)
2.32 w.close()
2.33
2.34 -f = open("test", "r")
2.35 +f = open("test", "rb")
2.36 r = iixr.TermReader(f)
2.37 for term, offset in terms:
2.38 t, o = r.read_term()
2.39 @@ -87,4 +87,52 @@
2.40 print offset == o, offset, o
2.41 r.close()
2.42
2.43 +indexed_terms = [
2.44 + ("aardvark", 100000123, 200000321),
2.45 + ("anteater", 100000456, 200000654),
2.46 + ("badger", 100000789, 200000987),
2.47 + ("bull", 1000001234, 200004321),
2.48 + ("bulldog", 1000002345, 200005432),
2.49 + ("cat", 1000003456, 200006543)
2.50 + ]
2.51 +
2.52 +f = open("test", "wb")
2.53 +w = iixr.TermIndexWriter(f)
2.54 +for term, offset, info_offset in indexed_terms:
2.55 + w.write_term(term, offset, info_offset)
2.56 +w.close()
2.57 +
2.58 +f = open("test", "rb")
2.59 +r = iixr.TermIndexReader(f)
2.60 +for term, offset, info_offset in indexed_terms:
2.61 + t, o, i = r.read_term()
2.62 + print term == t, term, t
2.63 + print offset == o, offset, o
2.64 + print info_offset == i, info_offset, i
2.65 +r.close()
2.66 +
2.67 +f = open("test", "wb")
2.68 +w = iixr.TermWriter(f)
2.69 +f2 = open("testI", "wb")
2.70 +w2 = iixr.TermIndexWriter(f2)
2.71 +w3 = iixr.TermDictionaryWriter(w, w2, 3)
2.72 +for term, offset in terms:
2.73 + w3.write_term(term, offset)
2.74 +w3.close()
2.75 +
2.76 +f = open("test", "rb")
2.77 +r = iixr.TermReader(f)
2.78 +f2 = open("testI", "rb")
2.79 +r2 = iixr.TermIndexReader(f2)
2.80 +r3 = iixr.TermDictionaryReader(r, r2)
2.81 +terms_reversed = terms[:]
2.82 +terms_reversed.reverse()
2.83 +for term, offset in terms_reversed:
2.84 + o = r3.find(term)
2.85 + print offset == o, offset, o
2.86 +for term in ("dog", "dingo"):
2.87 + o = r3.find(term)
2.88 + print o is None, o
2.89 +r3.close()
2.90 +
2.91 # vim: tabstop=4 expandtab shiftwidth=4