1.1 --- a/iixr.py Sat Aug 29 02:15:29 2009 +0200
1.2 +++ b/iixr.py Sat Aug 29 21:15:47 2009 +0200
1.3 @@ -22,12 +22,15 @@
1.4 from os.path import exists, join
1.5 from os.path import commonprefix # to find common string prefixes
1.6 from bisect import bisect_right # to find terms in the dictionary index
1.7 -import bz2 # for field compression
1.8 +import bz2, zlib # for field compression
1.9
1.10 # Constants.
1.11
1.12 INTERVAL = 100
1.13
1.14 +compressors = [("b", bz2.compress), ("z", zlib.compress)]
1.15 +decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
1.16 +
1.17 # Foundation classes.
1.18
1.19 class File:
1.20 @@ -94,7 +97,20 @@
1.21 # Compress the string if requested.
1.22
1.23 if compress:
1.24 - s = bz2.compress(s)
1.25 + for flag, fn in compressors:
1.26 + cs = fn(s)
1.27 +
1.28 + # Take the first string shorter than the original.
1.29 +
1.30 + if len(cs) < len(s):
1.31 + s = cs
1.32 + break
1.33 + else:
1.34 + flag = "-"
1.35 +
1.36 + # Record which compressor was used, or "-" if none was applied.
1.37 +
1.38 + self.f.write(flag)
1.39
1.40 # Write the length of the data before the data itself.
1.41
1.42 @@ -137,13 +153,21 @@
1.43 'decompress' is set to a true value.
1.44 """
1.45
1.46 + # Read the compression flag if decompression is requested.
1.47 +
1.48 + if decompress:
1.49 + flag = self.f.read(1)
1.50 + else:
1.51 + flag = "-"
1.52 +
1.53 length = self.read_number()
1.54 s = self.f.read(length)
1.55
1.56 - # Decompress the data if requested.
1.57 + # Perform decompression if applicable.
1.58
1.59 - if decompress:
1.60 - s = bz2.decompress(s)
1.61 + if flag != "-":
1.62 + fn = decompressors[flag]
1.63 + s = fn(s)
1.64
1.65 # Convert strings to Unicode objects.
1.66
1.67 @@ -532,7 +556,7 @@
1.68 # Write the fields themselves.
1.69
1.70 for field in fields:
1.71 - self.write_string(field, 0) # compress
1.72 + self.write_string(field, 1) # compress
1.73
1.74 self.last_docnum = docnum
1.75 return offset
1.76 @@ -565,7 +589,7 @@
1.77 i = 0
1.78
1.79 while i < nfields:
1.80 - fields.append(self.read_string(0)) # decompress
1.81 + fields.append(self.read_string(1)) # decompress
1.82 i += 1
1.83
1.84 return self.last_docnum, fields
1.85 @@ -589,6 +613,7 @@
1.86
1.87 def reset(self):
1.88 self.last_docnum = 0
1.89 + self.last_offset = 0
1.90
1.91 def write_document(self, docnum, offset):
1.92
1.93 @@ -597,12 +622,13 @@
1.94 document are stored in the fields file.
1.95 """
1.96
1.97 - # Write the document number delta and offset.
1.98 + # Write the document number and offset deltas.
1.99
1.100 self.write_number(docnum - self.last_docnum)
1.101 - self.write_number(offset)
1.102 + self.write_number(offset - self.last_offset)
1.103
1.104 self.last_docnum = docnum
1.105 + self.last_offset = offset
1.106
1.107 class FieldIndexReader(FileReader):
1.108
1.109 @@ -610,6 +636,7 @@
1.110
1.111 def reset(self):
1.112 self.last_docnum = 0
1.113 + self.last_offset = 0
1.114
1.115 def read_document(self):
1.116
1.117 @@ -618,9 +645,9 @@
1.118 # Read the document number delta and offset.
1.119
1.120 self.last_docnum += self.read_number()
1.121 - offset = self.read_number()
1.122 + self.last_offset += self.read_number()
1.123
1.124 - return self.last_docnum, offset
1.125 + return self.last_docnum, self.last_offset
1.126
1.127 class FieldDictionaryWriter:
1.128
1.129 @@ -706,11 +733,15 @@
1.130
1.131 class IndexWriter:
1.132
1.133 - "Building term information and writing it to the term dictionary."
1.134 + """
1.135 + Building term information and writing it to the term and field dictionaries.
1.136 + """
1.137
1.138 - def __init__(self, dict_writer):
1.139 + def __init__(self, dict_writer, field_dict_writer):
1.140 self.dict_writer = dict_writer
1.141 + self.field_dict_writer = field_dict_writer
1.142 self.terms = {}
1.143 + self.docs = {}
1.144
1.145 def add_position(self, term, docnum, position):
1.146
1.147 @@ -731,6 +762,15 @@
1.148
1.149 doc.append(position)
1.150
1.151 + def add_fields(self, docnum, fields):
1.152 +
1.153 + "Add a list of 'fields' to those held for the document with the given 'docnum'."
1.154 +
1.155 + if not self.docs.has_key(docnum):
1.156 + doc_fields = self.docs[docnum] = fields
1.157 + else:
1.158 + self.docs[docnum] += fields
1.159 +
1.160 def close(self):
1.161 if self.dict_writer is None:
1.162 return
1.163 @@ -748,6 +788,35 @@
1.164 self.dict_writer.close()
1.165 self.dict_writer = None
1.166
1.167 + # Get the documents in order.
1.168 +
1.169 + docs = self.docs.items()
1.170 + docs.sort()
1.171 +
1.172 + for docnum, fields in docs:
1.173 + self.field_dict_writer.write_fields(docnum, fields)
1.174 +
1.175 + self.field_dict_writer.close()
1.176 + self.field_dict_writer = None
1.177 +
1.178 +class IndexReader:
1.179 +
1.180 + "Accessing the term and field dictionaries."
1.181 +
1.182 + def __init__(self, dict_reader, field_dict_reader):
1.183 + self.dict_reader = dict_reader
1.184 + self.field_dict_reader = field_dict_reader
1.185 +
1.186 + def find_positions(self, term):
1.187 + return self.dict_reader.find_positions(term)
1.188 +
1.189 + def get_fields(self, docnum):
1.190 + return self.field_dict_reader.read_fields(docnum)
1.191 +
1.192 + def close(self):
1.193 + self.dict_reader.close()
1.194 + self.field_dict_reader.close()
1.195 +
1.196 class Index:
1.197
1.198 "An inverted index solution encapsulating the various components."
1.199 @@ -775,7 +844,15 @@
1.200
1.201 dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)
1.202
1.203 - self.writer = IndexWriter(dict_writer)
1.204 + ff = open(join(self.pathname, "fields"), "wb")
1.205 + field_writer = FieldWriter(ff)
1.206 +
1.207 + fif = open(join(self.pathname, "fields_index"), "wb")
1.208 + field_index_writer = FieldIndexWriter(fif)
1.209 +
1.210 + field_dict_writer = FieldDictionaryWriter(field_writer, field_index_writer, interval)
1.211 +
1.212 + self.writer = IndexWriter(dict_writer, field_dict_writer)
1.213 return self.writer
1.214
1.215 def get_reader(self):
1.216 @@ -794,7 +871,17 @@
1.217 tpf = open(join(self.pathname, "positions"), "rb")
1.218 positions_reader = PositionReader(tpf)
1.219
1.220 - self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader)
1.221 + dict_reader = TermDictionaryReader(info_reader, index_reader, positions_reader)
1.222 +
1.223 + ff = open(join(self.pathname, "fields"), "rb")
1.224 + field_reader = FieldReader(ff)
1.225 +
1.226 + fif = open(join(self.pathname, "fields_index"), "rb")
1.227 + field_index_reader = FieldIndexReader(fif)
1.228 +
1.229 + field_dict_reader = FieldDictionaryReader(field_reader, field_index_reader)
1.230 +
1.231 + self.reader = IndexReader(dict_reader, field_dict_reader)
1.232 return self.reader
1.233
1.234 def close(self):
2.1 --- a/test.py Sat Aug 29 02:15:29 2009 +0200
2.2 +++ b/test.py Sat Aug 29 21:15:47 2009 +0200
2.3 @@ -285,12 +285,16 @@
2.4 for docnum, text in docs:
2.5 for position, term in enumerate(text.split()):
2.6 wi.add_position(term, docnum, position)
2.7 + wi.add_fields(docnum, [text])
2.8 wi.close()
2.9
2.10 rd = index.get_reader()
2.11 for term, doc_positions in doc_tests:
2.12 dp = rd.find_positions(term)
2.13 print doc_positions == dp, doc_positions, dp
2.14 +for docnum, text in docs:
2.15 + df = rd.get_fields(docnum)
2.16 + print text == df[0], text, df[0]
2.17 index.close()
2.18
2.19 # vim: tabstop=4 expandtab shiftwidth=4