1.1 --- a/iixr.py Fri Aug 28 01:15:17 2009 +0200
1.2 +++ b/iixr.py Sat Aug 29 02:15:29 2009 +0200
1.3 @@ -149,7 +149,7 @@
1.4
1.5 return unicode(s, "utf-8")
1.6
1.7 -# Specific classes.
1.8 +# Specific classes for storing term and position information.
1.9
1.10 class PositionWriter(FileWriter):
1.11
1.12 @@ -336,7 +336,10 @@
1.13
1.14 def go_to_term(self, term, offset, info_offset):
1.15
1.16 - "Seek past the entry for 'term' having 'offset' to 'info_offset'."
1.17 + """
1.18 + Seek past the entry for 'term' having 'offset' to 'info_offset'. This
1.19 + permits the scanning for later terms from the specified term.
1.20 + """
1.21
1.22 self.f.seek(info_offset)
1.23 self.last_term = term
1.24 @@ -399,7 +402,7 @@
1.25 self.interval = interval
1.26 self.entry = 0
1.27
1.28 - def write_term(self, term, offset):
1.29 + def _write_term(self, term, offset):
1.30
1.31 """
1.32 Write the given 'term' and its position file 'offset' to the term
1.33 @@ -421,7 +424,7 @@
1.34 """
1.35
1.36 offset = self.position_writer.write_all_positions(doc_positions)
1.37 - self.write_term(term, offset)
1.38 + self._write_term(term, offset)
1.39
1.40 def close(self):
1.41 self.info_writer.close()
1.42 @@ -449,7 +452,7 @@
1.43 self.max_offset = self.terms[-1][1]
1.44 self.max_info_offset = self.terms[-1][2]
1.45
1.46 - def find_term(self, term):
1.47 + def _find_term(self, term):
1.48
1.49 "Find the position file offset of 'term' from the term dictionary."
1.50
1.51 @@ -489,7 +492,7 @@
1.52
1.53 "Return the documents and positions at which the given 'term' is found."
1.54
1.55 - offset = self.find_term(term)
1.56 + offset = self._find_term(term)
1.57 if offset is None:
1.58 return None
1.59 else:
1.60 @@ -500,19 +503,28 @@
1.61 self.index_reader.close()
1.62 self.position_reader.close()
1.63
1.64 +# Specific classes for storing document information.
1.65 +
1.66 class FieldWriter(FileWriter):
1.67
1.68 "Writing field data to files."
1.69
1.70 - def write_fields(self, fields):
1.71 + def reset(self):
1.72 + self.last_docnum = 0
1.73 +
1.74 + def write_fields(self, docnum, fields):
1.75
1.76 """
1.77 - Write the given list of 'fields' (strings representing field values).
1.78 - Return the offset at which the fields are stored.
1.79 + Write, for the given 'docnum', a list of 'fields' (strings representing
1.80 + field values). Return the offset at which the fields are stored.
1.81 """
1.82
1.83 offset = self.f.tell()
1.84
1.85 + # Write the document number delta.
1.86 +
1.87 + self.write_number(docnum - self.last_docnum)
1.88 +
1.89 # Write the number of fields.
1.90
1.91 self.write_number(len(fields))
1.92 @@ -522,15 +534,26 @@
1.93 for field in fields:
1.94 self.write_string(field, 0) # compress
1.95
1.96 + self.last_docnum = docnum
1.97 return offset
1.98
1.99 class FieldReader(FileReader):
1.100
1.101 "Reading field data from files."
1.102
1.103 + def reset(self):
1.104 + self.last_docnum = 0
1.105 +
1.106 def read_fields(self):
1.107
1.108 - "Read fields from the file, returning the field values in a list."
1.109 + """
1.110 + Read fields from the file, returning a tuple containing the document
1.111 + number and a list of field values.
1.112 + """
1.113 +
1.114 + # Read the document number.
1.115 +
1.116 + self.last_docnum += self.read_number()
1.117
1.118 # Read the number of fields.
1.119
1.120 @@ -545,14 +568,139 @@
1.121 fields.append(self.read_string(0)) # decompress
1.122 i += 1
1.123
1.124 - return fields
1.125 + return self.last_docnum, fields
1.126 +
1.127 + def read_document_fields(self, docnum, offset):
1.128
1.129 - def read_doc_fields(self, offset):
1.130 -
1.131 - "Read all fields at the given 'offset."
1.132 + """
1.133 + Read fields for 'docnum' at the given 'offset'. This permits the
1.134 + retrieval of details for the specified document, as well as scanning for
1.135 + later documents.
1.136 + """
1.137
1.138 self.f.seek(offset)
1.139 - return self.read_fields()
1.140 + bad_docnum, fields = self.read_fields()
1.141 + self.last_docnum = docnum
1.142 + return docnum, fields
1.143 +
1.144 +class FieldIndexWriter(FileWriter):
1.145 +
1.146 + "Writing field index details to files."
1.147 +
1.148 + def reset(self):
1.149 + self.last_docnum = 0
1.150 +
1.151 + def write_document(self, docnum, offset):
1.152 +
1.153 + """
1.154 + Write, for the given 'docnum', the 'offset' at which the fields for the
1.155 + document are stored in the fields file.
1.156 + """
1.157 +
1.158 + # Write the document number delta and offset.
1.159 +
1.160 + self.write_number(docnum - self.last_docnum)
1.161 + self.write_number(offset)
1.162 +
1.163 + self.last_docnum = docnum
1.164 +
1.165 +class FieldIndexReader(FileReader):
1.166 +
1.167 + "Reading field index details from files."
1.168 +
1.169 + def reset(self):
1.170 + self.last_docnum = 0
1.171 +
1.172 + def read_document(self):
1.173 +
1.174 + "Read a document number and field file offset."
1.175 +
1.176 + # Read the document number delta and offset.
1.177 +
1.178 + self.last_docnum += self.read_number()
1.179 + offset = self.read_number()
1.180 +
1.181 + return self.last_docnum, offset
1.182 +
1.183 +class FieldDictionaryWriter:
1.184 +
1.185 + "Writing field dictionary details."
1.186 +
1.187 + def __init__(self, field_writer, field_index_writer, interval):
1.188 + self.field_writer = field_writer
1.189 + self.field_index_writer = field_index_writer
1.190 + self.interval = interval
1.191 + self.entry = 0
1.192 +
1.193 + def write_fields(self, docnum, fields):
1.194 +
1.195 + "Write details of the document with the given 'docnum' and 'fields'."
1.196 +
1.197 + offset = self.field_writer.write_fields(docnum, fields)
1.198 +
1.199 + if self.entry % self.interval == 0:
1.200 + self.field_index_writer.write_document(docnum, offset)
1.201 +
1.202 + self.entry += 1
1.203 +
1.204 + def close(self):
1.205 + self.field_writer.close()
1.206 + self.field_index_writer.close()
1.207 +
1.208 +class FieldDictionaryReader:
1.209 +
1.210 + "Reading field dictionary details."
1.211 +
1.212 + def __init__(self, field_reader, field_index_reader):
1.213 + self.field_reader = field_reader
1.214 + self.field_index_reader = field_index_reader
1.215 +
1.216 + self.docs = []
1.217 + try:
1.218 + while 1:
1.219 + self.docs.append(self.field_index_reader.read_document())
1.220 + except EOFError:
1.221 + pass
1.222 +
1.223 + # Offset of the last index entry, used in bisect keys for ordering purposes.
1.224 +
1.225 + self.max_offset = self.docs[-1][1]
1.226 +
1.227 + def read_fields(self, docnum):
1.228 +
1.229 + "Read the fields of the document with the given 'docnum'."
1.230 +
1.231 + i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
1.232 +
1.233 + # Get the entry position providing the document or one preceding it.
1.234 +
1.235 + if i == -1:
1.236 + return None
1.237 +
1.238 + found_docnum, offset = self.docs[i]
1.239 +
1.240 + # Read from the fields file.
1.241 +
1.242 + found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
1.243 +
1.244 + # Scan for the document, if necessary.
1.245 +
1.246 + try:
1.247 + while docnum > found_docnum:
1.248 + found_docnum, fields = self.field_reader.read_fields()
1.249 + except EOFError:
1.250 + pass
1.251 +
1.252 + # If the document is found, return the fields.
1.253 +
1.254 + if docnum == found_docnum:
1.255 + return fields
1.256 + else:
1.257 + return None
1.258 +
1.259 + def close(self):
1.260 + self.field_reader.close()
1.261 + self.field_index_reader.close()
1.262
1.263 # High-level classes.
1.264
2.1 --- a/test.py Fri Aug 28 01:15:17 2009 +0200
2.2 +++ b/test.py Sat Aug 29 02:15:29 2009 +0200
2.3 @@ -2,6 +2,8 @@
2.4
2.5 import iixr
2.6
2.7 +# Test basic data types.
2.8 +
2.9 numbers = [12345678, 0, 1, 127, 128, 255, 256]
2.10
2.11 f = open("test", "wb")
2.12 @@ -17,6 +19,8 @@
2.13 print number == n, number, n
2.14 r.close()
2.15
2.16 +# Test positions.
2.17 +
2.18 all_doc_positions = [
2.19 [
2.20 (123, [1, 3, 5, 15, 25]),
2.21 @@ -64,30 +68,82 @@
2.22 print doc_positions == dp, doc_positions, dp
2.23 r.close()
2.24
2.25 +# Test fields.
2.26 +
2.27 doc_fields = [
2.28 - ["testing", "fields", "stored", "compressed"],
2.29 - ["fields", "for a second", "document"]
2.30 + (123, ["testing", "fields", "stored", "compressed"]),
2.31 + (456, ["fields", "for a second", "document"]),
2.32 + (789, ["field value"]),
2.33 + (1234, []),
2.34 + (2345, ["abc", "def"]),
2.35 + (3456, ["apple", "banana", "cherry"]),
2.36 + (4567, ["drue", "eple"])
2.37 ]
2.38
2.39 f = open("testF", "wb")
2.40 w = iixr.FieldWriter(f)
2.41 -offsets = []
2.42 -for fields in doc_fields:
2.43 - offsets.append(w.write_fields(fields))
2.44 +for docnum, fields in doc_fields:
2.45 + w.write_fields(docnum, fields)
2.46 w.close()
2.47
2.48 f = open("testF", "rb")
2.49 r = iixr.FieldReader(f)
2.50 -for fields in doc_fields:
2.51 - df = r.read_fields()
2.52 - print fields == df, fields, df
2.53 -offsets.reverse()
2.54 -doc_fields.reverse()
2.55 -for offset, fields in zip(offsets, doc_fields):
2.56 - df = r.read_doc_fields(offset)
2.57 +for docnum, fields in doc_fields:
2.58 + dn, df = r.read_fields()
2.59 + print docnum == dn, docnum, dn
2.60 print fields == df, fields, df
2.61 r.close()
2.62
2.63 +# Test field index files.
2.64 +
2.65 +indexed_docs = [
2.66 + (123, 100000987),
2.67 + (456, 100004321),
2.68 + (789, 100008765)
2.69 + ]
2.70 +
2.71 +f = open("testFI", "wb")
2.72 +w = iixr.FieldIndexWriter(f)
2.73 +for docnum, offset in indexed_docs:
2.74 + w.write_document(docnum, offset)
2.75 +w.close()
2.76 +
2.77 +f = open("testFI", "rb")
2.78 +r = iixr.FieldIndexReader(f)
2.79 +for docnum, offset in indexed_docs:
2.80 + dn, o = r.read_document()
2.81 + print docnum == dn, docnum, dn
2.82 + print offset == o, offset, o
2.83 +r.close()
2.84 +
2.85 +# Test field dictionaries.
2.86 +
2.87 +f = open("testF", "wb")
2.88 +w = iixr.FieldWriter(f)
2.89 +f2 = open("testFI", "wb")
2.90 +w2 = iixr.FieldIndexWriter(f2)
2.91 +wd = iixr.FieldDictionaryWriter(w, w2, 3)
2.92 +for docnum, fields in doc_fields:
2.93 + wd.write_fields(docnum, fields)
2.94 +wd.close()
2.95 +
2.96 +f = open("testF", "rb")
2.97 +r = iixr.FieldReader(f)
2.98 +f2 = open("testFI", "rb")
2.99 +r2 = iixr.FieldIndexReader(f2)
2.100 +rd = iixr.FieldDictionaryReader(r, r2)
2.101 +doc_fields_reversed = doc_fields[:]
2.102 +doc_fields_reversed.reverse()
2.103 +for docnum, fields in doc_fields_reversed:
2.104 + df = rd.read_fields(docnum)
2.105 + print fields == df, fields, df
2.106 +for docnum in (13579, 246810):
2.107 + df = rd.read_fields(docnum)
2.108 + print df is None, df
2.109 +rd.close()
2.110 +
2.111 +# Test terms.
2.112 +
2.113 terms = [
2.114 ("aardvark", 100000123),
2.115 ("anteater", 100000456),
2.116 @@ -111,6 +167,8 @@
2.117 print offset == o, offset, o
2.118 r.close()
2.119
2.120 +# Test terms in index files.
2.121 +
2.122 indexed_terms = [
2.123 ("aardvark", 100000123, 200000321),
2.124 ("anteater", 100000456, 200000654),
2.125 @@ -135,6 +193,8 @@
2.126 print info_offset == i, info_offset, i
2.127 r.close()
2.128
2.129 +# Test dictionaries with only term data.
2.130 +
2.131 f = open("test", "wb")
2.132 w = iixr.TermWriter(f)
2.133 f2 = open("testI", "wb")
2.134 @@ -143,7 +203,7 @@
2.135 w3 = iixr.PositionWriter(f3)
2.136 wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
2.137 for term, offset in terms:
2.138 - wd.write_term(term, offset)
2.139 + wd._write_term(term, offset)
2.140 wd.close()
2.141
2.142 f = open("test", "rb")
2.143 @@ -156,13 +216,15 @@
2.144 terms_reversed = terms[:]
2.145 terms_reversed.reverse()
2.146 for term, offset in terms_reversed:
2.147 - o = rd.find_term(term)
2.148 + o = rd._find_term(term)
2.149 print offset == o, offset, o
2.150 for term in ("dog", "dingo"):
2.151 - o = rd.find_term(term)
2.152 + o = rd._find_term(term)
2.153 print o is None, o
2.154 rd.close()
2.155
2.156 +# Test dictionaries with term and position data.
2.157 +
2.158 terms_with_positions = [
2.159 ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
2.160 ("anteater", [(1, [43, 44])]),
2.161 @@ -200,6 +262,8 @@
2.162 print dp is None, dp
2.163 rd.close()
2.164
2.165 +# Test high-level index operations.
2.166 +
2.167 docs = [
2.168 (1, "The cat sat on the mat"),
2.169 (2, "Every good boy deserves football"),