2.1 --- a/iixr/fields.py Mon Feb 07 02:05:38 2011 +0100
2.2 +++ b/iixr/fields.py Tue Feb 08 00:08:27 2011 +0100
2.3 @@ -29,6 +29,7 @@
2.4 "Writing field data to files."
2.5
2.6 def reset(self):
2.7 + self.end_record()
2.8 self.last_docnum = None
2.9 self.subtractor = None
2.10
2.11 @@ -47,8 +48,6 @@
2.12 self.subtractor = get_subtractor(docnum)
2.13 docnum_seq = docnum
2.14
2.15 - self.begin_record()
2.16 -
2.17 # Write the document number.
2.18
2.19 self.write_sequence_value(docnum_seq)
2.20 @@ -63,8 +62,6 @@
2.21 self.write_number(i)
2.22 self.write_string(field, 1) # compress
2.23
2.24 - self.end_record()
2.25 -
2.26 self.last_docnum = docnum
2.27
2.28 class FieldReader(FileReader):
2.29 @@ -74,6 +71,7 @@
2.30 def reset(self):
2.31 self.last_docnum = None
2.32 self.adder = None
2.33 + self.begin_record()
2.34
2.35 def read_fields(self):
2.36
2.37 @@ -82,8 +80,6 @@
2.38 number and a list of field (identifier, value) pairs.
2.39 """
2.40
2.41 - self.begin_record()
2.42 -
2.43 # Read the document number.
2.44
2.45 docnum = self.read_sequence_value()
2.46 @@ -109,8 +105,6 @@
2.47 fields.append((identifier, value))
2.48 i += 1
2.49
2.50 - self.end_record()
2.51 -
2.52 return self.last_docnum, fields
2.53
2.54 def read_document_fields(self, docnum, offset):
2.55 @@ -131,6 +125,7 @@
2.56 "Writing field index details to files."
2.57
2.58 def reset(self):
2.59 + self.end_record()
2.60 self.last_docnum = None
2.61 self.subtractor = None
2.62 self.last_offset = 0
2.63 @@ -150,8 +145,6 @@
2.64 self.subtractor = get_subtractor(docnum)
2.65 docnum_seq = docnum
2.66
2.67 - self.begin_record()
2.68 -
2.69 # Write the document number.
2.70
2.71 self.write_sequence_value(docnum_seq)
2.72 @@ -159,7 +152,6 @@
2.73 # Write the offset delta.
2.74
2.75 self.write_number(offset - self.last_offset)
2.76 - self.end_record()
2.77
2.78 self.last_docnum = docnum
2.79 self.last_offset = offset
2.80 @@ -172,13 +164,12 @@
2.81 self.last_docnum = None
2.82 self.adder = None
2.83 self.last_offset = 0
2.84 + self.begin_record()
2.85
2.86 def read_document(self):
2.87
2.88 "Read a document number and field file offset."
2.89
2.90 - self.begin_record()
2.91 -
2.92 # Read the document number.
2.93
2.94 docnum = self.read_sequence_value()
2.95 @@ -192,7 +183,6 @@
2.96 # Read the offset.
2.97
2.98 self.last_offset += self.read_number()
2.99 - self.end_record()
2.100
2.101 return self.last_docnum, self.last_offset
2.102
2.103 @@ -211,6 +201,7 @@
2.104 "Write details of the document with the given 'docnum' and 'fields'."
2.105
2.106 if self.entry % self.interval == 0:
2.107 + self.field_writer.reset()
2.108 offset = self.field_writer.tell()
2.109 self.field_writer.write_fields(docnum, fields)
2.110 self.field_index_writer.write_document(docnum, offset)
2.111 @@ -230,6 +221,7 @@
2.112 def __init__(self, field_reader, field_index_reader):
2.113 self.field_reader = field_reader
2.114 self.field_index_reader = field_index_reader
2.115 + self.entry = 0
2.116
2.117 self.cache = {}
2.118 self.docs = []
2.119 @@ -267,7 +259,17 @@
2.120
2.121 "Return the next document number and fields."
2.122
2.123 - return self.field_reader.read_fields()
2.124 + try:
2.125 + return self.field_reader.read_fields()
2.126 + except EOFError:
2.127 + self.entry += 1
2.128 + try:
2.129 + found_docnum, offset = self.docs[self.entry]
2.130 + except IndexError:
2.131 + raise EOFError
2.132 + else:
2.133 + self.field_reader.reset()
2.134 + return self.field_reader.read_fields()
2.135
2.136 # Random access methods.
2.137
4.1 --- a/iixr/positions.py Mon Feb 07 02:05:38 2011 +0100
4.2 +++ b/iixr/positions.py Tue Feb 08 00:08:27 2011 +0100
4.3 @@ -26,6 +26,7 @@
4.4 "Writing position information to files."
4.5
4.6 def reset(self):
4.7 + self.end_record()
4.8 self.last_docnum = None
4.9 self.subtractor = None
4.10
4.11 @@ -56,10 +57,8 @@
4.12 self.subtractor = get_subtractor(docnum)
4.13 docnum_seq = docnum
4.14
4.15 - self.begin_record()
4.16 self.write_sequence_value(docnum_seq)
4.17 self.write_monotonic_sequence(positions)
4.18 - self.end_record()
4.19
4.20 self.last_docnum = docnum
4.21
4.22 @@ -70,6 +69,7 @@
4.23 def reset(self):
4.24 self.last_docnum = None
4.25 self.adder = None
4.26 + self.begin_record()
4.27
4.28 def read_positions(self):
4.29
4.30 @@ -77,8 +77,6 @@
4.31 Read positions, returning a document number and a list of positions.
4.32 """
4.33
4.34 - self.begin_record()
4.35 -
4.36 # Read the document number.
4.37
4.38 docnum = self.read_sequence_value()
4.39 @@ -95,7 +93,6 @@
4.40 self.last_docnum = docnum
4.41
4.42 positions = self.read_monotonic_sequence()
4.43 - self.end_record()
4.44
4.45 return self.last_docnum, positions
4.46
4.47 @@ -104,6 +101,7 @@
4.48 "Writing position index information to files."
4.49
4.50 def reset(self):
4.51 + self.end_record()
4.52 self.last_docnum = None
4.53 self.subtractor = None
4.54 self.last_pos_offset = 0
4.55 @@ -123,11 +121,9 @@
4.56 self.subtractor = get_subtractor(docnum)
4.57 docnum_seq = docnum
4.58
4.59 - self.begin_record()
4.60 self.write_sequence_value(docnum_seq)
4.61 self.write_number(pos_offset - self.last_pos_offset)
4.62 self.write_number(count)
4.63 - self.end_record()
4.64
4.65 self.last_docnum = docnum
4.66 self.last_pos_offset = pos_offset
4.67 @@ -140,6 +136,7 @@
4.68 self.last_docnum = None
4.69 self.adder = None
4.70 self.last_pos_offset = 0
4.71 + self.begin_record()
4.72
4.73 def read_positions(self):
4.74
4.75 @@ -148,8 +145,6 @@
4.76 file, and the number of documents in a section of that file.
4.77 """
4.78
4.79 - self.begin_record()
4.80 -
4.81 # Read the document number.
4.82
4.83 docnum = self.read_sequence_value()
4.84 @@ -167,7 +162,6 @@
4.85 # Read the document count.
4.86
4.87 count = self.read_number()
4.88 - self.end_record()
4.89
4.90 return self.last_docnum, self.last_pos_offset, count
4.91
4.92 @@ -339,14 +333,14 @@
4.93
4.94 self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
4.95
4.96 - first_offset = self.position_writer.tell()
4.97 - first_docnum = None
4.98 -
4.99 # Reset the position writer so that position readers accessing
4.100 # a section start with the correct document number.
4.101
4.102 self.position_writer.reset()
4.103
4.104 + first_offset = self.position_writer.tell()
4.105 + first_docnum = None
4.106 +
4.107 # Finish writing an index entry for the remaining documents.
4.108
4.109 else:
5.1 --- a/iixr/terms.py Mon Feb 07 02:05:38 2011 +0100
5.2 +++ b/iixr/terms.py Tue Feb 08 00:08:27 2011 +0100
5.3 @@ -29,6 +29,7 @@
5.4 "Writing term information to files."
5.5
5.6 def reset(self):
5.7 + self.end_record()
5.8 self.last_term = ""
5.9 self.last_offset = 0
5.10
5.11 @@ -40,14 +41,6 @@
5.12 term information file.
5.13 """
5.14
5.15 - self.begin_record()
5.16 - self._write_term(term, offset, frequency, doc_frequency)
5.17 - self.end_record()
5.18 -
5.19 - def _write_term(self, term, offset, frequency, doc_frequency):
5.20 -
5.21 - "Performs the term writing for 'write_term'."
5.22 -
5.23 if term <= self.last_term:
5.24 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
5.25
5.26 @@ -79,6 +72,7 @@
5.27 def reset(self):
5.28 self.last_term = ""
5.29 self.last_offset = 0
5.30 + self.begin_record()
5.31
5.32 def read_term(self):
5.33
5.34 @@ -87,16 +81,6 @@
5.35 frequency from the term information file.
5.36 """
5.37
5.38 - self.begin_record()
5.39 - try:
5.40 - return self._read_term()
5.41 - finally:
5.42 - self.end_record()
5.43 -
5.44 - def _read_term(self):
5.45 -
5.46 - "Performs the term reading for 'read_term'."
5.47 -
5.48 # Read the prefix length and term suffix.
5.49
5.50 common = self.read_number()
5.51 @@ -145,13 +129,11 @@
5.52 'info_offset' in the term information file.
5.53 """
5.54
5.55 - self.begin_record()
5.56 - TermWriter._write_term(self, term, offset, frequency, doc_frequency)
5.57 + TermWriter.write_term(self, term, offset, frequency, doc_frequency)
5.58
5.59 # Write the information file offset delta.
5.60
5.61 self.write_number(info_offset - self.last_info_offset)
5.62 - self.end_record()
5.63
5.64 self.last_info_offset = info_offset
5.65
5.66 @@ -171,13 +153,11 @@
5.67 index file.
5.68 """
5.69
5.70 - self.begin_record()
5.71 - term, offset, frequency, doc_frequency = TermReader._read_term(self)
5.72 + term, offset, frequency, doc_frequency = TermReader.read_term(self)
5.73
5.74 # Read the offset delta.
5.75
5.76 self.last_info_offset += self.read_number()
5.77 - self.end_record()
5.78
5.79 return term, offset, frequency, doc_frequency, self.last_info_offset
5.80
5.81 @@ -197,16 +177,16 @@
5.82 """
5.83 Write the given 'term', its position file 'offset', its 'frequency' and
5.84 its 'doc_frequency' (number of documents in which it appears) to the
5.85 - term information file. Return the offset after the term information was
5.86 + term information file. Every 'interval' entries, index the offset before the term information was
5.87 written to the file.
5.88 """
5.89
5.90 - self.info_writer.write_term(term, offset, frequency, doc_frequency)
5.91 -
5.92 if self.entry % self.interval == 0:
5.93 + self.info_writer.reset()
5.94 info_offset = self.info_writer.tell()
5.95 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
5.96
5.97 + self.info_writer.write_term(term, offset, frequency, doc_frequency)
5.98 self.entry += 1
5.99
5.100 def write_term_positions(self, term, doc_positions):
5.101 @@ -236,6 +216,7 @@
5.102 self.info_reader = info_reader
5.103 self.index_reader = index_reader
5.104 self.position_dict_reader = position_dict_reader
5.105 + self.entry = 0
5.106
5.107 self.terms = []
5.108 try:
5.109 @@ -269,8 +250,10 @@
5.110 # as the closest.
5.111
5.112 if i == -1:
5.113 + self.entry = 0
5.114 return self.terms[0]
5.115 else:
5.116 + self.entry = i
5.117 return self.terms[i]
5.118
5.119 def _find_closest_term(self, term):
5.120 @@ -297,10 +280,11 @@
5.121 # and scan for the desired term.
5.122
5.123 else:
5.124 - self.info_reader.go_to_term(found_term, offset, info_offset)
5.125 + # Reset the term and offset for the new page.
5.126 + self.info_reader.go_to_term("", 0, info_offset)
5.127 try:
5.128 while term > found_term:
5.129 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
5.130 + found_term, offset, frequency, doc_frequency = self._read_term()
5.131 except EOFError:
5.132 pass
5.133
5.134 @@ -355,6 +339,7 @@
5.135 # Sequential access methods.
5.136
5.137 def rewind(self):
5.138 + self.entry = 0
5.139 self.info_reader.rewind()
5.140
5.141 def read_term(self):
5.142 @@ -364,8 +349,28 @@
5.143 documents and positions at which the term is found.
5.144 """
5.145
5.146 - term, offset, frequency, doc_frequency = self.info_reader.read_term()
5.147 - return self._get_term_and_positions(term, offset, frequency, doc_frequency)
5.148 + return self._get_term_and_positions(*self._read_term())
5.149 +
5.150 + def _read_term(self):
5.151 +
5.152 + try:
5.153 + term, offset, frequency, doc_frequency = self.info_reader.read_term()
5.154 + except EOFError:
5.155 + self.entry += 1
5.156 + try:
5.157 + term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
5.158 + except IndexError:
5.159 + raise EOFError
5.160 + else:
5.161 + # Reset the term and offset for the new page.
5.162 +
5.163 + self.info_reader.go_to_term("", 0, info_offset)
5.164 +
5.165 + # Skip the term in the information file.
5.166 +
5.167 + self.info_reader.read_term()
5.168 +
5.169 + return term, offset, frequency, doc_frequency
5.170
5.171 def go_to_term(self, term):
5.172
5.173 @@ -380,7 +385,14 @@
5.174 # Position the reader, if necessary.
5.175
5.176 if info_offset is not None:
5.177 - self.info_reader.go_to_term(found_term, offset, info_offset)
5.178 +
5.179 + # Reset the term and offset for the new page.
5.180 +
5.181 + self.info_reader.go_to_term("", 0, info_offset)
5.182 +
5.183 + # Skip the term in the information file.
5.184 +
5.185 + self.info_reader.read_term()
5.186
5.187 return found_term, offset, frequency, doc_frequency
5.188
5.189 @@ -407,7 +419,7 @@
5.190
5.191 while found_term.startswith(term):
5.192 terms.append(found_term)
5.193 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
5.194 + found_term, offset, frequency, doc_frequency = self._read_term()
5.195
5.196 except EOFError:
5.197 pass