1.1 --- a/iixr/positions.py Tue Feb 08 00:08:27 2011 +0100
1.2 +++ b/iixr/positions.py Thu Feb 10 01:19:13 2011 +0100
1.3 @@ -25,6 +25,13 @@
1.4
1.5 "Writing position information to files."
1.6
1.7 + def begin(self, docnum_size, position_size):
1.8 + self.write_numbers((docnum_size, position_size))
1.9 + self.end_record()
1.10 + self.data_start = self.tell()
1.11 + self.docnum_size = docnum_size
1.12 + self.position_size = position_size
1.13 +
1.14 def reset(self):
1.15 self.end_record()
1.16 self.last_docnum = None
1.17 @@ -57,8 +64,8 @@
1.18 self.subtractor = get_subtractor(docnum)
1.19 docnum_seq = docnum
1.20
1.21 - self.write_sequence_value(docnum_seq)
1.22 - self.write_monotonic_sequence(positions)
1.23 + self.write_sequence_value(docnum_seq, self.docnum_size)
1.24 + self.write_monotonic_sequence(positions, self.position_size)
1.25
1.26 self.last_docnum = docnum
1.27
1.28 @@ -66,6 +73,14 @@
1.29
1.30 "Reading position information within term-specific regions of a file."
1.31
1.32 + def begin(self):
1.33 + self.begin_record()
1.34 + try:
1.35 + self.docnum_size, self.position_size = self.read_numbers(2)
1.36 + except EOFError:
1.37 + self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
1.38 + self.data_start = self.tell()
1.39 +
1.40 def reset(self):
1.41 self.last_docnum = None
1.42 self.adder = None
1.43 @@ -79,7 +94,7 @@
1.44
1.45 # Read the document number.
1.46
1.47 - docnum = self.read_sequence_value()
1.48 + docnum = self.read_sequence_value(self.docnum_size)
1.49
1.50 # Calculate an ongoing delta.
1.51
1.52 @@ -92,18 +107,19 @@
1.53 self.adder = get_adder(docnum)
1.54 self.last_docnum = docnum
1.55
1.56 - positions = self.read_monotonic_sequence()
1.57 + positions = self.read_monotonic_sequence(self.position_size)
1.58
1.59 return self.last_docnum, positions
1.60
1.61 -class PositionIndexWriter(FileWriter):
1.62 +class PositionIndexWriter(PositionWriter):
1.63
1.64 "Writing position index information to files."
1.65
1.66 + def begin(self, docnum_size):
1.67 + PositionWriter.begin(self, docnum_size, 0)
1.68 +
1.69 def reset(self):
1.70 - self.end_record()
1.71 - self.last_docnum = None
1.72 - self.subtractor = None
1.73 + PositionWriter.reset(self)
1.74 self.last_pos_offset = 0
1.75
1.76 def write_positions(self, docnum, pos_offset, count):
1.77 @@ -121,22 +137,20 @@
1.78 self.subtractor = get_subtractor(docnum)
1.79 docnum_seq = docnum
1.80
1.81 - self.write_sequence_value(docnum_seq)
1.82 + self.write_sequence_value(docnum_seq, self.docnum_size)
1.83 self.write_number(pos_offset - self.last_pos_offset)
1.84 self.write_number(count)
1.85
1.86 self.last_docnum = docnum
1.87 self.last_pos_offset = pos_offset
1.88
1.89 -class PositionIndexReader(FileReader):
1.90 +class PositionIndexReader(PositionReader):
1.91
1.92 "Reading position index information within term-specific regions of a file."
1.93
1.94 def reset(self):
1.95 - self.last_docnum = None
1.96 - self.adder = None
1.97 + PositionReader.reset(self)
1.98 self.last_pos_offset = 0
1.99 - self.begin_record()
1.100
1.101 def read_positions(self):
1.102
1.103 @@ -147,7 +161,7 @@
1.104
1.105 # Read the document number.
1.106
1.107 - docnum = self.read_sequence_value()
1.108 + docnum = self.read_sequence_value(self.docnum_size)
1.109
1.110 if self.last_docnum is not None:
1.111 self.last_docnum = self.adder(docnum, self.last_docnum)
1.112 @@ -295,28 +309,38 @@
1.113 the term involved.
1.114 """
1.115
1.116 - # Reset the writers.
1.117 -
1.118 - self.position_writer.reset()
1.119 - self.position_index_writer.reset()
1.120 -
1.121 - # Remember the first index entry offset.
1.122 -
1.123 - index_offset = self.position_index_writer.tell()
1.124 -
1.125 # Write the positions.
1.126
1.127 frequency = 0
1.128 count = 0
1.129
1.130 if doc_positions:
1.131 + doc_positions.sort()
1.132 +
1.133 + # Look ahead at the first document record.
1.134 + # NOTE: Any iterator would need to support this.
1.135 +
1.136 + first_docnum, first_positions = doc_positions[0]
1.137 + first_position = first_positions[0]
1.138 +
1.139 + # Write out size details.
1.140 +
1.141 + docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)
1.142 + self.position_writer.begin(docnum_size, position_size)
1.143 + self.position_index_writer.begin(docnum_size)
1.144 +
1.145 + # Reset the writers.
1.146 +
1.147 + self.position_writer.reset()
1.148 + self.position_index_writer.reset()
1.149 +
1.150 + # Remember the first index entry offset.
1.151 +
1.152 + index_offset = self.position_index_writer.tell()
1.153
1.154 # Retain the first record offset for a subsequent index entry.
1.155
1.156 first_offset = self.position_writer.tell()
1.157 - first_docnum = None
1.158 -
1.159 - doc_positions.sort()
1.160
1.161 for docnum, positions in doc_positions:
1.162 if first_docnum is None: