1.1 --- a/iixr/data.py Fri Sep 18 01:39:08 2009 +0200
1.2 +++ b/iixr/data.py Fri Sep 18 21:07:35 2009 +0200
1.3 @@ -18,6 +18,8 @@
1.4 with this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from array import array
1.8 +
1.9 vint_cache = {}
1.10
1.11 def vint(number):
1.12 @@ -31,23 +33,22 @@
1.13
1.14 # Write the number from least to most significant digits.
1.15
1.16 - bytes = []
1.17 + bytes = array('B')
1.18
1.19 while number > 127:
1.20 - lsd = number & 127 | 128
1.21 - bytes.append(chr(lsd))
1.22 + bytes.append(number & 127 | 128)
1.23 number = number >> 7
1.24 else:
1.25 - bytes.append(chr(number))
1.26 + bytes.append(number)
1.27
1.28 - return "".join(bytes)
1.29 + return bytes.tostring()
1.30
1.31 # Negative numbers are not supported.
1.32
1.33 else:
1.34 raise ValueError, "Number %r is negative." % number
1.35
1.36 -for i in xrange(0, 1024):
1.37 +for i in xrange(0, 65536):
1.38 vint_cache[i] = vint(i)
1.39
1.40 # vim: tabstop=4 expandtab shiftwidth=4
2.1 --- a/iixr/fields.py Fri Sep 18 01:39:08 2009 +0200
2.2 +++ b/iixr/fields.py Fri Sep 18 21:07:35 2009 +0200
2.3 @@ -33,11 +33,8 @@
2.4 """
2.5 Write for the given 'docnum', a list of 'fields' (integer, string pairs
2.6 representing field identifiers and values respectively).
2.7 - Return the offset at which the fields are stored.
2.8 """
2.9
2.10 - offset = self.f.tell()
2.11 -
2.12 # Write the document number delta.
2.13
2.14 self.write_number(docnum - self.last_docnum)
2.15 @@ -53,7 +50,6 @@
2.16 self.write_string(field, 1) # compress
2.17
2.18 self.last_docnum = docnum
2.19 - return offset
2.20
2.21 class FieldReader(FileReader):
2.22
2.23 @@ -159,10 +155,12 @@
2.24
2.25 "Write details of the document with the given 'docnum' and 'fields'."
2.26
2.27 - offset = self.field_writer.write_fields(docnum, fields)
2.28 -
2.29 if self.entry % self.interval == 0:
2.30 + offset = self.field_writer.f.tell()
2.31 + self.field_writer.write_fields(docnum, fields)
2.32 self.field_index_writer.write_document(docnum, offset)
2.33 + else:
2.34 + self.field_writer.write_fields(docnum, fields)
2.35
2.36 self.entry += 1
2.37
3.1 --- a/iixr/files.py Fri Sep 18 01:39:08 2009 +0200
3.2 +++ b/iixr/files.py Fri Sep 18 21:07:35 2009 +0200
3.3 @@ -105,23 +105,25 @@
3.4
3.5 # Read each byte, adding it to the number.
3.6
3.7 - shift = 0
3.8 - number = 0
3.9 read = self.f.read
3.10
3.11 - try:
3.12 - csd = ord(read(1))
3.13 - while csd & 128:
3.14 - number += ((csd & 127) << shift)
3.15 - shift += 7
3.16 - csd = ord(read(1))
3.17 + c = read(1)
3.18 + if c:
3.19 + csd = ord(c)
3.20 + if csd < 128:
3.21 + return csd
3.22 else:
3.23 - number += (csd << shift)
3.24 - except TypeError:
3.25 + shift = 0
3.26 + number = 0
3.27 + while csd & 128:
3.28 + number += ((csd & 127) << shift)
3.29 + shift += 7
3.30 + csd = ord(read(1))
3.31 + else:
3.32 + return number + (csd << shift)
3.33 + else:
3.34 raise EOFError
3.35
3.36 - return number
3.37 -
3.38 def read_string(self, decompress=0):
3.39
3.40 """
4.1 --- a/iixr/positions.py Fri Sep 18 01:39:08 2009 +0200
4.2 +++ b/iixr/positions.py Fri Sep 18 21:07:35 2009 +0200
4.3 @@ -32,16 +32,11 @@
4.4
4.5 """
4.6 Write for the document 'docnum' the given 'positions'.
4.7 - Return the offset of the written record.
4.8 """
4.9
4.10 if docnum < self.last_docnum:
4.11 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
4.12
4.13 - # Record the offset of this record.
4.14 -
4.15 - offset = self.f.tell()
4.16 -
4.17 # Make sure that the positions are sorted.
4.18
4.19 positions.sort()
4.20 @@ -63,7 +58,6 @@
4.21 self.f.write("".join([vint(x) for x in output]))
4.22
4.23 self.last_docnum = docnum
4.24 - return offset
4.25
4.26 class PositionOpener(FileOpener):
4.27
4.28 @@ -97,10 +91,6 @@
4.29 position index file.
4.30 """
4.31
4.32 - # Record the offset of this record.
4.33 -
4.34 - offset = self.f.tell()
4.35 -
4.36 # Write the document number delta.
4.37 # Write the position file offset delta.
4.38 # Write the document count.
4.39 @@ -113,7 +103,6 @@
4.40
4.41 self.last_pos_offset = pos_offset
4.42 self.last_docnum = docnum
4.43 - return offset
4.44
4.45 class PositionIndexOpener(FileOpener):
4.46
4.47 @@ -274,57 +263,52 @@
4.48 self.position_writer.reset()
4.49 self.position_index_writer.reset()
4.50
4.51 - index_offset = None
4.52 + # Remember the first index entry offset.
4.53 +
4.54 + index_offset = self.position_index_writer.f.tell()
4.55
4.56 # Write the positions.
4.57
4.58 frequency = 0
4.59 - first_docnum = None
4.60 - first_offset = None
4.61 count = 0
4.62
4.63 - doc_positions.sort()
4.64 -
4.65 - for docnum, positions in doc_positions:
4.66 - pos_offset = self.position_writer.write_positions(docnum, positions)
4.67 + if doc_positions:
4.68
4.69 # Retain the first record offset for a subsequent index entry.
4.70
4.71 - if first_offset is None:
4.72 - first_offset = pos_offset
4.73 - first_docnum = docnum
4.74 + first_offset = self.position_writer.f.tell()
4.75 + first_docnum = None
4.76
4.77 - frequency += len(positions)
4.78 - count += 1
4.79 + doc_positions.sort()
4.80
4.81 - # Every {interval} entries, write an index entry.
4.82 -
4.83 - if count % self.interval == 0:
4.84 - io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
4.85 + for docnum, positions in doc_positions:
4.86 + if first_docnum is None:
4.87 + first_docnum = docnum
4.88
4.89 - # Remember the first index entry offset.
4.90 + self.position_writer.write_positions(docnum, positions)
4.91
4.92 - if index_offset is None:
4.93 - index_offset = io
4.94 + frequency += len(positions)
4.95 + count += 1
4.96 +
4.97 + # Every {interval} entries, write an index entry.
4.98
4.99 - first_offset = None
4.100 - first_docnum = None
4.101 + if count % self.interval == 0:
4.102
4.103 - # Reset the position writer so that position readers accessing
4.104 - # a section start with the correct document number.
4.105 + self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
4.106
4.107 - self.position_writer.reset()
4.108 + first_offset = self.position_writer.f.tell()
4.109 + first_docnum = None
4.110
4.111 - # Finish writing an index entry for the remaining documents.
4.112 + # Reset the position writer so that position readers accessing
4.113 + # a section start with the correct document number.
4.114 +
4.115 + self.position_writer.reset()
4.116
4.117 - else:
4.118 - if first_offset is not None:
4.119 - io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
4.120 + # Finish writing an index entry for the remaining documents.
4.121
4.122 - # Remember the first index entry offset.
4.123 -
4.124 - if index_offset is None:
4.125 - index_offset = io
4.126 + else:
4.127 + if first_docnum is not None:
4.128 + self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
4.129
4.130 return index_offset, frequency, count
4.131
5.1 --- a/iixr/terms.py Fri Sep 18 01:39:08 2009 +0200
5.2 +++ b/iixr/terms.py Fri Sep 18 21:07:35 2009 +0200
5.3 @@ -36,8 +36,7 @@
5.4 """
5.5 Write the given 'term', its position file 'offset', its 'frequency' and
5.6 its 'doc_frequency' (number of documents in which it appears) to the
5.7 - term information file. Return the offset after the term information was
5.8 - written to the file.
5.9 + term information file.
5.10 """
5.11
5.12 # Write the prefix length and term suffix.
5.13 @@ -63,8 +62,6 @@
5.14 self.last_term = term
5.15 self.last_offset = offset
5.16
5.17 - return self.f.tell()
5.18 -
5.19 class TermReader(FileReader):
5.20
5.21 "Reading term information from files."
5.22 @@ -179,9 +176,10 @@
5.23 written to the file.
5.24 """
5.25
5.26 - info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
5.27 + self.info_writer.write_term(term, offset, frequency, doc_frequency)
5.28
5.29 if self.entry % self.interval == 0:
5.30 + info_offset = self.info_writer.f.tell()
5.31 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
5.32
5.33 self.entry += 1
6.1 --- a/test.py Fri Sep 18 01:39:08 2009 +0200
6.2 +++ b/test.py Fri Sep 18 21:07:35 2009 +0200
6.3 @@ -94,9 +94,9 @@
6.4 doc_frequency = 0
6.5 w.reset()
6.6 for docnum, pos_offset, count in term_positions:
6.7 - io = w.write_positions(docnum, pos_offset, count)
6.8 if offset is None:
6.9 - offset = io
6.10 + offset = w.f.tell()
6.11 + w.write_positions(docnum, pos_offset, count)
6.12 doc_frequency += count
6.13 offsets.append((offset, doc_frequency))
6.14 w.close()