1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/iixr.py Wed Aug 19 21:49:28 2009 +0200
1.3 @@ -0,0 +1,204 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +A simple (and sane) text indexing library.
1.8 +"""
1.9 +
1.10 +# Foundation classes.
1.11 +
class File:

    "A thin wrapper around an underlying file object."

    def __init__(self, f):

        "Initialise the wrapper with the file object 'f' and reset its state."

        self.f = f
        self.reset()

    def reset(self):

        """
        Reset any state held for the file. This base implementation holds no
        state and does nothing; subclasses may override it.
        """

        return None

    def close(self):

        "Close the underlying file object."

        self.f.close()
1.25 +
class FileWriter(File):

    "Writing basic data types to files."

    def write_number(self, number):

        """
        Write 'number' to the file using a variable length encoding: a single
        length byte followed by that many value bytes, least significant byte
        first. Zero is written as a length of one followed by a zero byte.

        Raise ValueError if 'number' is negative or needs more than 255 bytes.
        """

        # Negative numbers are not supported.

        if number < 0:
            raise ValueError("Number %r is negative." % number)

        # Special case: one byte containing zero.

        if number == 0:
            self.f.write(chr(1) + chr(0))
            return

        # Collect the digits from least to most significant, consuming a copy
        # so that the original value remains available for error reporting.

        remaining = number
        digits = []

        while remaining != 0:
            digits.append(chr(remaining & 255))
            remaining = remaining >> 8

        # Too large numbers are not supported: the count of digits must itself
        # fit in the single length byte.

        if len(digits) > 255:
            raise ValueError("Number %r is too large." % number)

        # Emit the length byte followed by the digits as one record.

        self.f.write(chr(len(digits)) + "".join(digits))
1.64 +
class FileReader(File):

    "Reading basic data types from files."

    def read_number(self):

        """
        Read a number from the file: a single length byte followed by that many
        value bytes, least significant byte first. Return the decoded number.
        """

        # The first byte gives the number of bytes making up the value.

        length = ord(self.f.read(1))
        data = self.f.read(length)

        # Add each byte at its own position, least significant first.

        total = 0

        for index in range(length):
            total += ord(data[index]) << (index * 8)

        return total
1.90 +
1.91 +# Specific classes.
1.92 +
class PositionWriter(FileWriter):

    "Writing position information to files."

    def reset(self):

        "Forget the last document number so that deltas start again from zero."

        self.last_docnum = 0

    def write_positions(self, docnum, positions):

        """
        Write a record for the document 'docnum' holding the given 'positions':
        the document number as a delta from the previously written one, the
        number of positions, and then each position as a delta from the one
        before it.

        Raise ValueError if 'docnum' is less than the previous document number.
        """

        previous = self.last_docnum

        if docnum < previous:
            raise ValueError("Document number %r is less than previous number %r." % (docnum, previous))

        # Document number delta, then the count of positions.

        self.write_number(docnum - previous)
        self.write_number(len(positions))

        # Each position is stored relative to the one preceding it.

        preceding = 0

        for position in positions:
            self.write_number(position - preceding)
            preceding = position

        # Only remember the document once its record has been written.

        self.last_docnum = docnum

    def write_all_positions(self, doc_positions):

        """
        Write all 'doc_positions' - a collection of tuples of the form (document
        number, position list) - to the file, preceded by the number of tuples.
        Return the file offset at which the data starts.
        """

        # Start from a clean state and note where the data begins.

        self.reset()
        start = self.f.tell()

        # The number of documents precedes the individual records.

        self.write_number(len(doc_positions))

        for entry in doc_positions:
            docnum, positions = entry
            self.write_positions(docnum, positions)

        return start
1.148 +
class PositionReader(FileReader):

    "Reading position information from files."

    def reset(self):

        "Forget the last document number so that deltas start again from zero."

        self.last_docnum = 0

    def read_positions(self):

        """
        Read one record, returning a tuple of the document number and the list
        of absolute positions for that document.
        """

        # The stored document number is a delta from the previous one.

        self.last_docnum += self.read_number()

        # The count of positions comes next, followed by the position deltas,
        # which are accumulated to recover the absolute positions.

        count = self.read_number()
        positions = []
        current = 0

        for _ in range(count):
            current += self.read_number()
            positions.append(current)

        return self.last_docnum, positions

    def read_all_positions(self, offset):

        """
        Seek to 'offset' in the file and read all position records stored
        there, returning a list of (document number, position list) tuples.
        """

        self.reset()
        self.f.seek(offset)

        # The number of documents precedes the individual records.

        ndocuments = self.read_number()

        return [self.read_positions() for _ in range(ndocuments)]
1.206 +
1.207 +# vim: tabstop=4 expandtab shiftwidth=4