1.1 --- a/iixr.py Tue Aug 25 22:10:37 2009 +0200
1.2 +++ b/iixr.py Tue Aug 25 22:44:15 2009 +0200
1.3 @@ -18,6 +18,8 @@
1.4 with this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from os.path import commonprefix # to find common string prefixes
1.8 +
1.9 # Foundation classes.
1.10
1.11 class File:
1.12 @@ -73,6 +75,27 @@
1.13 record = "".join(bytes)
1.14 self.f.write(record)
1.15
def write_unsigned_byte(self, number):

    """
    Write 'number' to the file using a single byte.

    A ValueError is raised if 'number' lies outside the range 0 to 255
    inclusive, since such a value cannot be represented in one byte.
    """

    if not (0 <= number <= 255):

        # The call form of raise is accepted by both Python 2 and Python 3,
        # unlike the "raise Class, message" statement form.

        raise ValueError("Number %r is out of range." % number)

    self.f.write(chr(number))
1.24 +
def write_string(self, s):

    """
    Write 's' to the file, recording its length.

    The length is written first as a single unsigned byte, followed by the
    string itself. A ValueError is raised if 's' is longer than 255
    characters, since its length could not then be stored in one byte.
    """

    length = len(s)

    if not (0 <= length <= 255):

        # The call form of raise is accepted by both Python 2 and Python 3,
        # unlike the "raise Class, message" statement form.

        raise ValueError("String %r is too long." % s)

    self.write_unsigned_byte(length)
    self.f.write(s)
1.36 +
1.37 class FileReader(File):
1.38
1.39 "Reading basic data types from files."
1.40 @@ -99,6 +122,19 @@
1.41
1.42 return number
1.43
def read_unsigned_byte(self):

    """
    Consume exactly one byte from the file and return its numeric value.

    If the read yields no data, ord() raises a TypeError.
    """

    byte = self.f.read(1)
    return ord(byte)
1.49 +
def read_string(self):

    """
    Read a string from the file and return it.

    The string's length is read first as a single unsigned byte, and that
    many characters are then requested from the file.
    """

    size = self.read_unsigned_byte()
    data = self.f.read(size)
    return data
1.56 +
1.57 # Specific classes.
1.58
1.59 class PositionWriter(FileWriter):
1.60 @@ -215,4 +251,66 @@
1.61
1.62 return doc_positions
1.63
class TermWriter(FileWriter):

    "Writing term information to files."

    def reset(self):

        """
        Forget any previously written term and offset, so that the next
        entry is encoded relative to an empty term and a zero offset.
        """

        # NOTE(review): presumably invoked by the base class when the writer
        # is set up - confirm against the File class.

        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and its position file 'offset' to the term
        information file.

        Each entry consists of the length of the prefix shared with the
        previously written term (a single byte), the remaining suffix of the
        term (as a length-prefixed string), and the difference between
        'offset' and the previously written offset.

        A ValueError is raised if 'term' is longer than 255 characters.
        """

        # Too long terms are not currently supported: the shared prefix
        # length has to fit into a single byte.

        if len(term) > 255:

            # The call form of raise is accepted by both Python 2 and
            # Python 3, unlike the "raise Class, message" statement form.

            raise ValueError("Term %r is too long." % term)

        # Write the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))
        suffix = term[common:]

        self.write_unsigned_byte(common)
        self.write_string(suffix)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)

        # Remember this entry as the basis for encoding the next one.

        self.last_term = term
        self.last_offset = offset
1.98 +
class TermReader(FileReader):

    "Reading term information from files."

    def reset(self):

        """
        Forget any previously read term and offset, so that the next entry
        is decoded relative to an empty term and a zero offset.
        """

        self.last_offset = 0
        self.last_term = ""

    def read_term(self):

        """
        Read a term and its position file offset from the term information
        file, returning them as a (term, offset) tuple.

        The term is rebuilt from the leading characters of the previously
        read term plus the suffix stored in the file. The offset is rebuilt
        by adding the stored delta to the previously read offset.
        """

        # Obtain the shared prefix length and the stored suffix.

        prefix_length = self.read_unsigned_byte()
        tail = self.read_string()

        # Rebuild the full term before the offset delta is consumed.

        head = self.last_term[:prefix_length]
        self.last_term = head + tail

        # Apply the stored delta to the running offset.

        self.last_offset = self.last_offset + self.read_number()

        return (self.last_term, self.last_offset)
1.125 +
1.126 # vim: tabstop=4 expandtab shiftwidth=4