1.1 --- a/iixr.py Thu Aug 27 20:52:48 2009 +0200
1.2 +++ b/iixr.py Fri Aug 28 01:15:17 2009 +0200
1.3 @@ -22,6 +22,7 @@
1.4 from os.path import exists, join
1.5 from os.path import commonprefix # to find common string prefixes
1.6 from bisect import bisect_right # to find terms in the dictionary index
1.7 +import bz2 # for field compression
1.8
1.9 # Constants.
1.10
1.11 @@ -78,20 +79,26 @@
1.12 record = "".join(bytes)
1.13 self.f.write(record)
1.14
1.15 - def write_string(self, s):
1.16 + def write_string(self, s, compress=0):
1.17
1.18 - "Write 's' to the file, recording its length."
1.19 + """
1.20 + Write 's' to the file, recording its length and compressing the string
1.21 + if 'compress' is set to a true value.
1.22 + """
1.23
1.24 # Convert Unicode objects to strings.
1.25
1.26 if isinstance(s, unicode):
1.27 s = s.encode("utf-8")
1.28
1.29 - length = len(s)
1.30 + # Compress the string if requested.
1.31
1.32 - if not (0 <= length <= 255):
1.33 - raise ValueError, "String %r is too long." % s
1.34 + if compress:
1.35 + s = bz2.compress(s)
1.36
1.37 + # Write the length of the data before the data itself.
1.38 +
1.39 + length = len(s)
1.40 self.write_number(length)
1.41 self.f.write(s)
1.42
1.43 @@ -123,15 +130,24 @@
1.44
1.45 return number
1.46
1.47 - def read_string(self):
1.48 + def read_string(self, decompress=0):
1.49
1.50 - "Read a string from the file."
1.51 + """
1.52 + Read a string from the file, decompressing the stored data if
1.53 + 'decompress' is set to a true value.
1.54 + """
1.55
1.56 length = self.read_number()
1.57 + s = self.f.read(length)
1.58 +
1.59 + # Decompress the data if requested.
1.60 +
1.61 + if decompress:
1.62 + s = bz2.decompress(s)
1.63
1.64 # Convert strings to Unicode objects.
1.65
1.66 - return unicode(self.f.read(length), "utf-8")
1.67 + return unicode(s, "utf-8")
1.68
1.69 # Specific classes.
1.70
1.71 @@ -484,6 +500,62 @@
1.72 self.index_reader.close()
1.73 self.position_reader.close()
1.74
1.75 +class FieldWriter(FileWriter):
1.76 +
1.77 + "Writing field data to files."
1.78 +
1.79 + def write_fields(self, fields):
1.80 +
1.81 + """
1.82 + Write the given list of 'fields' (strings representing field values).
1.83 + Return the offset at which the fields are stored.
1.84 + """
1.85 +
1.86 + offset = self.f.tell()
1.87 +
1.88 + # Write the number of fields.
1.89 +
1.90 + self.write_number(len(fields))
1.91 +
1.92 + # Write the fields themselves.
1.93 +
1.94 + for field in fields:
1.95 + self.write_string(field, 0) # compress flag (currently off)
1.96 +
1.97 + return offset
1.98 +
1.99 +class FieldReader(FileReader):
1.100 +
1.101 + "Reading field data from files."
1.102 +
1.103 + def read_fields(self):
1.104 +
1.105 + "Read fields from the file, returning the field values in a list."
1.106 +
1.107 + # Read the number of fields.
1.108 +
1.109 + nfields = self.read_number()
1.110 +
1.111 + # Collect the fields.
1.112 +
1.113 + fields = []
1.114 + i = 0
1.115 +
1.116 + while i < nfields:
1.117 + fields.append(self.read_string(0)) # decompress flag (currently off)
1.118 + i += 1
1.119 +
1.120 + return fields
1.121 +
1.122 + def read_doc_fields(self, offset):
1.123 +
1.124 + "Read all fields at the given 'offset'."
1.125 +
1.126 + self.f.seek(offset)
1.127 + return self.read_fields()
1.128 +
1.129 +# High-level classes.
1.130 +
1.131 class IndexWriter:
1.132
1.133 "Building term information and writing it to the term dictionary."