1.1 --- a/iixr.py Thu Aug 27 20:52:48 2009 +0200
1.2 +++ b/iixr.py Fri Aug 28 01:15:17 2009 +0200
1.3 @@ -22,6 +22,7 @@
1.4 from os.path import exists, join
1.5 from os.path import commonprefix # to find common string prefixes
1.6 from bisect import bisect_right # to find terms in the dictionary index
1.7 +import bz2 # for field compression
1.8
1.9 # Constants.
1.10
1.11 @@ -78,20 +79,26 @@
1.12 record = "".join(bytes)
1.13 self.f.write(record)
1.14
1.15 - def write_string(self, s):
1.16 + def write_string(self, s, compress=0):
1.17
1.18 - "Write 's' to the file, recording its length."
1.19 + """
1.20 + Write 's' to the file, recording its length and compressing the string
1.21 + if 'compress' is set to a true value.
1.22 + """
1.23
1.24 # Convert Unicode objects to strings.
1.25
1.26 if isinstance(s, unicode):
1.27 s = s.encode("utf-8")
1.28
1.29 - length = len(s)
1.30 + # Compress the string if requested.
1.31
1.32 - if not (0 <= length <= 255):
1.33 - raise ValueError, "String %r is too long." % s
1.34 + if compress:
1.35 + s = bz2.compress(s)
1.36
1.37 + # Write the length of the data before the data itself.
1.38 +
1.39 + length = len(s)
1.40 self.write_number(length)
1.41 self.f.write(s)
1.42
1.43 @@ -123,15 +130,24 @@
1.44
1.45 return number
1.46
1.47 - def read_string(self):
1.48 + def read_string(self, decompress=0):
1.49
1.50 - "Read a string from the file."
1.51 + """
1.52 + Read a string from the file, decompressing the stored data if
1.53 + 'decompress' is set to a true value.
1.54 + """
1.55
1.56 length = self.read_number()
1.57 + s = self.f.read(length)
1.58 +
1.59 + # Decompress the data if requested.
1.60 +
1.61 + if decompress:
1.62 + s = bz2.decompress(s)
1.63
1.64 # Convert strings to Unicode objects.
1.65
1.66 - return unicode(self.f.read(length), "utf-8")
1.67 + return unicode(s, "utf-8")
1.68
1.69 # Specific classes.
1.70
1.71 @@ -484,6 +500,62 @@
1.72 self.index_reader.close()
1.73 self.position_reader.close()
1.74
1.75 +class FieldWriter(FileWriter):
1.76 +
1.77 + "Writing field data to files."
1.78 +
1.79 + def write_fields(self, fields):
1.80 +
1.81 + """
1.82 + Write the given list of 'fields' (strings representing field values).
1.83 + Return the offset at which the fields are stored.
1.84 + """
1.85 +
1.86 + offset = self.f.tell()
1.87 +
1.88 + # Write the number of fields.
1.89 +
1.90 + self.write_number(len(fields))
1.91 +
1.92 + # Write the fields themselves.
1.93 +
1.94 + for field in fields:
1.95 + self.write_string(field, 0) # compress flag: 0 stores the field uncompressed
1.96 +
1.97 + return offset
1.98 +
1.99 +class FieldReader(FileReader):
1.100 +
1.101 + "Reading field data from files."
1.102 +
1.103 + def read_fields(self):
1.104 +
1.105 + "Read fields from the file, returning the field values in a list."
1.106 +
1.107 + # Read the number of fields.
1.108 +
1.109 + nfields = self.read_number()
1.110 +
1.111 + # Collect the fields.
1.112 +
1.113 + fields = []
1.114 + i = 0
1.115 +
1.116 + while i < nfields:
1.117 + fields.append(self.read_string(0)) # decompress flag: 0 reads the field as stored
1.118 + i += 1
1.119 +
1.120 + return fields
1.121 +
1.122 + def read_doc_fields(self, offset):
1.123 +
1.124 + "Read all fields at the given 'offset'."
1.125 +
1.126 + self.f.seek(offset)
1.127 + return self.read_fields()
1.128 +
1.129 +# High-level classes.
1.130 +
1.131 class IndexWriter:
1.132
1.133 "Building term information and writing it to the term dictionary."
2.1 --- a/test.py Thu Aug 27 20:52:48 2009 +0200
2.2 +++ b/test.py Fri Aug 28 01:15:17 2009 +0200
2.3 @@ -64,6 +64,30 @@
2.4 print doc_positions == dp, doc_positions, dp
2.5 r.close()
2.6
2.7 +doc_fields = [
2.8 + ["testing", "fields", "stored", "compressed"],
2.9 + ["fields", "for a second", "document"]
2.10 + ]
2.11 +
2.12 +f = open("testF", "wb")
2.13 +w = iixr.FieldWriter(f)
2.14 +offsets = []
2.15 +for fields in doc_fields:
2.16 + offsets.append(w.write_fields(fields))
2.17 +w.close()
2.18 +
2.19 +f = open("testF", "rb")
2.20 +r = iixr.FieldReader(f)
2.21 +for fields in doc_fields:
2.22 + df = r.read_fields()
2.23 + print fields == df, fields, df
2.24 +offsets.reverse()
2.25 +doc_fields.reverse()
2.26 +for offset, fields in zip(offsets, doc_fields):
2.27 + df = r.read_doc_fields(offset)
2.28 + print fields == df, fields, df
2.29 +r.close()
2.30 +
2.31 terms = [
2.32 ("aardvark", 100000123),
2.33 ("anteater", 100000456),