Introduced conditional compression for fields: bzip2 and then zlib are tried, the first result shorter than the original is stored, and a one-character flag written before each field records which compressor (if any) was used. Added an IndexReader class to encapsulate all reading operations (using term and field dictionaries). Added field-related file operations to the IndexWriter class. Added field-related file initialisation to the Index class. Changed the field index format to use offset deltas.

     1.1 --- a/iixr.py	Sat Aug 29 02:15:29 2009 +0200
     1.2 +++ b/iixr.py	Sat Aug 29 21:15:47 2009 +0200
     1.3 @@ -22,12 +22,15 @@
     1.4  from os.path import exists, join
     1.5  from os.path import commonprefix # to find common string prefixes
     1.6  from bisect import bisect_right  # to find terms in the dictionary index
     1.7 -import bz2                       # for field compression
     1.8 +import bz2, zlib                 # for field compression
     1.9  
    1.10  # Constants.
    1.11  
    1.12  INTERVAL = 100
    1.13  
    1.14 +compressors = [("b", bz2.compress), ("z", zlib.compress)]
    1.15 +decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
    1.16 +
    1.17  # Foundation classes.
    1.18  
    1.19  class File:
    1.20 @@ -94,7 +97,20 @@
    1.21          # Compress the string if requested.
    1.22  
    1.23          if compress:
    1.24 -            s = bz2.compress(s)
    1.25 +            for flag, fn in compressors:
    1.26 +                cs = fn(s)
    1.27 +
    1.28 +                # Take the first string shorter than the original.
    1.29 +
    1.30 +                if len(cs) < len(s):
    1.31 +                    s = cs
    1.32 +                    break
    1.33 +            else:
    1.34 +                flag = "-"
    1.35 +
    1.36 +            # Record whether compression was used.
    1.37 +
    1.38 +            self.f.write(flag)
    1.39  
    1.40          # Write the length of the data before the data itself.
    1.41  
    1.42 @@ -137,13 +153,21 @@
    1.43          'decompress' is set to a true value.
    1.44          """
    1.45  
    1.46 +        # Read the compression flag if decompression is requested.
    1.47 +
    1.48 +        if decompress:
    1.49 +            flag = self.f.read(1)
    1.50 +        else:
    1.51 +            flag = "-"
    1.52 +
    1.53          length = self.read_number()
    1.54          s = self.f.read(length)
    1.55  
    1.56 -        # Decompress the data if requested.
    1.57 +        # Perform decompression if applicable.
    1.58  
    1.59 -        if decompress:
    1.60 -            s = bz2.decompress(s)
    1.61 +        if flag != "-":
    1.62 +            fn = decompressors[flag]
    1.63 +            s = fn(s)
    1.64  
    1.65          # Convert strings to Unicode objects.
    1.66  
    1.67 @@ -532,7 +556,7 @@
    1.68          # Write the fields themselves.
    1.69  
    1.70          for field in fields:
    1.71 -            self.write_string(field, 0) # compress
    1.72 +            self.write_string(field, 1) # compress
    1.73  
    1.74          self.last_docnum = docnum
    1.75          return offset
    1.76 @@ -565,7 +589,7 @@
    1.77          i = 0
    1.78  
    1.79          while i < nfields:
    1.80 -            fields.append(self.read_string(0)) # decompress
    1.81 +            fields.append(self.read_string(1)) # decompress
    1.82              i += 1
    1.83  
    1.84          return self.last_docnum, fields
    1.85 @@ -589,6 +613,7 @@
    1.86  
    1.87      def reset(self):
    1.88          self.last_docnum = 0
    1.89 +        self.last_offset = 0
    1.90  
    1.91      def write_document(self, docnum, offset):
    1.92  
    1.93 @@ -597,12 +622,13 @@
    1.94          document are stored in the fields file.
    1.95          """
    1.96  
    1.97 -        # Write the document number delta and offset.
    1.98 +        # Write the document number and offset deltas.
    1.99  
   1.100          self.write_number(docnum - self.last_docnum)
   1.101 -        self.write_number(offset)
   1.102 +        self.write_number(offset - self.last_offset)
   1.103  
   1.104          self.last_docnum = docnum
   1.105 +        self.last_offset = offset
   1.106  
   1.107  class FieldIndexReader(FileReader):
   1.108  
   1.109 @@ -610,6 +636,7 @@
   1.110  
   1.111      def reset(self):
   1.112          self.last_docnum = 0
   1.113 +        self.last_offset = 0
   1.114  
   1.115      def read_document(self):
   1.116  
   1.117 @@ -618,9 +645,9 @@
   1.118          # Read the document number delta and offset.
   1.119  
   1.120          self.last_docnum += self.read_number()
   1.121 -        offset = self.read_number()
   1.122 +        self.last_offset += self.read_number()
   1.123  
   1.124 -        return self.last_docnum, offset
   1.125 +        return self.last_docnum, self.last_offset
   1.126  
   1.127  class FieldDictionaryWriter:
   1.128  
   1.129 @@ -706,11 +733,15 @@
   1.130  
   1.131  class IndexWriter:
   1.132  
   1.133 -    "Building term information and writing it to the term dictionary."
   1.134 +    """
   1.135 +    Building term information and writing it to the term and field dictionaries.
   1.136 +    """
   1.137  
   1.138 -    def __init__(self, dict_writer):
   1.139 +    def __init__(self, dict_writer, field_dict_writer):
   1.140          self.dict_writer = dict_writer
   1.141 +        self.field_dict_writer = field_dict_writer
   1.142          self.terms = {}
   1.143 +        self.docs = {}
   1.144  
   1.145      def add_position(self, term, docnum, position):
   1.146  
   1.147 @@ -731,6 +762,15 @@
   1.148  
   1.149          doc.append(position)
   1.150  
   1.151 +    def add_fields(self, docnum, fields):
   1.152 +
   1.153 +        "Add for the document with the given 'docnum' a list of 'fields'."
   1.154 +
   1.155 +        if not self.docs.has_key(docnum):
   1.156 +            doc_fields = self.docs[docnum] = fields
   1.157 +        else:
   1.158 +            self.docs[docnum] += fields
   1.159 +
   1.160      def close(self):
   1.161          if self.dict_writer is None:
   1.162              return
   1.163 @@ -748,6 +788,35 @@
   1.164          self.dict_writer.close()
   1.165          self.dict_writer = None
   1.166  
   1.167 +        # Get the documents in order.
   1.168 +
   1.169 +        docs = self.docs.items()
   1.170 +        docs.sort()
   1.171 +
   1.172 +        for docnum, fields in docs:
   1.173 +            self.field_dict_writer.write_fields(docnum, fields)
   1.174 +
   1.175 +        self.field_dict_writer.close()
   1.176 +        self.field_dict_writer = None
   1.177 +
   1.178 +class IndexReader:
   1.179 +
   1.180 +    "Accessing the term and field dictionaries."
   1.181 +
   1.182 +    def __init__(self, dict_reader, field_dict_reader):
   1.183 +        self.dict_reader = dict_reader
   1.184 +        self.field_dict_reader = field_dict_reader
   1.185 +
   1.186 +    def find_positions(self, term):
   1.187 +        return self.dict_reader.find_positions(term)
   1.188 +
   1.189 +    def get_fields(self, docnum):
   1.190 +        return self.field_dict_reader.read_fields(docnum)
   1.191 +
   1.192 +    def close(self):
   1.193 +        self.dict_reader.close()
   1.194 +        self.field_dict_reader.close()
   1.195 +
   1.196  class Index:
   1.197  
   1.198      "An inverted index solution encapsulating the various components."
   1.199 @@ -775,7 +844,15 @@
   1.200  
   1.201          dict_writer = TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)
   1.202  
   1.203 -        self.writer = IndexWriter(dict_writer)
   1.204 +        ff = open(join(self.pathname, "fields"), "wb")
   1.205 +        field_writer = FieldWriter(ff)
   1.206 +
   1.207 +        fif = open(join(self.pathname, "fields_index"), "wb")
   1.208 +        field_index_writer = FieldIndexWriter(fif)
   1.209 +
   1.210 +        field_dict_writer = FieldDictionaryWriter(field_writer, field_index_writer, interval)
   1.211 +
   1.212 +        self.writer = IndexWriter(dict_writer, field_dict_writer)
   1.213          return self.writer
   1.214  
   1.215      def get_reader(self):
   1.216 @@ -794,7 +871,17 @@
   1.217          tpf = open(join(self.pathname, "positions"), "rb")
   1.218          positions_reader = PositionReader(tpf)
   1.219  
   1.220 -        self.reader = TermDictionaryReader(info_reader, index_reader, positions_reader)
   1.221 +        dict_reader = TermDictionaryReader(info_reader, index_reader, positions_reader)
   1.222 +
   1.223 +        ff = open(join(self.pathname, "fields"), "rb")
   1.224 +        field_reader = FieldReader(ff)
   1.225 +
   1.226 +        fif = open(join(self.pathname, "fields_index"), "rb")
   1.227 +        field_index_reader = FieldIndexReader(fif)
   1.228 +
   1.229 +        field_dict_reader = FieldDictionaryReader(field_reader, field_index_reader)
   1.230 +
   1.231 +        self.reader = IndexReader(dict_reader, field_dict_reader)
   1.232          return self.reader
   1.233  
   1.234      def close(self):

     2.1 --- a/test.py	Sat Aug 29 02:15:29 2009 +0200
     2.2 +++ b/test.py	Sat Aug 29 21:15:47 2009 +0200
     2.3 @@ -285,12 +285,16 @@
     2.4  for docnum, text in docs:
     2.5      for position, term in enumerate(text.split()):
     2.6          wi.add_position(term, docnum, position)
     2.7 +    wi.add_fields(docnum, [text])
     2.8  wi.close()
     2.9  
    2.10  rd = index.get_reader()
    2.11  for term, doc_positions in doc_tests:
    2.12      dp = rd.find_positions(term)
    2.13      print doc_positions == dp, doc_positions, dp
    2.14 +for docnum, text in docs:
    2.15 +    df = rd.get_fields(docnum)
    2.16 +    print text == df[0], text, df[0]
    2.17  index.close()
    2.18  
    2.19  # vim: tabstop=4 expandtab shiftwidth=4
2009-08-29	Paul Boddie	raw files shortlog changelog graph	Introduced conditional compression for fields using bzip2 and zlib compression. Added an IndexReader class to encapsulate all reading operations (using term and field dictionaries). Added field-related file operations to the IndexWriter class. Added field-related file initialisation to the Index class. Changed the field index format to use offset deltas.
			iixr.py (file) test.py (file)