Added field dictionary and field index readers and writers. Renamed various internal methods. Added document number deltas to field collections in order to support scanning for documents.

     1.1 --- a/iixr.py	Fri Aug 28 01:15:17 2009 +0200
     1.2 +++ b/iixr.py	Sat Aug 29 02:15:29 2009 +0200
     1.3 @@ -149,7 +149,7 @@
     1.4  
     1.5          return unicode(s, "utf-8")
     1.6  
     1.7 -# Specific classes.
     1.8 +# Specific classes for storing term and position information.
     1.9  
    1.10  class PositionWriter(FileWriter):
    1.11  
    1.12 @@ -336,7 +336,10 @@
    1.13  
    1.14      def go_to_term(self, term, offset, info_offset):
    1.15  
    1.16 -        "Seek past the entry for 'term' having 'offset' to 'info_offset'."
    1.17 +        """
    1.18 +        Seek past the entry for 'term' having 'offset' to 'info_offset'. This
    1.19 +        permits the scanning for later terms from the specified term.
    1.20 +        """
    1.21  
    1.22          self.f.seek(info_offset)
    1.23          self.last_term = term
    1.24 @@ -399,7 +402,7 @@
    1.25          self.interval = interval
    1.26          self.entry = 0
    1.27  
    1.28 -    def write_term(self, term, offset):
    1.29 +    def _write_term(self, term, offset):
    1.30  
    1.31          """
    1.32          Write the given 'term' and its position file 'offset' to the term
    1.33 @@ -421,7 +424,7 @@
    1.34          """
    1.35  
    1.36          offset = self.position_writer.write_all_positions(doc_positions)
    1.37 -        self.write_term(term, offset)
    1.38 +        self._write_term(term, offset)
    1.39  
    1.40      def close(self):
    1.41          self.info_writer.close()
    1.42 @@ -449,7 +452,7 @@
    1.43          self.max_offset = self.terms[-1][1]
    1.44          self.max_info_offset = self.terms[-1][2]
    1.45  
    1.46 -    def find_term(self, term):
    1.47 +    def _find_term(self, term):
    1.48  
    1.49          "Find the position file offset of 'term' from the term dictionary."
    1.50  
    1.51 @@ -489,7 +492,7 @@
    1.52  
    1.53          "Return the documents and positions at which the given 'term' is found."
    1.54  
    1.55 -        offset = self.find_term(term)
    1.56 +        offset = self._find_term(term)
    1.57          if offset is None:
    1.58              return None
    1.59          else:
    1.60 @@ -500,19 +503,28 @@
    1.61          self.index_reader.close()
    1.62          self.position_reader.close()
    1.63  
    1.64 +# Specific classes for storing document information.
    1.65 +
    1.66  class FieldWriter(FileWriter):
    1.67  
    1.68      "Writing field data to files."
    1.69  
    1.70 -    def write_fields(self, fields):
    1.71 +    def reset(self):
    1.72 +        self.last_docnum = 0
    1.73 +
    1.74 +    def write_fields(self, docnum, fields):
    1.75  
    1.76          """
    1.77 -        Write the given list of 'fields' (strings representing field values).
    1.78 -        Return the offset at which the fields are stored.
    1.79 +        Write for the given 'docnum', a list of 'fields' (strings representing
    1.80 +        field values). Return the offset at which the fields are stored.
    1.81          """
    1.82  
    1.83          offset = self.f.tell()
    1.84  
    1.85 +        # Write the document number delta.
    1.86 +
    1.87 +        self.write_number(docnum - self.last_docnum)
    1.88 +
    1.89          # Write the number of fields.
    1.90  
    1.91          self.write_number(len(fields))
    1.92 @@ -522,15 +534,26 @@
    1.93          for field in fields:
    1.94              self.write_string(field, 0) # compress
    1.95  
    1.96 +        self.last_docnum = docnum
    1.97          return offset
    1.98  
    1.99  class FieldReader(FileReader):
   1.100  
   1.101      "Reading field data from files."
   1.102  
   1.103 +    def reset(self):
   1.104 +        self.last_docnum = 0
   1.105 +
   1.106      def read_fields(self):
   1.107  
   1.108 -        "Read fields from the file, returning the field values in a list."
   1.109 +        """
   1.110 +        Read fields from the file, returning a tuple containing the document
   1.111 +        number and a list of field values.
   1.112 +        """
   1.113 +
   1.114 +        # Read the document number.
   1.115 +
   1.116 +        self.last_docnum += self.read_number()
   1.117  
   1.118          # Read the number of fields.
   1.119  
   1.120 @@ -545,14 +568,139 @@
   1.121              fields.append(self.read_string(0)) # decompress
   1.122              i += 1
   1.123  
   1.124 -        return fields
   1.125 +        return self.last_docnum, fields
   1.126 +
   1.127 +    def read_document_fields(self, docnum, offset):
   1.128  
   1.129 -    def read_doc_fields(self, offset):
   1.130 -
   1.131 -        "Read all fields at the given 'offset."
   1.132 +        """
   1.133 +        Read fields for 'docnum' at the given 'offset'. This permits the
   1.134 +        retrieval of details for the specified document, as well as scanning for
   1.135 +        later documents.
   1.136 +        """
   1.137  
   1.138          self.f.seek(offset)
   1.139 -        return self.read_fields()
   1.140 +        bad_docnum, fields = self.read_fields()
   1.141 +        self.last_docnum = docnum
   1.142 +        return docnum, fields
   1.143 +        
   1.144 +class FieldIndexWriter(FileWriter):
   1.145 +
   1.146 +    "Writing field index details to files."
   1.147 +
   1.148 +    def reset(self):
   1.149 +        self.last_docnum = 0
   1.150 +
   1.151 +    def write_document(self, docnum, offset):
   1.152 +
   1.153 +        """
   1.154 +        Write for the given 'docnum', the 'offset' at which the fields for the
   1.155 +        document are stored in the fields file.
   1.156 +        """
   1.157 +
   1.158 +        # Write the document number delta and offset.
   1.159 +
   1.160 +        self.write_number(docnum - self.last_docnum)
   1.161 +        self.write_number(offset)
   1.162 +
   1.163 +        self.last_docnum = docnum
   1.164 +
   1.165 +class FieldIndexReader(FileReader):
   1.166 +
   1.167 +    "Reading field index details from files."
   1.168 +
   1.169 +    def reset(self):
   1.170 +        self.last_docnum = 0
   1.171 +
   1.172 +    def read_document(self):
   1.173 +
   1.174 +        "Read a document number and field file offset."
   1.175 +
   1.176 +        # Read the document number delta and offset.
   1.177 +
   1.178 +        self.last_docnum += self.read_number()
   1.179 +        offset = self.read_number()
   1.180 +
   1.181 +        return self.last_docnum, offset
   1.182 +
   1.183 +class FieldDictionaryWriter:
   1.184 +
   1.185 +    "Writing field dictionary details."
   1.186 +
   1.187 +    def __init__(self, field_writer, field_index_writer, interval):
   1.188 +        self.field_writer = field_writer
   1.189 +        self.field_index_writer = field_index_writer
   1.190 +        self.interval = interval
   1.191 +        self.entry = 0
   1.192 +
   1.193 +    def write_fields(self, docnum, fields):
   1.194 +
   1.195 +        "Write details of the document with the given 'docnum' and 'fields'."
   1.196 +
   1.197 +        offset = self.field_writer.write_fields(docnum, fields)
   1.198 +
   1.199 +        if self.entry % self.interval == 0:
   1.200 +            self.field_index_writer.write_document(docnum, offset)
   1.201 +
   1.202 +        self.entry += 1
   1.203 +
   1.204 +    def close(self):
   1.205 +        self.field_writer.close()
   1.206 +        self.field_index_writer.close()
   1.207 +
   1.208 +class FieldDictionaryReader:
   1.209 +
   1.210 +    "Reading field dictionary details."
   1.211 +
   1.212 +    def __init__(self, field_reader, field_index_reader):
   1.213 +        self.field_reader = field_reader
   1.214 +        self.field_index_reader = field_index_reader
   1.215 +
   1.216 +        self.docs = []
   1.217 +        try:
   1.218 +            while 1:
   1.219 +                self.docs.append(self.field_index_reader.read_document())
   1.220 +        except EOFError:
   1.221 +            pass
   1.222 +
   1.223 +        # Large numbers for ordering purposes.
   1.224 +
   1.225 +        self.max_offset = self.docs[-1][1]
   1.226 +
   1.227 +    def read_fields(self, docnum):
   1.228 +
   1.229 +        "Read the fields of the document with the given 'docnum'."
   1.230 +
   1.231 +        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1
   1.232 +
   1.233 +        # Get the entry position providing the document or one preceding it.
   1.234 +
   1.235 +        if i == -1:
   1.236 +            return None
   1.237 +
   1.238 +        found_docnum, offset = self.docs[i]
   1.239 +
   1.240 +        # Read from the fields file.
   1.241 +
   1.242 +        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)
   1.243 +
   1.244 +        # Scan for the document, if necessary.
   1.245 +
   1.246 +        try:
   1.247 +            while docnum > found_docnum:
   1.248 +                found_docnum, fields = self.field_reader.read_fields()
   1.249 +        except EOFError:
   1.250 +            pass
   1.251 +
   1.252 +        # If the document is found, return the fields.
   1.253 +
   1.254 +        if docnum == found_docnum:
   1.255 +            return fields
   1.256 +        else:
   1.257 +            return None
   1.258 +
   1.259 +    def close(self):
   1.260 +        self.field_reader.close()
   1.261 +        self.field_index_reader.close()
   1.262  
   1.263  # High-level classes.
   1.264  

     2.1 --- a/test.py	Fri Aug 28 01:15:17 2009 +0200
     2.2 +++ b/test.py	Sat Aug 29 02:15:29 2009 +0200
     2.3 @@ -2,6 +2,8 @@
     2.4  
     2.5  import iixr
     2.6  
     2.7 +# Test basic data types.
     2.8 +
     2.9  numbers = [12345678, 0, 1, 127, 128, 255, 256]
    2.10  
    2.11  f = open("test", "wb")
    2.12 @@ -17,6 +19,8 @@
    2.13      print number == n, number, n
    2.14  r.close()
    2.15  
    2.16 +# Test positions.
    2.17 +
    2.18  all_doc_positions = [
    2.19      [
    2.20          (123, [1, 3, 5, 15, 25]),
    2.21 @@ -64,30 +68,82 @@
    2.22      print doc_positions == dp, doc_positions, dp
    2.23  r.close()
    2.24  
    2.25 +# Test fields.
    2.26 +
    2.27  doc_fields = [
    2.28 -    ["testing", "fields", "stored", "compressed"],
    2.29 -    ["fields", "for a second", "document"]
    2.30 +    (123, ["testing", "fields", "stored", "compressed"]),
    2.31 +    (456, ["fields", "for a second", "document"]),
    2.32 +    (789, ["field value"]),
    2.33 +    (1234, []),
    2.34 +    (2345, ["abc", "def"]),
    2.35 +    (3456, ["apple", "banana", "cherry"]),
    2.36 +    (4567, ["drue", "eple"])
    2.37      ]
    2.38  
    2.39  f = open("testF", "wb")
    2.40  w = iixr.FieldWriter(f)
    2.41 -offsets = []
    2.42 -for fields in doc_fields:
    2.43 -    offsets.append(w.write_fields(fields))
    2.44 +for docnum, fields in doc_fields:
    2.45 +    w.write_fields(docnum, fields)
    2.46  w.close()
    2.47  
    2.48  f = open("testF", "rb")
    2.49  r = iixr.FieldReader(f)
    2.50 -for fields in doc_fields:
    2.51 -    df = r.read_fields()
    2.52 -    print fields == df, fields, df
    2.53 -offsets.reverse()
    2.54 -doc_fields.reverse()
    2.55 -for offset, fields in zip(offsets, doc_fields):
    2.56 -    df = r.read_doc_fields(offset)
    2.57 +for docnum, fields in doc_fields:
    2.58 +    dn, df = r.read_fields()
    2.59 +    print docnum == dn, docnum, dn
    2.60      print fields == df, fields, df
    2.61  r.close()
    2.62  
    2.63 +# Test field index files.
    2.64 +
    2.65 +indexed_docs = [
    2.66 +    (123, 100000987),
    2.67 +    (456, 100004321),
    2.68 +    (789, 100008765)
    2.69 +    ]
    2.70 +
    2.71 +f = open("testFI", "wb")
    2.72 +w = iixr.FieldIndexWriter(f)
    2.73 +for docnum, offset in indexed_docs:
    2.74 +    w.write_document(docnum, offset)
    2.75 +w.close()
    2.76 +
    2.77 +f = open("testFI", "rb")
    2.78 +r = iixr.FieldIndexReader(f)
    2.79 +for docnum, offset in indexed_docs:
    2.80 +    dn, o = r.read_document()
    2.81 +    print docnum == dn, docnum, dn
    2.82 +    print offset == o, offset, o
    2.83 +r.close()
    2.84 +
    2.85 +# Test field dictionaries.
    2.86 +
    2.87 +f = open("testF", "wb")
    2.88 +w = iixr.FieldWriter(f)
    2.89 +f2 = open("testFI", "wb")
    2.90 +w2 = iixr.FieldIndexWriter(f2)
    2.91 +wd = iixr.FieldDictionaryWriter(w, w2, 3)
    2.92 +for docnum, fields in doc_fields:
    2.93 +    wd.write_fields(docnum, fields)
    2.94 +wd.close()
    2.95 +
    2.96 +f = open("testF", "rb")
    2.97 +r = iixr.FieldReader(f)
    2.98 +f2 = open("testFI", "rb")
    2.99 +r2 = iixr.FieldIndexReader(f2)
   2.100 +rd = iixr.FieldDictionaryReader(r, r2)
   2.101 +doc_fields_reversed = doc_fields[:]
   2.102 +doc_fields_reversed.reverse()
   2.103 +for docnum, fields in doc_fields_reversed:
   2.104 +    df = rd.read_fields(docnum)
   2.105 +    print fields == df, fields, df
   2.106 +for docnum in (13579, 246810):
   2.107 +    df = rd.read_fields(docnum)
   2.108 +    print df is None, df
   2.109 +rd.close()
   2.110 +
   2.111 +# Test terms.
   2.112 +
   2.113  terms = [
   2.114      ("aardvark",  100000123),
   2.115      ("anteater",  100000456),
   2.116 @@ -111,6 +167,8 @@
   2.117      print offset == o, offset, o
   2.118  r.close()
   2.119  
   2.120 +# Test terms in index files.
   2.121 +
   2.122  indexed_terms = [
   2.123      ("aardvark",  100000123, 200000321),
   2.124      ("anteater",  100000456, 200000654),
   2.125 @@ -135,6 +193,8 @@
   2.126      print info_offset == i, info_offset, i
   2.127  r.close()
   2.128  
   2.129 +# Test dictionaries with only term data.
   2.130 +
   2.131  f = open("test", "wb")
   2.132  w = iixr.TermWriter(f)
   2.133  f2 = open("testI", "wb")
   2.134 @@ -143,7 +203,7 @@
   2.135  w3 = iixr.PositionWriter(f3)
   2.136  wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
   2.137  for term, offset in terms:
   2.138 -    wd.write_term(term, offset)
   2.139 +    wd._write_term(term, offset)
   2.140  wd.close()
   2.141  
   2.142  f = open("test", "rb")
   2.143 @@ -156,13 +216,15 @@
   2.144  terms_reversed = terms[:]
   2.145  terms_reversed.reverse()
   2.146  for term, offset in terms_reversed:
   2.147 -    o = rd.find_term(term)
   2.148 +    o = rd._find_term(term)
   2.149      print offset == o, offset, o
   2.150  for term in ("dog", "dingo"):
   2.151 -    o = rd.find_term(term)
   2.152 +    o = rd._find_term(term)
   2.153      print o is None, o
   2.154  rd.close()
   2.155  
   2.156 +# Test dictionaries with term and position data.
   2.157 +
   2.158  terms_with_positions = [
   2.159      ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),
   2.160      ("anteater",  [(1, [43, 44])]),
   2.161 @@ -200,6 +262,8 @@
   2.162      print dp is None, dp
   2.163  rd.close()
   2.164  
   2.165 +# Test high-level index operations.
   2.166 +
   2.167  docs = [
   2.168      (1, "The cat sat on the mat"),
   2.169      (2, "Every good boy deserves football"),
2009-08-29	Paul Boddie	raw files shortlog changelog graph	Added field dictionary and field index readers and writers. Renamed various internal methods. Added document number deltas to field collections in order to support scanning for documents.
			iixr.py (file) test.py (file)