#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.files import *
from bisect import bisect_right # to find documents in the field index

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        # NOTE(review): presumably invoked by the FileWriter base class when
        # the writer is initialised - confirm in iixr.files.

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).

        The document number is stored as a delta relative to the previously
        written document number.
        """

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for i, field in fields:
            self.write_number(i)
            self.write_string(field, 1) # compress

        self.last_docnum = docnum

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        # NOTE(review): presumably invoked by the FileReader base class when
        # the reader is initialised or rewound - confirm in iixr.files.

        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.

        The stored document number delta is added to the last document number
        seen by this reader.
        """

        # Read the document number delta.

        self.last_docnum += self.read_number()

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields, reading each identifier before its value.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.

        Return a tuple containing 'docnum' and the list of fields read.
        """

        self.f.seek(offset)

        # The document number computed by read_fields is discarded: the stored
        # delta is relative to the preceding record, not to whatever this
        # reader last read, so the caller-supplied 'docnum' is used instead.

        _, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):
        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Write the document number and offset deltas.

        self.write_number(docnum - self.last_docnum)
        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):
        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset."

        # Read the document number and offset deltas, accumulating them to
        # obtain absolute values.

        self.last_docnum += self.read_number()
        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer'. Every 'interval' documents, an entry is added to
        the field index; 'interval' must be non-zero since it is used as a
        modulus.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        indexed = self.entry % self.interval == 0

        # The offset must be obtained before the fields are written so that it
        # refers to the start of the document's record.

        if indexed:
            offset = self.field_writer.f.tell()

        self.field_writer.write_fields(docnum, fields)

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close both underlying writers."

        # Ensure that the index writer is closed even if closing the field
        # writer fails.

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', loading all (document number, offset) entries
        from the field index into memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Read index entries until the index reader signals the end of file.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):
        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the Python 3 iterator protocol as well as the Python 2 one.

    __next__ = next

    # Sequential access methods.

    def rewind(self):
        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning
        None if no such document is found.
        """

        # Get the position of the index entry providing the document or one
        # preceding it. Pairing 'docnum' with the maximum offset places the
        # search key after any entry having the same document number.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # No index entry at or before the document.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Reaching the end of the file
        # leaves 'found_docnum' below 'docnum', yielding None below.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        "Close both underlying readers."

        # Ensure that the index reader is closed even if closing the field
        # reader fails.

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4