#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.files import *
from bisect import bisect_right # to find documents in the field index

# The maximum number of documents whose fields are held in the cache of a
# FieldDictionaryReader.

DOCUMENT_CACHE_LIMIT = 10000

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        """
        Forget the last document number and the document number size, so that
        the next call to 'write_fields' establishes them again.
        """

        self.last_docnum = None
        self.docnum_size = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        """

        # Find the size of document number values. This is only done for the
        # first document written after a reset.

        if self.docnum_size is None:
            self.docnum_size = self.get_value_size(docnum)
            self.last_docnum = self.get_initial_value(self.docnum_size)

        # Write the number of values per document number.
        # Write the document number delta.

        output = array('B')
        vint_to_array(self.docnum_size, output)
        self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)

        # Write the number of fields.

        vint_to_array(len(fields), output)
        output.tofile(self.f)

        # Write the fields themselves.

        for i, field in fields:
            self.write_number(i)
            self.write_string(field, 1) # compress

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        """
        Forget the last document number, so that the next call to 'read_fields'
        establishes an initial value again.
        """

        self.last_docnum = None

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Read the number of values per document number.

        docnum_size = self.read_number()

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(docnum_size)

        # Read the document number delta and add it to the last number.

        self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields: each is an identifier followed by a value.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.

        Return a tuple containing 'docnum' and the list of field (identifier,
        value) pairs found at 'offset'.
        """

        self.seek(offset)

        # The document number produced here is derived from whatever document
        # number was last read before seeking, so it is discarded in favour of
        # the number supplied by the caller. Recording that number lets later
        # 'read_fields' calls continue from this document.

        _stale_docnum, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        """
        Forget the last document number and the document number size, and
        restart offset deltas from zero.
        """

        self.last_docnum = None
        self.docnum_size = None
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Find the size of document number values. This is only done for the
        # first document written after a reset.

        if self.docnum_size is None:
            self.docnum_size = self.get_value_size(docnum)
            self.last_docnum = self.get_initial_value(self.docnum_size)

        # Write the number of values per document number.
        # Write the document number delta.

        output = array('B')
        vint_to_array(self.docnum_size, output)
        self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)
        output.tofile(self.f)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        """
        Forget the last document number and restart the accumulated offset from
        zero.
        """

        self.last_docnum = None
        self.last_offset = 0

    def read_document(self):

        """
        Read a document number and field file offset, returning them as a
        tuple.
        """

        # Read the number of values per document number.

        docnum_size = self.read_number()

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(docnum_size)

        # Read the document number delta and add it to the last number.

        self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)

        # Read the offset delta and add it to the last offset.

        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer', adding an index entry for every document whose
        position is a multiple of 'interval' (starting with the first).
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        # Only documents at interval boundaries are given an index entry.

        indexed = self.entry % self.interval == 0

        # The offset must be obtained before the fields are written so that it
        # refers to the start of the document's record.

        if indexed:
            offset = self.field_writer.f.tell()

        self.field_writer.write_fields(docnum, fields)

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close both the fields file and the field index file."

        # Make sure the index writer is closed even if closing the field writer
        # fails.

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', reading all entries from the index.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # A mapping from document numbers to previously retrieved fields.

        self.cache = {}

        # The index entries as (document number, offset) tuples, read until the
        # index reader signals the end of the file.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        "Return the next document number and fields, ending at the end of file."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol under its Python 3 name as well.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning a
        list of field (identifier, value) pairs, or None if no such document
        was found.

        NOTE: This seeks within the field reader also used by 'read_fields', so
        it changes the position from which sequential reading continues.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Pairing the document number with the largest offset makes the probe
        # sort no earlier than any index entry for the same document, so that
        # such an entry is found immediately before the insertion point.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary. Reaching the end of the file
        # leaves an earlier document number in place, which is then treated as
        # the document not having been found.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached. The size is tested directly and a single key is taken
        # from the dictionary, avoiding the construction of a list of all keys
        # for each document retrieved.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close both the fields file and the field index file."

        # Make sure the index reader is closed even if closing the field reader
        # fails.

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4