#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.files import *
from bisect import bisect_right # to find documents in the field index

# Maximum number of documents whose fields are retained by a
# FieldDictionaryReader before an existing cache entry is discarded.

DOCUMENT_CACHE_LIMIT = 10000

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Forget the previously written document number and its value size."

        self.last_docnum = None
        self.docnum_size = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        """

        # Find the size of document number values. This is only done for the
        # first document written since the last reset.

        if self.docnum_size is None:
            self.docnum_size = self.get_value_size(docnum)
            self.last_docnum = self.get_initial_value(self.docnum_size)

        # Write the number of values per document number.
        # Write the document number delta.

        self.write_number(self.docnum_size)
        self.last_docnum = self.write_sequence(
            docnum, self.last_docnum, self.docnum_size, monotonic=0)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Forget the previously read document number."

        self.last_docnum = None

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Read the number of values per document number.

        docnum_size = self.read_number()

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(docnum_size)

        # Read the document number delta and add it to the last number.

        self.last_docnum = self.read_sequence(
            self.last_docnum, docnum_size, monotonic=0)

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields, reading the identifier before the value for each
        # of them.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.

        Return a tuple containing 'docnum' and the list of fields read.
        """

        self.seek(offset)

        # The document number produced by the read is discarded in favour of the
        # supplied 'docnum'.
        # NOTE(review): presumably the stored value is a delta relative to the
        # preceding record, which has not been read when seeking directly to
        # 'offset' - confirm against write_sequence/read_sequence.

        _stored_docnum, fields = self.read_fields()

        # Record the supplied number so that subsequent sequential reads are
        # decoded relative to this document.

        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Forget the previously written document number, value size and offset."

        self.last_docnum = None
        self.docnum_size = None
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Find the size of document number values. This is only done for the
        # first document written since the last reset.

        if self.docnum_size is None:
            self.docnum_size = self.get_value_size(docnum)
            self.last_docnum = self.get_initial_value(self.docnum_size)

        # Write the number of values per document number.
        # Write the document number delta.

        self.write_number(self.docnum_size)
        self.last_docnum = self.write_sequence(
            docnum, self.last_docnum, self.docnum_size, monotonic=0)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Forget the previously read document number and offset."

        self.last_docnum = None
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset."

        # Read the number of values per document number.

        docnum_size = self.read_number()

        if self.last_docnum is None:
            self.last_docnum = self.get_initial_value(docnum_size)

        # Read the document number delta and add it to the last number.

        self.last_docnum = self.read_sequence(
            self.last_docnum, docnum_size, monotonic=0)

        # Read the offset delta and add it to the last offset.

        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer', recording an index entry for every document whose
        position in the sequence of written documents is a multiple of
        'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # The number of documents written so far.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        if self.entry % self.interval == 0:

            # The offset must be obtained before the fields are written in order
            # to refer to the start of the document's record.

            offset = self.field_writer.tell()
            self.field_writer.write_fields(docnum, fields)
            self.field_index_writer.write_document(docnum, offset)
        else:
            self.field_writer.write_fields(docnum, fields)

        self.entry += 1

    def close(self):

        "Close the fields file writer and the field index writer."

        self.field_writer.close()
        self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', reading all (document number, offset) entries from
        the index immediately.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Fields of recently retrieved documents, keyed by document number.

        self.cache = {}

        # Index entries in the order in which they appear in the index file.

        self.docs = []

        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        "Return the next document number and fields, ending iteration at EOF."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol under its Python 3 name as well.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the fields file reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning the
        list of field (identifier, value) pairs, or None if no such document is
        found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Get the entry position providing the document or one preceding it.
        # Pairing the number with the largest known offset places the search
        # key after any index entry having the same document number.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # No index entry is at or before the document.

        if i < 0:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(
            found_docnum, offset)

        # Scan for the document, if necessary. Reaching the end of the file
        # leaves the last document read as the candidate.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # The scan stopped at a different document: the requested one is absent.

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry if the limit has
        # been reached. The entry to remove is found without copying all of the
        # keys, and in a way that does not depend on keys() returning a list.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close the fields file reader and the field index reader."

        self.field_reader.close()
        self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4