#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.files import *
from bisect import bisect_right # to find documents in the field index

# The maximum number of documents whose fields are retained by a
# FieldDictionaryReader before an existing cache entry is discarded.

DOCUMENT_CACHE_LIMIT = 10000

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Reset the document number against which the next delta is computed."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).

        The record consists of the document number expressed as a delta from
        the previously written document number, the number of fields, and then
        each field identifier followed by its value.
        """

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Reset the document number to which the next delta is added."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.

        The document number is obtained by adding the stored delta to the
        previously read document number.
        """

        # Read the document number.

        self.last_docnum += self.read_number()

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.

        Return a tuple containing 'docnum' and the list of field (identifier,
        value) pairs found at 'offset'.
        """

        self.seek(offset)

        # The number decoded by read_fields is built from a delta added to
        # whatever document number was read last, which is meaningless after
        # seeking. It is discarded and the supplied 'docnum' is recorded instead
        # so that subsequent sequential reads produce correct numbers.

        _, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Reset the document number and offset used to compute deltas."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.

        Both values are stored as deltas from the previously written entry.
        """

        # Write the document number and offset deltas.

        self.write_number(docnum - self.last_docnum)
        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Reset the document number and offset to which deltas are added."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        """
        Read a document number and field file offset, returning them as a
        tuple. Both are reconstructed by adding the stored deltas to the
        previously read values.
        """

        # Read the document number delta and offset.

        self.last_docnum += self.read_number()
        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' receiving every
        document, the 'field_index_writer' receiving an index entry for every
        document whose position is a multiple of 'interval', counting from zero.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        indexed = self.entry % self.interval == 0

        # The offset must be noted before the fields are written since it
        # identifies where the record for this document starts.

        if indexed:
            offset = self.field_writer.f.tell()

        self.field_writer.write_fields(docnum, fields)

        if indexed:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close both underlying writers."

        # Make sure that the index writer is closed even if closing the field
        # writer fails.

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader'. All index entries are read immediately from the
        'field_index_reader' until it raises EOFError.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Cached fields for documents retrieved using get_fields.

        self.cache = {}

        # The (document number, offset) index entries in the order read.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # The offset of the last index entry, used as the second element of
        # the search key in get_fields so that the key is ordered after any
        # entry having the same document number.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the field reader and return this object as the iterator."

        self.rewind()
        return self

    def next(self):

        """
        Return the next document number and fields, raising StopIteration when
        the field reader signals the end of the data with EOFError.
        """

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol of newer Python versions as well.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum'.

        Return the list of field (identifier, value) pairs, or None if no such
        document can be found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document was not found, there is nothing to return.

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an entry (whichever the
        # dictionary yields first) if the limit has been reached. The size is
        # tested directly to avoid building a list of all keys on every miss.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close both underlying readers."

        # Make sure that the index reader is closed even if closing the field
        # reader fails.

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4