#!/usr/bin/env python

"""
Specific classes for storing document information.

Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE. See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program. If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.files import *
from bisect import bisect_right # to find documents in the field index

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        "Reset the document number against which deltas are computed."

        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        Return the offset at which the fields are stored.
        """

        offset = self.f.tell()

        # Write the document number delta.

        self.write_number(docnum - self.last_docnum)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for i, field in fields:
            self.write_number(i)
            self.write_string(field, 1) # compress

        self.last_docnum = docnum
        return offset

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        "Reset the document number to which deltas are added."

        self.last_docnum = 0

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Read the document number, which is stored as a delta from the
        # previously read document number.

        self.last_docnum += self.read_number()

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields. The identifier is always read before the value,
        # matching the order in which they are written.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.
        Return a tuple containing 'docnum' and the list of fields.
        """

        self.f.seek(offset)

        # The document number produced here is a stored delta added to whatever
        # number was current before seeking, so it is discarded in favour of
        # the number supplied by the caller.

        _, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        "Reset the document number and offset against which deltas are computed."

        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # Write the document number and offset deltas.

        self.write_number(docnum - self.last_docnum)
        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        "Reset the document number and offset to which deltas are added."

        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        "Read a document number and field file offset."

        # Read the document number and offset deltas, accumulating each of them
        # to recover the absolute values.

        self.last_docnum += self.read_number()
        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' (receiving every
        document), 'field_index_writer' (receiving an entry for selected
        documents) and 'interval' (the number of documents between index
        entries). The 'interval' is used as a modulus and must not be zero.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Only the first document and every subsequent 'interval'th document
        # are recorded in the index.

        if self.entry % self.interval == 0:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close both underlying writers."

        # Make sure that the index writer is closed even if closing the field
        # writer fails.

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader', reading all index entries into memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # Read (document number, offset) entries until the index is exhausted.

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Rewind the field reader and return this object as an iterator."

        self.rewind()
        return self

    def next(self):

        """
        Return the next document number and fields, raising StopIteration when
        no more documents can be read.
        """

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol where '__next__' is used instead of 'next'.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Rewind the field reader."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum'.
        Return the list of fields, or None if the document is not found.
        """

        # Pair the document number with the largest indexed offset so that an
        # index entry for exactly this document sorts before the search key.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # Get the entry position providing the document or one preceding it.

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is found, return the fields.

        if docnum == found_docnum:
            return fields
        else:
            return None

    def close(self):

        "Close both underlying readers."

        # Make sure that the index reader is closed even if closing the field
        # reader fails.

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4