#!/usr/bin/env python

"""
High-level classes.

Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.filesystem import *
from os import listdir, mkdir # index and partition discovery
from os.path import exists

try:
    set
except NameError:
    from sets import Set as set

# Constants.
# These are the default values used by Index.get_writer, Index.merge_terms and
# Index.merge_fields.

TERM_INTERVAL = 100
DOCUMENT_INTERVAL = 100
FIELD_INTERVAL = 100
FLUSH_INTERVAL = 10000

# High-level classes.

class Document:

    "A container of document information."

    def __init__(self, docnum):

        "Initialise an empty document having the given 'docnum'."

        self.docnum = docnum
        self.fields = []    # list of (identifier, value) tuples
        self.terms = {}     # mapping from term to a list of positions

    def add_position(self, term, position):

        """
        Add a position entry for the given 'term', indicating the given
        'position'.
        """

        self.terms.setdefault(term, []).append(position)

    def add_field(self, identifier, value):

        "Add a field having the given 'identifier' and 'value'."

        self.fields.append((identifier, unicode(value))) # convert to string

    def set_fields(self, fields):

        """
        Set the document's 'fields': a list of tuples each containing an integer
        identifier and a string value.
        """

        self.fields = fields

class IndexWriter:

    """
    Building term information and writing it to the term and field dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise a writer for the index directory 'pathname'.

        The 'interval' is passed to both the term and field dictionary writers,
        and the 'doc_interval' is passed to the term dictionary writer. The
        'flush_interval' is the number of added documents after which the
        accumulated terms and fields are flushed to a new partition; a false
        value (such as zero or None) disables automatic flushing.
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # Numbers of the next partitions to be written.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # Pending information: term -> {docnum: positions}, docnum -> fields.

        self.terms = {}
        self.docs = {}

        # Documents added since the last automatic flush.

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and flushing
        terms and fields if appropriate.

        Positions and fields recorded for a document number replace any pending
        information held for the same document number.
        """

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[doc.docnum] = positions

        self.docs[doc.docnum] = doc.fields

        self.doc_counter += 1
        if self.flush_interval and self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.flush_fields()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        """
        Flush terms into the current term dictionary partition, discarding the
        pending term information and moving on to the next partition number.
        """

        # Get the terms in order.

        all_terms = self.terms
        terms = all_terms.keys()
        terms.sort()

        dict_writer = self.get_term_writer()

        for term in terms:
            doc_positions = all_terms[term].items()
            dict_writer.write_term_positions(term, doc_positions)

        dict_writer.close()

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        """
        Flush fields into the current field dictionary partition, discarding
        the pending field information and moving on to the next partition
        number.
        """

        # Get the documents in order.

        docs = self.docs.items()
        docs.sort()

        field_dict_writer = self.get_field_writer()

        for docnum, fields in docs:
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any pending terms and fields."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()

class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        """
        Open the "merged" term and field dictionaries found in the index
        directory 'pathname'.
        """

        self.dict_reader = get_term_reader(pathname, "merged")
        self.field_dict_reader = get_field_reader(pathname, "merged")

    # The following methods delegate directly to the term dictionary reader.

    def find_terms(self, term):
        return self.dict_reader.find_terms(term)

    def find_positions(self, term):
        return self.dict_reader.find_positions(term)

    def get_frequency(self, term):
        return self.dict_reader.get_frequency(term)

    def get_document_frequency(self, term):
        return self.dict_reader.get_document_frequency(term)

    # This method delegates directly to the field dictionary reader.

    def get_fields(self, docnum):
        return self.field_dict_reader.get_fields(docnum)

    def close(self):

        """
        Close both dictionary readers. The field dictionary reader is closed
        even if closing the term dictionary reader raises an exception.
        """

        try:
            self.dict_reader.close()
        finally:
            self.field_dict_reader.close()

class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index residing in the directory 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'.

        The index directory is created if it does not already exist.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        """
        Return a reader for the index, merging the index partitions first.

        The 'partition' argument is accepted for compatibility but is not used
        to select what is read: the reader always opens the merged partition.
        """

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        """
        Return a reader for the index without merging partitions beforehand.

        An OSError is raised if the index directory does not exist. The
        'partition' argument is not used.
        """

        if not exists(self.pathname):
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        self._merge_partitions(
            "terms-", get_term_reader, get_term_writer, (interval, doc_interval),
            TermDictionaryMerger, rename_term_files, remove_term_files
            )

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        self._merge_partitions(
            "fields-", get_field_reader, get_field_writer, (interval,),
            FieldDictionaryMerger, rename_field_files, remove_field_files
            )

    def _merge_partitions(self, prefix, open_reader, open_writer, writer_args,
        merger_class, rename_files, remove_files):

        """
        Combine the partitions of one kind of dictionary into a single
        partition named "merged".

        Partitions are discovered from the filenames in the index directory
        starting with 'prefix'; the remainder of each such filename is the
        partition name. The 'open_reader' callable is invoked with the index
        pathname and a partition name; 'open_writer' is invoked with the index
        pathname, "merged" and the 'writer_args'; 'merger_class' is invoked with
        the writer and the list of readers. The 'rename_files' and
        'remove_files' callables rename and remove the files of a partition.

        Readers are only opened when a merge actually takes place, so that
        nothing is left open when a lone partition is merely renamed.
        """

        # Discover the partitions, keeping the directory listing order.

        partitions = []

        for filename in listdir(self.pathname):
            if filename.startswith(prefix):
                partitions.append(filename[len(prefix):])

        if len(partitions) > 1:

            # Move any existing merged partition aside so that its contents can
            # be read while a new merged partition is written.

            if "merged" in partitions:
                rename_files(self.pathname, "merged", "old-merged")

                # Any leftover "old-merged" entry now names the same files as
                # the renamed partition, so it must only be listed once.

                partitions = [p for p in partitions if p != "old-merged"]
                partitions[partitions.index("merged")] = "old-merged"

            readers = [open_reader(self.pathname, p) for p in partitions]

            # Write directly to a dictionary.

            writer = open_writer(self.pathname, "merged", *writer_args)
            merger = merger_class(writer, readers)

            # Close the merger even if merging fails.

            try:
                merger.merge()
            finally:
                merger.close()

            # Remove old files (only reached if the merge succeeded).

            for partition in partitions:
                remove_files(self.pathname, partition)

        elif len(partitions) == 1:

            # A lone partition only needs to acquire the merged name.

            partition = partitions[0]
            if partition != "merged":
                rename_files(self.pathname, partition, "merged")

    def close(self):

        """
        Close any reader and writer obtained from this index. The writer is
        closed even if closing the reader raises an exception.
        """

        try:
            if self.reader is not None:
                self.reader.close()
                self.reader = None
        finally:
            if self.writer is not None:
                self.writer.close()
                self.writer = None

# vim: tabstop=4 expandtab shiftwidth=4