# HG changeset patch # User Paul Boddie # Date 1254340971 -7200 # Node ID ff3800a700d52bde0b8e3f07d15b4521f2f352d9 # Parent de111fdce60f8bfba452256fc4e10493c6493c16 Simplified the IndexWriter document cache, adopting a list of items instead of a dictionary. Added a get_document method to the IndexReader class in order to support more convenient access to indexed documents, also adding dictionary-style methods to the Document class for field access. Removed inaccurate comments. diff -r de111fdce60f -r ff3800a700d5 iixr/index.py --- a/iixr/index.py Sun Sep 27 23:03:19 2009 +0200 +++ b/iixr/index.py Wed Sep 30 22:02:51 2009 +0200 @@ -41,10 +41,11 @@ "A container of document information." - def __init__(self, docnum): + def __init__(self, docnum, fields=None): self.docnum = docnum - self.fields = [] + self.fields = fields or [] self.terms = {} + self.field_dict = None def add_position(self, term, position): @@ -70,6 +71,18 @@ self.fields = fields + def _ensure_dict(self): + if self.field_dict is None: + self.field_dict = dict(self.fields) + + def keys(self): + self._ensure_dict() + return self.field_dict.keys() + + def __getitem__(self, key): + self._ensure_dict() + return self.field_dict[key] + class IndexWriter: """ @@ -86,7 +99,7 @@ self.field_dict_partition = 0 self.terms = {} - self.docs = {} + self.docs = [] self.doc_counter = 0 @@ -100,7 +113,7 @@ for term, positions in doc.terms.items(): self.terms.setdefault(term, {})[doc.docnum] = positions - self.docs[doc.docnum] = doc.fields + self.docs.append((doc.docnum, doc.fields)) self.doc_counter += 1 if self.flush_interval and self.doc_counter >= self.flush_interval: @@ -147,17 +160,16 @@ # Get the documents in order. - docs = self.docs.items() - docs.sort() + self.docs.sort() field_dict_writer = self.get_field_writer() - for docnum, fields in docs: + for docnum, fields in self.docs: field_dict_writer.write_fields(docnum, fields) field_dict_writer.close() - self.docs = {} + self.docs = [] self.field_dict_partition += 1 def close(self): @@ -192,6 +204,9 @@ def get_fields(self, docnum): return self.field_dict_reader.get_fields(docnum) + def get_document(self, docnum): + return Document(docnum, self.get_fields(docnum)) + def close(self): self.dict_reader.close() self.field_dict_reader.close() diff -r de111fdce60f -r ff3800a700d5 iixr/positions.py --- a/iixr/positions.py Sun Sep 27 23:03:19 2009 +0200 +++ b/iixr/positions.py Wed Sep 30 22:02:51 2009 +0200 @@ -73,8 +73,6 @@ to 'count'. """ - # Duplicate the file handle. - f = self.open("rb") return PositionIterator(f, offset, count) @@ -121,8 +119,6 @@ to 'doc_frequency'. """ - # Duplicate the file handle. - f = self.open("rb") return PositionIndexIterator(f, offset, doc_frequency)