# HG changeset patch # User Paul Boddie # Date 1295912191 -3600 # Node ID c4da9505f73e98eb5cf54d1a23433a395f95aa2d # Parent 80df3e7605a4b1022fe730341800ee9f2536504f Added a threshold or interval which causes the term dictionary to be flushed when a certain number of document positions have been recorded. Updated the copyright information. diff -r 80df3e7605a4 -r c4da9505f73e docs/COPYING.txt --- a/docs/COPYING.txt Fri Jan 21 00:22:03 2011 +0100 +++ b/docs/COPYING.txt Tue Jan 25 00:36:31 2011 +0100 @@ -1,7 +1,7 @@ Licence Agreement for iixr -------------------------- -Copyright (C) 2009, 2010 Paul Boddie +Copyright (C) 2009, 2010, 2011 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software diff -r 80df3e7605a4 -r c4da9505f73e iixr/index.py --- a/iixr/index.py Fri Jan 21 00:22:03 2011 +0100 +++ b/iixr/index.py Tue Jan 25 00:36:31 2011 +0100 @@ -3,7 +3,7 @@ """ High-level classes. -Copyright (C) 2009, 2010 Paul Boddie +Copyright (C) 2009, 2010, 2011 Paul Boddie This program is free software; you can redistribute it and/or modify it under the terms of the GNU General Public License as published by the Free Software @@ -30,6 +30,7 @@ DOCUMENT_INTERVAL = 100 FIELD_INTERVAL = 100 FLUSH_INTERVAL = 10000 +POSITIONS_FLUSH_INTERVAL = 1000000 OPEN_PARTITIONS = 20 # High-level classes. @@ -86,12 +87,13 @@ Building term information and writing it to the term and field dictionaries. """ - def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval): + def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval): self.pathname = pathname self.interval = interval self.doc_interval = doc_interval self.field_interval = field_interval self.flush_interval = flush_interval + self.positions_flush_interval = positions_flush_interval self.dict_partition = get_next_partition(get_term_partitions(self.pathname)) self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname)) @@ -100,6 +102,7 @@ self.docs = [] self.doc_counter = 0 + self.position_counter = 0 def add_document(self, doc): @@ -108,16 +111,23 @@ terms and fields if appropriate. """ + docnum = doc.docnum + for term, positions in doc.terms.items(): - self.terms.setdefault(term, {})[doc.docnum] = positions + self.terms.setdefault(term, {})[docnum] = positions + self.position_counter += len(positions) - self.docs.append((doc.docnum, doc.fields)) + self.docs.append((docnum, doc.fields)) self.doc_counter += 1 - if self.flush_interval and self.doc_counter >= self.flush_interval: + + if self.flush_interval and self.doc_counter >= self.flush_interval or \ + self.positions_flush_interval and self.position_counter >= self.positions_flush_interval: + self.flush_terms() self.flush_fields() self.doc_counter = 0 + self.position_counter = 0 def get_term_writer(self): @@ -227,13 +237,14 @@ "An inverted index solution encapsulating the various components." def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL, - flush_interval=FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS): + flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS): self.pathname = pathname self.interval = interval self.doc_interval = doc_interval self.field_interval = field_interval self.flush_interval = flush_interval + self.positions_flush_interval = positions_flush_interval self.open_partitions = open_partitions self.reader = None self.writer = None @@ -244,7 +255,7 @@ self._ensure_directory() self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval, - self.field_interval, self.flush_interval) + self.field_interval, self.flush_interval, self.positions_flush_interval) return self.writer def _ensure_directory(self):