# HG changeset patch
# User Paul Boddie <paul@boddie.org.uk>
# Date 1295912191 -3600
# Node ID c4da9505f73e98eb5cf54d1a23433a395f95aa2d
# Parent  80df3e7605a4b1022fe730341800ee9f2536504f
Added a threshold or interval which causes the term and field dictionaries to be
flushed when a certain number of document positions have been recorded.
Updated the copyright information.

diff -r 80df3e7605a4 -r c4da9505f73e docs/COPYING.txt
--- a/docs/COPYING.txt	Fri Jan 21 00:22:03 2011 +0100
+++ b/docs/COPYING.txt	Tue Jan 25 00:36:31 2011 +0100
@@ -1,7 +1,7 @@
 Licence Agreement for iixr
 --------------------------
 
-Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
+Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
diff -r 80df3e7605a4 -r c4da9505f73e iixr/index.py
--- a/iixr/index.py	Fri Jan 21 00:22:03 2011 +0100
+++ b/iixr/index.py	Tue Jan 25 00:36:31 2011 +0100
@@ -3,7 +3,7 @@
 """
 High-level classes.
 
-Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
+Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
 
 This program is free software; you can redistribute it and/or modify it under
 the terms of the GNU General Public License as published by the Free Software
@@ -30,6 +30,7 @@
 DOCUMENT_INTERVAL = 100
 FIELD_INTERVAL    = 100
 FLUSH_INTERVAL    = 10000
+POSITIONS_FLUSH_INTERVAL = 1000000
 OPEN_PARTITIONS   = 20
 
 # High-level classes.
@@ -86,12 +87,13 @@
     Building term information and writing it to the term and field dictionaries.
     """
 
-    def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval):
+    def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval):
         self.pathname = pathname
         self.interval = interval
         self.doc_interval = doc_interval
         self.field_interval = field_interval
         self.flush_interval = flush_interval
+        self.positions_flush_interval = positions_flush_interval
 
         self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
         self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
@@ -100,6 +102,7 @@
         self.docs = []
 
         self.doc_counter = 0
+        self.position_counter = 0
 
     def add_document(self, doc):
 
@@ -108,16 +111,23 @@
         terms and fields if appropriate.
         """
 
+        docnum = doc.docnum
+
         for term, positions in doc.terms.items():
-            self.terms.setdefault(term, {})[doc.docnum] = positions
+            self.terms.setdefault(term, {})[docnum] = positions
+            self.position_counter += len(positions)
 
-        self.docs.append((doc.docnum, doc.fields))
+        self.docs.append((docnum, doc.fields))
 
         self.doc_counter += 1
-        if self.flush_interval and self.doc_counter >= self.flush_interval:
+
+        if self.flush_interval and self.doc_counter >= self.flush_interval or \
+            self.positions_flush_interval and self.position_counter >= self.positions_flush_interval:
+
             self.flush_terms()
             self.flush_fields()
             self.doc_counter = 0
+            self.position_counter = 0
 
     def get_term_writer(self):
 
@@ -227,13 +237,14 @@
     "An inverted index solution encapsulating the various components."
 
     def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL,
-        flush_interval=FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
+        flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
 
         self.pathname = pathname
         self.interval = interval
         self.doc_interval = doc_interval
         self.field_interval = field_interval
         self.flush_interval = flush_interval
+        self.positions_flush_interval = positions_flush_interval
         self.open_partitions = open_partitions
         self.reader = None
         self.writer = None
@@ -244,7 +255,7 @@
 
         self._ensure_directory()
         self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval,
-            self.field_interval, self.flush_interval)
+            self.field_interval, self.flush_interval, self.positions_flush_interval)
         return self.writer
 
     def _ensure_directory(self):