1.1 --- a/docs/COPYING.txt Fri Jan 21 00:22:03 2011 +0100
1.2 +++ b/docs/COPYING.txt Tue Jan 25 00:36:31 2011 +0100
1.3 @@ -1,7 +1,7 @@
1.4 Licence Agreement for iixr
1.5 --------------------------
1.6
1.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
1.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
1.9
1.10 This program is free software; you can redistribute it and/or modify it under
1.11 the terms of the GNU General Public License as published by the Free Software
2.1 --- a/iixr/index.py Fri Jan 21 00:22:03 2011 +0100
2.2 +++ b/iixr/index.py Tue Jan 25 00:36:31 2011 +0100
2.3 @@ -3,7 +3,7 @@
2.4 """
2.5 High-level classes.
2.6
2.7 -Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
2.8 +Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
2.9
2.10 This program is free software; you can redistribute it and/or modify it under
2.11 the terms of the GNU General Public License as published by the Free Software
2.12 @@ -30,6 +30,7 @@
2.13 DOCUMENT_INTERVAL = 100
2.14 FIELD_INTERVAL = 100
2.15 FLUSH_INTERVAL = 10000
2.16 +POSITIONS_FLUSH_INTERVAL = 1000000
2.17 OPEN_PARTITIONS = 20
2.18
2.19 # High-level classes.
2.20 @@ -86,12 +87,13 @@
2.21 Building term information and writing it to the term and field dictionaries.
2.22 """
2.23
2.24 - def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval):
2.25 + def __init__(self, pathname, interval, doc_interval, field_interval, flush_interval, positions_flush_interval):
2.26 self.pathname = pathname
2.27 self.interval = interval
2.28 self.doc_interval = doc_interval
2.29 self.field_interval = field_interval
2.30 self.flush_interval = flush_interval
2.31 + self.positions_flush_interval = positions_flush_interval
2.32
2.33 self.dict_partition = get_next_partition(get_term_partitions(self.pathname))
2.34 self.field_dict_partition = get_next_partition(get_field_partitions(self.pathname))
2.35 @@ -100,6 +102,7 @@
2.36 self.docs = []
2.37
2.38 self.doc_counter = 0
2.39 + self.position_counter = 0
2.40
2.41 def add_document(self, doc):
2.42
2.43 @@ -108,16 +111,23 @@
2.44 terms and fields if appropriate.
2.45 """
2.46
2.47 + docnum = doc.docnum
2.48 +
2.49 for term, positions in doc.terms.items():
2.50 - self.terms.setdefault(term, {})[doc.docnum] = positions
2.51 + self.terms.setdefault(term, {})[docnum] = positions
2.52 + self.position_counter += len(positions)
2.53
2.54 - self.docs.append((doc.docnum, doc.fields))
2.55 + self.docs.append((docnum, doc.fields))
2.56
2.57 self.doc_counter += 1
2.58 - if self.flush_interval and self.doc_counter >= self.flush_interval:
2.59 +
2.60 + if self.flush_interval and self.doc_counter >= self.flush_interval or \
2.61 + self.positions_flush_interval and self.position_counter >= self.positions_flush_interval:
2.62 +
2.63 self.flush_terms()
2.64 self.flush_fields()
2.65 self.doc_counter = 0
2.66 + self.position_counter = 0
2.67
2.68 def get_term_writer(self):
2.69
2.70 @@ -227,13 +237,14 @@
2.71 "An inverted index solution encapsulating the various components."
2.72
2.73 def __init__(self, pathname, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, field_interval=FIELD_INTERVAL,
2.74 - flush_interval=FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
2.75 + flush_interval=FLUSH_INTERVAL, positions_flush_interval=POSITIONS_FLUSH_INTERVAL, open_partitions=OPEN_PARTITIONS):
2.76
2.77 self.pathname = pathname
2.78 self.interval = interval
2.79 self.doc_interval = doc_interval
2.80 self.field_interval = field_interval
2.81 self.flush_interval = flush_interval
2.82 + self.positions_flush_interval = positions_flush_interval
2.83 self.open_partitions = open_partitions
2.84 self.reader = None
2.85 self.writer = None
2.86 @@ -244,7 +255,7 @@
2.87
2.88 self._ensure_directory()
2.89 self.writer = IndexWriter(self.pathname, self.interval, self.doc_interval,
2.90 - self.field_interval, self.flush_interval)
2.91 + self.field_interval, self.flush_interval, self.positions_flush_interval)
2.92 return self.writer
2.93
2.94 def _ensure_directory(self):