1.1 --- a/iixr.py Tue Sep 08 00:13:23 2009 +0200
1.2 +++ b/iixr.py Tue Sep 08 19:49:36 2009 +0200
1.3 @@ -37,7 +37,7 @@
1.4 TERM_INTERVAL = 100
1.5 DOCUMENT_INTERVAL = 100
1.6 FIELD_INTERVAL = 100
1.7 -FLUSH_INTERVAL = 100000
1.8 +FLUSH_INTERVAL = 10000
1.9
1.10 TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
1.11 FIELD_FILENAMES = "fields", "fields_index"
1.12 @@ -877,7 +877,10 @@
1.13
1.14 # Large numbers for ordering purposes.
1.15
1.16 - self.max_offset = self.terms[-1][1] + 1
1.17 + if self.terms:
1.18 + self.max_offset = self.terms[-1][1] + 1
1.19 + else:
1.20 + self.max_offset = None
1.21
1.22 def _find_closest_entry(self, term):
1.23
1.24 @@ -1204,7 +1207,10 @@
1.25
1.26 # Large numbers for ordering purposes.
1.27
1.28 - self.max_offset = self.docs[-1][1]
1.29 + if self.docs:
1.30 + self.max_offset = self.docs[-1][1]
1.31 + else:
1.32 + self.max_offset = None
1.33
1.34 def rewind(self):
1.35 self.field_reader.rewind()
1.36 @@ -1509,6 +1515,39 @@
1.37
1.38 # High-level classes.
1.39
1.40 +class Document:
1.41 +
1.42 + "A container of document information."
1.43 +
1.44 + def __init__(self, docnum):
1.45 + self.docnum = docnum
1.46 + self.fields = []
1.47 + self.terms = {}
1.48 +
1.49 + def add_position(self, term, position):
1.50 +
1.51 + """
1.52 + Add a position entry for the given 'term', indicating the given
1.53 + 'position'.
1.54 + """
1.55 +
1.56 + self.terms.setdefault(term, []).append(position)
1.57 +
1.58 + def add_field(self, identifier, value):
1.59 +
1.60 + "Add a field having the given 'identifier' and 'value'."
1.61 +
1.62 + self.fields.append((identifier, unicode(value))) # convert to string
1.63 +
1.64 + def set_fields(self, docnum, fields):
1.65 +
1.66 + """
1.67 + Replace this document's fields with the given 'fields': a list of tuples
1.68 + each containing an integer identifier and a string value ('docnum' is unused).
1.69 + """
1.70 +
1.71 + self.fields = fields
1.72 +
1.73 class IndexWriter:
1.74
1.75 """
1.76 @@ -1529,51 +1568,17 @@
1.77
1.78 self.doc_counter = 0
1.79
1.80 - def add_position(self, term, docnum, position):
1.81 -
1.82 - """
1.83 - Add a position entry for the given 'term' in the document with the given
1.84 - 'docnum', indicating the given 'position'.
1.85 - """
1.86 -
1.87 - if not self.terms.has_key(term):
1.88 - doc_positions = self.terms[term] = {}
1.89 - else:
1.90 - doc_positions = self.terms[term]
1.91 -
1.92 - if not doc_positions.has_key(docnum):
1.93 - doc = doc_positions[docnum] = []
1.94 - else:
1.95 - doc = doc_positions[docnum]
1.96 -
1.97 - doc.append(position)
1.98 -
1.99 - def add_field(self, docnum, identifier, value):
1.100 + def add_document(self, doc):
1.101
1.102 """
1.103 - Add for the document with the given 'docnum' a field having the given
1.104 - 'identifier' and 'value'.
1.105 + Add the given document 'doc', updating the document counter and flushing
1.106 + terms and fields if appropriate.
1.107 """
1.108
1.109 - if not self.docs.has_key(docnum):
1.110 - doc_fields = self.docs[docnum] = []
1.111 - else:
1.112 - doc_fields = self.docs[docnum]
1.113 -
1.114 - doc_fields.append((identifier, unicode(value))) # convert to string
1.115 -
1.116 - def set_fields(self, docnum, fields):
1.117 -
1.118 - """
1.119 - Add for the document with the given 'docnum' the given 'fields': a list
1.120 - of tuples each containing an integer identifier and a string value.
1.121 - """
1.122 -
1.123 - self.docs[docnum] = fields
1.124 -
1.125 - def commit_document(self):
1.126 -
1.127 - "Update the document counter, flushing terms and fields if appropriate."
1.128 + for term, positions in doc.terms.items():
1.129 + self.terms.setdefault(term, {})[doc.docnum] = positions
1.130 +
1.131 + self.docs[doc.docnum] = doc.fields
1.132
1.133 self.doc_counter += 1
1.134 if self.flush_interval and self.doc_counter >= self.flush_interval:
2.1 --- a/test.py Tue Sep 08 00:13:23 2009 +0200
2.2 +++ b/test.py Tue Sep 08 19:49:36 2009 +0200
2.3 @@ -417,9 +417,11 @@
2.4 index = iixr.Index("test_index")
2.5 wi = index.get_writer(3, 2, 6)
2.6 for docnum, text in docs:
2.7 + doc = iixr.Document(docnum)
2.8 for position, term in enumerate(text.split()):
2.9 - wi.add_position(term, docnum, position)
2.10 - wi.add_field(docnum, 123, text)
2.11 + doc.add_position(term, position)
2.12 + doc.add_field(123, text)
2.13 + wi.add_document(doc)
2.14 wi.close()
2.15
2.16 rd = index.get_reader()