1.1 --- a/iixr/terms.py Sun Feb 13 02:49:55 2011 +0100
1.2 +++ b/iixr/terms.py Mon Feb 14 00:44:57 2011 +0100
1.3 @@ -20,8 +20,10 @@
1.4
1.5 from iixr.data import *
1.6 from iixr.files import *
1.7 -from iixr.phrases import PhraseIterator
1.8 +from itermerge import itermerge
1.9 from os.path import commonprefix # to find common string prefixes
1.10 +from bisect import bisect_right, insort_right
1.11 +import operator
1.12
1.13 class TermWriter(FileWriter):
1.14
1.15 @@ -251,13 +253,91 @@
1.16
1.17 return doc_positions
1.18
1.19 -class TermIterator(TermReader):
1.20 +# Indexes covering the information files.
1.21 +
class TermIndexWriter(FileWriter):

    "Writing term index information to files."

    def begin(self):

        "Begin writing to the file."

        # Note the current file position and reset the state against which
        # each subsequent term and offset is encoded.

        self.data_start = self.tell()
        self.last_term = ""
        self.last_offset = 0

    def write_term(self, term, offset):

        """
        Write the given 'term' and 'offset'.

        The term is written as the length of the prefix it shares with the
        previously written term, followed by the remaining suffix. The offset
        is written as the difference from the previously written offset.

        A ValueError is raised if 'term' is less than or equal to the
        previously written term.
        """

        if term <= self.last_term:
            raise ValueError("Term %r precedes the previous term %r." % (term, self.last_term))

        # Write the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))
        suffix = term[common:]

        self.write_number(common)
        self.write_string(suffix)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)

        # Remember this entry as the basis for encoding the next one.

        self.last_term = term
        self.last_offset = offset
1.55 +
class TermIndexReader(FileReader):

    "Reading term index information from files."

    def begin(self):

        "Begin reading from the file."

        # Note the current file position and reset the running term and offset
        # from which each record is reconstructed.

        self.data_start = self.tell()
        self.last_term, self.last_offset = "", 0

    def read_term(self):

        """
        Read a term and an offset from the file, returning them as a
        (term, offset) tuple.

        Each record holds a prefix length, a term suffix and an offset delta,
        all interpreted relative to the previously read record.
        """

        # Rebuild the term from the retained prefix of the previous term and
        # the suffix stored in this record.

        prefix_length = self.read_number()
        remainder = self.read_string()
        self.last_term = self.last_term[:prefix_length] + remainder

        # Accumulate the offset delta to obtain the absolute offset.

        delta = self.read_number()
        self.last_offset = self.last_offset + delta

        return self.last_term, self.last_offset
1.84 +
1.85 +# Iterator support classes.
1.86 +
1.87 +class Iterator:
1.88 +
1.89 + "Common iterator support."
1.90 +
1.91 + def go_to_term(self, term):
1.92 + t, dp = self.next()
1.93 + while t < term:
1.94 + t, dp = self.next()
1.95 + return t, dp
1.96
1.97 def __iter__(self):
1.98 return self
1.99
1.100 +# External reading classes.
1.101 +
1.102 +class TermIterator(TermReader, Iterator):
1.103 +
1.104 + "An iterator over terms and positions read from a file."
1.105 +
1.106 def next(self):
1.107 try:
1.108 self.begin_record()
1.109 @@ -265,7 +345,7 @@
1.110 except EOFError:
1.111 raise StopIteration
1.112
1.113 -class TermDataIterator(TermReader):
1.114 +class TermDataIterator(TermReader, Iterator):
1.115
1.116 "An iterator over terms and unprocessed document positions data."
1.117
1.118 @@ -279,4 +359,148 @@
1.119 except EOFError:
1.120 raise StopIteration
1.121
class TermIndexIterator(TermIndexReader):

    "An iterator over terms and offsets read from a file."

    def __iter__(self):

        "Return this object as its own iterator."

        return self

    def next(self):

        """
        Return the result of 'read_term' for the next record. An EOFError
        raised while starting or reading the record ends the iteration by
        raising StopIteration.
        """

        try:
            self.begin_record()
            entry = self.read_term()
        except EOFError:
            raise StopIteration
        else:
            return entry
1.135 +
1.136 +class CombinedIterator:
1.137 +
1.138 + "An iterator providing index and information file access."
1.139 +
1.140 + def __init__(self, reader, index_reader):
1.141 + self.reader = reader
1.142 + self.index_reader = index_reader
1.143 + self.records = list(index_reader)
1.144 +
1.145 + def go_to_term(self, term):
1.146 +
1.147 + # Get the record providing a term less than or equal to the requested
1.148 + # term, getting the first entry if no such records exist.
1.149 +
1.150 + i = max(0, bisect_right(self.records, (term, None)) - 1)
1.151 + t, offset = self.records[i]
1.152 +
1.153 + # Seek to the corresponding record in the information file.
1.154 +
1.155 + self.reader.seek(offset)
1.156 +
1.157 + # Where the found term is equal or greater, just read the positions for
1.158 + # the index entry.
1.159 +
1.160 + if t >= term:
1.161 +
1.162 + # Skip the term information, overwrite the reader's state, and get
1.163 + # the positions.
1.164 +
1.165 + self.reader.begin_record()
1.166 + self.reader.read_term_only()
1.167 + self.reader.last_term = t
1.168 +
1.169 + return t, self.reader.read_positions()
1.170 +
1.171 + # Where the found term is less, use the information file to find the
1.172 + # term or the one after.
1.173 +
1.174 + else:
1.175 +
1.176 + # Overwrite the reader's state, then scan for the term.
1.177 +
1.178 + self.reader.last_term = t
1.179 + t, dp = self.reader.next()
1.180 + while t < term:
1.181 + t, dp = self.reader.next()
1.182 +
1.183 + return t, dp
1.184 +
1.185 + def __iter__(self):
1.186 + return self
1.187 +
1.188 + def next(self):
1.189 + return self.reader.next()
1.190 +
1.191 + def close(self):
1.192 + if self.reader is not None:
1.193 + self.reader.close()
1.194 + self.reader = None
1.195 + if self.index_reader is not None:
1.196 + self.index_reader.close()
1.197 + self.index_reader = None
1.198 +
class MultipleReader(itermerge):

    "Accessing many term readers at once."

    def __init__(self, readers, combine=None):

        """
        Initialise a master index reader using underlying 'readers' and a
        'combine' function which knows how to combine position information from
        different sources. Where 'combine' is omitted, operator.add is used.
        """

        self.readers = readers
        self.combine = combine or operator.add

        # Set up the merging iterator over the readers, with no item held back
        # from a previous request.

        itermerge.__init__(self, self.readers)
        self.next_value = None

    def get_sizes(self):

        """
        Return the sizes reported by the first reader, or (0, 0) where there
        are no readers.
        """

        # Readers must have compatible sizes, so only the first is consulted.

        if not self.readers:
            return 0, 0
        return self.readers[0].get_sizes()

    def go_to_term(self, term):

        """
        Move each reader to the given 'term', rebuilding the collection of
        merged sources from the readers which found an item, and return the
        next combined item.
        """

        self.iters = []

        for reader in self.readers:

            # Readers having nothing at or after the term are left out.

            try:
                found = reader.go_to_term(term)
            except StopIteration:
                continue

            insort_right(self.iters, (found, reader.next))

        # Discard any item held back before the readers were repositioned.

        self.next_value = None
        return self.next()

    def next(self):

        """
        Return the next (term, positions) item, with the positions from
        consecutive items having the same term combined using the 'combine'
        function.
        """

        # Start with the item held back by the previous request, if any.

        held = self.next_value

        if held is None:
            term, positions = itermerge.next(self)
        else:
            term, positions = held

        # Look at the following items to see if they have positions for the
        # current term, holding back the first item which does not.

        try:
            while True:
                following_term, following_positions = itermerge.next(self)
                if following_term != term:
                    break
                positions = self.combine(positions, following_positions)

            self.next_value = following_term, following_positions

        # Where an item could not be fetched, hold nothing back so that the
        # next request consults the merged sources again.

        except StopIteration:
            self.next_value = None

        return term, positions

    def close(self):

        "Close all the readers and discard them."

        for reader in self.readers:
            reader.close()
        self.readers = []
1.265 +
1.266 # vim: tabstop=4 expandtab shiftwidth=4