1.1 --- a/iixr/terms.py Sat Feb 12 01:23:58 2011 +0100
1.2 +++ b/iixr/terms.py Sun Feb 13 02:49:55 2011 +0100
1.3 @@ -18,29 +18,87 @@
1.4 with this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from iixr.data import *
1.8 from iixr.files import *
1.9 -from iixr.positions import *
1.10 from iixr.phrases import PhraseIterator
1.11 from os.path import commonprefix # to find common string prefixes
1.12 -from bisect import bisect_right # to find terms in the dictionary index
1.13
1.14 class TermWriter(FileWriter):
1.15
1.16 "Writing term information to files."
1.17
1.18 - def reset(self):
1.19 + def begin(self, docnum_size, position_size):
1.20 +
1.21 + "Begin writing to the file."
1.22 +
1.23 + self.write_numbers((docnum_size, position_size))
1.24 self.end_record()
1.25 +
1.26 + self.data_start = self.tell()
1.27 + self.docnum_size = docnum_size
1.28 + self.position_size = position_size
1.29 + self.subtractor = get_subtractor(docnum_size)
1.30 self.last_term = ""
1.31 - self.last_offset = 0
1.32
1.33 - def write_term(self, term, offset, frequency, doc_frequency):
1.34 + def write_terms(self, terms):
1.35
1.36 """
1.37 - Write the given 'term', its position file 'offset', its 'frequency' and
1.38 - its 'doc_frequency' (number of documents in which it appears) to the
1.39 - term information file.
1.40 + Write the 'terms' to the term information file, with each term's details
1.41 + stored in a separate record.
1.42 """
1.43
1.44 + if hasattr(terms, "items"):
1.45 + terms = terms.items()
1.46 + terms.sort()
1.47 +
1.48 + for term, doc_positions in terms:
1.49 + if not doc_positions:
1.50 + continue
1.51 +
1.52 + if hasattr(doc_positions, "items"):
1.53 + doc_positions = doc_positions.items()
1.54 +
1.55 + docnum, positions = doc_positions[0]
1.56 +
1.57 + if not positions:
1.58 + continue
1.59 +
1.60 + # Start the writing, if appropriate.
1.61 +
1.62 + if self.data_start is None:
1.63 + self.begin(sizeof(docnum), sizeof(positions[0]))
1.64 +
1.65 + # Write each term and document positions.
1.66 +
1.67 + self.write_term(term, doc_positions)
1.68 + self.end_record()
1.69 +
1.70 + # Methods requiring an open record.
1.71 +
1.72 + def write_term(self, term, doc_positions):
1.73 +
1.74 + """
1.75 + Write the given 'term', its document frequency (number of documents in
1.76 + which it appears), and 'doc_positions' to the term information file.
1.77 + """
1.78 +
1.79 + self.write_term_only(term)
1.80 +
1.81 + # Write the document frequency and the term positions.
1.82 +
1.83 + self.write_positions(doc_positions)
1.84 +
1.85 + def write_term_plus_remaining(self, term, data):
1.86 +
1.87 + "Write the given 'term' and the document position 'data'."
1.88 +
1.89 + self.write_term_only(term)
1.90 + self.write_remaining(data)
1.91 +
1.92 + def write_term_only(self, term):
1.93 +
1.94 + "Write only the given 'term'."
1.95 +
1.96 if term <= self.last_term:
1.97 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
1.98
1.99 @@ -52,430 +110,173 @@
1.100 self.write_number(common)
1.101 self.write_string(suffix)
1.102
1.103 - # Write the offset delta.
1.104 - # Write the frequency.
1.105 + self.last_term = term
1.106 +
1.107 + def write_positions(self, doc_positions):
1.108 +
1.109 + "Write the given 'doc_positions' to the file."
1.110 +
1.111 + # Make sure that the positions are sorted.
1.112 +
1.113 + doc_positions.sort()
1.114 +
1.115 # Write the document frequency.
1.116
1.117 - self.write_numbers((
1.118 - offset - self.last_offset,
1.119 - frequency,
1.120 - doc_frequency
1.121 - ))
1.122 + self.write_number(len(doc_positions))
1.123 +
1.124 + last_docnum = None
1.125 +
1.126 + for docnum, positions in doc_positions:
1.127 +
1.128 + # Store the first document number as it is.
1.129 +
1.130 + if last_docnum is None:
1.131 + docnum_seq = docnum
1.132 +
1.133 + # Reject out-of-order documents.
1.134 +
1.135 + elif docnum < last_docnum:
1.136 + raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
1.137
1.138 - self.last_term = term
1.139 - self.last_offset = offset
1.140 + # Calculate an ongoing delta.
1.141 +
1.142 + else:
1.143 + docnum_seq = self.subtractor(docnum, last_docnum)
1.144 +
1.145 + # Write the document number and positions.
1.146 +
1.147 + self.write_sequence_value(docnum_seq, self.docnum_size)
1.148 + self.write_monotonic_sequence(positions, self.position_size)
1.149 +
1.150 + last_docnum = docnum
1.151 +
1.152 + # Write a terminating byte to indicate that no more document pages
1.153 + # exist.
1.154 +
1.155 + self.write_byte(0)
1.156
1.157 class TermReader(FileReader):
1.158
1.159 "Reading term information from files."
1.160
1.161 - def reset(self):
1.162 + def begin(self):
1.163 +
1.164 + "Begin reading from the file."
1.165 +
1.166 + self.begin_record()
1.167 + try:
1.168 + self.docnum_size, self.position_size = self.read_numbers(2)
1.169 + except EOFError:
1.170 + self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
1.171 +
1.172 + self.data_start = self.tell()
1.173 + self.adder = get_adder(self.docnum_size)
1.174 self.last_term = ""
1.175 - self.last_offset = 0
1.176 - self.begin_record()
1.177 +
1.178 + def get_sizes(self):
1.179 + return self.docnum_size, self.position_size
1.180 +
1.181 + # Methods requiring an open record.
1.182
1.183 def read_term(self):
1.184
1.185 + "Read a term and its document positions from the term information file."
1.186 +
1.187 + # Read the term.
1.188 +
1.189 + self.read_term_only()
1.190 +
1.191 + # Read the document frequency and the term positions.
1.192 +
1.193 + positions = self.read_positions()
1.194 +
1.195 + return self.last_term, positions
1.196 +
1.197 + def read_term_plus_remaining(self):
1.198 +
1.199 """
1.200 - Read a term, its position file offset, its frequency and its document
1.201 - frequency from the term information file.
1.202 + Read a term and the unprocessed document position data.
1.203 """
1.204
1.205 + self.read_term_only()
1.206 + return self.last_term, self.read_remaining()
1.207 +
1.208 + def read_term_only(self):
1.209 +
1.210 + "Read a term only."
1.211 +
1.212 # Read the prefix length and term suffix.
1.213
1.214 common = self.read_number()
1.215 suffix = self.read_string()
1.216
1.217 self.last_term = self.last_term[:common] + suffix
1.218 -
1.219 - # Read the offset delta.
1.220 -
1.221 - self.last_offset += self.read_number()
1.222 -
1.223 - # Read the frequency.
1.224 -
1.225 - frequency = self.read_number()
1.226 -
1.227 - # Read the document frequency.
1.228 -
1.229 - doc_frequency = self.read_number()
1.230 + return self.last_term
1.231
1.232 - return self.last_term, self.last_offset, frequency, doc_frequency
1.233 -
1.234 - def go_to_term(self, term, offset, info_offset):
1.235 -
1.236 - """
1.237 - Seek past the entry for 'term' having 'offset' to 'info_offset'. This
1.238 - permits the scanning for later terms from the specified term.
1.239 - """
1.240 -
1.241 - self.seek(info_offset)
1.242 - self.last_term = term
1.243 - self.last_offset = offset
1.244 -
1.245 -class TermIndexWriter(TermWriter):
1.246 + def read_positions(self):
1.247
1.248 - "Writing term dictionary index details to files."
1.249 -
1.250 - def reset(self):
1.251 - TermWriter.reset(self)
1.252 - self.last_info_offset = 0
1.253 -
1.254 - def write_term(self, term, offset, frequency, doc_frequency, info_offset):
1.255 -
1.256 - """
1.257 - Write the given 'term', its position file 'offset', its 'frequency' and
1.258 - its 'doc_frequency' to the term dictionary index file, along with the
1.259 - 'info_offset' in the term information file.
1.260 - """
1.261 + "Read document positions from the term information file."
1.262
1.263 - TermWriter.write_term(self, term, offset, frequency, doc_frequency)
1.264 -
1.265 - # Write the information file offset delta.
1.266 -
1.267 - self.write_number(info_offset - self.last_info_offset)
1.268 -
1.269 - self.last_info_offset = info_offset
1.270 + doc_positions = []
1.271
1.272 -class TermIndexReader(TermReader):
1.273 -
1.274 - "Reading term dictionary index details from files."
1.275 -
1.276 - def reset(self):
1.277 - TermReader.reset(self)
1.278 - self.last_info_offset = 0
1.279 + while 1:
1.280
1.281 - def read_term(self):
1.282 -
1.283 - """
1.284 - Read a term, its position file offset, its frequency, its document
1.285 - frequency and a term information file offset from the term dictionary
1.286 - index file.
1.287 - """
1.288 -
1.289 - term, offset, frequency, doc_frequency = TermReader.read_term(self)
1.290 -
1.291 - # Read the offset delta.
1.292 -
1.293 - self.last_info_offset += self.read_number()
1.294 + # Read the document frequency.
1.295
1.296 - return term, offset, frequency, doc_frequency, self.last_info_offset
1.297 -
1.298 -class TermDictionaryWriter:
1.299 -
1.300 - "Writing term dictionaries."
1.301 -
1.302 - def __init__(self, info_writer, index_writer, position_dict_writer, interval):
1.303 - self.info_writer = info_writer
1.304 - self.index_writer = index_writer
1.305 - self.position_dict_writer = position_dict_writer
1.306 - self.interval = interval
1.307 - self.entry = 0
1.308 -
1.309 - self.index_writer.reset()
1.310 + npositions = self.read_number()
1.311
1.312 - def _write_term(self, term, offset, frequency, doc_frequency):
1.313 -
1.314 - """
1.315 - Write the given 'term', its position file 'offset', its 'frequency' and
1.316 - its 'doc_frequency' (number of documents in which it appears) to the
1.317 - term information file. Return the offset before the term information was
1.318 - written to the file.
1.319 - """
1.320 -
1.321 - if self.entry % self.interval == 0:
1.322 - self.info_writer.reset()
1.323 - info_offset = self.info_writer.tell()
1.324 - self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
1.325 + last_docnum = None
1.326 + i = 0
1.327 + while i < npositions:
1.328
1.329 - self.info_writer.write_term(term, offset, frequency, doc_frequency)
1.330 - self.entry += 1
1.331 -
1.332 - def write_term_positions(self, term, doc_positions):
1.333 -
1.334 - """
1.335 - Write the given 'term' and the 'doc_positions' recording the documents
1.336 - and positions at which the term is found.
1.337 - """
1.338 -
1.339 - offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
1.340 -
1.341 - if not frequency or not doc_frequency:
1.342 - raise ValueError, "Term %r has no occurrences recorded: %r" % (term, doc_positions)
1.343 -
1.344 - self._write_term(term, offset, frequency, doc_frequency)
1.345 + # Read the document number.
1.346
1.347 - def close(self):
1.348 - self.info_writer.close()
1.349 - self.index_writer.close()
1.350 - self.position_dict_writer.close()
1.351 -
1.352 -class TermDictionaryReader:
1.353 -
1.354 - "Reading term dictionaries."
1.355 + docnum = self.read_sequence_value(self.docnum_size)
1.356 + if last_docnum is not None:
1.357 + docnum = self.adder(docnum, last_docnum)
1.358
1.359 - def __init__(self, info_reader, index_reader, position_dict_reader):
1.360 - self.info_reader = info_reader
1.361 - self.index_reader = index_reader
1.362 - self.position_dict_reader = position_dict_reader
1.363 -
1.364 - self.info_reader.reset()
1.365 - self.index_reader.reset()
1.366 -
1.367 - self.entry = 0
1.368 - self.terms = []
1.369 - try:
1.370 - while 1:
1.371 - self.terms.append(self.index_reader.read_term())
1.372 - except EOFError:
1.373 - pass
1.374 -
1.375 - # Large numbers for ordering purposes.
1.376 + # Read the positions.
1.377
1.378 - if self.terms:
1.379 - self.max_offset = self.terms[-1][1] + 1
1.380 - else:
1.381 - self.max_offset = None
1.382 -
1.383 - def _find_closest_entry(self, term):
1.384 -
1.385 - """
1.386 - Find the offsets and frequencies of 'term' from the term dictionary or
1.387 - the closest term starting with the value of 'term'.
1.388 -
1.389 - Return the closest index entry consisting of a term, the position file
1.390 - offset, the term frequency, the document frequency, and the term details
1.391 - file offset.
1.392 - """
1.393 + positions = self.read_monotonic_sequence(self.position_size)
1.394 + doc_positions.append((docnum, positions))
1.395
1.396 - i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
1.397 -
1.398 - # Get the entry position providing the term or one preceding it.
1.399 - # If no entry precedes the requested term, return the very first entry
1.400 - # as the closest.
1.401 -
1.402 - if i == -1:
1.403 - self.entry = 0
1.404 - return self.terms[0]
1.405 - else:
1.406 - self.entry = i
1.407 - return self.terms[i]
1.408 -
1.409 - def _find_closest_term(self, term):
1.410 -
1.411 - """
1.412 - Find the offsets and frequencies of 'term' from the term dictionary or
1.413 - the closest term starting with the value of 'term'.
1.414 + last_docnum = docnum
1.415 + i += 1
1.416
1.417 - Return the closest term (or the term itself), the position file offset,
1.418 - the term frequency, the document frequency, and the term details file
1.419 - offset (or None if the reader is already positioned).
1.420 - """
1.421 -
1.422 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
1.423 -
1.424 - # Where the term is found immediately, return the offset and
1.425 - # frequencies. If the term does not appear, return the details of the
1.426 - # closest entry.
1.427 -
1.428 - if term <= found_term:
1.429 - return found_term, offset, frequency, doc_frequency, info_offset
1.430 + # Read a terminating byte to discover whether more document pages
1.431 + # exist.
1.432
1.433 - # Otherwise, seek past the index term's entry in the information file
1.434 - # and scan for the desired term.
1.435 -
1.436 - else:
1.437 - # Reset the term and offset for the new page.
1.438 - self.info_reader.go_to_term("", 0, info_offset)
1.439 - try:
1.440 - while term > found_term:
1.441 - found_term, offset, frequency, doc_frequency = self._read_term()
1.442 - except EOFError:
1.443 - pass
1.444 -
1.445 - return found_term, offset, frequency, doc_frequency, None
1.446 -
1.447 - def _find_term(self, term):
1.448 + if not self.read_byte():
1.449 + break
1.450
1.451 - """
1.452 - Find the position file offset and frequency of 'term' from the term
1.453 - dictionary.
1.454 - """
1.455 -
1.456 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.457 -
1.458 - # If the term is found, return the offset and frequencies.
1.459 -
1.460 - if term == found_term:
1.461 - return offset, frequency, doc_frequency
1.462 - else:
1.463 - return None
1.464 -
1.465 - def _get_term_and_positions(self, term, offset, frequency, doc_frequency):
1.466 + return doc_positions
1.467
1.468 - """
1.469 - Return the term plus positions details using the given 'term', 'offset',
1.470 - 'frequency' and 'doc_frequency'.
1.471 - """
1.472 -
1.473 - return term, frequency, doc_frequency, self._get_positions(offset, doc_frequency)
1.474 -
1.475 - def _get_positions(self, offset, doc_frequency):
1.476 +class TermIterator(TermReader):
1.477
1.478 - """
1.479 - Obtain positions from the position index 'offset' expecting a number of
1.480 - documents equal to the given 'doc_frequency'.
1.481 - """
1.482 -
1.483 - return self.position_dict_reader.read_term_positions(offset, doc_frequency)
1.484 -
1.485 - # Iterator convenience methods.
1.486 + "An iterator over terms and positions read from a file."
1.487
1.488 def __iter__(self):
1.489 - self.rewind()
1.490 return self
1.491
1.492 def next(self):
1.493 try:
1.494 + self.begin_record()
1.495 return self.read_term()
1.496 except EOFError:
1.497 raise StopIteration
1.498
1.499 - # Sequential access methods.
1.500 -
1.501 - def rewind(self):
1.502 - self.entry = 0
1.503 - self.info_reader.rewind()
1.504 -
1.505 - def read_term(self):
1.506 -
1.507 - """
1.508 - Return the next term, its frequency, its document frequency, and the
1.509 - documents and positions at which the term is found.
1.510 - """
1.511 -
1.512 - return self._get_term_and_positions(*self._read_term())
1.513 -
1.514 - def _read_term(self):
1.515 -
1.516 - try:
1.517 - term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.518 - except EOFError:
1.519 - self.entry += 1
1.520 - try:
1.521 - term, offset, frequency, doc_frequency, info_offset = self.terms[self.entry]
1.522 - except IndexError:
1.523 - raise EOFError
1.524 - else:
1.525 - # Reset the term and offset for the new page.
1.526 -
1.527 - self.info_reader.go_to_term("", 0, info_offset)
1.528 -
1.529 - # Skip the term in the information file.
1.530 -
1.531 - self.info_reader.read_term()
1.532 +class TermDataIterator(TermReader):
1.533
1.534 - return term, offset, frequency, doc_frequency
1.535 -
1.536 - def go_to_term(self, term):
1.537 -
1.538 - """
1.539 - Navigate to 'term' in the dictionary, returning the details from its
1.540 - entry. The returned details can be augmented with position information
1.541 - when presented to the _get_term_and_positions method.
1.542 - """
1.543 -
1.544 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.545 -
1.546 - # Position the reader, if necessary.
1.547 -
1.548 - if info_offset is not None:
1.549 + "An iterator over terms and unprocessed document positions data."
1.550
1.551 - # Reset the term and offset for the new page.
1.552 -
1.553 - self.info_reader.go_to_term("", 0, info_offset)
1.554 -
1.555 - # Skip the term in the information file.
1.556 -
1.557 - self.info_reader.read_term()
1.558 -
1.559 - return found_term, offset, frequency, doc_frequency
1.560 -
1.561 - # Query methods.
1.562 -
1.563 - def get_terms(self):
1.564 -
1.565 - "Return a list of all terms."
1.566 -
1.567 - return iter(self)
1.568 + def __iter__(self):
1.569 + return self
1.570
1.571 - def find_terms(self, term):
1.572 -
1.573 - "Return all terms whose values start with the value of 'term'."
1.574 -
1.575 - terms = []
1.576 -
1.577 - found_term, offset, frequency, doc_frequency = self.go_to_term(term)
1.578 -
1.579 - # Read and record terms.
1.580 -
1.581 + def next(self):
1.582 try:
1.583 - # Add the found term if it starts with the specified term.
1.584 -
1.585 - while found_term.startswith(term):
1.586 - terms.append(found_term)
1.587 - found_term, offset, frequency, doc_frequency = self._read_term()
1.588 -
1.589 + self.begin_record()
1.590 + return self.read_term_plus_remaining()
1.591 except EOFError:
1.592 - pass
1.593 -
1.594 - return terms
1.595 -
1.596 - def find_positions(self, term):
1.597 -
1.598 - "Return the documents and positions at which the given 'term' is found."
1.599 -
1.600 - t = self._find_term(term)
1.601 - if t is None:
1.602 - return []
1.603 - else:
1.604 - offset, frequency, doc_frequency = t
1.605 - return self._get_positions(offset, doc_frequency)
1.606 -
1.607 - def find_common_positions(self, terms):
1.608 -
1.609 - """
1.610 - Return the documents and positions at which all the given 'terms' are
1.611 - found, where only common documents are returned.
1.612 - """
1.613 -
1.614 - return PhraseIterator([self.find_positions(term) for term in terms])
1.615 -
1.616 - def get_frequency(self, term):
1.617 -
1.618 - "Return the frequency of the given 'term'."
1.619 -
1.620 - t = self._find_term(term)
1.621 - if t is None:
1.622 - return None
1.623 - else:
1.624 - offset, frequency, doc_frequency = t
1.625 - return frequency
1.626 -
1.627 - def get_document_frequency(self, term):
1.628 -
1.629 - "Return the document frequency of the given 'term'."
1.630 -
1.631 - t = self._find_term(term)
1.632 - if t is None:
1.633 - return None
1.634 - else:
1.635 - offset, frequency, doc_frequency = t
1.636 - return doc_frequency
1.637 -
1.638 - def close(self):
1.639 - self.info_reader.close()
1.640 - self.index_reader.close()
1.641 - self.position_dict_reader.close()
1.642 + raise StopIteration
1.643
1.644 # vim: tabstop=4 expandtab shiftwidth=4