1.1 --- a/iixr.py Tue Sep 15 00:15:11 2009 +0200
1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
1.3 @@ -1,1876 +0,0 @@
1.4 -#!/usr/bin/env python
1.5 -
1.6 -"""
1.7 -A simple (and sane) text indexing library.
1.8 -
1.9 -Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>
1.10 -
1.11 -This program is free software; you can redistribute it and/or modify it under
1.12 -the terms of the GNU General Public License as published by the Free Software
1.13 -Foundation; either version 3 of the License, or (at your option) any later
1.14 -version.
1.15 -
1.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 -
1.20 -You should have received a copy of the GNU General Public License along
1.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 -"""
1.23 -
1.24 -from os import dup, fdopen # independent iterator access to files
1.25 -from os import listdir, mkdir # index and partition discovery
1.26 -from os import remove, rename # partition manipulation
1.27 -from os.path import exists, join
1.28 -from os.path import commonprefix # to find common string prefixes
1.29 -from bisect import bisect_right # to find terms in the dictionary index
1.30 -import bz2, zlib # for field compression
1.31 -from itermerge import itermerge
1.32 -
1.33 -try:
1.34 - set
1.35 -except NameError:
1.36 - from sets import Set as set
1.37 -
1.38 -# Constants.
1.39 -
1.40 -TERM_INTERVAL = 100
1.41 -DOCUMENT_INTERVAL = 100
1.42 -FIELD_INTERVAL = 100
1.43 -FLUSH_INTERVAL = 10000
1.44 -
1.45 -WRITE_CACHE_SIZE = 100000
1.46 -READ_CACHE_SIZE = 10000
1.47 -READ_CACHE_RESIZE = 5000
1.48 -
1.49 -TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index"
1.50 -FIELD_FILENAMES = "fields", "fields_index"
1.51 -
1.52 -compressors = [("b", bz2.compress), ("z", zlib.compress)]
1.53 -decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}
1.54 -
1.55 -# Utility functions.
1.56 -
1.57 -try:
1.58 - from vint import vint as _vint
1.59 -
1.60 - def vint(number):
1.61 -
1.62 - "Write 'number' as a variable-length integer."
1.63 -
1.64 - if number >= 0:
1.65 - return _vint(number)
1.66 - else:
1.67 - raise ValueError, "Number %r is negative." % number
1.68 -
1.69 -except ImportError:
1.70 -
1.71 - def vint(number):
1.72 -
1.73 - "Write 'number' as a variable-length integer."
1.74 -
1.75 - if number >= 0:
1.76 -
1.77 - # Special case: one byte containing a 7-bit number.
1.78 -
1.79 - if number < 128:
1.80 - return chr(number)
1.81 -
1.82 - # Write the number from least to most significant digits.
1.83 -
1.84 - bytes = []
1.85 -
1.86 - while number != 0:
1.87 - lsd = number & 127
1.88 - number = number >> 7
1.89 - if number != 0:
1.90 - lsd |= 128
1.91 - bytes.append(chr(lsd))
1.92 -
1.93 - return "".join(bytes)
1.94 -
1.95 - # Negative numbers are not supported.
1.96 -
1.97 - else:
1.98 - raise ValueError, "Number %r is negative." % number
1.99 -
1.100 -# Foundation classes.
1.101 -
1.102 -class File:
1.103 -
1.104 - "A basic file abstraction."
1.105 -
1.106 - def __init__(self, f):
1.107 - self.f = f
1.108 - self.reset()
1.109 -
1.110 - def reset(self):
1.111 -
1.112 - "To be used to reset the state of the reader or writer between records."
1.113 -
1.114 - pass
1.115 -
1.116 - def rewind(self):
1.117 - self.seek(0)
1.118 - self.reset()
1.119 -
1.120 - def seek(self, offset):
1.121 -
1.122 - "To be defined by readers."
1.123 -
1.124 - pass
1.125 -
1.126 - def flush(self):
1.127 -
1.128 - "To be defined by writers."
1.129 -
1.130 - pass
1.131 -
1.132 - def close(self):
1.133 - if self.f is not None:
1.134 - self.flush()
1.135 - self.f.close()
1.136 - self.f = None
1.137 -
1.138 -class FileWriter(File):
1.139 -
1.140 - "Writing basic data types to files."
1.141 -
1.142 - def __init__(self, f):
1.143 - File.__init__(self, f)
1.144 - self.cache = []
1.145 - self.cache_length = 0
1.146 -
1.147 - def write_number(self, number):
1.148 -
1.149 - "Write 'number' to the file using a variable length encoding."
1.150 -
1.151 - self.write(vint(number))
1.152 -
1.153 - def write_string(self, s, compress=0):
1.154 -
1.155 - """
1.156 - Write 's' to the file, recording its length and compressing the string
1.157 - if 'compress' is set to a true value.
1.158 - """
1.159 -
1.160 - # Convert Unicode objects to strings.
1.161 -
1.162 - if isinstance(s, unicode):
1.163 - s = s.encode("utf-8")
1.164 -
1.165 - # Compress the string if requested.
1.166 -
1.167 - if compress:
1.168 - for flag, fn in compressors:
1.169 - cs = fn(s)
1.170 -
1.171 - # Take the first string shorter than the original.
1.172 -
1.173 - if len(cs) < len(s):
1.174 - s = cs
1.175 - break
1.176 - else:
1.177 - flag = "-"
1.178 -
1.179 - else:
1.180 - flag = ""
1.181 -
1.182 - # Write the length of the data before the data itself.
1.183 -
1.184 - length = len(s)
1.185 - self.write(flag + vint(length) + s)
1.186 -
1.187 - # Cache-affected methods.
1.188 -
1.189 - def write(self, s):
1.190 - self.cache.append(s)
1.191 - self.cache_length += len(s)
1.192 - if self.cache_length >= WRITE_CACHE_SIZE:
1.193 - self.flush()
1.194 -
1.195 - def tell(self):
1.196 - return self.f.tell() + self.cache_length
1.197 -
1.198 - def flush(self):
1.199 - self.f.write("".join(self.cache))
1.200 - self.cache = []
1.201 - self.cache_length = 0
1.202 -
1.203 -class FileReader(File):
1.204 -
1.205 - "Reading basic data types from files."
1.206 -
1.207 - def __init__(self, f):
1.208 - File.__init__(self, f)
1.209 - self.reset_cache()
1.210 -
1.211 - def reset_cache(self):
1.212 - self.cache = ""
1.213 - self.cache_length = 0
1.214 - self.cache_start = 0
1.215 -
1.216 - def read_number(self):
1.217 -
1.218 - "Read a number from the file."
1.219 -
1.220 - # Read each byte, adding it to the number.
1.221 -
1.222 - shift = 0
1.223 - number = 0
1.224 - read = self.read
1.225 -
1.226 - try:
1.227 - csd = ord(read(1))
1.228 - while csd & 128:
1.229 - number += ((csd & 127) << shift)
1.230 - shift += 7
1.231 - csd = ord(read(1))
1.232 - else:
1.233 - number += (csd << shift)
1.234 - except TypeError:
1.235 - raise EOFError
1.236 -
1.237 - return number
1.238 -
1.239 - def read_string(self, decompress=0):
1.240 -
1.241 - """
1.242 - Read a string from the file, decompressing the stored data if
1.243 - 'decompress' is set to a true value.
1.244 - """
1.245 -
1.246 - # Decompress the data if requested.
1.247 -
1.248 - if decompress:
1.249 - flag = self.read(1)
1.250 - else:
1.251 - flag = "-"
1.252 -
1.253 - length = self.read_number()
1.254 - s = self.read(length)
1.255 -
1.256 - # Perform decompression if applicable.
1.257 -
1.258 - if flag != "-":
1.259 - fn = decompressors[flag]
1.260 - s = fn(s)
1.261 -
1.262 - # Convert strings to Unicode objects.
1.263 -
1.264 - return unicode(s, "utf-8")
1.265 -
1.266 - # Cache-affected methods.
1.267 -
1.268 - def read(self, n):
1.269 - needed = n - (self.cache_length - self.cache_start)
1.270 -
1.271 - # Read the needed number of characters, if possible.
1.272 -
1.273 - if needed > 0:
1.274 - s = self.f.read(max(needed, READ_CACHE_SIZE))
1.275 - self.cache += s
1.276 - self.cache_length += len(s)
1.277 -
1.278 - # Get the end of the requested block.
1.279 -
1.280 - next_start = self.cache_start + n
1.281 - s = self.cache[self.cache_start:next_start]
1.282 -
1.283 - # Reposition the pointer to the cache.
1.284 -
1.285 - self._seek_cache(len(s))
1.286 - return s
1.287 -
1.288 - def tell(self):
1.289 - return self.f.tell() - self.cache_length + self.cache_start
1.290 -
1.291 - def seek(self, offset):
1.292 - current = self.tell()
1.293 - self.f.seek(offset)
1.294 -
1.295 - # If seeking forward, attempt to navigate the cache.
1.296 -
1.297 - if offset >= current:
1.298 - self._seek_cache(offset - current)
1.299 - else:
1.300 - self.reset_cache()
1.301 -
1.302 - def _seek_cache(self, delta):
1.303 - next_start = self.cache_start + delta
1.304 -
1.305 - if next_start > 0 and next_start >= len(self.cache):
1.306 - self.reset_cache()
1.307 -
1.308 - # If the cache is too big, resize it.
1.309 -
1.310 - elif next_start > READ_CACHE_RESIZE:
1.311 - self.cache = self.cache[next_start:]
1.312 - self.cache_length = len(self.cache)
1.313 - self.cache_start = 0
1.314 -
1.315 - # Otherwise, just reference the next part of the cache.
1.316 -
1.317 - else:
1.318 - self.cache_start = next_start
1.319 -
1.320 -class FileOpener:
1.321 -
1.322 - "Opening files using their filenames."
1.323 -
1.324 - def __init__(self, filename):
1.325 - self.filename = filename
1.326 -
1.327 - def open(self, mode):
1.328 - return open(self.filename, mode)
1.329 -
1.330 - def close(self):
1.331 - pass
1.332 -
1.333 -# Specific classes for storing term and position information.
1.334 -
1.335 -class PositionWriter(FileWriter):
1.336 -
1.337 - "Writing position information to files."
1.338 -
1.339 - def reset(self):
1.340 - self.last_docnum = 0
1.341 -
1.342 - def write_positions(self, docnum, positions):
1.343 -
1.344 - """
1.345 - Write for the document 'docnum' the given 'positions'.
1.346 - Return the offset of the written record.
1.347 - """
1.348 -
1.349 - if docnum < self.last_docnum:
1.350 - raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
1.351 -
1.352 - # Record the offset of this record.
1.353 -
1.354 - offset = self.tell()
1.355 -
1.356 - # Make sure that the positions are sorted.
1.357 -
1.358 - positions.sort()
1.359 -
1.360 - # Write the position deltas.
1.361 -
1.362 - output = []
1.363 - last = 0
1.364 -
1.365 - for position in positions:
1.366 - output.append(vint(position - last))
1.367 - last = position
1.368 -
1.369 - # Write the document number delta.
1.370 - # Write the number of positions.
1.371 - # Then write the positions.
1.372 -
1.373 - self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))
1.374 -
1.375 - self.last_docnum = docnum
1.376 - return offset
1.377 -
1.378 -class PositionOpener(FileOpener):
1.379 -
1.380 - "Reading position information from files."
1.381 -
1.382 - def read_term_positions(self, offset, count):
1.383 -
1.384 - """
1.385 - Read all positions from 'offset', seeking to that position in the file
1.386 - before reading. The number of documents available for reading is limited
1.387 - to 'count'.
1.388 - """
1.389 -
1.390 - # Duplicate the file handle.
1.391 -
1.392 - f = self.open("rb")
1.393 - return PositionIterator(f, offset, count)
1.394 -
1.395 -class PositionIndexWriter(FileWriter):
1.396 -
1.397 - "Writing position index information to files."
1.398 -
1.399 - def reset(self):
1.400 - self.last_docnum = 0
1.401 - self.last_pos_offset = 0
1.402 -
1.403 - def write_positions(self, docnum, pos_offset, count):
1.404 -
1.405 - """
1.406 - Write the given 'docnum, 'pos_offset' and document 'count' to the
1.407 - position index file.
1.408 - """
1.409 -
1.410 - # Record the offset of this record.
1.411 -
1.412 - offset = self.tell()
1.413 - output = []
1.414 -
1.415 - # Write the document number delta.
1.416 -
1.417 - output.append(vint(docnum - self.last_docnum))
1.418 - self.last_docnum = docnum
1.419 -
1.420 - # Write the position file offset delta.
1.421 -
1.422 - output.append(vint(pos_offset - self.last_pos_offset))
1.423 - self.last_pos_offset = pos_offset
1.424 -
1.425 - # Write the document count.
1.426 -
1.427 - output.append(vint(count))
1.428 -
1.429 - # Actually write the data.
1.430 -
1.431 - self.write("".join(output))
1.432 -
1.433 - return offset
1.434 -
1.435 -class PositionIndexOpener(FileOpener):
1.436 -
1.437 - "Reading position index information from files."
1.438 -
1.439 - def read_term_positions(self, offset, doc_frequency):
1.440 -
1.441 - """
1.442 - Read all positions from 'offset', seeking to that position in the file
1.443 - before reading. The number of documents available for reading is limited
1.444 - to 'doc_frequency'.
1.445 - """
1.446 -
1.447 - # Duplicate the file handle.
1.448 -
1.449 - f = self.open("rb")
1.450 - return PositionIndexIterator(f, offset, doc_frequency)
1.451 -
1.452 -# Iterators for position-related files.
1.453 -
1.454 -class IteratorBase:
1.455 -
1.456 - def __init__(self, count):
1.457 - self.replenish(count)
1.458 -
1.459 - def replenish(self, count):
1.460 - self.count = count
1.461 - self.read_documents = 0
1.462 -
1.463 - def __len__(self):
1.464 - return self.count
1.465 -
1.466 - def sort(self):
1.467 - pass # Stored document positions are already sorted.
1.468 -
1.469 - def __iter__(self):
1.470 - return self
1.471 -
1.472 -class PositionIterator(FileReader, IteratorBase):
1.473 -
1.474 - "Iterating over document positions."
1.475 -
1.476 - def __init__(self, f, offset, count):
1.477 - FileReader.__init__(self, f)
1.478 - IteratorBase.__init__(self, count)
1.479 - self.seek(offset)
1.480 -
1.481 - def reset(self):
1.482 - self.last_docnum = 0
1.483 -
1.484 - def read_positions(self):
1.485 -
1.486 - "Read positions, returning a document number and a list of positions."
1.487 -
1.488 - # Read the document number delta and add it to the last number.
1.489 -
1.490 - self.last_docnum += self.read_number()
1.491 -
1.492 - # Read the number of positions.
1.493 -
1.494 - npositions = self.read_number()
1.495 -
1.496 - # Read the position deltas, adding each previous position to get the
1.497 - # appropriate collection of absolute positions.
1.498 -
1.499 - i = 0
1.500 - last = 0
1.501 - positions = []
1.502 -
1.503 - while i < npositions:
1.504 - last += self.read_number()
1.505 - positions.append(last)
1.506 - i += 1
1.507 -
1.508 - return self.last_docnum, positions
1.509 -
1.510 - def next(self):
1.511 -
1.512 - "Read positions for a single document."
1.513 -
1.514 - if self.read_documents < self.count:
1.515 - self.read_documents += 1
1.516 - return self.read_positions()
1.517 - else:
1.518 - raise StopIteration
1.519 -
1.520 -class PositionIndexIterator(FileReader, IteratorBase):
1.521 -
1.522 - "Iterating over document positions."
1.523 -
1.524 - def __init__(self, f, offset, count):
1.525 - FileReader.__init__(self, f)
1.526 - IteratorBase.__init__(self, count)
1.527 - self.seek(offset)
1.528 - self.section_count = 0
1.529 -
1.530 - def reset(self):
1.531 - self.last_docnum = 0
1.532 - self.last_pos_offset = 0
1.533 -
1.534 - def read_positions(self):
1.535 -
1.536 - """
1.537 - Read a document number, a position file offset for the position index
1.538 - file, and the number of documents in a section of that file.
1.539 - """
1.540 -
1.541 - # Read the document number delta.
1.542 -
1.543 - self.last_docnum += self.read_number()
1.544 -
1.545 - # Read the offset delta.
1.546 -
1.547 - self.last_pos_offset += self.read_number()
1.548 -
1.549 - # Read the document count.
1.550 -
1.551 - count = self.read_number()
1.552 -
1.553 - return self.last_docnum, self.last_pos_offset, count
1.554 -
1.555 - def next(self):
1.556 -
1.557 - "Read positions for a single document."
1.558 -
1.559 - self.read_documents += self.section_count
1.560 - if self.read_documents < self.count:
1.561 - docnum, pos_offset, self.section_count = t = self.read_positions()
1.562 - return t
1.563 - else:
1.564 - raise StopIteration
1.565 -
1.566 -class PositionDictionaryWriter:
1.567 -
1.568 - "Writing position dictionaries."
1.569 -
1.570 - def __init__(self, position_writer, position_index_writer, interval):
1.571 - self.position_writer = position_writer
1.572 - self.position_index_writer = position_index_writer
1.573 - self.interval = interval
1.574 -
1.575 - def write_term_positions(self, doc_positions):
1.576 -
1.577 - """
1.578 - Write all 'doc_positions' - a collection of tuples of the form (document
1.579 - number, position list) - to the file.
1.580 -
1.581 - Add some records to the index, making dictionary entries.
1.582 -
1.583 - Return a tuple containing the offset of the written data, the frequency
1.584 - (number of positions), and document frequency (number of documents) for
1.585 - the term involved.
1.586 - """
1.587 -
1.588 - # Reset the writers.
1.589 -
1.590 - self.position_writer.reset()
1.591 - self.position_index_writer.reset()
1.592 -
1.593 - index_offset = None
1.594 -
1.595 - # Write the positions.
1.596 -
1.597 - frequency = 0
1.598 - first_docnum = None
1.599 - first_offset = None
1.600 - count = 0
1.601 -
1.602 - doc_positions.sort()
1.603 -
1.604 - for docnum, positions in doc_positions:
1.605 - pos_offset = self.position_writer.write_positions(docnum, positions)
1.606 -
1.607 - # Retain the first record offset for a subsequent index entry.
1.608 -
1.609 - if first_offset is None:
1.610 - first_offset = pos_offset
1.611 - first_docnum = docnum
1.612 -
1.613 - frequency += len(positions)
1.614 - count += 1
1.615 -
1.616 - # Every {interval} entries, write an index entry.
1.617 -
1.618 - if count % self.interval == 0:
1.619 - io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)
1.620 -
1.621 - # Remember the first index entry offset.
1.622 -
1.623 - if index_offset is None:
1.624 - index_offset = io
1.625 -
1.626 - first_offset = None
1.627 - first_docnum = None
1.628 -
1.629 - # Reset the position writer so that position readers accessing
1.630 - # a section start with the correct document number.
1.631 -
1.632 - self.position_writer.reset()
1.633 -
1.634 - # Finish writing an index entry for the remaining documents.
1.635 -
1.636 - else:
1.637 - if first_offset is not None:
1.638 - io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)
1.639 -
1.640 - # Remember the first index entry offset.
1.641 -
1.642 - if index_offset is None:
1.643 - index_offset = io
1.644 -
1.645 - return index_offset, frequency, count
1.646 -
1.647 - def close(self):
1.648 - self.position_writer.close()
1.649 - self.position_index_writer.close()
1.650 -
1.651 -class PositionDictionaryReader:
1.652 -
1.653 - "Reading position dictionaries."
1.654 -
1.655 - def __init__(self, position_opener, position_index_opener):
1.656 - self.position_opener = position_opener
1.657 - self.position_index_opener = position_index_opener
1.658 -
1.659 - def read_term_positions(self, offset, doc_frequency):
1.660 -
1.661 - """
1.662 - Return an iterator for dictionary entries starting at 'offset' with the
1.663 - given 'doc_frequency'.
1.664 - """
1.665 -
1.666 - return PositionDictionaryIterator(self.position_opener,
1.667 - self.position_index_opener, offset, doc_frequency)
1.668 -
1.669 - def close(self):
1.670 - pass
1.671 -
1.672 -class PositionDictionaryIterator:
1.673 -
1.674 - "Iteration over position dictionary entries."
1.675 -
1.676 - def __init__(self, position_opener, position_index_opener, offset, doc_frequency):
1.677 - self.position_opener = position_opener
1.678 - self.doc_frequency = doc_frequency
1.679 - self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)
1.680 - self.iterator = None
1.681 -
1.682 - # Remember the last values.
1.683 -
1.684 - self.found_docnum, self.found_positions = None, None
1.685 -
1.686 - # Maintain state for the next index entry, if read.
1.687 -
1.688 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.689 -
1.690 - # Initialise the current index entry and current position file iterator.
1.691 -
1.692 - self._next_section()
1.693 - self._init_section()
1.694 -
1.695 - # Sequence methods.
1.696 -
1.697 - def __len__(self):
1.698 - return self.doc_frequency
1.699 -
1.700 - def sort(self):
1.701 - pass
1.702 -
1.703 - # Iterator methods.
1.704 -
1.705 - def __iter__(self):
1.706 - return self
1.707 -
1.708 - def next(self):
1.709 -
1.710 - """
1.711 - Attempt to get the next document record from the section in the
1.712 - positions file.
1.713 - """
1.714 -
1.715 - # Return any visited but unrequested record.
1.716 -
1.717 - if self.found_docnum is not None:
1.718 - t = self.found_docnum, self.found_positions
1.719 - self.found_docnum, self.found_positions = None, None
1.720 - return t
1.721 -
1.722 - # Or search for the next record.
1.723 -
1.724 - while 1:
1.725 -
1.726 - # Either return the next record.
1.727 -
1.728 - try:
1.729 - return self.iterator.next()
1.730 -
1.731 - # Or, where a section is finished, get the next section and try again.
1.732 -
1.733 - except StopIteration:
1.734 -
1.735 - # Where a section follows, update the index iterator, but keep
1.736 - # reading using the same file iterator (since the data should
1.737 - # just follow on from the last section).
1.738 -
1.739 - self._next_section()
1.740 - self.iterator.replenish(self.section_count)
1.741 -
1.742 - # Reset the state of the iterator to make sure that document
1.743 - # numbers are correct.
1.744 -
1.745 - self.iterator.reset()
1.746 -
1.747 - def from_document(self, docnum):
1.748 -
1.749 - """
1.750 - Attempt to navigate to a positions entry for the given 'docnum',
1.751 - returning the positions for 'docnum', or None otherwise.
1.752 - """
1.753 -
1.754 - # Return any unrequested document positions.
1.755 -
1.756 - if docnum == self.found_docnum:
1.757 - return self.found_positions
1.758 -
1.759 - # Read ahead in the index until the next entry refers to a document
1.760 - # later than the desired document.
1.761 -
1.762 - try:
1.763 - if self.next_docnum is None:
1.764 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.765 -
1.766 - # Read until the next entry is after the desired document number,
1.767 - # or until the end of the results.
1.768 -
1.769 - while self.next_docnum <= docnum:
1.770 - self._next_read_section()
1.771 - if self.docnum < docnum:
1.772 - self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()
1.773 - else:
1.774 - break
1.775 -
1.776 - except StopIteration:
1.777 - pass
1.778 -
1.779 - # Navigate in the position file to the document.
1.780 -
1.781 - self._init_section()
1.782 -
1.783 - try:
1.784 - while 1:
1.785 - found_docnum, found_positions = self.iterator.next()
1.786 -
1.787 - # Return the desired document positions or None (retaining the
1.788 - # positions for the document immediately after).
1.789 -
1.790 - if docnum == found_docnum:
1.791 - return found_positions
1.792 - elif docnum < found_docnum:
1.793 - self.found_docnum, self.found_positions = found_docnum, found_positions
1.794 - return None
1.795 -
1.796 - except StopIteration:
1.797 - return None
1.798 -
1.799 - # Internal methods.
1.800 -
1.801 - def _next_section(self):
1.802 -
1.803 - "Attempt to get the next section in the index."
1.804 -
1.805 - if self.next_docnum is None:
1.806 - self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()
1.807 - else:
1.808 - self._next_read_section()
1.809 -
1.810 - def _next_read_section(self):
1.811 -
1.812 - """
1.813 - Make the next index entry the current one without reading from the
1.814 - index.
1.815 - """
1.816 -
1.817 - self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count
1.818 - self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None
1.819 -
1.820 - def _init_section(self):
1.821 -
1.822 - "Initialise the iterator for the section in the position file."
1.823 -
1.824 - if self.iterator is not None:
1.825 - self.iterator.close()
1.826 - self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)
1.827 -
1.828 - def close(self):
1.829 - if self.iterator is not None:
1.830 - self.iterator.close()
1.831 - self.iterator = None
1.832 - if self.index_iterator is not None:
1.833 - self.index_iterator.close()
1.834 - self.index_iterator = None
1.835 -
1.836 -class TermWriter(FileWriter):
1.837 -
1.838 - "Writing term information to files."
1.839 -
1.840 - def reset(self):
1.841 - self.last_term = ""
1.842 - self.last_offset = 0
1.843 -
1.844 - def write_term(self, term, offset, frequency, doc_frequency):
1.845 -
1.846 - """
1.847 - Write the given 'term', its position file 'offset', its 'frequency' and
1.848 - its 'doc_frequency' (number of documents in which it appears) to the
1.849 - term information file. Return the offset after the term information was
1.850 - written to the file.
1.851 - """
1.852 -
1.853 - # Write the prefix length and term suffix.
1.854 -
1.855 - common = len(commonprefix([self.last_term, term]))
1.856 - suffix = term[common:]
1.857 -
1.858 - self.write_number(common)
1.859 - self.write_string(suffix)
1.860 -
1.861 - # Write the offset delta.
1.862 -
1.863 - self.write_number(offset - self.last_offset)
1.864 -
1.865 - # Write the frequency.
1.866 -
1.867 - self.write_number(frequency)
1.868 -
1.869 - # Write the document frequency.
1.870 -
1.871 - self.write_number(doc_frequency)
1.872 -
1.873 - self.last_term = term
1.874 - self.last_offset = offset
1.875 -
1.876 - return self.tell()
1.877 -
1.878 -class TermReader(FileReader):
1.879 -
1.880 - "Reading term information from files."
1.881 -
1.882 - def reset(self):
1.883 - self.last_term = ""
1.884 - self.last_offset = 0
1.885 -
1.886 - def read_term(self):
1.887 -
1.888 - """
1.889 - Read a term, its position file offset, its frequency and its document
1.890 - frequency from the term information file.
1.891 - """
1.892 -
1.893 - # Read the prefix length and term suffix.
1.894 -
1.895 - common = self.read_number()
1.896 - suffix = self.read_string()
1.897 -
1.898 - self.last_term = self.last_term[:common] + suffix
1.899 -
1.900 - # Read the offset delta.
1.901 -
1.902 - self.last_offset += self.read_number()
1.903 -
1.904 - # Read the frequency.
1.905 -
1.906 - frequency = self.read_number()
1.907 -
1.908 - # Read the document frequency.
1.909 -
1.910 - doc_frequency = self.read_number()
1.911 -
1.912 - return self.last_term, self.last_offset, frequency, doc_frequency
1.913 -
1.914 - def go_to_term(self, term, offset, info_offset):
1.915 -
1.916 - """
1.917 - Seek past the entry for 'term' having 'offset' to 'info_offset'. This
1.918 - permits the scanning for later terms from the specified term.
1.919 - """
1.920 -
1.921 - self.seek(info_offset)
1.922 - self.last_term = term
1.923 - self.last_offset = offset
1.924 -
1.925 -class TermIndexWriter(TermWriter):
1.926 -
1.927 - "Writing term dictionary index details to files."
1.928 -
1.929 - def reset(self):
1.930 - TermWriter.reset(self)
1.931 - self.last_info_offset = 0
1.932 -
1.933 - def write_term(self, term, offset, frequency, doc_frequency, info_offset):
1.934 -
1.935 - """
1.936 - Write the given 'term', its position file 'offset', its 'frequency' and
1.937 - its 'doc_frequency' to the term dictionary index file, along with the
1.938 - 'info_offset' in the term information file.
1.939 - """
1.940 -
1.941 - TermWriter.write_term(self, term, offset, frequency, doc_frequency)
1.942 -
1.943 - # Write the information file offset delta.
1.944 -
1.945 - self.write_number(info_offset - self.last_info_offset)
1.946 - self.last_info_offset = info_offset
1.947 -
1.948 -class TermIndexReader(TermReader):
1.949 -
1.950 - "Reading term dictionary index details from files."
1.951 -
1.952 - def reset(self):
1.953 - TermReader.reset(self)
1.954 - self.last_info_offset = 0
1.955 -
1.956 - def read_term(self):
1.957 -
1.958 - """
1.959 - Read a term, its position file offset, its frequency, its document
1.960 - frequency and a term information file offset from the term dictionary
1.961 - index file.
1.962 - """
1.963 -
1.964 - term, offset, frequency, doc_frequency = TermReader.read_term(self)
1.965 -
1.966 - # Read the offset delta.
1.967 -
1.968 - self.last_info_offset += self.read_number()
1.969 -
1.970 - return term, offset, frequency, doc_frequency, self.last_info_offset
1.971 -
1.972 -class TermDictionaryWriter:
1.973 -
1.974 - "Writing term dictionaries."
1.975 -
1.976 - def __init__(self, info_writer, index_writer, position_dict_writer, interval):
1.977 - self.info_writer = info_writer
1.978 - self.index_writer = index_writer
1.979 - self.position_dict_writer = position_dict_writer
1.980 - self.interval = interval
1.981 - self.entry = 0
1.982 -
1.983 - def _write_term(self, term, offset, frequency, doc_frequency):
1.984 -
1.985 - """
1.986 - Write the given 'term', its position file 'offset', its 'frequency' and
1.987 - its 'doc_frequency' (number of documents in which it appears) to the
1.988 - term information file. Return the offset after the term information was
1.989 - written to the file.
1.990 - """
1.991 -
1.992 - info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
1.993 -
1.994 - if self.entry % self.interval == 0:
1.995 - self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
1.996 -
1.997 - self.entry += 1
1.998 -
1.999 - def write_term_positions(self, term, doc_positions):
1.1000 -
1.1001 - """
1.1002 - Write the given 'term' and the 'doc_positions' recording the documents
1.1003 - and positions at which the term is found.
1.1004 - """
1.1005 -
1.1006 - offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
1.1007 - self._write_term(term, offset, frequency, doc_frequency)
1.1008 -
1.1009 - def close(self):
1.1010 - self.info_writer.close()
1.1011 - self.index_writer.close()
1.1012 - self.position_dict_writer.close()
1.1013 -
1.1014 -class TermDictionaryReader:
1.1015 -
1.1016 - "Reading term dictionaries."
1.1017 -
1.1018 - def __init__(self, info_reader, index_reader, position_dict_reader):
1.1019 - self.info_reader = info_reader
1.1020 - self.index_reader = index_reader
1.1021 - self.position_dict_reader = position_dict_reader
1.1022 -
1.1023 - self.terms = []
1.1024 - try:
1.1025 - while 1:
1.1026 - self.terms.append(self.index_reader.read_term())
1.1027 - except EOFError:
1.1028 - pass
1.1029 -
1.1030 - # Large numbers for ordering purposes.
1.1031 -
1.1032 - if self.terms:
1.1033 - self.max_offset = self.terms[-1][1] + 1
1.1034 - else:
1.1035 - self.max_offset = None
1.1036 -
1.1037 - def _find_closest_entry(self, term):
1.1038 -
1.1039 - """
1.1040 - Find the offsets and frequencies of 'term' from the term dictionary or
1.1041 - the closest term starting with the value of 'term'.
1.1042 -
1.1043 - Return the closest index entry consisting of a term, the position file
1.1044 - offset, the term frequency, the document frequency, and the term details
1.1045 - file offset.
1.1046 - """
1.1047 -
1.1048 - i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1
1.1049 -
1.1050 - # Get the entry position providing the term or one preceding it.
1.1051 - # If no entry precedes the requested term, return the very first entry
1.1052 - # as the closest.
1.1053 -
1.1054 - if i == -1:
1.1055 - return self.terms[0]
1.1056 - else:
1.1057 - return self.terms[i]
1.1058 -
1.1059 - def _find_closest_term(self, term):
1.1060 -
1.1061 - """
1.1062 - Find the offsets and frequencies of 'term' from the term dictionary or
1.1063 - the closest term starting with the value of 'term'.
1.1064 -
1.1065 - Return the closest term (or the term itself), the position file offset,
1.1066 - the term frequency, the document frequency, and the term details file
1.1067 - offset (or None if the reader is already positioned).
1.1068 - """
1.1069 -
1.1070 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)
1.1071 -
1.1072 - # Where the term is found immediately, return the offset and
1.1073 - # frequencies. If the term does not appear, return the details of the
1.1074 - # closest entry.
1.1075 -
1.1076 - if term <= found_term:
1.1077 - return found_term, offset, frequency, doc_frequency, info_offset
1.1078 -
1.1079 - # Otherwise, seek past the index term's entry in the information file
1.1080 - # and scan for the desired term.
1.1081 -
1.1082 - else:
1.1083 - self.info_reader.go_to_term(found_term, offset, info_offset)
1.1084 - try:
1.1085 - while term > found_term:
1.1086 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.1087 - except EOFError:
1.1088 - pass
1.1089 -
1.1090 - return found_term, offset, frequency, doc_frequency, None
1.1091 -
1.1092 - def _find_term(self, term):
1.1093 -
1.1094 - """
1.1095 - Find the position file offset and frequency of 'term' from the term
1.1096 - dictionary.
1.1097 - """
1.1098 -
1.1099 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.1100 -
1.1101 - # If the term is found, return the offset and frequencies.
1.1102 -
1.1103 - if term == found_term:
1.1104 - return offset, frequency, doc_frequency
1.1105 - else:
1.1106 - return None
1.1107 -
1.1108 - def _get_positions(self, offset, doc_frequency):
1.1109 - return self.position_dict_reader.read_term_positions(offset, doc_frequency)
1.1110 -
1.1111 - # Iterator convenience methods.
1.1112 -
1.1113 - def __iter__(self):
1.1114 - self.rewind()
1.1115 - return self
1.1116 -
1.1117 - def next(self):
1.1118 - try:
1.1119 - return self.read_term()
1.1120 - except EOFError:
1.1121 - raise StopIteration
1.1122 -
1.1123 - # Sequential access methods.
1.1124 -
1.1125 - def rewind(self):
1.1126 - self.info_reader.rewind()
1.1127 -
1.1128 - def read_term(self):
1.1129 -
1.1130 - """
1.1131 - Return the next term, its frequency, its document frequency, and the
1.1132 - documents and positions at which the term is found.
1.1133 - """
1.1134 -
1.1135 - term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.1136 - positions = self._get_positions(offset, doc_frequency)
1.1137 - return term, frequency, doc_frequency, positions
1.1138 -
1.1139 - # Query methods.
1.1140 -
1.1141 - def find_terms(self, term):
1.1142 -
1.1143 - "Return all terms whose values start with the value of 'term'."
1.1144 -
1.1145 - terms = []
1.1146 -
1.1147 - found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)
1.1148 -
1.1149 - # Position the reader, if necessary.
1.1150 -
1.1151 - if info_offset is not None:
1.1152 - self.info_reader.go_to_term(found_term, offset, info_offset)
1.1153 -
1.1154 - # Read and record terms.
1.1155 -
1.1156 - try:
1.1157 - # Add the found term if it starts with the specified term.
1.1158 -
1.1159 - while found_term.startswith(term):
1.1160 - terms.append(found_term)
1.1161 - found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.1162 -
1.1163 - except EOFError:
1.1164 - pass
1.1165 -
1.1166 - return terms
1.1167 -
1.1168 - def find_positions(self, term):
1.1169 -
1.1170 - "Return the documents and positions at which the given 'term' is found."
1.1171 -
1.1172 - t = self._find_term(term)
1.1173 - if t is None:
1.1174 - return None
1.1175 - else:
1.1176 - offset, frequency, doc_frequency = t
1.1177 - return self._get_positions(offset, doc_frequency)
1.1178 -
1.1179 - def get_frequency(self, term):
1.1180 -
1.1181 - "Return the frequency of the given 'term'."
1.1182 -
1.1183 - t = self._find_term(term)
1.1184 - if t is None:
1.1185 - return None
1.1186 - else:
1.1187 - offset, frequency, doc_frequency = t
1.1188 - return frequency
1.1189 -
1.1190 - def get_document_frequency(self, term):
1.1191 -
1.1192 - "Return the document frequency of the given 'term'."
1.1193 -
1.1194 - t = self._find_term(term)
1.1195 - if t is None:
1.1196 - return None
1.1197 - else:
1.1198 - offset, frequency, doc_frequency = t
1.1199 - return doc_frequency
1.1200 -
1.1201 - def close(self):
1.1202 - self.info_reader.close()
1.1203 - self.index_reader.close()
1.1204 - self.position_dict_reader.close()
1.1205 -
1.1206 -# Specific classes for storing document information.
1.1207 -
class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):
        # Document numbers are stored as deltas from the previous record.
        self.last_docnum = 0

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields': (integer, string)
        pairs representing field identifiers and values respectively.
        Return the offset at which the fields are stored.
        """

        offset = self.tell()

        # Record the document number as a delta, followed by the field count.

        self.write_number(docnum - self.last_docnum)
        self.write_number(len(fields))

        # Record each identifier along with its compressed value.

        for identifier, value in fields:
            self.write_number(identifier)
            self.write_string(value, 1) # compress

        self.last_docnum = docnum
        return offset
1.1241 -
class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):
        # Document numbers are stored as deltas from the previous record.
        self.last_docnum = 0

    def read_fields(self):

        """
        Read the next field record, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Accumulate the document number delta, then obtain the field count.

        self.last_docnum += self.read_number()
        nfields = self.read_number()

        # Gather the identifier and decompressed value of each field.

        fields = []
        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning
        for later documents.
        """

        self.seek(offset)

        # The delta decoded after a seek is meaningless, so the decoded
        # document number is discarded in favour of the one supplied.

        _ignored, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields
1.1289 -
class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):
        # Both document numbers and offsets are delta-encoded.
        self.last_docnum = 0
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for
        the document are stored in the fields file.
        """

        # Delta-encode both values against the previous entry.

        self.write_number(docnum - self.last_docnum)
        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset
1.1312 -
class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):
        # Both document numbers and offsets are delta-encoded.
        self.last_docnum = 0
        self.last_offset = 0

    def read_document(self):

        "Read and return a (document number, field file offset) pair."

        # Accumulate the stored deltas onto the running values.

        self.last_docnum += self.read_number()
        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset
1.1331 -
class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the dictionary writer with a 'field_writer', a
        'field_index_writer' and an indexing 'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval

        # Counter used to select every 'interval'-th entry for the index.

        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        offset = self.field_writer.write_fields(docnum, fields)

        # Index only every 'interval'-th document, starting with the first.

        if not self.entry % self.interval:
            self.field_index_writer.write_document(docnum, offset)

        self.entry += 1

    def close(self):

        "Close the underlying writers."

        self.field_writer.close()
        self.field_index_writer.close()
1.1356 -
class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the dictionary reader with a 'field_reader' and a
        'field_index_reader', loading the entire field index into memory.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        self.docs = []
        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # The largest known offset: used in bisection probes so that an exact
        # document number match always sorts before the probe.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):

        "Reset the reader and return it as an iterator over field records."

        self.rewind()
        return self

    def next(self):

        "Return the next field record, raising StopIteration at the end."

        try:
            record = self.read_fields()
        except EOFError:
            raise StopIteration
        return record

    # Sequential access methods.

    def rewind(self):

        "Reposition the reader at the first document."

        self.field_reader.rewind()

    def read_fields(self):

        "Return the next document number and fields."

        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read and return the fields of the document with the given 'docnum',
        or None if the document is not present.
        """

        position = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        # With no index entry at or before the document, it cannot be stored.

        if position == -1:
            return None

        found_docnum, offset = self.docs[position]

        # Start reading at the indexed document, then scan forward until the
        # requested document (or the end of the data) is reached.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # Only an exact match yields fields.

        if docnum == found_docnum:
            return fields
        return None

    def close(self):

        "Close the underlying readers."

        self.field_reader.close()
        self.field_index_reader.close()
1.1439 -
1.1440 -# Dictionary merging classes.
1.1441 -
class Merger:

    "Merge files."

    def __init__(self, writer, readers):

        "Initialise the merger with a 'writer' and a list of 'readers'."

        self.writer = writer
        self.readers = readers

    def close(self):

        "Close all readers, then the writer."

        for reader in self.readers:
            reader.close()
        self.writer.close()
1.1454 -
class TermDictionaryMerger(Merger):

    "Merge term and position files."

    def merge(self):

        """
        Merge terms and positions from the readers, sending them to the
        writer.
        """

        last_term = None
        pending = []

        # The readers yield term records in order, so identical terms from
        # different partitions are adjacent and can be combined into a single
        # output entry.

        for term, frequency, doc_frequency, positions in itermerge(self.readers):
            if term == last_term:
                pending.append(positions)
            else:
                self._flush(last_term, pending)
                last_term = term
                pending = [positions]

        # Emit the final term, if any records were seen at all.

        self._flush(last_term, pending)

    def _flush(self, term, pending):

        "Write the merged positions of 'term' if any are 'pending'."

        if pending:
            self.writer.write_term_positions(term, itermerge(pending))
1.1479 -
class FieldDictionaryMerger(Merger):

    "Merge field files."

    def merge(self):

        "Merge fields from the readers, sending them to the writer."

        # Records arrive ordered by document number and are passed through
        # unchanged.

        for docnum, fields in itermerge(self.readers):
            self.writer.write_fields(docnum, fields)
1.1492 -
1.1493 -# Utility functions.
1.1494 -
def get_term_writer(pathname, partition, interval, doc_interval):

    """
    Return a term dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval' for terms and 'doc_interval' for document position records.
    """

    info_writer = TermWriter(open(join(pathname, "terms-%s" % partition), "wb"))
    index_writer = TermIndexWriter(open(join(pathname, "terms_index-%s" % partition), "wb"))

    # Position records go through a dictionary writer combining the positions
    # file with its own index.

    positions_writer = PositionWriter(open(join(pathname, "positions-%s" % partition), "wb"))
    positions_index_writer = PositionIndexWriter(open(join(pathname, "positions_index-%s" % partition), "wb"))
    positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)

    return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
1.1518 -
def get_field_writer(pathname, partition, interval):

    """
    Return a field dictionary writer using files under the given 'pathname'
    labelled according to the given 'partition', using the given indexing
    'interval'.
    """

    field_writer = FieldWriter(open(join(pathname, "fields-%s" % partition), "wb"))
    field_index_writer = FieldIndexWriter(open(join(pathname, "fields_index-%s" % partition), "wb"))

    return FieldDictionaryWriter(field_writer, field_index_writer, interval)
1.1534 -
def get_term_reader(pathname, partition):

    """
    Return a term dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    info_reader = TermReader(open(join(pathname, "terms-%s" % partition), "rb"))
    index_reader = TermIndexReader(open(join(pathname, "terms_index-%s" % partition), "rb"))

    # Position data is accessed through openers constructed from pathnames
    # rather than already-open files.

    positions_opener = PositionOpener(join(pathname, "positions-%s" % partition))
    positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition))
    positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)

    return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
1.1554 -
def get_field_reader(pathname, partition):

    """
    Return a field dictionary reader using files under the given 'pathname'
    labelled according to the given 'partition'.
    """

    field_reader = FieldReader(open(join(pathname, "fields-%s" % partition), "rb"))
    field_index_reader = FieldIndexReader(open(join(pathname, "fields_index-%s" % partition), "rb"))

    return FieldDictionaryReader(field_reader, field_index_reader)
1.1569 -
def rename_files(pathname, names, from_partition, to_partition):

    """
    Rename the files under 'pathname' for each name in 'names', relabelling
    them from 'from_partition' to 'to_partition'.
    """

    for name in names:
        old_name = join(pathname, "%s-%s" % (name, from_partition))
        new_name = join(pathname, "%s-%s" % (name, to_partition))
        rename(old_name, new_name)
1.1573 -
def rename_term_files(pathname, from_partition, to_partition):

    "Relabel the term dictionary files from 'from_partition' to 'to_partition'."

    rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)
1.1576 -
def rename_field_files(pathname, from_partition, to_partition):

    "Relabel the field dictionary files from 'from_partition' to 'to_partition'."

    rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)
1.1579 -
def remove_files(pathname, names, partition):

    """
    Remove the files under 'pathname' labelled with the given 'partition' for
    each name in 'names'.
    """

    for name in names:
        remove(join(pathname, "%s-%s" % (name, partition)))
1.1583 -
def remove_term_files(pathname, partition):

    "Remove the term dictionary files labelled with the given 'partition'."

    remove_files(pathname, TERM_FILENAMES, partition)
1.1586 -
def remove_field_files(pathname, partition):

    "Remove the field dictionary files labelled with the given 'partition'."

    remove_files(pathname, FIELD_FILENAMES, partition)
1.1589 -
1.1590 -# High-level classes.
1.1591 -
class Document:

    "A container of document information."

    def __init__(self, docnum):

        "Initialise the document with the given 'docnum'."

        self.docnum = docnum
        self.fields = []    # list of (identifier, value) pairs
        self.terms = {}     # mapping from term to a list of positions

    def add_position(self, term, position):

        """
        Add a position entry for the given 'term', indicating the given
        'position'.
        """

        self.terms.setdefault(term, []).append(position)

    def add_field(self, identifier, value):

        "Add a field having the given 'identifier' and 'value'."

        # Convert the value to a text string. NOTE: unicode does not exist
        # under Python 3; fall back to str there, mirroring the set fallback
        # at the top of this module.

        try:
            text = unicode(value)
        except NameError:
            text = str(value)

        self.fields.append((identifier, text))

    def set_fields(self, fields):

        """
        Set the document's 'fields': a list of tuples each containing an
        integer identifier and a string value.
        """

        self.fields = fields
1.1624 -
class IndexWriter:

    """
    Building term information and writing it to the term and field
    dictionaries.
    """

    def __init__(self, pathname, interval, doc_interval, flush_interval):

        """
        Initialise the writer with an index 'pathname', an indexing 'interval'
        for terms, a 'doc_interval' for document position records, and a
        'flush_interval' giving the number of documents to accumulate before
        flushing (0 or None disables automatic flushing).
        """

        self.pathname = pathname
        self.interval = interval
        self.doc_interval = doc_interval
        self.flush_interval = flush_interval

        # Next partition number for each dictionary type.

        self.dict_partition = 0
        self.field_dict_partition = 0

        # In-memory accumulators: term -> docnum -> positions, and
        # docnum -> fields.

        self.terms = {}
        self.docs = {}

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and
        flushing terms and fields if appropriate.
        """

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[doc.docnum] = positions

        self.docs[doc.docnum] = doc.fields

        self.doc_counter += 1
        if self.flush_interval and self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.flush_fields()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term dictionary writer for the current partition."

        return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)

    def get_field_writer(self):

        "Return a field dictionary writer for the current partition."

        return get_field_writer(self.pathname, self.field_dict_partition, self.interval)

    def flush_terms(self):

        "Flush terms into the current term dictionary partition."

        # Write the terms in sorted order. NOTE: sorted() is used instead of
        # sorting a keys() list in place so that this also works under
        # Python 3, where keys() is a view without a sort method.

        dict_writer = self.get_term_writer()

        for term in sorted(self.terms):
            dict_writer.write_term_positions(term, self.terms[term].items())

        dict_writer.close()

        self.terms = {}
        self.dict_partition += 1

    def flush_fields(self):

        "Flush fields into the current field dictionary partition."

        # Write the documents in sorted order (see flush_terms regarding the
        # use of sorted()).

        field_dict_writer = self.get_field_writer()

        for docnum, fields in sorted(self.docs.items()):
            field_dict_writer.write_fields(docnum, fields)

        field_dict_writer.close()

        self.docs = {}
        self.field_dict_partition += 1

    def close(self):

        "Flush any remaining terms and fields."

        if self.terms:
            self.flush_terms()
        if self.docs:
            self.flush_fields()
1.1720 -
class IndexReader:

    "Accessing the term and field dictionaries."

    def __init__(self, pathname):

        "Open the merged dictionaries found under 'pathname'."

        self.dict_reader = get_term_reader(pathname, "merged")
        self.field_dict_reader = get_field_reader(pathname, "merged")

    def find_terms(self, term):

        "Return all terms whose values start with the value of 'term'."

        return self.dict_reader.find_terms(term)

    def find_positions(self, term):

        "Return the documents and positions at which 'term' is found."

        return self.dict_reader.find_positions(term)

    def get_frequency(self, term):

        "Return the frequency of the given 'term'."

        return self.dict_reader.get_frequency(term)

    def get_document_frequency(self, term):

        "Return the document frequency of the given 'term'."

        return self.dict_reader.get_document_frequency(term)

    def get_fields(self, docnum):

        "Return the fields of the document with the given 'docnum'."

        return self.field_dict_reader.get_fields(docnum)

    def close(self):

        "Close both dictionary readers."

        self.dict_reader.close()
        self.field_dict_reader.close()
1.1747 -
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname):

        "Initialise the index with the given 'pathname'."

        self.pathname = pathname
        self.reader = None
        self.writer = None

    def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):

        """
        Return a writer, optionally using the given indexing 'interval',
        'doc_interval' and 'flush_interval'.
        """

        if not exists(self.pathname):
            mkdir(self.pathname)

        self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)
        return self.writer

    def get_reader(self, partition=0):

        "Return a reader for the index."

        # Ensure that only one partition exists.

        self.merge()
        return self._get_reader(partition)

    def _get_reader(self, partition):

        "Return a reader for the index, raising OSError if the path is absent."

        if not exists(self.pathname):
            # NOTE: call syntax is used (not "raise OSError, ...") so that
            # this module stays syntactically valid under Python 3 too.
            raise OSError("Index path %r does not exist." % self.pathname)

        self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        "Merge/optimise index partitions."

        self.merge_terms()
        self.merge_fields()

    def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):

        """
        Merge term dictionaries using the given indexing 'interval' and
        'doc_interval'.
        """

        readers = []
        partitions = set()

        # Discover term dictionary partitions from their filenames.

        for filename in listdir(self.pathname):
            if filename.startswith("terms-"): # 6 character prefix
                partition = filename[6:]
                readers.append(get_term_reader(self.pathname, partition))
                partitions.add(partition)

        # With multiple partitions, merge them all into a new "merged"
        # partition, relabelling any previous merged partition so that it can
        # also participate in the merge.

        if len(readers) > 1:
            if "merged" in partitions:
                # NOTE(review): readers for "merged" were opened before this
                # rename; position files appear to be opened by pathname (see
                # get_term_reader), so confirm no late opens occur after the
                # rename.
                rename_term_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_term_writer(self.pathname, "merged", interval, doc_interval)
            merger = TermDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove the obsolete partition files.

            for partition in partitions:
                remove_term_files(self.pathname, partition)

        # A single partition is merely relabelled.

        elif len(readers) == 1:
            partition = list(partitions)[0]
            if partition != "merged":
                rename_term_files(self.pathname, partition, "merged")

    def merge_fields(self, interval=FIELD_INTERVAL):

        "Merge field dictionaries using the given indexing 'interval'."

        readers = []
        partitions = set()

        # Discover field dictionary partitions from their filenames.

        for filename in listdir(self.pathname):
            if filename.startswith("fields-"): # 7 character prefix
                partition = filename[7:]
                readers.append(get_field_reader(self.pathname, partition))
                partitions.add(partition)

        # With multiple partitions, merge them all into a new "merged"
        # partition, relabelling any previous merged partition so that it can
        # also participate in the merge.

        if len(readers) > 1:
            if "merged" in partitions:
                rename_field_files(self.pathname, "merged", "old-merged")
                partitions.remove("merged")
                partitions.add("old-merged")

            writer = get_field_writer(self.pathname, "merged", interval)
            merger = FieldDictionaryMerger(writer, readers)
            merger.merge()
            merger.close()

            # Remove the obsolete partition files.

            for partition in partitions:
                remove_field_files(self.pathname, partition)

        # A single partition is merely relabelled.

        elif len(readers) == 1:
            partition = list(partitions)[0]
            if partition != "merged":
                rename_field_files(self.pathname, partition, "merged")

    def close(self):

        "Close any open reader and writer."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
1.1878 -
1.1879 -# vim: tabstop=4 expandtab shiftwidth=4