1.1 --- a/iixr.py Mon Aug 31 21:02:30 2009 +0200
1.2 +++ b/iixr.py Wed Sep 02 01:30:42 2009 +0200
1.3 @@ -18,6 +18,7 @@
1.4 with this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from os import dup, fdopen # independent iterator access to files
1.8 from os import listdir, mkdir # index and partition discovery
1.9 from os import remove, rename # partition manipulation
1.10 from os.path import exists, join
1.11 @@ -194,11 +195,18 @@
1.12
1.13 def write_positions(self, docnum, positions):
1.14
1.15 - "Write for the document 'docnum' the given 'positions'."
1.16 + """
1.17 + Write for the document 'docnum' the given 'positions'.
1.18 + Return the offset of the written record.
1.19 + """
1.20
1.21 if docnum < self.last_docnum:
1.22 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)
1.23
1.24 + # Record the offset of this record.
1.25 +
1.26 + offset = self.f.tell()
1.27 +
1.28 # Write the document number delta.
1.29
1.30 self.write_number(docnum - self.last_docnum)
1.31 @@ -221,34 +229,7 @@
1.32
1.33 self.last_docnum = docnum
1.34
1.35 - def write_term_positions(self, doc_positions):
1.36 -
1.37 - """
1.38 - Write all 'doc_positions' - a collection of tuples of the form (document
1.39 - number, position list) - to the file, returning a tuple containing the
1.40 - offset at which they were stored together with the frequency (number of
1.41 - positions) for the term involved.
1.42 - """
1.43 -
1.44 - # Reset the writer and record the current file offset.
1.45 -
1.46 - self.reset()
1.47 - offset = self.f.tell()
1.48 -
1.49 - # Write the number of documents.
1.50 -
1.51 - self.write_number(len(doc_positions))
1.52 - doc_positions.sort()
1.53 -
1.54 - # Write the positions.
1.55 -
1.56 - frequency = 0
1.57 -
1.58 - for docnum, positions in doc_positions:
1.59 - self.write_positions(docnum, positions)
1.60 - frequency += len(positions)
1.61 -
1.62 - return offset, frequency
1.63 + return offset
1.64
1.65 class PositionReader(FileReader):
1.66
1.67 @@ -283,54 +264,295 @@
1.68
1.69 return self.last_docnum, positions
1.70
1.71 - def read_term_positions(self, offset):
1.72 + def read_term_positions(self, offset, count):
1.73
1.74 """
1.75 Read all positions from 'offset', seeking to that position in the file
1.76 - before reading.
1.77 + before reading. The number of documents available for reading is limited
1.78 + to 'count'.
1.79 """
1.80
1.81 self.reset()
1.82 - self.f.seek(offset)
1.83 +
1.84 + # Duplicate the file handle.
1.85 +
1.86 + f = fdopen(dup(self.f.fileno()), "rb")
1.87 + f.seek(offset)
1.88 + return PositionIterator(f, count)
1.89 +
1.90 +class IteratorBase:
1.91 +
1.92 + def __init__(self, count):
1.93 + self.replenish(count)
1.94
1.95 - # Could duplicate the file handle using...
1.96 - # fdopen(dup(self.f.fileno()), "rb")
1.97 + def replenish(self, count):
1.98 + self.count = count
1.99 + self.read_documents = 0
1.100 +
1.101 + def __len__(self):
1.102 + return self.count
1.103
1.104 - return PositionIterator(self.f)
1.105 + def sort(self):
1.106 + pass # Stored document positions are already sorted.
1.107
1.108 -class PositionIterator(PositionReader):
1.109 + def __iter__(self):
1.110 + return self
1.111 +
1.112 +class PositionIterator(PositionReader, IteratorBase):
1.113
1.114 "Iterating over document positions."
1.115
1.116 - def __init__(self, f):
1.117 + def __init__(self, f, count):
1.118 PositionReader.__init__(self, f)
1.119 + IteratorBase.__init__(self, count)
1.120 +
1.121 + def next(self):
1.122 +
1.123 + "Read positions for a single document."
1.124 +
1.125 + if self.read_documents < self.count:
1.126 + self.read_documents += 1
1.127 + return self.read_positions()
1.128 + else:
1.129 + raise StopIteration
1.130 +
1.131 +class PositionIndexWriter(FileWriter):
1.132 +
1.133 + "Writing position index information to files."
1.134 +
1.135 + def reset(self):
1.136 + self.last_docnum = 0
1.137 + self.last_pos_offset = 0
1.138 +
1.139 + def write_positions(self, docnum, pos_offset, count):
1.140 +
1.141 + """
1.142 + Write the given 'docnum', 'pos_offset' and document 'count' to the
1.143 + position index file.
1.144 + """
1.145 +
1.146 + # Record the offset of this record.
1.147 +
1.148 + offset = self.f.tell()
1.149 +
1.150 + # Write the document number delta.
1.151 +
1.152 + self.write_number(docnum - self.last_docnum)
1.153 + self.last_docnum = docnum
1.154 +
1.155 + # Write the position file offset delta.
1.156 +
1.157 + self.write_number(pos_offset - self.last_pos_offset)
1.158 + self.last_pos_offset = pos_offset
1.159 +
1.160 + # Write the document count.
1.161 +
1.162 + self.write_number(count)
1.163 +
1.164 + return offset
1.165 +
1.166 +class PositionIndexReader(FileReader):
1.167 +
1.168 + "Reading position index information from files."
1.169
1.170 - # Read the number of documents.
1.171 + def reset(self):
1.172 + self.last_docnum = 0
1.173 + self.last_pos_offset = 0
1.174 +
1.175 + def read_positions(self):
1.176 +
1.177 + """
1.178 + Read a document number, a position file offset for the position index
1.179 + file, and the number of documents in a section of that file.
1.180 + """
1.181 +
1.182 + # Read the document number delta.
1.183 +
1.184 + self.last_docnum += self.read_number()
1.185 +
1.186 + # Read the offset delta.
1.187 +
1.188 + self.last_pos_offset += self.read_number()
1.189 +
1.190 + # Read the document count.
1.191 +
1.192 + count = self.read_number()
1.193 +
1.194 + return self.last_docnum, self.last_pos_offset, count
1.195 +
1.196 + def read_term_positions(self, offset, doc_frequency):
1.197
1.198 - self.ndocuments = self.read_number()
1.199 - self.read_documents = 0
1.200 + """
1.201 + Read all positions from 'offset', seeking to that position in the file
1.202 + before reading. The number of documents available for reading is limited
1.203 + to 'doc_frequency'.
1.204 + """
1.205 +
1.206 + # NOTE: This is almost a duplication of PositionReader.read_term_positions.
1.207 +
1.208 + self.reset()
1.209 +
1.210 + # Duplicate the file handle.
1.211 +
1.212 + f = fdopen(dup(self.f.fileno()), "rb")
1.213 + f.seek(offset)
1.214 + return PositionIndexIterator(f, doc_frequency)
1.215 +
1.216 +class PositionIndexIterator(PositionIndexReader, IteratorBase):
1.217 +
1.218 + "Iterating over position index records."
1.219 +
1.220 + def __init__(self, f, count):
1.221 + PositionIndexReader.__init__(self, f)
1.222 + IteratorBase.__init__(self, count)
1.223 + self.section_count = 0
1.224 +
1.225 + def next(self):
1.226 +
1.227 + "Read the index record for the next section of documents."
1.228
1.229 - def __len__(self):
1.230 - return self.ndocuments
1.231 + self.read_documents += self.section_count
1.232 + if self.read_documents < self.count:
1.233 + docnum, pos_offset, self.section_count = t = self.read_positions()
1.234 + return t
1.235 + else:
1.236 + raise StopIteration
1.237 +
1.238 +class PositionDictionaryWriter:
1.239 +
1.240 + "Writing position dictionaries."
1.241 +
1.242 + def __init__(self, position_writer, position_index_writer, interval):
1.243 + self.position_writer = position_writer
1.244 + self.position_index_writer = position_index_writer
1.245 + self.interval = interval
1.246 +
1.247 + def write_term_positions(self, doc_positions):
1.248 +
1.249 + """
1.250 + Write all 'doc_positions' - a collection of tuples of the form (document
1.251 + number, position list) - to the file.
1.252 +
1.253 + Add some records to the index, making dictionary entries.
1.254 +
1.255 + Return a tuple containing the offset of the first index entry written,
1.256 + the frequency (number of positions), and document frequency (number of
1.257 + documents) for the term involved.
1.258 + """
1.259 +
1.260 + # Reset the writers so that deltas are relative to the start of this term.
1.261 +
1.262 + self.position_writer.reset()
1.263 + self.position_index_writer.reset()
1.264 +
1.265 + # Write the positions.
1.266 +
1.267 + frequency = 0
1.268 + first_offset = index_offset = None
1.269 + count = 0
1.270 +
1.271 + doc_positions.sort()
1.272 +
1.273 + for docnum, positions in doc_positions:
1.274 + pos_offset = self.position_writer.write_positions(docnum, positions)
1.275 +
1.276 + # Retain the first record offset for a subsequent index entry.
1.277 +
1.278 + if first_offset is None:
1.279 + first_offset = pos_offset
1.280 +
1.281 + frequency += len(positions)
1.282 + count += 1
1.283 +
1.284 + # Every {interval} entries, write an index entry.
1.285 +
1.286 + if count == self.interval:
1.287
1.288 - def sort(self):
1.289 + io = self.position_index_writer.write_positions(docnum, first_offset, self.interval)
1.290 +
1.291 + # Remember the first index entry offset.
1.292 +
1.293 + if index_offset is None:
1.294 + index_offset = io
1.295 +
1.296 + first_offset = None
1.297 + count = 0
1.298 +
1.299 + # Finish writing an index entry for the remaining documents.
1.300 +
1.301 + else:
1.302 + if first_offset is not None:
1.303 + io = self.position_index_writer.write_positions(docnum, first_offset, count)
1.304 +
1.305 + # Remember the first index entry offset.
1.306 +
1.307 + if index_offset is None:
1.308 + index_offset = io
1.309 +
1.310 + return index_offset, frequency, len(doc_positions)
1.311 +
1.312 + def close(self):
1.313 + self.position_writer.close()
1.314 + self.position_index_writer.close()
1.315 +
1.316 +class PositionDictionaryReader:
1.317
1.318 - "Stored document positions are already sorted."
1.319 + "Reading position dictionaries."
1.320 +
1.321 + def __init__(self, position_reader, position_index_reader):
1.322 + self.position_reader = position_reader
1.323 + self.position_index_reader = position_index_reader
1.324 +
1.325 + def read_term_positions(self, offset, doc_frequency):
1.326 +
1.327 + """
1.328 + Return an iterator for dictionary entries starting at 'offset' with the
1.329 + given 'doc_frequency'.
1.330 + """
1.331
1.332 - pass
1.333 + return PositionDictionaryIterator(self.position_reader,
1.334 + self.position_index_reader, offset, doc_frequency)
1.335 +
1.336 + def close(self):
1.337 + self.position_reader.close()
1.338 + self.position_index_reader.close()
1.339 +
1.340 +class PositionDictionaryIterator:
1.341 +
1.342 + "Iteration over position dictionary entries."
1.343 +
1.344 + def __init__(self, position_reader, position_index_reader, offset, doc_frequency):
1.345 + self.position_reader = position_reader
1.346 +
1.347 + self.index_iterator = position_index_reader.read_term_positions(offset, doc_frequency)
1.348 + self.next_section()
1.349 + self.init_section()
1.350
1.351 def __iter__(self):
1.352 return self
1.353
1.354 def next(self):
1.355
1.356 - "Read positions for a single document."
1.357 + # Attempt to get the next document record from the section in the positions file.
1.358 +
1.359 + while 1:
1.360 +
1.361 + # Either return the next record.
1.362 +
1.363 + try:
1.364 + return self.iterator.next()
1.365
1.366 - if self.read_documents < self.ndocuments:
1.367 - self.read_documents += 1
1.368 - return self.read_positions()
1.369 - else:
1.370 - raise StopIteration
1.371 + # Or, where a section is finished, get the next section and try again.
1.372 +
1.373 + except StopIteration:
1.374 + self.next_section()
1.375 + self.iterator.replenish(self.section_count)
1.376 +
1.377 + def next_section(self):
1.378 + self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() # next() stops at doc_frequency via StopIteration
1.379 +
1.380 + def init_section(self):
1.381 + self.iterator = self.position_reader.read_term_positions(self.pos_offset, self.section_count)
1.382
1.383 class TermWriter(FileWriter):
1.384
1.385 @@ -340,12 +562,13 @@
1.386 self.last_term = ""
1.387 self.last_offset = 0
1.388
1.389 - def write_term(self, term, offset, frequency):
1.390 + def write_term(self, term, offset, frequency, doc_frequency):
1.391
1.392 """
1.393 - Write the given 'term', its position file 'offset', and its 'frequency'
1.394 - to the term information file. Return the offset after the term
1.395 - information was written to the file.
1.396 + Write the given 'term', its position file 'offset', its 'frequency' and
1.397 + its 'doc_frequency' (number of documents in which it appears) to the
1.398 + term information file. Return the offset after the term information was
1.399 + written to the file.
1.400 """
1.401
1.402 # Write the prefix length and term suffix.
1.403 @@ -364,6 +587,10 @@
1.404
1.405 self.write_number(frequency)
1.406
1.407 + # Write the document frequency.
1.408 +
1.409 + self.write_number(doc_frequency)
1.410 +
1.411 self.last_term = term
1.412 self.last_offset = offset
1.413
1.414 @@ -380,8 +607,8 @@
1.415 def read_term(self):
1.416
1.417 """
1.418 - Read a term, its position file offset, and its frequency from the term
1.419 - information file.
1.420 + Read a term, its position file offset, its frequency and its document
1.421 + frequency from the term information file.
1.422 """
1.423
1.424 # Read the prefix length and term suffix.
1.425 @@ -399,7 +626,11 @@
1.426
1.427 frequency = self.read_number()
1.428
1.429 - return self.last_term, self.last_offset, frequency
1.430 + # Read the document frequency.
1.431 +
1.432 + doc_frequency = self.read_number()
1.433 +
1.434 + return self.last_term, self.last_offset, frequency, doc_frequency
1.435
1.436 def go_to_term(self, term, offset, info_offset):
1.437
1.438 @@ -420,15 +651,15 @@
1.439 TermWriter.reset(self)
1.440 self.last_info_offset = 0
1.441
1.442 - def write_term(self, term, offset, frequency, info_offset):
1.443 + def write_term(self, term, offset, frequency, doc_frequency, info_offset):
1.444
1.445 """
1.446 - Write the given 'term', its position file 'offset', and its 'frequency'
1.447 - to the term dictionary index file, along with the 'info_offset' in the
1.448 - term information file.
1.449 + Write the given 'term', its position file 'offset', its 'frequency' and
1.450 + its 'doc_frequency' to the term dictionary index file, along with the
1.451 + 'info_offset' in the term information file.
1.452 """
1.453
1.454 - TermWriter.write_term(self, term, offset, frequency)
1.455 + TermWriter.write_term(self, term, offset, frequency, doc_frequency)
1.456
1.457 # Write the information file offset delta.
1.458
1.459 @@ -446,41 +677,43 @@
1.460 def read_term(self):
1.461
1.462 """
1.463 - Read a term, its position file offset, its frequency, and its term
1.464 - information file offset from the term dictionary index file.
1.465 + Read a term, its position file offset, its frequency, its document
1.466 + frequency and a term information file offset from the term dictionary
1.467 + index file.
1.468 """
1.469
1.470 - term, offset, frequency = TermReader.read_term(self)
1.471 + term, offset, frequency, doc_frequency = TermReader.read_term(self)
1.472
1.473 # Read the offset delta.
1.474
1.475 self.last_info_offset += self.read_number()
1.476
1.477 - return term, offset, frequency, self.last_info_offset
1.478 + return term, offset, frequency, doc_frequency, self.last_info_offset
1.479
1.480 class TermDictionaryWriter:
1.481
1.482 "Writing term dictionaries."
1.483
1.484 - def __init__(self, info_writer, index_writer, position_writer, interval):
1.485 + def __init__(self, info_writer, index_writer, position_dict_writer, interval):
1.486 self.info_writer = info_writer
1.487 self.index_writer = index_writer
1.488 - self.position_writer = position_writer
1.489 + self.position_dict_writer = position_dict_writer
1.490 self.interval = interval
1.491 self.entry = 0
1.492
1.493 - def _write_term(self, term, offset, frequency):
1.494 + def _write_term(self, term, offset, frequency, doc_frequency):
1.495
1.496 """
1.497 - Write the given 'term', its position file 'offset', and its 'frequency'
1.498 - to the term information file and optionally to the index, making a
1.499 - dictionary entry.
1.500 + Write the given 'term', its position file 'offset', its 'frequency' and
1.501 + its 'doc_frequency' (number of documents in which it appears) to the
1.502 + term information file and optionally to the index, making a dictionary
1.503 + entry.
1.504 """
1.505
1.506 - info_offset = self.info_writer.write_term(term, offset, frequency)
1.507 + info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)
1.508
1.509 if self.entry % self.interval == 0:
1.510 - self.index_writer.write_term(term, offset, frequency, info_offset)
1.511 + self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)
1.512
1.513 self.entry += 1
1.514
1.515 @@ -491,13 +724,13 @@
1.516 and positions at which the term is found.
1.517 """
1.518
1.519 - offset, frequency = self.position_writer.write_term_positions(doc_positions)
1.520 - self._write_term(term, offset, frequency)
1.521 + offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)
1.522 + self._write_term(term, offset, frequency, doc_frequency)
1.523
1.524 def close(self):
1.525 self.info_writer.close()
1.526 self.index_writer.close()
1.527 - self.position_writer.close()
1.528 + self.position_dict_writer.close()
1.529
1.530 class TermDictionaryReader:
1.531
1.532 @@ -533,12 +766,13 @@
1.533 if i == -1:
1.534 return None
1.535
1.536 - found_term, offset, frequency, info_offset = self.terms[i]
1.537 + found_term, offset, frequency, doc_frequency, info_offset = self.terms[i]
1.538
1.539 - # Where the term is found immediately, return the offset.
1.540 + # Where the term is found immediately, return the offset and
1.541 + # frequencies.
1.542
1.543 if term == found_term:
1.544 - return offset, frequency
1.545 + return offset, frequency, doc_frequency
1.546
1.547 # Otherwise, seek past the index term's entry in the information file
1.548 # and scan for the desired term.
1.549 @@ -547,33 +781,33 @@
1.550 self.info_reader.go_to_term(found_term, offset, info_offset)
1.551 try:
1.552 while term > found_term:
1.553 - found_term, offset, frequency = self.info_reader.read_term()
1.554 + found_term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.555 except EOFError:
1.556 pass
1.557
1.558 - # If the term is found, return the offset and frequency.
1.559 + # If the term is found, return the offset and frequencies.
1.560
1.561 if term == found_term:
1.562 - return offset, frequency
1.563 + return offset, frequency, doc_frequency
1.564 else:
1.565 return None
1.566
1.567 def rewind(self):
1.568 self.info_reader.rewind()
1.569
1.570 - def _get_positions(self, offset):
1.571 - return self.position_reader.read_term_positions(offset)
1.572 + def _get_positions(self, offset, doc_frequency):
1.573 + return self.position_reader.read_term_positions(offset, doc_frequency)
1.574
1.575 def read_term(self):
1.576
1.577 """
1.578 - Return the next term, its frequency and the documents and positions at
1.579 - which the term is found.
1.580 + Return the next term, its frequency, its document frequency, and the
1.581 + documents and positions at which the term is found.
1.582 """
1.583
1.584 - term, offset, frequency = self.info_reader.read_term()
1.585 - positions = self._get_positions(offset)
1.586 - return term, frequency, positions
1.587 + term, offset, frequency, doc_frequency = self.info_reader.read_term()
1.588 + positions = self._get_positions(offset, doc_frequency)
1.589 + return term, frequency, doc_frequency, positions
1.590
1.591 def find_positions(self, term):
1.592
1.593 @@ -583,8 +817,8 @@
1.594 if t is None:
1.595 return None
1.596 else:
1.597 - offset, frequency = t
1.598 - return self._get_positions(offset)
1.599 + offset, frequency, doc_frequency = t
1.600 + return self._get_positions(offset, doc_frequency)
1.601
1.602 def get_frequency(self, term):
1.603
1.604 @@ -594,9 +828,20 @@
1.605 if t is None:
1.606 return None
1.607 else:
1.608 - offset, frequency = t
1.609 + offset, frequency, doc_frequency = t
1.610 return frequency
1.611
1.612 + def get_document_frequency(self, term):
1.613 +
1.614 + "Return the document frequency of the given 'term'."
1.615 +
1.616 + t = self._find_term(term)
1.617 + if t is None:
1.618 + return None
1.619 + else:
1.620 + offset, frequency, doc_frequency = t
1.621 + return doc_frequency
1.622 +
1.623 def close(self):
1.624 self.info_reader.close()
1.625 self.index_reader.close()
1.626 @@ -850,7 +1095,7 @@
1.627 reader.rewind()
1.628
1.629 try:
1.630 - term, frequency, positions = reader.read_term()
1.631 + term, frequency, doc_frequency, positions = reader.read_term()
1.632 insort_right(entries, (term, positions, partition))
1.633 except EOFError:
1.634 pass
1.635 @@ -889,7 +1134,7 @@
1.636
1.637 for partition in to_update:
1.638 try:
1.639 - term, frequency, positions = self.readers[partition].read_term()
1.640 + term, frequency, doc_frequency, positions = self.readers[partition].read_term()
1.641 insort_right(entries, (term, positions, partition))
1.642 except EOFError:
1.643 pass
1.644 @@ -975,12 +1220,12 @@
1.645
1.646 # Utility functions.
1.647
1.648 -def get_term_writer(pathname, partition, interval):
1.649 +def get_term_writer(pathname, partition, interval, doc_interval):
1.650
1.651 """
1.652 Return a term dictionary writer using files under the given 'pathname'
1.653 labelled according to the given 'partition', using the given indexing
1.654 - 'interval'.
1.655 + 'interval' for terms and 'doc_interval' for document position records.
1.656 """
1.657
1.658 tdf = open(join(pathname, "terms-%s" % partition), "wb")
1.659 @@ -992,7 +1237,12 @@
1.660 tpf = open(join(pathname, "positions-%s" % partition), "wb")
1.661 positions_writer = PositionWriter(tpf)
1.662
1.663 - return TermDictionaryWriter(info_writer, index_writer, positions_writer, interval)
1.664 + tpif = open(join(pathname, "positions_index-%s" % partition), "wb")
1.665 + positions_index_writer = PositionIndexWriter(tpif)
1.666 +
1.667 + positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)
1.668 +
1.669 + return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)
1.670
1.671 def get_field_writer(pathname, partition, interval):
1.672
1.673 @@ -1026,7 +1276,12 @@
1.674 tpf = open(join(pathname, "positions-%s" % partition), "rb")
1.675 positions_reader = PositionReader(tpf)
1.676
1.677 - return TermDictionaryReader(info_reader, index_reader, positions_reader)
1.678 + tpif = open(join(pathname, "positions_index-%s" % partition), "rb")
1.679 + positions_index_reader = PositionIndexReader(tpif)
1.680 +
1.681 + positions_dict_reader = PositionDictionaryReader(positions_reader, positions_index_reader)
1.682 +
1.683 + return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)
1.684
1.685 def get_field_reader(pathname, partition):
1.686
2.1 --- a/test.py Mon Aug 31 21:02:30 2009 +0200
2.2 +++ b/test.py Wed Sep 02 01:30:42 2009 +0200
2.3 @@ -38,15 +38,18 @@
2.4 all_doc_positions = [
2.5 [
2.6 (123, [1, 3, 5, 15, 25]),
2.7 - (124, [0, 100])
2.8 + (124, [0, 100]),
2.9 + (125, [11, 99, 199]),
2.10 + (130, [77, 78, 80, 82, 89])
2.11 ],
2.12 [
2.13 (78, [9]),
2.14 - (196, [10, 11])
2.15 + (196, [10, 11]),
2.16 + (197, [17, 21, 30])
2.17 ]
2.18 ]
2.19
2.20 -f = open("test", "wb")
2.21 +f = open("testP", "wb")
2.22 w = iixr.PositionWriter(f)
2.23 for doc_positions in all_doc_positions:
2.24 for docnum, positions in doc_positions:
2.25 @@ -54,7 +57,7 @@
2.26 w.reset()
2.27 w.close()
2.28
2.29 -f = open("test", "rb")
2.30 +f = open("testP", "rb")
2.31 r = iixr.PositionReader(f)
2.32 for doc_positions in all_doc_positions:
2.33 for docnum, positions in doc_positions:
2.34 @@ -64,20 +67,68 @@
2.35 r.reset()
2.36 r.close()
2.37
2.38 -f = open("test", "wb")
2.39 +# Test position index files.
2.40 +
2.41 +indexed_positions = [
2.42 + [
2.43 + (1234, 0, 100),
2.44 + (2345, 700, 100),
2.45 + (3456, 1900, 50)
2.46 + ],
2.47 + [
2.48 + (4567, 2800, 20)
2.49 + ]
2.50 + ]
2.51 +
2.52 +offsets = []
2.53 +f = open("testPI", "wb")
2.54 +w = iixr.PositionIndexWriter(f)
2.55 +for term_positions in indexed_positions:
2.56 + offset = None
2.57 + doc_frequency = 0
2.58 + w.reset()
2.59 + for docnum, pos_offset, count in term_positions:
2.60 + io = w.write_positions(docnum, pos_offset, count)
2.61 + if offset is None:
2.62 + offset = io
2.63 + doc_frequency += count
2.64 + offsets.append((offset, doc_frequency))
2.65 +w.close()
2.66 +
2.67 +f = open("testPI", "rb")
2.68 +r = iixr.PositionIndexReader(f)
2.69 +offsets.reverse()
2.70 +indexed_positions.reverse()
2.71 +for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
2.72 + found_positions = r.read_term_positions(offset, doc_frequency)
2.73 + for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
2.74 + print docnum == dn, docnum, dn
2.75 + print pos_offset == po, pos_offset, po
2.76 + print count == c, count, c
2.77 +r.close()
2.78 +
2.79 +# Test position dictionaries.
2.80 +
2.81 +f = open("testP", "wb")
2.82 w = iixr.PositionWriter(f)
2.83 +f2 = open("testPI", "wb")
2.84 +w2 = iixr.PositionIndexWriter(f2)
2.85 +wd = iixr.PositionDictionaryWriter(w, w2, 2)
2.86 offsets = []
2.87 for doc_positions in all_doc_positions:
2.88 - offset, frequency = w.write_term_positions(doc_positions)
2.89 - offsets.append(offset)
2.90 + offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
2.91 + offsets.append((offset, doc_frequency))
2.92 w.close()
2.93
2.94 -f = open("test", "rb")
2.95 +f = open("testP", "rb")
2.96 r = iixr.PositionReader(f)
2.97 +f2 = open("testPI", "rb")
2.98 +r2 = iixr.PositionIndexReader(f2)
2.99 +rd = iixr.PositionDictionaryReader(r, r2)
2.100 offsets.reverse()
2.101 all_doc_positions.reverse()
2.102 -for offset, doc_positions in zip(offsets, all_doc_positions):
2.103 - dp = list(r.read_term_positions(offset))
2.104 +for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
2.105 + dp = list(rd.read_term_positions(offset, doc_frequency))
2.106 print doc_positions == dp, doc_positions, dp
2.107 r.close()
2.108
2.109 @@ -166,55 +217,57 @@
2.110 # Test terms.
2.111
2.112 terms = [
2.113 - # term offset frequency
2.114 - ("aardvark", 100000123, 1),
2.115 - ("anteater", 100000456, 2),
2.116 - ("badger", 100000789, 13),
2.117 - ("bull", 1000001234, 59),
2.118 - ("bulldog", 1000002345, 99),
2.119 - ("cat", 1000003456, 89)
2.120 + # term offset frequency doc_frequency
2.121 + ("aardvark", 100000123, 1, 1),
2.122 + ("anteater", 100000456, 2, 1),
2.123 + ("badger", 100000789, 13, 7),
2.124 + ("bull", 1000001234, 59, 17),
2.125 + ("bulldog", 1000002345, 99, 80),
2.126 + ("cat", 1000003456, 89, 28)
2.127 ]
2.128
2.129 f = open("test", "wb")
2.130 w = iixr.TermWriter(f)
2.131 -for term, offset, frequency in terms:
2.132 - w.write_term(term, offset, frequency)
2.133 +for term, offset, frequency, doc_frequency in terms:
2.134 + w.write_term(term, offset, frequency, doc_frequency)
2.135 w.close()
2.136
2.137 f = open("test", "rb")
2.138 r = iixr.TermReader(f)
2.139 -for term, offset, frequency in terms:
2.140 - t, o, fr = r.read_term()
2.141 +for term, offset, frequency, doc_frequency in terms:
2.142 + t, o, fr, df = r.read_term()
2.143 print term == t, term, t
2.144 print offset == o, offset, o
2.145 print frequency == fr, frequency, fr
2.146 + print doc_frequency == df, doc_frequency, df
2.147 r.close()
2.148
2.149 # Test terms in index files.
2.150
2.151 indexed_terms = [
2.152 - # term offset frequency info_offset
2.153 - ("aardvark", 100000123, 1, 200000321),
2.154 - ("anteater", 100000456, 2, 200000654),
2.155 - ("badger", 100000789, 13, 200000987),
2.156 - ("bull", 1000001234, 59, 200004321),
2.157 - ("bulldog", 1000002345, 99, 200005432),
2.158 - ("cat", 1000003456, 89, 200006543)
2.159 + # term offset frequency doc_frequency info_offset
2.160 + ("aardvark", 100000123, 1, 1, 200000321),
2.161 + ("anteater", 100000456, 2, 1, 200000654),
2.162 + ("badger", 100000789, 13, 7, 200000987),
2.163 + ("bull", 1000001234, 59, 17, 200004321),
2.164 + ("bulldog", 1000002345, 99, 80, 200005432),
2.165 + ("cat", 1000003456, 89, 28, 200006543)
2.166 ]
2.167
2.168 f = open("test", "wb")
2.169 w = iixr.TermIndexWriter(f)
2.170 -for term, offset, frequency, info_offset in indexed_terms:
2.171 - w.write_term(term, offset, frequency, info_offset)
2.172 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
2.173 + w.write_term(term, offset, frequency, doc_frequency, info_offset)
2.174 w.close()
2.175
2.176 f = open("test", "rb")
2.177 r = iixr.TermIndexReader(f)
2.178 -for term, offset, frequency, info_offset in indexed_terms:
2.179 - t, o, fr, i = r.read_term()
2.180 +for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
2.181 + t, o, fr, df, i = r.read_term()
2.182 print term == t, term, t
2.183 print offset == o, offset, o
2.184 print frequency == fr, frequency, fr
2.185 + print doc_frequency == df, doc_frequency, df
2.186 print info_offset == i, info_offset, i
2.187 r.close()
2.188
2.189 @@ -224,26 +277,23 @@
2.190 w = iixr.TermWriter(f)
2.191 f2 = open("testI", "wb")
2.192 w2 = iixr.TermIndexWriter(f2)
2.193 -f3 = open("testP", "wb")
2.194 -w3 = iixr.PositionWriter(f3)
2.195 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
2.196 -for term, offset, frequency in terms:
2.197 - wd._write_term(term, offset, frequency)
2.198 +wd = iixr.TermDictionaryWriter(w, w2, None, 3)
2.199 +for term, offset, frequency, doc_frequency in terms:
2.200 + wd._write_term(term, offset, frequency, doc_frequency)
2.201 wd.close()
2.202
2.203 f = open("test", "rb")
2.204 r = iixr.TermReader(f)
2.205 f2 = open("testI", "rb")
2.206 r2 = iixr.TermIndexReader(f2)
2.207 -f3 = open("testP", "rb")
2.208 -r3 = iixr.PositionReader(f3)
2.209 -rd = iixr.TermDictionaryReader(r, r2, r3)
2.210 +rd = iixr.TermDictionaryReader(r, r2, None)
2.211 terms_reversed = terms[:]
2.212 terms_reversed.reverse()
2.213 -for term, offset, frequency in terms_reversed:
2.214 - o, fr = rd._find_term(term)
2.215 +for term, offset, frequency, doc_frequency in terms_reversed:
2.216 + o, fr, df = rd._find_term(term)
2.217 print offset == o, offset, o
2.218 print frequency == fr, frequency, fr
2.219 + print doc_frequency == df, doc_frequency, df
2.220 for term in ("dog", "dingo"):
2.221 t = rd._find_term(term)
2.222 print t is None, t
2.223 @@ -255,7 +305,7 @@
2.224 ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
2.225 ("anteater", [(1, [43, 44])]),
2.226 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
2.227 - ("bull", [(6, [128]), (16, [12])]),
2.228 + ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
2.229 ("bulldog", [(43, [17, 19, 256, 512])]),
2.230 ("cat", [(123, [12, 145, 196]), (1200, [113])])
2.231 ]
2.232 @@ -266,7 +316,10 @@
2.233 w2 = iixr.TermIndexWriter(f2)
2.234 f3 = open("testP", "wb")
2.235 w3 = iixr.PositionWriter(f3)
2.236 -wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
2.237 +f4 = open("testPI", "wb")
2.238 +w4 = iixr.PositionIndexWriter(f4)
2.239 +wp = iixr.PositionDictionaryWriter(w3, w4, 2)
2.240 +wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
2.241 for term, doc_positions in terms_with_positions:
2.242 wd.write_term_positions(term, doc_positions)
2.243 wd.close()
2.244 @@ -277,7 +330,10 @@
2.245 r2 = iixr.TermIndexReader(f2)
2.246 f3 = open("testP", "rb")
2.247 r3 = iixr.PositionReader(f3)
2.248 -rd = iixr.TermDictionaryReader(r, r2, r3)
2.249 +f4 = open("testPI", "rb")
2.250 +r4 = iixr.PositionIndexReader(f4)
2.251 +rp = iixr.PositionDictionaryReader(r3, r4)
2.252 +rd = iixr.TermDictionaryReader(r, r2, rp)
2.253 terms_reversed = terms_with_positions[:]
2.254 terms_reversed.reverse()
2.255 for term, doc_positions in terms_reversed:
2.256 @@ -291,7 +347,7 @@
2.257
2.258 rd.rewind()
2.259 for term, doc_positions in terms_with_positions:
2.260 - t, fr, dp = rd.read_term()
2.261 + t, fr, df, dp = rd.read_term()
2.262 dp = list(dp)
2.263 print term == t, term, t
2.264 print doc_positions == dp, doc_positions, dp