paul@44 | 1 | #!/usr/bin/env python |
paul@44 | 2 | |
paul@44 | 3 | """ |
paul@44 | 4 | Specific classes for storing term information. |
paul@44 | 5 | |
paul@89 | 6 | Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> |
paul@44 | 7 | |
paul@44 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@44 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@44 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@44 | 11 | version. |
paul@44 | 12 | |
paul@44 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@44 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@44 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@44 | 16 | |
paul@44 | 17 | You should have received a copy of the GNU General Public License along |
paul@44 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@44 | 19 | """ |
paul@44 | 20 | |
paul@96 | 21 | from iixr.data import * |
paul@44 | 22 | from iixr.files import * |
paul@97 | 23 | from itermerge import itermerge |
paul@44 | 24 | from os.path import commonprefix # to find common string prefixes |
paul@97 | 25 | from bisect import bisect_right, insort_right |
paul@97 | 26 | import operator |
paul@44 | 27 | |
paul@44 | 28 | class TermWriter(FileWriter): |
paul@44 | 29 | |
paul@44 | 30 | "Writing term information to files." |
paul@44 | 31 | |
paul@96 | 32 | def begin(self, docnum_size, position_size): |
paul@96 | 33 | |
paul@96 | 34 | "Begin writing to the file." |
paul@96 | 35 | |
paul@96 | 36 | self.write_numbers((docnum_size, position_size)) |
paul@90 | 37 | self.end_record() |
paul@96 | 38 | |
paul@96 | 39 | self.data_start = self.tell() |
paul@96 | 40 | self.docnum_size = docnum_size |
paul@96 | 41 | self.position_size = position_size |
paul@96 | 42 | self.subtractor = get_subtractor(docnum_size) |
paul@44 | 43 | self.last_term = "" |
paul@44 | 44 | |
paul@96 | 45 | def write_terms(self, terms): |
paul@44 | 46 | |
paul@44 | 47 | """ |
paul@96 | 48 | Write the 'terms' to the term information file, with each term's details |
paul@96 | 49 | stored in a separate record. |
paul@44 | 50 | """ |
paul@44 | 51 | |
paul@96 | 52 | if hasattr(terms, "items"): |
paul@96 | 53 | terms = terms.items() |
paul@96 | 54 | terms.sort() |
paul@96 | 55 | |
paul@96 | 56 | for term, doc_positions in terms: |
paul@96 | 57 | if not doc_positions: |
paul@96 | 58 | continue |
paul@96 | 59 | |
paul@96 | 60 | if hasattr(doc_positions, "items"): |
paul@96 | 61 | doc_positions = doc_positions.items() |
paul@96 | 62 | |
paul@96 | 63 | docnum, positions = doc_positions[0] |
paul@96 | 64 | |
paul@96 | 65 | if not positions: |
paul@96 | 66 | continue |
paul@96 | 67 | |
paul@96 | 68 | # Start the writing, if appropriate. |
paul@96 | 69 | |
paul@96 | 70 | if self.data_start is None: |
paul@96 | 71 | self.begin(sizeof(docnum), sizeof(positions[0])) |
paul@96 | 72 | |
paul@96 | 73 | # Write each term and document positions. |
paul@96 | 74 | |
paul@96 | 75 | self.write_term(term, doc_positions) |
paul@96 | 76 | self.end_record() |
paul@96 | 77 | |
paul@96 | 78 | # Methods requiring an open record. |
paul@96 | 79 | |
paul@96 | 80 | def write_term(self, term, doc_positions): |
paul@96 | 81 | |
paul@96 | 82 | """ |
paul@96 | 83 | Write the given 'term', its document frequency (number of documents in |
paul@96 | 84 | which it appears), and 'doc_positions' to the term information file. |
paul@96 | 85 | """ |
paul@96 | 86 | |
paul@96 | 87 | self.write_term_only(term) |
paul@96 | 88 | |
paul@96 | 89 | # Write the document frequency and the term positions. |
paul@96 | 90 | |
paul@96 | 91 | self.write_positions(doc_positions) |
paul@96 | 92 | |
paul@96 | 93 | def write_term_plus_remaining(self, term, data): |
paul@96 | 94 | |
paul@96 | 95 | "Write the given 'term' and the document position 'data'." |
paul@96 | 96 | |
paul@96 | 97 | self.write_term_only(term) |
paul@96 | 98 | self.write_remaining(data) |
paul@96 | 99 | |
paul@96 | 100 | def write_term_only(self, term): |
paul@96 | 101 | |
paul@96 | 102 | "Write only the given 'term'." |
paul@96 | 103 | |
paul@75 | 104 | if term <= self.last_term: |
paul@75 | 105 | raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) |
paul@75 | 106 | |
paul@44 | 107 | # Write the prefix length and term suffix. |
paul@44 | 108 | |
paul@44 | 109 | common = len(commonprefix([self.last_term, term])) |
paul@44 | 110 | suffix = term[common:] |
paul@44 | 111 | |
paul@44 | 112 | self.write_number(common) |
paul@44 | 113 | self.write_string(suffix) |
paul@44 | 114 | |
paul@96 | 115 | self.last_term = term |
paul@96 | 116 | |
paul@96 | 117 | def write_positions(self, doc_positions): |
paul@96 | 118 | |
paul@96 | 119 | "Write the given 'doc_positions' to the file." |
paul@96 | 120 | |
paul@96 | 121 | # Make sure that the positions are sorted. |
paul@96 | 122 | |
paul@96 | 123 | doc_positions.sort() |
paul@96 | 124 | |
paul@44 | 125 | # Write the document frequency. |
paul@44 | 126 | |
paul@96 | 127 | self.write_number(len(doc_positions)) |
paul@96 | 128 | |
paul@96 | 129 | last_docnum = None |
paul@96 | 130 | |
paul@96 | 131 | for docnum, positions in doc_positions: |
paul@96 | 132 | |
paul@96 | 133 | # Store the first document number as it is. |
paul@96 | 134 | |
paul@96 | 135 | if last_docnum is None: |
paul@96 | 136 | docnum_seq = docnum |
paul@96 | 137 | |
paul@96 | 138 | # Reject out-of-order documents. |
paul@96 | 139 | |
paul@96 | 140 | elif docnum < last_docnum: |
paul@96 | 141 | raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum) |
paul@44 | 142 | |
paul@96 | 143 | # Calculate an ongoing delta. |
paul@96 | 144 | |
paul@96 | 145 | else: |
paul@96 | 146 | docnum_seq = self.subtractor(docnum, last_docnum) |
paul@96 | 147 | |
paul@96 | 148 | # Write the document number and positions. |
paul@96 | 149 | |
paul@96 | 150 | self.write_sequence_value(docnum_seq, self.docnum_size) |
paul@96 | 151 | self.write_monotonic_sequence(positions, self.position_size) |
paul@96 | 152 | |
paul@96 | 153 | last_docnum = docnum |
paul@96 | 154 | |
paul@96 | 155 | # Write a terminating byte to indicate that no more document pages |
paul@96 | 156 | # exist. |
paul@96 | 157 | |
paul@96 | 158 | self.write_byte(0) |
paul@44 | 159 | |
paul@44 | 160 | class TermReader(FileReader): |
paul@44 | 161 | |
paul@44 | 162 | "Reading term information from files." |
paul@44 | 163 | |
paul@96 | 164 | def begin(self): |
paul@96 | 165 | |
paul@96 | 166 | "Begin reading from the file." |
paul@96 | 167 | |
paul@96 | 168 | self.begin_record() |
paul@96 | 169 | try: |
paul@96 | 170 | self.docnum_size, self.position_size = self.read_numbers(2) |
paul@96 | 171 | except EOFError: |
paul@96 | 172 | self.docnum_size, self.position_size = 0, 0 # NOTE: No positions! |
paul@96 | 173 | |
paul@96 | 174 | self.data_start = self.tell() |
paul@96 | 175 | self.adder = get_adder(self.docnum_size) |
paul@44 | 176 | self.last_term = "" |
paul@96 | 177 | |
paul@96 | 178 | def get_sizes(self): |
paul@96 | 179 | return self.docnum_size, self.position_size |
paul@96 | 180 | |
paul@96 | 181 | # Methods requiring an open record. |
paul@44 | 182 | |
paul@44 | 183 | def read_term(self): |
paul@44 | 184 | |
paul@96 | 185 | "Read a term and its document positions from the term information file." |
paul@96 | 186 | |
paul@96 | 187 | # Read the term. |
paul@96 | 188 | |
paul@96 | 189 | self.read_term_only() |
paul@96 | 190 | |
paul@96 | 191 | # Read the document frequency and the term positions. |
paul@96 | 192 | |
paul@96 | 193 | positions = self.read_positions() |
paul@96 | 194 | |
paul@96 | 195 | return self.last_term, positions |
paul@96 | 196 | |
paul@96 | 197 | def read_term_plus_remaining(self): |
paul@96 | 198 | |
paul@44 | 199 | """ |
paul@96 | 200 | Read a term and the unprocessed document position data. |
paul@44 | 201 | """ |
paul@44 | 202 | |
paul@96 | 203 | self.read_term_only() |
paul@96 | 204 | return self.last_term, self.read_remaining() |
paul@96 | 205 | |
paul@96 | 206 | def read_term_only(self): |
paul@96 | 207 | |
paul@96 | 208 | "Read a term only." |
paul@96 | 209 | |
paul@44 | 210 | # Read the prefix length and term suffix. |
paul@44 | 211 | |
paul@44 | 212 | common = self.read_number() |
paul@44 | 213 | suffix = self.read_string() |
paul@44 | 214 | |
paul@44 | 215 | self.last_term = self.last_term[:common] + suffix |
paul@96 | 216 | return self.last_term |
paul@44 | 217 | |
paul@96 | 218 | def read_positions(self): |
paul@44 | 219 | |
paul@96 | 220 | "Read document positions from the term information file." |
paul@44 | 221 | |
paul@96 | 222 | doc_positions = [] |
paul@44 | 223 | |
paul@96 | 224 | while 1: |
paul@44 | 225 | |
paul@96 | 226 | # Read the document frequency. |
paul@44 | 227 | |
paul@96 | 228 | npositions = self.read_number() |
paul@91 | 229 | |
paul@96 | 230 | last_docnum = None |
paul@96 | 231 | i = 0 |
paul@96 | 232 | while i < npositions: |
paul@44 | 233 | |
paul@96 | 234 | # Read the document number. |
paul@44 | 235 | |
paul@96 | 236 | docnum = self.read_sequence_value(self.docnum_size) |
paul@96 | 237 | if last_docnum is not None: |
paul@96 | 238 | docnum = self.adder(docnum, last_docnum) |
paul@44 | 239 | |
paul@96 | 240 | # Read the positions. |
paul@44 | 241 | |
paul@96 | 242 | positions = self.read_monotonic_sequence(self.position_size) |
paul@96 | 243 | doc_positions.append((docnum, positions)) |
paul@44 | 244 | |
paul@96 | 245 | last_docnum = docnum |
paul@96 | 246 | i += 1 |
paul@44 | 247 | |
paul@96 | 248 | # Read a terminating byte to discover whether more document pages |
paul@96 | 249 | # exist. |
paul@44 | 250 | |
paul@96 | 251 | if not self.read_byte(): |
paul@96 | 252 | break |
paul@44 | 253 | |
paul@96 | 254 | return doc_positions |
paul@81 | 255 | |
paul@97 | 256 | # Indexes covering the information files. |
paul@97 | 257 | |
paul@97 | 258 | class TermIndexWriter(FileWriter): |
paul@97 | 259 | |
paul@97 | 260 | "Writing term index information to files." |
paul@97 | 261 | |
paul@97 | 262 | def begin(self): |
paul@97 | 263 | |
paul@97 | 264 | "Begin writing to the file." |
paul@97 | 265 | |
paul@97 | 266 | self.data_start = self.tell() |
paul@97 | 267 | self.last_term = "" |
paul@97 | 268 | self.last_offset = 0 |
paul@97 | 269 | |
paul@97 | 270 | def write_term(self, term, offset): |
paul@97 | 271 | |
paul@97 | 272 | "Write the given 'term' and 'offset'." |
paul@97 | 273 | |
paul@97 | 274 | if term <= self.last_term: |
paul@97 | 275 | raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) |
paul@97 | 276 | |
paul@97 | 277 | # Write the prefix length and term suffix. |
paul@97 | 278 | |
paul@97 | 279 | common = len(commonprefix([self.last_term, term])) |
paul@97 | 280 | suffix = term[common:] |
paul@97 | 281 | |
paul@97 | 282 | self.write_number(common) |
paul@97 | 283 | self.write_string(suffix) |
paul@97 | 284 | |
paul@97 | 285 | # Write the offset delta. |
paul@97 | 286 | |
paul@97 | 287 | self.write_number(offset - self.last_offset) |
paul@97 | 288 | |
paul@97 | 289 | self.last_term = term |
paul@97 | 290 | self.last_offset = offset |
paul@97 | 291 | |
paul@97 | 292 | class TermIndexReader(FileReader): |
paul@58 | 293 | |
paul@97 | 294 | "Reading term index information to files." |
paul@97 | 295 | |
paul@97 | 296 | def begin(self): |
paul@97 | 297 | |
paul@97 | 298 | "Begin reading from the file." |
paul@97 | 299 | |
paul@97 | 300 | self.data_start = self.tell() |
paul@97 | 301 | self.last_term = "" |
paul@97 | 302 | self.last_offset = 0 |
paul@97 | 303 | |
paul@97 | 304 | def read_term(self): |
paul@97 | 305 | |
paul@97 | 306 | "Read a term and an offset from the file." |
paul@97 | 307 | |
paul@97 | 308 | # Read the prefix length and term suffix. |
paul@97 | 309 | |
paul@97 | 310 | common = self.read_number() |
paul@97 | 311 | suffix = self.read_string() |
paul@97 | 312 | |
paul@97 | 313 | self.last_term = self.last_term[:common] + suffix |
paul@97 | 314 | |
paul@97 | 315 | # Read the offset delta. |
paul@97 | 316 | |
paul@97 | 317 | self.last_offset += self.read_number() |
paul@97 | 318 | return self.last_term, self.last_offset |
paul@97 | 319 | |
paul@100 | 320 | # External reading classes. |
paul@99 | 321 | |
paul@100 | 322 | class TermIterator(TermReader): |
paul@99 | 323 | |
paul@100 | 324 | "An iterator over terms and positions read from a file." |
paul@44 | 325 | |
paul@44 | 326 | def __iter__(self): |
paul@44 | 327 | return self |
paul@44 | 328 | |
paul@99 | 329 | def next(self): |
paul@44 | 330 | try: |
paul@96 | 331 | self.begin_record() |
paul@44 | 332 | return self.read_term() |
paul@44 | 333 | except EOFError: |
paul@44 | 334 | raise StopIteration |
paul@44 | 335 | |
paul@100 | 336 | class TermDataIterator(TermReader): |
paul@90 | 337 | |
paul@96 | 338 | "An iterator over terms and unprocessed document positions data." |
paul@90 | 339 | |
paul@100 | 340 | def __iter__(self): |
paul@100 | 341 | return self |
paul@72 | 342 | |
paul@100 | 343 | def next(self): |
paul@44 | 344 | try: |
paul@96 | 345 | self.begin_record() |
paul@96 | 346 | return self.read_term_plus_remaining() |
paul@44 | 347 | except EOFError: |
paul@96 | 348 | raise StopIteration |
paul@44 | 349 | |
paul@97 | 350 | class TermIndexIterator(TermIndexReader): |
paul@97 | 351 | |
paul@97 | 352 | "An iterator over terms and offsets read from a file." |
paul@97 | 353 | |
paul@97 | 354 | def __iter__(self): |
paul@97 | 355 | return self |
paul@97 | 356 | |
paul@97 | 357 | def next(self): |
paul@97 | 358 | try: |
paul@97 | 359 | self.begin_record() |
paul@97 | 360 | return self.read_term() |
paul@97 | 361 | except EOFError: |
paul@97 | 362 | raise StopIteration |
paul@97 | 363 | |
paul@100 | 364 | class CombinedIterator: |
paul@97 | 365 | |
paul@97 | 366 | "An iterator providing index and information file access." |
paul@97 | 367 | |
paul@97 | 368 | def __init__(self, reader, index_reader): |
paul@97 | 369 | self.reader = reader |
paul@97 | 370 | self.index_reader = index_reader |
paul@97 | 371 | self.records = list(index_reader) |
paul@98 | 372 | self.terms = [t for t, dp in self.records] |
paul@98 | 373 | |
paul@100 | 374 | # Cache the last term and positions. |
paul@100 | 375 | |
paul@100 | 376 | self.last_term_returned = None |
paul@100 | 377 | self.last_positions_returned = None |
paul@100 | 378 | |
paul@97 | 379 | def go_to_term(self, term): |
paul@97 | 380 | |
paul@98 | 381 | """ |
paul@98 | 382 | Return the 'term' and positions or nearest following term and positions. |
paul@98 | 383 | """ |
paul@98 | 384 | |
paul@100 | 385 | if self.last_term_returned is not None and self.last_term_returned == term: |
paul@99 | 386 | return self.last_term_returned, self.last_positions_returned |
paul@98 | 387 | |
paul@97 | 388 | # Get the record providing a term less than or equal to the requested |
paul@97 | 389 | # term, getting the first entry if no such records exist. |
paul@97 | 390 | |
paul@98 | 391 | after = bisect_right(self.terms, term) |
paul@98 | 392 | before = max(0, after - 1) |
paul@98 | 393 | |
paul@98 | 394 | t, offset = self.records[before] |
paul@98 | 395 | terms_after = self.terms[after:] |
paul@97 | 396 | |
paul@97 | 397 | # Seek to the corresponding record in the information file. |
paul@98 | 398 | # Only do this if the term is more quickly reached by seeking. |
paul@97 | 399 | |
paul@100 | 400 | if term <= t or ( |
paul@100 | 401 | self.last_term_returned is None or term < self.last_term_returned or |
paul@100 | 402 | self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned |
paul@100 | 403 | ): |
paul@98 | 404 | |
paul@98 | 405 | self.reader.seek(offset) |
paul@100 | 406 | |
paul@100 | 407 | # Skip the term information, overwrite the reader's state. |
paul@100 | 408 | |
paul@100 | 409 | self.reader.begin_record() |
paul@100 | 410 | self.reader.read_term_only() |
paul@99 | 411 | self.reader.last_term = t |
paul@97 | 412 | |
paul@97 | 413 | # Where the found term is equal or greater, just read the positions for |
paul@97 | 414 | # the index entry. |
paul@97 | 415 | |
paul@97 | 416 | if t >= term: |
paul@97 | 417 | |
paul@100 | 418 | # Get the positions. |
paul@97 | 419 | |
paul@99 | 420 | self.last_term_returned, self.last_positions_returned = t, self.reader.read_positions() |
paul@99 | 421 | return self.last_term_returned, self.last_positions_returned |
paul@97 | 422 | |
paul@97 | 423 | # Where the found term is less, use the information file to find the |
paul@97 | 424 | # term or the one after. |
paul@97 | 425 | |
paul@97 | 426 | else: |
paul@97 | 427 | |
paul@100 | 428 | # Scan for the term. |
paul@97 | 429 | |
paul@97 | 430 | while t < term: |
paul@100 | 431 | t, dp = self.next() # remembers the term and positions |
paul@97 | 432 | |
paul@97 | 433 | return t, dp |
paul@97 | 434 | |
paul@100 | 435 | def __iter__(self): |
paul@100 | 436 | return self |
paul@100 | 437 | |
paul@100 | 438 | def next(self): |
paul@100 | 439 | self.last_term_returned, self.last_positions_returned = record = self.reader.next() |
paul@100 | 440 | return record |
paul@97 | 441 | |
paul@97 | 442 | def close(self): |
paul@97 | 443 | if self.reader is not None: |
paul@97 | 444 | self.reader.close() |
paul@97 | 445 | self.reader = None |
paul@97 | 446 | if self.index_reader is not None: |
paul@97 | 447 | self.index_reader.close() |
paul@97 | 448 | self.index_reader = None |
paul@97 | 449 | |
paul@97 | 450 | class MultipleReader(itermerge): |
paul@97 | 451 | |
paul@97 | 452 | "Accessing many term readers at once." |
paul@97 | 453 | |
paul@97 | 454 | def __init__(self, readers, combine=None): |
paul@97 | 455 | |
paul@97 | 456 | """ |
paul@97 | 457 | Initialise a master index reader using underlying 'readers' and a |
paul@97 | 458 | 'combine' function which knows how to combine position information from |
paul@97 | 459 | different sources. |
paul@97 | 460 | """ |
paul@97 | 461 | |
paul@97 | 462 | self.readers = readers |
paul@97 | 463 | self.combine = combine or operator.add |
paul@97 | 464 | |
paul@97 | 465 | # Initialise this object as an iterator over the readers. |
paul@97 | 466 | |
paul@97 | 467 | itermerge.__init__(self, self.readers) |
paul@97 | 468 | self.next_value = None |
paul@97 | 469 | |
paul@97 | 470 | def get_sizes(self): |
paul@97 | 471 | |
paul@97 | 472 | # Readers must have compatible sizes. |
paul@97 | 473 | |
paul@97 | 474 | if self.readers: |
paul@97 | 475 | return self.readers[0].get_sizes() |
paul@97 | 476 | else: |
paul@97 | 477 | return 0, 0 |
paul@97 | 478 | |
paul@97 | 479 | def go_to_term(self, term): |
paul@100 | 480 | |
paul@100 | 481 | # Refresh the iterators to provide sought values. |
paul@100 | 482 | |
paul@97 | 483 | self.iters = [] |
paul@97 | 484 | for reader in self.readers: |
paul@97 | 485 | try: |
paul@97 | 486 | insort_right(self.iters, (reader.go_to_term(term), reader.next)) |
paul@97 | 487 | except StopIteration: |
paul@97 | 488 | pass |
paul@100 | 489 | |
paul@97 | 490 | self.next_value = None |
paul@97 | 491 | return self.next() |
paul@97 | 492 | |
paul@97 | 493 | def next(self): |
paul@100 | 494 | |
paul@100 | 495 | # Return any prematurely read value. |
paul@100 | 496 | |
paul@97 | 497 | if self.next_value is not None: |
paul@97 | 498 | term, positions = self.next_value |
paul@100 | 499 | |
paul@100 | 500 | # Otherwise, get the next value. |
paul@100 | 501 | |
paul@97 | 502 | else: |
paul@97 | 503 | term, positions = itermerge.next(self) |
paul@97 | 504 | |
paul@100 | 505 | # Look at the next item to see if it has positions for the current term. |
paul@97 | 506 | |
paul@97 | 507 | try: |
paul@97 | 508 | t, p = itermerge.next(self) |
paul@97 | 509 | while t == term: |
paul@97 | 510 | positions = self.combine(positions, p) |
paul@97 | 511 | t, p = itermerge.next(self) |
paul@97 | 512 | self.next_value = t, p |
paul@97 | 513 | |
paul@97 | 514 | # Where an item could not be fetched, cause future requests to fail. |
paul@97 | 515 | |
paul@97 | 516 | except StopIteration: |
paul@97 | 517 | self.next_value = None |
paul@97 | 518 | |
paul@97 | 519 | return term, positions |
paul@97 | 520 | |
paul@97 | 521 | def close(self): |
paul@97 | 522 | for reader in self.readers: |
paul@97 | 523 | reader.close() |
paul@97 | 524 | self.readers = [] |
paul@97 | 525 | |
paul@44 | 526 | # vim: tabstop=4 expandtab shiftwidth=4 |