1 #!/usr/bin/env python 2 3 """ 4 Specific classes for storing term information. 5 6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from iixr.data import * 22 from iixr.files import * 23 from itermerge import itermerge 24 from os.path import commonprefix # to find common string prefixes 25 from bisect import bisect_right, insort_right 26 import operator 27 28 class TermWriter(FileWriter): 29 30 "Writing term information to files." 31 32 def begin(self, docnum_size, position_size): 33 34 "Begin writing to the file." 35 36 self.write_numbers((docnum_size, position_size)) 37 self.end_record() 38 39 self.data_start = self.tell() 40 self.docnum_size = docnum_size 41 self.position_size = position_size 42 self.subtractor = get_subtractor(docnum_size) 43 self.last_term = "" 44 45 def write_terms(self, terms): 46 47 """ 48 Write the 'terms' to the term information file, with each term's details 49 stored in a separate record. 50 """ 51 52 if hasattr(terms, "items"): 53 terms = terms.items() 54 terms.sort() 55 56 for term, doc_positions in terms: 57 if not doc_positions: 58 continue 59 60 if hasattr(doc_positions, "items"): 61 doc_positions = doc_positions.items() 62 63 docnum, positions = doc_positions[0] 64 65 if not positions: 66 continue 67 68 # Start the writing, if appropriate. 69 70 if self.data_start is None: 71 self.begin(sizeof(docnum), sizeof(positions[0])) 72 73 # Write each term and document positions. 74 75 self.write_term(term, doc_positions) 76 self.end_record() 77 78 # Methods requiring an open record. 79 80 def write_term(self, term, doc_positions): 81 82 """ 83 Write the given 'term', its document frequency (number of documents in 84 which it appears), and 'doc_positions' to the term information file. 85 """ 86 87 self.write_term_only(term) 88 89 # Write the document frequency and the term positions. 90 91 self.write_positions(doc_positions) 92 93 def write_term_plus_remaining(self, term, data): 94 95 "Write the given 'term' and the document position 'data'." 96 97 self.write_term_only(term) 98 self.write_remaining(data) 99 100 def write_term_only(self, term): 101 102 "Write only the given 'term'." 103 104 if term <= self.last_term: 105 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) 106 107 # Write the prefix length and term suffix. 108 109 common = len(commonprefix([self.last_term, term])) 110 suffix = term[common:] 111 112 self.write_number(common) 113 self.write_string(suffix) 114 115 self.last_term = term 116 117 def write_positions(self, doc_positions): 118 119 "Write the given 'doc_positions' to the file." 120 121 # Make sure that the positions are sorted. 122 123 doc_positions.sort() 124 125 # Write the document frequency. 126 127 self.write_number(len(doc_positions)) 128 129 last_docnum = None 130 131 for docnum, positions in doc_positions: 132 133 # Store the first document number as it is. 134 135 if last_docnum is None: 136 docnum_seq = docnum 137 138 # Reject out-of-order documents. 139 140 elif docnum < last_docnum: 141 raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum) 142 143 # Calculate an ongoing delta. 144 145 else: 146 docnum_seq = self.subtractor(docnum, last_docnum) 147 148 # Write the document number and positions. 149 150 self.write_sequence_value(docnum_seq, self.docnum_size) 151 self.write_monotonic_sequence(positions, self.position_size) 152 153 last_docnum = docnum 154 155 # Write a terminating byte to indicate that no more document pages 156 # exist. 157 158 self.write_byte(0) 159 160 class TermReader(FileReader): 161 162 "Reading term information from files." 163 164 def begin(self): 165 166 "Begin reading from the file." 167 168 self.begin_record() 169 try: 170 self.docnum_size, self.position_size = self.read_numbers(2) 171 except EOFError: 172 self.docnum_size, self.position_size = 0, 0 # NOTE: No positions! 173 174 self.data_start = self.tell() 175 self.adder = get_adder(self.docnum_size) 176 self.last_term = "" 177 178 def get_sizes(self): 179 return self.docnum_size, self.position_size 180 181 # Methods requiring an open record. 182 183 def read_term(self): 184 185 "Read a term and its document positions from the term information file." 186 187 # Read the term. 188 189 self.read_term_only() 190 191 # Read the document frequency and the term positions. 192 193 positions = self.read_positions() 194 195 return self.last_term, positions 196 197 def read_term_plus_remaining(self): 198 199 """ 200 Read a term and the unprocessed document position data. 201 """ 202 203 self.read_term_only() 204 return self.last_term, self.read_remaining() 205 206 def read_term_only(self): 207 208 "Read a term only." 209 210 # Read the prefix length and term suffix. 211 212 common = self.read_number() 213 suffix = self.read_string() 214 215 self.last_term = self.last_term[:common] + suffix 216 return self.last_term 217 218 def read_positions(self): 219 220 "Read document positions from the term information file." 221 222 doc_positions = [] 223 224 while 1: 225 226 # Read the document frequency. 227 228 npositions = self.read_number() 229 230 last_docnum = None 231 i = 0 232 while i < npositions: 233 234 # Read the document number. 235 236 docnum = self.read_sequence_value(self.docnum_size) 237 if last_docnum is not None: 238 docnum = self.adder(docnum, last_docnum) 239 240 # Read the positions. 241 242 positions = self.read_monotonic_sequence(self.position_size) 243 doc_positions.append((docnum, positions)) 244 245 last_docnum = docnum 246 i += 1 247 248 # Read a terminating byte to discover whether more document pages 249 # exist. 250 251 if not self.read_byte(): 252 break 253 254 return doc_positions 255 256 # Indexes covering the information files. 257 258 class TermIndexWriter(FileWriter): 259 260 "Writing term index information to files." 261 262 def begin(self): 263 264 "Begin writing to the file." 265 266 self.data_start = self.tell() 267 self.last_term = "" 268 self.last_offset = 0 269 270 def write_term(self, term, offset): 271 272 "Write the given 'term' and 'offset'." 273 274 if term <= self.last_term: 275 raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) 276 277 # Write the prefix length and term suffix. 278 279 common = len(commonprefix([self.last_term, term])) 280 suffix = term[common:] 281 282 self.write_number(common) 283 self.write_string(suffix) 284 285 # Write the offset delta. 286 287 self.write_number(offset - self.last_offset) 288 289 self.last_term = term 290 self.last_offset = offset 291 292 class TermIndexReader(FileReader): 293 294 "Reading term index information to files." 295 296 def begin(self): 297 298 "Begin reading from the file." 299 300 self.data_start = self.tell() 301 self.last_term = "" 302 self.last_offset = 0 303 304 def read_term(self): 305 306 "Read a term and an offset from the file." 307 308 # Read the prefix length and term suffix. 309 310 common = self.read_number() 311 suffix = self.read_string() 312 313 self.last_term = self.last_term[:common] + suffix 314 315 # Read the offset delta. 316 317 self.last_offset += self.read_number() 318 return self.last_term, self.last_offset 319 320 # Iterator support classes. 321 322 class Iterator: 323 324 "Common iterator support." 325 326 def __init__(self): 327 328 "Cache the last term and positions." 329 330 self.last_term_returned = None 331 self.last_positions_returned = None 332 333 def go_to_term(self, term): 334 if term == self.last_term_returned: 335 return self.last_term_returned, self.last_positions_returned 336 337 t, dp = self.next() 338 while t < term: 339 t, dp = self.next() 340 self.last_term_returned, self.last_positions_returned = t, dp 341 return t, dp 342 343 def __iter__(self): 344 return self 345 346 def next(self): 347 self.last_term_returned, self.last_positions_returned = t = self._next() 348 return t 349 350 # External reading classes. 351 352 class TermIterator(TermReader, Iterator): 353 354 "An iterator over terms and positions read from a file." 355 356 def __init__(self, f): 357 TermReader.__init__(self, f) 358 Iterator.__init__(self) 359 360 def _next(self): 361 try: 362 self.begin_record() 363 return self.read_term() 364 except EOFError: 365 raise StopIteration 366 367 class TermDataIterator(TermReader, Iterator): 368 369 "An iterator over terms and unprocessed document positions data." 370 371 def __init__(self, f): 372 TermReader.__init__(self, f) 373 Iterator.__init__(self) 374 375 def _next(self): 376 try: 377 self.begin_record() 378 return self.read_term_plus_remaining() 379 except EOFError: 380 raise StopIteration 381 382 class TermIndexIterator(TermIndexReader): 383 384 "An iterator over terms and offsets read from a file." 385 386 def __iter__(self): 387 return self 388 389 def next(self): 390 try: 391 self.begin_record() 392 return self.read_term() 393 except EOFError: 394 raise StopIteration 395 396 class CombinedIterator(Iterator): 397 398 "An iterator providing index and information file access." 399 400 def __init__(self, reader, index_reader): 401 Iterator.__init__(self) 402 self.reader = reader 403 self.index_reader = index_reader 404 self.records = list(index_reader) 405 self.terms = [t for t, dp in self.records] 406 407 def go_to_term(self, term): 408 409 """ 410 Return the 'term' and positions or nearest following term and positions. 411 """ 412 413 if self.last_term_returned == term: 414 return self.last_term_returned, self.last_positions_returned 415 416 # Get the record providing a term less than or equal to the requested 417 # term, getting the first entry if no such records exist. 418 419 after = bisect_right(self.terms, term) 420 before = max(0, after - 1) 421 422 t, offset = self.records[before] 423 terms_after = self.terms[after:] 424 425 # Seek to the corresponding record in the information file. 426 # Only do this if the term is more quickly reached by seeking. 427 428 if term <= t or self.last_term_returned is None or term <= self.last_term_returned or \ 429 self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned: 430 431 self.reader.seek(offset) 432 self.reader.last_term = t 433 434 # Where the found term is equal or greater, just read the positions for 435 # the index entry. 436 437 if t >= term: 438 439 # Skip the term information, overwrite the reader's state, and get 440 # the positions. 441 442 self.reader.begin_record() 443 self.reader.read_term_only() 444 self.reader.last_term = t 445 446 self.last_term_returned, self.last_positions_returned = t, self.reader.read_positions() 447 return self.last_term_returned, self.last_positions_returned 448 449 # Where the found term is less, use the information file to find the 450 # term or the one after. 451 452 else: 453 454 # Overwrite the reader's state, then scan for the term. 455 456 t, dp = self.reader.next() 457 while t < term: 458 t, dp = self.reader.next() 459 460 self.last_term_returned, self.last_positions_returned = t, dp 461 return t, dp 462 463 def _next(self): 464 return self.reader.next() 465 466 def close(self): 467 if self.reader is not None: 468 self.reader.close() 469 self.reader = None 470 if self.index_reader is not None: 471 self.index_reader.close() 472 self.index_reader = None 473 474 class MultipleReader(itermerge): 475 476 "Accessing many term readers at once." 477 478 def __init__(self, readers, combine=None): 479 480 """ 481 Initialise a master index reader using underlying 'readers' and a 482 'combine' function which knows how to combine position information from 483 different sources. 484 """ 485 486 self.readers = readers 487 self.combine = combine or operator.add 488 489 # Initialise this object as an iterator over the readers. 490 491 itermerge.__init__(self, self.readers) 492 self.next_value = None 493 494 def get_sizes(self): 495 496 # Readers must have compatible sizes. 497 498 if self.readers: 499 return self.readers[0].get_sizes() 500 else: 501 return 0, 0 502 503 def go_to_term(self, term): 504 self.iters = [] 505 for reader in self.readers: 506 try: 507 insort_right(self.iters, (reader.go_to_term(term), reader.next)) 508 except StopIteration: 509 pass 510 self.next_value = None 511 return self.next() 512 513 def next(self): 514 if self.next_value is not None: 515 term, positions = self.next_value 516 else: 517 term, positions = itermerge.next(self) 518 519 # Look at the next item to see if it is has positions for the current 520 # term. 521 522 try: 523 t, p = itermerge.next(self) 524 while t == term: 525 positions = self.combine(positions, p) 526 t, p = itermerge.next(self) 527 self.next_value = t, p 528 529 # Where an item could not be fetched, cause future requests to fail. 530 531 except StopIteration: 532 self.next_value = None 533 534 return term, positions 535 536 def close(self): 537 for reader in self.readers: 538 reader.close() 539 self.readers = [] 540 541 # vim: tabstop=4 expandtab shiftwidth=4