1 #!/usr/bin/env python 2 3 """ 4 A simple (and sane) text indexing library. 5 6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 """ 20 21 from os import dup, fdopen # independent iterator access to files 22 from os import listdir, mkdir # index and partition discovery 23 from os import remove, rename # partition manipulation 24 from os.path import exists, join 25 from os.path import commonprefix # to find common string prefixes 26 from bisect import bisect_right # to find terms in the dictionary index 27 import bz2, zlib # for field compression 28 from itermerge import itermerge 29 30 try: 31 set 32 except NameError: 33 from sets import Set as set 34 35 # Constants. 36 37 TERM_INTERVAL = 100 38 DOCUMENT_INTERVAL = 100 39 FIELD_INTERVAL = 100 40 FLUSH_INTERVAL = 10000 41 42 TERM_FILENAMES = "terms", "terms_index", "positions", "positions_index" 43 FIELD_FILENAMES = "fields", "fields_index" 44 45 compressors = [("b", bz2.compress), ("z", zlib.compress)] 46 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress} 47 48 # Utility functions. 49 50 try: 51 from vint import vint as _vint 52 53 def vint(number): 54 55 "Write 'number' as a variable-length integer." 56 57 if number >= 0: 58 return _vint(number) 59 else: 60 raise ValueError, "Number %r is negative." % number 61 62 except ImportError: 63 64 def vint(number): 65 66 "Write 'number' as a variable-length integer." 67 68 if number >= 0: 69 70 # Special case: one byte containing a 7-bit number. 71 72 if number < 128: 73 return chr(number) 74 75 # Write the number from least to most significant digits. 76 77 bytes = [] 78 79 while number != 0: 80 lsd = number & 127 81 number = number >> 7 82 if number != 0: 83 lsd |= 128 84 bytes.append(chr(lsd)) 85 86 return "".join(bytes) 87 88 # Negative numbers are not supported. 89 90 else: 91 raise ValueError, "Number %r is negative." % number 92 93 # Foundation classes. 94 95 class File: 96 97 "A basic file abstraction." 98 99 def __init__(self, f): 100 self.f = f 101 self.reset() 102 self.cache = [] 103 self.cache_length = 0 104 105 def reset(self): 106 107 "To be used to reset the state of the reader or writer between records." 108 109 pass 110 111 def rewind(self): 112 self.f.seek(0) 113 self.reset() 114 115 def write(self, s): 116 self.cache.append(s) 117 self.cache_length += len(s) 118 if len(self.cache) >= 1000: 119 self.flush() 120 121 def tell(self): 122 return self.f.tell() + self.cache_length 123 124 def flush(self): 125 self.f.write("".join(self.cache)) 126 self.cache = [] 127 self.cache_length = 0 128 129 def close(self): 130 if self.f is not None: 131 self.flush() 132 self.f.close() 133 self.f = None 134 135 class FileWriter(File): 136 137 "Writing basic data types to files." 138 139 def write_number(self, number): 140 141 "Write 'number' to the file using a variable length encoding." 142 143 self.write(vint(number)) 144 145 def write_string(self, s, compress=0): 146 147 """ 148 Write 's' to the file, recording its length and compressing the string 149 if 'compress' is set to a true value. 150 """ 151 152 # Convert Unicode objects to strings. 153 154 if isinstance(s, unicode): 155 s = s.encode("utf-8") 156 157 # Compress the string if requested. 158 159 if compress: 160 for flag, fn in compressors: 161 cs = fn(s) 162 163 # Take the first string shorter than the original. 164 165 if len(cs) < len(s): 166 s = cs 167 break 168 else: 169 flag = "-" 170 171 else: 172 flag = "" 173 174 # Write the length of the data before the data itself. 175 176 length = len(s) 177 self.write(flag + vint(length) + s) 178 179 class FileReader(File): 180 181 "Reading basic data types from files." 182 183 def read_number(self): 184 185 "Read a number from the file." 186 187 # Read each byte, adding it to the number. 188 189 shift = 0 190 number = 0 191 read = self.f.read 192 193 try: 194 csd = ord(read(1)) 195 while csd & 128: 196 number += ((csd & 127) << shift) 197 shift += 7 198 csd = ord(read(1)) 199 else: 200 number += (csd << shift) 201 except TypeError: 202 raise EOFError 203 204 return number 205 206 def read_string(self, decompress=0): 207 208 """ 209 Read a string from the file, decompressing the stored data if 210 'decompress' is set to a true value. 211 """ 212 213 # Decompress the data if requested. 214 215 if decompress: 216 flag = self.f.read(1) 217 else: 218 flag = "-" 219 220 length = self.read_number() 221 s = self.f.read(length) 222 223 # Perform decompression if applicable. 224 225 if flag != "-": 226 fn = decompressors[flag] 227 s = fn(s) 228 229 # Convert strings to Unicode objects. 230 231 return unicode(s, "utf-8") 232 233 class FileOpener: 234 235 "Opening files using their filenames." 236 237 def __init__(self, filename): 238 self.filename = filename 239 240 def open(self, mode): 241 return open(self.filename, mode) 242 243 def close(self): 244 pass 245 246 # Specific classes for storing term and position information. 247 248 class PositionWriter(FileWriter): 249 250 "Writing position information to files." 251 252 def reset(self): 253 self.last_docnum = 0 254 255 def write_positions(self, docnum, positions): 256 257 """ 258 Write for the document 'docnum' the given 'positions'. 259 Return the offset of the written record. 260 """ 261 262 if docnum < self.last_docnum: 263 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum) 264 265 # Record the offset of this record. 266 267 offset = self.tell() 268 269 # Make sure that the positions are sorted. 270 271 positions.sort() 272 273 # Write the position deltas. 274 275 output = [] 276 last = 0 277 278 for position in positions: 279 output.append(vint(position - last)) 280 last = position 281 282 # Write the document number delta. 283 # Write the number of positions. 284 # Then write the positions. 285 286 self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output)) 287 288 self.last_docnum = docnum 289 return offset 290 291 class PositionOpener(FileOpener): 292 293 "Reading position information from files." 294 295 def read_term_positions(self, offset, count): 296 297 """ 298 Read all positions from 'offset', seeking to that position in the file 299 before reading. The number of documents available for reading is limited 300 to 'count'. 301 """ 302 303 # Duplicate the file handle. 304 305 f = self.open("rb") 306 f.seek(offset) 307 return PositionIterator(f, count) 308 309 class PositionIndexWriter(FileWriter): 310 311 "Writing position index information to files." 312 313 def reset(self): 314 self.last_docnum = 0 315 self.last_pos_offset = 0 316 317 def write_positions(self, docnum, pos_offset, count): 318 319 """ 320 Write the given 'docnum, 'pos_offset' and document 'count' to the 321 position index file. 322 """ 323 324 # Record the offset of this record. 325 326 offset = self.tell() 327 output = [] 328 329 # Write the document number delta. 330 331 output.append(vint(docnum - self.last_docnum)) 332 self.last_docnum = docnum 333 334 # Write the position file offset delta. 335 336 output.append(vint(pos_offset - self.last_pos_offset)) 337 self.last_pos_offset = pos_offset 338 339 # Write the document count. 340 341 output.append(vint(count)) 342 343 # Actually write the data. 344 345 self.write("".join(output)) 346 347 return offset 348 349 class PositionIndexOpener(FileOpener): 350 351 "Reading position index information from files." 352 353 def read_term_positions(self, offset, doc_frequency): 354 355 """ 356 Read all positions from 'offset', seeking to that position in the file 357 before reading. The number of documents available for reading is limited 358 to 'doc_frequency'. 359 """ 360 361 # Duplicate the file handle. 362 363 f = self.open("rb") 364 f.seek(offset) 365 return PositionIndexIterator(f, doc_frequency) 366 367 # Iterators for position-related files. 368 369 class IteratorBase: 370 371 def __init__(self, count): 372 self.replenish(count) 373 374 def replenish(self, count): 375 self.count = count 376 self.read_documents = 0 377 378 def __len__(self): 379 return self.count 380 381 def sort(self): 382 pass # Stored document positions are already sorted. 383 384 def __iter__(self): 385 return self 386 387 class PositionIterator(FileReader, IteratorBase): 388 389 "Iterating over document positions." 390 391 def __init__(self, f, count): 392 FileReader.__init__(self, f) 393 IteratorBase.__init__(self, count) 394 395 def reset(self): 396 self.last_docnum = 0 397 398 def read_positions(self): 399 400 "Read positions, returning a document number and a list of positions." 401 402 # Read the document number delta and add it to the last number. 403 404 self.last_docnum += self.read_number() 405 406 # Read the number of positions. 407 408 npositions = self.read_number() 409 410 # Read the position deltas, adding each previous position to get the 411 # appropriate collection of absolute positions. 412 413 i = 0 414 last = 0 415 positions = [] 416 417 while i < npositions: 418 last += self.read_number() 419 positions.append(last) 420 i += 1 421 422 return self.last_docnum, positions 423 424 def next(self): 425 426 "Read positions for a single document." 427 428 if self.read_documents < self.count: 429 self.read_documents += 1 430 return self.read_positions() 431 else: 432 raise StopIteration 433 434 class PositionIndexIterator(FileReader, IteratorBase): 435 436 "Iterating over document positions." 437 438 def __init__(self, f, count): 439 FileReader.__init__(self, f) 440 IteratorBase.__init__(self, count) 441 self.section_count = 0 442 443 def reset(self): 444 self.last_docnum = 0 445 self.last_pos_offset = 0 446 447 def read_positions(self): 448 449 """ 450 Read a document number, a position file offset for the position index 451 file, and the number of documents in a section of that file. 452 """ 453 454 # Read the document number delta. 455 456 self.last_docnum += self.read_number() 457 458 # Read the offset delta. 459 460 self.last_pos_offset += self.read_number() 461 462 # Read the document count. 463 464 count = self.read_number() 465 466 return self.last_docnum, self.last_pos_offset, count 467 468 def next(self): 469 470 "Read positions for a single document." 471 472 self.read_documents += self.section_count 473 if self.read_documents < self.count: 474 docnum, pos_offset, self.section_count = t = self.read_positions() 475 return t 476 else: 477 raise StopIteration 478 479 class PositionDictionaryWriter: 480 481 "Writing position dictionaries." 482 483 def __init__(self, position_writer, position_index_writer, interval): 484 self.position_writer = position_writer 485 self.position_index_writer = position_index_writer 486 self.interval = interval 487 488 def write_term_positions(self, doc_positions): 489 490 """ 491 Write all 'doc_positions' - a collection of tuples of the form (document 492 number, position list) - to the file. 493 494 Add some records to the index, making dictionary entries. 495 496 Return a tuple containing the offset of the written data, the frequency 497 (number of positions), and document frequency (number of documents) for 498 the term involved. 499 """ 500 501 # Reset the writers. 502 503 self.position_writer.reset() 504 self.position_index_writer.reset() 505 506 index_offset = None 507 508 # Write the positions. 509 510 frequency = 0 511 first_docnum = None 512 first_offset = None 513 count = 0 514 515 doc_positions.sort() 516 517 for docnum, positions in doc_positions: 518 pos_offset = self.position_writer.write_positions(docnum, positions) 519 520 # Retain the first record offset for a subsequent index entry. 521 522 if first_offset is None: 523 first_offset = pos_offset 524 first_docnum = docnum 525 526 frequency += len(positions) 527 count += 1 528 529 # Every {interval} entries, write an index entry. 530 531 if count % self.interval == 0: 532 io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval) 533 534 # Remember the first index entry offset. 535 536 if index_offset is None: 537 index_offset = io 538 539 first_offset = None 540 first_docnum = None 541 542 # Reset the position writer so that position readers accessing 543 # a section start with the correct document number. 544 545 self.position_writer.reset() 546 547 # Finish writing an index entry for the remaining documents. 548 549 else: 550 if first_offset is not None: 551 io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval) 552 553 # Remember the first index entry offset. 554 555 if index_offset is None: 556 index_offset = io 557 558 return index_offset, frequency, count 559 560 def close(self): 561 self.position_writer.close() 562 self.position_index_writer.close() 563 564 class PositionDictionaryReader: 565 566 "Reading position dictionaries." 567 568 def __init__(self, position_opener, position_index_opener): 569 self.position_opener = position_opener 570 self.position_index_opener = position_index_opener 571 572 def read_term_positions(self, offset, doc_frequency): 573 574 """ 575 Return an iterator for dictionary entries starting at 'offset' with the 576 given 'doc_frequency'. 577 """ 578 579 return PositionDictionaryIterator(self.position_opener, 580 self.position_index_opener, offset, doc_frequency) 581 582 def close(self): 583 pass 584 585 class PositionDictionaryIterator: 586 587 "Iteration over position dictionary entries." 588 589 def __init__(self, position_opener, position_index_opener, offset, doc_frequency): 590 self.position_opener = position_opener 591 self.doc_frequency = doc_frequency 592 self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency) 593 self.iterator = None 594 595 # Remember the last values. 596 597 self.found_docnum, self.found_positions = None, None 598 599 # Maintain state for the next index entry, if read. 600 601 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 602 603 # Initialise the current index entry and current position file iterator. 604 605 self._next_section() 606 self._init_section() 607 608 # Sequence methods. 609 610 def __len__(self): 611 return self.doc_frequency 612 613 def sort(self): 614 pass 615 616 # Iterator methods. 617 618 def __iter__(self): 619 return self 620 621 def next(self): 622 623 """ 624 Attempt to get the next document record from the section in the 625 positions file. 626 """ 627 628 # Return any visited but unrequested record. 629 630 if self.found_docnum is not None: 631 t = self.found_docnum, self.found_positions 632 self.found_docnum, self.found_positions = None, None 633 return t 634 635 # Or search for the next record. 636 637 while 1: 638 639 # Either return the next record. 640 641 try: 642 return self.iterator.next() 643 644 # Or, where a section is finished, get the next section and try again. 645 646 except StopIteration: 647 648 # Where a section follows, update the index iterator, but keep 649 # reading using the same file iterator (since the data should 650 # just follow on from the last section). 651 652 self._next_section() 653 self.iterator.replenish(self.section_count) 654 655 # Reset the state of the iterator to make sure that document 656 # numbers are correct. 657 658 self.iterator.reset() 659 660 def from_document(self, docnum): 661 662 """ 663 Attempt to navigate to a positions entry for the given 'docnum', 664 returning the positions for 'docnum', or None otherwise. 665 """ 666 667 # Return any unrequested document positions. 668 669 if docnum == self.found_docnum: 670 return self.found_positions 671 672 # Read ahead in the index until the next entry refers to a document 673 # later than the desired document. 674 675 try: 676 if self.next_docnum is None: 677 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 678 679 # Read until the next entry is after the desired document number, 680 # or until the end of the results. 681 682 while self.next_docnum <= docnum: 683 self._next_read_section() 684 if self.docnum < docnum: 685 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next() 686 else: 687 break 688 689 except StopIteration: 690 pass 691 692 # Navigate in the position file to the document. 693 694 self._init_section() 695 696 try: 697 while 1: 698 found_docnum, found_positions = self.iterator.next() 699 700 # Return the desired document positions or None (retaining the 701 # positions for the document immediately after). 702 703 if docnum == found_docnum: 704 return found_positions 705 elif docnum < found_docnum: 706 self.found_docnum, self.found_positions = found_docnum, found_positions 707 return None 708 709 except StopIteration: 710 return None 711 712 # Internal methods. 713 714 def _next_section(self): 715 716 "Attempt to get the next section in the index." 717 718 if self.next_docnum is None: 719 self.docnum, self.pos_offset, self.section_count = self.index_iterator.next() 720 else: 721 self._next_read_section() 722 723 def _next_read_section(self): 724 725 """ 726 Make the next index entry the current one without reading from the 727 index. 728 """ 729 730 self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count 731 self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None 732 733 def _init_section(self): 734 735 "Initialise the iterator for the section in the position file." 736 737 if self.iterator is not None: 738 self.iterator.close() 739 self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count) 740 741 def close(self): 742 if self.iterator is not None: 743 self.iterator.close() 744 self.iterator = None 745 if self.index_iterator is not None: 746 self.index_iterator.close() 747 self.index_iterator = None 748 749 class TermWriter(FileWriter): 750 751 "Writing term information to files." 752 753 def reset(self): 754 self.last_term = "" 755 self.last_offset = 0 756 757 def write_term(self, term, offset, frequency, doc_frequency): 758 759 """ 760 Write the given 'term', its position file 'offset', its 'frequency' and 761 its 'doc_frequency' (number of documents in which it appears) to the 762 term information file. Return the offset after the term information was 763 written to the file. 764 """ 765 766 # Write the prefix length and term suffix. 767 768 common = len(commonprefix([self.last_term, term])) 769 suffix = term[common:] 770 771 self.write_number(common) 772 self.write_string(suffix) 773 774 # Write the offset delta. 775 776 self.write_number(offset - self.last_offset) 777 778 # Write the frequency. 779 780 self.write_number(frequency) 781 782 # Write the document frequency. 783 784 self.write_number(doc_frequency) 785 786 self.last_term = term 787 self.last_offset = offset 788 789 return self.tell() 790 791 class TermReader(FileReader): 792 793 "Reading term information from files." 794 795 def reset(self): 796 self.last_term = "" 797 self.last_offset = 0 798 799 def read_term(self): 800 801 """ 802 Read a term, its position file offset, its frequency and its document 803 frequency from the term information file. 804 """ 805 806 # Read the prefix length and term suffix. 807 808 common = self.read_number() 809 suffix = self.read_string() 810 811 self.last_term = self.last_term[:common] + suffix 812 813 # Read the offset delta. 814 815 self.last_offset += self.read_number() 816 817 # Read the frequency. 818 819 frequency = self.read_number() 820 821 # Read the document frequency. 822 823 doc_frequency = self.read_number() 824 825 return self.last_term, self.last_offset, frequency, doc_frequency 826 827 def go_to_term(self, term, offset, info_offset): 828 829 """ 830 Seek past the entry for 'term' having 'offset' to 'info_offset'. This 831 permits the scanning for later terms from the specified term. 832 """ 833 834 self.f.seek(info_offset) 835 self.last_term = term 836 self.last_offset = offset 837 838 class TermIndexWriter(TermWriter): 839 840 "Writing term dictionary index details to files." 841 842 def reset(self): 843 TermWriter.reset(self) 844 self.last_info_offset = 0 845 846 def write_term(self, term, offset, frequency, doc_frequency, info_offset): 847 848 """ 849 Write the given 'term', its position file 'offset', its 'frequency' and 850 its 'doc_frequency' to the term dictionary index file, along with the 851 'info_offset' in the term information file. 852 """ 853 854 TermWriter.write_term(self, term, offset, frequency, doc_frequency) 855 856 # Write the information file offset delta. 857 858 self.write_number(info_offset - self.last_info_offset) 859 self.last_info_offset = info_offset 860 861 class TermIndexReader(TermReader): 862 863 "Reading term dictionary index details from files." 864 865 def reset(self): 866 TermReader.reset(self) 867 self.last_info_offset = 0 868 869 def read_term(self): 870 871 """ 872 Read a term, its position file offset, its frequency, its document 873 frequency and a term information file offset from the term dictionary 874 index file. 875 """ 876 877 term, offset, frequency, doc_frequency = TermReader.read_term(self) 878 879 # Read the offset delta. 880 881 self.last_info_offset += self.read_number() 882 883 return term, offset, frequency, doc_frequency, self.last_info_offset 884 885 class TermDictionaryWriter: 886 887 "Writing term dictionaries." 888 889 def __init__(self, info_writer, index_writer, position_dict_writer, interval): 890 self.info_writer = info_writer 891 self.index_writer = index_writer 892 self.position_dict_writer = position_dict_writer 893 self.interval = interval 894 self.entry = 0 895 896 def _write_term(self, term, offset, frequency, doc_frequency): 897 898 """ 899 Write the given 'term', its position file 'offset', its 'frequency' and 900 its 'doc_frequency' (number of documents in which it appears) to the 901 term information file. Return the offset after the term information was 902 written to the file. 903 """ 904 905 info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency) 906 907 if self.entry % self.interval == 0: 908 self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset) 909 910 self.entry += 1 911 912 def write_term_positions(self, term, doc_positions): 913 914 """ 915 Write the given 'term' and the 'doc_positions' recording the documents 916 and positions at which the term is found. 917 """ 918 919 offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions) 920 self._write_term(term, offset, frequency, doc_frequency) 921 922 def close(self): 923 self.info_writer.close() 924 self.index_writer.close() 925 self.position_dict_writer.close() 926 927 class TermDictionaryReader: 928 929 "Reading term dictionaries." 930 931 def __init__(self, info_reader, index_reader, position_dict_reader): 932 self.info_reader = info_reader 933 self.index_reader = index_reader 934 self.position_dict_reader = position_dict_reader 935 936 self.terms = [] 937 try: 938 while 1: 939 self.terms.append(self.index_reader.read_term()) 940 except EOFError: 941 pass 942 943 # Large numbers for ordering purposes. 944 945 if self.terms: 946 self.max_offset = self.terms[-1][1] + 1 947 else: 948 self.max_offset = None 949 950 def _find_closest_entry(self, term): 951 952 """ 953 Find the offsets and frequencies of 'term' from the term dictionary or 954 the closest term starting with the value of 'term'. 955 956 Return the closest index entry consisting of a term, the position file 957 offset, the term frequency, the document frequency, and the term details 958 file offset. 959 """ 960 961 i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1 962 963 # Get the entry position providing the term or one preceding it. 964 # If no entry precedes the requested term, return the very first entry 965 # as the closest. 966 967 if i == -1: 968 return self.terms[0] 969 else: 970 return self.terms[i] 971 972 def _find_closest_term(self, term): 973 974 """ 975 Find the offsets and frequencies of 'term' from the term dictionary or 976 the closest term starting with the value of 'term'. 977 978 Return the closest term (or the term itself), the position file offset, 979 the term frequency, the document frequency, and the term details file 980 offset (or None if the reader is already positioned). 981 """ 982 983 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term) 984 985 # Where the term is found immediately, return the offset and 986 # frequencies. If the term does not appear, return the details of the 987 # closest entry. 988 989 if term <= found_term: 990 return found_term, offset, frequency, doc_frequency, info_offset 991 992 # Otherwise, seek past the index term's entry in the information file 993 # and scan for the desired term. 994 995 else: 996 self.info_reader.go_to_term(found_term, offset, info_offset) 997 try: 998 while term > found_term: 999 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 1000 except EOFError: 1001 pass 1002 1003 return found_term, offset, frequency, doc_frequency, None 1004 1005 def _find_term(self, term): 1006 1007 """ 1008 Find the position file offset and frequency of 'term' from the term 1009 dictionary. 1010 """ 1011 1012 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 1013 1014 # If the term is found, return the offset and frequencies. 1015 1016 if term == found_term: 1017 return offset, frequency, doc_frequency 1018 else: 1019 return None 1020 1021 def _get_positions(self, offset, doc_frequency): 1022 return self.position_dict_reader.read_term_positions(offset, doc_frequency) 1023 1024 # Iterator convenience methods. 1025 1026 def __iter__(self): 1027 self.rewind() 1028 return self 1029 1030 def next(self): 1031 try: 1032 return self.read_term() 1033 except EOFError: 1034 raise StopIteration 1035 1036 # Sequential access methods. 1037 1038 def rewind(self): 1039 self.info_reader.rewind() 1040 1041 def read_term(self): 1042 1043 """ 1044 Return the next term, its frequency, its document frequency, and the 1045 documents and positions at which the term is found. 1046 """ 1047 1048 term, offset, frequency, doc_frequency = self.info_reader.read_term() 1049 positions = self._get_positions(offset, doc_frequency) 1050 return term, frequency, doc_frequency, positions 1051 1052 # Query methods. 1053 1054 def find_terms(self, term): 1055 1056 "Return all terms whose values start with the value of 'term'." 1057 1058 terms = [] 1059 1060 found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term) 1061 1062 # Position the reader, if necessary. 1063 1064 if info_offset is not None: 1065 self.info_reader.go_to_term(found_term, offset, info_offset) 1066 1067 # Read and record terms. 1068 1069 try: 1070 # Add the found term if it starts with the specified term. 1071 1072 while found_term.startswith(term): 1073 terms.append(found_term) 1074 found_term, offset, frequency, doc_frequency = self.info_reader.read_term() 1075 1076 except EOFError: 1077 pass 1078 1079 return terms 1080 1081 def find_positions(self, term): 1082 1083 "Return the documents and positions at which the given 'term' is found." 1084 1085 t = self._find_term(term) 1086 if t is None: 1087 return None 1088 else: 1089 offset, frequency, doc_frequency = t 1090 return self._get_positions(offset, doc_frequency) 1091 1092 def get_frequency(self, term): 1093 1094 "Return the frequency of the given 'term'." 1095 1096 t = self._find_term(term) 1097 if t is None: 1098 return None 1099 else: 1100 offset, frequency, doc_frequency = t 1101 return frequency 1102 1103 def get_document_frequency(self, term): 1104 1105 "Return the document frequency of the given 'term'." 1106 1107 t = self._find_term(term) 1108 if t is None: 1109 return None 1110 else: 1111 offset, frequency, doc_frequency = t 1112 return doc_frequency 1113 1114 def close(self): 1115 self.info_reader.close() 1116 self.index_reader.close() 1117 self.position_dict_reader.close() 1118 1119 # Specific classes for storing document information. 1120 1121 class FieldWriter(FileWriter): 1122 1123 "Writing field data to files." 1124 1125 def reset(self): 1126 self.last_docnum = 0 1127 1128 def write_fields(self, docnum, fields): 1129 1130 """ 1131 Write for the given 'docnum', a list of 'fields' (integer, string pairs 1132 representing field identifiers and values respectively). 1133 Return the offset at which the fields are stored. 1134 """ 1135 1136 offset = self.tell() 1137 1138 # Write the document number delta. 1139 1140 self.write_number(docnum - self.last_docnum) 1141 1142 # Write the number of fields. 1143 1144 self.write_number(len(fields)) 1145 1146 # Write the fields themselves. 1147 1148 for i, field in fields: 1149 self.write_number(i) 1150 self.write_string(field, 1) # compress 1151 1152 self.last_docnum = docnum 1153 return offset 1154 1155 class FieldReader(FileReader): 1156 1157 "Reading field data from files." 1158 1159 def reset(self): 1160 self.last_docnum = 0 1161 1162 def read_fields(self): 1163 1164 """ 1165 Read fields from the file, returning a tuple containing the document 1166 number and a list of field (identifier, value) pairs. 1167 """ 1168 1169 # Read the document number. 1170 1171 self.last_docnum += self.read_number() 1172 1173 # Read the number of fields. 1174 1175 nfields = self.read_number() 1176 1177 # Collect the fields. 1178 1179 fields = [] 1180 i = 0 1181 1182 while i < nfields: 1183 identifier = self.read_number() 1184 value = self.read_string(1) # decompress 1185 fields.append((identifier, value)) 1186 i += 1 1187 1188 return self.last_docnum, fields 1189 1190 def read_document_fields(self, docnum, offset): 1191 1192 """ 1193 Read fields for 'docnum' at the given 'offset'. This permits the 1194 retrieval of details for the specified document, as well as scanning for 1195 later documents. 1196 """ 1197 1198 self.f.seek(offset) 1199 bad_docnum, fields = self.read_fields() 1200 self.last_docnum = docnum 1201 return docnum, fields 1202 1203 class FieldIndexWriter(FileWriter): 1204 1205 "Writing field index details to files." 1206 1207 def reset(self): 1208 self.last_docnum = 0 1209 self.last_offset = 0 1210 1211 def write_document(self, docnum, offset): 1212 1213 """ 1214 Write for the given 'docnum', the 'offset' at which the fields for the 1215 document are stored in the fields file. 1216 """ 1217 1218 # Write the document number and offset deltas. 1219 1220 self.write_number(docnum - self.last_docnum) 1221 self.write_number(offset - self.last_offset) 1222 1223 self.last_docnum = docnum 1224 self.last_offset = offset 1225 1226 class FieldIndexReader(FileReader): 1227 1228 "Reading field index details from files." 1229 1230 def reset(self): 1231 self.last_docnum = 0 1232 self.last_offset = 0 1233 1234 def read_document(self): 1235 1236 "Read a document number and field file offset." 1237 1238 # Read the document number delta and offset. 1239 1240 self.last_docnum += self.read_number() 1241 self.last_offset += self.read_number() 1242 1243 return self.last_docnum, self.last_offset 1244 1245 class FieldDictionaryWriter: 1246 1247 "Writing field dictionary details." 1248 1249 def __init__(self, field_writer, field_index_writer, interval): 1250 self.field_writer = field_writer 1251 self.field_index_writer = field_index_writer 1252 self.interval = interval 1253 self.entry = 0 1254 1255 def write_fields(self, docnum, fields): 1256 1257 "Write details of the document with the given 'docnum' and 'fields'." 1258 1259 offset = self.field_writer.write_fields(docnum, fields) 1260 1261 if self.entry % self.interval == 0: 1262 self.field_index_writer.write_document(docnum, offset) 1263 1264 self.entry += 1 1265 1266 def close(self): 1267 self.field_writer.close() 1268 self.field_index_writer.close() 1269 1270 class FieldDictionaryReader: 1271 1272 "Reading field dictionary details." 1273 1274 def __init__(self, field_reader, field_index_reader): 1275 self.field_reader = field_reader 1276 self.field_index_reader = field_index_reader 1277 1278 self.docs = [] 1279 try: 1280 while 1: 1281 self.docs.append(self.field_index_reader.read_document()) 1282 except EOFError: 1283 pass 1284 1285 # Large numbers for ordering purposes. 1286 1287 if self.docs: 1288 self.max_offset = self.docs[-1][1] 1289 else: 1290 self.max_offset = None 1291 1292 # Iterator convenience methods. 1293 1294 def __iter__(self): 1295 self.rewind() 1296 return self 1297 1298 def next(self): 1299 try: 1300 return self.read_fields() 1301 except EOFError: 1302 raise StopIteration 1303 1304 # Sequential access methods. 1305 1306 def rewind(self): 1307 self.field_reader.rewind() 1308 1309 def read_fields(self): 1310 1311 "Return the next document number and fields." 1312 1313 return self.field_reader.read_fields() 1314 1315 # Random access methods. 1316 1317 def get_fields(self, docnum): 1318 1319 "Read the fields of the document with the given 'docnum'." 1320 1321 i = bisect_right(self.docs, (docnum, self.max_offset)) - 1 1322 1323 # Get the entry position providing the term or one preceding it. 1324 1325 if i == -1: 1326 return None 1327 1328 found_docnum, offset = self.docs[i] 1329 1330 # Read from the fields file. 1331 1332 found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset) 1333 1334 # Scan for the document, if necessary. 1335 1336 try: 1337 while docnum > found_docnum: 1338 found_docnum, fields = self.field_reader.read_fields() 1339 except EOFError: 1340 pass 1341 1342 # If the document is found, return the fields. 1343 1344 if docnum == found_docnum: 1345 return fields 1346 else: 1347 return None 1348 1349 def close(self): 1350 self.field_reader.close() 1351 self.field_index_reader.close() 1352 1353 # Dictionary merging classes. 1354 1355 class Merger: 1356 1357 "Merge files." 1358 1359 def __init__(self, writer, readers): 1360 self.writer = writer 1361 self.readers = readers 1362 1363 def close(self): 1364 for reader in self.readers: 1365 reader.close() 1366 self.writer.close() 1367 1368 class TermDictionaryMerger(Merger): 1369 1370 "Merge term and position files." 1371 1372 def merge(self): 1373 1374 """ 1375 Merge terms and positions from the readers, sending them to the writer. 1376 """ 1377 1378 last_term = None 1379 current_readers = [] 1380 1381 for term, frequency, doc_frequency, positions in itermerge(self.readers): 1382 if term == last_term: 1383 current_readers.append(positions) 1384 else: 1385 if current_readers: 1386 self.writer.write_term_positions(last_term, itermerge(current_readers)) 1387 last_term = term 1388 current_readers = [positions] 1389 else: 1390 if current_readers: 1391 self.writer.write_term_positions(last_term, itermerge(current_readers)) 1392 1393 class FieldDictionaryMerger(Merger): 1394 1395 "Merge field files." 1396 1397 def merge(self): 1398 1399 """ 1400 Merge fields from the readers, sending them to the writer. 1401 """ 1402 1403 for docnum, fields in itermerge(self.readers): 1404 self.writer.write_fields(docnum, fields) 1405 1406 # Utility functions. 1407 1408 def get_term_writer(pathname, partition, interval, doc_interval): 1409 1410 """ 1411 Return a term dictionary writer using files under the given 'pathname' 1412 labelled according to the given 'partition', using the given indexing 1413 'interval' for terms and 'doc_interval' for document position records. 1414 """ 1415 1416 tdf = open(join(pathname, "terms-%s" % partition), "wb") 1417 info_writer = TermWriter(tdf) 1418 1419 tdif = open(join(pathname, "terms_index-%s" % partition), "wb") 1420 index_writer = TermIndexWriter(tdif) 1421 1422 tpf = open(join(pathname, "positions-%s" % partition), "wb") 1423 positions_writer = PositionWriter(tpf) 1424 1425 tpif = open(join(pathname, "positions_index-%s" % partition), "wb") 1426 positions_index_writer = PositionIndexWriter(tpif) 1427 1428 positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval) 1429 1430 return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval) 1431 1432 def get_field_writer(pathname, partition, interval): 1433 1434 """ 1435 Return a field dictionary writer using files under the given 'pathname' 1436 labelled according to the given 'partition', using the given indexing 1437 'interval'. 1438 """ 1439 1440 ff = open(join(pathname, "fields-%s" % partition), "wb") 1441 field_writer = FieldWriter(ff) 1442 1443 fif = open(join(pathname, "fields_index-%s" % partition), "wb") 1444 field_index_writer = FieldIndexWriter(fif) 1445 1446 return FieldDictionaryWriter(field_writer, field_index_writer, interval) 1447 1448 def get_term_reader(pathname, partition): 1449 1450 """ 1451 Return a term dictionary reader using files under the given 'pathname' 1452 labelled according to the given 'partition'. 1453 """ 1454 1455 tdf = open(join(pathname, "terms-%s" % partition), "rb") 1456 info_reader = TermReader(tdf) 1457 1458 tdif = open(join(pathname, "terms_index-%s" % partition), "rb") 1459 index_reader = TermIndexReader(tdif) 1460 1461 positions_opener = PositionOpener(join(pathname, "positions-%s" % partition)) 1462 positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition)) 1463 1464 positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener) 1465 1466 return TermDictionaryReader(info_reader, index_reader, positions_dict_reader) 1467 1468 def get_field_reader(pathname, partition): 1469 1470 """ 1471 Return a field dictionary reader using files under the given 'pathname' 1472 labelled according to the given 'partition'. 1473 """ 1474 1475 ff = open(join(pathname, "fields-%s" % partition), "rb") 1476 field_reader = FieldReader(ff) 1477 1478 fif = open(join(pathname, "fields_index-%s" % partition), "rb") 1479 field_index_reader = FieldIndexReader(fif) 1480 1481 return FieldDictionaryReader(field_reader, field_index_reader) 1482 1483 def rename_files(pathname, names, from_partition, to_partition): 1484 for name in names: 1485 rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition))) 1486 1487 def rename_term_files(pathname, from_partition, to_partition): 1488 rename_files(pathname, TERM_FILENAMES, from_partition, to_partition) 1489 1490 def rename_field_files(pathname, from_partition, to_partition): 1491 rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition) 1492 1493 def remove_files(pathname, names, partition): 1494 for name in names: 1495 remove(join(pathname, "%s-%s" % (name, partition))) 1496 1497 def remove_term_files(pathname, partition): 1498 remove_files(pathname, TERM_FILENAMES, partition) 1499 1500 def remove_field_files(pathname, partition): 1501 remove_files(pathname, FIELD_FILENAMES, partition) 1502 1503 # High-level classes. 1504 1505 class Document: 1506 1507 "A container of document information." 1508 1509 def __init__(self, docnum): 1510 self.docnum = docnum 1511 self.fields = [] 1512 self.terms = {} 1513 1514 def add_position(self, term, position): 1515 1516 """ 1517 Add a position entry for the given 'term', indicating the given 1518 'position'. 1519 """ 1520 1521 self.terms.setdefault(term, []).append(position) 1522 1523 def add_field(self, identifier, value): 1524 1525 "Add a field having the given 'identifier' and 'value'." 1526 1527 self.fields.append((identifier, unicode(value))) # convert to string 1528 1529 def set_fields(self, fields): 1530 1531 """ 1532 Set the document's 'fields': a list of tuples each containing an integer 1533 identifier and a string value. 1534 """ 1535 1536 self.fields = fields 1537 1538 class IndexWriter: 1539 1540 """ 1541 Building term information and writing it to the term and field dictionaries. 1542 """ 1543 1544 def __init__(self, pathname, interval, doc_interval, flush_interval): 1545 self.pathname = pathname 1546 self.interval = interval 1547 self.doc_interval = doc_interval 1548 self.flush_interval = flush_interval 1549 1550 self.dict_partition = 0 1551 self.field_dict_partition = 0 1552 1553 self.terms = {} 1554 self.docs = {} 1555 1556 self.doc_counter = 0 1557 1558 def add_document(self, doc): 1559 1560 """ 1561 Add the given document 'doc', updating the document counter and flushing 1562 terms and fields if appropriate. 1563 """ 1564 1565 for term, positions in doc.terms.items(): 1566 self.terms.setdefault(term, {})[doc.docnum] = positions 1567 1568 self.docs[doc.docnum] = doc.fields 1569 1570 self.doc_counter += 1 1571 if self.flush_interval and self.doc_counter >= self.flush_interval: 1572 self.flush_terms() 1573 self.flush_fields() 1574 self.doc_counter = 0 1575 1576 def get_term_writer(self): 1577 1578 "Return a term dictionary writer for the current partition." 1579 1580 return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval) 1581 1582 def get_field_writer(self): 1583 1584 "Return a field dictionary writer for the current partition." 1585 1586 return get_field_writer(self.pathname, self.field_dict_partition, self.interval) 1587 1588 def flush_terms(self): 1589 1590 "Flush terms into the current term dictionary partition." 1591 1592 # Get the terms in order. 1593 1594 all_terms = self.terms 1595 terms = all_terms.keys() 1596 terms.sort() 1597 1598 dict_writer = self.get_term_writer() 1599 1600 for term in terms: 1601 doc_positions = all_terms[term].items() 1602 dict_writer.write_term_positions(term, doc_positions) 1603 1604 dict_writer.close() 1605 1606 self.terms = {} 1607 self.dict_partition += 1 1608 1609 def flush_fields(self): 1610 1611 "Flush fields into the current term dictionary partition." 1612 1613 # Get the documents in order. 1614 1615 docs = self.docs.items() 1616 docs.sort() 1617 1618 field_dict_writer = self.get_field_writer() 1619 1620 for docnum, fields in docs: 1621 field_dict_writer.write_fields(docnum, fields) 1622 1623 field_dict_writer.close() 1624 1625 self.docs = {} 1626 self.field_dict_partition += 1 1627 1628 def close(self): 1629 if self.terms: 1630 self.flush_terms() 1631 if self.docs: 1632 self.flush_fields() 1633 1634 class IndexReader: 1635 1636 "Accessing the term and field dictionaries." 1637 1638 def __init__(self, pathname): 1639 self.dict_reader = get_term_reader(pathname, "merged") 1640 self.field_dict_reader = get_field_reader(pathname, "merged") 1641 1642 def find_terms(self, term): 1643 return self.dict_reader.find_terms(term) 1644 1645 def find_positions(self, term): 1646 return self.dict_reader.find_positions(term) 1647 1648 def get_frequency(self, term): 1649 return self.dict_reader.get_frequency(term) 1650 1651 def get_document_frequency(self, term): 1652 return self.dict_reader.get_document_frequency(term) 1653 1654 def get_fields(self, docnum): 1655 return self.field_dict_reader.get_fields(docnum) 1656 1657 def close(self): 1658 self.dict_reader.close() 1659 self.field_dict_reader.close() 1660 1661 class Index: 1662 1663 "An inverted index solution encapsulating the various components." 1664 1665 def __init__(self, pathname): 1666 self.pathname = pathname 1667 self.reader = None 1668 self.writer = None 1669 1670 def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL): 1671 1672 """ 1673 Return a writer, optionally using the given indexing 'interval', 1674 'doc_interval' and 'flush_interval'. 1675 """ 1676 1677 if not exists(self.pathname): 1678 mkdir(self.pathname) 1679 1680 self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval) 1681 return self.writer 1682 1683 def get_reader(self, partition=0): 1684 1685 "Return a reader for the index." 1686 1687 # Ensure that only one partition exists. 1688 1689 self.merge() 1690 return self._get_reader(partition) 1691 1692 def _get_reader(self, partition): 1693 1694 "Return a reader for the index." 1695 1696 if not exists(self.pathname): 1697 raise OSError, "Index path %r does not exist." % self.pathname 1698 1699 self.reader = IndexReader(self.pathname) 1700 return self.reader 1701 1702 def merge(self): 1703 1704 "Merge/optimise index partitions." 1705 1706 self.merge_terms() 1707 self.merge_fields() 1708 1709 def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL): 1710 1711 """ 1712 Merge term dictionaries using the given indexing 'interval' and 1713 'doc_interval'. 1714 """ 1715 1716 readers = [] 1717 partitions = set() 1718 1719 for filename in listdir(self.pathname): 1720 if filename.startswith("terms-"): # 6 character prefix 1721 partition = filename[6:] 1722 readers.append(get_term_reader(self.pathname, partition)) 1723 partitions.add(partition) 1724 1725 # Write directly to a dictionary. 1726 1727 if len(readers) > 1: 1728 if "merged" in partitions: 1729 rename_term_files(self.pathname, "merged", "old-merged") 1730 partitions.remove("merged") 1731 partitions.add("old-merged") 1732 1733 writer = get_term_writer(self.pathname, "merged", interval, doc_interval) 1734 merger = TermDictionaryMerger(writer, readers) 1735 merger.merge() 1736 merger.close() 1737 1738 # Remove old files. 1739 1740 for partition in partitions: 1741 remove_term_files(self.pathname, partition) 1742 1743 elif len(readers) == 1: 1744 partition = list(partitions)[0] 1745 if partition != "merged": 1746 rename_term_files(self.pathname, partition, "merged") 1747 1748 def merge_fields(self, interval=FIELD_INTERVAL): 1749 1750 "Merge field dictionaries using the given indexing 'interval'." 1751 1752 readers = [] 1753 partitions = set() 1754 1755 for filename in listdir(self.pathname): 1756 if filename.startswith("fields-"): # 7 character prefix 1757 partition = filename[7:] 1758 readers.append(get_field_reader(self.pathname, partition)) 1759 partitions.add(partition) 1760 1761 # Write directly to a dictionary. 1762 1763 if len(readers) > 1: 1764 if "merged" in partitions: 1765 rename_field_files(self.pathname, "merged", "old-merged") 1766 partitions.remove("merged") 1767 partitions.add("old-merged") 1768 1769 writer = get_field_writer(self.pathname, "merged", interval) 1770 merger = FieldDictionaryMerger(writer, readers) 1771 merger.merge() 1772 merger.close() 1773 1774 # Remove old files. 1775 1776 for partition in partitions: 1777 remove_field_files(self.pathname, partition) 1778 1779 elif len(readers) == 1: 1780 partition = list(partitions)[0] 1781 if partition != "merged": 1782 rename_field_files(self.pathname, partition, "merged") 1783 1784 def close(self): 1785 if self.reader is not None: 1786 self.reader.close() 1787 self.reader = None 1788 if self.writer is not None: 1789 self.writer.close() 1790 self.writer = None 1791 1792 # vim: tabstop=4 expandtab shiftwidth=4