iixr (file iixr.py at f0f1799c9f4c)

     1 #!/usr/bin/env python     2      3 """     4 A simple (and sane) text indexing library.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from os import dup, fdopen       # independent iterator access to files    22 from os import listdir, mkdir    # index and partition discovery    23 from os import remove, rename    # partition manipulation    24 from os.path import exists, join    25 from os.path import commonprefix # to find common string prefixes    26 from bisect import bisect_right  # to find terms in the dictionary index    27 import bz2, zlib                 # for field compression    28 from itermerge import itermerge    29     30 try:    31     set    32 except NameError:    33     from sets import Set as set    34     35 # Constants.    36     37 TERM_INTERVAL     = 100    38 DOCUMENT_INTERVAL = 100    39 FIELD_INTERVAL    = 100    40 FLUSH_INTERVAL    = 10000    41     42 TERM_FILENAMES    = "terms", "terms_index", "positions", "positions_index"    43 FIELD_FILENAMES   = "fields", "fields_index"    44     45 compressors = [("b", bz2.compress), ("z", zlib.compress)]    46 decompressors = {"b" : bz2.decompress, "z" : zlib.decompress}    47     48 # Utility functions.    49     50 try:    51     from vint import vint as _vint    52     53     def vint(number):    54     55         "Write 'number' as a variable-length integer."    56     57         if number >= 0:    58             return _vint(number)    59         else:    60             raise ValueError, "Number %r is negative." % number    61     62 except ImportError:    63     64     def vint(number):    65     66         "Write 'number' as a variable-length integer."    67     68         if number >= 0:    69     70             # Special case: one byte containing a 7-bit number.    71     72             if number < 128:    73                 return chr(number)    74     75             # Write the number from least to most significant digits.    76     77             bytes = []    78     79             while number != 0:    80                 lsd = number & 127    81                 number = number >> 7    82                 if number != 0:    83                     lsd |= 128    84                 bytes.append(chr(lsd))    85     86             return "".join(bytes)    87     88         # Negative numbers are not supported.    89     90         else:    91             raise ValueError, "Number %r is negative." % number    92     93 # Foundation classes.    94     95 class File:    96     97     "A basic file abstraction."    98     99     def __init__(self, f):   100         self.f = f   101         self.reset()   102         self.cache = []   103         self.cache_length = 0   104    105     def reset(self):   106    107         "To be used to reset the state of the reader or writer between records."   108    109         pass   110    111     def rewind(self):   112         self.f.seek(0)   113         self.reset()   114    115     def write(self, s):   116         self.cache.append(s)   117         self.cache_length += len(s)   118         if len(self.cache) >= 1000:   119             self.flush()   120    121     def tell(self):   122         return self.f.tell() + self.cache_length   123    124     def flush(self):   125         self.f.write("".join(self.cache))   126         self.cache = []   127         self.cache_length = 0   128    129     def close(self):   130         if self.f is not None:   131             self.flush()   132             self.f.close()   133             self.f = None   134    135 class FileWriter(File):   136    137     "Writing basic data types to files."   138    139     def write_number(self, number):   140    141         "Write 'number' to the file using a variable length encoding."   142    143         self.write(vint(number))   144    145     def write_string(self, s, compress=0):   146    147         """   148         Write 's' to the file, recording its length and compressing the string   149         if 'compress' is set to a true value.   150         """   151    152         # Convert Unicode objects to strings.   153    154         if isinstance(s, unicode):   155             s = s.encode("utf-8")   156    157         # Compress the string if requested.   158    159         if compress:   160             for flag, fn in compressors:   161                 cs = fn(s)   162    163                 # Take the first string shorter than the original.   164    165                 if len(cs) < len(s):   166                     s = cs   167                     break   168             else:   169                 flag = "-"   170    171         else:   172             flag = ""   173    174         # Write the length of the data before the data itself.   175    176         length = len(s)   177         self.write(flag + vint(length) + s)   178    179 class FileReader(File):   180    181     "Reading basic data types from files."   182    183     def read_number(self):   184    185         "Read a number from the file."   186    187         # Read each byte, adding it to the number.   188    189         shift = 0   190         number = 0   191         read = self.f.read   192    193         try:   194             csd = ord(read(1))   195             while csd & 128:   196                 number += ((csd & 127) << shift)   197                 shift += 7   198                 csd = ord(read(1))   199             else:   200                 number += (csd << shift)   201         except TypeError:   202             raise EOFError   203    204         return number   205    206     def read_string(self, decompress=0):   207    208         """   209         Read a string from the file, decompressing the stored data if   210         'decompress' is set to a true value.   211         """   212    213         # Decompress the data if requested.   214    215         if decompress:   216             flag = self.f.read(1)   217         else:   218             flag = "-"   219    220         length = self.read_number()   221         s = self.f.read(length)   222    223         # Perform decompression if applicable.   224    225         if flag != "-":   226             fn = decompressors[flag]   227             s = fn(s)   228    229         # Convert strings to Unicode objects.   230    231         return unicode(s, "utf-8")   232    233 class FileOpener:   234    235     "Opening files using their filenames."   236    237     def __init__(self, filename):   238         self.filename = filename   239    240     def open(self, mode):   241         return open(self.filename, mode)   242    243     def close(self):   244         pass   245    246 # Specific classes for storing term and position information.   247    248 class PositionWriter(FileWriter):   249    250     "Writing position information to files."   251    252     def reset(self):   253         self.last_docnum = 0   254    255     def write_positions(self, docnum, positions):   256    257         """   258         Write for the document 'docnum' the given 'positions'.   259         Return the offset of the written record.   260         """   261    262         if docnum < self.last_docnum:   263             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)   264    265         # Record the offset of this record.   266    267         offset = self.tell()   268    269         # Make sure that the positions are sorted.   270    271         positions.sort()   272    273         # Write the position deltas.   274    275         output = []   276         last = 0   277    278         for position in positions:   279             output.append(vint(position - last))   280             last = position   281    282         # Write the document number delta.   283         # Write the number of positions.   284         # Then write the positions.   285    286         self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))   287    288         self.last_docnum = docnum   289         return offset   290    291 class PositionOpener(FileOpener):   292    293     "Reading position information from files."   294    295     def read_term_positions(self, offset, count):   296    297         """   298         Read all positions from 'offset', seeking to that position in the file   299         before reading. The number of documents available for reading is limited   300         to 'count'.   301         """   302    303         # Duplicate the file handle.   304    305         f = self.open("rb")   306         f.seek(offset)   307         return PositionIterator(f, count)   308    309 class PositionIndexWriter(FileWriter):   310    311     "Writing position index information to files."   312    313     def reset(self):   314         self.last_docnum = 0   315         self.last_pos_offset = 0   316    317     def write_positions(self, docnum, pos_offset, count):   318    319         """   320         Write the given 'docnum, 'pos_offset' and document 'count' to the   321         position index file.   322         """   323    324         # Record the offset of this record.   325    326         offset = self.tell()   327         output = []   328    329         # Write the document number delta.   330    331         output.append(vint(docnum - self.last_docnum))   332         self.last_docnum = docnum   333    334         # Write the position file offset delta.   335    336         output.append(vint(pos_offset - self.last_pos_offset))   337         self.last_pos_offset = pos_offset   338    339         # Write the document count.   340    341         output.append(vint(count))   342    343         # Actually write the data.   344    345         self.write("".join(output))   346    347         return offset   348    349 class PositionIndexOpener(FileOpener):   350    351     "Reading position index information from files."   352    353     def read_term_positions(self, offset, doc_frequency):   354    355         """   356         Read all positions from 'offset', seeking to that position in the file   357         before reading. The number of documents available for reading is limited   358         to 'doc_frequency'.   359         """   360    361         # Duplicate the file handle.   362    363         f = self.open("rb")   364         f.seek(offset)   365         return PositionIndexIterator(f, doc_frequency)   366    367 # Iterators for position-related files.   368    369 class IteratorBase:   370    371     def __init__(self, count):   372         self.replenish(count)   373    374     def replenish(self, count):   375         self.count = count   376         self.read_documents = 0   377    378     def __len__(self):   379         return self.count   380    381     def sort(self):   382         pass # Stored document positions are already sorted.   383    384     def __iter__(self):   385         return self   386    387 class PositionIterator(FileReader, IteratorBase):   388    389     "Iterating over document positions."   390    391     def __init__(self, f, count):   392         FileReader.__init__(self, f)   393         IteratorBase.__init__(self, count)   394    395     def reset(self):   396         self.last_docnum = 0   397    398     def read_positions(self):   399    400         "Read positions, returning a document number and a list of positions."   401    402         # Read the document number delta and add it to the last number.   403    404         self.last_docnum += self.read_number()   405    406         # Read the number of positions.   407    408         npositions = self.read_number()   409    410         # Read the position deltas, adding each previous position to get the   411         # appropriate collection of absolute positions.   412    413         i = 0   414         last = 0   415         positions = []   416    417         while i < npositions:   418             last += self.read_number()   419             positions.append(last)   420             i += 1   421    422         return self.last_docnum, positions   423    424     def next(self):   425    426         "Read positions for a single document."   427    428         if self.read_documents < self.count:   429             self.read_documents += 1   430             return self.read_positions()   431         else:   432             raise StopIteration   433    434 class PositionIndexIterator(FileReader, IteratorBase):   435    436     "Iterating over document positions."   437    438     def __init__(self, f, count):   439         FileReader.__init__(self, f)   440         IteratorBase.__init__(self, count)   441         self.section_count = 0   442    443     def reset(self):   444         self.last_docnum = 0   445         self.last_pos_offset = 0   446    447     def read_positions(self):   448    449         """   450         Read a document number, a position file offset for the position index   451         file, and the number of documents in a section of that file.   452         """   453    454         # Read the document number delta.   455    456         self.last_docnum += self.read_number()   457    458         # Read the offset delta.   459    460         self.last_pos_offset += self.read_number()   461    462         # Read the document count.   463    464         count = self.read_number()   465    466         return self.last_docnum, self.last_pos_offset, count   467    468     def next(self):   469    470         "Read positions for a single document."   471    472         self.read_documents += self.section_count   473         if self.read_documents < self.count:   474             docnum, pos_offset, self.section_count = t = self.read_positions()   475             return t   476         else:   477             raise StopIteration   478    479 class PositionDictionaryWriter:   480    481     "Writing position dictionaries."   482    483     def __init__(self, position_writer, position_index_writer, interval):   484         self.position_writer = position_writer   485         self.position_index_writer = position_index_writer   486         self.interval = interval   487    488     def write_term_positions(self, doc_positions):   489    490         """   491         Write all 'doc_positions' - a collection of tuples of the form (document   492         number, position list) - to the file.   493    494         Add some records to the index, making dictionary entries.   495    496         Return a tuple containing the offset of the written data, the frequency   497         (number of positions), and document frequency (number of documents) for   498         the term involved.   499         """   500    501         # Reset the writers.   502    503         self.position_writer.reset()   504         self.position_index_writer.reset()   505    506         index_offset = None   507    508         # Write the positions.   509    510         frequency = 0   511         first_docnum = None   512         first_offset = None   513         count = 0   514    515         doc_positions.sort()   516    517         for docnum, positions in doc_positions:   518             pos_offset = self.position_writer.write_positions(docnum, positions)   519    520             # Retain the first record offset for a subsequent index entry.   521    522             if first_offset is None:   523                 first_offset = pos_offset   524                 first_docnum = docnum   525    526             frequency += len(positions)   527             count += 1   528    529             # Every {interval} entries, write an index entry.   530    531             if count % self.interval == 0:   532                 io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   533    534                 # Remember the first index entry offset.   535    536                 if index_offset is None:   537                     index_offset = io   538    539                 first_offset = None   540                 first_docnum = None   541    542                 # Reset the position writer so that position readers accessing   543                 # a section start with the correct document number.   544    545                 self.position_writer.reset()   546    547         # Finish writing an index entry for the remaining documents.   548    549         else:   550             if first_offset is not None:   551                 io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   552    553                 # Remember the first index entry offset.   554    555                 if index_offset is None:   556                     index_offset = io   557    558         return index_offset, frequency, count   559    560     def close(self):   561         self.position_writer.close()   562         self.position_index_writer.close()   563    564 class PositionDictionaryReader:   565    566     "Reading position dictionaries."   567    568     def __init__(self, position_opener, position_index_opener):   569         self.position_opener = position_opener   570         self.position_index_opener = position_index_opener   571    572     def read_term_positions(self, offset, doc_frequency):   573    574         """   575         Return an iterator for dictionary entries starting at 'offset' with the   576         given 'doc_frequency'.   577         """   578    579         return PositionDictionaryIterator(self.position_opener,   580             self.position_index_opener, offset, doc_frequency)   581    582     def close(self):   583         pass   584    585 class PositionDictionaryIterator:   586    587     "Iteration over position dictionary entries."   588    589     def __init__(self, position_opener, position_index_opener, offset, doc_frequency):   590         self.position_opener = position_opener   591         self.doc_frequency = doc_frequency   592         self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)   593         self.iterator = None   594    595         # Remember the last values.   596    597         self.found_docnum, self.found_positions = None, None   598    599         # Maintain state for the next index entry, if read.   600    601         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   602    603         # Initialise the current index entry and current position file iterator.   604    605         self._next_section()   606         self._init_section()   607    608     # Sequence methods.   609    610     def __len__(self):   611         return self.doc_frequency   612    613     def sort(self):   614         pass   615    616     # Iterator methods.   617    618     def __iter__(self):   619         return self   620    621     def next(self):   622    623         """   624         Attempt to get the next document record from the section in the   625         positions file.   626         """   627    628         # Return any visited but unrequested record.   629    630         if self.found_docnum is not None:   631             t = self.found_docnum, self.found_positions   632             self.found_docnum, self.found_positions = None, None   633             return t   634    635         # Or search for the next record.   636    637         while 1:   638    639             # Either return the next record.   640    641             try:   642                 return self.iterator.next()   643    644             # Or, where a section is finished, get the next section and try again.   645    646             except StopIteration:   647    648                 # Where a section follows, update the index iterator, but keep   649                 # reading using the same file iterator (since the data should   650                 # just follow on from the last section).   651    652                 self._next_section()   653                 self.iterator.replenish(self.section_count)   654    655                 # Reset the state of the iterator to make sure that document   656                 # numbers are correct.   657    658                 self.iterator.reset()   659    660     def from_document(self, docnum):   661    662         """   663         Attempt to navigate to a positions entry for the given 'docnum',   664         returning the positions for 'docnum', or None otherwise.   665         """   666    667         # Return any unrequested document positions.   668    669         if docnum == self.found_docnum:   670             return self.found_positions   671    672         # Read ahead in the index until the next entry refers to a document   673         # later than the desired document.   674    675         try:   676             if self.next_docnum is None:   677                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()   678    679             # Read until the next entry is after the desired document number,   680             # or until the end of the results.   681    682             while self.next_docnum <= docnum:   683                 self._next_read_section()   684                 if self.docnum < docnum:   685                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()   686                 else:   687                     break   688    689         except StopIteration:   690             pass   691    692         # Navigate in the position file to the document.   693    694         self._init_section()   695    696         try:   697             while 1:   698                 found_docnum, found_positions = self.iterator.next()   699    700                 # Return the desired document positions or None (retaining the   701                 # positions for the document immediately after).   702    703                 if docnum == found_docnum:   704                     return found_positions   705                 elif docnum < found_docnum:   706                     self.found_docnum, self.found_positions = found_docnum, found_positions   707                     return None   708    709         except StopIteration:   710             return None   711    712     # Internal methods.   713    714     def _next_section(self):   715    716         "Attempt to get the next section in the index."   717    718         if self.next_docnum is None:   719             self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()   720         else:   721             self._next_read_section()   722    723     def _next_read_section(self):   724    725         """   726         Make the next index entry the current one without reading from the   727         index.   728         """   729    730         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   731         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   732    733     def _init_section(self):   734    735         "Initialise the iterator for the section in the position file."   736    737         if self.iterator is not None:   738             self.iterator.close()   739         self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)   740    741     def close(self):   742         if self.iterator is not None:   743             self.iterator.close()   744             self.iterator = None   745         if self.index_iterator is not None:   746             self.index_iterator.close()   747             self.index_iterator = None   748    749 class TermWriter(FileWriter):   750    751     "Writing term information to files."   752    753     def reset(self):   754         self.last_term = ""   755         self.last_offset = 0   756    757     def write_term(self, term, offset, frequency, doc_frequency):   758    759         """   760         Write the given 'term', its position file 'offset', its 'frequency' and   761         its 'doc_frequency' (number of documents in which it appears) to the   762         term information file. Return the offset after the term information was   763         written to the file.   764         """   765    766         # Write the prefix length and term suffix.   767    768         common = len(commonprefix([self.last_term, term]))   769         suffix = term[common:]   770    771         self.write_number(common)   772         self.write_string(suffix)   773    774         # Write the offset delta.   775    776         self.write_number(offset - self.last_offset)   777    778         # Write the frequency.   779    780         self.write_number(frequency)   781    782         # Write the document frequency.   783    784         self.write_number(doc_frequency)   785    786         self.last_term = term   787         self.last_offset = offset   788    789         return self.tell()   790    791 class TermReader(FileReader):   792    793     "Reading term information from files."   794    795     def reset(self):   796         self.last_term = ""   797         self.last_offset = 0   798    799     def read_term(self):   800    801         """   802         Read a term, its position file offset, its frequency and its document   803         frequency from the term information file.   804         """   805    806         # Read the prefix length and term suffix.   807    808         common = self.read_number()   809         suffix = self.read_string()   810    811         self.last_term = self.last_term[:common] + suffix   812    813         # Read the offset delta.   814    815         self.last_offset += self.read_number()   816    817         # Read the frequency.   818    819         frequency = self.read_number()   820    821         # Read the document frequency.   822    823         doc_frequency = self.read_number()   824    825         return self.last_term, self.last_offset, frequency, doc_frequency   826    827     def go_to_term(self, term, offset, info_offset):   828    829         """   830         Seek past the entry for 'term' having 'offset' to 'info_offset'. This   831         permits the scanning for later terms from the specified term.   832         """   833    834         self.f.seek(info_offset)   835         self.last_term = term   836         self.last_offset = offset   837    838 class TermIndexWriter(TermWriter):   839    840     "Writing term dictionary index details to files."   841    842     def reset(self):   843         TermWriter.reset(self)   844         self.last_info_offset = 0   845    846     def write_term(self, term, offset, frequency, doc_frequency, info_offset):   847    848         """   849         Write the given 'term', its position file 'offset', its 'frequency' and   850         its 'doc_frequency' to the term dictionary index file, along with the   851         'info_offset' in the term information file.   852         """   853    854         TermWriter.write_term(self, term, offset, frequency, doc_frequency)   855    856         # Write the information file offset delta.   857    858         self.write_number(info_offset - self.last_info_offset)   859         self.last_info_offset = info_offset   860    861 class TermIndexReader(TermReader):   862    863     "Reading term dictionary index details from files."   864    865     def reset(self):   866         TermReader.reset(self)   867         self.last_info_offset = 0   868    869     def read_term(self):   870    871         """   872         Read a term, its position file offset, its frequency, its document   873         frequency and a term information file offset from the term dictionary   874         index file.   875         """   876    877         term, offset, frequency, doc_frequency = TermReader.read_term(self)   878    879         # Read the offset delta.   880    881         self.last_info_offset += self.read_number()   882    883         return term, offset, frequency, doc_frequency, self.last_info_offset   884    885 class TermDictionaryWriter:   886    887     "Writing term dictionaries."   888    889     def __init__(self, info_writer, index_writer, position_dict_writer, interval):   890         self.info_writer = info_writer   891         self.index_writer = index_writer   892         self.position_dict_writer = position_dict_writer   893         self.interval = interval   894         self.entry = 0   895    896     def _write_term(self, term, offset, frequency, doc_frequency):   897    898         """   899         Write the given 'term', its position file 'offset', its 'frequency' and   900         its 'doc_frequency' (number of documents in which it appears) to the   901         term information file. Return the offset after the term information was   902         written to the file.   903         """   904    905         info_offset = self.info_writer.write_term(term, offset, frequency, doc_frequency)   906    907         if self.entry % self.interval == 0:   908             self.index_writer.write_term(term, offset, frequency, doc_frequency, info_offset)   909    910         self.entry += 1   911    912     def write_term_positions(self, term, doc_positions):   913    914         """   915         Write the given 'term' and the 'doc_positions' recording the documents   916         and positions at which the term is found.   917         """   918    919         offset, frequency, doc_frequency = self.position_dict_writer.write_term_positions(doc_positions)   920         self._write_term(term, offset, frequency, doc_frequency)   921    922     def close(self):   923         self.info_writer.close()   924         self.index_writer.close()   925         self.position_dict_writer.close()   926    927 class TermDictionaryReader:   928    929     "Reading term dictionaries."   930    931     def __init__(self, info_reader, index_reader, position_dict_reader):   932         self.info_reader = info_reader   933         self.index_reader = index_reader   934         self.position_dict_reader = position_dict_reader   935    936         self.terms = []   937         try:   938             while 1:   939                 self.terms.append(self.index_reader.read_term())   940         except EOFError:   941             pass   942    943         # Large numbers for ordering purposes.   944    945         if self.terms:   946             self.max_offset = self.terms[-1][1] + 1   947         else:   948             self.max_offset = None   949    950     def _find_closest_entry(self, term):   951    952         """   953         Find the offsets and frequencies of 'term' from the term dictionary or   954         the closest term starting with the value of 'term'.   955    956         Return the closest index entry consisting of a term, the position file   957         offset, the term frequency, the document frequency, and the term details   958         file offset.   959         """   960    961         i = bisect_right(self.terms, (term, self.max_offset, 0, 0)) - 1   962    963         # Get the entry position providing the term or one preceding it.   964         # If no entry precedes the requested term, return the very first entry   965         # as the closest.   966    967         if i == -1:   968             return self.terms[0]   969         else:   970             return self.terms[i]   971    972     def _find_closest_term(self, term):   973    974         """   975         Find the offsets and frequencies of 'term' from the term dictionary or   976         the closest term starting with the value of 'term'.   977    978         Return the closest term (or the term itself), the position file offset,   979         the term frequency, the document frequency, and the term details file   980         offset (or None if the reader is already positioned).   981         """   982    983         found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_entry(term)   984    985         # Where the term is found immediately, return the offset and   986         # frequencies. If the term does not appear, return the details of the   987         # closest entry.   988    989         if term <= found_term:   990             return found_term, offset, frequency, doc_frequency, info_offset   991    992         # Otherwise, seek past the index term's entry in the information file   993         # and scan for the desired term.   994    995         else:   996             self.info_reader.go_to_term(found_term, offset, info_offset)   997             try:   998                 while term > found_term:   999                     found_term, offset, frequency, doc_frequency = self.info_reader.read_term()  1000             except EOFError:  1001                 pass  1002   1003             return found_term, offset, frequency, doc_frequency, None  1004   1005     def _find_term(self, term):  1006   1007         """  1008         Find the position file offset and frequency of 'term' from the term  1009         dictionary.  1010         """  1011   1012         found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)  1013   1014         # If the term is found, return the offset and frequencies.  1015   1016         if term == found_term:  1017             return offset, frequency, doc_frequency  1018         else:  1019             return None  1020   1021     def _get_positions(self, offset, doc_frequency):  1022         return self.position_dict_reader.read_term_positions(offset, doc_frequency)  1023   1024     # Iterator convenience methods.  1025   1026     def __iter__(self):  1027         self.rewind()  1028         return self  1029   1030     def next(self):  1031         try:  1032             return self.read_term()  1033         except EOFError:  1034             raise StopIteration  1035   1036     # Sequential access methods.  1037   1038     def rewind(self):  1039         self.info_reader.rewind()  1040   1041     def read_term(self):  1042   1043         """  1044         Return the next term, its frequency, its document frequency, and the  1045         documents and positions at which the term is found.  1046         """  1047   1048         term, offset, frequency, doc_frequency = self.info_reader.read_term()  1049         positions = self._get_positions(offset, doc_frequency)  1050         return term, frequency, doc_frequency, positions  1051   1052     # Query methods.  1053   1054     def find_terms(self, term):  1055   1056         "Return all terms whose values start with the value of 'term'."  1057   1058         terms = []  1059   1060         found_term, offset, frequency, doc_frequency, info_offset = self._find_closest_term(term)  1061   1062         # Position the reader, if necessary.  1063   1064         if info_offset is not None:  1065             self.info_reader.go_to_term(found_term, offset, info_offset)  1066   1067         # Read and record terms.  1068   1069         try:  1070             # Add the found term if it starts with the specified term.  1071   1072             while found_term.startswith(term):  1073                 terms.append(found_term)  1074                 found_term, offset, frequency, doc_frequency = self.info_reader.read_term()  1075   1076         except EOFError:  1077             pass  1078   1079         return terms  1080   1081     def find_positions(self, term):  1082   1083         "Return the documents and positions at which the given 'term' is found."  1084   1085         t = self._find_term(term)  1086         if t is None:  1087             return None  1088         else:  1089             offset, frequency, doc_frequency = t  1090             return self._get_positions(offset, doc_frequency)  1091   1092     def get_frequency(self, term):  1093   1094         "Return the frequency of the given 'term'."  1095   1096         t = self._find_term(term)  1097         if t is None:  1098             return None  1099         else:  1100             offset, frequency, doc_frequency = t  1101             return frequency  1102   1103     def get_document_frequency(self, term):  1104   1105         "Return the document frequency of the given 'term'."  1106   1107         t = self._find_term(term)  1108         if t is None:  1109             return None  1110         else:  1111             offset, frequency, doc_frequency = t  1112             return doc_frequency  1113   1114     def close(self):  1115         self.info_reader.close()  1116         self.index_reader.close()  1117         self.position_dict_reader.close()  1118   1119 # Specific classes for storing document information.  1120   1121 class FieldWriter(FileWriter):  1122   1123     "Writing field data to files."  1124   1125     def reset(self):  1126         self.last_docnum = 0  1127   1128     def write_fields(self, docnum, fields):  1129   1130         """  1131         Write for the given 'docnum', a list of 'fields' (integer, string pairs  1132         representing field identifiers and values respectively).  1133         Return the offset at which the fields are stored.  1134         """  1135   1136         offset = self.tell()  1137   1138         # Write the document number delta.  1139   1140         self.write_number(docnum - self.last_docnum)  1141   1142         # Write the number of fields.  1143   1144         self.write_number(len(fields))  1145   1146         # Write the fields themselves.  1147   1148         for i, field in fields:  1149             self.write_number(i)  1150             self.write_string(field, 1) # compress  1151   1152         self.last_docnum = docnum  1153         return offset  1154   1155 class FieldReader(FileReader):  1156   1157     "Reading field data from files."  1158   1159     def reset(self):  1160         self.last_docnum = 0  1161   1162     def read_fields(self):  1163   1164         """  1165         Read fields from the file, returning a tuple containing the document  1166         number and a list of field (identifier, value) pairs.  1167         """  1168   1169         # Read the document number.  1170   1171         self.last_docnum += self.read_number()  1172   1173         # Read the number of fields.  1174   1175         nfields = self.read_number()  1176   1177         # Collect the fields.  1178   1179         fields = []  1180         i = 0  1181   1182         while i < nfields:  1183             identifier = self.read_number()  1184             value = self.read_string(1) # decompress  1185             fields.append((identifier, value))  1186             i += 1  1187   1188         return self.last_docnum, fields  1189   1190     def read_document_fields(self, docnum, offset):  1191   1192         """  1193         Read fields for 'docnum' at the given 'offset'. This permits the  1194         retrieval of details for the specified document, as well as scanning for  1195         later documents.  1196         """  1197   1198         self.f.seek(offset)  1199         bad_docnum, fields = self.read_fields()  1200         self.last_docnum = docnum  1201         return docnum, fields  1202   1203 class FieldIndexWriter(FileWriter):  1204   1205     "Writing field index details to files."  1206   1207     def reset(self):  1208         self.last_docnum = 0  1209         self.last_offset = 0  1210   1211     def write_document(self, docnum, offset):  1212   1213         """  1214         Write for the given 'docnum', the 'offset' at which the fields for the  1215         document are stored in the fields file.  1216         """  1217   1218         # Write the document number and offset deltas.  1219   1220         self.write_number(docnum - self.last_docnum)  1221         self.write_number(offset - self.last_offset)  1222   1223         self.last_docnum = docnum  1224         self.last_offset = offset  1225   1226 class FieldIndexReader(FileReader):  1227   1228     "Reading field index details from files."  1229   1230     def reset(self):  1231         self.last_docnum = 0  1232         self.last_offset = 0  1233   1234     def read_document(self):  1235   1236         "Read a document number and field file offset."  1237   1238         # Read the document number delta and offset.  1239   1240         self.last_docnum += self.read_number()  1241         self.last_offset += self.read_number()  1242   1243         return self.last_docnum, self.last_offset  1244   1245 class FieldDictionaryWriter:  1246   1247     "Writing field dictionary details."  1248   1249     def __init__(self, field_writer, field_index_writer, interval):  1250         self.field_writer = field_writer  1251         self.field_index_writer = field_index_writer  1252         self.interval = interval  1253         self.entry = 0  1254   1255     def write_fields(self, docnum, fields):  1256   1257         "Write details of the document with the given 'docnum' and 'fields'."  1258   1259         offset = self.field_writer.write_fields(docnum, fields)  1260   1261         if self.entry % self.interval == 0:  1262             self.field_index_writer.write_document(docnum, offset)  1263   1264         self.entry += 1  1265   1266     def close(self):  1267         self.field_writer.close()  1268         self.field_index_writer.close()  1269   1270 class FieldDictionaryReader:  1271   1272     "Reading field dictionary details."  1273   1274     def __init__(self, field_reader, field_index_reader):  1275         self.field_reader = field_reader  1276         self.field_index_reader = field_index_reader  1277   1278         self.docs = []  1279         try:  1280             while 1:  1281                 self.docs.append(self.field_index_reader.read_document())  1282         except EOFError:  1283             pass  1284   1285         # Large numbers for ordering purposes.  1286   1287         if self.docs:  1288             self.max_offset = self.docs[-1][1]  1289         else:  1290             self.max_offset = None  1291   1292     # Iterator convenience methods.  1293   1294     def __iter__(self):  1295         self.rewind()  1296         return self  1297   1298     def next(self):  1299         try:  1300             return self.read_fields()  1301         except EOFError:  1302             raise StopIteration  1303   1304     # Sequential access methods.  1305   1306     def rewind(self):  1307         self.field_reader.rewind()  1308   1309     def read_fields(self):  1310   1311         "Return the next document number and fields."  1312   1313         return self.field_reader.read_fields()  1314   1315     # Random access methods.  1316   1317     def get_fields(self, docnum):  1318   1319         "Read the fields of the document with the given 'docnum'."  1320   1321         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1  1322   1323         # Get the entry position providing the term or one preceding it.  1324   1325         if i == -1:  1326             return None  1327   1328         found_docnum, offset = self.docs[i]  1329   1330         # Read from the fields file.  1331   1332         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)  1333   1334         # Scan for the document, if necessary.  1335   1336         try:  1337             while docnum > found_docnum:  1338                 found_docnum, fields = self.field_reader.read_fields()  1339         except EOFError:  1340             pass  1341   1342         # If the document is found, return the fields.  1343   1344         if docnum == found_docnum:  1345             return fields  1346         else:  1347             return None  1348   1349     def close(self):  1350         self.field_reader.close()  1351         self.field_index_reader.close()  1352   1353 # Dictionary merging classes.  1354   1355 class Merger:  1356   1357     "Merge files."  1358   1359     def __init__(self, writer, readers):  1360         self.writer = writer  1361         self.readers = readers  1362   1363     def close(self):  1364         for reader in self.readers:  1365             reader.close()  1366         self.writer.close()  1367   1368 class TermDictionaryMerger(Merger):  1369   1370     "Merge term and position files."  1371   1372     def merge(self):  1373   1374         """  1375         Merge terms and positions from the readers, sending them to the writer.  1376         """  1377   1378         last_term = None  1379         current_readers = []  1380   1381         for term, frequency, doc_frequency, positions in itermerge(self.readers):  1382             if term == last_term:  1383                 current_readers.append(positions)  1384             else:  1385                 if current_readers:  1386                     self.writer.write_term_positions(last_term, itermerge(current_readers))  1387                 last_term = term  1388                 current_readers = [positions]  1389         else:  1390             if current_readers:  1391                 self.writer.write_term_positions(last_term, itermerge(current_readers))  1392   1393 class FieldDictionaryMerger(Merger):  1394   1395     "Merge field files."  1396   1397     def merge(self):  1398   1399         """  1400         Merge fields from the readers, sending them to the writer.  1401         """  1402   1403         for docnum, fields in itermerge(self.readers):  1404             self.writer.write_fields(docnum, fields)  1405   1406 # Utility functions.  1407   1408 def get_term_writer(pathname, partition, interval, doc_interval):  1409   1410     """  1411     Return a term dictionary writer using files under the given 'pathname'  1412     labelled according to the given 'partition', using the given indexing  1413     'interval' for terms and 'doc_interval' for document position records.  1414     """  1415   1416     tdf = open(join(pathname, "terms-%s" % partition), "wb")  1417     info_writer = TermWriter(tdf)  1418   1419     tdif = open(join(pathname, "terms_index-%s" % partition), "wb")  1420     index_writer = TermIndexWriter(tdif)  1421   1422     tpf = open(join(pathname, "positions-%s" % partition), "wb")  1423     positions_writer = PositionWriter(tpf)  1424   1425     tpif = open(join(pathname, "positions_index-%s" % partition), "wb")  1426     positions_index_writer = PositionIndexWriter(tpif)  1427   1428     positions_dict_writer = PositionDictionaryWriter(positions_writer, positions_index_writer, doc_interval)  1429   1430     return TermDictionaryWriter(info_writer, index_writer, positions_dict_writer, interval)  1431   1432 def get_field_writer(pathname, partition, interval):  1433   1434     """  1435     Return a field dictionary writer using files under the given 'pathname'  1436     labelled according to the given 'partition', using the given indexing  1437     'interval'.  1438     """  1439   1440     ff = open(join(pathname, "fields-%s" % partition), "wb")  1441     field_writer = FieldWriter(ff)  1442   1443     fif = open(join(pathname, "fields_index-%s" % partition), "wb")  1444     field_index_writer = FieldIndexWriter(fif)  1445   1446     return FieldDictionaryWriter(field_writer, field_index_writer, interval)  1447   1448 def get_term_reader(pathname, partition):  1449   1450     """  1451     Return a term dictionary reader using files under the given 'pathname'  1452     labelled according to the given 'partition'.  1453     """  1454   1455     tdf = open(join(pathname, "terms-%s" % partition), "rb")  1456     info_reader = TermReader(tdf)  1457   1458     tdif = open(join(pathname, "terms_index-%s" % partition), "rb")  1459     index_reader = TermIndexReader(tdif)  1460   1461     positions_opener = PositionOpener(join(pathname, "positions-%s" % partition))  1462     positions_index_opener = PositionIndexOpener(join(pathname, "positions_index-%s" % partition))  1463   1464     positions_dict_reader = PositionDictionaryReader(positions_opener, positions_index_opener)  1465   1466     return TermDictionaryReader(info_reader, index_reader, positions_dict_reader)  1467   1468 def get_field_reader(pathname, partition):  1469   1470     """  1471     Return a field dictionary reader using files under the given 'pathname'  1472     labelled according to the given 'partition'.  1473     """  1474   1475     ff = open(join(pathname, "fields-%s" % partition), "rb")  1476     field_reader = FieldReader(ff)  1477   1478     fif = open(join(pathname, "fields_index-%s" % partition), "rb")  1479     field_index_reader = FieldIndexReader(fif)  1480   1481     return FieldDictionaryReader(field_reader, field_index_reader)  1482   1483 def rename_files(pathname, names, from_partition, to_partition):  1484     for name in names:  1485         rename(join(pathname, "%s-%s" % (name, from_partition)), join(pathname, "%s-%s" % (name, to_partition)))  1486   1487 def rename_term_files(pathname, from_partition, to_partition):  1488     rename_files(pathname, TERM_FILENAMES, from_partition, to_partition)  1489   1490 def rename_field_files(pathname, from_partition, to_partition):  1491     rename_files(pathname, FIELD_FILENAMES, from_partition, to_partition)  1492   1493 def remove_files(pathname, names, partition):  1494     for name in names:  1495         remove(join(pathname, "%s-%s" % (name, partition)))  1496   1497 def remove_term_files(pathname, partition):  1498     remove_files(pathname, TERM_FILENAMES, partition)  1499   1500 def remove_field_files(pathname, partition):  1501     remove_files(pathname, FIELD_FILENAMES, partition)  1502   1503 # High-level classes.  1504   1505 class Document:  1506   1507     "A container of document information."  1508   1509     def __init__(self, docnum):  1510         self.docnum = docnum  1511         self.fields = []  1512         self.terms = {}  1513   1514     def add_position(self, term, position):  1515   1516         """  1517         Add a position entry for the given 'term', indicating the given  1518         'position'.  1519         """  1520   1521         self.terms.setdefault(term, []).append(position)  1522   1523     def add_field(self, identifier, value):  1524   1525         "Add a field having the given 'identifier' and 'value'."  1526   1527         self.fields.append((identifier, unicode(value))) # convert to string  1528   1529     def set_fields(self, fields):  1530   1531         """  1532         Set the document's 'fields': a list of tuples each containing an integer  1533         identifier and a string value.  1534         """  1535   1536         self.fields = fields  1537   1538 class IndexWriter:  1539   1540     """  1541     Building term information and writing it to the term and field dictionaries.  1542     """  1543   1544     def __init__(self, pathname, interval, doc_interval, flush_interval):  1545         self.pathname = pathname  1546         self.interval = interval  1547         self.doc_interval = doc_interval  1548         self.flush_interval = flush_interval  1549   1550         self.dict_partition = 0  1551         self.field_dict_partition = 0  1552   1553         self.terms = {}  1554         self.docs = {}  1555   1556         self.doc_counter = 0  1557   1558     def add_document(self, doc):  1559   1560         """  1561         Add the given document 'doc', updating the document counter and flushing  1562         terms and fields if appropriate.  1563         """  1564   1565         for term, positions in doc.terms.items():  1566             self.terms.setdefault(term, {})[doc.docnum] = positions  1567   1568         self.docs[doc.docnum] = doc.fields  1569   1570         self.doc_counter += 1  1571         if self.flush_interval and self.doc_counter >= self.flush_interval:  1572             self.flush_terms()  1573             self.flush_fields()  1574             self.doc_counter = 0  1575   1576     def get_term_writer(self):  1577   1578         "Return a term dictionary writer for the current partition."  1579   1580         return get_term_writer(self.pathname, self.dict_partition, self.interval, self.doc_interval)  1581   1582     def get_field_writer(self):  1583   1584         "Return a field dictionary writer for the current partition."  1585   1586         return get_field_writer(self.pathname, self.field_dict_partition, self.interval)  1587   1588     def flush_terms(self):  1589   1590         "Flush terms into the current term dictionary partition."  1591   1592         # Get the terms in order.  1593   1594         all_terms = self.terms  1595         terms = all_terms.keys()  1596         terms.sort()  1597   1598         dict_writer = self.get_term_writer()  1599   1600         for term in terms:  1601             doc_positions = all_terms[term].items()  1602             dict_writer.write_term_positions(term, doc_positions)  1603   1604         dict_writer.close()  1605   1606         self.terms = {}  1607         self.dict_partition += 1  1608   1609     def flush_fields(self):  1610   1611         "Flush fields into the current term dictionary partition."  1612   1613         # Get the documents in order.  1614   1615         docs = self.docs.items()  1616         docs.sort()  1617   1618         field_dict_writer = self.get_field_writer()  1619   1620         for docnum, fields in docs:  1621             field_dict_writer.write_fields(docnum, fields)  1622   1623         field_dict_writer.close()  1624   1625         self.docs = {}  1626         self.field_dict_partition += 1  1627   1628     def close(self):  1629         if self.terms:  1630             self.flush_terms()  1631         if self.docs:  1632             self.flush_fields()  1633   1634 class IndexReader:  1635   1636     "Accessing the term and field dictionaries."  1637   1638     def __init__(self, pathname):  1639         self.dict_reader = get_term_reader(pathname, "merged")  1640         self.field_dict_reader = get_field_reader(pathname, "merged")  1641   1642     def find_terms(self, term):  1643         return self.dict_reader.find_terms(term)  1644   1645     def find_positions(self, term):  1646         return self.dict_reader.find_positions(term)  1647   1648     def get_frequency(self, term):  1649         return self.dict_reader.get_frequency(term)  1650   1651     def get_document_frequency(self, term):  1652         return self.dict_reader.get_document_frequency(term)  1653   1654     def get_fields(self, docnum):  1655         return self.field_dict_reader.get_fields(docnum)  1656   1657     def close(self):  1658         self.dict_reader.close()  1659         self.field_dict_reader.close()  1660   1661 class Index:  1662   1663     "An inverted index solution encapsulating the various components."  1664   1665     def __init__(self, pathname):  1666         self.pathname = pathname  1667         self.reader = None  1668         self.writer = None  1669   1670     def get_writer(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL, flush_interval=FLUSH_INTERVAL):  1671   1672         """  1673         Return a writer, optionally using the given indexing 'interval',  1674         'doc_interval' and 'flush_interval'.  1675         """  1676   1677         if not exists(self.pathname):  1678             mkdir(self.pathname)  1679   1680         self.writer = IndexWriter(self.pathname, interval, doc_interval, flush_interval)  1681         return self.writer  1682   1683     def get_reader(self, partition=0):  1684   1685         "Return a reader for the index."  1686   1687         # Ensure that only one partition exists.  1688   1689         self.merge()  1690         return self._get_reader(partition)  1691   1692     def _get_reader(self, partition):  1693   1694         "Return a reader for the index."  1695   1696         if not exists(self.pathname):  1697             raise OSError, "Index path %r does not exist." % self.pathname  1698   1699         self.reader = IndexReader(self.pathname)  1700         return self.reader  1701   1702     def merge(self):  1703   1704         "Merge/optimise index partitions."  1705   1706         self.merge_terms()  1707         self.merge_fields()  1708   1709     def merge_terms(self, interval=TERM_INTERVAL, doc_interval=DOCUMENT_INTERVAL):  1710   1711         """  1712         Merge term dictionaries using the given indexing 'interval' and  1713         'doc_interval'.  1714         """  1715   1716         readers = []  1717         partitions = set()  1718   1719         for filename in listdir(self.pathname):  1720             if filename.startswith("terms-"): # 6 character prefix  1721                 partition = filename[6:]  1722                 readers.append(get_term_reader(self.pathname, partition))  1723                 partitions.add(partition)  1724   1725         # Write directly to a dictionary.  1726   1727         if len(readers) > 1:  1728             if "merged" in partitions:  1729                 rename_term_files(self.pathname, "merged", "old-merged")  1730                 partitions.remove("merged")  1731                 partitions.add("old-merged")  1732   1733             writer = get_term_writer(self.pathname, "merged", interval, doc_interval)  1734             merger = TermDictionaryMerger(writer, readers)  1735             merger.merge()  1736             merger.close()  1737   1738             # Remove old files.  1739   1740             for partition in partitions:  1741                 remove_term_files(self.pathname, partition)  1742   1743         elif len(readers) == 1:  1744             partition = list(partitions)[0]  1745             if partition != "merged":  1746                 rename_term_files(self.pathname, partition, "merged")  1747   1748     def merge_fields(self, interval=FIELD_INTERVAL):  1749   1750         "Merge field dictionaries using the given indexing 'interval'."  1751   1752         readers = []  1753         partitions = set()  1754   1755         for filename in listdir(self.pathname):  1756             if filename.startswith("fields-"): # 7 character prefix  1757                 partition = filename[7:]  1758                 readers.append(get_field_reader(self.pathname, partition))  1759                 partitions.add(partition)  1760   1761         # Write directly to a dictionary.  1762   1763         if len(readers) > 1:  1764             if "merged" in partitions:  1765                 rename_field_files(self.pathname, "merged", "old-merged")  1766                 partitions.remove("merged")  1767                 partitions.add("old-merged")  1768   1769             writer = get_field_writer(self.pathname, "merged", interval)  1770             merger = FieldDictionaryMerger(writer, readers)  1771             merger.merge()  1772             merger.close()  1773   1774             # Remove old files.  1775   1776             for partition in partitions:  1777                 remove_field_files(self.pathname, partition)  1778   1779         elif len(readers) == 1:  1780             partition = list(partitions)[0]  1781             if partition != "merged":  1782                 rename_field_files(self.pathname, partition, "merged")  1783   1784     def close(self):  1785         if self.reader is not None:  1786             self.reader.close()  1787             self.reader = None  1788         if self.writer is not None:  1789             self.writer.close()  1790             self.writer = None  1791   1792 # vim: tabstop=4 expandtab shiftwidth=4