iixr (file iixr/positions.py at fad9698e2c46)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing position information.     5      6 Copyright (C) 2009 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.files import *    22 from iixr.data import vint    23     24 class PositionWriter(FileWriter):    25     26     "Writing position information to files."    27     28     def reset(self):    29         self.last_docnum = 0    30     31     def write_positions(self, docnum, positions):    32     33         """    34         Write for the document 'docnum' the given 'positions'.    35         Return the offset of the written record.    36         """    37     38         if docnum < self.last_docnum:    39             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)    40     41         # Record the offset of this record.    42     43         offset = self.tell()    44     45         # Make sure that the positions are sorted.    46     47         positions.sort()    48     49         # Write the position deltas.    50     51         output = []    52         last = 0    53     54         for position in positions:    55             output.append(vint(position - last))    56             last = position    57     58         # Write the document number delta.    59         # Write the number of positions.    60         # Then write the positions.    61     62         self.write(vint(docnum - self.last_docnum) + vint(len(positions)) + "".join(output))    63     64         self.last_docnum = docnum    65         return offset    66     67 class PositionOpener(FileOpener):    68     69     "Reading position information from files."    70     71     def read_term_positions(self, offset, count):    72     73         """    74         Read all positions from 'offset', seeking to that position in the file    75         before reading. The number of documents available for reading is limited    76         to 'count'.    77         """    78     79         # Duplicate the file handle.    80     81         f = self.open("rb")    82         return PositionIterator(f, offset, count)    83     84 class PositionIndexWriter(FileWriter):    85     86     "Writing position index information to files."    87     88     def reset(self):    89         self.last_docnum = 0    90         self.last_pos_offset = 0    91     92     def write_positions(self, docnum, pos_offset, count):    93     94         """    95         Write the given 'docnum, 'pos_offset' and document 'count' to the    96         position index file.    97         """    98     99         # Record the offset of this record.   100    101         offset = self.tell()   102         output = []   103    104         # Write the document number delta.   105    106         output.append(vint(docnum - self.last_docnum))   107         self.last_docnum = docnum   108    109         # Write the position file offset delta.   110    111         output.append(vint(pos_offset - self.last_pos_offset))   112         self.last_pos_offset = pos_offset   113    114         # Write the document count.   115    116         output.append(vint(count))   117    118         # Actually write the data.   119    120         self.write("".join(output))   121    122         return offset   123    124 class PositionIndexOpener(FileOpener):   125    126     "Reading position index information from files."   127    128     def read_term_positions(self, offset, doc_frequency):   129    130         """   131         Read all positions from 'offset', seeking to that position in the file   132         before reading. The number of documents available for reading is limited   133         to 'doc_frequency'.   134         """   135    136         # Duplicate the file handle.   137    138         f = self.open("rb")   139         return PositionIndexIterator(f, offset, doc_frequency)   140    141 # Iterators for position-related files.   142    143 class IteratorBase:   144    145     def __init__(self, count):   146         self.replenish(count)   147    148     def replenish(self, count):   149         self.count = count   150         self.read_documents = 0   151    152     def __len__(self):   153         return self.count   154    155     def sort(self):   156         pass # Stored document positions are already sorted.   157    158     def __iter__(self):   159         return self   160    161 class PositionIterator(FileReader, IteratorBase):   162    163     "Iterating over document positions."   164    165     def __init__(self, f, offset, count):   166         FileReader.__init__(self, f)   167         IteratorBase.__init__(self, count)   168         self.seek(offset)   169    170     def reset(self):   171         self.last_docnum = 0   172    173     def read_positions(self):   174    175         "Read positions, returning a document number and a list of positions."   176    177         # Read the document number delta and add it to the last number.   178    179         self.last_docnum += self.read_number()   180    181         # Read the number of positions.   182    183         npositions = self.read_number()   184    185         # Read the position deltas, adding each previous position to get the   186         # appropriate collection of absolute positions.   187    188         i = 0   189         last = 0   190         positions = []   191    192         while i < npositions:   193             last += self.read_number()   194             positions.append(last)   195             i += 1   196    197         return self.last_docnum, positions   198    199     def next(self):   200    201         "Read positions for a single document."   202    203         if self.read_documents < self.count:   204             self.read_documents += 1   205             return self.read_positions()   206         else:   207             raise StopIteration   208    209 class PositionIndexIterator(FileReader, IteratorBase):   210    211     "Iterating over document positions."   212    213     def __init__(self, f, offset, count):   214         FileReader.__init__(self, f)   215         IteratorBase.__init__(self, count)   216         self.seek(offset)   217         self.section_count = 0   218    219     def reset(self):   220         self.last_docnum = 0   221         self.last_pos_offset = 0   222    223     def read_positions(self):   224    225         """   226         Read a document number, a position file offset for the position index   227         file, and the number of documents in a section of that file.   228         """   229    230         # Read the document number delta.   231    232         self.last_docnum += self.read_number()   233    234         # Read the offset delta.   235    236         self.last_pos_offset += self.read_number()   237    238         # Read the document count.   239    240         count = self.read_number()   241    242         return self.last_docnum, self.last_pos_offset, count   243    244     def next(self):   245    246         "Read positions for a single document."   247    248         self.read_documents += self.section_count   249         if self.read_documents < self.count:   250             docnum, pos_offset, self.section_count = t = self.read_positions()   251             return t   252         else:   253             raise StopIteration   254    255 class PositionDictionaryWriter:   256    257     "Writing position dictionaries."   258    259     def __init__(self, position_writer, position_index_writer, interval):   260         self.position_writer = position_writer   261         self.position_index_writer = position_index_writer   262         self.interval = interval   263    264     def write_term_positions(self, doc_positions):   265    266         """   267         Write all 'doc_positions' - a collection of tuples of the form (document   268         number, position list) - to the file.   269    270         Add some records to the index, making dictionary entries.   271    272         Return a tuple containing the offset of the written data, the frequency   273         (number of positions), and document frequency (number of documents) for   274         the term involved.   275         """   276    277         # Reset the writers.   278    279         self.position_writer.reset()   280         self.position_index_writer.reset()   281    282         index_offset = None   283    284         # Write the positions.   285    286         frequency = 0   287         first_docnum = None   288         first_offset = None   289         count = 0   290    291         doc_positions.sort()   292    293         for docnum, positions in doc_positions:   294             pos_offset = self.position_writer.write_positions(docnum, positions)   295    296             # Retain the first record offset for a subsequent index entry.   297    298             if first_offset is None:   299                 first_offset = pos_offset   300                 first_docnum = docnum   301    302             frequency += len(positions)   303             count += 1   304    305             # Every {interval} entries, write an index entry.   306    307             if count % self.interval == 0:   308                 io = self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   309    310                 # Remember the first index entry offset.   311    312                 if index_offset is None:   313                     index_offset = io   314    315                 first_offset = None   316                 first_docnum = None   317    318                 # Reset the position writer so that position readers accessing   319                 # a section start with the correct document number.   320    321                 self.position_writer.reset()   322    323         # Finish writing an index entry for the remaining documents.   324    325         else:   326             if first_offset is not None:   327                 io = self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   328    329                 # Remember the first index entry offset.   330    331                 if index_offset is None:   332                     index_offset = io   333    334         return index_offset, frequency, count   335    336     def close(self):   337         self.position_writer.close()   338         self.position_index_writer.close()   339    340 class PositionDictionaryReader:   341    342     "Reading position dictionaries."   343    344     def __init__(self, position_opener, position_index_opener):   345         self.position_opener = position_opener   346         self.position_index_opener = position_index_opener   347    348     def read_term_positions(self, offset, doc_frequency):   349    350         """   351         Return an iterator for dictionary entries starting at 'offset' with the   352         given 'doc_frequency'.   353         """   354    355         return PositionDictionaryIterator(self.position_opener,   356             self.position_index_opener, offset, doc_frequency)   357    358     def close(self):   359         pass   360    361 class PositionDictionaryIterator:   362    363     "Iteration over position dictionary entries."   364    365     def __init__(self, position_opener, position_index_opener, offset, doc_frequency):   366         self.position_opener = position_opener   367         self.doc_frequency = doc_frequency   368         self.index_iterator = position_index_opener.read_term_positions(offset, doc_frequency)   369         self.iterator = None   370    371         # Remember the last values.   372    373         self.found_docnum, self.found_positions = None, None   374    375         # Maintain state for the next index entry, if read.   376    377         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   378    379         # Initialise the current index entry and current position file iterator.   380    381         self._next_section()   382         self._init_section()   383    384     # Sequence methods.   385    386     def __len__(self):   387         return self.doc_frequency   388    389     def sort(self):   390         pass   391    392     # Iterator methods.   393    394     def __iter__(self):   395         return self   396    397     def next(self):   398    399         """   400         Attempt to get the next document record from the section in the   401         positions file.   402         """   403    404         # Return any visited but unrequested record.   405    406         if self.found_docnum is not None:   407             t = self.found_docnum, self.found_positions   408             self.found_docnum, self.found_positions = None, None   409             return t   410    411         # Or search for the next record.   412    413         while 1:   414    415             # Either return the next record.   416    417             try:   418                 return self.iterator.next()   419    420             # Or, where a section is finished, get the next section and try again.   421    422             except StopIteration:   423    424                 # Where a section follows, update the index iterator, but keep   425                 # reading using the same file iterator (since the data should   426                 # just follow on from the last section).   427    428                 self._next_section()   429                 self.iterator.replenish(self.section_count)   430    431                 # Reset the state of the iterator to make sure that document   432                 # numbers are correct.   433    434                 self.iterator.reset()   435    436     def from_document(self, docnum):   437    438         """   439         Attempt to navigate to a positions entry for the given 'docnum',   440         returning the positions for 'docnum', or None otherwise.   441         """   442    443         # Return any unrequested document positions.   444    445         if docnum == self.found_docnum:   446             return self.found_positions   447    448         # Read ahead in the index until the next entry refers to a document   449         # later than the desired document.   450    451         try:   452             if self.next_docnum is None:   453                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()   454    455             # Read until the next entry is after the desired document number,   456             # or until the end of the results.   457    458             while self.next_docnum <= docnum:   459                 self._next_read_section()   460                 if self.docnum < docnum:   461                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.index_iterator.next()   462                 else:   463                     break   464    465         except StopIteration:   466             pass   467    468         # Navigate in the position file to the document.   469    470         self._init_section()   471    472         try:   473             while 1:   474                 found_docnum, found_positions = self.iterator.next()   475    476                 # Return the desired document positions or None (retaining the   477                 # positions for the document immediately after).   478    479                 if docnum == found_docnum:   480                     return found_positions   481                 elif docnum < found_docnum:   482                     self.found_docnum, self.found_positions = found_docnum, found_positions   483                     return None   484    485         except StopIteration:   486             return None   487    488     # Internal methods.   489    490     def _next_section(self):   491    492         "Attempt to get the next section in the index."   493    494         if self.next_docnum is None:   495             self.docnum, self.pos_offset, self.section_count = self.index_iterator.next()   496         else:   497             self._next_read_section()   498    499     def _next_read_section(self):   500    501         """   502         Make the next index entry the current one without reading from the   503         index.   504         """   505    506         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   507         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   508    509     def _init_section(self):   510    511         "Initialise the iterator for the section in the position file."   512    513         if self.iterator is not None:   514             self.iterator.close()   515         self.iterator = self.position_opener.read_term_positions(self.pos_offset, self.section_count)   516    517     def close(self):   518         if self.iterator is not None:   519             self.iterator.close()   520             self.iterator = None   521         if self.index_iterator is not None:   522             self.index_iterator.close()   523             self.index_iterator = None   524    525 # vim: tabstop=4 expandtab shiftwidth=4