iixr (file iixr/positions.py at fc0e9882717b)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing position information.     5      6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.data import *    22 from iixr.files import *    23     24 class PositionWriter(FileWriter):    25     26     "Writing position information to files."    27     28     def reset(self):    29         self.end_record()    30         self.last_docnum = None    31         self.subtractor = None    32     33     def write_positions(self, docnum, positions):    34     35         """    36         Write for the document 'docnum' the given 'positions'.    37         """    38     39         if not positions:    40             return    41     42         # Make sure that the positions are sorted.    43     44         positions.sort()    45     46         # Calculate an ongoing delta.    47     48         if self.last_docnum is not None:    49             if docnum < self.last_docnum:    50                 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)    51     52             docnum_seq = self.subtractor(docnum, self.last_docnum)    53     54         # Or preserve the document number and prepare for future deltas.    55     56         else:    57             self.subtractor = get_subtractor(docnum)    58             docnum_seq = docnum    59     60         self.write_sequence_value(docnum_seq)    61         self.write_monotonic_sequence(positions)    62     63         self.last_docnum = docnum    64     65 class PositionReader(FileReader):    66     67     "Reading position information within term-specific regions of a file."    68     69     def reset(self):    70         self.last_docnum = None    71         self.adder = None    72         self.begin_record()    73     74     def read_positions(self):    75     76         """    77         Read positions, returning a document number and a list of positions.    78         """    79     80         # Read the document number.    81     82         docnum = self.read_sequence_value()    83     84         # Calculate an ongoing delta.    85     86         if self.last_docnum is not None:    87             self.last_docnum = self.adder(docnum, self.last_docnum)    88     89         # Or preserve the document number and prepare for future deltas.    90     91         else:    92             self.adder = get_adder(docnum)    93             self.last_docnum = docnum    94     95         positions = self.read_monotonic_sequence()    96     97         return self.last_docnum, positions    98     99 class PositionIndexWriter(FileWriter):   100    101     "Writing position index information to files."   102    103     def reset(self):   104         self.end_record()   105         self.last_docnum = None   106         self.subtractor = None   107         self.last_pos_offset = 0   108    109     def write_positions(self, docnum, pos_offset, count):   110    111         """   112         Write the given 'docnum, 'pos_offset' and document 'count' to the   113         position index file.   114         """   115    116         # Find the size of document number values.   117    118         if self.last_docnum is not None:   119             docnum_seq = self.subtractor(docnum, self.last_docnum)   120         else:   121             self.subtractor = get_subtractor(docnum)   122             docnum_seq = docnum   123    124         self.write_sequence_value(docnum_seq)   125         self.write_number(pos_offset - self.last_pos_offset)   126         self.write_number(count)   127    128         self.last_docnum = docnum   129         self.last_pos_offset = pos_offset   130    131 class PositionIndexReader(FileReader):   132    133     "Reading position index information within term-specific regions of a file."   134    135     def reset(self):   136         self.last_docnum = None   137         self.adder = None   138         self.last_pos_offset = 0   139         self.begin_record()   140    141     def read_positions(self):   142    143         """   144         Read a document number, a position file offset for the position index   145         file, and the number of documents in a section of that file.   146         """   147    148         # Read the document number.   149    150         docnum = self.read_sequence_value()   151    152         if self.last_docnum is not None:   153             self.last_docnum = self.adder(docnum, self.last_docnum)   154         else:   155             self.adder = get_adder(docnum)   156             self.last_docnum = docnum   157    158         # Read the offset delta.   159    160         self.last_pos_offset += self.read_number()   161    162         # Read the document count.   163    164         count = self.read_number()   165    166         return self.last_docnum, self.last_pos_offset, count   167    168 # Iterators for position-related files.   169    170 class IteratorBase:   171    172     "Support for iterating over results."   173    174     def __init__(self, reader):   175    176         "Initialise the iterator using the given 'reader'."   177    178         self.reader = reader   179         self.replenish(0) # no iteration initially permitted   180    181     def replenish(self, count):   182    183         "Replenish the iterator with 'count' results."   184    185         self.count = count   186         self.read_documents = 0   187    188     def __len__(self):   189    190         "Return the total number of results."   191    192         return self.count   193    194     def sort(self):   195         pass # Stored document positions are already sorted.   196    197     def __iter__(self):   198         return self   199    200 class PositionIterator(IteratorBase):   201    202     "Iterating over document positions."   203    204     def replenish(self, count):   205         IteratorBase.replenish(self, count)   206    207         # Fill a cache of positions.   208    209         self.cache = []   210         n = 0   211    212         while n < self.count:   213             self.cache.append(self.reader.read_positions())   214             n += 1   215    216     def seek(self, offset, count):   217    218         """   219         Seek to 'offset' in the file, limiting the number of documents available   220         for reading to 'count'.   221         """   222    223         self.reader.seek(offset)   224         self.replenish(count)   225    226     def next(self):   227    228         "Read positions for a single document."   229    230         if self.read_documents < self.count:   231             positions = self.cache[self.read_documents]   232             self.read_documents += 1   233             return positions   234         else:   235             raise StopIteration   236    237 class PositionIndexIterator(IteratorBase):   238    239     "Iterating over document positions."   240    241     def replenish(self, count):   242         IteratorBase.replenish(self, count)   243    244         # Fill a cache of offsets.   245    246         self.cache = []   247         self.current = 0   248         n = 0   249    250         while n < self.count:   251             docnum, pos_offset, section_count = t = self.reader.read_positions()   252             self.cache.append(t)   253             n += section_count   254    255     def seek(self, offset, doc_frequency):   256    257         """   258         Seek to 'offset' in the file, limiting the number of documents available   259         for reading to 'doc_frequency'.   260         """   261    262         self.reader.seek(offset)   263         self.replenish(doc_frequency)   264    265     def next(self):   266    267         "Read positions for a single document."   268    269         if self.current < len(self.cache):   270             docnum, pos_offset, self.section_count = t = self.cache[self.current]   271             self.current += 1   272             return t   273         else:   274             raise StopIteration   275    276 class PositionDictionaryWriter:   277    278     "Writing position dictionaries."   279    280     def __init__(self, position_writer, position_index_writer, interval):   281         self.position_writer = position_writer   282         self.position_index_writer = position_index_writer   283         self.interval = interval   284    285     def write_term_positions(self, doc_positions):   286    287         """   288         Write all 'doc_positions' - a collection of tuples of the form (document   289         number, position list) - to the file.   290    291         Add some records to the index, making dictionary entries.   292    293         Return a tuple containing the offset of the written data, the frequency   294         (number of positions), and document frequency (number of documents) for   295         the term involved.   296         """   297    298         # Reset the writers.   299    300         self.position_writer.reset()   301         self.position_index_writer.reset()   302    303         # Remember the first index entry offset.   304    305         index_offset = self.position_index_writer.tell()   306    307         # Write the positions.   308    309         frequency = 0   310         count = 0   311    312         if doc_positions:   313    314             # Retain the first record offset for a subsequent index entry.   315    316             first_offset = self.position_writer.tell()   317             first_docnum = None   318    319             doc_positions.sort()   320    321             for docnum, positions in doc_positions:   322                 if first_docnum is None:   323                     first_docnum = docnum   324    325                 self.position_writer.write_positions(docnum, positions)   326    327                 frequency += len(positions)   328                 count += 1   329    330                 # Every {interval} entries, write an index entry.   331    332                 if count % self.interval == 0:   333    334                     self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   335    336                     # Reset the position writer so that position readers accessing   337                     # a section start with the correct document number.   338    339                     self.position_writer.reset()   340    341                     first_offset = self.position_writer.tell()   342                     first_docnum = None   343    344             # Finish writing an index entry for the remaining documents.   345    346             else:   347                 if first_docnum is not None:   348                     self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   349    350         return index_offset, frequency, count   351    352     def close(self):   353         self.position_writer.close()   354         self.position_index_writer.close()   355    356 class PositionDictionaryReader:   357    358     "Access to position dictionary entries through iterators."   359    360     def __init__(self, position_reader, position_index_reader):   361         self.position_reader = position_reader   362         self.position_index_reader = position_index_reader   363    364     def read_term_positions(self, offset, doc_frequency):   365         iterator = PositionDictionaryIterator(   366             PositionIterator(self.position_reader),   367             PositionIndexIterator(self.position_index_reader)   368             )   369         iterator.seek(offset, doc_frequency)   370         return iterator   371    372     def close(self):   373         self.position_reader.close()   374         self.position_index_reader.close()   375    376 class PositionDictionaryIterator:   377    378     "Iteration over position dictionary entries."   379    380     def __init__(self, position_iterator, position_index_iterator):   381         self.position_iterator = position_iterator   382         self.position_index_iterator = position_index_iterator   383         self.reset()   384    385     def reset(self):   386    387         # Remember the last values.   388    389         self.found_docnum, self.found_positions = None, None   390    391         # Maintain state for the next index entry, if read.   392    393         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   394    395     def seek(self, offset, doc_frequency):   396    397         """   398         Seek to 'offset' in the index file, limiting the number of documents   399         available for reading to 'doc_frequency'.   400         """   401    402         self.reset()   403    404         # Seek to the appropriate index entry.   405    406         self.position_index_iterator.seek(offset, doc_frequency)   407    408         # Initialise the current index entry and current position file iterator.   409    410         self._next_section()   411         self._init_section()   412    413     # Sequence methods.   414    415     def __len__(self):   416         return len(self.position_index_iterator)   417    418     def sort(self):   419         pass   420    421     # Iterator methods.   422    423     def __iter__(self):   424         return self   425    426     def next(self):   427    428         """   429         Attempt to get the next document record from the section in the   430         positions file.   431         """   432    433         # Return any visited but unrequested record.   434    435         if self.found_docnum is not None:   436             t = self.found_docnum, self.found_positions   437             self.found_docnum, self.found_positions = None, None   438             return t   439    440         # Or search for the next record.   441    442         while 1:   443    444             # Either return the next record.   445    446             try:   447                 return self.position_iterator.next()   448    449             # Or, where a section is finished, get the next section and try again.   450    451             except StopIteration:   452    453                 # Although, where a single iterator is in use, the file reader   454                 # would be positioned appropriately, this is not guaranteed in a   455                 # multiple iterator situation.   456    457                 self._next_section()   458                 self._init_section()   459    460     def from_document(self, docnum):   461    462         """   463         Attempt to navigate to a positions entry for the given 'docnum',   464         returning the positions for 'docnum', or None otherwise.   465         """   466    467         # Return any unrequested document positions.   468    469         if docnum == self.found_docnum:   470             return self.found_positions   471    472         # Read ahead in the index until the next entry refers to a document   473         # later than the desired document.   474    475         try:   476             if self.next_docnum is None:   477                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   478    479             # Read until the next entry is after the desired document number,   480             # or until the end of the results.   481    482             while self.next_docnum <= docnum:   483                 self._next_read_section()   484                 if self.docnum < docnum:   485                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   486                 else:   487                     break   488    489         except StopIteration:   490             pass   491    492         # Navigate in the position file to the document.   493    494         self._init_section()   495    496         try:   497             while 1:   498                 found_docnum, found_positions = self.position_iterator.next()   499    500                 # Return the desired document positions or None (retaining the   501                 # positions for the document immediately after).   502    503                 if docnum <= found_docnum:   504                     self.found_docnum, self.found_positions = found_docnum, found_positions   505                     if docnum == found_docnum:   506                         return found_positions   507                     elif docnum < found_docnum:   508                         return None   509    510         except StopIteration:   511             return None   512    513     # Internal methods.   514    515     def _next_section(self):   516    517         "Attempt to get the next section in the index."   518    519         if self.next_docnum is None:   520             self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()   521         else:   522             self._next_read_section()   523    524     def _next_read_section(self):   525    526         """   527         Make the next index entry the current one without reading from the   528         index.   529         """   530    531         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   532         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   533    534     def _init_section(self):   535    536         "Initialise the iterator for the section in the position file."   537    538         # Seek to the position entry.   539    540         self.position_iterator.seek(self.pos_offset, self.section_count)   541    542 # vim: tabstop=4 expandtab shiftwidth=4