iixr (file iixr/positions.py at 218fd6522891)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing position information.     5      6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.data import *    22 from iixr.files import *    23     24 class PositionWriter(FileWriter):    25     26     "Writing position information to files."    27     28     def begin(self, docnum_size, position_size):    29         self.write_numbers((docnum_size, position_size))    30         self.end_record()    31         self.data_start = self.tell()    32         self.docnum_size = docnum_size    33         self.position_size = position_size    34     35     def reset(self):    36         self.end_record()    37         self.last_docnum = None    38         self.subtractor = None    39     40     def write_positions(self, docnum, positions):    41     42         """    43         Write for the document 'docnum' the given 'positions'.    44         """    45     46         if not positions:    47             return    48     49         # Make sure that the positions are sorted.    50     51         positions.sort()    52     53         # Calculate an ongoing delta.    54     55         if self.last_docnum is not None:    56             if docnum < self.last_docnum:    57                 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)    58     59             docnum_seq = self.subtractor(docnum, self.last_docnum)    60     61         # Or preserve the document number and prepare for future deltas.    62     63         else:    64             self.subtractor = get_subtractor(docnum)    65             docnum_seq = docnum    66     67         self.write_sequence_value(docnum_seq, self.docnum_size)    68         self.write_monotonic_sequence(positions, self.position_size)    69     70         self.last_docnum = docnum    71     72 class PositionReader(FileReader):    73     74     "Reading position information within term-specific regions of a file."    75     76     def begin(self):    77         self.begin_record()    78         try:    79             self.docnum_size, self.position_size = self.read_numbers(2)    80         except EOFError:    81             self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!    82         self.data_start = self.tell()    83     84     def reset(self):    85         self.last_docnum = None    86         self.adder = None    87         self.begin_record()    88     89     def read_positions(self):    90     91         """    92         Read positions, returning a document number and a list of positions.    93         """    94     95         # Read the document number.    96     97         docnum = self.read_sequence_value(self.docnum_size)    98     99         # Calculate an ongoing delta.   100    101         if self.last_docnum is not None:   102             self.last_docnum = self.adder(docnum, self.last_docnum)   103    104         # Or preserve the document number and prepare for future deltas.   105    106         else:   107             self.adder = get_adder(docnum)   108             self.last_docnum = docnum   109    110         positions = self.read_monotonic_sequence(self.position_size)   111    112         return self.last_docnum, positions   113    114 class PositionIndexWriter(PositionWriter):   115    116     "Writing position index information to files."   117    118     def begin(self, docnum_size):   119         PositionWriter.begin(self, docnum_size, 0)   120    121     def reset(self):   122         PositionWriter.reset(self)   123         self.last_pos_offset = 0   124    125     def write_positions(self, docnum, pos_offset, count):   126    127         """   128         Write the given 'docnum, 'pos_offset' and document 'count' to the   129         position index file.   130         """   131    132         # Find the size of document number values.   133    134         if self.last_docnum is not None:   135             docnum_seq = self.subtractor(docnum, self.last_docnum)   136         else:   137             self.subtractor = get_subtractor(docnum)   138             docnum_seq = docnum   139    140         self.write_sequence_value(docnum_seq, self.docnum_size)   141         self.write_number(pos_offset - self.last_pos_offset)   142         self.write_number(count)   143    144         self.last_docnum = docnum   145         self.last_pos_offset = pos_offset   146    147 class PositionIndexReader(PositionReader):   148    149     "Reading position index information within term-specific regions of a file."   150    151     def reset(self):   152         PositionReader.reset(self)   153         self.last_pos_offset = 0   154    155     def read_positions(self):   156    157         """   158         Read a document number, a position file offset for the position index   159         file, and the number of documents in a section of that file.   160         """   161    162         # Read the document number.   163    164         docnum = self.read_sequence_value(self.docnum_size)   165    166         if self.last_docnum is not None:   167             self.last_docnum = self.adder(docnum, self.last_docnum)   168         else:   169             self.adder = get_adder(docnum)   170             self.last_docnum = docnum   171    172         # Read the offset delta.   173    174         self.last_pos_offset += self.read_number()   175    176         # Read the document count.   177    178         count = self.read_number()   179    180         return self.last_docnum, self.last_pos_offset, count   181    182 # Iterators for position-related files.   183    184 class IteratorBase:   185    186     "Support for iterating over results."   187    188     def __init__(self, reader):   189    190         "Initialise the iterator using the given 'reader'."   191    192         self.reader = reader   193         self.replenish(0) # no iteration initially permitted   194    195     def replenish(self, count):   196    197         "Replenish the iterator with 'count' results."   198    199         self.count = count   200         self.read_documents = 0   201    202     def __len__(self):   203    204         "Return the total number of results."   205    206         return self.count   207    208     def sort(self):   209         pass # Stored document positions are already sorted.   210    211     def __iter__(self):   212         return self   213    214 class PositionIterator(IteratorBase):   215    216     "Iterating over document positions."   217    218     def replenish(self, count):   219         IteratorBase.replenish(self, count)   220    221         # Fill a cache of positions.   222    223         self.cache = []   224         n = 0   225    226         while n < self.count:   227             self.cache.append(self.reader.read_positions())   228             n += 1   229    230     def seek(self, offset, count):   231    232         """   233         Seek to 'offset' in the file, limiting the number of documents available   234         for reading to 'count'.   235         """   236    237         self.reader.seek(offset)   238         self.replenish(count)   239    240     def next(self):   241    242         "Read positions for a single document."   243    244         if self.read_documents < self.count:   245             positions = self.cache[self.read_documents]   246             self.read_documents += 1   247             return positions   248         else:   249             raise StopIteration   250    251 class PositionIndexIterator(IteratorBase):   252    253     "Iterating over document positions."   254    255     def replenish(self, count):   256         IteratorBase.replenish(self, count)   257    258         # Fill a cache of offsets.   259    260         self.cache = []   261         self.current = 0   262         n = 0   263    264         while n < self.count:   265             docnum, pos_offset, section_count = t = self.reader.read_positions()   266             self.cache.append(t)   267             n += section_count   268    269     def seek(self, offset, doc_frequency):   270    271         """   272         Seek to 'offset' in the file, limiting the number of documents available   273         for reading to 'doc_frequency'.   274         """   275    276         self.reader.seek(offset)   277         self.replenish(doc_frequency)   278    279     def next(self):   280    281         "Read positions for a single document."   282    283         if self.current < len(self.cache):   284             docnum, pos_offset, self.section_count = t = self.cache[self.current]   285             self.current += 1   286             return t   287         else:   288             raise StopIteration   289    290 class PositionDictionaryWriter:   291    292     "Writing position dictionaries."   293    294     def __init__(self, position_writer, position_index_writer, interval):   295         self.position_writer = position_writer   296         self.position_index_writer = position_index_writer   297         self.interval = interval   298    299     def write_term_positions(self, doc_positions):   300    301         """   302         Write all 'doc_positions' - a collection of tuples of the form (document   303         number, position list) - to the file.   304    305         Add some records to the index, making dictionary entries.   306    307         Return a tuple containing the offset of the written data, the frequency   308         (number of positions), and document frequency (number of documents) for   309         the term involved.   310         """   311    312         # Write the positions.   313    314         frequency = 0   315         count = 0   316    317         if doc_positions:   318             doc_positions.sort()   319    320             # Look ahead at the first document record.   321             # NOTE: Any iterator would need to support this.   322    323             first_docnum, first_positions = doc_positions[0]   324             first_position = first_positions[0]   325    326             # Write out size details.   327    328             docnum_size, position_size = sizeof(first_docnum), sizeof(first_position)   329             self.position_writer.begin(docnum_size, position_size)   330             self.position_index_writer.begin(docnum_size)   331    332             # Reset the writers.   333    334             self.position_writer.reset()   335             self.position_index_writer.reset()   336    337             # Remember the first index entry offset.   338    339             index_offset = self.position_index_writer.tell()   340    341             # Retain the first record offset for a subsequent index entry.   342    343             first_offset = self.position_writer.tell()   344    345             for docnum, positions in doc_positions:   346                 if first_docnum is None:   347                     first_docnum = docnum   348    349                 self.position_writer.write_positions(docnum, positions)   350    351                 frequency += len(positions)   352                 count += 1   353    354                 # Every {interval} entries, write an index entry.   355    356                 if count % self.interval == 0:   357    358                     self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   359    360                     # Reset the position writer so that position readers accessing   361                     # a section start with the correct document number.   362    363                     self.position_writer.reset()   364    365                     first_offset = self.position_writer.tell()   366                     first_docnum = None   367    368             # Finish writing an index entry for the remaining documents.   369    370             else:   371                 if first_docnum is not None:   372                     self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   373    374         return index_offset, frequency, count   375    376     def close(self):   377         self.position_writer.close()   378         self.position_index_writer.close()   379    380 class PositionDictionaryReader:   381    382     "Access to position dictionary entries through iterators."   383    384     def __init__(self, position_reader, position_index_reader):   385         self.position_reader = position_reader   386         self.position_index_reader = position_index_reader   387    388     def read_term_positions(self, offset, doc_frequency):   389         iterator = PositionDictionaryIterator(   390             PositionIterator(self.position_reader),   391             PositionIndexIterator(self.position_index_reader)   392             )   393         iterator.seek(offset, doc_frequency)   394         return iterator   395    396     def close(self):   397         self.position_reader.close()   398         self.position_index_reader.close()   399    400 class PositionDictionaryIterator:   401    402     "Iteration over position dictionary entries."   403    404     def __init__(self, position_iterator, position_index_iterator):   405         self.position_iterator = position_iterator   406         self.position_index_iterator = position_index_iterator   407         self.reset()   408    409     def reset(self):   410    411         # Remember the last values.   412    413         self.found_docnum, self.found_positions = None, None   414    415         # Maintain state for the next index entry, if read.   416    417         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   418    419     def seek(self, offset, doc_frequency):   420    421         """   422         Seek to 'offset' in the index file, limiting the number of documents   423         available for reading to 'doc_frequency'.   424         """   425    426         self.reset()   427    428         # Seek to the appropriate index entry.   429    430         self.position_index_iterator.seek(offset, doc_frequency)   431    432         # Initialise the current index entry and current position file iterator.   433    434         self._next_section()   435         self._init_section()   436    437     # Sequence methods.   438    439     def __len__(self):   440         return len(self.position_index_iterator)   441    442     def sort(self):   443         pass   444    445     # Iterator methods.   446    447     def __iter__(self):   448         return self   449    450     def next(self):   451    452         """   453         Attempt to get the next document record from the section in the   454         positions file.   455         """   456    457         # Return any visited but unrequested record.   458    459         if self.found_docnum is not None:   460             t = self.found_docnum, self.found_positions   461             self.found_docnum, self.found_positions = None, None   462             return t   463    464         # Or search for the next record.   465    466         while 1:   467    468             # Either return the next record.   469    470             try:   471                 return self.position_iterator.next()   472    473             # Or, where a section is finished, get the next section and try again.   474    475             except StopIteration:   476    477                 # Although, where a single iterator is in use, the file reader   478                 # would be positioned appropriately, this is not guaranteed in a   479                 # multiple iterator situation.   480    481                 self._next_section()   482                 self._init_section()   483    484     def from_document(self, docnum):   485    486         """   487         Attempt to navigate to a positions entry for the given 'docnum',   488         returning the positions for 'docnum', or None otherwise.   489         """   490    491         # Return any unrequested document positions.   492    493         if docnum == self.found_docnum:   494             return self.found_positions   495    496         # Read ahead in the index until the next entry refers to a document   497         # later than the desired document.   498    499         try:   500             if self.next_docnum is None:   501                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   502    503             # Read until the next entry is after the desired document number,   504             # or until the end of the results.   505    506             while self.next_docnum <= docnum:   507                 self._next_read_section()   508                 if self.docnum < docnum:   509                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   510                 else:   511                     break   512    513         except StopIteration:   514             pass   515    516         # Navigate in the position file to the document.   517    518         self._init_section()   519    520         try:   521             while 1:   522                 found_docnum, found_positions = self.position_iterator.next()   523    524                 # Return the desired document positions or None (retaining the   525                 # positions for the document immediately after).   526    527                 if docnum <= found_docnum:   528                     self.found_docnum, self.found_positions = found_docnum, found_positions   529                     if docnum == found_docnum:   530                         return found_positions   531                     elif docnum < found_docnum:   532                         return None   533    534         except StopIteration:   535             return None   536    537     # Internal methods.   538    539     def _next_section(self):   540    541         "Attempt to get the next section in the index."   542    543         if self.next_docnum is None:   544             self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()   545         else:   546             self._next_read_section()   547    548     def _next_read_section(self):   549    550         """   551         Make the next index entry the current one without reading from the   552         index.   553         """   554    555         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   556         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   557    558     def _init_section(self):   559    560         "Initialise the iterator for the section in the position file."   561    562         # Seek to the position entry.   563    564         self.position_iterator.seek(self.pos_offset, self.section_count)   565    566 # vim: tabstop=4 expandtab shiftwidth=4