iixr (file iixr/positions.py at 1077b05c9b76)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing position information.     5      6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.files import *    22 from iixr.data import vint, vint_to_array    23 from array import array    24     25 class PositionWriter(FileWriter):    26     27     "Writing position information to files."    28     29     def reset(self):    30         self.last_docnum = 0    31     32     def write_positions(self, docnum, positions):    33     34         """    35         Write for the document 'docnum' the given 'positions'.    36         """    37     38         if docnum < self.last_docnum:    39             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)    40     41         # Make sure that the positions are sorted.    42     43         positions.sort()    44     45         # Write the document number delta.    46         # Write the number of positions.    47     48         output = array('B')    49         vint_to_array(docnum - self.last_docnum, output)    50         vint_to_array(len(positions), output)    51     52         # Write the position deltas.    53     54         last = 0    55     56         for position in positions:    57             vint_to_array(position - last, output)    58             last = position    59     60         output.tofile(self.f)    61     62         self.last_docnum = docnum    63     64 class PositionReader(FileReader):    65     66     "Reading position information within term-specific regions of a file."    67     68     def reset(self):    69         self.last_docnum = 0    70     71     def read_positions(self):    72     73         "Read positions, returning a document number and a list of positions."    74     75         # Read the document number delta and add it to the last number.    76     77         self.last_docnum += self.read_number()    78     79         # Read the number of positions.    80     81         npositions = self.read_number()    82     83         # Read the position deltas, adding each previous position to get the    84         # appropriate collection of absolute positions.    85     86         i = 0    87         last = 0    88         positions = []    89     90         while i < npositions:    91             last += self.read_number()    92             positions.append(last)    93             i += 1    94     95         return self.last_docnum, positions    96     97 class PositionIndexWriter(FileWriter):    98     99     "Writing position index information to files."   100    101     def reset(self):   102         self.last_docnum = 0   103         self.last_pos_offset = 0   104    105     def write_positions(self, docnum, pos_offset, count):   106    107         """   108         Write the given 'docnum, 'pos_offset' and document 'count' to the   109         position index file.   110         """   111    112         # Write the document number delta.   113         # Write the position file offset delta.   114         # Write the document count.   115    116         output = array('B')   117         vint_to_array(docnum - self.last_docnum, output)   118         vint_to_array(pos_offset - self.last_pos_offset, output)   119         vint_to_array(count, output)   120    121         # Actually write the data.   122    123         output.tofile(self.f)   124    125         self.last_pos_offset = pos_offset   126         self.last_docnum = docnum   127    128 class PositionIndexReader(FileReader):   129    130     "Reading position index information within term-specific regions of a file."   131    132     def reset(self):   133         self.last_docnum = 0   134         self.last_pos_offset = 0   135    136     def read_positions(self):   137    138         """   139         Read a document number, a position file offset for the position index   140         file, and the number of documents in a section of that file.   141         """   142    143         # Read the document number delta.   144    145         self.last_docnum += self.read_number()   146    147         # Read the offset delta.   148    149         self.last_pos_offset += self.read_number()   150    151         # Read the document count.   152    153         count = self.read_number()   154    155         return self.last_docnum, self.last_pos_offset, count   156    157 # Iterators for position-related files.   158    159 class IteratorBase:   160    161     "Support for iterating over results."   162    163     def __init__(self, reader):   164    165         "Initialise the iterator using the given 'reader'."   166    167         self.reader = reader   168         self.replenish(0) # no iteration initially permitted   169    170     def replenish(self, count):   171    172         "Replenish the iterator with 'count' results."   173    174         self.count = count   175         self.read_documents = 0   176    177     def __len__(self):   178    179         "Return the total number of results."   180    181         return self.count   182    183     def sort(self):   184         pass # Stored document positions are already sorted.   185    186     def __iter__(self):   187         return self   188    189 class PositionIterator(IteratorBase):   190    191     "Iterating over document positions."   192    193     def replenish(self, count):   194         IteratorBase.replenish(self, count)   195    196         # Fill a cache of positions.   197    198         self.cache = []   199         n = 0   200    201         while n < self.count:   202             self.cache.append(self.reader.read_positions())   203             n += 1   204    205     def seek(self, offset, count):   206    207         """   208         Seek to 'offset' in the file, limiting the number of documents available   209         for reading to 'count'.   210         """   211    212         self.reader.seek(offset)   213         self.replenish(count)   214    215     def next(self):   216    217         "Read positions for a single document."   218    219         if self.read_documents < self.count:   220             positions = self.cache[self.read_documents]   221             self.read_documents += 1   222             return positions   223         else:   224             raise StopIteration   225    226 class PositionIndexIterator(IteratorBase):   227    228     "Iterating over document positions."   229    230     def replenish(self, count):   231         IteratorBase.replenish(self, count)   232    233         # Fill a cache of offsets.   234    235         self.cache = []   236         self.current = 0   237         n = 0   238    239         while n < self.count:   240             docnum, pos_offset, section_count = t = self.reader.read_positions()   241             self.cache.append(t)   242             n += section_count   243    244     def seek(self, offset, doc_frequency):   245    246         """   247         Seek to 'offset' in the file, limiting the number of documents available   248         for reading to 'doc_frequency'.   249         """   250    251         self.reader.seek(offset)   252         self.replenish(doc_frequency)   253    254     def next(self):   255    256         "Read positions for a single document."   257    258         if self.current < len(self.cache):   259             docnum, pos_offset, self.section_count = t = self.cache[self.current]   260             self.current += 1   261             return t   262         else:   263             raise StopIteration   264    265 class PositionDictionaryWriter:   266    267     "Writing position dictionaries."   268    269     def __init__(self, position_writer, position_index_writer, interval):   270         self.position_writer = position_writer   271         self.position_index_writer = position_index_writer   272         self.interval = interval   273    274     def write_term_positions(self, doc_positions):   275    276         """   277         Write all 'doc_positions' - a collection of tuples of the form (document   278         number, position list) - to the file.   279    280         Add some records to the index, making dictionary entries.   281    282         Return a tuple containing the offset of the written data, the frequency   283         (number of positions), and document frequency (number of documents) for   284         the term involved.   285         """   286    287         # Reset the writers.   288    289         self.position_writer.reset()   290         self.position_index_writer.reset()   291    292         # Remember the first index entry offset.   293    294         index_offset = self.position_index_writer.f.tell()   295    296         # Write the positions.   297    298         frequency = 0   299         count = 0   300    301         if doc_positions:   302    303             # Retain the first record offset for a subsequent index entry.   304    305             first_offset = self.position_writer.f.tell()   306             first_docnum = None   307    308             doc_positions.sort()   309    310             for docnum, positions in doc_positions:   311                 if first_docnum is None:   312                     first_docnum = docnum   313    314                 self.position_writer.write_positions(docnum, positions)   315    316                 frequency += len(positions)   317                 count += 1   318    319                 # Every {interval} entries, write an index entry.   320    321                 if count % self.interval == 0:   322    323                     self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   324    325                     first_offset = self.position_writer.f.tell()   326                     first_docnum = None   327    328                     # Reset the position writer so that position readers accessing   329                     # a section start with the correct document number.   330    331                     self.position_writer.reset()   332    333             # Finish writing an index entry for the remaining documents.   334    335             else:   336                 if first_docnum is not None:   337                     self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   338    339         return index_offset, frequency, count   340    341     def close(self):   342         self.position_writer.close()   343         self.position_index_writer.close()   344    345 class PositionDictionaryReader:   346    347     "Access to position dictionary entries through iterators."   348    349     def __init__(self, position_reader, position_index_reader):   350         self.position_reader = position_reader   351         self.position_index_reader = position_index_reader   352    353     def read_term_positions(self, offset, doc_frequency):   354         iterator = PositionDictionaryIterator(   355             PositionIterator(self.position_reader),   356             PositionIndexIterator(self.position_index_reader)   357             )   358         iterator.seek(offset, doc_frequency)   359         return iterator   360    361     def close(self):   362         self.position_reader.close()   363         self.position_index_reader.close()   364    365 class PositionDictionaryIterator:   366    367     "Iteration over position dictionary entries."   368    369     def __init__(self, position_iterator, position_index_iterator):   370         self.position_iterator = position_iterator   371         self.position_index_iterator = position_index_iterator   372         self.reset()   373    374     def reset(self):   375    376         # Remember the last values.   377    378         self.found_docnum, self.found_positions = None, None   379    380         # Maintain state for the next index entry, if read.   381    382         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   383    384     def seek(self, offset, doc_frequency):   385    386         """   387         Seek to 'offset' in the index file, limiting the number of documents   388         available for reading to 'doc_frequency'.   389         """   390    391         self.reset()   392    393         # Seek to the appropriate index entry.   394    395         self.position_index_iterator.seek(offset, doc_frequency)   396    397         # Initialise the current index entry and current position file iterator.   398    399         self._next_section()   400         self._init_section()   401    402     # Sequence methods.   403    404     def __len__(self):   405         return len(self.position_index_iterator)   406    407     def sort(self):   408         pass   409    410     # Iterator methods.   411    412     def __iter__(self):   413         return self   414    415     def next(self):   416    417         """   418         Attempt to get the next document record from the section in the   419         positions file.   420         """   421    422         # Return any visited but unrequested record.   423    424         if self.found_docnum is not None:   425             t = self.found_docnum, self.found_positions   426             self.found_docnum, self.found_positions = None, None   427             return t   428    429         # Or search for the next record.   430    431         while 1:   432    433             # Either return the next record.   434    435             try:   436                 return self.position_iterator.next()   437    438             # Or, where a section is finished, get the next section and try again.   439    440             except StopIteration:   441    442                 # Although, where a single iterator is in use, the file reader   443                 # would be positioned appropriately, this is not guaranteed in a   444                 # multiple iterator situation.   445    446                 self._next_section()   447                 self._init_section()   448    449     def from_document(self, docnum):   450    451         """   452         Attempt to navigate to a positions entry for the given 'docnum',   453         returning the positions for 'docnum', or None otherwise.   454         """   455    456         # Return any unrequested document positions.   457    458         if docnum == self.found_docnum:   459             return self.found_positions   460    461         # Read ahead in the index until the next entry refers to a document   462         # later than the desired document.   463    464         try:   465             if self.next_docnum is None:   466                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   467    468             # Read until the next entry is after the desired document number,   469             # or until the end of the results.   470    471             while self.next_docnum <= docnum:   472                 self._next_read_section()   473                 if self.docnum < docnum:   474                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   475                 else:   476                     break   477    478         except StopIteration:   479             pass   480    481         # Navigate in the position file to the document.   482    483         self._init_section()   484    485         try:   486             while 1:   487                 found_docnum, found_positions = self.position_iterator.next()   488    489                 # Return the desired document positions or None (retaining the   490                 # positions for the document immediately after).   491    492                 if docnum == found_docnum:   493                     return found_positions   494                 elif docnum < found_docnum:   495                     self.found_docnum, self.found_positions = found_docnum, found_positions   496                     return None   497    498         except StopIteration:   499             return None   500    501     # Internal methods.   502    503     def _next_section(self):   504    505         "Attempt to get the next section in the index."   506    507         if self.next_docnum is None:   508             self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()   509         else:   510             self._next_read_section()   511    512     def _next_read_section(self):   513    514         """   515         Make the next index entry the current one without reading from the   516         index.   517         """   518    519         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   520         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   521    522     def _init_section(self):   523    524         "Initialise the iterator for the section in the position file."   525    526         # Seek to the position entry.   527    528         self.position_iterator.seek(self.pos_offset, self.section_count)   529    530 # vim: tabstop=4 expandtab shiftwidth=4