iixr (file iixr/positions.py at 489129c7f225)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing position information.     5      6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.files import *    22 from iixr.data import vint, vint_to_array    23 from array import array    24     25 class PositionWriter(FileWriter):    26     27     "Writing position information to files."    28     29     def reset(self):    30         self.last_docnum = None    31         self.docnum_size = None    32     33     def write_positions(self, docnum, positions):    34     35         """    36         Write for the document 'docnum' the given 'positions'.    37         """    38     39         # Find the size of document number values.    40     41         if self.docnum_size is None:    42             self.docnum_size = self.get_value_size(docnum)    43             self.last_docnum = self.get_initial_value(self.docnum_size)    44     45         if docnum < self.last_docnum:    46             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)    47     48         # Make sure that the positions are sorted.    49     50         positions.sort()    51     52         # Find the size of position values.    53     54         size = self.get_value_size(positions[0])    55     56         # Write the number of values per document number.    57         # Write the document number delta.    58         # Write the number of positions.    59         # Write the number of values per position.    60     61         output = array('B')    62         vint_to_array(self.docnum_size, output)    63         self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)    64         vint_to_array(len(positions), output)    65         vint_to_array(size, output)    66     67         # Write the position deltas.    68     69         last = self.get_initial_value(size)    70     71         for position in positions:    72             last = self.write_sequence(output, position, last, size)    73     74         output.tofile(self.f)    75     76         self.last_docnum = docnum    77     78 class PositionReader(FileReader):    79     80     "Reading position information within term-specific regions of a file."    81     82     def reset(self):    83         self.last_docnum = None    84     85     def read_positions(self):    86     87         """    88         Read positions, returning a document number and a list of positions.    89         """    90     91         # Read the number of values per document number.    92     93         docnum_size = self.read_number()    94     95         if self.last_docnum is None:    96             self.last_docnum = self.get_initial_value(docnum_size)    97     98         # Read the document number delta and add it to the last number.    99    100         self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)   101    102         # Read the number of positions.   103    104         npositions = self.read_number()   105    106         # Read the number of values per position.   107    108         size = self.read_number()   109    110         # Read the position deltas, adding each previous position to get the   111         # appropriate collection of absolute positions.   112    113         i = 0   114    115         last = self.get_initial_value(size)   116    117         positions = []   118    119         while i < npositions:   120             last = self.read_sequence(last, size)   121             positions.append(last)   122             i += 1   123    124         return self.last_docnum, positions   125    126 class PositionIndexWriter(FileWriter):   127    128     "Writing position index information to files."   129    130     def reset(self):   131         self.last_docnum = None   132         self.docnum_size = None   133         self.last_pos_offset = 0   134    135     def write_positions(self, docnum, pos_offset, count):   136    137         """   138         Write the given 'docnum, 'pos_offset' and document 'count' to the   139         position index file.   140         """   141    142         # Find the size of document number values.   143    144         if self.docnum_size is None:   145             self.docnum_size = self.get_value_size(docnum)   146             self.last_docnum = self.get_initial_value(self.docnum_size)   147    148         # Write the number of values per document number.   149         # Write the document number delta.   150         # Write the position file offset delta.   151         # Write the document count.   152    153         output = array('B')   154         vint_to_array(self.docnum_size, output)   155         self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)   156         vint_to_array(pos_offset - self.last_pos_offset, output)   157         vint_to_array(count, output)   158    159         # Actually write the data.   160    161         output.tofile(self.f)   162    163         self.last_pos_offset = pos_offset   164    165 class PositionIndexReader(FileReader):   166    167     "Reading position index information within term-specific regions of a file."   168    169     def reset(self):   170         self.last_docnum = None   171         self.last_pos_offset = 0   172    173     def read_positions(self):   174    175         """   176         Read a document number, a position file offset for the position index   177         file, and the number of documents in a section of that file.   178         """   179    180         # Read the number of values per document number.   181    182         docnum_size = self.read_number()   183    184         if self.last_docnum is None:   185             self.last_docnum = self.get_initial_value(docnum_size)   186    187         # Read the document number delta and add it to the last number.   188    189         self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)   190    191         # Read the offset delta.   192    193         self.last_pos_offset += self.read_number()   194    195         # Read the document count.   196    197         count = self.read_number()   198    199         return self.last_docnum, self.last_pos_offset, count   200    201 # Iterators for position-related files.   202    203 class IteratorBase:   204    205     "Support for iterating over results."   206    207     def __init__(self, reader):   208    209         "Initialise the iterator using the given 'reader'."   210    211         self.reader = reader   212         self.replenish(0) # no iteration initially permitted   213    214     def replenish(self, count):   215    216         "Replenish the iterator with 'count' results."   217    218         self.count = count   219         self.read_documents = 0   220    221     def __len__(self):   222    223         "Return the total number of results."   224    225         return self.count   226    227     def sort(self):   228         pass # Stored document positions are already sorted.   229    230     def __iter__(self):   231         return self   232    233 class PositionIterator(IteratorBase):   234    235     "Iterating over document positions."   236    237     def replenish(self, count):   238         IteratorBase.replenish(self, count)   239    240         # Fill a cache of positions.   241    242         self.cache = []   243         n = 0   244    245         while n < self.count:   246             self.cache.append(self.reader.read_positions())   247             n += 1   248    249     def seek(self, offset, count):   250    251         """   252         Seek to 'offset' in the file, limiting the number of documents available   253         for reading to 'count'.   254         """   255    256         self.reader.seek(offset)   257         self.replenish(count)   258    259     def next(self):   260    261         "Read positions for a single document."   262    263         if self.read_documents < self.count:   264             positions = self.cache[self.read_documents]   265             self.read_documents += 1   266             return positions   267         else:   268             raise StopIteration   269    270 class PositionIndexIterator(IteratorBase):   271    272     "Iterating over document positions."   273    274     def replenish(self, count):   275         IteratorBase.replenish(self, count)   276    277         # Fill a cache of offsets.   278    279         self.cache = []   280         self.current = 0   281         n = 0   282    283         while n < self.count:   284             docnum, pos_offset, section_count = t = self.reader.read_positions()   285             self.cache.append(t)   286             n += section_count   287    288     def seek(self, offset, doc_frequency):   289    290         """   291         Seek to 'offset' in the file, limiting the number of documents available   292         for reading to 'doc_frequency'.   293         """   294    295         self.reader.seek(offset)   296         self.replenish(doc_frequency)   297    298     def next(self):   299    300         "Read positions for a single document."   301    302         if self.current < len(self.cache):   303             docnum, pos_offset, self.section_count = t = self.cache[self.current]   304             self.current += 1   305             return t   306         else:   307             raise StopIteration   308    309 class PositionDictionaryWriter:   310    311     "Writing position dictionaries."   312    313     def __init__(self, position_writer, position_index_writer, interval):   314         self.position_writer = position_writer   315         self.position_index_writer = position_index_writer   316         self.interval = interval   317    318     def write_term_positions(self, doc_positions):   319    320         """   321         Write all 'doc_positions' - a collection of tuples of the form (document   322         number, position list) - to the file.   323    324         Add some records to the index, making dictionary entries.   325    326         Return a tuple containing the offset of the written data, the frequency   327         (number of positions), and document frequency (number of documents) for   328         the term involved.   329         """   330    331         # Reset the writers.   332    333         self.position_writer.reset()   334         self.position_index_writer.reset()   335    336         # Remember the first index entry offset.   337    338         index_offset = self.position_index_writer.f.tell()   339    340         # Write the positions.   341    342         frequency = 0   343         count = 0   344    345         if doc_positions:   346    347             # Retain the first record offset for a subsequent index entry.   348    349             first_offset = self.position_writer.f.tell()   350             first_docnum = None   351    352             doc_positions.sort()   353    354             for docnum, positions in doc_positions:   355                 if first_docnum is None:   356                     first_docnum = docnum   357    358                 self.position_writer.write_positions(docnum, positions)   359    360                 frequency += len(positions)   361                 count += 1   362    363                 # Every {interval} entries, write an index entry.   364    365                 if count % self.interval == 0:   366    367                     self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   368    369                     first_offset = self.position_writer.f.tell()   370                     first_docnum = None   371    372                     # Reset the position writer so that position readers accessing   373                     # a section start with the correct document number.   374    375                     self.position_writer.reset()   376    377             # Finish writing an index entry for the remaining documents.   378    379             else:   380                 if first_docnum is not None:   381                     self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   382    383         return index_offset, frequency, count   384    385     def close(self):   386         self.position_writer.close()   387         self.position_index_writer.close()   388    389 class PositionDictionaryReader:   390    391     "Access to position dictionary entries through iterators."   392    393     def __init__(self, position_reader, position_index_reader):   394         self.position_reader = position_reader   395         self.position_index_reader = position_index_reader   396    397     def read_term_positions(self, offset, doc_frequency):   398         iterator = PositionDictionaryIterator(   399             PositionIterator(self.position_reader),   400             PositionIndexIterator(self.position_index_reader)   401             )   402         iterator.seek(offset, doc_frequency)   403         return iterator   404    405     def close(self):   406         self.position_reader.close()   407         self.position_index_reader.close()   408    409 class PositionDictionaryIterator:   410    411     "Iteration over position dictionary entries."   412    413     def __init__(self, position_iterator, position_index_iterator):   414         self.position_iterator = position_iterator   415         self.position_index_iterator = position_index_iterator   416         self.reset()   417    418     def reset(self):   419    420         # Remember the last values.   421    422         self.found_docnum, self.found_positions = None, None   423    424         # Maintain state for the next index entry, if read.   425    426         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   427    428     def seek(self, offset, doc_frequency):   429    430         """   431         Seek to 'offset' in the index file, limiting the number of documents   432         available for reading to 'doc_frequency'.   433         """   434    435         self.reset()   436    437         # Seek to the appropriate index entry.   438    439         self.position_index_iterator.seek(offset, doc_frequency)   440    441         # Initialise the current index entry and current position file iterator.   442    443         self._next_section()   444         self._init_section()   445    446     # Sequence methods.   447    448     def __len__(self):   449         return len(self.position_index_iterator)   450    451     def sort(self):   452         pass   453    454     # Iterator methods.   455    456     def __iter__(self):   457         return self   458    459     def next(self):   460    461         """   462         Attempt to get the next document record from the section in the   463         positions file.   464         """   465    466         # Return any visited but unrequested record.   467    468         if self.found_docnum is not None:   469             t = self.found_docnum, self.found_positions   470             self.found_docnum, self.found_positions = None, None   471             return t   472    473         # Or search for the next record.   474    475         while 1:   476    477             # Either return the next record.   478    479             try:   480                 return self.position_iterator.next()   481    482             # Or, where a section is finished, get the next section and try again.   483    484             except StopIteration:   485    486                 # Although, where a single iterator is in use, the file reader   487                 # would be positioned appropriately, this is not guaranteed in a   488                 # multiple iterator situation.   489    490                 self._next_section()   491                 self._init_section()   492    493     def from_document(self, docnum):   494    495         """   496         Attempt to navigate to a positions entry for the given 'docnum',   497         returning the positions for 'docnum', or None otherwise.   498         """   499    500         # Return any unrequested document positions.   501    502         if docnum == self.found_docnum:   503             return self.found_positions   504    505         # Read ahead in the index until the next entry refers to a document   506         # later than the desired document.   507    508         try:   509             if self.next_docnum is None:   510                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   511    512             # Read until the next entry is after the desired document number,   513             # or until the end of the results.   514    515             while self.next_docnum <= docnum:   516                 self._next_read_section()   517                 if self.docnum < docnum:   518                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   519                 else:   520                     break   521    522         except StopIteration:   523             pass   524    525         # Navigate in the position file to the document.   526    527         self._init_section()   528    529         try:   530             while 1:   531                 found_docnum, found_positions = self.position_iterator.next()   532    533                 # Return the desired document positions or None (retaining the   534                 # positions for the document immediately after).   535    536                 if docnum <= found_docnum:   537                     self.found_docnum, self.found_positions = found_docnum, found_positions   538                     if docnum == found_docnum:   539                         return found_positions   540                     elif docnum < found_docnum:   541                         return None   542    543         except StopIteration:   544             return None   545    546     # Internal methods.   547    548     def _next_section(self):   549    550         "Attempt to get the next section in the index."   551    552         if self.next_docnum is None:   553             self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()   554         else:   555             self._next_read_section()   556    557     def _next_read_section(self):   558    559         """   560         Make the next index entry the current one without reading from the   561         index.   562         """   563    564         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   565         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   566    567     def _init_section(self):   568    569         "Initialise the iterator for the section in the position file."   570    571         # Seek to the position entry.   572    573         self.position_iterator.seek(self.pos_offset, self.section_count)   574    575 # vim: tabstop=4 expandtab shiftwidth=4