iixr (file iixr/positions.py at 00995a70f535)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing position information.     5      6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.files import *    22 from iixr.data import vint, vint_to_array, string_to_array    23 from array import array    24     25 class PositionWriter(FileWriter):    26     27     "Writing position information to files."    28     29     def reset(self):    30         self.last_docnum = 0    31     32     def write_positions(self, docnum, positions):    33     34         """    35         Write for the document 'docnum' the given 'positions'.    36         """    37     38         if docnum < self.last_docnum:    39             raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)    40     41         # Make sure that the positions are sorted.    42     43         positions.sort()    44     45         # Write the document number delta.    46         # Write the number of positions.    47     48         output = array('B')    49         vint_to_array(docnum - self.last_docnum, output)    50         vint_to_array(len(positions), output)    51     52         # Write the position deltas.    53     54         last = 0    55     56         # Handle tuples incorporating preceding text.    57     58         for position, preceding in positions:    59             vint_to_array(position - last, output)    60             string_to_array(preceding, output)    61             last = position    62     63         output.tofile(self.f)    64     65         self.last_docnum = docnum    66     67 class PositionReader(FileReader):    68     69     "Reading position information within term-specific regions of a file."    70     71     def reset(self):    72         self.last_docnum = 0    73     74     def read_positions(self):    75     76         "Read positions, returning a document number and a list of positions."    77     78         # Read the document number delta and add it to the last number.    79     80         self.last_docnum += self.read_number()    81     82         # Read the number of positions.    83     84         npositions = self.read_number()    85     86         # Read the position deltas, adding each previous position to get the    87         # appropriate collection of absolute positions.    88     89         i = 0    90         last = 0    91         positions = []    92     93         while i < npositions:    94             last += self.read_number()    95             preceding = self.read_string()    96             positions.append((last, preceding))    97             i += 1    98     99         return self.last_docnum, positions   100    101 class PositionIndexWriter(FileWriter):   102    103     "Writing position index information to files."   104    105     def reset(self):   106         self.last_docnum = 0   107         self.last_pos_offset = 0   108    109     def write_positions(self, docnum, pos_offset, count):   110    111         """   112         Write the given 'docnum, 'pos_offset' and document 'count' to the   113         position index file.   114         """   115    116         # Write the document number delta.   117         # Write the position file offset delta.   118         # Write the document count.   119    120         output = array('B')   121         vint_to_array(docnum - self.last_docnum, output)   122         vint_to_array(pos_offset - self.last_pos_offset, output)   123         vint_to_array(count, output)   124    125         # Actually write the data.   126    127         output.tofile(self.f)   128    129         self.last_pos_offset = pos_offset   130         self.last_docnum = docnum   131    132 class PositionIndexReader(FileReader):   133    134     "Reading position index information within term-specific regions of a file."   135    136     def reset(self):   137         self.last_docnum = 0   138         self.last_pos_offset = 0   139    140     def read_positions(self):   141    142         """   143         Read a document number, a position file offset for the position index   144         file, and the number of documents in a section of that file.   145         """   146    147         # Read the document number delta.   148    149         self.last_docnum += self.read_number()   150    151         # Read the offset delta.   152    153         self.last_pos_offset += self.read_number()   154    155         # Read the document count.   156    157         count = self.read_number()   158    159         return self.last_docnum, self.last_pos_offset, count   160    161 # Iterators for position-related files.   162    163 class IteratorBase:   164    165     "Support for iterating over results."   166    167     def __init__(self, reader):   168    169         "Initialise the iterator using the given 'reader'."   170    171         self.reader = reader   172         self.replenish(0) # no iteration initially permitted   173    174     def replenish(self, count):   175    176         "Replenish the iterator with 'count' results."   177    178         self.count = count   179         self.read_documents = 0   180    181     def __len__(self):   182    183         "Return the total number of results."   184    185         return self.count   186    187     def sort(self):   188         pass # Stored document positions are already sorted.   189    190     def __iter__(self):   191         return self   192    193 class PositionIterator(IteratorBase):   194    195     "Iterating over document positions."   196    197     def replenish(self, count):   198         IteratorBase.replenish(self, count)   199    200         # Fill a cache of positions.   201    202         self.cache = []   203         n = 0   204    205         while n < self.count:   206             self.cache.append(self.reader.read_positions())   207             n += 1   208    209     def seek(self, offset, count):   210    211         """   212         Seek to 'offset' in the file, limiting the number of documents available   213         for reading to 'count'.   214         """   215    216         self.reader.seek(offset)   217         self.replenish(count)   218    219     def next(self):   220    221         "Read positions for a single document."   222    223         if self.read_documents < self.count:   224             positions = self.cache[self.read_documents]   225             self.read_documents += 1   226             return positions   227         else:   228             raise StopIteration   229    230 class PositionIndexIterator(IteratorBase):   231    232     "Iterating over document positions."   233    234     def replenish(self, count):   235         IteratorBase.replenish(self, count)   236    237         # Fill a cache of offsets.   238    239         self.cache = []   240         self.current = 0   241         n = 0   242    243         while n < self.count:   244             docnum, pos_offset, section_count = t = self.reader.read_positions()   245             self.cache.append(t)   246             n += section_count   247    248     def seek(self, offset, doc_frequency):   249    250         """   251         Seek to 'offset' in the file, limiting the number of documents available   252         for reading to 'doc_frequency'.   253         """   254    255         self.reader.seek(offset)   256         self.replenish(doc_frequency)   257    258     def next(self):   259    260         "Read positions for a single document."   261    262         if self.current < len(self.cache):   263             docnum, pos_offset, self.section_count = t = self.cache[self.current]   264             self.current += 1   265             return t   266         else:   267             raise StopIteration   268    269 class PositionDictionaryWriter:   270    271     "Writing position dictionaries."   272    273     def __init__(self, position_writer, position_index_writer, interval):   274         self.position_writer = position_writer   275         self.position_index_writer = position_index_writer   276         self.interval = interval   277    278     def write_term_positions(self, doc_positions):   279    280         """   281         Write all 'doc_positions' - a collection of tuples of the form (document   282         number, position list) - to the file.   283    284         Add some records to the index, making dictionary entries.   285    286         Return a tuple containing the offset of the written data, the frequency   287         (number of positions), and document frequency (number of documents) for   288         the term involved.   289         """   290    291         # Reset the writers.   292    293         self.position_writer.reset()   294         self.position_index_writer.reset()   295    296         # Remember the first index entry offset.   297    298         index_offset = self.position_index_writer.f.tell()   299    300         # Write the positions.   301    302         frequency = 0   303         count = 0   304    305         if doc_positions:   306    307             # Retain the first record offset for a subsequent index entry.   308    309             first_offset = self.position_writer.f.tell()   310             first_docnum = None   311    312             doc_positions.sort()   313    314             for docnum, positions in doc_positions:   315                 if first_docnum is None:   316                     first_docnum = docnum   317    318                 self.position_writer.write_positions(docnum, positions)   319    320                 frequency += len(positions)   321                 count += 1   322    323                 # Every {interval} entries, write an index entry.   324    325                 if count % self.interval == 0:   326    327                     self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   328    329                     first_offset = self.position_writer.f.tell()   330                     first_docnum = None   331    332                     # Reset the position writer so that position readers accessing   333                     # a section start with the correct document number.   334    335                     self.position_writer.reset()   336    337             # Finish writing an index entry for the remaining documents.   338    339             else:   340                 if first_docnum is not None:   341                     self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   342    343         return index_offset, frequency, count   344    345     def close(self):   346         self.position_writer.close()   347         self.position_index_writer.close()   348    349 class PositionDictionaryReader:   350    351     "Access to position dictionary entries through iterators."   352    353     def __init__(self, position_reader, position_index_reader):   354         self.position_reader = position_reader   355         self.position_index_reader = position_index_reader   356    357     def read_term_positions(self, offset, doc_frequency):   358         iterator = PositionDictionaryIterator(   359             PositionIterator(self.position_reader),   360             PositionIndexIterator(self.position_index_reader)   361             )   362         iterator.seek(offset, doc_frequency)   363         return iterator   364    365     def close(self):   366         self.position_reader.close()   367         self.position_index_reader.close()   368    369 class PositionDictionaryIterator:   370    371     "Iteration over position dictionary entries."   372    373     def __init__(self, position_iterator, position_index_iterator):   374         self.position_iterator = position_iterator   375         self.position_index_iterator = position_index_iterator   376         self.reset()   377    378     def reset(self):   379    380         # Remember the last values.   381    382         self.found_docnum, self.found_positions = None, None   383    384         # Maintain state for the next index entry, if read.   385    386         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   387    388     def seek(self, offset, doc_frequency):   389    390         """   391         Seek to 'offset' in the index file, limiting the number of documents   392         available for reading to 'doc_frequency'.   393         """   394    395         self.reset()   396    397         # Seek to the appropriate index entry.   398    399         self.position_index_iterator.seek(offset, doc_frequency)   400    401         # Initialise the current index entry and current position file iterator.   402    403         self._next_section()   404         self._init_section()   405    406     # Sequence methods.   407    408     def __len__(self):   409         return len(self.position_index_iterator)   410    411     def sort(self):   412         pass   413    414     # Iterator methods.   415    416     def __iter__(self):   417         return self   418    419     def next(self):   420    421         """   422         Attempt to get the next document record from the section in the   423         positions file.   424         """   425    426         # Return any visited but unrequested record.   427    428         if self.found_docnum is not None:   429             t = self.found_docnum, self.found_positions   430             self.found_docnum, self.found_positions = None, None   431             return t   432    433         # Or search for the next record.   434    435         while 1:   436    437             # Either return the next record.   438    439             try:   440                 return self.position_iterator.next()   441    442             # Or, where a section is finished, get the next section and try again.   443    444             except StopIteration:   445    446                 # Although, where a single iterator is in use, the file reader   447                 # would be positioned appropriately, this is not guaranteed in a   448                 # multiple iterator situation.   449    450                 self._next_section()   451                 self._init_section()   452    453     def from_document(self, docnum):   454    455         """   456         Attempt to navigate to a positions entry for the given 'docnum',   457         returning the positions for 'docnum', or None otherwise.   458         """   459    460         # Return any unrequested document positions.   461    462         if docnum == self.found_docnum:   463             return self.found_positions   464    465         # Read ahead in the index until the next entry refers to a document   466         # later than the desired document.   467    468         try:   469             if self.next_docnum is None:   470                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   471    472             # Read until the next entry is after the desired document number,   473             # or until the end of the results.   474    475             while self.next_docnum <= docnum:   476                 self._next_read_section()   477                 if self.docnum < docnum:   478                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   479                 else:   480                     break   481    482         except StopIteration:   483             pass   484    485         # Navigate in the position file to the document.   486    487         self._init_section()   488    489         try:   490             while 1:   491                 found_docnum, found_positions = self.position_iterator.next()   492    493                 # Return the desired document positions or None (retaining the   494                 # positions for the document immediately after).   495    496                 if docnum == found_docnum:   497                     return found_positions   498                 elif docnum < found_docnum:   499                     self.found_docnum, self.found_positions = found_docnum, found_positions   500                     return None   501    502         except StopIteration:   503             return None   504    505     # Internal methods.   506    507     def _next_section(self):   508    509         "Attempt to get the next section in the index."   510    511         if self.next_docnum is None:   512             self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()   513         else:   514             self._next_read_section()   515    516     def _next_read_section(self):   517    518         """   519         Make the next index entry the current one without reading from the   520         index.   521         """   522    523         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   524         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   525    526     def _init_section(self):   527    528         "Initialise the iterator for the section in the position file."   529    530         # Seek to the position entry.   531    532         self.position_iterator.seek(self.pos_offset, self.section_count)   533    534 # vim: tabstop=4 expandtab shiftwidth=4