iixr (file iixr/positions.py at 1f3986bca1a3)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing position information.     5      6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.data import *    22 from iixr.files import *    23     24 class PositionWriter(FileWriter):    25     26     "Writing position information to files."    27     28     def reset(self):    29         self.last_docnum = None    30         self.subtractor = None    31     32     def write_positions(self, docnum, positions):    33     34         """    35         Write for the document 'docnum' the given 'positions'.    36         """    37     38         if not positions:    39             return    40     41         # Make sure that the positions are sorted.    42     43         positions.sort()    44     45         # Calculate an ongoing delta.    46     47         if self.last_docnum is not None:    48             if docnum < self.last_docnum:    49                 raise ValueError, "Document number %r is less than previous number %r." % (docnum, self.last_docnum)    50     51             docnum_seq = self.subtractor(docnum, self.last_docnum)    52     53         # Or preserve the document number and prepare for future deltas.    54     55         else:    56             self.subtractor = get_subtractor(docnum)    57             docnum_seq = docnum    58     59         self.begin_record()    60         self.write_sequence_value(docnum_seq)    61         self.write_monotonic_sequence(positions)    62         self.end_record()    63     64         self.last_docnum = docnum    65     66 class PositionReader(FileReader):    67     68     "Reading position information within term-specific regions of a file."    69     70     def reset(self):    71         self.last_docnum = None    72         self.adder = None    73     74     def read_positions(self):    75     76         """    77         Read positions, returning a document number and a list of positions.    78         """    79     80         self.begin_record()    81     82         # Read the document number.    83     84         docnum = self.read_sequence_value()    85     86         # Calculate an ongoing delta.    87     88         if self.last_docnum is not None:    89             self.last_docnum = self.adder(docnum, self.last_docnum)    90     91         # Or preserve the document number and prepare for future deltas.    92     93         else:    94             self.adder = get_adder(docnum)    95             self.last_docnum = docnum    96     97         positions = self.read_monotonic_sequence()    98         self.end_record()    99    100         return self.last_docnum, positions   101    102 class PositionIndexWriter(FileWriter):   103    104     "Writing position index information to files."   105    106     def reset(self):   107         self.last_docnum = None   108         self.subtractor = None   109         self.last_pos_offset = 0   110    111     def write_positions(self, docnum, pos_offset, count):   112    113         """   114         Write the given 'docnum, 'pos_offset' and document 'count' to the   115         position index file.   116         """   117    118         # Find the size of document number values.   119    120         if self.last_docnum is not None:   121             docnum_seq = self.subtractor(docnum, self.last_docnum)   122         else:   123             self.subtractor = get_subtractor(docnum)   124             docnum_seq = docnum   125    126         self.begin_record()   127         self.write_sequence_value(docnum_seq)   128         self.write_number(pos_offset - self.last_pos_offset)   129         self.write_number(count)   130         self.end_record()   131    132         self.last_docnum = docnum   133         self.last_pos_offset = pos_offset   134    135 class PositionIndexReader(FileReader):   136    137     "Reading position index information within term-specific regions of a file."   138    139     def reset(self):   140         self.last_docnum = None   141         self.adder = None   142         self.last_pos_offset = 0   143    144     def read_positions(self):   145    146         """   147         Read a document number, a position file offset for the position index   148         file, and the number of documents in a section of that file.   149         """   150    151         self.begin_record()   152    153         # Read the document number.   154    155         docnum = self.read_sequence_value()   156    157         if self.last_docnum is not None:   158             self.last_docnum = self.adder(docnum, self.last_docnum)   159         else:   160             self.adder = get_adder(docnum)   161             self.last_docnum = docnum   162    163         # Read the offset delta.   164    165         self.last_pos_offset += self.read_number()   166    167         # Read the document count.   168    169         count = self.read_number()   170         self.end_record()   171    172         return self.last_docnum, self.last_pos_offset, count   173    174 # Iterators for position-related files.   175    176 class IteratorBase:   177    178     "Support for iterating over results."   179    180     def __init__(self, reader):   181    182         "Initialise the iterator using the given 'reader'."   183    184         self.reader = reader   185         self.replenish(0) # no iteration initially permitted   186    187     def replenish(self, count):   188    189         "Replenish the iterator with 'count' results."   190    191         self.count = count   192         self.read_documents = 0   193    194     def __len__(self):   195    196         "Return the total number of results."   197    198         return self.count   199    200     def sort(self):   201         pass # Stored document positions are already sorted.   202    203     def __iter__(self):   204         return self   205    206 class PositionIterator(IteratorBase):   207    208     "Iterating over document positions."   209    210     def replenish(self, count):   211         IteratorBase.replenish(self, count)   212    213         # Fill a cache of positions.   214    215         self.cache = []   216         n = 0   217    218         while n < self.count:   219             self.cache.append(self.reader.read_positions())   220             n += 1   221    222     def seek(self, offset, count):   223    224         """   225         Seek to 'offset' in the file, limiting the number of documents available   226         for reading to 'count'.   227         """   228    229         self.reader.seek(offset)   230         self.replenish(count)   231    232     def next(self):   233    234         "Read positions for a single document."   235    236         if self.read_documents < self.count:   237             positions = self.cache[self.read_documents]   238             self.read_documents += 1   239             return positions   240         else:   241             raise StopIteration   242    243 class PositionIndexIterator(IteratorBase):   244    245     "Iterating over document positions."   246    247     def replenish(self, count):   248         IteratorBase.replenish(self, count)   249    250         # Fill a cache of offsets.   251    252         self.cache = []   253         self.current = 0   254         n = 0   255    256         while n < self.count:   257             docnum, pos_offset, section_count = t = self.reader.read_positions()   258             self.cache.append(t)   259             n += section_count   260    261     def seek(self, offset, doc_frequency):   262    263         """   264         Seek to 'offset' in the file, limiting the number of documents available   265         for reading to 'doc_frequency'.   266         """   267    268         self.reader.seek(offset)   269         self.replenish(doc_frequency)   270    271     def next(self):   272    273         "Read positions for a single document."   274    275         if self.current < len(self.cache):   276             docnum, pos_offset, self.section_count = t = self.cache[self.current]   277             self.current += 1   278             return t   279         else:   280             raise StopIteration   281    282 class PositionDictionaryWriter:   283    284     "Writing position dictionaries."   285    286     def __init__(self, position_writer, position_index_writer, interval):   287         self.position_writer = position_writer   288         self.position_index_writer = position_index_writer   289         self.interval = interval   290    291     def write_term_positions(self, doc_positions):   292    293         """   294         Write all 'doc_positions' - a collection of tuples of the form (document   295         number, position list) - to the file.   296    297         Add some records to the index, making dictionary entries.   298    299         Return a tuple containing the offset of the written data, the frequency   300         (number of positions), and document frequency (number of documents) for   301         the term involved.   302         """   303    304         # Reset the writers.   305    306         self.position_writer.reset()   307         self.position_index_writer.reset()   308    309         # Remember the first index entry offset.   310    311         index_offset = self.position_index_writer.tell()   312    313         # Write the positions.   314    315         frequency = 0   316         count = 0   317    318         if doc_positions:   319    320             # Retain the first record offset for a subsequent index entry.   321    322             first_offset = self.position_writer.tell()   323             first_docnum = None   324    325             doc_positions.sort()   326    327             for docnum, positions in doc_positions:   328                 if first_docnum is None:   329                     first_docnum = docnum   330    331                 self.position_writer.write_positions(docnum, positions)   332    333                 frequency += len(positions)   334                 count += 1   335    336                 # Every {interval} entries, write an index entry.   337    338                 if count % self.interval == 0:   339    340                     self.position_index_writer.write_positions(first_docnum, first_offset, self.interval)   341    342                     first_offset = self.position_writer.tell()   343                     first_docnum = None   344    345                     # Reset the position writer so that position readers accessing   346                     # a section start with the correct document number.   347    348                     self.position_writer.reset()   349    350             # Finish writing an index entry for the remaining documents.   351    352             else:   353                 if first_docnum is not None:   354                     self.position_index_writer.write_positions(first_docnum, first_offset, count % self.interval)   355    356         return index_offset, frequency, count   357    358     def close(self):   359         self.position_writer.close()   360         self.position_index_writer.close()   361    362 class PositionDictionaryReader:   363    364     "Access to position dictionary entries through iterators."   365    366     def __init__(self, position_reader, position_index_reader):   367         self.position_reader = position_reader   368         self.position_index_reader = position_index_reader   369    370     def read_term_positions(self, offset, doc_frequency):   371         iterator = PositionDictionaryIterator(   372             PositionIterator(self.position_reader),   373             PositionIndexIterator(self.position_index_reader)   374             )   375         iterator.seek(offset, doc_frequency)   376         return iterator   377    378     def close(self):   379         self.position_reader.close()   380         self.position_index_reader.close()   381    382 class PositionDictionaryIterator:   383    384     "Iteration over position dictionary entries."   385    386     def __init__(self, position_iterator, position_index_iterator):   387         self.position_iterator = position_iterator   388         self.position_index_iterator = position_index_iterator   389         self.reset()   390    391     def reset(self):   392    393         # Remember the last values.   394    395         self.found_docnum, self.found_positions = None, None   396    397         # Maintain state for the next index entry, if read.   398    399         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   400    401     def seek(self, offset, doc_frequency):   402    403         """   404         Seek to 'offset' in the index file, limiting the number of documents   405         available for reading to 'doc_frequency'.   406         """   407    408         self.reset()   409    410         # Seek to the appropriate index entry.   411    412         self.position_index_iterator.seek(offset, doc_frequency)   413    414         # Initialise the current index entry and current position file iterator.   415    416         self._next_section()   417         self._init_section()   418    419     # Sequence methods.   420    421     def __len__(self):   422         return len(self.position_index_iterator)   423    424     def sort(self):   425         pass   426    427     # Iterator methods.   428    429     def __iter__(self):   430         return self   431    432     def next(self):   433    434         """   435         Attempt to get the next document record from the section in the   436         positions file.   437         """   438    439         # Return any visited but unrequested record.   440    441         if self.found_docnum is not None:   442             t = self.found_docnum, self.found_positions   443             self.found_docnum, self.found_positions = None, None   444             return t   445    446         # Or search for the next record.   447    448         while 1:   449    450             # Either return the next record.   451    452             try:   453                 return self.position_iterator.next()   454    455             # Or, where a section is finished, get the next section and try again.   456    457             except StopIteration:   458    459                 # Although, where a single iterator is in use, the file reader   460                 # would be positioned appropriately, this is not guaranteed in a   461                 # multiple iterator situation.   462    463                 self._next_section()   464                 self._init_section()   465    466     def from_document(self, docnum):   467    468         """   469         Attempt to navigate to a positions entry for the given 'docnum',   470         returning the positions for 'docnum', or None otherwise.   471         """   472    473         # Return any unrequested document positions.   474    475         if docnum == self.found_docnum:   476             return self.found_positions   477    478         # Read ahead in the index until the next entry refers to a document   479         # later than the desired document.   480    481         try:   482             if self.next_docnum is None:   483                 self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   484    485             # Read until the next entry is after the desired document number,   486             # or until the end of the results.   487    488             while self.next_docnum <= docnum:   489                 self._next_read_section()   490                 if self.docnum < docnum:   491                     self.next_docnum, self.next_pos_offset, self.next_section_count = self.position_index_iterator.next()   492                 else:   493                     break   494    495         except StopIteration:   496             pass   497    498         # Navigate in the position file to the document.   499    500         self._init_section()   501    502         try:   503             while 1:   504                 found_docnum, found_positions = self.position_iterator.next()   505    506                 # Return the desired document positions or None (retaining the   507                 # positions for the document immediately after).   508    509                 if docnum <= found_docnum:   510                     self.found_docnum, self.found_positions = found_docnum, found_positions   511                     if docnum == found_docnum:   512                         return found_positions   513                     elif docnum < found_docnum:   514                         return None   515    516         except StopIteration:   517             return None   518    519     # Internal methods.   520    521     def _next_section(self):   522    523         "Attempt to get the next section in the index."   524    525         if self.next_docnum is None:   526             self.docnum, self.pos_offset, self.section_count = self.position_index_iterator.next()   527         else:   528             self._next_read_section()   529    530     def _next_read_section(self):   531    532         """   533         Make the next index entry the current one without reading from the   534         index.   535         """   536    537         self.docnum, self.pos_offset, self.section_count = self.next_docnum, self.next_pos_offset, self.next_section_count   538         self.next_docnum, self.next_pos_offset, self.next_section_count = None, None, None   539    540     def _init_section(self):   541    542         "Initialise the iterator for the section in the position file."   543    544         # Seek to the position entry.   545    546         self.position_iterator.seek(self.pos_offset, self.section_count)   547    548 # vim: tabstop=4 expandtab shiftwidth=4