#!/usr/bin/env python
# iixr (file iixr/fields.py at fc0e9882717b)

"""
Specific classes for storing document information.

Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import *
from iixr.files import *
from bisect import bisect_right  # to find documents in the field index

# The maximum number of documents whose fields are retained by a
# FieldDictionaryReader for repeated random access.

DOCUMENT_CACHE_LIMIT = 10000

class FieldWriter(FileWriter):

    "Writing field data to files."

    def reset(self):

        """
        End the current record and forget the previously written document
        number, so that the next document number is written as given rather
        than as a difference from its predecessor.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None

    def write_fields(self, docnum, fields):

        """
        Write for the given 'docnum', a list of 'fields' (integer, string pairs
        representing field identifiers and values respectively).
        """

        # The first document number after a reset is written as given; later
        # ones are written relative to the preceding document number.
        # NOTE(review): get_subtractor is not defined in this file (presumably
        # provided by iixr.data) and is assumed to choose a difference function
        # suited to the kind of document number supplied - confirm.

        if self.last_docnum is not None:
            docnum_seq = self.subtractor(docnum, self.last_docnum)
        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        # Write the document number.

        self.write_sequence_value(docnum_seq)

        # Write the number of fields.

        self.write_number(len(fields))

        # Write the fields themselves.

        for i, field in fields:
            self.write_number(i)
            self.write_string(field, 1) # compress

        self.last_docnum = docnum

class FieldReader(FileReader):

    "Reading field data from files."

    def reset(self):

        """
        Forget the previously read document number and begin a new record, so
        that the next document number read is taken as given.
        """

        self.last_docnum = None
        self.adder = None
        self.begin_record()

    def read_fields(self):

        """
        Read fields from the file, returning a tuple containing the document
        number and a list of field (identifier, value) pairs.
        """

        # Read the document number, combining it with the preceding document
        # number where one has already been read since the last reset.

        docnum = self.read_sequence_value()

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)
        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        # Read the number of fields.

        nfields = self.read_number()

        # Collect the fields, reading each identifier before its value.

        fields = []

        for _ in range(nfields):
            identifier = self.read_number()
            value = self.read_string(1) # decompress
            fields.append((identifier, value))

        return self.last_docnum, fields

    def read_document_fields(self, docnum, offset):

        """
        Read fields for 'docnum' at the given 'offset'. This permits the
        retrieval of details for the specified document, as well as scanning for
        later documents.

        A tuple containing the given 'docnum' and the list of fields read is
        returned.
        """

        self.seek(offset)

        # The document number produced by read_fields may have been combined
        # with a stale preceding document number, so it is discarded and the
        # supplied 'docnum' is remembered instead, letting subsequent calls to
        # read_fields continue from this document.

        ignored_docnum, fields = self.read_fields()
        self.last_docnum = docnum
        return docnum, fields

class FieldIndexWriter(FileWriter):

    "Writing field index details to files."

    def reset(self):

        """
        End the current record and forget the previously written document
        number and offset.
        """

        self.end_record()
        self.last_docnum = None
        self.subtractor = None
        self.last_offset = 0

    def write_document(self, docnum, offset):

        """
        Write for the given 'docnum', the 'offset' at which the fields for the
        document are stored in the fields file.
        """

        # The first document number after a reset is written as given; later
        # ones are written relative to the preceding document number.

        if self.last_docnum is not None:
            docnum_seq = self.subtractor(docnum, self.last_docnum)
        else:
            self.subtractor = get_subtractor(docnum)
            docnum_seq = docnum

        # Write the document number.

        self.write_sequence_value(docnum_seq)

        # Write the offset delta.

        self.write_number(offset - self.last_offset)

        self.last_docnum = docnum
        self.last_offset = offset

class FieldIndexReader(FileReader):

    "Reading field index details from files."

    def reset(self):

        """
        Forget the previously read document number and offset, and begin a new
        record.
        """

        self.last_docnum = None
        self.adder = None
        self.last_offset = 0
        self.begin_record()

    def read_document(self):

        "Read a document number and field file offset, returning them as a tuple."

        # Read the document number, combining it with the preceding document
        # number where one has already been read since the last reset.

        docnum = self.read_sequence_value()

        if self.last_docnum is not None:
            self.last_docnum = self.adder(docnum, self.last_docnum)
        else:
            self.adder = get_adder(docnum)
            self.last_docnum = docnum

        # Read the offset, which is stored as a delta from the previous one.

        self.last_offset += self.read_number()

        return self.last_docnum, self.last_offset

class FieldDictionaryWriter:

    "Writing field dictionary details."

    def __init__(self, field_writer, field_index_writer, interval):

        """
        Initialise the writer with the given 'field_writer' and
        'field_index_writer'. An index entry is written for every document
        whose position in the sequence of written documents is a multiple of
        'interval'.
        """

        self.field_writer = field_writer
        self.field_index_writer = field_index_writer
        self.interval = interval
        self.entry = 0

    def write_fields(self, docnum, fields):

        "Write details of the document with the given 'docnum' and 'fields'."

        if self.entry % self.interval == 0:

            # Start a new record in the fields file and note where it begins
            # so that the index can refer to it.

            self.field_writer.reset()
            offset = self.field_writer.tell()
            self.field_writer.write_fields(docnum, fields)
            self.field_index_writer.write_document(docnum, offset)
        else:
            self.field_writer.write_fields(docnum, fields)

        self.entry += 1

    def close(self):

        "Close both underlying writers."

        # Make sure that the index writer is closed even if closing the field
        # writer fails.

        try:
            self.field_writer.close()
        finally:
            self.field_index_writer.close()

class FieldDictionaryReader:

    "Reading field dictionary details."

    def __init__(self, field_reader, field_index_reader):

        """
        Initialise the reader with the given 'field_reader' and
        'field_index_reader'. The complete index is read immediately and held
        in memory as a list of (document number, offset) tuples.
        """

        self.field_reader = field_reader
        self.field_index_reader = field_index_reader

        # The position in the index of the record currently being read
        # sequentially.

        self.entry = 0

        self.cache = {}
        self.docs = []

        # Read index entries until the index reader signals the end of its
        # data.

        try:
            while True:
                self.docs.append(self.field_index_reader.read_document())
        except EOFError:
            pass

        # Large numbers for ordering purposes: the final offset is used when
        # searching so that a probe for a document number sorts after the index
        # entry having that document number.

        if self.docs:
            self.max_offset = self.docs[-1][1]
        else:
            self.max_offset = None

    # Iterator convenience methods.

    def __iter__(self):
        self.rewind()
        return self

    def next(self):

        "Return the next document number and fields, for use in iteration."

        try:
            return self.read_fields()
        except EOFError:
            raise StopIteration

    # Support the iterator protocol of Python 3 as well as that of Python 2.

    __next__ = next

    # Sequential access methods.

    def rewind(self):

        "Return to the start of the field data for sequential reading."

        self.field_reader.rewind()

        # Start counting records from the beginning again, since otherwise a
        # second pass over the data would finish at the end of the first
        # record.

        self.entry = 0

    def read_fields(self):

        """
        Return the next document number and fields, raising EOFError when no
        more documents are available.
        """

        try:
            return self.field_reader.read_fields()
        except EOFError:
            pass

        # The current record has been exhausted: move on to the next record if
        # the index indicates that there is one.

        self.entry += 1

        if self.entry >= len(self.docs):
            raise EOFError

        self.field_reader.reset()
        return self.field_reader.read_fields()

    # Random access methods.

    def get_fields(self, docnum):

        """
        Read the fields of the document with the given 'docnum', returning a
        list of field (identifier, value) pairs, or None if no such document
        is found.
        """

        if docnum in self.cache:
            return self.cache[docnum]

        # Get the entry position providing the document or one preceding it.

        i = bisect_right(self.docs, (docnum, self.max_offset)) - 1

        if i == -1:
            return None

        found_docnum, offset = self.docs[i]

        # Read from the fields file.

        found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)

        # Scan for the document, if necessary, giving up when the reader
        # signals that no more data can be read.

        try:
            while docnum > found_docnum:
                found_docnum, fields = self.field_reader.read_fields()
        except EOFError:
            pass

        # If the document is not found, there is nothing to return.

        if docnum != found_docnum:
            return None

        # Store the fields in the cache, removing an arbitrary entry if the
        # limit has been reached.

        if len(self.cache) >= DOCUMENT_CACHE_LIMIT:
            del self.cache[next(iter(self.cache))]

        self.cache[docnum] = fields
        return fields

    def close(self):

        "Close both underlying readers."

        # Make sure that the index reader is closed even if closing the field
        # reader fails.

        try:
            self.field_reader.close()
        finally:
            self.field_index_reader.close()

# vim: tabstop=4 expandtab shiftwidth=4