iixr (file iixr/fields.py at 4c35f0aa339c)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing document information.     5      6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.files import *    22 from bisect import bisect_right  # to find terms in the dictionary index    23     24 DOCUMENT_CACHE_LIMIT = 10000    25     26 class FieldWriter(FileWriter):    27     28     "Writing field data to files."    29     30     def reset(self):    31         self.last_docnum = None    32         self.docnum_size = None    33     34     def write_fields(self, docnum, fields):    35     36         """    37         Write for the given 'docnum', a list of 'fields' (integer, string pairs    38         representing field identifiers and values respectively).    39         """    40     41         # Find the size of document number values.    42     43         if self.docnum_size is None:    44             self.docnum_size = self.get_value_size(docnum)    45             self.last_docnum = self.get_initial_value(self.docnum_size)    46     47         # Write the number of values per document number.    48         # Write the document number delta.    49     50         self.write_number(self.docnum_size)    51         self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)    52     53         # Write the number of fields.    54     55         self.write_number(len(fields))    56     57         # Write the fields themselves.    58     59         for i, field in fields:    60             self.write_number(i)    61             self.write_string(field, 1) # compress    62     63 class FieldReader(FileReader):    64     65     "Reading field data from files."    66     67     def reset(self):    68         self.last_docnum = None    69     70     def read_fields(self):    71     72         """    73         Read fields from the file, returning a tuple containing the document    74         number and a list of field (identifier, value) pairs.    75         """    76     77         # Read the number of values per document number.    78     79         docnum_size = self.read_number()    80     81         if self.last_docnum is None:    82             self.last_docnum = self.get_initial_value(docnum_size)    83     84         # Read the document number delta and add it to the last number.    85     86         self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)    87     88         # Read the number of fields.    89     90         nfields = self.read_number()    91     92         # Collect the fields.    93     94         fields = []    95         i = 0    96     97         while i < nfields:    98             identifier = self.read_number()    99             value = self.read_string(1) # decompress   100             fields.append((identifier, value))   101             i += 1   102    103         return self.last_docnum, fields   104    105     def read_document_fields(self, docnum, offset):   106    107         """   108         Read fields for 'docnum' at the given 'offset'. This permits the   109         retrieval of details for the specified document, as well as scanning for   110         later documents.   111         """   112    113         self.seek(offset)   114         bad_docnum, fields = self.read_fields()   115         self.last_docnum = docnum   116         return docnum, fields   117    118 class FieldIndexWriter(FileWriter):   119    120     "Writing field index details to files."   121    122     def reset(self):   123         self.last_docnum = None   124         self.docnum_size = None   125         self.last_offset = 0   126    127     def write_document(self, docnum, offset):   128    129         """   130         Write for the given 'docnum', the 'offset' at which the fields for the   131         document are stored in the fields file.   132         """   133    134         # Find the size of document number values.   135    136         if self.docnum_size is None:   137             self.docnum_size = self.get_value_size(docnum)   138             self.last_docnum = self.get_initial_value(self.docnum_size)   139    140         # Write the number of values per document number.   141         # Write the document number delta.   142    143         self.write_number(self.docnum_size)   144         self.last_docnum = self.write_sequence(docnum, self.last_docnum, self.docnum_size, monotonic=0)   145    146         # Write the offset delta.   147    148         self.write_number(offset - self.last_offset)   149         self.last_offset = offset   150    151 class FieldIndexReader(FileReader):   152    153     "Reading field index details from files."   154    155     def reset(self):   156         self.last_docnum = None   157         self.last_offset = 0   158    159     def read_document(self):   160    161         "Read a document number and field file offset."   162    163         # Read the number of values per document number.   164    165         docnum_size = self.read_number()   166    167         if self.last_docnum is None:   168             self.last_docnum = self.get_initial_value(docnum_size)   169    170         # Read the document number delta and add it to the last number.   171    172         self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)   173    174         # Read the offset.   175    176         self.last_offset += self.read_number()   177    178         return self.last_docnum, self.last_offset   179    180 class FieldDictionaryWriter:   181    182     "Writing field dictionary details."   183    184     def __init__(self, field_writer, field_index_writer, interval):   185         self.field_writer = field_writer   186         self.field_index_writer = field_index_writer   187         self.interval = interval   188         self.entry = 0   189    190     def write_fields(self, docnum, fields):   191    192         "Write details of the document with the given 'docnum' and 'fields'."   193    194         if self.entry % self.interval == 0:   195             offset = self.field_writer.tell()   196             self.field_writer.write_fields(docnum, fields)   197             self.field_index_writer.write_document(docnum, offset)   198         else:   199             self.field_writer.write_fields(docnum, fields)   200    201         self.entry += 1   202    203     def close(self):   204         self.field_writer.close()   205         self.field_index_writer.close()   206    207 class FieldDictionaryReader:   208    209     "Reading field dictionary details."   210    211     def __init__(self, field_reader, field_index_reader):   212         self.field_reader = field_reader   213         self.field_index_reader = field_index_reader   214    215         self.cache = {}   216         self.docs = []   217         try:   218             while 1:   219                 self.docs.append(self.field_index_reader.read_document())   220         except EOFError:   221             pass   222    223         # Large numbers for ordering purposes.   224    225         if self.docs:   226             self.max_offset = self.docs[-1][1]   227         else:   228             self.max_offset = None   229    230     # Iterator convenience methods.   231    232     def __iter__(self):   233         self.rewind()   234         return self   235    236     def next(self):   237         try:   238             return self.read_fields()   239         except EOFError:   240             raise StopIteration   241    242     # Sequential access methods.   243    244     def rewind(self):   245         self.field_reader.rewind()   246    247     def read_fields(self):   248    249         "Return the next document number and fields."   250    251         return self.field_reader.read_fields()   252    253     # Random access methods.   254    255     def get_fields(self, docnum):   256    257         "Read the fields of the document with the given 'docnum'."   258    259         if self.cache.has_key(docnum):   260             return self.cache[docnum]   261    262         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   263    264         # Get the entry position providing the term or one preceding it.   265    266         if i == -1:   267             return None   268    269         found_docnum, offset = self.docs[i]   270    271         # Read from the fields file.   272    273         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   274    275         # Scan for the document, if necessary.   276    277         try:   278             while docnum > found_docnum:   279                 found_docnum, fields = self.field_reader.read_fields()   280         except EOFError:   281             pass   282    283         # If the document is found, return the fields.   284    285         if docnum == found_docnum:   286    287             # Store the fields in the cache, removing entries if the limit has   288             # been reached.   289    290             keys = self.cache.keys()   291    292             if len(keys) == DOCUMENT_CACHE_LIMIT:   293                 del self.cache[keys[0]]   294    295             self.cache[docnum] = fields   296             return fields   297         else:   298             return None   299    300     def close(self):   301         self.field_reader.close()   302         self.field_index_reader.close()   303    304 # vim: tabstop=4 expandtab shiftwidth=4