iixr (file iixr/fields.py at 1f3986bca1a3)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing document information.     5      6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.data import *    22 from iixr.files import *    23 from bisect import bisect_right  # to find terms in the dictionary index    24     25 DOCUMENT_CACHE_LIMIT = 10000    26     27 class FieldWriter(FileWriter):    28     29     "Writing field data to files."    30     31     def reset(self):    32         self.last_docnum = None    33         self.subtractor = None    34     35     def write_fields(self, docnum, fields):    36     37         """    38         Write for the given 'docnum', a list of 'fields' (integer, string pairs    39         representing field identifiers and values respectively).    40         """    41     42         # Find the size of document number values.    43     44         if self.last_docnum is not None:    45             docnum_seq = self.subtractor(docnum, self.last_docnum)    46         else:    47             self.subtractor = get_subtractor(docnum)    48             docnum_seq = docnum    49     50         self.begin_record()    51     52         # Write the document number.    53     54         self.write_sequence_value(docnum_seq)    55     56         # Write the number of fields.    57     58         self.write_number(len(fields))    59     60         # Write the fields themselves.    61     62         for i, field in fields:    63             self.write_number(i)    64             self.write_string(field, 1) # compress    65     66         self.end_record()    67     68         self.last_docnum = docnum    69     70 class FieldReader(FileReader):    71     72     "Reading field data from files."    73     74     def reset(self):    75         self.last_docnum = None    76         self.adder = None    77     78     def read_fields(self):    79     80         """    81         Read fields from the file, returning a tuple containing the document    82         number and a list of field (identifier, value) pairs.    83         """    84     85         self.begin_record()    86     87         # Read the document number.    88     89         docnum = self.read_sequence_value()    90     91         if self.last_docnum is not None:    92             self.last_docnum = self.adder(docnum, self.last_docnum)    93         else:    94             self.adder = get_adder(docnum)    95             self.last_docnum = docnum    96     97         # Read the number of fields.    98     99         nfields = self.read_number()   100    101         # Collect the fields.   102    103         fields = []   104         i = 0   105    106         while i < nfields:   107             identifier = self.read_number()   108             value = self.read_string(1) # decompress   109             fields.append((identifier, value))   110             i += 1   111    112         self.end_record()   113    114         return self.last_docnum, fields   115    116     def read_document_fields(self, docnum, offset):   117    118         """   119         Read fields for 'docnum' at the given 'offset'. This permits the   120         retrieval of details for the specified document, as well as scanning for   121         later documents.   122         """   123    124         self.seek(offset)   125         bad_docnum, fields = self.read_fields()   126         self.last_docnum = docnum   127         return docnum, fields   128    129 class FieldIndexWriter(FileWriter):   130    131     "Writing field index details to files."   132    133     def reset(self):   134         self.last_docnum = None   135         self.subtractor = None   136         self.last_offset = 0   137    138     def write_document(self, docnum, offset):   139    140         """   141         Write for the given 'docnum', the 'offset' at which the fields for the   142         document are stored in the fields file.   143         """   144    145         # Find the size of document number values.   146    147         if self.last_docnum is not None:   148             docnum_seq = self.subtractor(docnum, self.last_docnum)   149         else:   150             self.subtractor = get_subtractor(docnum)   151             docnum_seq = docnum   152    153         self.begin_record()   154    155         # Write the document number.   156    157         self.write_sequence_value(docnum_seq)   158    159         # Write the offset delta.   160    161         self.write_number(offset - self.last_offset)   162         self.end_record()   163    164         self.last_docnum = docnum   165         self.last_offset = offset   166    167 class FieldIndexReader(FileReader):   168    169     "Reading field index details from files."   170    171     def reset(self):   172         self.last_docnum = None   173         self.adder = None   174         self.last_offset = 0   175    176     def read_document(self):   177    178         "Read a document number and field file offset."   179    180         self.begin_record()   181    182         # Read the document number.   183    184         docnum = self.read_sequence_value()   185    186         if self.last_docnum is not None:   187             self.last_docnum = self.adder(docnum, self.last_docnum)   188         else:   189             self.adder = get_adder(docnum)   190             self.last_docnum = docnum   191    192         # Read the offset.   193    194         self.last_offset += self.read_number()   195         self.end_record()   196    197         return self.last_docnum, self.last_offset   198    199 class FieldDictionaryWriter:   200    201     "Writing field dictionary details."   202    203     def __init__(self, field_writer, field_index_writer, interval):   204         self.field_writer = field_writer   205         self.field_index_writer = field_index_writer   206         self.interval = interval   207         self.entry = 0   208    209     def write_fields(self, docnum, fields):   210    211         "Write details of the document with the given 'docnum' and 'fields'."   212    213         if self.entry % self.interval == 0:   214             offset = self.field_writer.tell()   215             self.field_writer.write_fields(docnum, fields)   216             self.field_index_writer.write_document(docnum, offset)   217         else:   218             self.field_writer.write_fields(docnum, fields)   219    220         self.entry += 1   221    222     def close(self):   223         self.field_writer.close()   224         self.field_index_writer.close()   225    226 class FieldDictionaryReader:   227    228     "Reading field dictionary details."   229    230     def __init__(self, field_reader, field_index_reader):   231         self.field_reader = field_reader   232         self.field_index_reader = field_index_reader   233    234         self.cache = {}   235         self.docs = []   236         try:   237             while 1:   238                 self.docs.append(self.field_index_reader.read_document())   239         except EOFError:   240             pass   241    242         # Large numbers for ordering purposes.   243    244         if self.docs:   245             self.max_offset = self.docs[-1][1]   246         else:   247             self.max_offset = None   248    249     # Iterator convenience methods.   250    251     def __iter__(self):   252         self.rewind()   253         return self   254    255     def next(self):   256         try:   257             return self.read_fields()   258         except EOFError:   259             raise StopIteration   260    261     # Sequential access methods.   262    263     def rewind(self):   264         self.field_reader.rewind()   265    266     def read_fields(self):   267    268         "Return the next document number and fields."   269    270         return self.field_reader.read_fields()   271    272     # Random access methods.   273    274     def get_fields(self, docnum):   275    276         "Read the fields of the document with the given 'docnum'."   277    278         if self.cache.has_key(docnum):   279             return self.cache[docnum]   280    281         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   282    283         # Get the entry position providing the term or one preceding it.   284    285         if i == -1:   286             return None   287    288         found_docnum, offset = self.docs[i]   289    290         # Read from the fields file.   291    292         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   293    294         # Scan for the document, if necessary.   295    296         try:   297             while docnum > found_docnum:   298                 found_docnum, fields = self.field_reader.read_fields()   299         except EOFError:   300             pass   301    302         # If the document is found, return the fields.   303    304         if docnum == found_docnum:   305    306             # Store the fields in the cache, removing entries if the limit has   307             # been reached.   308    309             keys = self.cache.keys()   310    311             if len(keys) == DOCUMENT_CACHE_LIMIT:   312                 del self.cache[keys[0]]   313    314             self.cache[docnum] = fields   315             return fields   316         else:   317             return None   318    319     def close(self):   320         self.field_reader.close()   321         self.field_index_reader.close()   322    323 # vim: tabstop=4 expandtab shiftwidth=4