iixr (file iixr/fields.py at d308dc25f5a2)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing document information.     5      6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.files import *    22 from bisect import bisect_right  # to find terms in the dictionary index    23     24 DOCUMENT_CACHE_LIMIT = 10000    25     26 class FieldWriter(FileWriter):    27     28     "Writing field data to files."    29     30     def reset(self):    31         self.last_docnum = None    32         self.docnum_size = None    33     34     def write_fields(self, docnum, fields):    35     36         """    37         Write for the given 'docnum', a list of 'fields' (integer, string pairs    38         representing field identifiers and values respectively).    39         """    40     41         # Find the size of document number values.    42     43         if self.docnum_size is None:    44             self.docnum_size = self.get_value_size(docnum)    45             self.last_docnum = self.get_initial_value(self.docnum_size)    46     47         # Write the number of values per document number.    48         # Write the document number delta.    49     50         output = array('B')    51         vint_to_array(self.docnum_size, output)    52         self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)    53     54         # Write the number of fields.    55     56         vint_to_array(len(fields), output)    57         output.tofile(self.f)    58     59         # Write the fields themselves.    60     61         for i, field in fields:    62             self.write_number(i)    63             self.write_string(field, 1) # compress    64     65 class FieldReader(FileReader):    66     67     "Reading field data from files."    68     69     def reset(self):    70         self.last_docnum = None    71     72     def read_fields(self):    73     74         """    75         Read fields from the file, returning a tuple containing the document    76         number and a list of field (identifier, value) pairs.    77         """    78     79         # Read the number of values per document number.    80     81         docnum_size = self.read_number()    82     83         if self.last_docnum is None:    84             self.last_docnum = self.get_initial_value(docnum_size)    85     86         # Read the document number delta and add it to the last number.    87     88         self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)    89     90         # Read the number of fields.    91     92         nfields = self.read_number()    93     94         # Collect the fields.    95     96         fields = []    97         i = 0    98     99         while i < nfields:   100             identifier = self.read_number()   101             value = self.read_string(1) # decompress   102             fields.append((identifier, value))   103             i += 1   104    105         return self.last_docnum, fields   106    107     def read_document_fields(self, docnum, offset):   108    109         """   110         Read fields for 'docnum' at the given 'offset'. This permits the   111         retrieval of details for the specified document, as well as scanning for   112         later documents.   113         """   114    115         self.seek(offset)   116         bad_docnum, fields = self.read_fields()   117         self.last_docnum = docnum   118         return docnum, fields   119    120 class FieldIndexWriter(FileWriter):   121    122     "Writing field index details to files."   123    124     def reset(self):   125         self.last_docnum = None   126         self.docnum_size = None   127         self.last_offset = 0   128    129     def write_document(self, docnum, offset):   130    131         """   132         Write for the given 'docnum', the 'offset' at which the fields for the   133         document are stored in the fields file.   134         """   135    136         # Find the size of document number values.   137    138         if self.docnum_size is None:   139             self.docnum_size = self.get_value_size(docnum)   140             self.last_docnum = self.get_initial_value(self.docnum_size)   141    142         # Write the number of values per document number.   143         # Write the document number delta.   144    145         output = array('B')   146         vint_to_array(self.docnum_size, output)   147         self.last_docnum = self.write_sequence(output, docnum, self.last_docnum, self.docnum_size, monotonic=0)   148         output.tofile(self.f)   149    150         # Write the offset delta.   151    152         self.write_number(offset - self.last_offset)   153         self.last_offset = offset   154    155 class FieldIndexReader(FileReader):   156    157     "Reading field index details from files."   158    159     def reset(self):   160         self.last_docnum = None   161         self.last_offset = 0   162    163     def read_document(self):   164    165         "Read a document number and field file offset."   166    167         # Read the number of values per document number.   168    169         docnum_size = self.read_number()   170    171         if self.last_docnum is None:   172             self.last_docnum = self.get_initial_value(docnum_size)   173    174         # Read the document number delta and add it to the last number.   175    176         self.last_docnum = self.read_sequence(self.last_docnum, docnum_size, monotonic=0)   177    178         # Read the offset.   179    180         self.last_offset += self.read_number()   181    182         return self.last_docnum, self.last_offset   183    184 class FieldDictionaryWriter:   185    186     "Writing field dictionary details."   187    188     def __init__(self, field_writer, field_index_writer, interval):   189         self.field_writer = field_writer   190         self.field_index_writer = field_index_writer   191         self.interval = interval   192         self.entry = 0   193    194     def write_fields(self, docnum, fields):   195    196         "Write details of the document with the given 'docnum' and 'fields'."   197    198         if self.entry % self.interval == 0:   199             offset = self.field_writer.f.tell()   200             self.field_writer.write_fields(docnum, fields)   201             self.field_index_writer.write_document(docnum, offset)   202         else:   203             self.field_writer.write_fields(docnum, fields)   204    205         self.entry += 1   206    207     def close(self):   208         self.field_writer.close()   209         self.field_index_writer.close()   210    211 class FieldDictionaryReader:   212    213     "Reading field dictionary details."   214    215     def __init__(self, field_reader, field_index_reader):   216         self.field_reader = field_reader   217         self.field_index_reader = field_index_reader   218    219         self.cache = {}   220         self.docs = []   221         try:   222             while 1:   223                 self.docs.append(self.field_index_reader.read_document())   224         except EOFError:   225             pass   226    227         # Large numbers for ordering purposes.   228    229         if self.docs:   230             self.max_offset = self.docs[-1][1]   231         else:   232             self.max_offset = None   233    234     # Iterator convenience methods.   235    236     def __iter__(self):   237         self.rewind()   238         return self   239    240     def next(self):   241         try:   242             return self.read_fields()   243         except EOFError:   244             raise StopIteration   245    246     # Sequential access methods.   247    248     def rewind(self):   249         self.field_reader.rewind()   250    251     def read_fields(self):   252    253         "Return the next document number and fields."   254    255         return self.field_reader.read_fields()   256    257     # Random access methods.   258    259     def get_fields(self, docnum):   260    261         "Read the fields of the document with the given 'docnum'."   262    263         if self.cache.has_key(docnum):   264             return self.cache[docnum]   265    266         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   267    268         # Get the entry position providing the term or one preceding it.   269    270         if i == -1:   271             return None   272    273         found_docnum, offset = self.docs[i]   274    275         # Read from the fields file.   276    277         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   278    279         # Scan for the document, if necessary.   280    281         try:   282             while docnum > found_docnum:   283                 found_docnum, fields = self.field_reader.read_fields()   284         except EOFError:   285             pass   286    287         # If the document is found, return the fields.   288    289         if docnum == found_docnum:   290    291             # Store the fields in the cache, removing entries if the limit has   292             # been reached.   293    294             keys = self.cache.keys()   295    296             if len(keys) == DOCUMENT_CACHE_LIMIT:   297                 del self.cache[keys[0]]   298    299             self.cache[docnum] = fields   300             return fields   301         else:   302             return None   303    304     def close(self):   305         self.field_reader.close()   306         self.field_index_reader.close()   307    308 # vim: tabstop=4 expandtab shiftwidth=4