# iixr (file iixr/fields.py at 1077b05c9b76)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing document information.     5      6 Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.files import *    22 from bisect import bisect_right  # to find terms in the dictionary index    23     24 DOCUMENT_CACHE_LIMIT = 10000    25     26 class FieldWriter(FileWriter):    27     28     "Writing field data to files."    29     30     def reset(self):    31         self.last_docnum = 0    32     33     def write_fields(self, docnum, fields):    34     35         """    36         Write for the given 'docnum', a list of 'fields' (integer, string pairs    37         representing field identifiers and values respectively).    38         """    39     40         # Write the document number delta.    41     42         self.write_number(docnum - self.last_docnum)    43     44         # Write the number of fields.    45     46         self.write_number(len(fields))    47     48         # Write the fields themselves.    
49     50         for i, field in fields:    51             self.write_number(i)    52             self.write_string(field, 1) # compress    53     54         self.last_docnum = docnum    55     56 class FieldReader(FileReader):    57     58     "Reading field data from files."    59     60     def reset(self):    61         self.last_docnum = 0    62     63     def read_fields(self):    64     65         """    66         Read fields from the file, returning a tuple containing the document    67         number and a list of field (identifier, value) pairs.    68         """    69     70         # Read the document number.    71     72         self.last_docnum += self.read_number()    73     74         # Read the number of fields.    75     76         nfields = self.read_number()    77     78         # Collect the fields.    79     80         fields = []    81         i = 0    82     83         while i < nfields:    84             identifier = self.read_number()    85             value = self.read_string(1) # decompress    86             fields.append((identifier, value))    87             i += 1    88     89         return self.last_docnum, fields    90     91     def read_document_fields(self, docnum, offset):    92     93         """    94         Read fields for 'docnum' at the given 'offset'. This permits the    95         retrieval of details for the specified document, as well as scanning for    96         later documents.    97         """    98     99         self.seek(offset)   100         bad_docnum, fields = self.read_fields()   101         self.last_docnum = docnum   102         return docnum, fields   103    104 class FieldIndexWriter(FileWriter):   105    106     "Writing field index details to files."   
107    108     def reset(self):   109         self.last_docnum = 0   110         self.last_offset = 0   111    112     def write_document(self, docnum, offset):   113    114         """   115         Write for the given 'docnum', the 'offset' at which the fields for the   116         document are stored in the fields file.   117         """   118    119         # Write the document number and offset deltas.   120    121         self.write_number(docnum - self.last_docnum)   122         self.write_number(offset - self.last_offset)   123    124         self.last_docnum = docnum   125         self.last_offset = offset   126    127 class FieldIndexReader(FileReader):   128    129     "Reading field index details from files."   130    131     def reset(self):   132         self.last_docnum = 0   133         self.last_offset = 0   134    135     def read_document(self):   136    137         "Read a document number and field file offset."   138    139         # Read the document number delta and offset.   140    141         self.last_docnum += self.read_number()   142         self.last_offset += self.read_number()   143    144         return self.last_docnum, self.last_offset   145    146 class FieldDictionaryWriter:   147    148     "Writing field dictionary details."   149    150     def __init__(self, field_writer, field_index_writer, interval):   151         self.field_writer = field_writer   152         self.field_index_writer = field_index_writer   153         self.interval = interval   154         self.entry = 0   155    156     def write_fields(self, docnum, fields):   157    158         "Write details of the document with the given 'docnum' and 'fields'."   
159    160         if self.entry % self.interval == 0:   161             offset = self.field_writer.f.tell()   162             self.field_writer.write_fields(docnum, fields)   163             self.field_index_writer.write_document(docnum, offset)   164         else:   165             self.field_writer.write_fields(docnum, fields)   166    167         self.entry += 1   168    169     def close(self):   170         self.field_writer.close()   171         self.field_index_writer.close()   172    173 class FieldDictionaryReader:   174    175     "Reading field dictionary details."   176    177     def __init__(self, field_reader, field_index_reader):   178         self.field_reader = field_reader   179         self.field_index_reader = field_index_reader   180    181         self.cache = {}   182         self.docs = []   183         try:   184             while 1:   185                 self.docs.append(self.field_index_reader.read_document())   186         except EOFError:   187             pass   188    189         # Large numbers for ordering purposes.   190    191         if self.docs:   192             self.max_offset = self.docs[-1][1]   193         else:   194             self.max_offset = None   195    196     # Iterator convenience methods.   197    198     def __iter__(self):   199         self.rewind()   200         return self   201    202     def next(self):   203         try:   204             return self.read_fields()   205         except EOFError:   206             raise StopIteration   207    208     # Sequential access methods.   209    210     def rewind(self):   211         self.field_reader.rewind()   212    213     def read_fields(self):   214    215         "Return the next document number and fields."   216    217         return self.field_reader.read_fields()   218    219     # Random access methods.   220    221     def get_fields(self, docnum):   222    223         "Read the fields of the document with the given 'docnum'."   
224    225         if self.cache.has_key(docnum):   226             return self.cache[docnum]   227    228         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   229    230         # Get the entry position providing the term or one preceding it.   231    232         if i == -1:   233             return None   234    235         found_docnum, offset = self.docs[i]   236    237         # Read from the fields file.   238    239         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   240    241         # Scan for the document, if necessary.   242    243         try:   244             while docnum > found_docnum:   245                 found_docnum, fields = self.field_reader.read_fields()   246         except EOFError:   247             pass   248    249         # If the document is found, return the fields.   250    251         if docnum == found_docnum:   252    253             # Store the fields in the cache, removing entries if the limit has   254             # been reached.   255    256             keys = self.cache.keys()   257    258             if len(keys) == DOCUMENT_CACHE_LIMIT:   259                 del self.cache[keys[0]]   260    261             self.cache[docnum] = fields   262             return fields   263         else:   264             return None   265    266     def close(self):   267         self.field_reader.close()   268         self.field_index_reader.close()   269    270 # vim: tabstop=4 expandtab shiftwidth=4