# iixr (file iixr/fields.py at 74e2e30aabea)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing document information.     5      6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.data import *    22 from iixr.files import *    23 from bisect import bisect_right  # to find terms in the dictionary index    24     25 DOCUMENT_CACHE_LIMIT = 10000    26     27 class FieldWriter(FileWriter):    28     29     "Writing field data to files."    30     31     def begin(self, docnum_size):    32         self.write_number(docnum_size)    33         self.end_record()    34         self.docnum_size = docnum_size    35         self.data_start = self.tell()    36     37     def reset(self):    38         self.end_record()    39         self.last_docnum = None    40         self.subtractor = None    41     42     def write_fields(self, docnum, fields):    43     44         """    45         Write for the given 'docnum', a list of 'fields' (integer, string pairs    46         representing field identifiers and values respectively).    47         """    48     49         # Find the size of document number values.    
50     51         if self.last_docnum is not None:    52             docnum_seq = self.subtractor(docnum, self.last_docnum)    53         else:    54             self.subtractor = get_subtractor(docnum)    55             docnum_seq = docnum    56     57         # Write the document number.    58     59         self.write_sequence_value(docnum_seq, self.docnum_size)    60     61         # Write the number of fields.    62     63         self.write_number(len(fields))    64     65         # Write the fields themselves.    66     67         for i, field in fields:    68             self.write_number(i)    69             self.write_string(field, 1) # compress    70     71         self.last_docnum = docnum    72     73 class FieldReader(FileReader):    74     75     "Reading field data from files."    76     77     def begin(self):    78         self.begin_record()    79         try:    80             self.docnum_size = self.read_number()    81         except EOFError:    82             self.docnum_size = 0 # NOTE: No fields!    83         self.data_start = self.tell()    84     85     def reset(self):    86         self.last_docnum = None    87         self.adder = None    88         self.begin_record()    89     90     def read_fields(self):    91     92         """    93         Read fields from the file, returning a tuple containing the document    94         number and a list of field (identifier, value) pairs.    95         """    96     97         # Read the document number.    98     99         docnum = self.read_sequence_value(self.docnum_size)   100    101         if self.last_docnum is not None:   102             self.last_docnum = self.adder(docnum, self.last_docnum)   103         else:   104             self.adder = get_adder(docnum)   105             self.last_docnum = docnum   106    107         # Read the number of fields.   108    109         nfields = self.read_number()   110    111         # Collect the fields.   
112    113         fields = []   114         i = 0   115    116         while i < nfields:   117             identifier = self.read_number()   118             value = self.read_string(1) # decompress   119             fields.append((identifier, value))   120             i += 1   121    122         return self.last_docnum, fields   123    124     def read_document_fields(self, docnum, offset):   125    126         """   127         Read fields for 'docnum' at the given 'offset'. This permits the   128         retrieval of details for the specified document, as well as scanning for   129         later documents.   130         """   131    132         self.seek(offset)   133         bad_docnum, fields = self.read_fields()   134         self.last_docnum = docnum   135         return docnum, fields   136    137 class FieldIndexWriter(FieldWriter):   138    139     "Writing field index details to files."   140    141     def reset(self):   142         FieldWriter.reset(self)   143         self.last_offset = 0   144    145     def write_document(self, docnum, offset):   146    147         """   148         Write for the given 'docnum', the 'offset' at which the fields for the   149         document are stored in the fields file.   150         """   151    152         # Find the size of document number values.   153    154         if self.last_docnum is not None:   155             docnum_seq = self.subtractor(docnum, self.last_docnum)   156         else:   157             self.subtractor = get_subtractor(docnum)   158             docnum_seq = docnum   159    160         # Write the document number.   161    162         self.write_sequence_value(docnum_seq, self.docnum_size)   163    164         # Write the offset delta.   
165    166         self.write_number(offset - self.last_offset)   167    168         self.last_docnum = docnum   169         self.last_offset = offset   170    171 class FieldIndexReader(FieldReader):   172    173     "Reading field index details from files."   174    175     def reset(self):   176         FieldReader.reset(self)   177         self.last_offset = 0   178    179     def read_document(self):   180    181         "Read a document number and field file offset."   182    183         # Read the document number.   184    185         docnum = self.read_sequence_value(self.docnum_size)   186    187         if self.last_docnum is not None:   188             self.last_docnum = self.adder(docnum, self.last_docnum)   189         else:   190             self.adder = get_adder(docnum)   191             self.last_docnum = docnum   192    193         # Read the offset.   194    195         self.last_offset += self.read_number()   196    197         return self.last_docnum, self.last_offset   198    199 class FieldDictionaryWriter:   200    201     "Writing field dictionary details."   202    203     def __init__(self, field_writer, field_index_writer, interval):   204         self.field_writer = field_writer   205         self.field_index_writer = field_index_writer   206         self.interval = interval   207         self.entry = 0   208    209     def write_fields(self, docnum, fields):   210    211         "Write details of the given 'docnum' and 'fields'."   
212    213         if self.entry == 0:   214             docnum_size = sizeof(docnum)   215             self.field_writer.begin(docnum_size)   216             self.field_index_writer.begin(docnum_size)   217             self.field_index_writer.reset()   218    219         if self.entry % self.interval == 0:   220             self.field_writer.reset()   221             offset = self.field_writer.tell()   222             self.field_writer.write_fields(docnum, fields)   223             self.field_index_writer.write_document(docnum, offset)   224         else:   225             self.field_writer.write_fields(docnum, fields)   226    227         self.entry += 1   228    229     def close(self):   230         self.field_writer.close()   231         self.field_index_writer.close()   232    233 class FieldDictionaryReader:   234    235     "Reading field dictionary details."   236    237     def __init__(self, field_reader, field_index_reader):   238         self.field_reader = field_reader   239         self.field_index_reader = field_index_reader   240    241         self.field_reader.reset()   242         self.field_index_reader.reset()   243    244         self.cache = {}   245    246         self.entry = 0   247         self.docs = []   248         try:   249             while 1:   250                 self.docs.append(self.field_index_reader.read_document())   251         except EOFError:   252             pass   253    254         # Large numbers for ordering purposes.   255    256         if self.docs:   257             self.max_offset = self.docs[-1][1]   258         else:   259             self.max_offset = None   260    261     # Iterator convenience methods.   262    263     def __iter__(self):   264         self.rewind()   265         return self   266    267     def next(self):   268         try:   269             return self.read_fields()   270         except EOFError:   271             raise StopIteration   272    273     # Sequential access methods.   
274    275     def rewind(self):   276         self.field_reader.rewind()   277    278     def read_fields(self):   279    280         "Return the next document number and fields."   281    282         try:   283             return self.field_reader.read_fields()   284         except EOFError:   285             self.entry += 1   286             try:   287                 found_docnum, offset = self.docs[self.entry]   288             except IndexError:   289                 raise EOFError   290             else:   291                 self.field_reader.reset()   292                 return self.field_reader.read_fields()   293    294     # Random access methods.   295    296     def get_fields(self, docnum):   297    298         "Read the fields of the document with the given 'docnum'."   299    300         if self.cache.has_key(docnum):   301             return self.cache[docnum]   302    303         i = bisect_right(self.docs, (docnum, self.max_offset)) - 1   304    305         # Get the entry position providing the term or one preceding it.   306    307         if i == -1:   308             return None   309    310         found_docnum, offset = self.docs[i]   311    312         # Read from the fields file.   313    314         found_docnum, fields = self.field_reader.read_document_fields(found_docnum, offset)   315    316         # Scan for the document, if necessary.   317    318         try:   319             while docnum > found_docnum:   320                 found_docnum, fields = self.field_reader.read_fields()   321         except EOFError:   322             pass   323    324         # If the document is found, return the fields.   325    326         if docnum == found_docnum:   327    328             # Store the fields in the cache, removing entries if the limit has   329             # been reached.   
330    331             keys = self.cache.keys()   332    333             if len(keys) == DOCUMENT_CACHE_LIMIT:   334                 del self.cache[keys[0]]   335    336             self.cache[docnum] = fields   337             return fields   338         else:   339             return None   340    341     def close(self):   342         self.field_reader.close()   343         self.field_index_reader.close()   344    345 # vim: tabstop=4 expandtab shiftwidth=4