iixr (file iixr/terms.py at 6542c54d115b)

     1 #!/usr/bin/env python     2      3 """     4 Specific classes for storing term information.     5      6 Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT ANY    14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A    15 PARTICULAR PURPOSE.  See the GNU General Public License for more details.    16     17 You should have received a copy of the GNU General Public License along    18 with this program.  If not, see <http://www.gnu.org/licenses/>.    19 """    20     21 from iixr.data import *    22 from iixr.files import *    23 from iixr.phrases import PhraseIterator    24 from os.path import commonprefix # to find common string prefixes    25     26 class TermWriter(FileWriter):    27     28     "Writing term information to files."    29     30     def begin(self, docnum_size, position_size):    31     32         "Begin writing to the file."    33     34         self.write_numbers((docnum_size, position_size))    35         self.end_record()    36     37         self.data_start = self.tell()    38         self.docnum_size = docnum_size    39         self.position_size = position_size    40         self.subtractor = get_subtractor(docnum_size)    41         self.last_term = ""    42     43     def write_terms(self, terms):    44     45         """    46         Write the 'terms' to the term information file, with each term's details    47         stored in a separate record.    48         """    49     50         if hasattr(terms, "items"):    51             terms = terms.items()    52             terms.sort()    53     54         for term, doc_positions in terms:    55             if not doc_positions:    56                 continue    57     58             if hasattr(doc_positions, "items"):    59                 doc_positions = doc_positions.items()    60     61             docnum, positions = doc_positions[0]    62     63             if not positions:    64                 continue    65     66             # Start the writing, if appropriate.    67     68             if self.data_start is None:    69                 self.begin(sizeof(docnum), sizeof(positions[0]))    70     71             # Write each term and document positions.    72     73             self.write_term(term, doc_positions)    74             self.end_record()    75     76     # Methods requiring an open record.    77     78     def write_term(self, term, doc_positions):    79     80         """    81         Write the given 'term', its document frequency (number of documents in    82         which it appears), and 'doc_positions' to the term information file.    83         """    84     85         self.write_term_only(term)    86     87         # Write the document frequency and the term positions.    88     89         self.write_positions(doc_positions)    90     91     def write_term_plus_remaining(self, term, data):    92     93         "Write the given 'term' and the document position 'data'."    94     95         self.write_term_only(term)    96         self.write_remaining(data)    97     98     def write_term_only(self, term):    99    100         "Write only the given 'term'."   101    102         if term <= self.last_term:   103             raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)   104    105         # Write the prefix length and term suffix.   106    107         common = len(commonprefix([self.last_term, term]))   108         suffix = term[common:]   109    110         self.write_number(common)   111         self.write_string(suffix)   112    113         self.last_term = term   114    115     def write_positions(self, doc_positions):   116    117         "Write the given 'doc_positions' to the file."   118    119         # Make sure that the positions are sorted.   120    121         doc_positions.sort()   122    123         # Write the document frequency.   124    125         self.write_number(len(doc_positions))   126    127         last_docnum = None   128    129         for docnum, positions in doc_positions:   130    131             # Store the first document number as it is.   132    133             if last_docnum is None:   134                 docnum_seq = docnum   135    136             # Reject out-of-order documents.   137    138             elif docnum < last_docnum:   139                 raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)   140    141             # Calculate an ongoing delta.   142    143             else:   144                 docnum_seq = self.subtractor(docnum, last_docnum)   145    146             # Write the document number and positions.   147    148             self.write_sequence_value(docnum_seq, self.docnum_size)   149             self.write_monotonic_sequence(positions, self.position_size)   150    151             last_docnum = docnum   152    153         # Write a terminating byte to indicate that no more document pages   154         # exist.   155    156         self.write_byte(0)   157    158 class TermReader(FileReader):   159    160     "Reading term information from files."   161    162     def begin(self):   163    164         "Begin reading from the file."   165    166         self.begin_record()   167         try:   168             self.docnum_size, self.position_size = self.read_numbers(2)   169         except EOFError:   170             self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!   171    172         self.data_start = self.tell()   173         self.adder = get_adder(self.docnum_size)   174         self.last_term = ""   175    176     def get_sizes(self):   177         return self.docnum_size, self.position_size   178    179     # Methods requiring an open record.   180    181     def read_term(self):   182    183         "Read a term and its document positions from the term information file."   184    185         # Read the term.   186    187         self.read_term_only()   188    189         # Read the document frequency and the term positions.   190    191         positions = self.read_positions()   192    193         return self.last_term, positions   194    195     def read_term_plus_remaining(self):   196    197         """   198         Read a term and the unprocessed document position data.   199         """   200    201         self.read_term_only()   202         return self.last_term, self.read_remaining()   203    204     def read_term_only(self):   205    206         "Read a term only."   207    208         # Read the prefix length and term suffix.   209    210         common = self.read_number()   211         suffix = self.read_string()   212    213         self.last_term = self.last_term[:common] + suffix   214         return self.last_term   215    216     def read_positions(self):   217    218         "Read document positions from the term information file."   219    220         doc_positions = []   221    222         while 1:   223    224             # Read the document frequency.   225    226             npositions = self.read_number()   227    228             last_docnum = None   229             i = 0   230             while i < npositions:   231    232                 # Read the document number.   233    234                 docnum = self.read_sequence_value(self.docnum_size)   235                 if last_docnum is not None:   236                     docnum = self.adder(docnum, last_docnum)   237    238                 # Read the positions.   239    240                 positions = self.read_monotonic_sequence(self.position_size)   241                 doc_positions.append((docnum, positions))   242    243                 last_docnum = docnum   244                 i += 1   245    246             # Read a terminating byte to discover whether more document pages   247             # exist.   248    249             if not self.read_byte():   250                 break   251    252         return doc_positions   253    254 class TermIterator(TermReader):   255    256     "An iterator over terms and positions read from a file."   257    258     def __iter__(self):   259         return self   260    261     def next(self):   262         try:   263             self.begin_record()   264             return self.read_term()   265         except EOFError:   266             raise StopIteration   267    268 class TermDataIterator(TermReader):   269    270     "An iterator over terms and unprocessed document positions data."   271    272     def __iter__(self):   273         return self   274    275     def next(self):   276         try:   277             self.begin_record()   278             return self.read_term_plus_remaining()   279         except EOFError:   280             raise StopIteration   281    282 # vim: tabstop=4 expandtab shiftwidth=4