#!/usr/bin/env python

"""
Specific classes for storing term information.

Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT ANY
WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
PARTICULAR PURPOSE.  See the GNU General Public License for more details.

You should have received a copy of the GNU General Public License along
with this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from iixr.data import *
from iixr.files import *
from iixr.phrases import PhraseIterator
from os.path import commonprefix # to find common string prefixes

class TermWriter(FileWriter):

    "Writing term information to files."

    def begin(self, docnum_size, position_size):

        """
        Begin writing to the file: write 'docnum_size' and 'position_size' as a
        header record, then initialise the state used when writing terms.
        """

        self.write_numbers((docnum_size, position_size))
        self.end_record()

        self.data_start = self.tell()
        self.docnum_size = docnum_size
        self.position_size = position_size
        self.subtractor = get_subtractor(docnum_size)

        # The previous term written, used to compute common prefixes and to
        # enforce term ordering.

        self.last_term = ""

    def write_terms(self, terms):

        """
        Write the 'terms' to the term information file, with each term's details
        stored in a separate record.

        The 'terms' may be a mapping from terms to document positions or a list
        of (term, doc_positions) pairs; a list is sorted in place. Each
        'doc_positions' value may itself be a mapping or a list of
        (docnum, positions) pairs.

        Terms having no document positions, or whose first document has no
        positions, are skipped.
        """

        # Copy mapping items into a list, since dictionary views can neither be
        # sorted in place nor indexed.

        if hasattr(terms, "items"):
            terms = list(terms.items())
        terms.sort()

        for term, doc_positions in terms:
            if not doc_positions:
                continue

            if hasattr(doc_positions, "items"):
                doc_positions = list(doc_positions.items())

            docnum, positions = doc_positions[0]

            if not positions:
                continue

            # Start the writing, if appropriate, taking the value sizes from
            # the first document number and position encountered.

            if self.data_start is None:
                self.begin(sizeof(docnum), sizeof(positions[0]))

            # Write each term and document positions.

            self.write_term(term, doc_positions)
            self.end_record()

    # Methods requiring an open record.

    def write_term(self, term, doc_positions):

        """
        Write the given 'term', its document frequency (number of documents in
        which it appears), and 'doc_positions' to the term information file.
        """

        self.write_term_only(term)

        # Write the document frequency and the term positions.

        self.write_positions(doc_positions)

    def write_term_plus_remaining(self, term, data):

        "Write the given 'term' and the document position 'data'."

        self.write_term_only(term)
        self.write_remaining(data)

    def write_term_only(self, term):

        """
        Write only the given 'term', stored as the length of the prefix it
        shares with the previously written term followed by the remaining
        suffix.

        Raise ValueError if 'term' is not greater than the previous term.
        """

        if term <= self.last_term:
            raise ValueError("Term %r precedes the previous term %r." % (term, self.last_term))

        # Write the prefix length and term suffix.

        common = len(commonprefix([self.last_term, term]))
        suffix = term[common:]

        self.write_number(common)
        self.write_string(suffix)

        self.last_term = term

    def write_positions(self, doc_positions):

        """
        Write the given 'doc_positions', a list of (docnum, positions) pairs, to
        the file. The list is sorted in place before being written.

        Raise ValueError if a document number is less than its predecessor.
        """

        # Make sure that the positions are sorted.

        doc_positions.sort()

        # Write the document frequency.

        self.write_number(len(doc_positions))

        last_docnum = None

        for docnum, positions in doc_positions:

            # Store the first document number as it is.

            if last_docnum is None:
                docnum_seq = docnum

            # Reject out-of-order documents.

            elif docnum < last_docnum:
                raise ValueError("Document number %r is less than previous number %r." % (docnum, last_docnum))

            # Calculate an ongoing delta.

            else:
                docnum_seq = self.subtractor(docnum, last_docnum)

            # Write the document number and positions.

            self.write_sequence_value(docnum_seq, self.docnum_size)
            self.write_monotonic_sequence(positions, self.position_size)

            last_docnum = docnum

        # Write a terminating byte to indicate that no more document pages
        # exist.

        self.write_byte(0)

class TermReader(FileReader):

    "Reading term information from files."

    def begin(self):

        """
        Begin reading from the file: read the document number and position
        sizes from the first record and initialise the state used when reading
        terms. Where the sizes cannot be read because the end of the file is
        reached, both sizes are set to zero.
        """

        self.begin_record()
        try:
            self.docnum_size, self.position_size = self.read_numbers(2)
        except EOFError:
            self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!

        self.data_start = self.tell()
        self.adder = get_adder(self.docnum_size)

        # The previous term read, needed to reconstruct each term from its
        # stored prefix length and suffix.

        self.last_term = ""

    def get_sizes(self):

        "Return a (docnum_size, position_size) tuple."

        return self.docnum_size, self.position_size

    # Methods requiring an open record.

    def read_term(self):

        """
        Read a term and its document positions from the term information file,
        returning a (term, doc_positions) tuple.
        """

        # Read the term.

        self.read_term_only()

        # Read the document frequency and the term positions.

        positions = self.read_positions()

        return self.last_term, positions

    def read_term_plus_remaining(self):

        """
        Read a term and the unprocessed document position data, returning a
        (term, data) tuple.
        """

        self.read_term_only()
        return self.last_term, self.read_remaining()

    def read_term_only(self):

        """
        Read a term only, rebuilding it from the previously read term, and
        return it.
        """

        # Read the prefix length and term suffix.

        common = self.read_number()
        suffix = self.read_string()

        self.last_term = self.last_term[:common] + suffix
        return self.last_term

    def read_positions(self):

        """
        Read document positions from the term information file, returning a list
        of (docnum, positions) tuples gathered from all document pages in the
        record.
        """

        doc_positions = []

        while True:

            # Read the document frequency.

            npositions = self.read_number()

            # Within a page, only the first document number is stored as it is;
            # each later one is combined with its predecessor.

            last_docnum = None
            i = 0
            while i < npositions:

                # Read the document number.

                docnum = self.read_sequence_value(self.docnum_size)
                if last_docnum is not None:
                    docnum = self.adder(docnum, last_docnum)

                # Read the positions.

                positions = self.read_monotonic_sequence(self.position_size)
                doc_positions.append((docnum, positions))

                last_docnum = docnum
                i += 1

            # Read a terminating byte to discover whether more document pages
            # exist.

            if not self.read_byte():
                break

        return doc_positions

class TermIterator(TermReader):

    "An iterator over terms and positions read from a file."

    def __iter__(self):
        return self

    def next(self):

        """
        Return the next (term, doc_positions) tuple, raising StopIteration at
        the end of the file.
        """

        try:
            self.begin_record()
            return self.read_term()
        except EOFError:
            raise StopIteration

    # Support the Python 3 iterator protocol as well.

    __next__ = next

class TermDataIterator(TermReader):

    "An iterator over terms and unprocessed document positions data."

    def __iter__(self):
        return self

    def next(self):

        """
        Return the next (term, data) tuple, raising StopIteration at the end of
        the file.
        """

        try:
            self.begin_record()
            return self.read_term_plus_remaining()
        except EOFError:
            raise StopIteration

    # Support the Python 3 iterator protocol as well.

    __next__ = next

# vim: tabstop=4 expandtab shiftwidth=4