paul@44 | 1 | #!/usr/bin/env python |
paul@44 | 2 | |
paul@44 | 3 | """ |
paul@44 | 4 | Specific classes for storing term information. |
paul@44 | 5 | |
paul@89 | 6 | Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk> |
paul@44 | 7 | |
paul@44 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@44 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@44 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@44 | 11 | version. |
paul@44 | 12 | |
paul@44 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@44 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@44 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@44 | 16 | |
paul@44 | 17 | You should have received a copy of the GNU General Public License along |
paul@44 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@44 | 19 | """ |
paul@44 | 20 | |
paul@96 | 21 | from iixr.data import * |
paul@44 | 22 | from iixr.files import * |
paul@60 | 23 | from iixr.phrases import PhraseIterator |
paul@44 | 24 | from os.path import commonprefix # to find common string prefixes |
paul@44 | 25 | |
paul@44 | 26 | class TermWriter(FileWriter): |
paul@44 | 27 | |
paul@44 | 28 | "Writing term information to files." |
paul@44 | 29 | |
paul@96 | 30 | def begin(self, docnum_size, position_size): |
paul@96 | 31 | |
paul@96 | 32 | "Begin writing to the file." |
paul@96 | 33 | |
paul@96 | 34 | self.write_numbers((docnum_size, position_size)) |
paul@90 | 35 | self.end_record() |
paul@96 | 36 | |
paul@96 | 37 | self.data_start = self.tell() |
paul@96 | 38 | self.docnum_size = docnum_size |
paul@96 | 39 | self.position_size = position_size |
paul@96 | 40 | self.subtractor = get_subtractor(docnum_size) |
paul@44 | 41 | self.last_term = "" |
paul@44 | 42 | |
paul@96 | 43 | def write_terms(self, terms): |
paul@44 | 44 | |
paul@44 | 45 | """ |
paul@96 | 46 | Write the 'terms' to the term information file, with each term's details |
paul@96 | 47 | stored in a separate record. |
paul@44 | 48 | """ |
paul@44 | 49 | |
paul@96 | 50 | if hasattr(terms, "items"): |
paul@96 | 51 | terms = terms.items() |
paul@96 | 52 | terms.sort() |
paul@96 | 53 | |
paul@96 | 54 | for term, doc_positions in terms: |
paul@96 | 55 | if not doc_positions: |
paul@96 | 56 | continue |
paul@96 | 57 | |
paul@96 | 58 | if hasattr(doc_positions, "items"): |
paul@96 | 59 | doc_positions = doc_positions.items() |
paul@96 | 60 | |
paul@96 | 61 | docnum, positions = doc_positions[0] |
paul@96 | 62 | |
paul@96 | 63 | if not positions: |
paul@96 | 64 | continue |
paul@96 | 65 | |
paul@96 | 66 | # Start the writing, if appropriate. |
paul@96 | 67 | |
paul@96 | 68 | if self.data_start is None: |
paul@96 | 69 | self.begin(sizeof(docnum), sizeof(positions[0])) |
paul@96 | 70 | |
paul@96 | 71 | # Write each term and document positions. |
paul@96 | 72 | |
paul@96 | 73 | self.write_term(term, doc_positions) |
paul@96 | 74 | self.end_record() |
paul@96 | 75 | |
paul@96 | 76 | # Methods requiring an open record. |
paul@96 | 77 | |
paul@96 | 78 | def write_term(self, term, doc_positions): |
paul@96 | 79 | |
paul@96 | 80 | """ |
paul@96 | 81 | Write the given 'term', its document frequency (number of documents in |
paul@96 | 82 | which it appears), and 'doc_positions' to the term information file. |
paul@96 | 83 | """ |
paul@96 | 84 | |
paul@96 | 85 | self.write_term_only(term) |
paul@96 | 86 | |
paul@96 | 87 | # Write the document frequency and the term positions. |
paul@96 | 88 | |
paul@96 | 89 | self.write_positions(doc_positions) |
paul@96 | 90 | |
paul@96 | 91 | def write_term_plus_remaining(self, term, data): |
paul@96 | 92 | |
paul@96 | 93 | "Write the given 'term' and the document position 'data'." |
paul@96 | 94 | |
paul@96 | 95 | self.write_term_only(term) |
paul@96 | 96 | self.write_remaining(data) |
paul@96 | 97 | |
paul@96 | 98 | def write_term_only(self, term): |
paul@96 | 99 | |
paul@96 | 100 | "Write only the given 'term'." |
paul@96 | 101 | |
paul@75 | 102 | if term <= self.last_term: |
paul@75 | 103 | raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term) |
paul@75 | 104 | |
paul@44 | 105 | # Write the prefix length and term suffix. |
paul@44 | 106 | |
paul@44 | 107 | common = len(commonprefix([self.last_term, term])) |
paul@44 | 108 | suffix = term[common:] |
paul@44 | 109 | |
paul@44 | 110 | self.write_number(common) |
paul@44 | 111 | self.write_string(suffix) |
paul@44 | 112 | |
paul@96 | 113 | self.last_term = term |
paul@96 | 114 | |
paul@96 | 115 | def write_positions(self, doc_positions): |
paul@96 | 116 | |
paul@96 | 117 | "Write the given 'doc_positions' to the file." |
paul@96 | 118 | |
paul@96 | 119 | # Make sure that the positions are sorted. |
paul@96 | 120 | |
paul@96 | 121 | doc_positions.sort() |
paul@96 | 122 | |
paul@44 | 123 | # Write the document frequency. |
paul@44 | 124 | |
paul@96 | 125 | self.write_number(len(doc_positions)) |
paul@96 | 126 | |
paul@96 | 127 | last_docnum = None |
paul@96 | 128 | |
paul@96 | 129 | for docnum, positions in doc_positions: |
paul@96 | 130 | |
paul@96 | 131 | # Store the first document number as it is. |
paul@96 | 132 | |
paul@96 | 133 | if last_docnum is None: |
paul@96 | 134 | docnum_seq = docnum |
paul@96 | 135 | |
paul@96 | 136 | # Reject out-of-order documents. |
paul@96 | 137 | |
paul@96 | 138 | elif docnum < last_docnum: |
paul@96 | 139 | raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum) |
paul@44 | 140 | |
paul@96 | 141 | # Calculate an ongoing delta. |
paul@96 | 142 | |
paul@96 | 143 | else: |
paul@96 | 144 | docnum_seq = self.subtractor(docnum, last_docnum) |
paul@96 | 145 | |
paul@96 | 146 | # Write the document number and positions. |
paul@96 | 147 | |
paul@96 | 148 | self.write_sequence_value(docnum_seq, self.docnum_size) |
paul@96 | 149 | self.write_monotonic_sequence(positions, self.position_size) |
paul@96 | 150 | |
paul@96 | 151 | last_docnum = docnum |
paul@96 | 152 | |
paul@96 | 153 | # Write a terminating byte to indicate that no more document pages |
paul@96 | 154 | # exist. |
paul@96 | 155 | |
paul@96 | 156 | self.write_byte(0) |
paul@44 | 157 | |
paul@44 | 158 | class TermReader(FileReader): |
paul@44 | 159 | |
paul@44 | 160 | "Reading term information from files." |
paul@44 | 161 | |
paul@96 | 162 | def begin(self): |
paul@96 | 163 | |
paul@96 | 164 | "Begin reading from the file." |
paul@96 | 165 | |
paul@96 | 166 | self.begin_record() |
paul@96 | 167 | try: |
paul@96 | 168 | self.docnum_size, self.position_size = self.read_numbers(2) |
paul@96 | 169 | except EOFError: |
paul@96 | 170 | self.docnum_size, self.position_size = 0, 0 # NOTE: No positions! |
paul@96 | 171 | |
paul@96 | 172 | self.data_start = self.tell() |
paul@96 | 173 | self.adder = get_adder(self.docnum_size) |
paul@44 | 174 | self.last_term = "" |
paul@96 | 175 | |
paul@96 | 176 | def get_sizes(self): |
paul@96 | 177 | return self.docnum_size, self.position_size |
paul@96 | 178 | |
paul@96 | 179 | # Methods requiring an open record. |
paul@44 | 180 | |
paul@44 | 181 | def read_term(self): |
paul@44 | 182 | |
paul@96 | 183 | "Read a term and its document positions from the term information file." |
paul@96 | 184 | |
paul@96 | 185 | # Read the term. |
paul@96 | 186 | |
paul@96 | 187 | self.read_term_only() |
paul@96 | 188 | |
paul@96 | 189 | # Read the document frequency and the term positions. |
paul@96 | 190 | |
paul@96 | 191 | positions = self.read_positions() |
paul@96 | 192 | |
paul@96 | 193 | return self.last_term, positions |
paul@96 | 194 | |
paul@96 | 195 | def read_term_plus_remaining(self): |
paul@96 | 196 | |
paul@44 | 197 | """ |
paul@96 | 198 | Read a term and the unprocessed document position data. |
paul@44 | 199 | """ |
paul@44 | 200 | |
paul@96 | 201 | self.read_term_only() |
paul@96 | 202 | return self.last_term, self.read_remaining() |
paul@96 | 203 | |
paul@96 | 204 | def read_term_only(self): |
paul@96 | 205 | |
paul@96 | 206 | "Read a term only." |
paul@96 | 207 | |
paul@44 | 208 | # Read the prefix length and term suffix. |
paul@44 | 209 | |
paul@44 | 210 | common = self.read_number() |
paul@44 | 211 | suffix = self.read_string() |
paul@44 | 212 | |
paul@44 | 213 | self.last_term = self.last_term[:common] + suffix |
paul@96 | 214 | return self.last_term |
paul@44 | 215 | |
paul@96 | 216 | def read_positions(self): |
paul@44 | 217 | |
paul@96 | 218 | "Read document positions from the term information file." |
paul@44 | 219 | |
paul@96 | 220 | doc_positions = [] |
paul@44 | 221 | |
paul@96 | 222 | while 1: |
paul@44 | 223 | |
paul@96 | 224 | # Read the document frequency. |
paul@44 | 225 | |
paul@96 | 226 | npositions = self.read_number() |
paul@91 | 227 | |
paul@96 | 228 | last_docnum = None |
paul@96 | 229 | i = 0 |
paul@96 | 230 | while i < npositions: |
paul@44 | 231 | |
paul@96 | 232 | # Read the document number. |
paul@44 | 233 | |
paul@96 | 234 | docnum = self.read_sequence_value(self.docnum_size) |
paul@96 | 235 | if last_docnum is not None: |
paul@96 | 236 | docnum = self.adder(docnum, last_docnum) |
paul@44 | 237 | |
paul@96 | 238 | # Read the positions. |
paul@44 | 239 | |
paul@96 | 240 | positions = self.read_monotonic_sequence(self.position_size) |
paul@96 | 241 | doc_positions.append((docnum, positions)) |
paul@44 | 242 | |
paul@96 | 243 | last_docnum = docnum |
paul@96 | 244 | i += 1 |
paul@44 | 245 | |
paul@96 | 246 | # Read a terminating byte to discover whether more document pages |
paul@96 | 247 | # exist. |
paul@44 | 248 | |
paul@96 | 249 | if not self.read_byte(): |
paul@96 | 250 | break |
paul@44 | 251 | |
paul@96 | 252 | return doc_positions |
paul@81 | 253 | |
paul@96 | 254 | class TermIterator(TermReader): |
paul@58 | 255 | |
paul@96 | 256 | "An iterator over terms and positions read from a file." |
paul@44 | 257 | |
paul@44 | 258 | def __iter__(self): |
paul@44 | 259 | return self |
paul@44 | 260 | |
paul@44 | 261 | def next(self): |
paul@44 | 262 | try: |
paul@96 | 263 | self.begin_record() |
paul@44 | 264 | return self.read_term() |
paul@44 | 265 | except EOFError: |
paul@44 | 266 | raise StopIteration |
paul@44 | 267 | |
paul@96 | 268 | class TermDataIterator(TermReader): |
paul@90 | 269 | |
paul@96 | 270 | "An iterator over terms and unprocessed document positions data." |
paul@90 | 271 | |
paul@96 | 272 | def __iter__(self): |
paul@96 | 273 | return self |
paul@72 | 274 | |
paul@96 | 275 | def next(self): |
paul@44 | 276 | try: |
paul@96 | 277 | self.begin_record() |
paul@96 | 278 | return self.read_term_plus_remaining() |
paul@44 | 279 | except EOFError: |
paul@96 | 280 | raise StopIteration |
paul@44 | 281 | |
paul@44 | 282 | # vim: tabstop=4 expandtab shiftwidth=4 |