#!/usr/bin/env python

import iixr
import os

# Remove old test files. "testPI" is listed as well because the position
# index tests further down open it for writing.

for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
    try:
        os.remove(filename)
    except OSError:
        # Best effort: the file was absent or could not be removed.
        pass

# Empty and then remove the index directory. Any OSError abandons the attempt.

try:
    for filename in os.listdir("test_index"):
        os.remove(os.path.join("test_index", filename))
    os.rmdir("test_index")
except OSError:
    pass

# Test basic data types.
#
# Each number is written to "test" and read back in the same order. Every
# check prints three space-separated items: whether the values match, the
# expected value and the value read back. The line is formatted as a single
# string before being printed.

numbers = [12345678, 0, 1, 127, 128, 255, 256]

f = open("test", "wb")
w = iixr.FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

f = open("test", "rb")
r = iixr.FileReader(f)
for number in numbers:
    n = r.read_number()
    print("%s %s %s" % (number == n, number, n))
r.close()

# Test positions.
#
# Each inner list holds (docnum, positions) pairs. The writer and the reader
# are both reset after every inner list.

all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100]),
        (125, [11, 99, 199]),
        (130, [77, 78, 80, 82, 89])
    ],
    [
        (78, [9]),
        (196, [10, 11]),
        (197, [17, 21, 30])
    ]
]

f = open("testP", "wb")
w = iixr.PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    w.reset()
w.close()

f = open("testP", "rb")
r = iixr.PositionReader(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        d, p = r.read_positions()
        for expected, actual in ((docnum, d), (positions, p)):
            print("%s %s %s" % (expected == actual, expected, actual))
    r.reset()
r.close()

# Test position index files.

# Each inner list holds the (docnum, pos_offset, count) entries written for
# one term.

indexed_positions = [
    [
        (1234, 0, 100),
        (2345, 700, 100),
        (3456, 1900, 50)
    ],
    [
        (4567, 2800, 20)
    ]
]

# For every term, remember the value returned by the first write together
# with the sum of the counts written for that term.

offsets = []
f = open("testPI", "wb")
w = iixr.PositionIndexWriter(f)
for term_positions in indexed_positions:
    first_offset = None
    doc_frequency = 0
    w.reset()
    for docnum, pos_offset, count in term_positions:
        entry_offset = w.write_positions(docnum, pos_offset, count)
        if first_offset is None:
            first_offset = entry_offset
        doc_frequency += count
    offsets.append((first_offset, doc_frequency))
w.close()

# Read the terms back in the opposite order to the one they were written in.
# Each check prints: match result, expected value, value read back.

f = open("testPI", "rb")
r = iixr.PositionIndexReader(f)
offsets.reverse()
indexed_positions.reverse()
for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
    found_positions = r.read_term_positions(offset, doc_frequency)
    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
        for expected, actual in ((docnum, dn), (pos_offset, po), (count, c)):
            print("%s %s %s" % (expected == actual, expected, actual))
r.close()

# Test position dictionaries.
#
# A position writer and a position index writer are combined in a dictionary
# writer; the offset and document frequency it returns for each group are
# kept for the read-back below.

f = open("testP", "wb")
w = iixr.PositionWriter(f)
f2 = open("testPI", "wb")
w2 = iixr.PositionIndexWriter(f2)
wd = iixr.PositionDictionaryWriter(w, w2, 2)
offsets = []
for doc_positions in all_doc_positions:
    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
    offsets.append((offset, doc_frequency))
wd.close()

# Again, read back in the reverse of the writing order.

f = open("testP", "rb")
r = iixr.PositionReader(f)
f2 = open("testPI", "rb")
r2 = iixr.PositionIndexReader(f2)
rd = iixr.PositionDictionaryReader(r, r2)
offsets.reverse()
all_doc_positions.reverse()
for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
    dp = list(rd.read_term_positions(offset, doc_frequency))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()

# Test fields.

# (docnum, field values) pairs. Fields are written as (index, value) pairs
# produced by enumerate().

doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"])
]

f = open("testF", "wb")
w = iixr.FieldWriter(f)
for docnum, fields in doc_fields:
    w.write_fields(docnum, list(enumerate(fields)))
w.close()

# Read the documents back in writing order. Each check prints: match result,
# expected value, value read back.

f = open("testF", "rb")
r = iixr.FieldReader(f)
for docnum, fields in doc_fields:
    dn, df = r.read_fields()
    numbered_fields = list(enumerate(fields))
    for expected, actual in ((docnum, dn), (numbered_fields, df)):
        print("%s %s %s" % (expected == actual, expected, actual))
r.close()

# Test field index files.
#
# (docnum, offset) pairs are written and then read back in the same order.

indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765)
]

f = open("testFI", "wb")
w = iixr.FieldIndexWriter(f)
for docnum, offset in indexed_docs:
    w.write_document(docnum, offset)
w.close()

f = open("testFI", "rb")
r = iixr.FieldIndexReader(f)
for docnum, offset in indexed_docs:
    dn, o = r.read_document()
    for expected, actual in ((docnum, dn), (offset, o)):
        print("%s %s %s" % (expected == actual, expected, actual))
r.close()

# Test field dictionaries.

# Write every document through a dictionary writer built from a field writer
# and a field index writer.

f = open("testF", "wb")
w = iixr.FieldWriter(f)
f2 = open("testFI", "wb")
w2 = iixr.FieldIndexWriter(f2)
wd = iixr.FieldDictionaryWriter(w, w2, 3)
for docnum, fields in doc_fields:
    wd.write_fields(docnum, list(enumerate(fields)))
wd.close()

# Look the documents up by number, last written first, then ask for two
# document numbers that were never written and check that None comes back.

f = open("testF", "rb")
r = iixr.FieldReader(f)
f2 = open("testFI", "rb")
r2 = iixr.FieldIndexReader(f2)
rd = iixr.FieldDictionaryReader(r, r2)
doc_fields_reversed = doc_fields[::-1]
for docnum, fields in doc_fields_reversed:
    df = dict(rd.get_fields(docnum))
    expected_fields = dict(enumerate(fields))
    print("%s %s %s" % (expected_fields == df, expected_fields, df))
for docnum in (13579, 246810):
    df = rd.get_fields(docnum)
    print("%s %s" % (df is None, df))

# (Test sequential access.)

rd.rewind()
for docnum, fields in doc_fields:
    dn, df = rd.read_fields()
    numbered_fields = list(enumerate(fields))
    for expected, actual in ((docnum, dn), (numbered_fields, df)):
        print("%s %s %s" % (expected == actual, expected, actual))
rd.close()

# Test terms.

terms = [
    # term, offset, frequency, doc_frequency
    ("aardvark", 100000123, 1, 1),
    ("anteater", 100000456, 2, 1),
    ("badger", 100000789, 13, 7),
    ("bull", 1000001234, 59, 17),
    ("bulldog", 1000002345, 99, 80),
    ("cat", 1000003456, 89, 28)
]

f = open("test", "wb")
w = iixr.TermWriter(f)
for term, offset, frequency, doc_frequency in terms:
    w.write_term(term, offset, frequency, doc_frequency)
w.close()

# Read the terms back in writing order and compare every component. Each
# check prints: match result, expected value, value read back.

f = open("test", "rb")
r = iixr.TermReader(f)
for term, offset, frequency, doc_frequency in terms:
    t, o, fr, df = r.read_term()
    for expected, actual in (
        (term, t),
        (offset, o),
        (frequency, fr),
        (doc_frequency, df),
    ):
        print("%s %s %s" % (expected == actual, expected, actual))
r.close()

# Test terms in index files.

indexed_terms = [
    # term, offset, frequency, doc_frequency, info_offset
    ("aardvark", 100000123, 1, 1, 200000321),
    ("anteater", 100000456, 2, 1, 200000654),
    ("badger", 100000789, 13, 7, 200000987),
    ("bull", 1000001234, 59, 17, 200004321),
    ("bulldog", 1000002345, 99, 80, 200005432),
    ("cat", 1000003456, 89, 28, 200006543)
]

f = open("test", "wb")
w = iixr.TermIndexWriter(f)
for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
    w.write_term(term, offset, frequency, doc_frequency, info_offset)
w.close()

# Read the entries back in writing order and compare all five components.
# Each check prints: match result, expected value, value read back.

f = open("test", "rb")
r = iixr.TermIndexReader(f)
for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
    t, o, fr, df, i = r.read_term()
    for expected, actual in (
        (term, t),
        (offset, o),
        (frequency, fr),
        (doc_frequency, df),
        (info_offset, i),
    ):
        print("%s %s %s" % (expected == actual, expected, actual))
r.close()

# Test dictionaries with only term data.

# Build a term dictionary writer from a term writer, a term index writer and
# a position dictionary writer. Only the term records are written here.

f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = iixr.PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = iixr.PositionIndexWriter(f4)
wp = iixr.PositionDictionaryWriter(w3, w4, 2)
wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
for term, offset, frequency, doc_frequency in terms:
    wd._write_term(term, offset, frequency, doc_frequency)
wd.close()

# Look the terms up last-written first, then look up two terms that were
# never written and check that None comes back.

f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
f3 = open("testP", "rb")
r3 = iixr.PositionReader(f3)
f4 = open("testPI", "rb")
r4 = iixr.PositionIndexReader(f4)
rp = iixr.PositionDictionaryReader(r3, r4)
rd = iixr.TermDictionaryReader(r, r2, rp)
terms_reversed = terms[::-1]
for term, offset, frequency, doc_frequency in terms_reversed:
    o, fr, df = rd._find_term(term)
    for expected, actual in ((offset, o), (frequency, fr), (doc_frequency, df)):
        print("%s %s %s" % (expected == actual, expected, actual))
for term in ("dog", "dingo"):
    t = rd._find_term(term)
    print("%s %s" % (t is None, t))

# (Test term prefix searching.)
#
# find_terms() is deliberately called twice per check: once for the
# comparison and once for the value that is displayed.

for prefix, matching_terms in (
    ("a", ["aardvark", "anteater"]),
    ("bu", ["bull", "bulldog"]),
    ("c", ["cat"]),
    ("d", []),
):
    print("%s %s %s" % (
        rd.find_terms(prefix) == matching_terms,
        rd.find_terms(prefix),
        matching_terms,
    ))
rd.close()

# Test dictionaries with term and position data.

# (term, [(docnum, positions), ...]) records written through the dictionary.

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

# (term, docnum, expected positions) lookups used by the iterator checks that
# follow this section; None marks a document expected to yield no positions.

position_dict_tests = [
    ("badger", 19, [55, 1333]),
    ("badger", 20, None),
    ("bull", 6, [128]),
    ("bull", 26, [1, 3, 5, 7, 9]),
    ("cat", 111, None),
    ("cat", 123, [12, 145, 196]),
    ("cat", 1234, None)
]

f = open("test", "wb")
w = iixr.TermWriter(f)
f2 = open("testI", "wb")
w2 = iixr.TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = iixr.PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = iixr.PositionIndexWriter(f4)
wp = iixr.PositionDictionaryWriter(w3, w4, 2)
wd = iixr.TermDictionaryWriter(w, w2, wp, 3)
for term, doc_positions in terms_with_positions:
    wd.write_term_positions(term, doc_positions)
wd.close()

# Look the terms up last-written first, then look up three terms that were
# never written and check that None comes back. The reader is left open
# because the checks that follow keep using it.

f = open("test", "rb")
r = iixr.TermReader(f)
f2 = open("testI", "rb")
r2 = iixr.TermIndexReader(f2)
f3 = open("testP", "rb")
r3 = iixr.PositionReader(f3)
f4 = open("testPI", "rb")
r4 = iixr.PositionIndexReader(f4)
rp = iixr.PositionDictionaryReader(r3, r4)
rd = iixr.TermDictionaryReader(r, r2, rp)
terms_reversed = terms_with_positions[::-1]
for term, doc_positions in terms_reversed:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
for term in ("aaa", "dog", "dingo"):
    dp = rd.find_positions(term)
    print("%s %s" % (dp is None, dp))

# (Test iterators.)

# A check passes when both the expected positions and the result are None,
# or when a result exists and lists exactly the expected positions.

for term, docnum, positions in position_dict_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    if pos is None:
        matched = positions is None
    else:
        matched = positions == list(pos)
    print("%s %s %s" % (matched, positions, pos))

# (Test sequential access.)

rd.rewind()
for term, doc_positions in terms_with_positions:
    t, fr, df, dp = rd.read_term()
    dp = list(dp)
    for expected, actual in ((term, t), (doc_positions, dp)):
        print("%s %s %s" % (expected == actual, expected, actual))
rd.close()

# Test high-level index operations (including merging).

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected [(docnum, positions), ...])

doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# (term, docnum, expected positions or None)

position_tests = [
    ("Every", 14, [0]),
    ("sea", 36, [2, 6]),
    ("shells", 1, None),
    ("shells", 37, None)
]

# Index every document: each whitespace-separated word is added with its word
# number as the position, and the whole text is stored as field 123.

index = iixr.Index("test_index")
wi = index.get_writer(3, 2, 6)
for docnum, text in docs:
    doc = iixr.Document(docnum)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

# Query the index: term positions and frequencies, stored fields, and
# per-document position lookups.

rd = index.get_reader()
for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))
for docnum, text in docs:
    df = dict(rd.get_fields(docnum))
    print("%s %s %s" % (df[123] == text, text, df[123]))
for term, docnum, positions in position_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    if pos is None:
        matched = positions is None
    else:
        matched = positions == list(pos)
    print("%s %s %s" % (matched, positions, pos))
index.close()

# vim: tabstop=4 expandtab shiftwidth=4