1 #!/usr/bin/env python 2 3 from iixr.files import * 4 from iixr.fields import * 5 from iixr.terms import * 6 from iixr.positions import * 7 from iixr.index import * 8 import os, sys 9 10 # Remove old test files. 11 12 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"): 13 try: 14 os.remove(filename) 15 except OSError: 16 pass 17 18 try: 19 for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"): 20 for filename in os.listdir(dirname): 21 os.remove(os.path.join(dirname, filename)) 22 os.rmdir(dirname) 23 except OSError: 24 pass 25 26 if "clean" in sys.argv: 27 sys.exit(0) 28 29 print "- Test basic data types." 30 31 numbers = [12345678, 0, 1, 127, 128, 255, 256] 32 33 f = open("test", "wb") 34 w = FileWriter(f) 35 w.begin_record() 36 for number in numbers: 37 w.write_number(number) 38 w.end_record() 39 w.close() 40 41 f = open("test", "rb") 42 r = FileReader(f) 43 r.begin_record() 44 for number in numbers: 45 n = r.read_number() 46 print number == n, number, n 47 r.end_record() 48 r.close() 49 50 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] 51 52 f = open("testMS", "wb") 53 w = FileWriter(f) 54 w.begin_record() 55 w.write_monotonic_sequence(tuples, 2) 56 w.end_record() 57 w.close() 58 59 f = open("testMS", "rb") 60 r = FileReader(f) 61 r.begin_record() 62 for t, t2 in zip(r.read_monotonic_sequence(2), tuples): 63 print t == t2, t, t2 64 r.end_record() 65 r.close() 66 67 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] 68 69 f = open("testNMS", "wb") 70 w = FileWriter(f) 71 w.begin_record() 72 w.write_delta_sequence(tuples2, 2) 73 w.end_record() 74 w.close() 75 76 f = open("testNMS", "rb") 77 r = FileReader(f) 78 r.begin_record() 79 for t, t2 in zip(r.read_delta_sequence(2), tuples2): 80 print t == t2, t, t2 81 r.end_record() 82 r.close() 83 84 print "- Test positions." 85 86 all_doc_positions = [ 87 [ 88 (123, [1, 3, 5, 15, 25]), 89 (124, [0, 100]), 90 (125, [11, 99, 199]), 91 (130, [77, 78, 80, 82, 89]) 92 ], 93 [ 94 (78, [9]), 95 (196, [10, 11]), 96 (197, [17, 21, 30]) 97 ] 98 ] 99 100 f = open("testP", "wb") 101 w = PositionWriter(f) 102 w.begin(0, 0) 103 for doc_positions in all_doc_positions: 104 w.reset() 105 for docnum, positions in doc_positions: 106 w.write_positions(docnum, positions) 107 w.close() 108 109 f = open("testP", "rb") 110 r = PositionReader(f) 111 for doc_positions in all_doc_positions: 112 r.reset() 113 for docnum, positions in doc_positions: 114 d, p = r.read_positions() 115 print docnum == d, docnum, d 116 print positions == p, positions, p 117 r.close() 118 119 all_doc_positions_seq = [ 120 [ 121 ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), 122 ((124, 1), [(0, 0), (100, 350)]), 123 ((124, 2), [(11, 38), (99, 379), (199, 720)]), 124 ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) 125 ], 126 [ 127 ((78, 1), [(9, 19)]), 128 ((196, 0), [(10, 27), (11, 29)]), 129 ((196, 1), [(17, 46), (21, 52), (30, 60)]) 130 ] 131 ] 132 133 f = open("testP2", "wb") 134 w = PositionWriter(f) 135 w.begin(2, 2) 136 for doc_positions in all_doc_positions_seq: 137 w.reset() 138 for docnum, positions in doc_positions: 139 w.write_positions(docnum, positions) 140 w.close() 141 142 f = open("testP2", "rb") 143 r = PositionReader(f) 144 for doc_positions in all_doc_positions_seq: 145 r.reset() 146 for docnum, positions in doc_positions: 147 d, p = r.read_positions() 148 print docnum == d, docnum, d 149 print positions == p, positions, p 150 r.close() 151 152 print "- Test position index files." 153 154 indexed_positions = [ 155 [ 156 (1234, 0, 100), 157 (2345, 700, 100), 158 (3456, 1900, 50) 159 ], 160 [ 161 (4567, 2800, 20) 162 ] 163 ] 164 165 offsets = [] 166 f = open("testPI", "wb") 167 w = PositionIndexWriter(f) 168 w.begin(0) 169 for term_positions in indexed_positions: 170 offset = None 171 doc_frequency = 0 172 w.reset() 173 for docnum, pos_offset, count in term_positions: 174 if offset is None: 175 offset = w.tell() 176 w.write_positions(docnum, pos_offset, count) 177 doc_frequency += count 178 offsets.append((offset, doc_frequency)) 179 w.close() 180 181 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb"))) 182 offsets.reverse() 183 indexed_positions.reverse() 184 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): 185 r.seek(offset, doc_frequency) 186 for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r): 187 print docnum == dn, docnum, dn 188 print pos_offset == po, pos_offset, po 189 print count == c, count, c 190 r.reader.close() 191 192 print "- Test position dictionaries." 193 194 f = open("testP", "wb") 195 w = PositionWriter(f) 196 f2 = open("testPI", "wb") 197 w2 = PositionIndexWriter(f2) 198 wd = PositionDictionaryWriter(w, w2, 2) 199 offsets = [] 200 for doc_positions in all_doc_positions: 201 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions) 202 offsets.append((offset, doc_frequency)) 203 wd.close() 204 205 r = PositionReader(open("testP", "rb")) 206 r2 = PositionIndexReader(open("testPI", "rb")) 207 rd = PositionDictionaryReader(r, r2) 208 offsets.reverse() 209 all_doc_positions.reverse() 210 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): 211 it = rd.read_term_positions(offset, doc_frequency) 212 dp = list(it) 213 print doc_positions == dp, doc_positions, dp 214 rd.close() 215 216 print "- Test fields." 217 218 doc_fields = [ 219 (123, ["testing", "fields", "stored", "compressed"]), 220 (456, ["fields", "for a second", "document"]), 221 (789, ["field value"]), 222 (1234, []), 223 (2345, ["abc", "def"]), 224 (3456, ["apple", "banana", "cherry"]), 225 (4567, ["drue", "eple"]) 226 ] 227 228 f = open("testF", "wb") 229 w = FieldWriter(f) 230 w.begin(0) 231 w.reset() 232 for docnum, fields in doc_fields: 233 w.write_fields(docnum, list(enumerate(fields))) 234 w.close() 235 236 f = open("testF", "rb") 237 r = FieldReader(f) 238 r.reset() 239 for docnum, fields in doc_fields: 240 dn, df = r.read_fields() 241 print docnum == dn, docnum, dn 242 print list(enumerate(fields)) == df, list(enumerate(fields)), df 243 r.close() 244 245 print "- Test field index files." 246 247 indexed_docs = [ 248 (123, 100000987), 249 (456, 100004321), 250 (789, 100008765) 251 ] 252 253 f = open("testFI", "wb") 254 w = FieldIndexWriter(f) 255 w.begin(0) 256 w.reset() 257 for docnum, offset in indexed_docs: 258 w.write_document(docnum, offset) 259 w.close() 260 261 f = open("testFI", "rb") 262 r = FieldIndexReader(f) 263 r.reset() 264 for docnum, offset in indexed_docs: 265 dn, o = r.read_document() 266 print docnum == dn, docnum, dn 267 print offset == o, offset, o 268 r.close() 269 270 print "- Test field dictionaries." 271 272 f = open("testF", "wb") 273 w = FieldWriter(f) 274 f2 = open("testFI", "wb") 275 w2 = FieldIndexWriter(f2) 276 wd = FieldDictionaryWriter(w, w2, 3) 277 for docnum, fields in doc_fields: 278 wd.write_fields(docnum, list(enumerate(fields))) 279 wd.close() 280 281 f = open("testF", "rb") 282 r = FieldReader(f) 283 f2 = open("testFI", "rb") 284 r2 = FieldIndexReader(f2) 285 rd = FieldDictionaryReader(r, r2) 286 doc_fields_reversed = doc_fields[:] 287 doc_fields_reversed.reverse() 288 for docnum, fields in doc_fields_reversed: 289 df = dict(rd.get_fields(docnum)) 290 print dict(enumerate(fields)) == df, dict(enumerate(fields)), df 291 for docnum in (13579, 246810): 292 df = rd.get_fields(docnum) 293 print df is None, df 294 295 print "- (Test sequential access.)" 296 297 rd.rewind() 298 for docnum, fields in doc_fields: 299 dn, df = rd.read_fields() 300 print docnum == dn, docnum, dn 301 print list(enumerate(fields)) == df, list(enumerate(fields)), df 302 rd.close() 303 304 print "- Test terms." 305 306 terms = [ 307 # term offset frequency doc_frequency 308 ("aardvark", 100000123, 1, 1), 309 ("anteater", 100000456, 2, 1), 310 ("badger", 100000789, 13, 7), 311 ("bull", 1000001234, 59, 17), 312 ("bulldog", 1000002345, 99, 80), 313 ("cat", 1000003456, 89, 28) 314 ] 315 316 f = open("test", "wb") 317 w = TermWriter(f) 318 w.reset() 319 for term, offset, frequency, doc_frequency in terms: 320 w.write_term(term, offset, frequency, doc_frequency) 321 w.close() 322 323 f = open("test", "rb") 324 r = TermReader(f) 325 r.reset() 326 for term, offset, frequency, doc_frequency in terms: 327 t, o, fr, df = r.read_term() 328 print term == t, term, t 329 print offset == o, offset, o 330 print frequency == fr, frequency, fr 331 print doc_frequency == df, doc_frequency, df 332 r.close() 333 334 print "- Test terms in index files." 335 336 indexed_terms = [ 337 # term offset frequency doc_frequency info_offset 338 ("aardvark", 100000123, 1, 1, 200000321), 339 ("anteater", 100000456, 2, 1, 200000654), 340 ("badger", 100000789, 13, 7, 200000987), 341 ("bull", 1000001234, 59, 17, 200004321), 342 ("bulldog", 1000002345, 99, 80, 200005432), 343 ("cat", 1000003456, 89, 28, 200006543) 344 ] 345 346 f = open("test", "wb") 347 w = TermIndexWriter(f) 348 w.reset() 349 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 350 w.write_term(term, offset, frequency, doc_frequency, info_offset) 351 w.close() 352 353 f = open("test", "rb") 354 r = TermIndexReader(f) 355 r.reset() 356 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 357 t, o, fr, df, i = r.read_term() 358 print term == t, term, t 359 print offset == o, offset, o 360 print frequency == fr, frequency, fr 361 print doc_frequency == df, doc_frequency, df 362 print info_offset == i, info_offset, i 363 r.close() 364 365 print "- Test dictionaries with only term data." 366 367 f = open("test", "wb") 368 w = TermWriter(f) 369 f2 = open("testI", "wb") 370 w2 = TermIndexWriter(f2) 371 f3 = open("testP", "wb") 372 w3 = PositionWriter(f3) 373 f4 = open("testPI", "wb") 374 w4 = PositionIndexWriter(f4) 375 wp = PositionDictionaryWriter(w3, w4, 2) 376 wd = TermDictionaryWriter(w, w2, wp, 3) 377 for term, offset, frequency, doc_frequency in terms: 378 wd._write_term(term, offset, frequency, doc_frequency) 379 wd.close() 380 381 f = open("test", "rb") 382 r = TermReader(f) 383 f2 = open("testI", "rb") 384 r2 = TermIndexReader(f2) 385 r3 = PositionReader(open("testP", "rb")) 386 r4 = PositionIndexReader(open("testPI", "rb")) 387 rp = PositionDictionaryReader(r3, r4) 388 rd = TermDictionaryReader(r, r2, rp) 389 terms_reversed = terms[:] 390 terms_reversed.reverse() 391 for term, offset, frequency, doc_frequency in terms_reversed: 392 o, fr, df = rd._find_term(term) 393 print offset == o, offset, o 394 print frequency == fr, frequency, fr 395 print doc_frequency == df, doc_frequency, df 396 for term in ("dog", "dingo"): 397 t = rd._find_term(term) 398 print t is None, t 399 400 print "- (Test term prefix searching.)" 401 402 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] 403 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] 404 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] 405 print rd.find_terms("d") == [], rd.find_terms("d"), [] 406 rd.close() 407 408 print "- Test dictionaries with term and position data." 409 410 terms_with_positions = [ 411 ("aardvark", [(1, [2, 45, 96]), (20, [13])]), 412 ("anteater", [(1, [43, 44])]), 413 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), 414 ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), 415 ("bulldog", [(43, [17, 19, 256, 512])]), 416 ("cat", [(123, [12, 145, 196]), (1200, [113])]) 417 ] 418 419 position_dict_tests = [ 420 ("badger", 19, [55, 1333]), 421 ("badger", 20, None), 422 ("bull", 6, [128]), 423 ("bull", 26, [1, 3, 5, 7, 9]), 424 ("cat", 111, None), 425 ("cat", 123, [12, 145, 196]), 426 ("cat", 1234, None) 427 ] 428 429 f = open("test", "wb") 430 w = TermWriter(f) 431 f2 = open("testI", "wb") 432 w2 = TermIndexWriter(f2) 433 f3 = open("testP", "wb") 434 w3 = PositionWriter(f3) 435 f4 = open("testPI", "wb") 436 w4 = PositionIndexWriter(f4) 437 wp = PositionDictionaryWriter(w3, w4, 2) 438 wd = TermDictionaryWriter(w, w2, wp, 3) 439 for term, doc_positions in terms_with_positions: 440 wd.write_term_positions(term, doc_positions) 441 wd.close() 442 443 f = open("test", "rb") 444 r = TermReader(f) 445 f2 = open("testI", "rb") 446 r2 = TermIndexReader(f2) 447 r3 = PositionReader(open("testP", "rb")) 448 r4 = PositionIndexReader(open("testPI", "rb")) 449 rp = PositionDictionaryReader(r3, r4) 450 rd = TermDictionaryReader(r, r2, rp) 451 terms_reversed = terms_with_positions[:] 452 terms_reversed.reverse() 453 for term, doc_positions in terms_reversed: 454 dp = list(rd.find_positions(term)) 455 print doc_positions == dp, doc_positions, dp 456 for term in ("aaa", "dog", "dingo"): 457 dp = rd.find_positions(term) 458 print dp == [], dp 459 460 print "- (Test iterators.)" 461 462 for term, docnum, positions in position_dict_tests: 463 dp = rd.find_positions(term) 464 pos = dp.from_document(docnum) 465 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 466 467 print "- (Test sequential access.)" 468 469 rd.rewind() 470 for term, doc_positions in terms_with_positions: 471 t, fr, df, dp = rd.read_term() 472 dp = list(dp) 473 print term == t, term, t 474 print doc_positions == dp, doc_positions, dp 475 rd.close() 476 477 print "- Test high-level index operations (including merging)." 478 479 docs = [ 480 (1, "The cat sat on the mat"), 481 (2, "Every good boy deserves football"), 482 (13, "One good turn deserves another"), 483 (14, "Every man for himself"), 484 (25, "Red sky at night shepherd's delight"), 485 (36, "She sells sea shells on the sea shore") 486 ] 487 488 doc_tests = [ 489 ("Every", 2, [(2, [0]), (14, [0])]), 490 ("good", 2, [(2, [1]), (13, [1])]), 491 ("deserves", 2, [(2, [3]), (13, [3])]), 492 ("sea", 2, [(36, [2, 6])]) 493 ] 494 495 position_tests = [ 496 ("Every", 14, [0]), 497 ("sea", 36, [2, 6]), 498 ("shells", 1, None), 499 ("shells", 37, None) 500 ] 501 502 phrase_tests = [ 503 (["good", "boy"], [(2, [1, 2])]), 504 (["on", "the"], [(1, [3, 4]), (36, [4, 5])]), 505 (["sea", "shore"], [(36, [6, 7])]) 506 ] 507 508 index = Index("test_index", 3, 2, 3, 6) 509 wi = index.get_writer() 510 for docnum, text in docs: 511 doc = Document(docnum) 512 for position, term in enumerate(text.split()): 513 doc.add_position(term, position) 514 doc.add_field(123, text) 515 wi.add_document(doc) 516 wi.close() 517 518 rd = index.get_reader() 519 520 print "- (Test searching.)" 521 522 for term, frequency, doc_positions in doc_tests: 523 dp = list(rd.find_positions(term)) 524 print doc_positions == dp, doc_positions, dp 525 fr = rd.get_frequency(term) 526 print frequency == fr, frequency, fr 527 528 print "- (Test fields.)" 529 530 for docnum, text in docs: 531 df = dict(rd.get_fields(docnum)) 532 print df[123] == text, text, df[123] 533 534 print "- (Test navigation.)" 535 536 for term, docnum, positions in position_tests: 537 dp = rd.find_positions(term) 538 pos = dp.from_document(docnum) 539 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 540 541 print "- (Test phrases.)" 542 543 for terms, results in phrase_tests: 544 res = list(rd.find_common_positions(terms)) 545 print results == res, results, res 546 547 index.close() 548 549 docs2 = [ 550 ((1, 0), "The cat sat on the mat"), 551 ((1, 2), "Every good boy deserves football"), 552 ((13, 1), "One good turn deserves another"), 553 ((14, 0), "Every man for himself"), 554 ((14, 25), "Red sky at night shepherd's delight"), 555 ((36, 12), "She sells sea shells on the sea shore") 556 ] 557 558 doc_tests2 = [ 559 ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]), 560 ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]), 561 ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]), 562 ("sea", 2, [((36, 12), [(2, 10), (6, 28)])]) 563 ] 564 565 position_tests2 = [ 566 ("Every", (14, 0), [(0, 0)]), 567 ("sea", (36, 12), [(2, 10), (6, 28)]), 568 ("shells", (1, 0), None), 569 ("shells", (37, 0), None) 570 ] 571 572 phrase_tests2 = [ 573 (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]), 574 (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]), 575 (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])]) 576 ] 577 578 index = Index("test_indexT", 3, 2, 3, 6) 579 wi = index.get_writer() 580 for docnum, text in docs2: 581 doc = Document(docnum) 582 offset = 0 583 for position, term in enumerate(text.split()): 584 doc.add_position(term, (position, offset)) 585 offset += len(term) + 1 # assume one space after the term 586 doc.add_field(123, text) 587 wi.add_document(doc) 588 wi.close() 589 590 rd = index.get_reader() 591 592 print "- (Test searching.)" 593 594 for term, frequency, doc_positions in doc_tests2: 595 dp = list(rd.find_positions(term)) 596 print doc_positions == dp, doc_positions, dp 597 fr = rd.get_frequency(term) 598 print frequency == fr, frequency, fr 599 600 print "- (Test fields.)" 601 602 for docnum, text in docs2: 603 df = dict(rd.get_fields(docnum)) 604 print df[123] == text, text, df[123] 605 606 print "- (Test navigation.)" 607 608 for term, docnum, positions in position_tests2: 609 dp = rd.find_positions(term) 610 pos = dp.from_document(docnum) 611 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 612 613 print "- (Test phrases.)" 614 615 for terms, results in phrase_tests2: 616 res = list(rd.find_common_positions(terms)) 617 print results == res, results, res 618 619 index.close() 620 621 print "- Test index updates." 622 623 index = Index("test_index") 624 index2 = Index("test_index2", 3, 2, 3, 6) 625 wi = index2.get_writer() 626 for docnum, text in docs: 627 628 # Add the same documents but with different numbers. 629 630 doc = Document(docnum + 100) 631 for position, term in enumerate(text.split()): 632 doc.add_position(term, position) 633 doc.add_field(123, text) 634 wi.add_document(doc) 635 wi.close() 636 637 index2.update([index]) 638 index.close() 639 640 rd = index2.get_reader() 641 for term, frequency, doc_positions in doc_tests: 642 643 # Add the extra documents to the expected result. 644 645 orig_doc_positions = doc_positions 646 doc_positions = doc_positions[:] 647 648 for docnum, positions in orig_doc_positions: 649 doc_positions.append((docnum + 100, positions)) 650 frequency *= 2 651 652 dp = list(rd.find_positions(term)) 653 print doc_positions == dp, doc_positions, dp 654 fr = rd.get_frequency(term) 655 print frequency == fr, frequency, fr 656 index2.close() 657 658 print "- (Test update of an empty index.)" 659 660 index = Index("test_index") 661 index3 = Index("test_index3") 662 index3.update([index]) 663 index.close() 664 665 rd = index3.get_reader() 666 for term, frequency, doc_positions in doc_tests: 667 dp = list(rd.find_positions(term)) 668 print doc_positions == dp, doc_positions, dp 669 fr = rd.get_frequency(term) 670 print frequency == fr, frequency, fr 671 index3.close() 672 673 # vim: tabstop=4 expandtab shiftwidth=4