1 #!/usr/bin/env python 2 3 from iixr.files import * 4 from iixr.fields import * 5 from iixr.terms import * 6 from iixr.positions import * 7 from iixr.index import * 8 import os, sys 9 10 # Remove old test files. 11 12 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"): 13 try: 14 os.remove(filename) 15 except OSError: 16 pass 17 18 try: 19 for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"): 20 for filename in os.listdir(dirname): 21 os.remove(os.path.join(dirname, filename)) 22 os.rmdir(dirname) 23 except OSError: 24 pass 25 26 if "clean" in sys.argv: 27 sys.exit(0) 28 29 print "- Test basic data types." 30 31 numbers = [12345678, 0, 1, 127, 128, 255, 256] 32 33 f = open("test", "wb") 34 w = FileWriter(f) 35 w.begin_record() 36 for number in numbers: 37 w.write_number(number) 38 w.end_record() 39 w.close() 40 41 f = open("test", "rb") 42 r = FileReader(f) 43 r.begin_record() 44 for number in numbers: 45 n = r.read_number() 46 print number == n, number, n 47 r.end_record() 48 r.close() 49 50 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] 51 52 f = open("testMS", "wb") 53 w = FileWriter(f) 54 w.begin_record() 55 w.write_monotonic_sequence(tuples) 56 w.end_record() 57 w.close() 58 59 f = open("testMS", "rb") 60 r = FileReader(f) 61 r.begin_record() 62 for t, t2 in zip(r.read_monotonic_sequence(), tuples): 63 print t == t2, t, t2 64 r.end_record() 65 r.close() 66 67 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] 68 69 f = open("testNMS", "wb") 70 w = FileWriter(f) 71 w.begin_record() 72 w.write_delta_sequence(tuples2) 73 w.end_record() 74 w.close() 75 76 f = open("testNMS", "rb") 77 r = FileReader(f) 78 r.begin_record() 79 for t, t2 in zip(r.read_delta_sequence(), tuples2): 80 print t == t2, t, t2 81 r.end_record() 82 r.close() 83 84 print "- Test positions." 85 86 all_doc_positions = [ 87 [ 88 (123, [1, 3, 5, 15, 25]), 89 (124, [0, 100]), 90 (125, [11, 99, 199]), 91 (130, [77, 78, 80, 82, 89]) 92 ], 93 [ 94 (78, [9]), 95 (196, [10, 11]), 96 (197, [17, 21, 30]) 97 ] 98 ] 99 100 f = open("testP", "wb") 101 w = PositionWriter(f) 102 for doc_positions in all_doc_positions: 103 for docnum, positions in doc_positions: 104 w.write_positions(docnum, positions) 105 w.reset() 106 w.close() 107 108 f = open("testP", "rb") 109 r = PositionReader(f) 110 for doc_positions in all_doc_positions: 111 for docnum, positions in doc_positions: 112 d, p = r.read_positions() 113 print docnum == d, docnum, d 114 print positions == p, positions, p 115 r.reset() 116 r.close() 117 118 all_doc_positions_seq = [ 119 [ 120 ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), 121 ((124, 1), [(0, 0), (100, 350)]), 122 ((124, 2), [(11, 38), (99, 379), (199, 720)]), 123 ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) 124 ], 125 [ 126 ((78, 1), [(9, 19)]), 127 ((196, 0), [(10, 27), (11, 29)]), 128 ((196, 1), [(17, 46), (21, 52), (30, 60)]) 129 ] 130 ] 131 132 f = open("testP2", "wb") 133 w = PositionWriter(f) 134 for doc_positions in all_doc_positions_seq: 135 for docnum, positions in doc_positions: 136 w.write_positions(docnum, positions) 137 w.reset() 138 w.close() 139 140 f = open("testP2", "rb") 141 r = PositionReader(f) 142 for doc_positions in all_doc_positions_seq: 143 for docnum, positions in doc_positions: 144 d, p = r.read_positions() 145 print docnum == d, docnum, d 146 print positions == p, positions, p 147 r.reset() 148 r.close() 149 150 print "- Test position index files." 151 152 indexed_positions = [ 153 [ 154 (1234, 0, 100), 155 (2345, 700, 100), 156 (3456, 1900, 50) 157 ], 158 [ 159 (4567, 2800, 20) 160 ] 161 ] 162 163 offsets = [] 164 f = open("testPI", "wb") 165 w = PositionIndexWriter(f) 166 for term_positions in indexed_positions: 167 offset = None 168 doc_frequency = 0 169 w.reset() 170 for docnum, pos_offset, count in term_positions: 171 if offset is None: 172 offset = w.tell() 173 w.write_positions(docnum, pos_offset, count) 174 doc_frequency += count 175 offsets.append((offset, doc_frequency)) 176 w.close() 177 178 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb"))) 179 offsets.reverse() 180 indexed_positions.reverse() 181 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): 182 r.seek(offset, doc_frequency) 183 for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r): 184 print docnum == dn, docnum, dn 185 print pos_offset == po, pos_offset, po 186 print count == c, count, c 187 r.reader.close() 188 189 print "- Test position dictionaries." 190 191 f = open("testP", "wb") 192 w = PositionWriter(f) 193 f2 = open("testPI", "wb") 194 w2 = PositionIndexWriter(f2) 195 wd = PositionDictionaryWriter(w, w2, 2) 196 offsets = [] 197 for doc_positions in all_doc_positions: 198 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions) 199 offsets.append((offset, doc_frequency)) 200 wd.close() 201 202 r = PositionReader(open("testP", "rb")) 203 r2 = PositionIndexReader(open("testPI", "rb")) 204 rd = PositionDictionaryReader(r, r2) 205 offsets.reverse() 206 all_doc_positions.reverse() 207 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): 208 it = rd.read_term_positions(offset, doc_frequency) 209 dp = list(it) 210 print doc_positions == dp, doc_positions, dp 211 rd.close() 212 213 print "- Test fields." 214 215 doc_fields = [ 216 (123, ["testing", "fields", "stored", "compressed"]), 217 (456, ["fields", "for a second", "document"]), 218 (789, ["field value"]), 219 (1234, []), 220 (2345, ["abc", "def"]), 221 (3456, ["apple", "banana", "cherry"]), 222 (4567, ["drue", "eple"]) 223 ] 224 225 f = open("testF", "wb") 226 w = FieldWriter(f) 227 for docnum, fields in doc_fields: 228 w.write_fields(docnum, list(enumerate(fields))) 229 w.close() 230 231 f = open("testF", "rb") 232 r = FieldReader(f) 233 for docnum, fields in doc_fields: 234 dn, df = r.read_fields() 235 print docnum == dn, docnum, dn 236 print list(enumerate(fields)) == df, list(enumerate(fields)), df 237 r.close() 238 239 print "- Test field index files." 240 241 indexed_docs = [ 242 (123, 100000987), 243 (456, 100004321), 244 (789, 100008765) 245 ] 246 247 f = open("testFI", "wb") 248 w = FieldIndexWriter(f) 249 for docnum, offset in indexed_docs: 250 w.write_document(docnum, offset) 251 w.close() 252 253 f = open("testFI", "rb") 254 r = FieldIndexReader(f) 255 for docnum, offset in indexed_docs: 256 dn, o = r.read_document() 257 print docnum == dn, docnum, dn 258 print offset == o, offset, o 259 r.close() 260 261 print "- Test field dictionaries." 262 263 f = open("testF", "wb") 264 w = FieldWriter(f) 265 f2 = open("testFI", "wb") 266 w2 = FieldIndexWriter(f2) 267 wd = FieldDictionaryWriter(w, w2, 3) 268 for docnum, fields in doc_fields: 269 wd.write_fields(docnum, list(enumerate(fields))) 270 wd.close() 271 272 f = open("testF", "rb") 273 r = FieldReader(f) 274 f2 = open("testFI", "rb") 275 r2 = FieldIndexReader(f2) 276 rd = FieldDictionaryReader(r, r2) 277 doc_fields_reversed = doc_fields[:] 278 doc_fields_reversed.reverse() 279 for docnum, fields in doc_fields_reversed: 280 df = dict(rd.get_fields(docnum)) 281 print dict(enumerate(fields)) == df, dict(enumerate(fields)), df 282 for docnum in (13579, 246810): 283 df = rd.get_fields(docnum) 284 print df is None, df 285 286 print "- (Test sequential access.)" 287 288 rd.rewind() 289 for docnum, fields in doc_fields: 290 dn, df = rd.read_fields() 291 print docnum == dn, docnum, dn 292 print list(enumerate(fields)) == df, list(enumerate(fields)), df 293 rd.close() 294 295 print "- Test terms." 296 297 terms = [ 298 # term offset frequency doc_frequency 299 ("aardvark", 100000123, 1, 1), 300 ("anteater", 100000456, 2, 1), 301 ("badger", 100000789, 13, 7), 302 ("bull", 1000001234, 59, 17), 303 ("bulldog", 1000002345, 99, 80), 304 ("cat", 1000003456, 89, 28) 305 ] 306 307 f = open("test", "wb") 308 w = TermWriter(f) 309 for term, offset, frequency, doc_frequency in terms: 310 w.write_term(term, offset, frequency, doc_frequency) 311 w.close() 312 313 f = open("test", "rb") 314 r = TermReader(f) 315 for term, offset, frequency, doc_frequency in terms: 316 t, o, fr, df = r.read_term() 317 print term == t, term, t 318 print offset == o, offset, o 319 print frequency == fr, frequency, fr 320 print doc_frequency == df, doc_frequency, df 321 r.close() 322 323 print "- Test terms in index files." 324 325 indexed_terms = [ 326 # term offset frequency doc_frequency info_offset 327 ("aardvark", 100000123, 1, 1, 200000321), 328 ("anteater", 100000456, 2, 1, 200000654), 329 ("badger", 100000789, 13, 7, 200000987), 330 ("bull", 1000001234, 59, 17, 200004321), 331 ("bulldog", 1000002345, 99, 80, 200005432), 332 ("cat", 1000003456, 89, 28, 200006543) 333 ] 334 335 f = open("test", "wb") 336 w = TermIndexWriter(f) 337 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 338 w.write_term(term, offset, frequency, doc_frequency, info_offset) 339 w.close() 340 341 f = open("test", "rb") 342 r = TermIndexReader(f) 343 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 344 t, o, fr, df, i = r.read_term() 345 print term == t, term, t 346 print offset == o, offset, o 347 print frequency == fr, frequency, fr 348 print doc_frequency == df, doc_frequency, df 349 print info_offset == i, info_offset, i 350 r.close() 351 352 print "- Test dictionaries with only term data." 353 354 f = open("test", "wb") 355 w = TermWriter(f) 356 f2 = open("testI", "wb") 357 w2 = TermIndexWriter(f2) 358 f3 = open("testP", "wb") 359 w3 = PositionWriter(f3) 360 f4 = open("testPI", "wb") 361 w4 = PositionIndexWriter(f4) 362 wp = PositionDictionaryWriter(w3, w4, 2) 363 wd = TermDictionaryWriter(w, w2, wp, 3) 364 for term, offset, frequency, doc_frequency in terms: 365 wd._write_term(term, offset, frequency, doc_frequency) 366 wd.close() 367 368 f = open("test", "rb") 369 r = TermReader(f) 370 f2 = open("testI", "rb") 371 r2 = TermIndexReader(f2) 372 r3 = PositionReader(open("testP", "rb")) 373 r4 = PositionIndexReader(open("testPI", "rb")) 374 rp = PositionDictionaryReader(r3, r4) 375 rd = TermDictionaryReader(r, r2, rp) 376 terms_reversed = terms[:] 377 terms_reversed.reverse() 378 for term, offset, frequency, doc_frequency in terms_reversed: 379 o, fr, df = rd._find_term(term) 380 print offset == o, offset, o 381 print frequency == fr, frequency, fr 382 print doc_frequency == df, doc_frequency, df 383 for term in ("dog", "dingo"): 384 t = rd._find_term(term) 385 print t is None, t 386 387 print "- (Test term prefix searching.)" 388 389 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] 390 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] 391 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] 392 print rd.find_terms("d") == [], rd.find_terms("d"), [] 393 rd.close() 394 395 print "- Test dictionaries with term and position data." 396 397 terms_with_positions = [ 398 ("aardvark", [(1, [2, 45, 96]), (20, [13])]), 399 ("anteater", [(1, [43, 44])]), 400 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), 401 ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), 402 ("bulldog", [(43, [17, 19, 256, 512])]), 403 ("cat", [(123, [12, 145, 196]), (1200, [113])]) 404 ] 405 406 position_dict_tests = [ 407 ("badger", 19, [55, 1333]), 408 ("badger", 20, None), 409 ("bull", 6, [128]), 410 ("bull", 26, [1, 3, 5, 7, 9]), 411 ("cat", 111, None), 412 ("cat", 123, [12, 145, 196]), 413 ("cat", 1234, None) 414 ] 415 416 f = open("test", "wb") 417 w = TermWriter(f) 418 f2 = open("testI", "wb") 419 w2 = TermIndexWriter(f2) 420 f3 = open("testP", "wb") 421 w3 = PositionWriter(f3) 422 f4 = open("testPI", "wb") 423 w4 = PositionIndexWriter(f4) 424 wp = PositionDictionaryWriter(w3, w4, 2) 425 wd = TermDictionaryWriter(w, w2, wp, 3) 426 for term, doc_positions in terms_with_positions: 427 wd.write_term_positions(term, doc_positions) 428 wd.close() 429 430 f = open("test", "rb") 431 r = TermReader(f) 432 f2 = open("testI", "rb") 433 r2 = TermIndexReader(f2) 434 r3 = PositionReader(open("testP", "rb")) 435 r4 = PositionIndexReader(open("testPI", "rb")) 436 rp = PositionDictionaryReader(r3, r4) 437 rd = TermDictionaryReader(r, r2, rp) 438 terms_reversed = terms_with_positions[:] 439 terms_reversed.reverse() 440 for term, doc_positions in terms_reversed: 441 dp = list(rd.find_positions(term)) 442 print doc_positions == dp, doc_positions, dp 443 for term in ("aaa", "dog", "dingo"): 444 dp = rd.find_positions(term) 445 print dp == [], dp 446 447 print "- (Test iterators.)" 448 449 for term, docnum, positions in position_dict_tests: 450 dp = rd.find_positions(term) 451 pos = dp.from_document(docnum) 452 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 453 454 print "- (Test sequential access.)" 455 456 rd.rewind() 457 for term, doc_positions in terms_with_positions: 458 t, fr, df, dp = rd.read_term() 459 dp = list(dp) 460 print term == t, term, t 461 print doc_positions == dp, doc_positions, dp 462 rd.close() 463 464 print "- Test high-level index operations (including merging)." 465 466 docs = [ 467 (1, "The cat sat on the mat"), 468 (2, "Every good boy deserves football"), 469 (13, "One good turn deserves another"), 470 (14, "Every man for himself"), 471 (25, "Red sky at night shepherd's delight"), 472 (36, "She sells sea shells on the sea shore") 473 ] 474 475 doc_tests = [ 476 ("Every", 2, [(2, [0]), (14, [0])]), 477 ("good", 2, [(2, [1]), (13, [1])]), 478 ("deserves", 2, [(2, [3]), (13, [3])]), 479 ("sea", 2, [(36, [2, 6])]) 480 ] 481 482 position_tests = [ 483 ("Every", 14, [0]), 484 ("sea", 36, [2, 6]), 485 ("shells", 1, None), 486 ("shells", 37, None) 487 ] 488 489 phrase_tests = [ 490 (["good", "boy"], [(2, [1, 2])]), 491 (["on", "the"], [(1, [3, 4]), (36, [4, 5])]), 492 (["sea", "shore"], [(36, [6, 7])]) 493 ] 494 495 index = Index("test_index", 3, 2, 3, 6) 496 wi = index.get_writer() 497 for docnum, text in docs: 498 doc = Document(docnum) 499 for position, term in enumerate(text.split()): 500 doc.add_position(term, position) 501 doc.add_field(123, text) 502 wi.add_document(doc) 503 wi.close() 504 505 rd = index.get_reader() 506 507 print "- (Test searching.)" 508 509 for term, frequency, doc_positions in doc_tests: 510 dp = list(rd.find_positions(term)) 511 print doc_positions == dp, doc_positions, dp 512 fr = rd.get_frequency(term) 513 print frequency == fr, frequency, fr 514 515 print "- (Test fields.)" 516 517 for docnum, text in docs: 518 df = dict(rd.get_fields(docnum)) 519 print df[123] == text, text, df[123] 520 521 print "- (Test navigation.)" 522 523 for term, docnum, positions in position_tests: 524 dp = rd.find_positions(term) 525 pos = dp.from_document(docnum) 526 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 527 528 print "- (Test phrases.)" 529 530 for terms, results in phrase_tests: 531 res = list(rd.find_common_positions(terms)) 532 print results == res, results, res 533 534 index.close() 535 536 docs2 = [ 537 ((1, 0), "The cat sat on the mat"), 538 ((1, 2), "Every good boy deserves football"), 539 ((13, 1), "One good turn deserves another"), 540 ((14, 0), "Every man for himself"), 541 ((14, 25), "Red sky at night shepherd's delight"), 542 ((36, 12), "She sells sea shells on the sea shore") 543 ] 544 545 doc_tests2 = [ 546 ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]), 547 ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]), 548 ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]), 549 ("sea", 2, [((36, 12), [(2, 10), (6, 28)])]) 550 ] 551 552 position_tests2 = [ 553 ("Every", (14, 0), [(0, 0)]), 554 ("sea", (36, 12), [(2, 10), (6, 28)]), 555 ("shells", (1, 0), None), 556 ("shells", (37, 0), None) 557 ] 558 559 phrase_tests2 = [ 560 (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]), 561 (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]), 562 (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])]) 563 ] 564 565 index = Index("test_indexT", 3, 2, 3, 6) 566 wi = index.get_writer() 567 for docnum, text in docs2: 568 doc = Document(docnum) 569 offset = 0 570 for position, term in enumerate(text.split()): 571 doc.add_position(term, (position, offset)) 572 offset += len(term) + 1 # assume one space after the term 573 doc.add_field(123, text) 574 wi.add_document(doc) 575 wi.close() 576 577 rd = index.get_reader() 578 579 print "- (Test searching.)" 580 581 for term, frequency, doc_positions in doc_tests2: 582 dp = list(rd.find_positions(term)) 583 print doc_positions == dp, doc_positions, dp 584 fr = rd.get_frequency(term) 585 print frequency == fr, frequency, fr 586 587 print "- (Test fields.)" 588 589 for docnum, text in docs2: 590 df = dict(rd.get_fields(docnum)) 591 print df[123] == text, text, df[123] 592 593 print "- (Test navigation.)" 594 595 for term, docnum, positions in position_tests2: 596 dp = rd.find_positions(term) 597 pos = dp.from_document(docnum) 598 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 599 600 print "- (Test phrases.)" 601 602 for terms, results in phrase_tests2: 603 res = list(rd.find_common_positions(terms)) 604 print results == res, results, res 605 606 index.close() 607 608 print "- Test index updates." 609 610 index = Index("test_index") 611 index2 = Index("test_index2", 3, 2, 3, 6) 612 wi = index2.get_writer() 613 for docnum, text in docs: 614 615 # Add the same documents but with different numbers. 616 617 doc = Document(docnum + 100) 618 for position, term in enumerate(text.split()): 619 doc.add_position(term, position) 620 doc.add_field(123, text) 621 wi.add_document(doc) 622 wi.close() 623 624 index2.update([index]) 625 index.close() 626 627 rd = index2.get_reader() 628 for term, frequency, doc_positions in doc_tests: 629 630 # Add the extra documents to the expected result. 631 632 orig_doc_positions = doc_positions 633 doc_positions = doc_positions[:] 634 635 for docnum, positions in orig_doc_positions: 636 doc_positions.append((docnum + 100, positions)) 637 frequency *= 2 638 639 dp = list(rd.find_positions(term)) 640 print doc_positions == dp, doc_positions, dp 641 fr = rd.get_frequency(term) 642 print frequency == fr, frequency, fr 643 index2.close() 644 645 print "- (Test update of an empty index.)" 646 647 index = Index("test_index") 648 index3 = Index("test_index3") 649 index3.update([index]) 650 index.close() 651 652 rd = index3.get_reader() 653 for term, frequency, doc_positions in doc_tests: 654 dp = list(rd.find_positions(term)) 655 print doc_positions == dp, doc_positions, dp 656 fr = rd.get_frequency(term) 657 print frequency == fr, frequency, fr 658 index3.close() 659 660 # vim: tabstop=4 expandtab shiftwidth=4