1 #!/usr/bin/env python 2 3 from iixr.files import * 4 from iixr.fields import * 5 from iixr.terms import * 6 from iixr.positions import * 7 from iixr.index import * 8 from array import array 9 import os, sys 10 11 # Remove old test files. 12 13 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"): 14 try: 15 os.remove(filename) 16 except OSError: 17 pass 18 19 try: 20 for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"): 21 for filename in os.listdir(dirname): 22 os.remove(os.path.join(dirname, filename)) 23 os.rmdir(dirname) 24 except OSError: 25 pass 26 27 if "clean" in sys.argv: 28 sys.exit(0) 29 30 print "- Test basic data types." 31 32 numbers = [12345678, 0, 1, 127, 128, 255, 256] 33 34 f = open("test", "wb") 35 w = FileWriter(f) 36 for number in numbers: 37 w.write_number(number) 38 w.close() 39 40 f = open("test", "rb") 41 r = FileReader(f) 42 for number in numbers: 43 n = r.read_number() 44 print number == n, number, n 45 r.close() 46 47 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] 48 49 f = open("testMS", "wb") 50 w = FileWriter(f) 51 b = array("B") 52 last = w.get_initial_value(2) 53 for t in tuples: 54 last = w.write_sequence(b, t, last, 2) 55 b.tofile(w.f) 56 w.close() 57 58 f = open("testMS", "rb") 59 r = FileReader(f) 60 last = r.get_initial_value(2) 61 for t in tuples: 62 last = t2 = r.read_sequence(last, 2) 63 print t == t2, t, t2 64 r.close() 65 66 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] 67 68 f = open("testNMS", "wb") 69 w = FileWriter(f) 70 b = array("B") 71 last = w.get_initial_value(2) 72 for t in tuples2: 73 last = w.write_sequence(b, t, last, 2, monotonic=0) 74 b.tofile(w.f) 75 w.close() 76 77 f = open("testNMS", "rb") 78 r = FileReader(f) 79 last = r.get_initial_value(2) 80 for t in tuples2: 81 last = t2 = r.read_sequence(last, 2, monotonic=0) 82 print t == t2, t, t2 83 r.close() 84 85 print "- Test positions." 86 87 all_doc_positions = [ 88 [ 89 (123, [1, 3, 5, 15, 25]), 90 (124, [0, 100]), 91 (125, [11, 99, 199]), 92 (130, [77, 78, 80, 82, 89]) 93 ], 94 [ 95 (78, [9]), 96 (196, [10, 11]), 97 (197, [17, 21, 30]) 98 ] 99 ] 100 101 f = open("testP", "wb") 102 w = PositionWriter(f) 103 for doc_positions in all_doc_positions: 104 for docnum, positions in doc_positions: 105 w.write_positions(docnum, positions) 106 w.reset() 107 w.close() 108 109 f = open("testP", "rb") 110 r = PositionReader(f) 111 for doc_positions in all_doc_positions: 112 for docnum, positions in doc_positions: 113 d, p = r.read_positions() 114 print docnum == d, docnum, d 115 print positions == p, positions, p 116 r.reset() 117 r.close() 118 119 all_doc_positions_seq = [ 120 [ 121 ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), 122 ((124, 1), [(0, 0), (100, 350)]), 123 ((124, 2), [(11, 38), (99, 379), (199, 720)]), 124 ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) 125 ], 126 [ 127 ((78, 1), [(9, 19)]), 128 ((196, 0), [(10, 27), (11, 29)]), 129 ((196, 1), [(17, 46), (21, 52), (30, 60)]) 130 ] 131 ] 132 133 f = open("testP2", "wb") 134 w = PositionWriter(f) 135 for doc_positions in all_doc_positions_seq: 136 for docnum, positions in doc_positions: 137 w.write_positions(docnum, positions) 138 w.reset() 139 w.close() 140 141 f = open("testP2", "rb") 142 r = PositionReader(f) 143 for doc_positions in all_doc_positions_seq: 144 for docnum, positions in doc_positions: 145 d, p = r.read_positions() 146 print tuple(docnum) == tuple(d), docnum, d 147 print tuple(positions) == tuple(p), positions, p 148 r.reset() 149 r.close() 150 151 print "- Test position index files." 152 153 indexed_positions = [ 154 [ 155 (1234, 0, 100), 156 (2345, 700, 100), 157 (3456, 1900, 50) 158 ], 159 [ 160 (4567, 2800, 20) 161 ] 162 ] 163 164 offsets = [] 165 f = open("testPI", "wb") 166 w = PositionIndexWriter(f) 167 for term_positions in indexed_positions: 168 offset = None 169 doc_frequency = 0 170 w.reset() 171 for docnum, pos_offset, count in term_positions: 172 if offset is None: 173 offset = w.f.tell() 174 w.write_positions(docnum, pos_offset, count) 175 doc_frequency += count 176 offsets.append((offset, doc_frequency)) 177 w.close() 178 179 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb"))) 180 offsets.reverse() 181 indexed_positions.reverse() 182 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): 183 r.seek(offset, doc_frequency) 184 for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r): 185 print docnum == dn, docnum, dn 186 print pos_offset == po, pos_offset, po 187 print count == c, count, c 188 r.reader.close() 189 190 print "- Test position dictionaries." 191 192 f = open("testP", "wb") 193 w = PositionWriter(f) 194 f2 = open("testPI", "wb") 195 w2 = PositionIndexWriter(f2) 196 wd = PositionDictionaryWriter(w, w2, 2) 197 offsets = [] 198 for doc_positions in all_doc_positions: 199 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions) 200 offsets.append((offset, doc_frequency)) 201 wd.close() 202 203 r = PositionReader(open("testP", "rb")) 204 r2 = PositionIndexReader(open("testPI", "rb")) 205 rd = PositionDictionaryReader(r, r2) 206 offsets.reverse() 207 all_doc_positions.reverse() 208 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): 209 it = rd.read_term_positions(offset, doc_frequency) 210 dp = list(it) 211 print doc_positions == dp, doc_positions, dp 212 rd.close() 213 214 print "- Test fields." 215 216 doc_fields = [ 217 (123, ["testing", "fields", "stored", "compressed"]), 218 (456, ["fields", "for a second", "document"]), 219 (789, ["field value"]), 220 (1234, []), 221 (2345, ["abc", "def"]), 222 (3456, ["apple", "banana", "cherry"]), 223 (4567, ["drue", "eple"]) 224 ] 225 226 f = open("testF", "wb") 227 w = FieldWriter(f) 228 for docnum, fields in doc_fields: 229 w.write_fields(docnum, list(enumerate(fields))) 230 w.close() 231 232 f = open("testF", "rb") 233 r = FieldReader(f) 234 for docnum, fields in doc_fields: 235 dn, df = r.read_fields() 236 print docnum == dn, docnum, dn 237 print list(enumerate(fields)) == df, list(enumerate(fields)), df 238 r.close() 239 240 print "- Test field index files." 241 242 indexed_docs = [ 243 (123, 100000987), 244 (456, 100004321), 245 (789, 100008765) 246 ] 247 248 f = open("testFI", "wb") 249 w = FieldIndexWriter(f) 250 for docnum, offset in indexed_docs: 251 w.write_document(docnum, offset) 252 w.close() 253 254 f = open("testFI", "rb") 255 r = FieldIndexReader(f) 256 for docnum, offset in indexed_docs: 257 dn, o = r.read_document() 258 print docnum == dn, docnum, dn 259 print offset == o, offset, o 260 r.close() 261 262 print "- Test field dictionaries." 263 264 f = open("testF", "wb") 265 w = FieldWriter(f) 266 f2 = open("testFI", "wb") 267 w2 = FieldIndexWriter(f2) 268 wd = FieldDictionaryWriter(w, w2, 3) 269 for docnum, fields in doc_fields: 270 wd.write_fields(docnum, list(enumerate(fields))) 271 wd.close() 272 273 f = open("testF", "rb") 274 r = FieldReader(f) 275 f2 = open("testFI", "rb") 276 r2 = FieldIndexReader(f2) 277 rd = FieldDictionaryReader(r, r2) 278 doc_fields_reversed = doc_fields[:] 279 doc_fields_reversed.reverse() 280 for docnum, fields in doc_fields_reversed: 281 df = dict(rd.get_fields(docnum)) 282 print dict(enumerate(fields)) == df, dict(enumerate(fields)), df 283 for docnum in (13579, 246810): 284 df = rd.get_fields(docnum) 285 print df is None, df 286 287 print "- (Test sequential access.)" 288 289 rd.rewind() 290 for docnum, fields in doc_fields: 291 dn, df = rd.read_fields() 292 print docnum == dn, docnum, dn 293 print list(enumerate(fields)) == df, list(enumerate(fields)), df 294 rd.close() 295 296 print "- Test terms." 297 298 terms = [ 299 # term offset frequency doc_frequency 300 ("aardvark", 100000123, 1, 1), 301 ("anteater", 100000456, 2, 1), 302 ("badger", 100000789, 13, 7), 303 ("bull", 1000001234, 59, 17), 304 ("bulldog", 1000002345, 99, 80), 305 ("cat", 1000003456, 89, 28) 306 ] 307 308 f = open("test", "wb") 309 w = TermWriter(f) 310 for term, offset, frequency, doc_frequency in terms: 311 w.write_term(term, offset, frequency, doc_frequency) 312 w.close() 313 314 f = open("test", "rb") 315 r = TermReader(f) 316 for term, offset, frequency, doc_frequency in terms: 317 t, o, fr, df = r.read_term() 318 print term == t, term, t 319 print offset == o, offset, o 320 print frequency == fr, frequency, fr 321 print doc_frequency == df, doc_frequency, df 322 r.close() 323 324 print "- Test terms in index files." 325 326 indexed_terms = [ 327 # term offset frequency doc_frequency info_offset 328 ("aardvark", 100000123, 1, 1, 200000321), 329 ("anteater", 100000456, 2, 1, 200000654), 330 ("badger", 100000789, 13, 7, 200000987), 331 ("bull", 1000001234, 59, 17, 200004321), 332 ("bulldog", 1000002345, 99, 80, 200005432), 333 ("cat", 1000003456, 89, 28, 200006543) 334 ] 335 336 f = open("test", "wb") 337 w = TermIndexWriter(f) 338 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 339 w.write_term(term, offset, frequency, doc_frequency, info_offset) 340 w.close() 341 342 f = open("test", "rb") 343 r = TermIndexReader(f) 344 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 345 t, o, fr, df, i = r.read_term() 346 print term == t, term, t 347 print offset == o, offset, o 348 print frequency == fr, frequency, fr 349 print doc_frequency == df, doc_frequency, df 350 print info_offset == i, info_offset, i 351 r.close() 352 353 print "- Test dictionaries with only term data." 354 355 f = open("test", "wb") 356 w = TermWriter(f) 357 f2 = open("testI", "wb") 358 w2 = TermIndexWriter(f2) 359 f3 = open("testP", "wb") 360 w3 = PositionWriter(f3) 361 f4 = open("testPI", "wb") 362 w4 = PositionIndexWriter(f4) 363 wp = PositionDictionaryWriter(w3, w4, 2) 364 wd = TermDictionaryWriter(w, w2, wp, 3) 365 for term, offset, frequency, doc_frequency in terms: 366 wd._write_term(term, offset, frequency, doc_frequency) 367 wd.close() 368 369 f = open("test", "rb") 370 r = TermReader(f) 371 f2 = open("testI", "rb") 372 r2 = TermIndexReader(f2) 373 r3 = PositionReader(open("testP", "rb")) 374 r4 = PositionIndexReader(open("testPI", "rb")) 375 rp = PositionDictionaryReader(r3, r4) 376 rd = TermDictionaryReader(r, r2, rp) 377 terms_reversed = terms[:] 378 terms_reversed.reverse() 379 for term, offset, frequency, doc_frequency in terms_reversed: 380 o, fr, df = rd._find_term(term) 381 print offset == o, offset, o 382 print frequency == fr, frequency, fr 383 print doc_frequency == df, doc_frequency, df 384 for term in ("dog", "dingo"): 385 t = rd._find_term(term) 386 print t is None, t 387 388 print "- (Test term prefix searching.)" 389 390 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] 391 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] 392 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] 393 print rd.find_terms("d") == [], rd.find_terms("d"), [] 394 rd.close() 395 396 print "- Test dictionaries with term and position data." 397 398 terms_with_positions = [ 399 ("aardvark", [(1, [2, 45, 96]), (20, [13])]), 400 ("anteater", [(1, [43, 44])]), 401 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), 402 ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), 403 ("bulldog", [(43, [17, 19, 256, 512])]), 404 ("cat", [(123, [12, 145, 196]), (1200, [113])]) 405 ] 406 407 position_dict_tests = [ 408 ("badger", 19, [55, 1333]), 409 ("badger", 20, None), 410 ("bull", 6, [128]), 411 ("bull", 26, [1, 3, 5, 7, 9]), 412 ("cat", 111, None), 413 ("cat", 123, [12, 145, 196]), 414 ("cat", 1234, None) 415 ] 416 417 f = open("test", "wb") 418 w = TermWriter(f) 419 f2 = open("testI", "wb") 420 w2 = TermIndexWriter(f2) 421 f3 = open("testP", "wb") 422 w3 = PositionWriter(f3) 423 f4 = open("testPI", "wb") 424 w4 = PositionIndexWriter(f4) 425 wp = PositionDictionaryWriter(w3, w4, 2) 426 wd = TermDictionaryWriter(w, w2, wp, 3) 427 for term, doc_positions in terms_with_positions: 428 wd.write_term_positions(term, doc_positions) 429 wd.close() 430 431 f = open("test", "rb") 432 r = TermReader(f) 433 f2 = open("testI", "rb") 434 r2 = TermIndexReader(f2) 435 r3 = PositionReader(open("testP", "rb")) 436 r4 = PositionIndexReader(open("testPI", "rb")) 437 rp = PositionDictionaryReader(r3, r4) 438 rd = TermDictionaryReader(r, r2, rp) 439 terms_reversed = terms_with_positions[:] 440 terms_reversed.reverse() 441 for term, doc_positions in terms_reversed: 442 dp = list(rd.find_positions(term)) 443 print doc_positions == dp, doc_positions, dp 444 for term in ("aaa", "dog", "dingo"): 445 dp = rd.find_positions(term) 446 print dp == [], dp 447 448 print "- (Test iterators.)" 449 450 for term, docnum, positions in position_dict_tests: 451 dp = rd.find_positions(term) 452 pos = dp.from_document(docnum) 453 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 454 455 print "- (Test sequential access.)" 456 457 rd.rewind() 458 for term, doc_positions in terms_with_positions: 459 t, fr, df, dp = rd.read_term() 460 dp = list(dp) 461 print term == t, term, t 462 print doc_positions == dp, doc_positions, dp 463 rd.close() 464 465 print "- Test high-level index operations (including merging)." 466 467 docs = [ 468 (1, "The cat sat on the mat"), 469 (2, "Every good boy deserves football"), 470 (13, "One good turn deserves another"), 471 (14, "Every man for himself"), 472 (25, "Red sky at night shepherd's delight"), 473 (36, "She sells sea shells on the sea shore") 474 ] 475 476 doc_tests = [ 477 ("Every", 2, [(2, [0]), (14, [0])]), 478 ("good", 2, [(2, [1]), (13, [1])]), 479 ("deserves", 2, [(2, [3]), (13, [3])]), 480 ("sea", 2, [(36, [2, 6])]) 481 ] 482 483 position_tests = [ 484 ("Every", 14, [0]), 485 ("sea", 36, [2, 6]), 486 ("shells", 1, None), 487 ("shells", 37, None) 488 ] 489 490 phrase_tests = [ 491 (["good", "boy"], [(2, [1, 2])]), 492 (["on", "the"], [(1, [3, 4]), (36, [4, 5])]), 493 (["sea", "shore"], [(36, [6, 7])]) 494 ] 495 496 index = Index("test_index", 3, 2, 3, 6) 497 wi = index.get_writer() 498 for docnum, text in docs: 499 doc = Document(docnum) 500 for position, term in enumerate(text.split()): 501 doc.add_position(term, position) 502 doc.add_field(123, text) 503 wi.add_document(doc) 504 wi.close() 505 506 rd = index.get_reader() 507 508 print "- (Test searching.)" 509 510 for term, frequency, doc_positions in doc_tests: 511 dp = list(rd.find_positions(term)) 512 print doc_positions == dp, doc_positions, dp 513 fr = rd.get_frequency(term) 514 print frequency == fr, frequency, fr 515 516 print "- (Test fields.)" 517 518 for docnum, text in docs: 519 df = dict(rd.get_fields(docnum)) 520 print df[123] == text, text, df[123] 521 522 print "- (Test navigation.)" 523 524 for term, docnum, positions in position_tests: 525 dp = rd.find_positions(term) 526 pos = dp.from_document(docnum) 527 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 528 529 print "- (Test phrases.)" 530 531 for terms, results in phrase_tests: 532 res = list(rd.find_common_positions(terms)) 533 print results == res, results, res 534 535 index.close() 536 537 docs2 = [ 538 ((1, 0), "The cat sat on the mat"), 539 ((1, 2), "Every good boy deserves football"), 540 ((13, 1), "One good turn deserves another"), 541 ((14, 0), "Every man for himself"), 542 ((14, 25), "Red sky at night shepherd's delight"), 543 ((36, 12), "She sells sea shells on the sea shore") 544 ] 545 546 doc_tests2 = [ 547 ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]), 548 ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]), 549 ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]), 550 ("sea", 2, [((36, 12), [(2, 10), (6, 28)])]) 551 ] 552 553 position_tests2 = [ 554 ("Every", (14, 0), [(0, 0)]), 555 ("sea", (36, 12), [(2, 10), (6, 28)]), 556 ("shells", (1, 0), None), 557 ("shells", (37, 0), None) 558 ] 559 560 phrase_tests2 = [ 561 (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]), 562 (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]), 563 (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])]) 564 ] 565 566 index = Index("test_indexT", 3, 2, 3, 6) 567 wi = index.get_writer() 568 for docnum, text in docs2: 569 doc = Document(docnum) 570 offset = 0 571 for position, term in enumerate(text.split()): 572 doc.add_position(term, (position, offset)) 573 offset += len(term) + 1 # assume one space after the term 574 doc.add_field(123, text) 575 wi.add_document(doc) 576 wi.close() 577 578 rd = index.get_reader() 579 580 print "- (Test searching.)" 581 582 for term, frequency, doc_positions in doc_tests2: 583 dp = list(rd.find_positions(term)) 584 print doc_positions == dp, doc_positions, dp 585 fr = rd.get_frequency(term) 586 print frequency == fr, frequency, fr 587 588 print "- (Test fields.)" 589 590 for docnum, text in docs2: 591 df = dict(rd.get_fields(docnum)) 592 print df[123] == text, text, df[123] 593 594 print "- (Test navigation.)" 595 596 for term, docnum, positions in position_tests2: 597 dp = rd.find_positions(term) 598 pos = dp.from_document(docnum) 599 print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos 600 601 print "- (Test phrases.)" 602 603 for terms, results in phrase_tests2: 604 res = list(rd.find_common_positions(terms)) 605 print results == res, results, res 606 607 index.close() 608 609 print "- Test index updates." 610 611 index = Index("test_index") 612 index2 = Index("test_index2", 3, 2, 3, 6) 613 wi = index2.get_writer() 614 for docnum, text in docs: 615 616 # Add the same documents but with different numbers. 617 618 doc = Document(docnum + 100) 619 for position, term in enumerate(text.split()): 620 doc.add_position(term, position) 621 doc.add_field(123, text) 622 wi.add_document(doc) 623 wi.close() 624 625 index2.update([index]) 626 index.close() 627 628 rd = index2.get_reader() 629 for term, frequency, doc_positions in doc_tests: 630 631 # Add the extra documents to the expected result. 632 633 orig_doc_positions = doc_positions 634 doc_positions = doc_positions[:] 635 636 for docnum, positions in orig_doc_positions: 637 doc_positions.append((docnum + 100, positions)) 638 frequency *= 2 639 640 dp = list(rd.find_positions(term)) 641 print doc_positions == dp, doc_positions, dp 642 fr = rd.get_frequency(term) 643 print frequency == fr, frequency, fr 644 index2.close() 645 646 print "- (Test update of an empty index.)" 647 648 index = Index("test_index") 649 index3 = Index("test_index3") 650 index3.update([index]) 651 index.close() 652 653 rd = index3.get_reader() 654 for term, frequency, doc_positions in doc_tests: 655 dp = list(rd.find_positions(term)) 656 print doc_positions == dp, doc_positions, dp 657 fr = rd.get_frequency(term) 658 print frequency == fr, frequency, fr 659 index3.close() 660 661 # vim: tabstop=4 expandtab shiftwidth=4