1 #!/usr/bin/env python 2 3 from iixr.files import * 4 from iixr.fields import * 5 from iixr.terms import * 6 from iixr.positions import * 7 from iixr.index import * 8 import os, sys 9 10 # Remove old test files. 11 12 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"): 13 try: 14 os.remove(filename) 15 except OSError: 16 pass 17 18 try: 19 for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"): 20 for filename in os.listdir(dirname): 21 os.remove(os.path.join(dirname, filename)) 22 os.rmdir(dirname) 23 except OSError: 24 pass 25 26 if "clean" in sys.argv: 27 sys.exit(0) 28 29 print "- Test basic data types." 30 31 numbers = [12345678, 0, 1, 127, 128, 255, 256] 32 33 f = open("test", "wb") 34 w = FileWriter(f) 35 for number in numbers: 36 w.write_number(number) 37 w.close() 38 39 f = open("test", "rb") 40 r = FileReader(f) 41 for number in numbers: 42 n = r.read_number() 43 print number == n, number, n 44 r.close() 45 46 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] 47 48 f = open("testMS", "wb") 49 w = FileWriter(f) 50 last = w.get_initial_value(2) 51 for t in tuples: 52 last = w.write_sequence(t, last, 2) 53 w.close() 54 55 f = open("testMS", "rb") 56 r = FileReader(f) 57 last = r.get_initial_value(2) 58 for t in tuples: 59 last = t2 = r.read_sequence(last, 2) 60 print t == t2, t, t2 61 r.close() 62 63 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] 64 65 f = open("testNMS", "wb") 66 w = FileWriter(f) 67 last = w.get_initial_value(2) 68 for t in tuples2: 69 last = w.write_sequence(t, last, 2, monotonic=0) 70 w.close() 71 72 f = open("testNMS", "rb") 73 r = FileReader(f) 74 last = r.get_initial_value(2) 75 for t in tuples2: 76 last = t2 = r.read_sequence(last, 2, monotonic=0) 77 print t == t2, t, t2 78 r.close() 79 80 print "- Test positions." 
81 82 all_doc_positions = [ 83 [ 84 (123, [1, 3, 5, 15, 25]), 85 (124, [0, 100]), 86 (125, [11, 99, 199]), 87 (130, [77, 78, 80, 82, 89]) 88 ], 89 [ 90 (78, [9]), 91 (196, [10, 11]), 92 (197, [17, 21, 30]) 93 ] 94 ] 95 96 f = open("testP", "wb") 97 w = PositionWriter(f) 98 for doc_positions in all_doc_positions: 99 for docnum, positions in doc_positions: 100 w.write_positions(docnum, positions) 101 w.reset() 102 w.close() 103 104 f = open("testP", "rb") 105 r = PositionReader(f) 106 for doc_positions in all_doc_positions: 107 for docnum, positions in doc_positions: 108 d, p = r.read_positions() 109 print docnum == d, docnum, d 110 print positions == p, positions, p 111 r.reset() 112 r.close() 113 114 all_doc_positions_seq = [ 115 [ 116 ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), 117 ((124, 1), [(0, 0), (100, 350)]), 118 ((124, 2), [(11, 38), (99, 379), (199, 720)]), 119 ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) 120 ], 121 [ 122 ((78, 1), [(9, 19)]), 123 ((196, 0), [(10, 27), (11, 29)]), 124 ((196, 1), [(17, 46), (21, 52), (30, 60)]) 125 ] 126 ] 127 128 f = open("testP2", "wb") 129 w = PositionWriter(f) 130 for doc_positions in all_doc_positions_seq: 131 for docnum, positions in doc_positions: 132 w.write_positions(docnum, positions) 133 w.reset() 134 w.close() 135 136 f = open("testP2", "rb") 137 r = PositionReader(f) 138 for doc_positions in all_doc_positions_seq: 139 for docnum, positions in doc_positions: 140 d, p = r.read_positions() 141 print tuple(docnum) == tuple(d), docnum, d 142 print tuple(positions) == tuple(p), positions, p 143 r.reset() 144 r.close() 145 146 print "- Test position index files." 
147 148 indexed_positions = [ 149 [ 150 (1234, 0, 100), 151 (2345, 700, 100), 152 (3456, 1900, 50) 153 ], 154 [ 155 (4567, 2800, 20) 156 ] 157 ] 158 159 offsets = [] 160 f = open("testPI", "wb") 161 w = PositionIndexWriter(f) 162 for term_positions in indexed_positions: 163 offset = None 164 doc_frequency = 0 165 w.reset() 166 for docnum, pos_offset, count in term_positions: 167 if offset is None: 168 offset = w.tell() 169 w.write_positions(docnum, pos_offset, count) 170 doc_frequency += count 171 offsets.append((offset, doc_frequency)) 172 w.close() 173 174 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb"))) 175 offsets.reverse() 176 indexed_positions.reverse() 177 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): 178 r.seek(offset, doc_frequency) 179 for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r): 180 print docnum == dn, docnum, dn 181 print pos_offset == po, pos_offset, po 182 print count == c, count, c 183 r.reader.close() 184 185 print "- Test position dictionaries." 186 187 f = open("testP", "wb") 188 w = PositionWriter(f) 189 f2 = open("testPI", "wb") 190 w2 = PositionIndexWriter(f2) 191 wd = PositionDictionaryWriter(w, w2, 2) 192 offsets = [] 193 for doc_positions in all_doc_positions: 194 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions) 195 offsets.append((offset, doc_frequency)) 196 wd.close() 197 198 r = PositionReader(open("testP", "rb")) 199 r2 = PositionIndexReader(open("testPI", "rb")) 200 rd = PositionDictionaryReader(r, r2) 201 offsets.reverse() 202 all_doc_positions.reverse() 203 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): 204 it = rd.read_term_positions(offset, doc_frequency) 205 dp = list(it) 206 print doc_positions == dp, doc_positions, dp 207 rd.close() 208 209 print "- Test fields." 
210 211 doc_fields = [ 212 (123, ["testing", "fields", "stored", "compressed"]), 213 (456, ["fields", "for a second", "document"]), 214 (789, ["field value"]), 215 (1234, []), 216 (2345, ["abc", "def"]), 217 (3456, ["apple", "banana", "cherry"]), 218 (4567, ["drue", "eple"]) 219 ] 220 221 f = open("testF", "wb") 222 w = FieldWriter(f) 223 for docnum, fields in doc_fields: 224 w.write_fields(docnum, list(enumerate(fields))) 225 w.close() 226 227 f = open("testF", "rb") 228 r = FieldReader(f) 229 for docnum, fields in doc_fields: 230 dn, df = r.read_fields() 231 print docnum == dn, docnum, dn 232 print list(enumerate(fields)) == df, list(enumerate(fields)), df 233 r.close() 234 235 print "- Test field index files." 236 237 indexed_docs = [ 238 (123, 100000987), 239 (456, 100004321), 240 (789, 100008765) 241 ] 242 243 f = open("testFI", "wb") 244 w = FieldIndexWriter(f) 245 for docnum, offset in indexed_docs: 246 w.write_document(docnum, offset) 247 w.close() 248 249 f = open("testFI", "rb") 250 r = FieldIndexReader(f) 251 for docnum, offset in indexed_docs: 252 dn, o = r.read_document() 253 print docnum == dn, docnum, dn 254 print offset == o, offset, o 255 r.close() 256 257 print "- Test field dictionaries." 
258 259 f = open("testF", "wb") 260 w = FieldWriter(f) 261 f2 = open("testFI", "wb") 262 w2 = FieldIndexWriter(f2) 263 wd = FieldDictionaryWriter(w, w2, 3) 264 for docnum, fields in doc_fields: 265 wd.write_fields(docnum, list(enumerate(fields))) 266 wd.close() 267 268 f = open("testF", "rb") 269 r = FieldReader(f) 270 f2 = open("testFI", "rb") 271 r2 = FieldIndexReader(f2) 272 rd = FieldDictionaryReader(r, r2) 273 doc_fields_reversed = doc_fields[:] 274 doc_fields_reversed.reverse() 275 for docnum, fields in doc_fields_reversed: 276 df = dict(rd.get_fields(docnum)) 277 print dict(enumerate(fields)) == df, dict(enumerate(fields)), df 278 for docnum in (13579, 246810): 279 df = rd.get_fields(docnum) 280 print df is None, df 281 282 print "- (Test sequential access.)" 283 284 rd.rewind() 285 for docnum, fields in doc_fields: 286 dn, df = rd.read_fields() 287 print docnum == dn, docnum, dn 288 print list(enumerate(fields)) == df, list(enumerate(fields)), df 289 rd.close() 290 291 print "- Test terms." 292 293 terms = [ 294 # term offset frequency doc_frequency 295 ("aardvark", 100000123, 1, 1), 296 ("anteater", 100000456, 2, 1), 297 ("badger", 100000789, 13, 7), 298 ("bull", 1000001234, 59, 17), 299 ("bulldog", 1000002345, 99, 80), 300 ("cat", 1000003456, 89, 28) 301 ] 302 303 f = open("test", "wb") 304 w = TermWriter(f) 305 for term, offset, frequency, doc_frequency in terms: 306 w.write_term(term, offset, frequency, doc_frequency) 307 w.close() 308 309 f = open("test", "rb") 310 r = TermReader(f) 311 for term, offset, frequency, doc_frequency in terms: 312 t, o, fr, df = r.read_term() 313 print term == t, term, t 314 print offset == o, offset, o 315 print frequency == fr, frequency, fr 316 print doc_frequency == df, doc_frequency, df 317 r.close() 318 319 print "- Test terms in index files." 
320 321 indexed_terms = [ 322 # term offset frequency doc_frequency info_offset 323 ("aardvark", 100000123, 1, 1, 200000321), 324 ("anteater", 100000456, 2, 1, 200000654), 325 ("badger", 100000789, 13, 7, 200000987), 326 ("bull", 1000001234, 59, 17, 200004321), 327 ("bulldog", 1000002345, 99, 80, 200005432), 328 ("cat", 1000003456, 89, 28, 200006543) 329 ] 330 331 f = open("test", "wb") 332 w = TermIndexWriter(f) 333 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 334 w.write_term(term, offset, frequency, doc_frequency, info_offset) 335 w.close() 336 337 f = open("test", "rb") 338 r = TermIndexReader(f) 339 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 340 t, o, fr, df, i = r.read_term() 341 print term == t, term, t 342 print offset == o, offset, o 343 print frequency == fr, frequency, fr 344 print doc_frequency == df, doc_frequency, df 345 print info_offset == i, info_offset, i 346 r.close() 347 348 print "- Test dictionaries with only term data." 
349 350 f = open("test", "wb") 351 w = TermWriter(f) 352 f2 = open("testI", "wb") 353 w2 = TermIndexWriter(f2) 354 f3 = open("testP", "wb") 355 w3 = PositionWriter(f3) 356 f4 = open("testPI", "wb") 357 w4 = PositionIndexWriter(f4) 358 wp = PositionDictionaryWriter(w3, w4, 2) 359 wd = TermDictionaryWriter(w, w2, wp, 3) 360 for term, offset, frequency, doc_frequency in terms: 361 wd._write_term(term, offset, frequency, doc_frequency) 362 wd.close() 363 364 f = open("test", "rb") 365 r = TermReader(f) 366 f2 = open("testI", "rb") 367 r2 = TermIndexReader(f2) 368 r3 = PositionReader(open("testP", "rb")) 369 r4 = PositionIndexReader(open("testPI", "rb")) 370 rp = PositionDictionaryReader(r3, r4) 371 rd = TermDictionaryReader(r, r2, rp) 372 terms_reversed = terms[:] 373 terms_reversed.reverse() 374 for term, offset, frequency, doc_frequency in terms_reversed: 375 o, fr, df = rd._find_term(term) 376 print offset == o, offset, o 377 print frequency == fr, frequency, fr 378 print doc_frequency == df, doc_frequency, df 379 for term in ("dog", "dingo"): 380 t = rd._find_term(term) 381 print t is None, t 382 383 print "- (Test term prefix searching.)" 384 385 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] 386 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] 387 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] 388 print rd.find_terms("d") == [], rd.find_terms("d"), [] 389 rd.close() 390 391 print "- Test dictionaries with term and position data." 
# Terms with full position postings: each entry maps a term to its
# (docnum, positions) postings list.

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

# (term, docnum, expected positions) — None marks a document that should
# not appear in the term's postings.

position_dict_tests = [
    ("badger", 19, [55, 1333]),
    ("badger", 20, None),
    ("bull", 6, [128]),
    ("bull", 26, [1, 3, 5, 7, 9]),
    ("cat", 111, None),
    ("cat", 123, [12, 145, 196]),
    ("cat", 1234, None)
]

# Write the terms and their positions through a term dictionary backed
# by term, term-index, position and position-index files.

f = open("test", "wb")
w = TermWriter(f)
f2 = open("testI", "wb")
w2 = TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = PositionIndexWriter(f4)
wp = PositionDictionaryWriter(w3, w4, 2)
wd = TermDictionaryWriter(w, w2, wp, 3)
for term, doc_positions in terms_with_positions:
    wd.write_term_positions(term, doc_positions)
wd.close()

# Read the postings back in reverse order to exercise random access.

f = open("test", "rb")
r = TermReader(f)
f2 = open("testI", "rb")
r2 = TermIndexReader(f2)
r3 = PositionReader(open("testP", "rb"))
r4 = PositionIndexReader(open("testPI", "rb"))
rp = PositionDictionaryReader(r3, r4)
rd = TermDictionaryReader(r, r2, rp)
terms_reversed = terms_with_positions[:]
terms_reversed.reverse()
for term, doc_positions in terms_reversed:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp

# Unknown terms should produce empty postings.

for term in ("aaa", "dog", "dingo"):
    dp = rd.find_positions(term)
    print dp == [], dp

print "- (Test iterators.)"

# from_document(docnum) navigates the postings iterator to the given
# document, returning its positions, or None when the document is absent.

for term, docnum, positions in position_dict_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos

print "- (Test sequential access.)"

# After rewind(), read_term() yields each term with its frequency,
# document frequency and postings in stored order.

rd.rewind()
for term, doc_positions in terms_with_positions:
    t, fr, df, dp = rd.read_term()
    dp = list(dp)
    print term == t, term, t
    print doc_positions == dp, doc_positions, dp
rd.close()

print "- Test high-level index operations (including merging)."

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected (docnum, word positions) postings).

doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# (term, docnum, expected positions or None when absent).

position_tests = [
    ("Every", 14, [0]),
    ("sea", 36, [2, 6]),
    ("shells", 1, None),
    ("shells", 37, None)
]

# (terms, expected postings); the expected position runs are consecutive
# word positions within each matching document.

phrase_tests = [
    (["good", "boy"], [(2, [1, 2])]),
    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
    (["sea", "shore"], [(36, [6, 7])])
]

# Index the documents, recording each word's position within its text
# and storing the full text as field 123.

index = Index("test_index", 3, 2, 3, 6)
wi = index.get_writer()
for docnum, text in docs:
    doc = Document(docnum)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

rd = index.get_reader()

print "- (Test searching.)"

for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr

print "- (Test fields.)"

# Stored field 123 should round-trip the original text.

for docnum, text in docs:
    df = dict(rd.get_fields(docnum))
    print df[123] == text, text, df[123]

print "- (Test navigation.)"

for term, docnum, positions in position_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
print "- (Test phrases.)"

# find_common_positions(terms) yields documents containing all terms,
# with the matching position runs (see phrase_tests above for shape).

for terms, results in phrase_tests:
    res = list(rd.find_common_positions(terms))
    print results == res, results, res

index.close()

# The same documents again, but document numbers are pairs and each word
# position is a (word index, character offset) pair; the offsets in the
# expected data match the offset computation in the writer loop below.

docs2 = [
    ((1, 0), "The cat sat on the mat"),
    ((1, 2), "Every good boy deserves football"),
    ((13, 1), "One good turn deserves another"),
    ((14, 0), "Every man for himself"),
    ((14, 25), "Red sky at night shepherd's delight"),
    ((36, 12), "She sells sea shells on the sea shore")
]

doc_tests2 = [
    ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
    ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
    ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
    ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
]

position_tests2 = [
    ("Every", (14, 0), [(0, 0)]),
    ("sea", (36, 12), [(2, 10), (6, 28)]),
    ("shells", (1, 0), None),
    ("shells", (37, 0), None)
]

phrase_tests2 = [
    (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
    (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
    (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
]

# Repeat the high-level tests with pair-valued docnums and positions.

index = Index("test_indexT", 3, 2, 3, 6)
wi = index.get_writer()
for docnum, text in docs2:
    doc = Document(docnum)
    offset = 0
    for position, term in enumerate(text.split()):
        doc.add_position(term, (position, offset))
        offset += len(term) + 1 # assume one space after the term
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

rd = index.get_reader()

print "- (Test searching.)"

for term, frequency, doc_positions in doc_tests2:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr

print "- (Test fields.)"

for docnum, text in docs2:
    df = dict(rd.get_fields(docnum))
    print df[123] == text, text, df[123]

print "- (Test navigation.)"

for term, docnum, positions in position_tests2:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos

print "- (Test phrases.)"

for terms, results in phrase_tests2:
    res = list(rd.find_common_positions(terms))
    print results == res, results, res

index.close()

print "- Test index updates."

# Build a second index holding the same documents under docnum + 100,
# then merge the first index into it with update().

index = Index("test_index")
index2 = Index("test_index2", 3, 2, 3, 6)
wi = index2.get_writer()
for docnum, text in docs:

    # Add the same documents but with different numbers.

    doc = Document(docnum + 100)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

index2.update([index])
index.close()

# After the merge, each term should match both the original and the
# renumbered documents, doubling its frequency.

rd = index2.get_reader()
for term, frequency, doc_positions in doc_tests:

    # Add the extra documents to the expected result.

    orig_doc_positions = doc_positions
    doc_positions = doc_positions[:]

    for docnum, positions in orig_doc_positions:
        doc_positions.append((docnum + 100, positions))
    frequency *= 2

    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr
index2.close()

print "- (Test update of an empty index.)"

# Merging into an empty index should reproduce the source index exactly,
# so the unmodified doc_tests expectations apply.

index = Index("test_index")
index3 = Index("test_index3")
index3.update([index])
index.close()

rd = index3.get_reader()
for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr
index3.close()

# vim: tabstop=4 expandtab shiftwidth=4