#!/usr/bin/env python

# Test script for the iixr file, field, term, position and index classes.
# Each check prints the result of a comparison followed by the expected and
# the actual value. Passing "clean" as an argument only removes the files left
# behind by an earlier run.
#
# Output is written with print("..." % ...) so that the text produced is the
# same whether print is a statement or a function.

from iixr.files import *
from iixr.fields import *
from iixr.terms import *
from iixr.positions import *
from iixr.index import *
import os
import sys

# Remove old test files.

for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
    try:
        os.remove(filename)
    except OSError:
        pass

# Remove old index directories. Each directory is handled on its own so that
# a missing (or otherwise unremovable) directory does not prevent the
# remaining directories from being cleaned up.

for dirname in ("test_index", "test_index2", "test_index3"):
    try:
        for filename in os.listdir(dirname):
            os.remove(os.path.join(dirname, filename))
        os.rmdir(dirname)
    except OSError:
        pass

if "clean" in sys.argv:
    sys.exit(0)

print("- Test basic data types.")

numbers = [12345678, 0, 1, 127, 128, 255, 256]

# Write the numbers and then read them back in the same order.

f = open("test", "wb")
w = FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

f = open("test", "rb")
r = FileReader(f)
for number in numbers:
    n = r.read_number()
    print("%s %s %s" % (number == n, number, n))
r.close()

print("- Test positions.")

# One list of (document number, positions) pairs per term.

all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100]),
        (125, [11, 99, 199]),
        (130, [77, 78, 80, 82, 89])
    ],
    [
        (78, [9]),
        (196, [10, 11]),
        (197, [17, 21, 30])
    ]
]

# The writer is reset after each group of documents.

f = open("testP", "wb")
w = PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    w.reset()
w.close()

# The reader is reset at the same points as the writer was.

f = open("testP", "rb")
r = PositionReader(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        d, p = r.read_positions()
        print("%s %s %s" % (docnum == d, docnum, d))
        print("%s %s %s" % (positions == p, positions, p))
    r.reset()
r.close()

print("- Test position index files.")
# One list of (document number, position file offset, count) entries per term.

indexed_positions = [
    [
        (1234, 0, 100),
        (2345, 700, 100),
        (3456, 1900, 50)
    ],
    [
        (4567, 2800, 20)
    ]
]

# Write the entries for each term, remembering the file offset at which the
# term's first entry starts together with the sum of the entry counts.

offsets = []
f = open("testPI", "wb")
w = PositionIndexWriter(f)
for term_positions in indexed_positions:
    w.reset()
    offset = None
    doc_frequency = 0
    for docnum, pos_offset, count in term_positions:
        if offset is None:
            offset = w.f.tell()
        w.write_positions(docnum, pos_offset, count)
        doc_frequency += count
    offsets.append((offset, doc_frequency))
w.close()

# Visit the terms in reverse order, seeking to each recorded offset.

r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))
offsets = offsets[::-1]
indexed_positions = indexed_positions[::-1]
for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
    r.seek(offset, doc_frequency)
    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
        for expected, found in ((docnum, dn), (pos_offset, po), (count, c)):
            print("%s %s %s" % (expected == found, expected, found))
r.reader.close()

print("- Test position dictionaries.")

# Write the positions for each term through the dictionary writer, keeping
# the returned offset and document frequency for reading back.

f = open("testP", "wb")
w = PositionWriter(f)
f2 = open("testPI", "wb")
w2 = PositionIndexWriter(f2)
wd = PositionDictionaryWriter(w, w2, 2)
offsets = []
for doc_positions in all_doc_positions:
    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
    offsets.append((offset, doc_frequency))
wd.close()

# Read the terms back in reverse order.

r = PositionReader(open("testP", "rb"))
r2 = PositionIndexReader(open("testPI", "rb"))
rd = PositionDictionaryReader(r, r2)
offsets = offsets[::-1]
all_doc_positions = all_doc_positions[::-1]
for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
    dp = list(rd.read_term_positions(offset, doc_frequency))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()

print("- Test fields.")
# (document number, field values) pairs; fields are numbered from zero when
# written.

doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"])
]

f = open("testF", "wb")
w = FieldWriter(f)
for docnum, fields in doc_fields:
    numbered = list(enumerate(fields))
    w.write_fields(docnum, numbered)
w.close()

# Read the documents back in the order in which they were written.

f = open("testF", "rb")
r = FieldReader(f)
for docnum, fields in doc_fields:
    dn, df = r.read_fields()
    numbered = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (numbered == df, numbered, df))
r.close()

print("- Test field index files.")

# (document number, offset) pairs.

indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765)
]

f = open("testFI", "wb")
w = FieldIndexWriter(f)
for entry in indexed_docs:
    w.write_document(*entry)
w.close()

f = open("testFI", "rb")
r = FieldIndexReader(f)
for docnum, offset in indexed_docs:
    dn, o = r.read_document()
    for expected, found in ((docnum, dn), (offset, o)):
        print("%s %s %s" % (expected == found, expected, found))
r.close()

print("- Test field dictionaries.")
# Write all documents through the dictionary writer.

f = open("testF", "wb")
w = FieldWriter(f)
f2 = open("testFI", "wb")
w2 = FieldIndexWriter(f2)
wd = FieldDictionaryWriter(w, w2, 3)
for docnum, fields in doc_fields:
    wd.write_fields(docnum, list(enumerate(fields)))
wd.close()

# Look the documents up in reverse order, then try document numbers that were
# never written.

f = open("testF", "rb")
r = FieldReader(f)
f2 = open("testFI", "rb")
r2 = FieldIndexReader(f2)
rd = FieldDictionaryReader(r, r2)
doc_fields_reversed = doc_fields[::-1]
for docnum, fields in doc_fields_reversed:
    df = dict(rd.get_fields(docnum))
    expected = dict(enumerate(fields))
    print("%s %s %s" % (expected == df, expected, df))
for docnum in (13579, 246810):
    df = rd.get_fields(docnum)
    print("%s %s" % (df is None, df))

print("- (Test sequential access.)")

rd.rewind()
for docnum, fields in doc_fields:
    dn, df = rd.read_fields()
    numbered = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (numbered == df, numbered, df))
rd.close()

print("- Test terms.")

terms = [
    # term      offset      frequency   doc_frequency
    ("aardvark", 100000123, 1, 1),
    ("anteater", 100000456, 2, 1),
    ("badger", 100000789, 13, 7),
    ("bull", 1000001234, 59, 17),
    ("bulldog", 1000002345, 99, 80),
    ("cat", 1000003456, 89, 28)
]

f = open("test", "wb")
w = TermWriter(f)
for entry in terms:
    w.write_term(*entry)
w.close()

# Read the terms back in the order in which they were written.

f = open("test", "rb")
r = TermReader(f)
for term, offset, frequency, doc_frequency in terms:
    t, o, fr, df = r.read_term()
    for expected, found in ((term, t), (offset, o), (frequency, fr), (doc_frequency, df)):
        print("%s %s %s" % (expected == found, expected, found))
r.close()

print("- Test terms in index files.")
indexed_terms = [
    # term      offset      frequency   doc_frequency   info_offset
    ("aardvark", 100000123, 1, 1, 200000321),
    ("anteater", 100000456, 2, 1, 200000654),
    ("badger", 100000789, 13, 7, 200000987),
    ("bull", 1000001234, 59, 17, 200004321),
    ("bulldog", 1000002345, 99, 80, 200005432),
    ("cat", 1000003456, 89, 28, 200006543)
]

f = open("test", "wb")
w = TermIndexWriter(f)
for entry in indexed_terms:
    w.write_term(*entry)
w.close()

# Read the entries back in the order in which they were written.

f = open("test", "rb")
r = TermIndexReader(f)
for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
    t, o, fr, df, i = r.read_term()
    comparisons = (
        (term, t),
        (offset, o),
        (frequency, fr),
        (doc_frequency, df),
        (info_offset, i),
    )
    for expected, found in comparisons:
        print("%s %s %s" % (expected == found, expected, found))
r.close()

print("- Test dictionaries with only term data.")
# Build a term dictionary writer on top of term, term index, position and
# position index writers.

f = open("test", "wb")
w = TermWriter(f)
f2 = open("testI", "wb")
w2 = TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = PositionIndexWriter(f4)
wp = PositionDictionaryWriter(w3, w4, 2)
wd = TermDictionaryWriter(w, w2, wp, 3)
for entry in terms:
    wd._write_term(*entry)
wd.close()

# Look the terms up in reverse order, then try terms that were never written.

f = open("test", "rb")
r = TermReader(f)
f2 = open("testI", "rb")
r2 = TermIndexReader(f2)
r3 = PositionReader(open("testP", "rb"))
r4 = PositionIndexReader(open("testPI", "rb"))
rp = PositionDictionaryReader(r3, r4)
rd = TermDictionaryReader(r, r2, rp)
terms_reversed = terms[::-1]
for term, offset, frequency, doc_frequency in terms_reversed:
    o, fr, df = rd._find_term(term)
    for expected, found in ((offset, o), (frequency, fr), (doc_frequency, df)):
        print("%s %s %s" % (expected == found, expected, found))
for term in ("dog", "dingo"):
    t = rd._find_term(term)
    print("%s %s" % (t is None, t))

print("- (Test term prefix searching.)")

# Each prefix is searched for twice: once for the comparison and once for the
# value shown.

prefix_tests = (
    ("a", ["aardvark", "anteater"]),
    ("bu", ["bull", "bulldog"]),
    ("c", ["cat"]),
    ("d", []),
)
for prefix, expected in prefix_tests:
    print("%s %s %s" % (rd.find_terms(prefix) == expected, rd.find_terms(prefix), expected))
rd.close()

print("- Test dictionaries with term and position data.")
# (term, [(document number, positions), ...]) pairs.

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

# (term, document number, expected positions) triples; None is expected where
# the document is not among those written for the term.

position_dict_tests = [
    ("badger", 19, [55, 1333]),
    ("badger", 20, None),
    ("bull", 6, [128]),
    ("bull", 26, [1, 3, 5, 7, 9]),
    ("cat", 111, None),
    ("cat", 123, [12, 145, 196]),
    ("cat", 1234, None)
]

f = open("test", "wb")
w = TermWriter(f)
f2 = open("testI", "wb")
w2 = TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = PositionIndexWriter(f4)
wp = PositionDictionaryWriter(w3, w4, 2)
wd = TermDictionaryWriter(w, w2, wp, 3)
for entry in terms_with_positions:
    wd.write_term_positions(*entry)
wd.close()

# Look the terms up in reverse order, then try terms that were never written.

f = open("test", "rb")
r = TermReader(f)
f2 = open("testI", "rb")
r2 = TermIndexReader(f2)
r3 = PositionReader(open("testP", "rb"))
r4 = PositionIndexReader(open("testPI", "rb"))
rp = PositionDictionaryReader(r3, r4)
rd = TermDictionaryReader(r, r2, rp)
terms_reversed = terms_with_positions[::-1]
for term, doc_positions in terms_reversed:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
for term in ("aaa", "dog", "dingo"):
    dp = rd.find_positions(term)
    print("%s %s" % (dp == [], dp))

print("- (Test iterators.)")

for term, docnum, positions in position_dict_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)

    # A missing result must correspond to an expectation of None; otherwise
    # the positions obtained must equal the expected ones.

    if pos is None:
        matches = positions is None
    else:
        matches = positions == list(pos)
    print("%s %s %s" % (matches, positions, pos))

print("- (Test sequential access.)")

rd.rewind()
for term, doc_positions in terms_with_positions:
    t, fr, df, dp = rd.read_term()
    dp = list(dp)
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()

print("- Test high-level index operations (including merging).")

# (document number, text) pairs; the words of each text are indexed by their
# position in the text.

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected document positions) triples.

doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# (term, document number, expected positions) triples.

position_tests = [
    ("Every", 14, [0]),
    ("sea", 36, [2, 6]),
    ("shells", 1, None),
    ("shells", 37, None)
]

# (terms, expected results) pairs.

phrase_tests = [
    (["good", "boy"], [(2, [1, 2])]),
    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
    (["sea", "shore"], [(36, [6, 7])])
]

index = Index("test_index", 3, 2, 3, 6)
wi = index.get_writer()
for docnum, text in docs:
    doc = Document(docnum)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

rd = index.get_reader()

print("- (Test searching.)")

for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))

print("- (Test fields.)")

for docnum, text in docs:
    df = dict(rd.get_fields(docnum))
    print("%s %s %s" % (df[123] == text, text, df[123]))

print("- (Test navigation.)")

for term, docnum, positions in position_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)

    # A missing result must correspond to an expectation of None; otherwise
    # the positions obtained must equal the expected ones.

    if pos is None:
        matches = positions is None
    else:
        matches = positions == list(pos)
    print("%s %s %s" % (matches, positions, pos))
print("- (Test phrases.)")

for terms, results in phrase_tests:
    res = list(rd.find_common_positions(terms))
    print("%s %s %s" % (results == res, results, res))

index.close()

print("- Test index updates.")

index = Index("test_index")
index2 = Index("test_index2", 3, 2, 3, 6)
wi = index2.get_writer()
for docnum, text in docs:

    # Add the same documents but with different numbers.

    doc = Document(docnum + 100)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

index2.update([index])
index.close()

rd = index2.get_reader()
for term, frequency, doc_positions in doc_tests:

    # Add the extra documents to the expected result: every original entry
    # is followed by a copy numbered 100 higher, and the expected frequency
    # is doubled.

    renumbered = [(docnum + 100, positions) for docnum, positions in doc_positions]
    doc_positions = doc_positions + renumbered
    frequency = frequency * 2

    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))
index2.close()

print("- (Test update of an empty index.)")

# The third index is created without any documents of its own before being
# updated from the first index.

index = Index("test_index")
index3 = Index("test_index3")
index3.update([index])
index.close()

rd = index3.get_reader()
for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))
index3.close()

# vim: tabstop=4 expandtab shiftwidth=4