#!/usr/bin/env python

# Test script for the iixr file, field, term, position and index classes.
# Each check prints a True/False result followed by the values involved.
# Passing "clean" on the command line only removes the files and directories
# of a previous run and then exits.

from iixr.files import *
from iixr.fields import *
from iixr.terms import *
from iixr.positions import *
from iixr.index import *
import os
import sys

# Remove old test files.

for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
    try:
        os.remove(filename)
    except OSError:
        pass

# Remove old test index directories. Every directory gets its own try block
# so that a missing (or otherwise unremovable) directory does not prevent the
# remaining directories from being cleaned up.

for dirname in ("test_index", "test_index2", "test_index3"):
    try:
        for filename in os.listdir(dirname):
            os.remove(os.path.join(dirname, filename))
        os.rmdir(dirname)
    except OSError:
        pass

if "clean" in sys.argv:
    sys.exit(0)

# Test basic data types.

numbers = [12345678, 0, 1, 127, 128, 255, 256]

# Write all numbers to a file, then read them back in the same order.

f = open("test", "wb")
w = FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

f = open("test", "rb")
r = FileReader(f)
for number in numbers:
    found = r.read_number()
    print("%s %s %s" % (number == found, number, found))
r.close()

# Test positions.

# Each inner list is one group of (docnum, positions) entries; the writer and
# the reader are reset after every group.

all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100]),
        (125, [11, 99, 199]),
        (130, [77, 78, 80, 82, 89])
    ],
    [
        (78, [9]),
        (196, [10, 11]),
        (197, [17, 21, 30])
    ]
]

f = open("testP", "wb")
w = PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    w.reset()
w.close()

f = open("testP", "rb")
r = PositionIterator(f, 0, None)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        found_docnum, found_positions = r.read_positions()
        print("%s %s %s" % (docnum == found_docnum, docnum, found_docnum))
        print("%s %s %s" % (positions == found_positions, positions, found_positions))
    r.reset()
r.close()

# Test position index files.
# Each inner list is written as one group of (docnum, pos_offset, count)
# entries. For every group the file offset of its first entry and the sum of
# its counts are remembered, and later passed to read_term_positions.

indexed_positions = [
    [
        (1234, 0, 100),
        (2345, 700, 100),
        (3456, 1900, 50)
    ],
    [
        (4567, 2800, 20)
    ]
]

offsets = []
f = open("testPI", "wb")
w = PositionIndexWriter(f)
for term_positions in indexed_positions:
    start = None
    total = 0
    w.reset()
    for docnum, pos_offset, count in term_positions:
        # Record where the first entry of this group begins.
        if start is None:
            start = w.f.tell()
        w.write_positions(docnum, pos_offset, count)
        total += count
    offsets.append((start, total))
w.close()

# Read the groups back in the opposite order to the one used for writing.

r = PositionIndexOpener("testPI")
offsets.reverse()
indexed_positions.reverse()
for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
    found_positions = r.read_term_positions(offset, doc_frequency)
    for expected, found in zip(term_positions, found_positions):
        docnum, pos_offset, count = expected
        dn, po, c = found
        print("%s %s %s" % (docnum == dn, docnum, dn))
        print("%s %s %s" % (pos_offset == po, pos_offset, po))
        print("%s %s %s" % (count == c, count, c))
r.close()

# Test position dictionaries.

# Write every group through the dictionary writer, keeping the returned
# offset and document frequency for the lookups below.

f = open("testP", "wb")
w = PositionWriter(f)
f2 = open("testPI", "wb")
w2 = PositionIndexWriter(f2)
wd = PositionDictionaryWriter(w, w2, 2)
offsets = []
for doc_positions in all_doc_positions:
    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
    offsets.append((offset, doc_frequency))
wd.close()

# Look the groups up again, last one first.

r = PositionOpener("testP")
r2 = PositionIndexOpener("testPI")
rd = PositionDictionaryReader(r, r2)
offsets.reverse()
all_doc_positions.reverse()
for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
    found = list(rd.read_term_positions(offset, doc_frequency))
    print("%s %s %s" % (doc_positions == found, doc_positions, found))
rd.close()

# Test fields.
# (docnum, field values) pairs; the values are numbered with enumerate()
# before being written.

doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"])
]

f = open("testF", "wb")
w = FieldWriter(f)
for docnum, fields in doc_fields:
    w.write_fields(docnum, list(enumerate(fields)))
w.close()

f = open("testF", "rb")
r = FieldReader(f)
for docnum, fields in doc_fields:
    expected = list(enumerate(fields))
    dn, found = r.read_fields()
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == found, expected, found))
r.close()

# Test field index files.

indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765)
]

f = open("testFI", "wb")
w = FieldIndexWriter(f)
for docnum, offset in indexed_docs:
    w.write_document(docnum, offset)
w.close()

f = open("testFI", "rb")
r = FieldIndexReader(f)
for docnum, offset in indexed_docs:
    dn, o = r.read_document()
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (offset == o, offset, o))
r.close()

# Test field dictionaries.

f = open("testF", "wb")
w = FieldWriter(f)
f2 = open("testFI", "wb")
w2 = FieldIndexWriter(f2)
wd = FieldDictionaryWriter(w, w2, 3)
for docnum, fields in doc_fields:
    wd.write_fields(docnum, list(enumerate(fields)))
wd.close()

f = open("testF", "rb")
r = FieldReader(f)
f2 = open("testFI", "rb")
r2 = FieldIndexReader(f2)
rd = FieldDictionaryReader(r, r2)

# Look documents up in reverse order; doc_fields itself is left untouched
# because it is used again afterwards.

doc_fields_reversed = doc_fields[::-1]
for docnum, fields in doc_fields_reversed:
    expected = dict(enumerate(fields))
    found = dict(rd.get_fields(docnum))
    print("%s %s %s" % (expected == found, expected, found))

# Document numbers that were never written are expected to yield None.

for docnum in (13579, 246810):
    found = rd.get_fields(docnum)
    print("%s %s" % (found is None, found))

# (Test sequential access.)
# Rewind the field dictionary reader and read every document in the order in
# which it was written.

rd.rewind()
for docnum, fields in doc_fields:
    expected = list(enumerate(fields))
    dn, found = rd.read_fields()
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == found, expected, found))
rd.close()

# Test terms.

terms = [
    # term        offset      frequency  doc_frequency
    ("aardvark", 100000123, 1, 1),
    ("anteater", 100000456, 2, 1),
    ("badger", 100000789, 13, 7),
    ("bull", 1000001234, 59, 17),
    ("bulldog", 1000002345, 99, 80),
    ("cat", 1000003456, 89, 28)
]

f = open("test", "wb")
w = TermWriter(f)
for term, offset, frequency, doc_frequency in terms:
    w.write_term(term, offset, frequency, doc_frequency)
w.close()

# Every value read back is compared with the value that was written.

f = open("test", "rb")
r = TermReader(f)
for term, offset, frequency, doc_frequency in terms:
    t, o, fr, df = r.read_term()
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))
    print("%s %s %s" % (doc_frequency == df, doc_frequency, df))
r.close()

# Test terms in index files.
# The index records carry one more value (info_offset) than the plain term
# records.

indexed_terms = [
    # term        offset      frequency  doc_frequency  info_offset
    ("aardvark", 100000123, 1, 1, 200000321),
    ("anteater", 100000456, 2, 1, 200000654),
    ("badger", 100000789, 13, 7, 200000987),
    ("bull", 1000001234, 59, 17, 200004321),
    ("bulldog", 1000002345, 99, 80, 200005432),
    ("cat", 1000003456, 89, 28, 200006543)
]

f = open("test", "wb")
w = TermIndexWriter(f)
for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
    w.write_term(term, offset, frequency, doc_frequency, info_offset)
w.close()

# Read the records back in writing order and compare all five values.

f = open("test", "rb")
r = TermIndexReader(f)
for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
    t, o, fr, df, i = r.read_term()
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))
    print("%s %s %s" % (doc_frequency == df, doc_frequency, df))
    print("%s %s %s" % (info_offset == i, info_offset, i))
r.close()

# Test dictionaries with only term data.
# Write the plain term records through a term dictionary writer. Position
# writers are supplied as well, although only _write_term is used here.

f = open("test", "wb")
w = TermWriter(f)
f2 = open("testI", "wb")
w2 = TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = PositionIndexWriter(f4)
wp = PositionDictionaryWriter(w3, w4, 2)
wd = TermDictionaryWriter(w, w2, wp, 3)
for term, offset, frequency, doc_frequency in terms:
    wd._write_term(term, offset, frequency, doc_frequency)
wd.close()

f = open("test", "rb")
r = TermReader(f)
f2 = open("testI", "rb")
r2 = TermIndexReader(f2)
r3 = PositionOpener("testP")
r4 = PositionIndexOpener("testPI")
rp = PositionDictionaryReader(r3, r4)
rd = TermDictionaryReader(r, r2, rp)

# Look the terms up in reverse order, working on a copy of the list.

terms_reversed = terms[:]
terms_reversed.reverse()
for term, offset, frequency, doc_frequency in terms_reversed:
    o, fr, df = rd._find_term(term)
    print("%s %s %s" % (offset == o, offset, o))
    print("%s %s %s" % (frequency == fr, frequency, fr))
    print("%s %s %s" % (doc_frequency == df, doc_frequency, df))

# Terms that were never written are expected to yield None.

for term in ("dog", "dingo"):
    found = rd._find_term(term)
    print("%s %s" % (found is None, found))

# (Test term prefix searching.)

# find_terms is called once for the comparison and once more for display.

prefix_tests = (
    ("a", ["aardvark", "anteater"]),
    ("bu", ["bull", "bulldog"]),
    ("c", ["cat"]),
    ("d", [])
)
for prefix, expected in prefix_tests:
    print("%s %s %s" % (rd.find_terms(prefix) == expected, rd.find_terms(prefix), expected))
rd.close()

# Test dictionaries with term and position data.
# (term, [(docnum, positions), ...]) records written through the dictionary.

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

# (term, docnum, expected positions) lookups; None marks a document number
# that does not occur in the data above for that term.

position_dict_tests = [
    ("badger", 19, [55, 1333]),
    ("badger", 20, None),
    ("bull", 6, [128]),
    ("bull", 26, [1, 3, 5, 7, 9]),
    ("cat", 111, None),
    ("cat", 123, [12, 145, 196]),
    ("cat", 1234, None)
]

f = open("test", "wb")
w = TermWriter(f)
f2 = open("testI", "wb")
w2 = TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = PositionIndexWriter(f4)
wp = PositionDictionaryWriter(w3, w4, 2)
wd = TermDictionaryWriter(w, w2, wp, 3)
for term, doc_positions in terms_with_positions:
    wd.write_term_positions(term, doc_positions)
wd.close()

f = open("test", "rb")
r = TermReader(f)
f2 = open("testI", "rb")
r2 = TermIndexReader(f2)
r3 = PositionOpener("testP")
r4 = PositionIndexOpener("testPI")
rp = PositionDictionaryReader(r3, r4)
rd = TermDictionaryReader(r, r2, rp)

# Look the terms up in reverse order, working on a copy of the list.

terms_reversed = terms_with_positions[:]
terms_reversed.reverse()
for term, doc_positions in terms_reversed:
    found = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == found, doc_positions, found))

# Terms that were never written are compared against an empty list.

for term in ("aaa", "dog", "dingo"):
    found = rd.find_positions(term)
    print("%s %s" % (found == [], found))

# (Test iterators.)

for term, docnum, positions in position_dict_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    # A missing document must match an expected value of None; otherwise the
    # positions produced must equal the expected list.
    if pos is None:
        matched = positions is None
    else:
        matched = positions == list(pos)
    print("%s %s %s" % (matched, positions, pos))

# (Test sequential access.)
# Rewind the term dictionary reader and read every term in writing order.

rd.rewind()
for term, doc_positions in terms_with_positions:
    t, fr, df, found = rd.read_term()
    found = list(found)
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (doc_positions == found, doc_positions, found))
rd.close()

# Test high-level index operations (including merging).

# (docnum, text) pairs; each text is split on whitespace into terms.

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected [(docnum, positions), ...]).

doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# (term, docnum, expected positions or None).

position_tests = [
    ("Every", 14, [0]),
    ("sea", 36, [2, 6]),
    ("shells", 1, None),
    ("shells", 37, None)
]

# (phrase terms, expected [(docnum, positions), ...]).

phrase_tests = [
    (["good", "boy"], [(2, [1, 2])]),
    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
    (["sea", "shore"], [(36, [6, 7])])
]

# Build the index: every word is added with its position in the text, and the
# whole text is stored as field 123.

index = Index("test_index")
wi = index.get_writer(3, 2, 6)
for docnum, text in docs:
    doc = Document(docnum)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

rd = index.get_reader()

# (Test searching.)

for term, frequency, doc_positions in doc_tests:
    found = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == found, doc_positions, found))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))

# (Test fields.)

for docnum, text in docs:
    stored = dict(rd.get_fields(docnum))
    print("%s %s %s" % (stored[123] == text, text, stored[123]))

# (Test navigation.)
# Jump to a given document within the positions found for a term.

for term, docnum, positions in position_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    # A missing document must match an expected value of None; otherwise the
    # positions produced must equal the expected list.
    if pos is None:
        matched = positions is None
    else:
        matched = positions == list(pos)
    print("%s %s %s" % (matched, positions, pos))

# (Test phrases.)

for phrase_terms, results in phrase_tests:
    found = list(rd.find_common_positions(phrase_terms))
    print("%s %s %s" % (results == found, results, found))

index.close()

# Test index updates.

index = Index("test_index")
index2 = Index("test_index2")
wi = index2.get_writer(3, 2, 6)
for docnum, text in docs:

    # Add the same documents but with different numbers.

    doc = Document(docnum + 100)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

index2.update([index])
index.close()

rd = index2.get_reader()
for term, frequency, doc_positions in doc_tests:

    # Add the extra documents to the expected result: the original entries
    # followed by the same entries renumbered by 100, with twice the frequency.

    renumbered = [(docnum + 100, positions) for docnum, positions in doc_positions]
    expected_positions = doc_positions + renumbered
    expected_frequency = frequency * 2

    found = list(rd.find_positions(term))
    print("%s %s %s" % (expected_positions == found, expected_positions, found))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (expected_frequency == fr, expected_frequency, fr))
index2.close()

# (Test update of an empty index.)

index = Index("test_index")
index3 = Index("test_index3")
index3.update([index])
index.close()

# The results are compared with the unmodified expectations in doc_tests.

rd = index3.get_reader()
for term, frequency, doc_positions in doc_tests:
    found = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == found, doc_positions, found))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))
index3.close()

# vim: tabstop=4 expandtab shiftwidth=4