iixr (file test.py at 74e2e30aabea)

     1 #!/usr/bin/env python     2      3 from iixr.files import *     4 from iixr.fields import *     5 from iixr.terms import *     6 from iixr.positions import *     7 from iixr.index import *     8 import os, sys     9     10 # Remove old test files.    11     12 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):    13     try:    14         os.remove(filename)    15     except OSError:    16         pass    17     18 try:    19     for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):    20         for filename in os.listdir(dirname):    21             os.remove(os.path.join(dirname, filename))    22         os.rmdir(dirname)    23 except OSError:    24     pass    25     26 if "clean" in sys.argv:    27     sys.exit(0)    28     29 print "- Test basic data types."    30     31 numbers = [12345678, 0, 1, 127, 128, 255, 256]    32     33 f = open("test", "wb")    34 w = FileWriter(f)    35 w.begin_record()    36 for number in numbers:    37     w.write_number(number)    38 w.end_record()    39 w.close()    40     41 f = open("test", "rb")    42 r = FileReader(f)    43 r.begin_record()    44 for number in numbers:    45     n = r.read_number()    46     print number == n, number, n    47 r.end_record()    48 r.close()    49     50 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)]    51     52 f = open("testMS", "wb")    53 w = FileWriter(f)    54 w.begin_record()    55 w.write_monotonic_sequence(tuples, 2)    56 w.end_record()    57 w.close()    58     59 f = open("testMS", "rb")    60 r = FileReader(f)    61 r.begin_record()    62 for t, t2 in zip(r.read_monotonic_sequence(2), tuples):    63     print t == t2, t, t2    64 r.end_record()    65 r.close()    66     67 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)]    68     69 f = open("testNMS", "wb")    70 w = FileWriter(f)    71 w.begin_record()    72 w.write_delta_sequence(tuples2, 2)    73 w.end_record()    74 w.close()    75     76 f = open("testNMS", "rb")    77 r = FileReader(f)    78 r.begin_record()    79 for t, t2 in zip(r.read_delta_sequence(2), tuples2):    80     print t == t2, t, t2    81 r.end_record()    82 r.close()    83     84 print "- Test positions."    85     86 all_doc_positions = [    87     [    88         (123, [1, 3, 5, 15, 25]),    89         (124, [0, 100]),    90         (125, [11, 99, 199]),    91         (130, [77, 78, 80, 82, 89])    92     ],    93     [    94         (78, [9]),    95         (196, [10, 11]),    96         (197, [17, 21, 30])    97     ]    98     ]    99    100 f = open("testP", "wb")   101 w = PositionWriter(f)   102 w.begin(0, 0)   103 for doc_positions in all_doc_positions:   104     w.reset()   105     for docnum, positions in doc_positions:   106         w.write_positions(docnum, positions)   107 w.close()   108    109 f = open("testP", "rb")   110 r = PositionReader(f)   111 for doc_positions in all_doc_positions:   112     r.reset()   113     for docnum, positions in doc_positions:   114         d, p = r.read_positions()   115         print docnum == d, docnum, d   116         print positions == p, positions, p   117 r.close()   118    119 all_doc_positions_seq = [   120     [   121         ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]),   122         ((124, 1), [(0, 0), (100, 350)]),   123         ((124, 2), [(11, 38), (99, 379), (199, 720)]),   124         ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)])   125     ],   126     [   127         ((78, 1), [(9, 19)]),   128         ((196, 0), [(10, 27), (11, 29)]),   129         ((196, 1), [(17, 46), (21, 52), (30, 60)])   130     ]   131     ]   132    133 f = open("testP2", "wb")   134 w = PositionWriter(f)   135 w.begin(2, 2)   136 for doc_positions in all_doc_positions_seq:   137     w.reset()   138     for docnum, positions in doc_positions:   139         w.write_positions(docnum, positions)   140 w.close()   141    142 f = open("testP2", "rb")   143 r = PositionReader(f)   144 for doc_positions in all_doc_positions_seq:   145     r.reset()   146     for docnum, positions in doc_positions:   147         d, p = r.read_positions()   148         print docnum == d, docnum, d   149         print positions == p, positions, p   150 r.close()   151    152 print "- Test position index files."   153    154 indexed_positions = [   155     [   156         (1234, 0, 100),   157         (2345, 700, 100),   158         (3456, 1900, 50)   159     ],   160     [   161         (4567, 2800, 20)   162     ]   163     ]   164    165 offsets = []   166 f = open("testPI", "wb")   167 w = PositionIndexWriter(f)   168 w.begin(0)   169 for term_positions in indexed_positions:   170     offset = None   171     doc_frequency = 0   172     w.reset()   173     for docnum, pos_offset, count in term_positions:   174         if offset is None:   175             offset = w.tell()   176         w.write_positions(docnum, pos_offset, count)   177         doc_frequency += count   178     offsets.append((offset, doc_frequency))   179 w.close()   180    181 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))   182 offsets.reverse()   183 indexed_positions.reverse()   184 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):   185     r.seek(offset, doc_frequency)   186     for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):   187         print docnum == dn, docnum, dn   188         print pos_offset == po, pos_offset, po   189         print count == c, count, c   190 r.reader.close()   191    192 print "- Test position dictionaries."   193    194 f = open("testP", "wb")   195 w = PositionWriter(f)   196 f2 = open("testPI", "wb")   197 w2 = PositionIndexWriter(f2)   198 wd = PositionDictionaryWriter(w, w2, 2)   199 offsets = []   200 for doc_positions in all_doc_positions:   201     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)   202     offsets.append((offset, doc_frequency))   203 wd.close()   204    205 r = PositionReader(open("testP", "rb"))   206 r2 = PositionIndexReader(open("testPI", "rb"))   207 rd = PositionDictionaryReader(r, r2)   208 offsets.reverse()   209 all_doc_positions.reverse()   210 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):   211     it = rd.read_term_positions(offset, doc_frequency)   212     dp = list(it)   213     print doc_positions == dp, doc_positions, dp   214 rd.close()   215    216 print "- Test fields."   217    218 doc_fields = [   219     (123, ["testing", "fields", "stored", "compressed"]),   220     (456, ["fields", "for a second", "document"]),   221     (789, ["field value"]),   222     (1234, []),   223     (2345, ["abc", "def"]),   224     (3456, ["apple", "banana", "cherry"]),   225     (4567, ["drue", "eple"])   226     ]   227    228 f = open("testF", "wb")   229 w = FieldWriter(f)   230 w.begin(0)   231 w.reset()   232 for docnum, fields in doc_fields:   233     w.write_fields(docnum, list(enumerate(fields)))   234 w.close()   235    236 f = open("testF", "rb")   237 r = FieldReader(f)   238 r.reset()   239 for docnum, fields in doc_fields:   240     dn, df = r.read_fields()   241     print docnum == dn, docnum, dn   242     print list(enumerate(fields)) == df, list(enumerate(fields)), df   243 r.close()   244    245 print "- Test field index files."   246    247 indexed_docs = [   248     (123, 100000987),   249     (456, 100004321),   250     (789, 100008765)   251     ]   252    253 f = open("testFI", "wb")   254 w = FieldIndexWriter(f)   255 w.begin(0)   256 w.reset()   257 for docnum, offset in indexed_docs:   258     w.write_document(docnum, offset)   259 w.close()   260    261 f = open("testFI", "rb")   262 r = FieldIndexReader(f)   263 r.reset()   264 for docnum, offset in indexed_docs:   265     dn, o = r.read_document()   266     print docnum == dn, docnum, dn   267     print offset == o, offset, o   268 r.close()   269    270 print "- Test field dictionaries."   271    272 f = open("testF", "wb")   273 w = FieldWriter(f)   274 f2 = open("testFI", "wb")   275 w2 = FieldIndexWriter(f2)   276 wd = FieldDictionaryWriter(w, w2, 3)   277 for docnum, fields in doc_fields:   278     wd.write_fields(docnum, list(enumerate(fields)))   279 wd.close()   280    281 f = open("testF", "rb")   282 r = FieldReader(f)   283 f2 = open("testFI", "rb")   284 r2 = FieldIndexReader(f2)   285 rd = FieldDictionaryReader(r, r2)   286 doc_fields_reversed = doc_fields[:]   287 doc_fields_reversed.reverse()   288 for docnum, fields in doc_fields_reversed:   289     df = dict(rd.get_fields(docnum))   290     print dict(enumerate(fields)) == df, dict(enumerate(fields)), df   291 for docnum in (13579, 246810):   292     df = rd.get_fields(docnum)   293     print df is None, df   294    295 print "- (Test sequential access.)"   296    297 rd.rewind()   298 for docnum, fields in doc_fields:   299     dn, df = rd.read_fields()   300     print docnum == dn, docnum, dn   301     print list(enumerate(fields)) == df, list(enumerate(fields)), df   302 rd.close()   303    304 print "- Test terms."   305    306 terms = [   307     # term       offset      frequency  doc_frequency   308     ("aardvark",  100000123,  1,         1),   309     ("anteater",  100000456,  2,         1),   310     ("badger",    100000789, 13,         7),   311     ("bull",     1000001234, 59,        17),   312     ("bulldog",  1000002345, 99,        80),   313     ("cat",      1000003456, 89,        28)   314     ]   315    316 f = open("test", "wb")   317 w = TermWriter(f)   318 w.reset()   319 for term, offset, frequency, doc_frequency in terms:   320     w.write_term(term, offset, frequency, doc_frequency)   321 w.close()   322    323 f = open("test", "rb")   324 r = TermReader(f)   325 r.reset()   326 for term, offset, frequency, doc_frequency in terms:   327     t, o, fr, df = r.read_term()   328     print term == t, term, t   329     print offset == o, offset, o   330     print frequency == fr, frequency, fr   331     print doc_frequency == df, doc_frequency, df   332 r.close()   333    334 print "- Test terms in index files."   335    336 indexed_terms = [   337     # term       offset      frequency  doc_frequency   info_offset   338     ("aardvark",  100000123,  1,         1,             200000321),   339     ("anteater",  100000456,  2,         1,             200000654),   340     ("badger",    100000789, 13,         7,             200000987),   341     ("bull",     1000001234, 59,        17,             200004321),   342     ("bulldog",  1000002345, 99,        80,             200005432),   343     ("cat",      1000003456, 89,        28,             200006543)   344     ]   345    346 f = open("test", "wb")   347 w = TermIndexWriter(f)   348 w.reset()   349 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   350     w.write_term(term, offset, frequency, doc_frequency, info_offset)   351 w.close()   352    353 f = open("test", "rb")   354 r = TermIndexReader(f)   355 r.reset()   356 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   357     t, o, fr, df, i = r.read_term()   358     print term == t, term, t   359     print offset == o, offset, o   360     print frequency == fr, frequency, fr   361     print doc_frequency == df, doc_frequency, df   362     print info_offset == i, info_offset, i   363 r.close()   364    365 print "- Test dictionaries with only term data."   366    367 f = open("test", "wb")   368 w = TermWriter(f)   369 f2 = open("testI", "wb")   370 w2 = TermIndexWriter(f2)   371 f3 = open("testP", "wb")   372 w3 = PositionWriter(f3)   373 f4 = open("testPI", "wb")   374 w4 = PositionIndexWriter(f4)   375 wp = PositionDictionaryWriter(w3, w4, 2)   376 wd = TermDictionaryWriter(w, w2, wp, 3)   377 for term, offset, frequency, doc_frequency in terms:   378     wd._write_term(term, offset, frequency, doc_frequency)   379 wd.close()   380    381 f = open("test", "rb")   382 r = TermReader(f)   383 f2 = open("testI", "rb")   384 r2 = TermIndexReader(f2)   385 r3 = PositionReader(open("testP", "rb"))   386 r4 = PositionIndexReader(open("testPI", "rb"))   387 rp = PositionDictionaryReader(r3, r4)   388 rd = TermDictionaryReader(r, r2, rp)   389 terms_reversed = terms[:]   390 terms_reversed.reverse()   391 for term, offset, frequency, doc_frequency in terms_reversed:   392     o, fr, df = rd._find_term(term)   393     print offset == o, offset, o   394     print frequency == fr, frequency, fr   395     print doc_frequency == df, doc_frequency, df   396 for term in ("dog", "dingo"):   397     t = rd._find_term(term)   398     print t is None, t   399    400 print "- (Test term prefix searching.)"   401    402 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]   403 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]   404 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]   405 print rd.find_terms("d") == [], rd.find_terms("d"), []   406 rd.close()   407    408 print "- Test dictionaries with term and position data."   409    410 terms_with_positions = [   411     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),   412     ("anteater",  [(1, [43, 44])]),   413     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),   414     ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),   415     ("bulldog",   [(43, [17, 19, 256, 512])]),   416     ("cat",       [(123, [12, 145, 196]), (1200, [113])])   417     ]   418    419 position_dict_tests = [   420     ("badger", 19, [55, 1333]),   421     ("badger", 20, None),   422     ("bull", 6, [128]),   423     ("bull", 26, [1, 3, 5, 7, 9]),   424     ("cat", 111, None),   425     ("cat", 123, [12, 145, 196]),   426     ("cat", 1234, None)   427     ]   428    429 f = open("test", "wb")   430 w = TermWriter(f)   431 f2 = open("testI", "wb")   432 w2 = TermIndexWriter(f2)   433 f3 = open("testP", "wb")   434 w3 = PositionWriter(f3)   435 f4 = open("testPI", "wb")   436 w4 = PositionIndexWriter(f4)   437 wp = PositionDictionaryWriter(w3, w4, 2)   438 wd = TermDictionaryWriter(w, w2, wp, 3)   439 for term, doc_positions in terms_with_positions:   440     wd.write_term_positions(term, doc_positions)   441 wd.close()   442    443 f = open("test", "rb")   444 r = TermReader(f)   445 f2 = open("testI", "rb")   446 r2 = TermIndexReader(f2)   447 r3 = PositionReader(open("testP", "rb"))   448 r4 = PositionIndexReader(open("testPI", "rb"))   449 rp = PositionDictionaryReader(r3, r4)   450 rd = TermDictionaryReader(r, r2, rp)   451 terms_reversed = terms_with_positions[:]   452 terms_reversed.reverse()   453 for term, doc_positions in terms_reversed:   454     dp = list(rd.find_positions(term))   455     print doc_positions == dp, doc_positions, dp   456 for term in ("aaa", "dog", "dingo"):   457     dp = rd.find_positions(term)   458     print dp == [], dp   459    460 print "- (Test iterators.)"   461    462 for term, docnum, positions in position_dict_tests:   463     dp = rd.find_positions(term)   464     pos = dp.from_document(docnum)   465     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   466    467 print "- (Test sequential access.)"   468    469 rd.rewind()   470 for term, doc_positions in terms_with_positions:   471     t, fr, df, dp = rd.read_term()   472     dp = list(dp)   473     print term == t, term, t   474     print doc_positions == dp, doc_positions, dp   475 rd.close()   476    477 print "- Test high-level index operations (including merging)."   478    479 docs = [   480     (1, "The cat sat on the mat"),   481     (2, "Every good boy deserves football"),   482     (13, "One good turn deserves another"),   483     (14, "Every man for himself"),   484     (25, "Red sky at night shepherd's delight"),   485     (36, "She sells sea shells on the sea shore")   486     ]   487    488 doc_tests = [   489     ("Every", 2, [(2, [0]), (14, [0])]),   490     ("good", 2, [(2, [1]), (13, [1])]),   491     ("deserves", 2, [(2, [3]), (13, [3])]),   492     ("sea", 2, [(36, [2, 6])])   493     ]   494    495 position_tests = [   496     ("Every", 14, [0]),   497     ("sea", 36, [2, 6]),   498     ("shells", 1, None),   499     ("shells", 37, None)   500     ]   501    502 phrase_tests = [   503     (["good", "boy"], [(2, [1, 2])]),   504     (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),   505     (["sea", "shore"], [(36, [6, 7])])   506     ]   507    508 index = Index("test_index", 3, 2, 3, 6)   509 wi = index.get_writer()   510 for docnum, text in docs:   511     doc = Document(docnum)   512     for position, term in enumerate(text.split()):   513         doc.add_position(term, position)   514     doc.add_field(123, text)   515     wi.add_document(doc)   516 wi.close()   517    518 rd = index.get_reader()   519    520 print "- (Test searching.)"   521    522 for term, frequency, doc_positions in doc_tests:   523     dp = list(rd.find_positions(term))   524     print doc_positions == dp, doc_positions, dp   525     fr = rd.get_frequency(term)   526     print frequency == fr, frequency, fr   527    528 print "- (Test fields.)"   529    530 for docnum, text in docs:   531     df = dict(rd.get_fields(docnum))   532     print df[123] == text, text, df[123]   533    534 print "- (Test navigation.)"   535    536 for term, docnum, positions in position_tests:   537     dp = rd.find_positions(term)   538     pos = dp.from_document(docnum)   539     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   540    541 print "- (Test phrases.)"   542    543 for terms, results in phrase_tests:   544     res = list(rd.find_common_positions(terms))   545     print results == res, results, res   546    547 index.close()   548    549 docs2 = [   550     ((1, 0), "The cat sat on the mat"),   551     ((1, 2), "Every good boy deserves football"),   552     ((13, 1), "One good turn deserves another"),   553     ((14, 0), "Every man for himself"),   554     ((14, 25), "Red sky at night shepherd's delight"),   555     ((36, 12), "She sells sea shells on the sea shore")   556     ]   557    558 doc_tests2 = [   559     ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),   560     ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),   561     ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),   562     ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])   563     ]   564    565 position_tests2 = [   566     ("Every", (14, 0), [(0, 0)]),   567     ("sea", (36, 12), [(2, 10), (6, 28)]),   568     ("shells", (1, 0), None),   569     ("shells", (37, 0), None)   570     ]   571    572 phrase_tests2 = [   573     (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),   574     (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),   575     (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])   576     ]   577    578 index = Index("test_indexT", 3, 2, 3, 6)   579 wi = index.get_writer()   580 for docnum, text in docs2:   581     doc = Document(docnum)   582     offset = 0   583     for position, term in enumerate(text.split()):   584         doc.add_position(term, (position, offset))   585         offset += len(term) + 1 # assume one space after the term   586     doc.add_field(123, text)   587     wi.add_document(doc)   588 wi.close()   589    590 rd = index.get_reader()   591    592 print "- (Test searching.)"   593    594 for term, frequency, doc_positions in doc_tests2:   595     dp = list(rd.find_positions(term))   596     print doc_positions == dp, doc_positions, dp   597     fr = rd.get_frequency(term)   598     print frequency == fr, frequency, fr   599    600 print "- (Test fields.)"   601    602 for docnum, text in docs2:   603     df = dict(rd.get_fields(docnum))   604     print df[123] == text, text, df[123]   605    606 print "- (Test navigation.)"   607    608 for term, docnum, positions in position_tests2:   609     dp = rd.find_positions(term)   610     pos = dp.from_document(docnum)   611     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   612    613 print "- (Test phrases.)"   614    615 for terms, results in phrase_tests2:   616     res = list(rd.find_common_positions(terms))   617     print results == res, results, res   618    619 index.close()   620    621 print "- Test index updates."   622    623 index = Index("test_index")   624 index2 = Index("test_index2", 3, 2, 3, 6)   625 wi = index2.get_writer()   626 for docnum, text in docs:   627    628     # Add the same documents but with different numbers.   629    630     doc = Document(docnum + 100)   631     for position, term in enumerate(text.split()):   632         doc.add_position(term, position)   633     doc.add_field(123, text)   634     wi.add_document(doc)   635 wi.close()   636    637 index2.update([index])   638 index.close()   639    640 rd = index2.get_reader()   641 for term, frequency, doc_positions in doc_tests:   642    643     # Add the extra documents to the expected result.   644    645     orig_doc_positions = doc_positions   646     doc_positions = doc_positions[:]   647    648     for docnum, positions in orig_doc_positions:   649         doc_positions.append((docnum + 100, positions))   650     frequency *= 2   651    652     dp = list(rd.find_positions(term))   653     print doc_positions == dp, doc_positions, dp   654     fr = rd.get_frequency(term)   655     print frequency == fr, frequency, fr   656 index2.close()   657    658 print "- (Test update of an empty index.)"   659    660 index = Index("test_index")   661 index3 = Index("test_index3")   662 index3.update([index])   663 index.close()   664    665 rd = index3.get_reader()   666 for term, frequency, doc_positions in doc_tests:   667     dp = list(rd.find_positions(term))   668     print doc_positions == dp, doc_positions, dp   669     fr = rd.get_frequency(term)   670     print frequency == fr, frequency, fr   671 index3.close()   672    673 # vim: tabstop=4 expandtab shiftwidth=4