iixr (file test.py at 1f3986bca1a3)

     1 #!/usr/bin/env python     2      3 from iixr.files import *     4 from iixr.fields import *     5 from iixr.terms import *     6 from iixr.positions import *     7 from iixr.index import *     8 import os, sys     9     10 # Remove old test files.    11     12 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):    13     try:    14         os.remove(filename)    15     except OSError:    16         pass    17     18 try:    19     for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):    20         for filename in os.listdir(dirname):    21             os.remove(os.path.join(dirname, filename))    22         os.rmdir(dirname)    23 except OSError:    24     pass    25     26 if "clean" in sys.argv:    27     sys.exit(0)    28     29 print "- Test basic data types."    30     31 numbers = [12345678, 0, 1, 127, 128, 255, 256]    32     33 f = open("test", "wb")    34 w = FileWriter(f)    35 w.begin_record()    36 for number in numbers:    37     w.write_number(number)    38 w.end_record()    39 w.close()    40     41 f = open("test", "rb")    42 r = FileReader(f)    43 r.begin_record()    44 for number in numbers:    45     n = r.read_number()    46     print number == n, number, n    47 r.end_record()    48 r.close()    49     50 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)]    51     52 f = open("testMS", "wb")    53 w = FileWriter(f)    54 w.begin_record()    55 w.write_monotonic_sequence(tuples)    56 w.end_record()    57 w.close()    58     59 f = open("testMS", "rb")    60 r = FileReader(f)    61 r.begin_record()    62 for t, t2 in zip(r.read_monotonic_sequence(), tuples):    63     print t == t2, t, t2    64 r.end_record()    65 r.close()    66     67 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)]    68     69 f = open("testNMS", "wb")    70 w = FileWriter(f)    71 w.begin_record()    72 w.write_delta_sequence(tuples2)    73 w.end_record()    74 w.close()    75     76 f = open("testNMS", "rb")    77 r = FileReader(f)    78 r.begin_record()    79 for t, t2 in zip(r.read_delta_sequence(), tuples2):    80     print t == t2, t, t2    81 r.end_record()    82 r.close()    83     84 print "- Test positions."    85     86 all_doc_positions = [    87     [    88         (123, [1, 3, 5, 15, 25]),    89         (124, [0, 100]),    90         (125, [11, 99, 199]),    91         (130, [77, 78, 80, 82, 89])    92     ],    93     [    94         (78, [9]),    95         (196, [10, 11]),    96         (197, [17, 21, 30])    97     ]    98     ]    99    100 f = open("testP", "wb")   101 w = PositionWriter(f)   102 for doc_positions in all_doc_positions:   103     for docnum, positions in doc_positions:   104         w.write_positions(docnum, positions)   105     w.reset()   106 w.close()   107    108 f = open("testP", "rb")   109 r = PositionReader(f)   110 for doc_positions in all_doc_positions:   111     for docnum, positions in doc_positions:   112         d, p = r.read_positions()   113         print docnum == d, docnum, d   114         print positions == p, positions, p   115     r.reset()   116 r.close()   117    118 all_doc_positions_seq = [   119     [   120         ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]),   121         ((124, 1), [(0, 0), (100, 350)]),   122         ((124, 2), [(11, 38), (99, 379), (199, 720)]),   123         ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)])   124     ],   125     [   126         ((78, 1), [(9, 19)]),   127         ((196, 0), [(10, 27), (11, 29)]),   128         ((196, 1), [(17, 46), (21, 52), (30, 60)])   129     ]   130     ]   131    132 f = open("testP2", "wb")   133 w = PositionWriter(f)   134 for doc_positions in all_doc_positions_seq:   135     for docnum, positions in doc_positions:   136         w.write_positions(docnum, positions)   137     w.reset()   138 w.close()   139    140 f = open("testP2", "rb")   141 r = PositionReader(f)   142 for doc_positions in all_doc_positions_seq:   143     for docnum, positions in doc_positions:   144         d, p = r.read_positions()   145         print docnum == d, docnum, d   146         print positions == p, positions, p   147     r.reset()   148 r.close()   149    150 print "- Test position index files."   151    152 indexed_positions = [   153     [   154         (1234, 0, 100),   155         (2345, 700, 100),   156         (3456, 1900, 50)   157     ],   158     [   159         (4567, 2800, 20)   160     ]   161     ]   162    163 offsets = []   164 f = open("testPI", "wb")   165 w = PositionIndexWriter(f)   166 for term_positions in indexed_positions:   167     offset = None   168     doc_frequency = 0   169     w.reset()   170     for docnum, pos_offset, count in term_positions:   171         if offset is None:   172             offset = w.tell()   173         w.write_positions(docnum, pos_offset, count)   174         doc_frequency += count   175     offsets.append((offset, doc_frequency))   176 w.close()   177    178 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))   179 offsets.reverse()   180 indexed_positions.reverse()   181 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):   182     r.seek(offset, doc_frequency)   183     for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):   184         print docnum == dn, docnum, dn   185         print pos_offset == po, pos_offset, po   186         print count == c, count, c   187 r.reader.close()   188    189 print "- Test position dictionaries."   190    191 f = open("testP", "wb")   192 w = PositionWriter(f)   193 f2 = open("testPI", "wb")   194 w2 = PositionIndexWriter(f2)   195 wd = PositionDictionaryWriter(w, w2, 2)   196 offsets = []   197 for doc_positions in all_doc_positions:   198     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)   199     offsets.append((offset, doc_frequency))   200 wd.close()   201    202 r = PositionReader(open("testP", "rb"))   203 r2 = PositionIndexReader(open("testPI", "rb"))   204 rd = PositionDictionaryReader(r, r2)   205 offsets.reverse()   206 all_doc_positions.reverse()   207 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):   208     it = rd.read_term_positions(offset, doc_frequency)   209     dp = list(it)   210     print doc_positions == dp, doc_positions, dp   211 rd.close()   212    213 print "- Test fields."   214    215 doc_fields = [   216     (123, ["testing", "fields", "stored", "compressed"]),   217     (456, ["fields", "for a second", "document"]),   218     (789, ["field value"]),   219     (1234, []),   220     (2345, ["abc", "def"]),   221     (3456, ["apple", "banana", "cherry"]),   222     (4567, ["drue", "eple"])   223     ]   224    225 f = open("testF", "wb")   226 w = FieldWriter(f)   227 for docnum, fields in doc_fields:   228     w.write_fields(docnum, list(enumerate(fields)))   229 w.close()   230    231 f = open("testF", "rb")   232 r = FieldReader(f)   233 for docnum, fields in doc_fields:   234     dn, df = r.read_fields()   235     print docnum == dn, docnum, dn   236     print list(enumerate(fields)) == df, list(enumerate(fields)), df   237 r.close()   238    239 print "- Test field index files."   240    241 indexed_docs = [   242     (123, 100000987),   243     (456, 100004321),   244     (789, 100008765)   245     ]   246    247 f = open("testFI", "wb")   248 w = FieldIndexWriter(f)   249 for docnum, offset in indexed_docs:   250     w.write_document(docnum, offset)   251 w.close()   252    253 f = open("testFI", "rb")   254 r = FieldIndexReader(f)   255 for docnum, offset in indexed_docs:   256     dn, o = r.read_document()   257     print docnum == dn, docnum, dn   258     print offset == o, offset, o   259 r.close()   260    261 print "- Test field dictionaries."   262    263 f = open("testF", "wb")   264 w = FieldWriter(f)   265 f2 = open("testFI", "wb")   266 w2 = FieldIndexWriter(f2)   267 wd = FieldDictionaryWriter(w, w2, 3)   268 for docnum, fields in doc_fields:   269     wd.write_fields(docnum, list(enumerate(fields)))   270 wd.close()   271    272 f = open("testF", "rb")   273 r = FieldReader(f)   274 f2 = open("testFI", "rb")   275 r2 = FieldIndexReader(f2)   276 rd = FieldDictionaryReader(r, r2)   277 doc_fields_reversed = doc_fields[:]   278 doc_fields_reversed.reverse()   279 for docnum, fields in doc_fields_reversed:   280     df = dict(rd.get_fields(docnum))   281     print dict(enumerate(fields)) == df, dict(enumerate(fields)), df   282 for docnum in (13579, 246810):   283     df = rd.get_fields(docnum)   284     print df is None, df   285    286 print "- (Test sequential access.)"   287    288 rd.rewind()   289 for docnum, fields in doc_fields:   290     dn, df = rd.read_fields()   291     print docnum == dn, docnum, dn   292     print list(enumerate(fields)) == df, list(enumerate(fields)), df   293 rd.close()   294    295 print "- Test terms."   296    297 terms = [   298     # term       offset      frequency  doc_frequency   299     ("aardvark",  100000123,  1,         1),   300     ("anteater",  100000456,  2,         1),   301     ("badger",    100000789, 13,         7),   302     ("bull",     1000001234, 59,        17),   303     ("bulldog",  1000002345, 99,        80),   304     ("cat",      1000003456, 89,        28)   305     ]   306    307 f = open("test", "wb")   308 w = TermWriter(f)   309 for term, offset, frequency, doc_frequency in terms:   310     w.write_term(term, offset, frequency, doc_frequency)   311 w.close()   312    313 f = open("test", "rb")   314 r = TermReader(f)   315 for term, offset, frequency, doc_frequency in terms:   316     t, o, fr, df = r.read_term()   317     print term == t, term, t   318     print offset == o, offset, o   319     print frequency == fr, frequency, fr   320     print doc_frequency == df, doc_frequency, df   321 r.close()   322    323 print "- Test terms in index files."   324    325 indexed_terms = [   326     # term       offset      frequency  doc_frequency   info_offset   327     ("aardvark",  100000123,  1,         1,             200000321),   328     ("anteater",  100000456,  2,         1,             200000654),   329     ("badger",    100000789, 13,         7,             200000987),   330     ("bull",     1000001234, 59,        17,             200004321),   331     ("bulldog",  1000002345, 99,        80,             200005432),   332     ("cat",      1000003456, 89,        28,             200006543)   333     ]   334    335 f = open("test", "wb")   336 w = TermIndexWriter(f)   337 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   338     w.write_term(term, offset, frequency, doc_frequency, info_offset)   339 w.close()   340    341 f = open("test", "rb")   342 r = TermIndexReader(f)   343 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   344     t, o, fr, df, i = r.read_term()   345     print term == t, term, t   346     print offset == o, offset, o   347     print frequency == fr, frequency, fr   348     print doc_frequency == df, doc_frequency, df   349     print info_offset == i, info_offset, i   350 r.close()   351    352 print "- Test dictionaries with only term data."   353    354 f = open("test", "wb")   355 w = TermWriter(f)   356 f2 = open("testI", "wb")   357 w2 = TermIndexWriter(f2)   358 f3 = open("testP", "wb")   359 w3 = PositionWriter(f3)   360 f4 = open("testPI", "wb")   361 w4 = PositionIndexWriter(f4)   362 wp = PositionDictionaryWriter(w3, w4, 2)   363 wd = TermDictionaryWriter(w, w2, wp, 3)   364 for term, offset, frequency, doc_frequency in terms:   365     wd._write_term(term, offset, frequency, doc_frequency)   366 wd.close()   367    368 f = open("test", "rb")   369 r = TermReader(f)   370 f2 = open("testI", "rb")   371 r2 = TermIndexReader(f2)   372 r3 = PositionReader(open("testP", "rb"))   373 r4 = PositionIndexReader(open("testPI", "rb"))   374 rp = PositionDictionaryReader(r3, r4)   375 rd = TermDictionaryReader(r, r2, rp)   376 terms_reversed = terms[:]   377 terms_reversed.reverse()   378 for term, offset, frequency, doc_frequency in terms_reversed:   379     o, fr, df = rd._find_term(term)   380     print offset == o, offset, o   381     print frequency == fr, frequency, fr   382     print doc_frequency == df, doc_frequency, df   383 for term in ("dog", "dingo"):   384     t = rd._find_term(term)   385     print t is None, t   386    387 print "- (Test term prefix searching.)"   388    389 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]   390 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]   391 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]   392 print rd.find_terms("d") == [], rd.find_terms("d"), []   393 rd.close()   394    395 print "- Test dictionaries with term and position data."   396    397 terms_with_positions = [   398     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),   399     ("anteater",  [(1, [43, 44])]),   400     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),   401     ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),   402     ("bulldog",   [(43, [17, 19, 256, 512])]),   403     ("cat",       [(123, [12, 145, 196]), (1200, [113])])   404     ]   405    406 position_dict_tests = [   407     ("badger", 19, [55, 1333]),   408     ("badger", 20, None),   409     ("bull", 6, [128]),   410     ("bull", 26, [1, 3, 5, 7, 9]),   411     ("cat", 111, None),   412     ("cat", 123, [12, 145, 196]),   413     ("cat", 1234, None)   414     ]   415    416 f = open("test", "wb")   417 w = TermWriter(f)   418 f2 = open("testI", "wb")   419 w2 = TermIndexWriter(f2)   420 f3 = open("testP", "wb")   421 w3 = PositionWriter(f3)   422 f4 = open("testPI", "wb")   423 w4 = PositionIndexWriter(f4)   424 wp = PositionDictionaryWriter(w3, w4, 2)   425 wd = TermDictionaryWriter(w, w2, wp, 3)   426 for term, doc_positions in terms_with_positions:   427     wd.write_term_positions(term, doc_positions)   428 wd.close()   429    430 f = open("test", "rb")   431 r = TermReader(f)   432 f2 = open("testI", "rb")   433 r2 = TermIndexReader(f2)   434 r3 = PositionReader(open("testP", "rb"))   435 r4 = PositionIndexReader(open("testPI", "rb"))   436 rp = PositionDictionaryReader(r3, r4)   437 rd = TermDictionaryReader(r, r2, rp)   438 terms_reversed = terms_with_positions[:]   439 terms_reversed.reverse()   440 for term, doc_positions in terms_reversed:   441     dp = list(rd.find_positions(term))   442     print doc_positions == dp, doc_positions, dp   443 for term in ("aaa", "dog", "dingo"):   444     dp = rd.find_positions(term)   445     print dp == [], dp   446    447 print "- (Test iterators.)"   448    449 for term, docnum, positions in position_dict_tests:   450     dp = rd.find_positions(term)   451     pos = dp.from_document(docnum)   452     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   453    454 print "- (Test sequential access.)"   455    456 rd.rewind()   457 for term, doc_positions in terms_with_positions:   458     t, fr, df, dp = rd.read_term()   459     dp = list(dp)   460     print term == t, term, t   461     print doc_positions == dp, doc_positions, dp   462 rd.close()   463    464 print "- Test high-level index operations (including merging)."   465    466 docs = [   467     (1, "The cat sat on the mat"),   468     (2, "Every good boy deserves football"),   469     (13, "One good turn deserves another"),   470     (14, "Every man for himself"),   471     (25, "Red sky at night shepherd's delight"),   472     (36, "She sells sea shells on the sea shore")   473     ]   474    475 doc_tests = [   476     ("Every", 2, [(2, [0]), (14, [0])]),   477     ("good", 2, [(2, [1]), (13, [1])]),   478     ("deserves", 2, [(2, [3]), (13, [3])]),   479     ("sea", 2, [(36, [2, 6])])   480     ]   481    482 position_tests = [   483     ("Every", 14, [0]),   484     ("sea", 36, [2, 6]),   485     ("shells", 1, None),   486     ("shells", 37, None)   487     ]   488    489 phrase_tests = [   490     (["good", "boy"], [(2, [1, 2])]),   491     (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),   492     (["sea", "shore"], [(36, [6, 7])])   493     ]   494    495 index = Index("test_index", 3, 2, 3, 6)   496 wi = index.get_writer()   497 for docnum, text in docs:   498     doc = Document(docnum)   499     for position, term in enumerate(text.split()):   500         doc.add_position(term, position)   501     doc.add_field(123, text)   502     wi.add_document(doc)   503 wi.close()   504    505 rd = index.get_reader()   506    507 print "- (Test searching.)"   508    509 for term, frequency, doc_positions in doc_tests:   510     dp = list(rd.find_positions(term))   511     print doc_positions == dp, doc_positions, dp   512     fr = rd.get_frequency(term)   513     print frequency == fr, frequency, fr   514    515 print "- (Test fields.)"   516    517 for docnum, text in docs:   518     df = dict(rd.get_fields(docnum))   519     print df[123] == text, text, df[123]   520    521 print "- (Test navigation.)"   522    523 for term, docnum, positions in position_tests:   524     dp = rd.find_positions(term)   525     pos = dp.from_document(docnum)   526     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   527    528 print "- (Test phrases.)"   529    530 for terms, results in phrase_tests:   531     res = list(rd.find_common_positions(terms))   532     print results == res, results, res   533    534 index.close()   535    536 docs2 = [   537     ((1, 0), "The cat sat on the mat"),   538     ((1, 2), "Every good boy deserves football"),   539     ((13, 1), "One good turn deserves another"),   540     ((14, 0), "Every man for himself"),   541     ((14, 25), "Red sky at night shepherd's delight"),   542     ((36, 12), "She sells sea shells on the sea shore")   543     ]   544    545 doc_tests2 = [   546     ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),   547     ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),   548     ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),   549     ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])   550     ]   551    552 position_tests2 = [   553     ("Every", (14, 0), [(0, 0)]),   554     ("sea", (36, 12), [(2, 10), (6, 28)]),   555     ("shells", (1, 0), None),   556     ("shells", (37, 0), None)   557     ]   558    559 phrase_tests2 = [   560     (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),   561     (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),   562     (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])   563     ]   564    565 index = Index("test_indexT", 3, 2, 3, 6)   566 wi = index.get_writer()   567 for docnum, text in docs2:   568     doc = Document(docnum)   569     offset = 0   570     for position, term in enumerate(text.split()):   571         doc.add_position(term, (position, offset))   572         offset += len(term) + 1 # assume one space after the term   573     doc.add_field(123, text)   574     wi.add_document(doc)   575 wi.close()   576    577 rd = index.get_reader()   578    579 print "- (Test searching.)"   580    581 for term, frequency, doc_positions in doc_tests2:   582     dp = list(rd.find_positions(term))   583     print doc_positions == dp, doc_positions, dp   584     fr = rd.get_frequency(term)   585     print frequency == fr, frequency, fr   586    587 print "- (Test fields.)"   588    589 for docnum, text in docs2:   590     df = dict(rd.get_fields(docnum))   591     print df[123] == text, text, df[123]   592    593 print "- (Test navigation.)"   594    595 for term, docnum, positions in position_tests2:   596     dp = rd.find_positions(term)   597     pos = dp.from_document(docnum)   598     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   599    600 print "- (Test phrases.)"   601    602 for terms, results in phrase_tests2:   603     res = list(rd.find_common_positions(terms))   604     print results == res, results, res   605    606 index.close()   607    608 print "- Test index updates."   609    610 index = Index("test_index")   611 index2 = Index("test_index2", 3, 2, 3, 6)   612 wi = index2.get_writer()   613 for docnum, text in docs:   614    615     # Add the same documents but with different numbers.   616    617     doc = Document(docnum + 100)   618     for position, term in enumerate(text.split()):   619         doc.add_position(term, position)   620     doc.add_field(123, text)   621     wi.add_document(doc)   622 wi.close()   623    624 index2.update([index])   625 index.close()   626    627 rd = index2.get_reader()   628 for term, frequency, doc_positions in doc_tests:   629    630     # Add the extra documents to the expected result.   631    632     orig_doc_positions = doc_positions   633     doc_positions = doc_positions[:]   634    635     for docnum, positions in orig_doc_positions:   636         doc_positions.append((docnum + 100, positions))   637     frequency *= 2   638    639     dp = list(rd.find_positions(term))   640     print doc_positions == dp, doc_positions, dp   641     fr = rd.get_frequency(term)   642     print frequency == fr, frequency, fr   643 index2.close()   644    645 print "- (Test update of an empty index.)"   646    647 index = Index("test_index")   648 index3 = Index("test_index3")   649 index3.update([index])   650 index.close()   651    652 rd = index3.get_reader()   653 for term, frequency, doc_positions in doc_tests:   654     dp = list(rd.find_positions(term))   655     print doc_positions == dp, doc_positions, dp   656     fr = rd.get_frequency(term)   657     print frequency == fr, frequency, fr   658 index3.close()   659    660 # vim: tabstop=4 expandtab shiftwidth=4