1 #!/usr/bin/env python 2 3 from iixr.files import * 4 from iixr.fields import * 5 from iixr.terms import * 6 from iixr.positions import * 7 from iixr.index import * 8 import os, sys 9 10 # Remove old test files. 11 12 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"): 13 try: 14 os.remove(filename) 15 except OSError: 16 pass 17 18 try: 19 for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"): 20 for filename in os.listdir(dirname): 21 os.remove(os.path.join(dirname, filename)) 22 os.rmdir(dirname) 23 except OSError: 24 pass 25 26 if "clean" in sys.argv: 27 sys.exit(0) 28 29 print "- Test basic data types." 30 31 numbers = [12345678, 0, 1, 127, 128, 255, 256] 32 33 f = open("test", "wb") 34 w = FileWriter(f) 35 for number in numbers: 36 w.write_number(number) 37 w.close() 38 39 f = open("test", "rb") 40 r = FileReader(f) 41 for number in numbers: 42 n = r.read_number() 43 print number == n, number, n 44 r.close() 45 46 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] 47 48 f = open("testMS", "wb") 49 w = FileWriter(f) 50 last = w.get_initial_value(2) 51 for t in tuples: 52 last = w.write_sequence(t, last, 2) 53 w.close() 54 55 f = open("testMS", "rb") 56 r = FileReader(f) 57 last = r.get_initial_value(2) 58 for t in tuples: 59 last = t2 = r.read_sequence(last, 2) 60 print t == t2, t, t2 61 r.close() 62 63 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] 64 65 f = open("testNMS", "wb") 66 w = FileWriter(f) 67 last = w.get_initial_value(2) 68 for t in tuples2: 69 last = w.write_sequence(t, last, 2, monotonic=0) 70 w.close() 71 72 f = open("testNMS", "rb") 73 r = FileReader(f) 74 last = r.get_initial_value(2) 75 for t in tuples2: 76 last = t2 = r.read_sequence(last, 2, monotonic=0) 77 print t == t2, t, t2 78 r.close() 79 80 print "- Test positions." 
81 82 all_doc_positions = [ 83 [ 84 (123, [1, 3, 5, 15, 25]), 85 (124, [0, 100]), 86 (125, [11, 99, 199]), 87 (130, [77, 78, 80, 82, 89]) 88 ], 89 [ 90 (78, [9]), 91 (196, [10, 11]), 92 (197, [17, 21, 30]) 93 ] 94 ] 95 96 f = open("testP", "wb") 97 w = PositionWriter(f) 98 for doc_positions in all_doc_positions: 99 for docnum, positions in doc_positions: 100 w.write_positions(docnum, positions) 101 w.reset() 102 w.close() 103 104 f = open("testP", "rb") 105 r = PositionReader(f) 106 for doc_positions in all_doc_positions: 107 for docnum, positions in doc_positions: 108 d, p = r.read_positions() 109 print docnum == d, docnum, d 110 print positions == p, positions, p 111 r.reset() 112 r.close() 113 114 all_doc_positions_seq = [ 115 [ 116 ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), 117 ((124, 1), [(0, 0), (100, 350)]), 118 ((124, 2), [(11, 38), (99, 379), (199, 720)]), 119 ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) 120 ], 121 [ 122 ((78, 1), [(9, 19)]), 123 ((196, 0), [(10, 27), (11, 29)]), 124 ((196, 1), [(17, 46), (21, 52), (30, 60)]) 125 ] 126 ] 127 128 f = open("testP2", "wb") 129 w = PositionWriter(f) 130 for doc_positions in all_doc_positions_seq: 131 for docnum, positions in doc_positions: 132 w.write_positions(docnum, positions) 133 w.reset() 134 w.close() 135 136 f = open("testP2", "rb") 137 r = PositionReader(f) 138 for doc_positions in all_doc_positions_seq: 139 for docnum, positions in doc_positions: 140 d, p = r.read_positions() 141 print tuple(docnum) == tuple(d), docnum, d 142 print tuple(positions) == tuple(p), positions, p 143 r.reset() 144 r.close() 145 146 print "- Test position index files." 
147 148 indexed_positions = [ 149 [ 150 (1234, 0, 100), 151 (2345, 700, 100), 152 (3456, 1900, 50) 153 ], 154 [ 155 (4567, 2800, 20) 156 ] 157 ] 158 159 offsets = [] 160 f = open("testPI", "wb") 161 w = PositionIndexWriter(f) 162 for term_positions in indexed_positions: 163 offset = None 164 doc_frequency = 0 165 w.reset() 166 for docnum, pos_offset, count in term_positions: 167 if offset is None: 168 offset = w.tell() 169 w.write_positions(docnum, pos_offset, count) 170 doc_frequency += count 171 offsets.append((offset, doc_frequency)) 172 w.close() 173 174 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb"))) 175 offsets.reverse() 176 indexed_positions.reverse() 177 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions): 178 r.seek(offset, doc_frequency) 179 for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r): 180 print docnum == dn, docnum, dn 181 print pos_offset == po, pos_offset, po 182 print count == c, count, c 183 r.reader.close() 184 185 print "- Test position dictionaries." 186 187 f = open("testP", "wb") 188 w = PositionWriter(f) 189 f2 = open("testPI", "wb") 190 w2 = PositionIndexWriter(f2) 191 wd = PositionDictionaryWriter(w, w2, 2) 192 offsets = [] 193 for doc_positions in all_doc_positions: 194 offset, frequency, doc_frequency = wd.write_term_positions(doc_positions) 195 offsets.append((offset, doc_frequency)) 196 wd.close() 197 198 r = PositionReader(open("testP", "rb")) 199 r2 = PositionIndexReader(open("testPI", "rb")) 200 rd = PositionDictionaryReader(r, r2) 201 offsets.reverse() 202 all_doc_positions.reverse() 203 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions): 204 it = rd.read_term_positions(offset, doc_frequency) 205 dp = list(it) 206 print doc_positions == dp, doc_positions, dp 207 rd.close() 208 209 print "- Test fields." 
210 211 doc_fields = [ 212 (123, ["testing", "fields", "stored", "compressed"]), 213 (456, ["fields", "for a second", "document"]), 214 (789, ["field value"]), 215 (1234, []), 216 (2345, ["abc", "def"]), 217 (3456, ["apple", "banana", "cherry"]), 218 (4567, ["drue", "eple"]) 219 ] 220 221 f = open("testF", "wb") 222 w = FieldWriter(f) 223 for docnum, fields in doc_fields: 224 w.write_fields(docnum, list(enumerate(fields))) 225 w.close() 226 227 f = open("testF", "rb") 228 r = FieldReader(f) 229 for docnum, fields in doc_fields: 230 dn, df = r.read_fields() 231 print docnum == dn, docnum, dn 232 print list(enumerate(fields)) == df, list(enumerate(fields)), df 233 r.close() 234 235 print "- Test field index files." 236 237 indexed_docs = [ 238 (123, 100000987), 239 (456, 100004321), 240 (789, 100008765) 241 ] 242 243 f = open("testFI", "wb") 244 w = FieldIndexWriter(f) 245 for docnum, offset in indexed_docs: 246 w.write_document(docnum, offset) 247 w.close() 248 249 f = open("testFI", "rb") 250 r = FieldIndexReader(f) 251 for docnum, offset in indexed_docs: 252 dn, o = r.read_document() 253 print docnum == dn, docnum, dn 254 print offset == o, offset, o 255 r.close() 256 257 print "- Test field dictionaries." 
258 259 f = open("testF", "wb") 260 w = FieldWriter(f) 261 f2 = open("testFI", "wb") 262 w2 = FieldIndexWriter(f2) 263 wd = FieldDictionaryWriter(w, w2, 3) 264 for docnum, fields in doc_fields: 265 wd.write_fields(docnum, list(enumerate(fields))) 266 wd.close() 267 268 f = open("testF", "rb") 269 r = FieldReader(f) 270 f2 = open("testFI", "rb") 271 r2 = FieldIndexReader(f2) 272 rd = FieldDictionaryReader(r, r2) 273 doc_fields_reversed = doc_fields[:] 274 doc_fields_reversed.reverse() 275 for docnum, fields in doc_fields_reversed: 276 df = dict(rd.get_fields(docnum)) 277 print dict(enumerate(fields)) == df, dict(enumerate(fields)), df 278 for docnum in (13579, 246810): 279 df = rd.get_fields(docnum) 280 print df is None, df 281 282 print "- (Test sequential access.)" 283 284 rd.rewind() 285 for docnum, fields in doc_fields: 286 dn, df = rd.read_fields() 287 print docnum == dn, docnum, dn 288 print list(enumerate(fields)) == df, list(enumerate(fields)), df 289 rd.close() 290 291 print "- Test terms." 292 293 terms = [ 294 # term offset frequency doc_frequency 295 ("aardvark", 100000123, 1, 1), 296 ("anteater", 100000456, 2, 1), 297 ("badger", 100000789, 13, 7), 298 ("bull", 1000001234, 59, 17), 299 ("bulldog", 1000002345, 99, 80), 300 ("cat", 1000003456, 89, 28) 301 ] 302 303 f = open("test", "wb") 304 w = TermWriter(f) 305 for term, offset, frequency, doc_frequency in terms: 306 w.write_term(term, offset, frequency, doc_frequency) 307 w.close() 308 309 f = open("test", "rb") 310 r = TermReader(f) 311 for term, offset, frequency, doc_frequency in terms: 312 t, o, fr, df = r.read_term() 313 print term == t, term, t 314 print offset == o, offset, o 315 print frequency == fr, frequency, fr 316 print doc_frequency == df, doc_frequency, df 317 r.close() 318 319 print "- Test terms in index files." 
320 321 indexed_terms = [ 322 # term offset frequency doc_frequency info_offset 323 ("aardvark", 100000123, 1, 1, 200000321), 324 ("anteater", 100000456, 2, 1, 200000654), 325 ("badger", 100000789, 13, 7, 200000987), 326 ("bull", 1000001234, 59, 17, 200004321), 327 ("bulldog", 1000002345, 99, 80, 200005432), 328 ("cat", 1000003456, 89, 28, 200006543) 329 ] 330 331 f = open("test", "wb") 332 w = TermIndexWriter(f) 333 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 334 w.write_term(term, offset, frequency, doc_frequency, info_offset) 335 w.close() 336 337 f = open("test", "rb") 338 r = TermIndexReader(f) 339 for term, offset, frequency, doc_frequency, info_offset in indexed_terms: 340 t, o, fr, df, i = r.read_term() 341 print term == t, term, t 342 print offset == o, offset, o 343 print frequency == fr, frequency, fr 344 print doc_frequency == df, doc_frequency, df 345 print info_offset == i, info_offset, i 346 r.close() 347 348 print "- Test dictionaries with only term data." 
349 350 f = open("test", "wb") 351 w = TermWriter(f) 352 f2 = open("testI", "wb") 353 w2 = TermIndexWriter(f2) 354 f3 = open("testP", "wb") 355 w3 = PositionWriter(f3) 356 f4 = open("testPI", "wb") 357 w4 = PositionIndexWriter(f4) 358 wp = PositionDictionaryWriter(w3, w4, 2) 359 wd = TermDictionaryWriter(w, w2, wp, 3) 360 for term, offset, frequency, doc_frequency in terms: 361 wd._write_term(term, offset, frequency, doc_frequency) 362 wd.close() 363 364 f = open("test", "rb") 365 r = TermReader(f) 366 f2 = open("testI", "rb") 367 r2 = TermIndexReader(f2) 368 r3 = PositionReader(open("testP", "rb")) 369 r4 = PositionIndexReader(open("testPI", "rb")) 370 rp = PositionDictionaryReader(r3, r4) 371 rd = TermDictionaryReader(r, r2, rp) 372 terms_reversed = terms[:] 373 terms_reversed.reverse() 374 for term, offset, frequency, doc_frequency in terms_reversed: 375 o, fr, df = rd._find_term(term) 376 print offset == o, offset, o 377 print frequency == fr, frequency, fr 378 print doc_frequency == df, doc_frequency, df 379 for term in ("dog", "dingo"): 380 t = rd._find_term(term) 381 print t is None, t 382 383 print "- (Test term prefix searching.)" 384 385 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"] 386 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"] 387 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"] 388 print rd.find_terms("d") == [], rd.find_terms("d"), [] 389 rd.close() 390 391 print "- Test dictionaries with term and position data." 
# Terms with full position postings: each entry maps a term to its
# (docnum, positions) postings list.

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

# (term, docnum, expected positions) — None marks a document that should
# not appear in the term's postings.

position_dict_tests = [
    ("badger", 19, [55, 1333]),
    ("badger", 20, None),
    ("bull", 6, [128]),
    ("bull", 26, [1, 3, 5, 7, 9]),
    ("cat", 111, None),
    ("cat", 123, [12, 145, 196]),
    ("cat", 1234, None)
]

# Write the terms and their positions through a term dictionary backed
# by term, term-index, position and position-index files.

f = open("test", "wb")
w = TermWriter(f)
f2 = open("testI", "wb")
w2 = TermIndexWriter(f2)
f3 = open("testP", "wb")
w3 = PositionWriter(f3)
f4 = open("testPI", "wb")
w4 = PositionIndexWriter(f4)
wp = PositionDictionaryWriter(w3, w4, 2)
wd = TermDictionaryWriter(w, w2, wp, 3)
for term, doc_positions in terms_with_positions:
    wd.write_term_positions(term, doc_positions)
wd.close()

# Read the postings back in reverse order to exercise random access.

f = open("test", "rb")
r = TermReader(f)
f2 = open("testI", "rb")
r2 = TermIndexReader(f2)
r3 = PositionReader(open("testP", "rb"))
r4 = PositionIndexReader(open("testPI", "rb"))
rp = PositionDictionaryReader(r3, r4)
rd = TermDictionaryReader(r, r2, rp)
terms_reversed = terms_with_positions[:]
terms_reversed.reverse()
for term, doc_positions in terms_reversed:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp

# Unknown terms should produce empty postings.

for term in ("aaa", "dog", "dingo"):
    dp = rd.find_positions(term)
    print dp == [], dp

print "- (Test iterators.)"

# from_document(docnum) navigates the postings iterator to the given
# document, returning its positions, or None when the document is absent.

for term, docnum, positions in position_dict_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos

print "- (Test sequential access.)"

# After rewind(), read_term() yields each term with its frequency,
# document frequency and postings in stored order.

rd.rewind()
for term, doc_positions in terms_with_positions:
    t, fr, df, dp = rd.read_term()
    dp = list(dp)
    print term == t, term, t
    print doc_positions == dp, doc_positions, dp
rd.close()

print "- Test high-level index operations (including merging)."

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected (docnum, word positions) postings).

doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# (term, docnum, expected positions or None when absent).

position_tests = [
    ("Every", 14, [0]),
    ("sea", 36, [2, 6]),
    ("shells", 1, None),
    ("shells", 37, None)
]

# (terms, expected postings); the expected position runs are consecutive
# word positions within each matching document.

phrase_tests = [
    (["good", "boy"], [(2, [1, 2])]),
    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
    (["sea", "shore"], [(36, [6, 7])])
]

# Index the documents, recording each word's position within its text
# and storing the full text as field 123.

index = Index("test_index", 3, 2, 3, 6)
wi = index.get_writer()
for docnum, text in docs:
    doc = Document(docnum)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

rd = index.get_reader()

print "- (Test searching.)"

for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr

print "- (Test fields.)"

# Stored field 123 should round-trip the original text.

for docnum, text in docs:
    df = dict(rd.get_fields(docnum))
    print df[123] == text, text, df[123]

print "- (Test navigation.)"

for term, docnum, positions in position_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
print "- (Test phrases.)"

# find_common_positions(terms) yields documents containing all terms,
# with the matching position runs (see phrase_tests above for shape).

for terms, results in phrase_tests:
    res = list(rd.find_common_positions(terms))
    print results == res, results, res

index.close()

# The same documents again, but document numbers are pairs and each word
# position is a (word index, character offset) pair; the offsets in the
# expected data match the offset computation in the writer loop below.

docs2 = [
    ((1, 0), "The cat sat on the mat"),
    ((1, 2), "Every good boy deserves football"),
    ((13, 1), "One good turn deserves another"),
    ((14, 0), "Every man for himself"),
    ((14, 25), "Red sky at night shepherd's delight"),
    ((36, 12), "She sells sea shells on the sea shore")
]

doc_tests2 = [
    ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
    ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
    ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
    ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
]

position_tests2 = [
    ("Every", (14, 0), [(0, 0)]),
    ("sea", (36, 12), [(2, 10), (6, 28)]),
    ("shells", (1, 0), None),
    ("shells", (37, 0), None)
]

phrase_tests2 = [
    (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
    (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
    (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
]

# Repeat the high-level tests with pair-valued docnums and positions.

index = Index("test_indexT", 3, 2, 3, 6)
wi = index.get_writer()
for docnum, text in docs2:
    doc = Document(docnum)
    offset = 0
    for position, term in enumerate(text.split()):
        doc.add_position(term, (position, offset))
        offset += len(term) + 1 # assume one space after the term
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

rd = index.get_reader()

print "- (Test searching.)"

for term, frequency, doc_positions in doc_tests2:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr

print "- (Test fields.)"

for docnum, text in docs2:
    df = dict(rd.get_fields(docnum))
    print df[123] == text, text, df[123]

print "- (Test navigation.)"

for term, docnum, positions in position_tests2:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos

print "- (Test phrases.)"

for terms, results in phrase_tests2:
    res = list(rd.find_common_positions(terms))
    print results == res, results, res

index.close()

print "- Test index updates."

# Build a second index holding the same documents under docnum + 100,
# then merge the first index into it with update().

index = Index("test_index")
index2 = Index("test_index2", 3, 2, 3, 6)
wi = index2.get_writer()
for docnum, text in docs:

    # Add the same documents but with different numbers.

    doc = Document(docnum + 100)
    for position, term in enumerate(text.split()):
        doc.add_position(term, position)
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

index2.update([index])
index.close()

# After the merge, each term should match both the original and the
# renumbered documents, doubling its frequency.

rd = index2.get_reader()
for term, frequency, doc_positions in doc_tests:

    # Add the extra documents to the expected result.

    orig_doc_positions = doc_positions
    doc_positions = doc_positions[:]

    for docnum, positions in orig_doc_positions:
        doc_positions.append((docnum + 100, positions))
    frequency *= 2

    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr
index2.close()

print "- (Test update of an empty index.)"

# Merging into an empty index should reproduce the source index exactly,
# so the unmodified doc_tests expectations apply.

index = Index("test_index")
index3 = Index("test_index3")
index3.update([index])
index.close()

rd = index3.get_reader()
for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print doc_positions == dp, doc_positions, dp
    fr = rd.get_frequency(term)
    print frequency == fr, frequency, fr
index3.close()

# vim: tabstop=4 expandtab shiftwidth=4