iixr (file test.py at 489129c7f225)

     1 #!/usr/bin/env python     2      3 from iixr.files import *     4 from iixr.fields import *     5 from iixr.terms import *     6 from iixr.positions import *     7 from iixr.index import *     8 from array import array     9 import os, sys    10     11 # Remove old test files.    12     13 for filename in ("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"):    14     try:    15         os.remove(filename)    16     except OSError:    17         pass    18     19 try:    20     for dirname in ("test_index", "test_index2", "test_index3", "test_indexT"):    21         for filename in os.listdir(dirname):    22             os.remove(os.path.join(dirname, filename))    23         os.rmdir(dirname)    24 except OSError:    25     pass    26     27 if "clean" in sys.argv:    28     sys.exit(0)    29     30 print "- Test basic data types."    31     32 numbers = [12345678, 0, 1, 127, 128, 255, 256]    33     34 f = open("test", "wb")    35 w = FileWriter(f)    36 for number in numbers:    37     w.write_number(number)    38 w.close()    39     40 f = open("test", "rb")    41 r = FileReader(f)    42 for number in numbers:    43     n = r.read_number()    44     print number == n, number, n    45 r.close()    46     47 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)]    48     49 f = open("testMS", "wb")    50 w = FileWriter(f)    51 b = array("B")    52 last = w.get_initial_value(2)    53 for t in tuples:    54     last = w.write_sequence(b, t, last, 2)    55 b.tofile(w.f)    56 w.close()    57     58 f = open("testMS", "rb")    59 r = FileReader(f)    60 last = r.get_initial_value(2)    61 for t in tuples:    62     last = t2 = r.read_sequence(last, 2)    63     print t == t2, t, t2    64 r.close()    65     66 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)]    67     68 f = open("testNMS", "wb")    69 w = FileWriter(f)    70 b = array("B")    71 last = w.get_initial_value(2)    72 for t in tuples2:    73     last = w.write_sequence(b, t, last, 2, monotonic=0)    74 b.tofile(w.f)    75 w.close()    76     77 f = open("testNMS", "rb")    78 r = FileReader(f)    79 last = r.get_initial_value(2)    80 for t in tuples2:    81     last = t2 = r.read_sequence(last, 2, monotonic=0)    82     print t == t2, t, t2    83 r.close()    84     85 print "- Test positions."    86     87 all_doc_positions = [    88     [    89         (123, [1, 3, 5, 15, 25]),    90         (124, [0, 100]),    91         (125, [11, 99, 199]),    92         (130, [77, 78, 80, 82, 89])    93     ],    94     [    95         (78, [9]),    96         (196, [10, 11]),    97         (197, [17, 21, 30])    98     ]    99     ]   100    101 f = open("testP", "wb")   102 w = PositionWriter(f)   103 for doc_positions in all_doc_positions:   104     for docnum, positions in doc_positions:   105         w.write_positions(docnum, positions)   106     w.reset()   107 w.close()   108    109 f = open("testP", "rb")   110 r = PositionReader(f)   111 for doc_positions in all_doc_positions:   112     for docnum, positions in doc_positions:   113         d, p = r.read_positions()   114         print docnum == d, docnum, d   115         print positions == p, positions, p   116     r.reset()   117 r.close()   118    119 all_doc_positions_seq = [   120     [   121         ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]),   122         ((124, 1), [(0, 0), (100, 350)]),   123         ((124, 2), [(11, 38), (99, 379), (199, 720)]),   124         ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)])   125     ],   126     [   127         ((78, 1), [(9, 19)]),   128         ((196, 0), [(10, 27), (11, 29)]),   129         ((196, 1), [(17, 46), (21, 52), (30, 60)])   130     ]   131     ]   132    133 f = open("testP2", "wb")   134 w = PositionWriter(f)   135 for doc_positions in all_doc_positions_seq:   136     for docnum, positions in doc_positions:   137         w.write_positions(docnum, positions)   138     w.reset()   139 w.close()   140    141 f = open("testP2", "rb")   142 r = PositionReader(f)   143 for doc_positions in all_doc_positions_seq:   144     for docnum, positions in doc_positions:   145         d, p = r.read_positions()   146         print tuple(docnum) == tuple(d), docnum, d   147         print tuple(positions) == tuple(p), positions, p   148     r.reset()   149 r.close()   150    151 print "- Test position index files."   152    153 indexed_positions = [   154     [   155         (1234, 0, 100),   156         (2345, 700, 100),   157         (3456, 1900, 50)   158     ],   159     [   160         (4567, 2800, 20)   161     ]   162     ]   163    164 offsets = []   165 f = open("testPI", "wb")   166 w = PositionIndexWriter(f)   167 for term_positions in indexed_positions:   168     offset = None   169     doc_frequency = 0   170     w.reset()   171     for docnum, pos_offset, count in term_positions:   172         if offset is None:   173             offset = w.f.tell()   174         w.write_positions(docnum, pos_offset, count)   175         doc_frequency += count   176     offsets.append((offset, doc_frequency))   177 w.close()   178    179 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))   180 offsets.reverse()   181 indexed_positions.reverse()   182 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):   183     r.seek(offset, doc_frequency)   184     for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):   185         print docnum == dn, docnum, dn   186         print pos_offset == po, pos_offset, po   187         print count == c, count, c   188 r.reader.close()   189    190 print "- Test position dictionaries."   191    192 f = open("testP", "wb")   193 w = PositionWriter(f)   194 f2 = open("testPI", "wb")   195 w2 = PositionIndexWriter(f2)   196 wd = PositionDictionaryWriter(w, w2, 2)   197 offsets = []   198 for doc_positions in all_doc_positions:   199     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)   200     offsets.append((offset, doc_frequency))   201 wd.close()   202    203 r = PositionReader(open("testP", "rb"))   204 r2 = PositionIndexReader(open("testPI", "rb"))   205 rd = PositionDictionaryReader(r, r2)   206 offsets.reverse()   207 all_doc_positions.reverse()   208 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):   209     it = rd.read_term_positions(offset, doc_frequency)   210     dp = list(it)   211     print doc_positions == dp, doc_positions, dp   212 rd.close()   213    214 print "- Test fields."   215    216 doc_fields = [   217     (123, ["testing", "fields", "stored", "compressed"]),   218     (456, ["fields", "for a second", "document"]),   219     (789, ["field value"]),   220     (1234, []),   221     (2345, ["abc", "def"]),   222     (3456, ["apple", "banana", "cherry"]),   223     (4567, ["drue", "eple"])   224     ]   225    226 f = open("testF", "wb")   227 w = FieldWriter(f)   228 for docnum, fields in doc_fields:   229     w.write_fields(docnum, list(enumerate(fields)))   230 w.close()   231    232 f = open("testF", "rb")   233 r = FieldReader(f)   234 for docnum, fields in doc_fields:   235     dn, df = r.read_fields()   236     print docnum == dn, docnum, dn   237     print list(enumerate(fields)) == df, list(enumerate(fields)), df   238 r.close()   239    240 print "- Test field index files."   241    242 indexed_docs = [   243     (123, 100000987),   244     (456, 100004321),   245     (789, 100008765)   246     ]   247    248 f = open("testFI", "wb")   249 w = FieldIndexWriter(f)   250 for docnum, offset in indexed_docs:   251     w.write_document(docnum, offset)   252 w.close()   253    254 f = open("testFI", "rb")   255 r = FieldIndexReader(f)   256 for docnum, offset in indexed_docs:   257     dn, o = r.read_document()   258     print docnum == dn, docnum, dn   259     print offset == o, offset, o   260 r.close()   261    262 print "- Test field dictionaries."   263    264 f = open("testF", "wb")   265 w = FieldWriter(f)   266 f2 = open("testFI", "wb")   267 w2 = FieldIndexWriter(f2)   268 wd = FieldDictionaryWriter(w, w2, 3)   269 for docnum, fields in doc_fields:   270     wd.write_fields(docnum, list(enumerate(fields)))   271 wd.close()   272    273 f = open("testF", "rb")   274 r = FieldReader(f)   275 f2 = open("testFI", "rb")   276 r2 = FieldIndexReader(f2)   277 rd = FieldDictionaryReader(r, r2)   278 doc_fields_reversed = doc_fields[:]   279 doc_fields_reversed.reverse()   280 for docnum, fields in doc_fields_reversed:   281     df = dict(rd.get_fields(docnum))   282     print dict(enumerate(fields)) == df, dict(enumerate(fields)), df   283 for docnum in (13579, 246810):   284     df = rd.get_fields(docnum)   285     print df is None, df   286    287 print "- (Test sequential access.)"   288    289 rd.rewind()   290 for docnum, fields in doc_fields:   291     dn, df = rd.read_fields()   292     print docnum == dn, docnum, dn   293     print list(enumerate(fields)) == df, list(enumerate(fields)), df   294 rd.close()   295    296 print "- Test terms."   297    298 terms = [   299     # term       offset      frequency  doc_frequency   300     ("aardvark",  100000123,  1,         1),   301     ("anteater",  100000456,  2,         1),   302     ("badger",    100000789, 13,         7),   303     ("bull",     1000001234, 59,        17),   304     ("bulldog",  1000002345, 99,        80),   305     ("cat",      1000003456, 89,        28)   306     ]   307    308 f = open("test", "wb")   309 w = TermWriter(f)   310 for term, offset, frequency, doc_frequency in terms:   311     w.write_term(term, offset, frequency, doc_frequency)   312 w.close()   313    314 f = open("test", "rb")   315 r = TermReader(f)   316 for term, offset, frequency, doc_frequency in terms:   317     t, o, fr, df = r.read_term()   318     print term == t, term, t   319     print offset == o, offset, o   320     print frequency == fr, frequency, fr   321     print doc_frequency == df, doc_frequency, df   322 r.close()   323    324 print "- Test terms in index files."   325    326 indexed_terms = [   327     # term       offset      frequency  doc_frequency   info_offset   328     ("aardvark",  100000123,  1,         1,             200000321),   329     ("anteater",  100000456,  2,         1,             200000654),   330     ("badger",    100000789, 13,         7,             200000987),   331     ("bull",     1000001234, 59,        17,             200004321),   332     ("bulldog",  1000002345, 99,        80,             200005432),   333     ("cat",      1000003456, 89,        28,             200006543)   334     ]   335    336 f = open("test", "wb")   337 w = TermIndexWriter(f)   338 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   339     w.write_term(term, offset, frequency, doc_frequency, info_offset)   340 w.close()   341    342 f = open("test", "rb")   343 r = TermIndexReader(f)   344 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   345     t, o, fr, df, i = r.read_term()   346     print term == t, term, t   347     print offset == o, offset, o   348     print frequency == fr, frequency, fr   349     print doc_frequency == df, doc_frequency, df   350     print info_offset == i, info_offset, i   351 r.close()   352    353 print "- Test dictionaries with only term data."   354    355 f = open("test", "wb")   356 w = TermWriter(f)   357 f2 = open("testI", "wb")   358 w2 = TermIndexWriter(f2)   359 f3 = open("testP", "wb")   360 w3 = PositionWriter(f3)   361 f4 = open("testPI", "wb")   362 w4 = PositionIndexWriter(f4)   363 wp = PositionDictionaryWriter(w3, w4, 2)   364 wd = TermDictionaryWriter(w, w2, wp, 3)   365 for term, offset, frequency, doc_frequency in terms:   366     wd._write_term(term, offset, frequency, doc_frequency)   367 wd.close()   368    369 f = open("test", "rb")   370 r = TermReader(f)   371 f2 = open("testI", "rb")   372 r2 = TermIndexReader(f2)   373 r3 = PositionReader(open("testP", "rb"))   374 r4 = PositionIndexReader(open("testPI", "rb"))   375 rp = PositionDictionaryReader(r3, r4)   376 rd = TermDictionaryReader(r, r2, rp)   377 terms_reversed = terms[:]   378 terms_reversed.reverse()   379 for term, offset, frequency, doc_frequency in terms_reversed:   380     o, fr, df = rd._find_term(term)   381     print offset == o, offset, o   382     print frequency == fr, frequency, fr   383     print doc_frequency == df, doc_frequency, df   384 for term in ("dog", "dingo"):   385     t = rd._find_term(term)   386     print t is None, t   387    388 print "- (Test term prefix searching.)"   389    390 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]   391 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]   392 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]   393 print rd.find_terms("d") == [], rd.find_terms("d"), []   394 rd.close()   395    396 print "- Test dictionaries with term and position data."   397    398 terms_with_positions = [   399     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),   400     ("anteater",  [(1, [43, 44])]),   401     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),   402     ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),   403     ("bulldog",   [(43, [17, 19, 256, 512])]),   404     ("cat",       [(123, [12, 145, 196]), (1200, [113])])   405     ]   406    407 position_dict_tests = [   408     ("badger", 19, [55, 1333]),   409     ("badger", 20, None),   410     ("bull", 6, [128]),   411     ("bull", 26, [1, 3, 5, 7, 9]),   412     ("cat", 111, None),   413     ("cat", 123, [12, 145, 196]),   414     ("cat", 1234, None)   415     ]   416    417 f = open("test", "wb")   418 w = TermWriter(f)   419 f2 = open("testI", "wb")   420 w2 = TermIndexWriter(f2)   421 f3 = open("testP", "wb")   422 w3 = PositionWriter(f3)   423 f4 = open("testPI", "wb")   424 w4 = PositionIndexWriter(f4)   425 wp = PositionDictionaryWriter(w3, w4, 2)   426 wd = TermDictionaryWriter(w, w2, wp, 3)   427 for term, doc_positions in terms_with_positions:   428     wd.write_term_positions(term, doc_positions)   429 wd.close()   430    431 f = open("test", "rb")   432 r = TermReader(f)   433 f2 = open("testI", "rb")   434 r2 = TermIndexReader(f2)   435 r3 = PositionReader(open("testP", "rb"))   436 r4 = PositionIndexReader(open("testPI", "rb"))   437 rp = PositionDictionaryReader(r3, r4)   438 rd = TermDictionaryReader(r, r2, rp)   439 terms_reversed = terms_with_positions[:]   440 terms_reversed.reverse()   441 for term, doc_positions in terms_reversed:   442     dp = list(rd.find_positions(term))   443     print doc_positions == dp, doc_positions, dp   444 for term in ("aaa", "dog", "dingo"):   445     dp = rd.find_positions(term)   446     print dp == [], dp   447    448 print "- (Test iterators.)"   449    450 for term, docnum, positions in position_dict_tests:   451     dp = rd.find_positions(term)   452     pos = dp.from_document(docnum)   453     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   454    455 print "- (Test sequential access.)"   456    457 rd.rewind()   458 for term, doc_positions in terms_with_positions:   459     t, fr, df, dp = rd.read_term()   460     dp = list(dp)   461     print term == t, term, t   462     print doc_positions == dp, doc_positions, dp   463 rd.close()   464    465 print "- Test high-level index operations (including merging)."   466    467 docs = [   468     (1, "The cat sat on the mat"),   469     (2, "Every good boy deserves football"),   470     (13, "One good turn deserves another"),   471     (14, "Every man for himself"),   472     (25, "Red sky at night shepherd's delight"),   473     (36, "She sells sea shells on the sea shore")   474     ]   475    476 doc_tests = [   477     ("Every", 2, [(2, [0]), (14, [0])]),   478     ("good", 2, [(2, [1]), (13, [1])]),   479     ("deserves", 2, [(2, [3]), (13, [3])]),   480     ("sea", 2, [(36, [2, 6])])   481     ]   482    483 position_tests = [   484     ("Every", 14, [0]),   485     ("sea", 36, [2, 6]),   486     ("shells", 1, None),   487     ("shells", 37, None)   488     ]   489    490 phrase_tests = [   491     (["good", "boy"], [(2, [1, 2])]),   492     (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),   493     (["sea", "shore"], [(36, [6, 7])])   494     ]   495    496 index = Index("test_index", 3, 2, 3, 6)   497 wi = index.get_writer()   498 for docnum, text in docs:   499     doc = Document(docnum)   500     for position, term in enumerate(text.split()):   501         doc.add_position(term, position)   502     doc.add_field(123, text)   503     wi.add_document(doc)   504 wi.close()   505    506 rd = index.get_reader()   507    508 print "- (Test searching.)"   509    510 for term, frequency, doc_positions in doc_tests:   511     dp = list(rd.find_positions(term))   512     print doc_positions == dp, doc_positions, dp   513     fr = rd.get_frequency(term)   514     print frequency == fr, frequency, fr   515    516 print "- (Test fields.)"   517    518 for docnum, text in docs:   519     df = dict(rd.get_fields(docnum))   520     print df[123] == text, text, df[123]   521    522 print "- (Test navigation.)"   523    524 for term, docnum, positions in position_tests:   525     dp = rd.find_positions(term)   526     pos = dp.from_document(docnum)   527     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   528    529 print "- (Test phrases.)"   530    531 for terms, results in phrase_tests:   532     res = list(rd.find_common_positions(terms))   533     print results == res, results, res   534    535 index.close()   536    537 docs2 = [   538     ((1, 0), "The cat sat on the mat"),   539     ((1, 2), "Every good boy deserves football"),   540     ((13, 1), "One good turn deserves another"),   541     ((14, 0), "Every man for himself"),   542     ((14, 25), "Red sky at night shepherd's delight"),   543     ((36, 12), "She sells sea shells on the sea shore")   544     ]   545    546 doc_tests2 = [   547     ("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),   548     ("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),   549     ("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),   550     ("sea", 2, [((36, 12), [(2, 10), (6, 28)])])   551     ]   552    553 position_tests2 = [   554     ("Every", (14, 0), [(0, 0)]),   555     ("sea", (36, 12), [(2, 10), (6, 28)]),   556     ("shells", (1, 0), None),   557     ("shells", (37, 0), None)   558     ]   559    560 phrase_tests2 = [   561     (["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),   562     (["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),   563     (["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])   564     ]   565    566 index = Index("test_indexT", 3, 2, 3, 6)   567 wi = index.get_writer()   568 for docnum, text in docs2:   569     doc = Document(docnum)   570     offset = 0   571     for position, term in enumerate(text.split()):   572         doc.add_position(term, (position, offset))   573         offset += len(term) + 1 # assume one space after the term   574     doc.add_field(123, text)   575     wi.add_document(doc)   576 wi.close()   577    578 rd = index.get_reader()   579    580 print "- (Test searching.)"   581    582 for term, frequency, doc_positions in doc_tests2:   583     dp = list(rd.find_positions(term))   584     print doc_positions == dp, doc_positions, dp   585     fr = rd.get_frequency(term)   586     print frequency == fr, frequency, fr   587    588 print "- (Test fields.)"   589    590 for docnum, text in docs2:   591     df = dict(rd.get_fields(docnum))   592     print df[123] == text, text, df[123]   593    594 print "- (Test navigation.)"   595    596 for term, docnum, positions in position_tests2:   597     dp = rd.find_positions(term)   598     pos = dp.from_document(docnum)   599     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   600    601 print "- (Test phrases.)"   602    603 for terms, results in phrase_tests2:   604     res = list(rd.find_common_positions(terms))   605     print results == res, results, res   606    607 index.close()   608    609 print "- Test index updates."   610    611 index = Index("test_index")   612 index2 = Index("test_index2", 3, 2, 3, 6)   613 wi = index2.get_writer()   614 for docnum, text in docs:   615    616     # Add the same documents but with different numbers.   617    618     doc = Document(docnum + 100)   619     for position, term in enumerate(text.split()):   620         doc.add_position(term, position)   621     doc.add_field(123, text)   622     wi.add_document(doc)   623 wi.close()   624    625 index2.update([index])   626 index.close()   627    628 rd = index2.get_reader()   629 for term, frequency, doc_positions in doc_tests:   630    631     # Add the extra documents to the expected result.   632    633     orig_doc_positions = doc_positions   634     doc_positions = doc_positions[:]   635    636     for docnum, positions in orig_doc_positions:   637         doc_positions.append((docnum + 100, positions))   638     frequency *= 2   639    640     dp = list(rd.find_positions(term))   641     print doc_positions == dp, doc_positions, dp   642     fr = rd.get_frequency(term)   643     print frequency == fr, frequency, fr   644 index2.close()   645    646 print "- (Test update of an empty index.)"   647    648 index = Index("test_index")   649 index3 = Index("test_index3")   650 index3.update([index])   651 index.close()   652    653 rd = index3.get_reader()   654 for term, frequency, doc_positions in doc_tests:   655     dp = list(rd.find_positions(term))   656     print doc_positions == dp, doc_positions, dp   657     fr = rd.get_frequency(term)   658     print frequency == fr, frequency, fr   659 index3.close()   660    661 # vim: tabstop=4 expandtab shiftwidth=4