iixr (file test.py at ff3800a700d5)

     1 #!/usr/bin/env python     2      3 from iixr.files import *     4 from iixr.fields import *     5 from iixr.terms import *     6 from iixr.positions import *     7 from iixr.index import *     8 import os, sys     9     10 # Remove old test files.    11     12 for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):    13     try:    14         os.remove(filename)    15     except OSError:    16         pass    17     18 try:    19     for dirname in ("test_index", "test_index2", "test_index3"):    20         for filename in os.listdir(dirname):    21             os.remove(os.path.join(dirname, filename))    22         os.rmdir(dirname)    23 except OSError:    24     pass    25     26 if "clean" in sys.argv:    27     sys.exit(0)    28     29 # Test basic data types.    30     31 numbers = [12345678, 0, 1, 127, 128, 255, 256]    32     33 f = open("test", "wb")    34 w = FileWriter(f)    35 for number in numbers:    36     w.write_number(number)    37 w.close()    38     39 f = open("test", "rb")    40 r = FileReader(f)    41 for number in numbers:    42     n = r.read_number()    43     print number == n, number, n    44 r.close()    45     46 # Test positions.    47     48 all_doc_positions = [    49     [    50         (123, [1, 3, 5, 15, 25]),    51         (124, [0, 100]),    52         (125, [11, 99, 199]),    53         (130, [77, 78, 80, 82, 89])    54     ],    55     [    56         (78, [9]),    57         (196, [10, 11]),    58         (197, [17, 21, 30])    59     ]    60     ]    61     62 f = open("testP", "wb")    63 w = PositionWriter(f)    64 for doc_positions in all_doc_positions:    65     for docnum, positions in doc_positions:    66         w.write_positions(docnum, positions)    67     w.reset()    68 w.close()    69     70 f = open("testP", "rb")    71 r = PositionIterator(f, 0, None)    72 for doc_positions in all_doc_positions:    73     for docnum, positions in doc_positions:    74         d, p = r.read_positions()    75         print docnum == d, docnum, d    76         print positions == p, positions, p    77     r.reset()    78 r.close()    79     80 # Test position index files.    81     82 indexed_positions = [    83     [    84         (1234, 0, 100),    85         (2345, 700, 100),    86         (3456, 1900, 50)    87     ],    88     [    89         (4567, 2800, 20)    90     ]    91     ]    92     93 offsets = []    94 f = open("testPI", "wb")    95 w = PositionIndexWriter(f)    96 for term_positions in indexed_positions:    97     offset = None    98     doc_frequency = 0    99     w.reset()   100     for docnum, pos_offset, count in term_positions:   101         if offset is None:   102             offset = w.f.tell()   103         w.write_positions(docnum, pos_offset, count)   104         doc_frequency += count   105     offsets.append((offset, doc_frequency))   106 w.close()   107    108 r = PositionIndexOpener("testPI")   109 offsets.reverse()   110 indexed_positions.reverse()   111 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):   112     found_positions = r.read_term_positions(offset, doc_frequency)   113     for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):   114         print docnum == dn, docnum, dn   115         print pos_offset == po, pos_offset, po   116         print count == c, count, c   117 r.close()   118    119 # Test position dictionaries.   120    121 f = open("testP", "wb")   122 w = PositionWriter(f)   123 f2 = open("testPI", "wb")   124 w2 = PositionIndexWriter(f2)   125 wd = PositionDictionaryWriter(w, w2, 2)   126 offsets = []   127 for doc_positions in all_doc_positions:   128     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)   129     offsets.append((offset, doc_frequency))   130 wd.close()   131    132 r = PositionOpener("testP")   133 r2 = PositionIndexOpener("testPI")   134 rd = PositionDictionaryReader(r, r2)   135 offsets.reverse()   136 all_doc_positions.reverse()   137 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):   138     dp = list(rd.read_term_positions(offset, doc_frequency))   139     print doc_positions == dp, doc_positions, dp   140 rd.close()   141    142 # Test fields.   143    144 doc_fields = [   145     (123, ["testing", "fields", "stored", "compressed"]),   146     (456, ["fields", "for a second", "document"]),   147     (789, ["field value"]),   148     (1234, []),   149     (2345, ["abc", "def"]),   150     (3456, ["apple", "banana", "cherry"]),   151     (4567, ["drue", "eple"])   152     ]   153    154 f = open("testF", "wb")   155 w = FieldWriter(f)   156 for docnum, fields in doc_fields:   157     w.write_fields(docnum, list(enumerate(fields)))   158 w.close()   159    160 f = open("testF", "rb")   161 r = FieldReader(f)   162 for docnum, fields in doc_fields:   163     dn, df = r.read_fields()   164     print docnum == dn, docnum, dn   165     print list(enumerate(fields)) == df, list(enumerate(fields)), df   166 r.close()   167    168 # Test field index files.   169    170 indexed_docs = [   171     (123, 100000987),   172     (456, 100004321),   173     (789, 100008765)   174     ]   175    176 f = open("testFI", "wb")   177 w = FieldIndexWriter(f)   178 for docnum, offset in indexed_docs:   179     w.write_document(docnum, offset)   180 w.close()   181    182 f = open("testFI", "rb")   183 r = FieldIndexReader(f)   184 for docnum, offset in indexed_docs:   185     dn, o = r.read_document()   186     print docnum == dn, docnum, dn   187     print offset == o, offset, o   188 r.close()   189    190 # Test field dictionaries.   191    192 f = open("testF", "wb")   193 w = FieldWriter(f)   194 f2 = open("testFI", "wb")   195 w2 = FieldIndexWriter(f2)   196 wd = FieldDictionaryWriter(w, w2, 3)   197 for docnum, fields in doc_fields:   198     wd.write_fields(docnum, list(enumerate(fields)))   199 wd.close()   200    201 f = open("testF", "rb")   202 r = FieldReader(f)   203 f2 = open("testFI", "rb")   204 r2 = FieldIndexReader(f2)   205 rd = FieldDictionaryReader(r, r2)   206 doc_fields_reversed = doc_fields[:]   207 doc_fields_reversed.reverse()   208 for docnum, fields in doc_fields_reversed:   209     df = dict(rd.get_fields(docnum))   210     print dict(enumerate(fields)) == df, dict(enumerate(fields)), df   211 for docnum in (13579, 246810):   212     df = rd.get_fields(docnum)   213     print df is None, df   214    215 # (Test sequential access.)   216    217 rd.rewind()   218 for docnum, fields in doc_fields:   219     dn, df = rd.read_fields()   220     print docnum == dn, docnum, dn   221     print list(enumerate(fields)) == df, list(enumerate(fields)), df   222 rd.close()   223    224 # Test terms.   225    226 terms = [   227     # term       offset      frequency  doc_frequency   228     ("aardvark",  100000123,  1,         1),   229     ("anteater",  100000456,  2,         1),   230     ("badger",    100000789, 13,         7),   231     ("bull",     1000001234, 59,        17),   232     ("bulldog",  1000002345, 99,        80),   233     ("cat",      1000003456, 89,        28)   234     ]   235    236 f = open("test", "wb")   237 w = TermWriter(f)   238 for term, offset, frequency, doc_frequency in terms:   239     w.write_term(term, offset, frequency, doc_frequency)   240 w.close()   241    242 f = open("test", "rb")   243 r = TermReader(f)   244 for term, offset, frequency, doc_frequency in terms:   245     t, o, fr, df = r.read_term()   246     print term == t, term, t   247     print offset == o, offset, o   248     print frequency == fr, frequency, fr   249     print doc_frequency == df, doc_frequency, df   250 r.close()   251    252 # Test terms in index files.   253    254 indexed_terms = [   255     # term       offset      frequency  doc_frequency   info_offset   256     ("aardvark",  100000123,  1,         1,             200000321),   257     ("anteater",  100000456,  2,         1,             200000654),   258     ("badger",    100000789, 13,         7,             200000987),   259     ("bull",     1000001234, 59,        17,             200004321),   260     ("bulldog",  1000002345, 99,        80,             200005432),   261     ("cat",      1000003456, 89,        28,             200006543)   262     ]   263    264 f = open("test", "wb")   265 w = TermIndexWriter(f)   266 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   267     w.write_term(term, offset, frequency, doc_frequency, info_offset)   268 w.close()   269    270 f = open("test", "rb")   271 r = TermIndexReader(f)   272 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   273     t, o, fr, df, i = r.read_term()   274     print term == t, term, t   275     print offset == o, offset, o   276     print frequency == fr, frequency, fr   277     print doc_frequency == df, doc_frequency, df   278     print info_offset == i, info_offset, i   279 r.close()   280    281 # Test dictionaries with only term data.   282    283 f = open("test", "wb")   284 w = TermWriter(f)   285 f2 = open("testI", "wb")   286 w2 = TermIndexWriter(f2)   287 f3 = open("testP", "wb")   288 w3 = PositionWriter(f3)   289 f4 = open("testPI", "wb")   290 w4 = PositionIndexWriter(f4)   291 wp = PositionDictionaryWriter(w3, w4, 2)   292 wd = TermDictionaryWriter(w, w2, wp, 3)   293 for term, offset, frequency, doc_frequency in terms:   294     wd._write_term(term, offset, frequency, doc_frequency)   295 wd.close()   296    297 f = open("test", "rb")   298 r = TermReader(f)   299 f2 = open("testI", "rb")   300 r2 = TermIndexReader(f2)   301 r3 = PositionOpener("testP")   302 r4 = PositionIndexOpener("testPI")   303 rp = PositionDictionaryReader(r3, r4)   304 rd = TermDictionaryReader(r, r2, rp)   305 terms_reversed = terms[:]   306 terms_reversed.reverse()   307 for term, offset, frequency, doc_frequency in terms_reversed:   308     o, fr, df = rd._find_term(term)   309     print offset == o, offset, o   310     print frequency == fr, frequency, fr   311     print doc_frequency == df, doc_frequency, df   312 for term in ("dog", "dingo"):   313     t = rd._find_term(term)   314     print t is None, t   315    316 # (Test term prefix searching.)   317    318 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]   319 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]   320 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]   321 print rd.find_terms("d") == [], rd.find_terms("d"), []   322 rd.close()   323    324 # Test dictionaries with term and position data.   325    326 terms_with_positions = [   327     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),   328     ("anteater",  [(1, [43, 44])]),   329     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),   330     ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),   331     ("bulldog",   [(43, [17, 19, 256, 512])]),   332     ("cat",       [(123, [12, 145, 196]), (1200, [113])])   333     ]   334    335 position_dict_tests = [   336     ("badger", 19, [55, 1333]),   337     ("badger", 20, None),   338     ("bull", 6, [128]),   339     ("bull", 26, [1, 3, 5, 7, 9]),   340     ("cat", 111, None),   341     ("cat", 123, [12, 145, 196]),   342     ("cat", 1234, None)   343     ]   344    345 f = open("test", "wb")   346 w = TermWriter(f)   347 f2 = open("testI", "wb")   348 w2 = TermIndexWriter(f2)   349 f3 = open("testP", "wb")   350 w3 = PositionWriter(f3)   351 f4 = open("testPI", "wb")   352 w4 = PositionIndexWriter(f4)   353 wp = PositionDictionaryWriter(w3, w4, 2)   354 wd = TermDictionaryWriter(w, w2, wp, 3)   355 for term, doc_positions in terms_with_positions:   356     wd.write_term_positions(term, doc_positions)   357 wd.close()   358    359 f = open("test", "rb")   360 r = TermReader(f)   361 f2 = open("testI", "rb")   362 r2 = TermIndexReader(f2)   363 r3 = PositionOpener("testP")   364 r4 = PositionIndexOpener("testPI")   365 rp = PositionDictionaryReader(r3, r4)   366 rd = TermDictionaryReader(r, r2, rp)   367 terms_reversed = terms_with_positions[:]   368 terms_reversed.reverse()   369 for term, doc_positions in terms_reversed:   370     dp = list(rd.find_positions(term))   371     print doc_positions == dp, doc_positions, dp   372 for term in ("aaa", "dog", "dingo"):   373     dp = rd.find_positions(term)   374     print dp == [], dp   375    376 # (Test iterators.)   377    378 for term, docnum, positions in position_dict_tests:   379     dp = rd.find_positions(term)   380     pos = dp.from_document(docnum)   381     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   382    383 # (Test sequential access.)   384    385 rd.rewind()   386 for term, doc_positions in terms_with_positions:   387     t, fr, df, dp = rd.read_term()   388     dp = list(dp)   389     print term == t, term, t   390     print doc_positions == dp, doc_positions, dp   391 rd.close()   392    393 # Test high-level index operations (including merging).   394    395 docs = [   396     (1, "The cat sat on the mat"),   397     (2, "Every good boy deserves football"),   398     (13, "One good turn deserves another"),   399     (14, "Every man for himself"),   400     (25, "Red sky at night shepherd's delight"),   401     (36, "She sells sea shells on the sea shore")   402     ]   403    404 doc_tests = [   405     ("Every", 2, [(2, [0]), (14, [0])]),   406     ("good", 2, [(2, [1]), (13, [1])]),   407     ("deserves", 2, [(2, [3]), (13, [3])]),   408     ("sea", 2, [(36, [2, 6])])   409     ]   410    411 position_tests = [   412     ("Every", 14, [0]),   413     ("sea", 36, [2, 6]),   414     ("shells", 1, None),   415     ("shells", 37, None)   416     ]   417    418 phrase_tests = [   419     (["good", "boy"], [(2, [1, 2])]),   420     (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),   421     (["sea", "shore"], [(36, [6, 7])])   422     ]   423    424 index = Index("test_index")   425 wi = index.get_writer(3, 2, 6)   426 for docnum, text in docs:   427     doc = Document(docnum)   428     for position, term in enumerate(text.split()):   429         doc.add_position(term, position)   430     doc.add_field(123, text)   431     wi.add_document(doc)   432 wi.close()   433    434 rd = index.get_reader()   435    436 # (Test searching.)   437    438 for term, frequency, doc_positions in doc_tests:   439     dp = list(rd.find_positions(term))   440     print doc_positions == dp, doc_positions, dp   441     fr = rd.get_frequency(term)   442     print frequency == fr, frequency, fr   443    444 # (Test fields.)   445    446 for docnum, text in docs:   447     df = dict(rd.get_fields(docnum))   448     print df[123] == text, text, df[123]   449    450 # (Test navigation.)   451    452 for term, docnum, positions in position_tests:   453     dp = rd.find_positions(term)   454     pos = dp.from_document(docnum)   455     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   456    457 # (Test phrases.)   458    459 for terms, results in phrase_tests:   460     res = list(rd.find_common_positions(terms))   461     print results == res, results, res   462    463 index.close()   464    465 # Test index updates.   466    467 index = Index("test_index")   468 index2 = Index("test_index2")   469 wi = index2.get_writer(3, 2, 6)   470 for docnum, text in docs:   471    472     # Add the same documents but with different numbers.   473    474     doc = Document(docnum + 100)   475     for position, term in enumerate(text.split()):   476         doc.add_position(term, position)   477     doc.add_field(123, text)   478     wi.add_document(doc)   479 wi.close()   480    481 index2.update([index])   482 index.close()   483    484 rd = index2.get_reader()   485 for term, frequency, doc_positions in doc_tests:   486    487     # Add the extra documents to the expected result.   488    489     orig_doc_positions = doc_positions   490     doc_positions = doc_positions[:]   491    492     for docnum, positions in orig_doc_positions:   493         doc_positions.append((docnum + 100, positions))   494     frequency *= 2   495    496     dp = list(rd.find_positions(term))   497     print doc_positions == dp, doc_positions, dp   498     fr = rd.get_frequency(term)   499     print frequency == fr, frequency, fr   500 index2.close()   501    502 # (Test update of an empty index.)   503    504 index = Index("test_index")   505 index3 = Index("test_index3")   506 index3.update([index])   507 index.close()   508    509 rd = index3.get_reader()   510 for term, frequency, doc_positions in doc_tests:   511     dp = list(rd.find_positions(term))   512     print doc_positions == dp, doc_positions, dp   513     fr = rd.get_frequency(term)   514     print frequency == fr, frequency, fr   515 index3.close()   516    517 # vim: tabstop=4 expandtab shiftwidth=4