# iixr (file test.py at 6dd92daca068)

     1 #!/usr/bin/env python     2      3 from iixr.files import *     4 from iixr.fields import *     5 from iixr.terms import *     6 from iixr.positions import *     7 from iixr.index import *     8 import os, sys     9     10 # Remove old test files.    11     12 for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):    13     try:    14         os.remove(filename)    15     except OSError:    16         pass    17     18 try:    19     for dirname in ("test_index", "test_index2", "test_index3"):    20         for filename in os.listdir(dirname):    21             os.remove(os.path.join(dirname, filename))    22         os.rmdir(dirname)    23 except OSError:    24     pass    25     26 if "clean" in sys.argv:    27     sys.exit(0)    28     29 print "- Test basic data types."    30     31 numbers = [12345678, 0, 1, 127, 128, 255, 256]    32     33 f = open("test", "wb")    34 w = FileWriter(f)    35 for number in numbers:    36     w.write_number(number)    37 w.close()    38     39 f = open("test", "rb")    40 r = FileReader(f)    41 for number in numbers:    42     n = r.read_number()    43     print number == n, number, n    44 r.close()    45     46 print "- Test positions."    
47     48 all_doc_positions = [    49     [    50         (123, [1, 3, 5, 15, 25]),    51         (124, [0, 100]),    52         (125, [11, 99, 199]),    53         (130, [77, 78, 80, 82, 89])    54     ],    55     [    56         (78, [9]),    57         (196, [10, 11]),    58         (197, [17, 21, 30])    59     ]    60     ]    61     62 f = open("testP", "wb")    63 w = PositionWriter(f)    64 for doc_positions in all_doc_positions:    65     for docnum, positions in doc_positions:    66         w.write_positions(docnum, positions)    67     w.reset()    68 w.close()    69     70 f = open("testP", "rb")    71 r = PositionReader(f)    72 for doc_positions in all_doc_positions:    73     for docnum, positions in doc_positions:    74         d, p = r.read_positions()    75         print docnum == d, docnum, d    76         print positions == p, positions, p    77     r.reset()    78 r.close()    79     80 print "- Test position index files."    81     82 indexed_positions = [    83     [    84         (1234, 0, 100),    85         (2345, 700, 100),    86         (3456, 1900, 50)    87     ],    88     [    89         (4567, 2800, 20)    90     ]    91     ]    92     93 offsets = []    94 f = open("testPI", "wb")    95 w = PositionIndexWriter(f)    96 for term_positions in indexed_positions:    97     offset = None    98     doc_frequency = 0    99     w.reset()   100     for docnum, pos_offset, count in term_positions:   101         if offset is None:   102             offset = w.f.tell()   103         w.write_positions(docnum, pos_offset, count)   104         doc_frequency += count   105     offsets.append((offset, doc_frequency))   106 w.close()   107    108 r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))   109 offsets.reverse()   110 indexed_positions.reverse()   111 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):   112     r.seek(offset, doc_frequency)   113     for (docnum, pos_offset, count), (dn, po, 
c) in zip(term_positions, r):   114         print docnum == dn, docnum, dn   115         print pos_offset == po, pos_offset, po   116         print count == c, count, c   117 r.reader.close()   118    119 print "- Test position dictionaries."   120    121 f = open("testP", "wb")   122 w = PositionWriter(f)   123 f2 = open("testPI", "wb")   124 w2 = PositionIndexWriter(f2)   125 wd = PositionDictionaryWriter(w, w2, 2)   126 offsets = []   127 for doc_positions in all_doc_positions:   128     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)   129     offsets.append((offset, doc_frequency))   130 wd.close()   131    132 r = PositionReader(open("testP", "rb"))   133 r2 = PositionIndexReader(open("testPI", "rb"))   134 rd = PositionDictionaryReader(r, r2)   135 offsets.reverse()   136 all_doc_positions.reverse()   137 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):   138     it = rd.read_term_positions(offset, doc_frequency)   139     dp = list(it)   140     print doc_positions == dp, doc_positions, dp   141 rd.close()   142    143 print "- Test fields."   144    145 doc_fields = [   146     (123, ["testing", "fields", "stored", "compressed"]),   147     (456, ["fields", "for a second", "document"]),   148     (789, ["field value"]),   149     (1234, []),   150     (2345, ["abc", "def"]),   151     (3456, ["apple", "banana", "cherry"]),   152     (4567, ["drue", "eple"])   153     ]   154    155 f = open("testF", "wb")   156 w = FieldWriter(f)   157 for docnum, fields in doc_fields:   158     w.write_fields(docnum, list(enumerate(fields)))   159 w.close()   160    161 f = open("testF", "rb")   162 r = FieldReader(f)   163 for docnum, fields in doc_fields:   164     dn, df = r.read_fields()   165     print docnum == dn, docnum, dn   166     print list(enumerate(fields)) == df, list(enumerate(fields)), df   167 r.close()   168    169 print "- Test field index files."   
170    171 indexed_docs = [   172     (123, 100000987),   173     (456, 100004321),   174     (789, 100008765)   175     ]   176    177 f = open("testFI", "wb")   178 w = FieldIndexWriter(f)   179 for docnum, offset in indexed_docs:   180     w.write_document(docnum, offset)   181 w.close()   182    183 f = open("testFI", "rb")   184 r = FieldIndexReader(f)   185 for docnum, offset in indexed_docs:   186     dn, o = r.read_document()   187     print docnum == dn, docnum, dn   188     print offset == o, offset, o   189 r.close()   190    191 print "- Test field dictionaries."   192    193 f = open("testF", "wb")   194 w = FieldWriter(f)   195 f2 = open("testFI", "wb")   196 w2 = FieldIndexWriter(f2)   197 wd = FieldDictionaryWriter(w, w2, 3)   198 for docnum, fields in doc_fields:   199     wd.write_fields(docnum, list(enumerate(fields)))   200 wd.close()   201    202 f = open("testF", "rb")   203 r = FieldReader(f)   204 f2 = open("testFI", "rb")   205 r2 = FieldIndexReader(f2)   206 rd = FieldDictionaryReader(r, r2)   207 doc_fields_reversed = doc_fields[:]   208 doc_fields_reversed.reverse()   209 for docnum, fields in doc_fields_reversed:   210     df = dict(rd.get_fields(docnum))   211     print dict(enumerate(fields)) == df, dict(enumerate(fields)), df   212 for docnum in (13579, 246810):   213     df = rd.get_fields(docnum)   214     print df is None, df   215    216 print "- (Test sequential access.)"   217    218 rd.rewind()   219 for docnum, fields in doc_fields:   220     dn, df = rd.read_fields()   221     print docnum == dn, docnum, dn   222     print list(enumerate(fields)) == df, list(enumerate(fields)), df   223 rd.close()   224    225 print "- Test terms."   
226    227 terms = [   228     # term       offset      frequency  doc_frequency   229     ("aardvark",  100000123,  1,         1),   230     ("anteater",  100000456,  2,         1),   231     ("badger",    100000789, 13,         7),   232     ("bull",     1000001234, 59,        17),   233     ("bulldog",  1000002345, 99,        80),   234     ("cat",      1000003456, 89,        28)   235     ]   236    237 f = open("test", "wb")   238 w = TermWriter(f)   239 for term, offset, frequency, doc_frequency in terms:   240     w.write_term(term, offset, frequency, doc_frequency)   241 w.close()   242    243 f = open("test", "rb")   244 r = TermReader(f)   245 for term, offset, frequency, doc_frequency in terms:   246     t, o, fr, df = r.read_term()   247     print term == t, term, t   248     print offset == o, offset, o   249     print frequency == fr, frequency, fr   250     print doc_frequency == df, doc_frequency, df   251 r.close()   252    253 print "- Test terms in index files."   254    255 indexed_terms = [   256     # term       offset      frequency  doc_frequency   info_offset   257     ("aardvark",  100000123,  1,         1,             200000321),   258     ("anteater",  100000456,  2,         1,             200000654),   259     ("badger",    100000789, 13,         7,             200000987),   260     ("bull",     1000001234, 59,        17,             200004321),   261     ("bulldog",  1000002345, 99,        80,             200005432),   262     ("cat",      1000003456, 89,        28,             200006543)   263     ]   264    265 f = open("test", "wb")   266 w = TermIndexWriter(f)   267 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   268     w.write_term(term, offset, frequency, doc_frequency, info_offset)   269 w.close()   270    271 f = open("test", "rb")   272 r = TermIndexReader(f)   273 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   274     t, o, fr, df, i = r.read_term()   275     print 
term == t, term, t   276     print offset == o, offset, o   277     print frequency == fr, frequency, fr   278     print doc_frequency == df, doc_frequency, df   279     print info_offset == i, info_offset, i   280 r.close()   281    282 print "- Test dictionaries with only term data."   283    284 f = open("test", "wb")   285 w = TermWriter(f)   286 f2 = open("testI", "wb")   287 w2 = TermIndexWriter(f2)   288 f3 = open("testP", "wb")   289 w3 = PositionWriter(f3)   290 f4 = open("testPI", "wb")   291 w4 = PositionIndexWriter(f4)   292 wp = PositionDictionaryWriter(w3, w4, 2)   293 wd = TermDictionaryWriter(w, w2, wp, 3)   294 for term, offset, frequency, doc_frequency in terms:   295     wd._write_term(term, offset, frequency, doc_frequency)   296 wd.close()   297    298 f = open("test", "rb")   299 r = TermReader(f)   300 f2 = open("testI", "rb")   301 r2 = TermIndexReader(f2)   302 r3 = PositionReader(open("testP", "rb"))   303 r4 = PositionIndexReader(open("testPI", "rb"))   304 rp = PositionDictionaryReader(r3, r4)   305 rd = TermDictionaryReader(r, r2, rp)   306 terms_reversed = terms[:]   307 terms_reversed.reverse()   308 for term, offset, frequency, doc_frequency in terms_reversed:   309     o, fr, df = rd._find_term(term)   310     print offset == o, offset, o   311     print frequency == fr, frequency, fr   312     print doc_frequency == df, doc_frequency, df   313 for term in ("dog", "dingo"):   314     t = rd._find_term(term)   315     print t is None, t   316    317 print "- (Test term prefix searching.)"   318    319 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]   320 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]   321 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]   322 print rd.find_terms("d") == [], rd.find_terms("d"), []   323 rd.close()   324    325 print "- Test dictionaries with term and position data."   
326    327 terms_with_positions = [   328     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),   329     ("anteater",  [(1, [43, 44])]),   330     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),   331     ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),   332     ("bulldog",   [(43, [17, 19, 256, 512])]),   333     ("cat",       [(123, [12, 145, 196]), (1200, [113])])   334     ]   335    336 position_dict_tests = [   337     ("badger", 19, [55, 1333]),   338     ("badger", 20, None),   339     ("bull", 6, [128]),   340     ("bull", 26, [1, 3, 5, 7, 9]),   341     ("cat", 111, None),   342     ("cat", 123, [12, 145, 196]),   343     ("cat", 1234, None)   344     ]   345    346 f = open("test", "wb")   347 w = TermWriter(f)   348 f2 = open("testI", "wb")   349 w2 = TermIndexWriter(f2)   350 f3 = open("testP", "wb")   351 w3 = PositionWriter(f3)   352 f4 = open("testPI", "wb")   353 w4 = PositionIndexWriter(f4)   354 wp = PositionDictionaryWriter(w3, w4, 2)   355 wd = TermDictionaryWriter(w, w2, wp, 3)   356 for term, doc_positions in terms_with_positions:   357     wd.write_term_positions(term, doc_positions)   358 wd.close()   359    360 f = open("test", "rb")   361 r = TermReader(f)   362 f2 = open("testI", "rb")   363 r2 = TermIndexReader(f2)   364 r3 = PositionReader(open("testP", "rb"))   365 r4 = PositionIndexReader(open("testPI", "rb"))   366 rp = PositionDictionaryReader(r3, r4)   367 rd = TermDictionaryReader(r, r2, rp)   368 terms_reversed = terms_with_positions[:]   369 terms_reversed.reverse()   370 for term, doc_positions in terms_reversed:   371     dp = list(rd.find_positions(term))   372     print doc_positions == dp, doc_positions, dp   373 for term in ("aaa", "dog", "dingo"):   374     dp = rd.find_positions(term)   375     print dp == [], dp   376    377 print "- (Test iterators.)"   378    379 for term, docnum, positions in position_dict_tests:   380     dp = rd.find_positions(term)   381     
pos = dp.from_document(docnum)   382     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   383    384 print "- (Test sequential access.)"   385    386 rd.rewind()   387 for term, doc_positions in terms_with_positions:   388     t, fr, df, dp = rd.read_term()   389     dp = list(dp)   390     print term == t, term, t   391     print doc_positions == dp, doc_positions, dp   392 rd.close()   393    394 print "- Test high-level index operations (including merging)."   395    396 docs = [   397     (1, "The cat sat on the mat"),   398     (2, "Every good boy deserves football"),   399     (13, "One good turn deserves another"),   400     (14, "Every man for himself"),   401     (25, "Red sky at night shepherd's delight"),   402     (36, "She sells sea shells on the sea shore")   403     ]   404    405 doc_tests = [   406     ("Every", 2, [(2, [0]), (14, [0])]),   407     ("good", 2, [(2, [1]), (13, [1])]),   408     ("deserves", 2, [(2, [3]), (13, [3])]),   409     ("sea", 2, [(36, [2, 6])])   410     ]   411    412 position_tests = [   413     ("Every", 14, [0]),   414     ("sea", 36, [2, 6]),   415     ("shells", 1, None),   416     ("shells", 37, None)   417     ]   418    419 phrase_tests = [   420     (["good", "boy"], [(2, [1, 2])]),   421     (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),   422     (["sea", "shore"], [(36, [6, 7])])   423     ]   424    425 index = Index("test_index", 3, 2, 3, 6)   426 wi = index.get_writer()   427 for docnum, text in docs:   428     doc = Document(docnum)   429     for position, term in enumerate(text.split()):   430         doc.add_position(term, position)   431     doc.add_field(123, text)   432     wi.add_document(doc)   433 wi.close()   434    435 rd = index.get_reader()   436    437 print "- (Test searching.)"   438    439 for term, frequency, doc_positions in doc_tests:   440     dp = list(rd.find_positions(term))   441     print doc_positions == dp, doc_positions, dp   
442     fr = rd.get_frequency(term)   443     print frequency == fr, frequency, fr   444    445 print "- (Test fields.)"   446    447 for docnum, text in docs:   448     df = dict(rd.get_fields(docnum))   449     print df[123] == text, text, df[123]   450    451 print "- (Test navigation.)"   452    453 for term, docnum, positions in position_tests:   454     dp = rd.find_positions(term)   455     pos = dp.from_document(docnum)   456     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   457    458 print "- (Test phrases.)"   459    460 for terms, results in phrase_tests:   461     res = list(rd.find_common_positions(terms))   462     print results == res, results, res   463    464 index.close()   465    466 print "- Test index updates."   467    468 index = Index("test_index")   469 index2 = Index("test_index2", 3, 2, 3, 6)   470 wi = index2.get_writer()   471 for docnum, text in docs:   472    473     # Add the same documents but with different numbers.   474    475     doc = Document(docnum + 100)   476     for position, term in enumerate(text.split()):   477         doc.add_position(term, position)   478     doc.add_field(123, text)   479     wi.add_document(doc)   480 wi.close()   481    482 index2.update([index])   483 index.close()   484    485 rd = index2.get_reader()   486 for term, frequency, doc_positions in doc_tests:   487    488     # Add the extra documents to the expected result.   
489    490     orig_doc_positions = doc_positions   491     doc_positions = doc_positions[:]   492    493     for docnum, positions in orig_doc_positions:   494         doc_positions.append((docnum + 100, positions))   495     frequency *= 2   496    497     dp = list(rd.find_positions(term))   498     print doc_positions == dp, doc_positions, dp   499     fr = rd.get_frequency(term)   500     print frequency == fr, frequency, fr   501 index2.close()   502    503 print "- (Test update of an empty index.)"   504    505 index = Index("test_index")   506 index3 = Index("test_index3")   507 index3.update([index])   508 index.close()   509    510 rd = index3.get_reader()   511 for term, frequency, doc_positions in doc_tests:   512     dp = list(rd.find_positions(term))   513     print doc_positions == dp, doc_positions, dp   514     fr = rd.get_frequency(term)   515     print frequency == fr, frequency, fr   516 index3.close()   517    518 # vim: tabstop=4 expandtab shiftwidth=4