iixr (file test.py at 571be37961d8)

     1 #!/usr/bin/env python     2      3 from iixr.files import *     4 from iixr.fields import *     5 from iixr.terms import *     6 from iixr.positions import *     7 from iixr.index import *     8 import os     9     10 # Remove old test files.    11     12 for filename in ("test", "testF", "testFI", "testI", "testP"):    13     try:    14         os.remove(filename)    15     except OSError:    16         pass    17     18 try:    19     for filename in os.listdir("test_index"):    20         os.remove(os.path.join("test_index", filename))    21     os.rmdir("test_index")    22 except OSError:    23     pass    24     25 # Test basic data types.    26     27 numbers = [12345678, 0, 1, 127, 128, 255, 256]    28     29 f = open("test", "wb")    30 w = FileWriter(f)    31 for number in numbers:    32     w.write_number(number)    33 w.close()    34     35 f = open("test", "rb")    36 r = FileReader(f)    37 for number in numbers:    38     n = r.read_number()    39     print number == n, number, n    40 r.close()    41     42 # Test positions.    43     44 all_doc_positions = [    45     [    46         (123, [1, 3, 5, 15, 25]),    47         (124, [0, 100]),    48         (125, [11, 99, 199]),    49         (130, [77, 78, 80, 82, 89])    50     ],    51     [    52         (78, [9]),    53         (196, [10, 11]),    54         (197, [17, 21, 30])    55     ]    56     ]    57     58 f = open("testP", "wb")    59 w = PositionWriter(f)    60 for doc_positions in all_doc_positions:    61     for docnum, positions in doc_positions:    62         w.write_positions(docnum, positions)    63     w.reset()    64 w.close()    65     66 f = open("testP", "rb")    67 r = PositionIterator(f, 0, None)    68 for doc_positions in all_doc_positions:    69     for docnum, positions in doc_positions:    70         d, p = r.read_positions()    71         print docnum == d, docnum, d    72         print positions == p, positions, p    73     r.reset()    74 r.close()    75     76 # Test position index files.    77     78 indexed_positions = [    79     [    80         (1234, 0, 100),    81         (2345, 700, 100),    82         (3456, 1900, 50)    83     ],    84     [    85         (4567, 2800, 20)    86     ]    87     ]    88     89 offsets = []    90 f = open("testPI", "wb")    91 w = PositionIndexWriter(f)    92 for term_positions in indexed_positions:    93     offset = None    94     doc_frequency = 0    95     w.reset()    96     for docnum, pos_offset, count in term_positions:    97         io = w.write_positions(docnum, pos_offset, count)    98         if offset is None:    99             offset = io   100         doc_frequency += count   101     offsets.append((offset, doc_frequency))   102 w.close()   103    104 r = PositionIndexOpener("testPI")   105 offsets.reverse()   106 indexed_positions.reverse()   107 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):   108     found_positions = r.read_term_positions(offset, doc_frequency)   109     for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):   110         print docnum == dn, docnum, dn   111         print pos_offset == po, pos_offset, po   112         print count == c, count, c   113 r.close()   114    115 # Test position dictionaries.   116    117 f = open("testP", "wb")   118 w = PositionWriter(f)   119 f2 = open("testPI", "wb")   120 w2 = PositionIndexWriter(f2)   121 wd = PositionDictionaryWriter(w, w2, 2)   122 offsets = []   123 for doc_positions in all_doc_positions:   124     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)   125     offsets.append((offset, doc_frequency))   126 wd.close()   127    128 r = PositionOpener("testP")   129 r2 = PositionIndexOpener("testPI")   130 rd = PositionDictionaryReader(r, r2)   131 offsets.reverse()   132 all_doc_positions.reverse()   133 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):   134     dp = list(rd.read_term_positions(offset, doc_frequency))   135     print doc_positions == dp, doc_positions, dp   136 rd.close()   137    138 # Test fields.   139    140 doc_fields = [   141     (123, ["testing", "fields", "stored", "compressed"]),   142     (456, ["fields", "for a second", "document"]),   143     (789, ["field value"]),   144     (1234, []),   145     (2345, ["abc", "def"]),   146     (3456, ["apple", "banana", "cherry"]),   147     (4567, ["drue", "eple"])   148     ]   149    150 f = open("testF", "wb")   151 w = FieldWriter(f)   152 for docnum, fields in doc_fields:   153     w.write_fields(docnum, list(enumerate(fields)))   154 w.close()   155    156 f = open("testF", "rb")   157 r = FieldReader(f)   158 for docnum, fields in doc_fields:   159     dn, df = r.read_fields()   160     print docnum == dn, docnum, dn   161     print list(enumerate(fields)) == df, list(enumerate(fields)), df   162 r.close()   163    164 # Test field index files.   165    166 indexed_docs = [   167     (123, 100000987),   168     (456, 100004321),   169     (789, 100008765)   170     ]   171    172 f = open("testFI", "wb")   173 w = FieldIndexWriter(f)   174 for docnum, offset in indexed_docs:   175     w.write_document(docnum, offset)   176 w.close()   177    178 f = open("testFI", "rb")   179 r = FieldIndexReader(f)   180 for docnum, offset in indexed_docs:   181     dn, o = r.read_document()   182     print docnum == dn, docnum, dn   183     print offset == o, offset, o   184 r.close()   185    186 # Test field dictionaries.   187    188 f = open("testF", "wb")   189 w = FieldWriter(f)   190 f2 = open("testFI", "wb")   191 w2 = FieldIndexWriter(f2)   192 wd = FieldDictionaryWriter(w, w2, 3)   193 for docnum, fields in doc_fields:   194     wd.write_fields(docnum, list(enumerate(fields)))   195 wd.close()   196    197 f = open("testF", "rb")   198 r = FieldReader(f)   199 f2 = open("testFI", "rb")   200 r2 = FieldIndexReader(f2)   201 rd = FieldDictionaryReader(r, r2)   202 doc_fields_reversed = doc_fields[:]   203 doc_fields_reversed.reverse()   204 for docnum, fields in doc_fields_reversed:   205     df = dict(rd.get_fields(docnum))   206     print dict(enumerate(fields)) == df, dict(enumerate(fields)), df   207 for docnum in (13579, 246810):   208     df = rd.get_fields(docnum)   209     print df is None, df   210    211 # (Test sequential access.)   212    213 rd.rewind()   214 for docnum, fields in doc_fields:   215     dn, df = rd.read_fields()   216     print docnum == dn, docnum, dn   217     print list(enumerate(fields)) == df, list(enumerate(fields)), df   218 rd.close()   219    220 # Test terms.   221    222 terms = [   223     # term       offset      frequency  doc_frequency   224     ("aardvark",  100000123,  1,         1),   225     ("anteater",  100000456,  2,         1),   226     ("badger",    100000789, 13,         7),   227     ("bull",     1000001234, 59,        17),   228     ("bulldog",  1000002345, 99,        80),   229     ("cat",      1000003456, 89,        28)   230     ]   231    232 f = open("test", "wb")   233 w = TermWriter(f)   234 for term, offset, frequency, doc_frequency in terms:   235     w.write_term(term, offset, frequency, doc_frequency)   236 w.close()   237    238 f = open("test", "rb")   239 r = TermReader(f)   240 for term, offset, frequency, doc_frequency in terms:   241     t, o, fr, df = r.read_term()   242     print term == t, term, t   243     print offset == o, offset, o   244     print frequency == fr, frequency, fr   245     print doc_frequency == df, doc_frequency, df   246 r.close()   247    248 # Test terms in index files.   249    250 indexed_terms = [   251     # term       offset      frequency  doc_frequency   info_offset   252     ("aardvark",  100000123,  1,         1,             200000321),   253     ("anteater",  100000456,  2,         1,             200000654),   254     ("badger",    100000789, 13,         7,             200000987),   255     ("bull",     1000001234, 59,        17,             200004321),   256     ("bulldog",  1000002345, 99,        80,             200005432),   257     ("cat",      1000003456, 89,        28,             200006543)   258     ]   259    260 f = open("test", "wb")   261 w = TermIndexWriter(f)   262 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   263     w.write_term(term, offset, frequency, doc_frequency, info_offset)   264 w.close()   265    266 f = open("test", "rb")   267 r = TermIndexReader(f)   268 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   269     t, o, fr, df, i = r.read_term()   270     print term == t, term, t   271     print offset == o, offset, o   272     print frequency == fr, frequency, fr   273     print doc_frequency == df, doc_frequency, df   274     print info_offset == i, info_offset, i   275 r.close()   276    277 # Test dictionaries with only term data.   278    279 f = open("test", "wb")   280 w = TermWriter(f)   281 f2 = open("testI", "wb")   282 w2 = TermIndexWriter(f2)   283 f3 = open("testP", "wb")   284 w3 = PositionWriter(f3)   285 f4 = open("testPI", "wb")   286 w4 = PositionIndexWriter(f4)   287 wp = PositionDictionaryWriter(w3, w4, 2)   288 wd = TermDictionaryWriter(w, w2, wp, 3)   289 for term, offset, frequency, doc_frequency in terms:   290     wd._write_term(term, offset, frequency, doc_frequency)   291 wd.close()   292    293 f = open("test", "rb")   294 r = TermReader(f)   295 f2 = open("testI", "rb")   296 r2 = TermIndexReader(f2)   297 r3 = PositionOpener("testP")   298 r4 = PositionIndexOpener("testPI")   299 rp = PositionDictionaryReader(r3, r4)   300 rd = TermDictionaryReader(r, r2, rp)   301 terms_reversed = terms[:]   302 terms_reversed.reverse()   303 for term, offset, frequency, doc_frequency in terms_reversed:   304     o, fr, df = rd._find_term(term)   305     print offset == o, offset, o   306     print frequency == fr, frequency, fr   307     print doc_frequency == df, doc_frequency, df   308 for term in ("dog", "dingo"):   309     t = rd._find_term(term)   310     print t is None, t   311    312 # (Test term prefix searching.)   313    314 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]   315 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]   316 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]   317 print rd.find_terms("d") == [], rd.find_terms("d"), []   318 rd.close()   319    320 # Test dictionaries with term and position data.   321    322 terms_with_positions = [   323     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),   324     ("anteater",  [(1, [43, 44])]),   325     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),   326     ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),   327     ("bulldog",   [(43, [17, 19, 256, 512])]),   328     ("cat",       [(123, [12, 145, 196]), (1200, [113])])   329     ]   330    331 position_dict_tests = [   332     ("badger", 19, [55, 1333]),   333     ("badger", 20, None),   334     ("bull", 6, [128]),   335     ("bull", 26, [1, 3, 5, 7, 9]),   336     ("cat", 111, None),   337     ("cat", 123, [12, 145, 196]),   338     ("cat", 1234, None)   339     ]   340    341 f = open("test", "wb")   342 w = TermWriter(f)   343 f2 = open("testI", "wb")   344 w2 = TermIndexWriter(f2)   345 f3 = open("testP", "wb")   346 w3 = PositionWriter(f3)   347 f4 = open("testPI", "wb")   348 w4 = PositionIndexWriter(f4)   349 wp = PositionDictionaryWriter(w3, w4, 2)   350 wd = TermDictionaryWriter(w, w2, wp, 3)   351 for term, doc_positions in terms_with_positions:   352     wd.write_term_positions(term, doc_positions)   353 wd.close()   354    355 f = open("test", "rb")   356 r = TermReader(f)   357 f2 = open("testI", "rb")   358 r2 = TermIndexReader(f2)   359 r3 = PositionOpener("testP")   360 r4 = PositionIndexOpener("testPI")   361 rp = PositionDictionaryReader(r3, r4)   362 rd = TermDictionaryReader(r, r2, rp)   363 terms_reversed = terms_with_positions[:]   364 terms_reversed.reverse()   365 for term, doc_positions in terms_reversed:   366     dp = list(rd.find_positions(term))   367     print doc_positions == dp, doc_positions, dp   368 for term in ("aaa", "dog", "dingo"):   369     dp = rd.find_positions(term)   370     print dp is None, dp   371    372 # (Test iterators.)   373    374 for term, docnum, positions in position_dict_tests:   375     dp = rd.find_positions(term)   376     pos = dp.from_document(docnum)   377     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   378    379 # (Test sequential access.)   380    381 rd.rewind()   382 for term, doc_positions in terms_with_positions:   383     t, fr, df, dp = rd.read_term()   384     dp = list(dp)   385     print term == t, term, t   386     print doc_positions == dp, doc_positions, dp   387 rd.close()   388    389 # Test high-level index operations (including merging).   390    391 docs = [   392     (1, "The cat sat on the mat"),   393     (2, "Every good boy deserves football"),   394     (13, "One good turn deserves another"),   395     (14, "Every man for himself"),   396     (25, "Red sky at night shepherd's delight"),   397     (36, "She sells sea shells on the sea shore")   398     ]   399    400 doc_tests = [   401     ("Every", 2, [(2, [0]), (14, [0])]),   402     ("good", 2, [(2, [1]), (13, [1])]),   403     ("deserves", 2, [(2, [3]), (13, [3])]),   404     ("sea", 2, [(36, [2, 6])])   405     ]   406    407 position_tests = [   408     ("Every", 14, [0]),   409     ("sea", 36, [2, 6]),   410     ("shells", 1, None),   411     ("shells", 37, None)   412     ]   413    414 index = Index("test_index")   415 wi = index.get_writer(3, 2, 6)   416 for docnum, text in docs:   417     doc = Document(docnum)   418     for position, term in enumerate(text.split()):   419         doc.add_position(term, position)   420     doc.add_field(123, text)   421     wi.add_document(doc)   422 wi.close()   423    424 rd = index.get_reader()   425 for term, frequency, doc_positions in doc_tests:   426     dp = list(rd.find_positions(term))   427     print doc_positions == dp, doc_positions, dp   428     fr = rd.get_frequency(term)   429     print frequency == fr, frequency, fr   430 for docnum, text in docs:   431     df = dict(rd.get_fields(docnum))   432     print df[123] == text, text, df[123]   433 for term, docnum, positions in position_tests:   434     dp = rd.find_positions(term)   435     pos = dp.from_document(docnum)   436     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   437 index.close()   438    439 # vim: tabstop=4 expandtab shiftwidth=4