iixr

test.py

28:147f0472ed01
2009-09-08 Paul Boddie Introduced a Document class which is instantiated in order to present data to the IndexWriter in a more efficient way through the add_document method, thus rendering the commit_document method obsolete. Reduced the data flushing threshold to a more reasonable size. Fixed maximum offset definitions in cases where datasets are empty.
     1 #!/usr/bin/env python     2      3 import iixr     4 import os     5      6 # Remove old test files.     7      8 for filename in ("test", "testF", "testFI", "testI", "testP"):     9     try:    10         os.remove(filename)    11     except OSError:    12         pass    13     14 try:    15     for filename in os.listdir("test_index"):    16         os.remove(os.path.join("test_index", filename))    17     os.rmdir("test_index")    18 except OSError:    19     pass    20     21 # Test basic data types.    22     23 numbers = [12345678, 0, 1, 127, 128, 255, 256]    24     25 f = open("test", "wb")    26 w = iixr.FileWriter(f)    27 for number in numbers:    28     w.write_number(number)    29 w.close()    30     31 f = open("test", "rb")    32 r = iixr.FileReader(f)    33 for number in numbers:    34     n = r.read_number()    35     print number == n, number, n    36 r.close()    37     38 # Test positions.    39     40 all_doc_positions = [    41     [    42         (123, [1, 3, 5, 15, 25]),    43         (124, [0, 100]),    44         (125, [11, 99, 199]),    45         (130, [77, 78, 80, 82, 89])    46     ],    47     [    48         (78, [9]),    49         (196, [10, 11]),    50         (197, [17, 21, 30])    51     ]    52     ]    53     54 f = open("testP", "wb")    55 w = iixr.PositionWriter(f)    56 for doc_positions in all_doc_positions:    57     for docnum, positions in doc_positions:    58         w.write_positions(docnum, positions)    59     w.reset()    60 w.close()    61     62 f = open("testP", "rb")    63 r = iixr.PositionReader(f)    64 for doc_positions in all_doc_positions:    65     for docnum, positions in doc_positions:    66         d, p = r.read_positions()    67         print docnum == d, docnum, d    68         print positions == p, positions, p    69     r.reset()    70 r.close()    71     72 # Test position index files.    73     74 indexed_positions = [    75     [    76         (1234, 0, 100),    77         (2345, 700, 100),    78         (3456, 1900, 50)    79     ],    80     [    81         (4567, 2800, 20)    82     ]    83     ]    84     85 offsets = []    86 f = open("testPI", "wb")    87 w = iixr.PositionIndexWriter(f)    88 for term_positions in indexed_positions:    89     offset = None    90     doc_frequency = 0    91     w.reset()    92     for docnum, pos_offset, count in term_positions:    93         io = w.write_positions(docnum, pos_offset, count)    94         if offset is None:    95             offset = io    96         doc_frequency += count    97     offsets.append((offset, doc_frequency))    98 w.close()    99    100 f = open("testPI", "rb")   101 r = iixr.PositionIndexReader(f)   102 offsets.reverse()   103 indexed_positions.reverse()   104 for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):   105     found_positions = r.read_term_positions(offset, doc_frequency)   106     for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):   107         print docnum == dn, docnum, dn   108         print pos_offset == po, pos_offset, po   109         print count == c, count, c   110 r.close()   111    112 # Test position dictionaries.   113    114 f = open("testP", "wb")   115 w = iixr.PositionWriter(f)   116 f2 = open("testPI", "wb")   117 w2 = iixr.PositionIndexWriter(f2)   118 wd = iixr.PositionDictionaryWriter(w, w2, 2)   119 offsets = []   120 for doc_positions in all_doc_positions:   121     offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)   122     offsets.append((offset, doc_frequency))   123 wd.close()   124    125 f = open("testP", "rb")   126 r = iixr.PositionReader(f)   127 f2 = open("testPI", "rb")   128 r2 = iixr.PositionIndexReader(f2)   129 rd = iixr.PositionDictionaryReader(r, r2)   130 offsets.reverse()   131 all_doc_positions.reverse()   132 for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):   133     dp = list(rd.read_term_positions(offset, doc_frequency))   134     print doc_positions == dp, doc_positions, dp   135 rd.close()   136    137 # Test fields.   138    139 doc_fields = [   140     (123, ["testing", "fields", "stored", "compressed"]),   141     (456, ["fields", "for a second", "document"]),   142     (789, ["field value"]),   143     (1234, []),   144     (2345, ["abc", "def"]),   145     (3456, ["apple", "banana", "cherry"]),   146     (4567, ["drue", "eple"])   147     ]   148    149 f = open("testF", "wb")   150 w = iixr.FieldWriter(f)   151 for docnum, fields in doc_fields:   152     w.write_fields(docnum, list(enumerate(fields)))   153 w.close()   154    155 f = open("testF", "rb")   156 r = iixr.FieldReader(f)   157 for docnum, fields in doc_fields:   158     dn, df = r.read_fields()   159     print docnum == dn, docnum, dn   160     print list(enumerate(fields)) == df, list(enumerate(fields)), df   161 r.close()   162    163 # Test field index files.   164    165 indexed_docs = [   166     (123, 100000987),   167     (456, 100004321),   168     (789, 100008765)   169     ]   170    171 f = open("testFI", "wb")   172 w = iixr.FieldIndexWriter(f)   173 for docnum, offset in indexed_docs:   174     w.write_document(docnum, offset)   175 w.close()   176    177 f = open("testFI", "rb")   178 r = iixr.FieldIndexReader(f)   179 for docnum, offset in indexed_docs:   180     dn, o = r.read_document()   181     print docnum == dn, docnum, dn   182     print offset == o, offset, o   183 r.close()   184    185 # Test field dictionaries.   186    187 f = open("testF", "wb")   188 w = iixr.FieldWriter(f)   189 f2 = open("testFI", "wb")   190 w2 = iixr.FieldIndexWriter(f2)   191 wd = iixr.FieldDictionaryWriter(w, w2, 3)   192 for docnum, fields in doc_fields:   193     wd.write_fields(docnum, list(enumerate(fields)))   194 wd.close()   195    196 f = open("testF", "rb")   197 r = iixr.FieldReader(f)   198 f2 = open("testFI", "rb")   199 r2 = iixr.FieldIndexReader(f2)   200 rd = iixr.FieldDictionaryReader(r, r2)   201 doc_fields_reversed = doc_fields[:]   202 doc_fields_reversed.reverse()   203 for docnum, fields in doc_fields_reversed:   204     df = dict(rd.get_fields(docnum))   205     print dict(enumerate(fields)) == df, dict(enumerate(fields)), df   206 for docnum in (13579, 246810):   207     df = rd.get_fields(docnum)   208     print df is None, df   209    210 # (Test sequential access.)   211    212 rd.rewind()   213 for docnum, fields in doc_fields:   214     dn, df = rd.read_fields()   215     print docnum == dn, docnum, dn   216     print list(enumerate(fields)) == df, list(enumerate(fields)), df   217 rd.close()   218    219 # Test terms.   220    221 terms = [   222     # term       offset      frequency  doc_frequency   223     ("aardvark",  100000123,  1,         1),   224     ("anteater",  100000456,  2,         1),   225     ("badger",    100000789, 13,         7),   226     ("bull",     1000001234, 59,        17),   227     ("bulldog",  1000002345, 99,        80),   228     ("cat",      1000003456, 89,        28)   229     ]   230    231 f = open("test", "wb")   232 w = iixr.TermWriter(f)   233 for term, offset, frequency, doc_frequency in terms:   234     w.write_term(term, offset, frequency, doc_frequency)   235 w.close()   236    237 f = open("test", "rb")   238 r = iixr.TermReader(f)   239 for term, offset, frequency, doc_frequency in terms:   240     t, o, fr, df = r.read_term()   241     print term == t, term, t   242     print offset == o, offset, o   243     print frequency == fr, frequency, fr   244     print doc_frequency == df, doc_frequency, df   245 r.close()   246    247 # Test terms in index files.   248    249 indexed_terms = [   250     # term       offset      frequency  doc_frequency   info_offset   251     ("aardvark",  100000123,  1,         1,             200000321),   252     ("anteater",  100000456,  2,         1,             200000654),   253     ("badger",    100000789, 13,         7,             200000987),   254     ("bull",     1000001234, 59,        17,             200004321),   255     ("bulldog",  1000002345, 99,        80,             200005432),   256     ("cat",      1000003456, 89,        28,             200006543)   257     ]   258    259 f = open("test", "wb")   260 w = iixr.TermIndexWriter(f)   261 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   262     w.write_term(term, offset, frequency, doc_frequency, info_offset)   263 w.close()   264    265 f = open("test", "rb")   266 r = iixr.TermIndexReader(f)   267 for term, offset, frequency, doc_frequency, info_offset in indexed_terms:   268     t, o, fr, df, i = r.read_term()   269     print term == t, term, t   270     print offset == o, offset, o   271     print frequency == fr, frequency, fr   272     print doc_frequency == df, doc_frequency, df   273     print info_offset == i, info_offset, i   274 r.close()   275    276 # Test dictionaries with only term data.   277    278 f = open("test", "wb")   279 w = iixr.TermWriter(f)   280 f2 = open("testI", "wb")   281 w2 = iixr.TermIndexWriter(f2)   282 f3 = open("testP", "wb")   283 w3 = iixr.PositionWriter(f3)   284 f4 = open("testPI", "wb")   285 w4 = iixr.PositionIndexWriter(f4)   286 wp = iixr.PositionDictionaryWriter(w3, w4, 2)   287 wd = iixr.TermDictionaryWriter(w, w2, wp, 3)   288 for term, offset, frequency, doc_frequency in terms:   289     wd._write_term(term, offset, frequency, doc_frequency)   290 wd.close()   291    292 f = open("test", "rb")   293 r = iixr.TermReader(f)   294 f2 = open("testI", "rb")   295 r2 = iixr.TermIndexReader(f2)   296 f3 = open("testP", "rb")   297 r3 = iixr.PositionReader(f3)   298 f4 = open("testPI", "rb")   299 r4 = iixr.PositionIndexReader(f4)   300 rp = iixr.PositionDictionaryReader(r3, r4)   301 rd = iixr.TermDictionaryReader(r, r2, rp)   302 terms_reversed = terms[:]   303 terms_reversed.reverse()   304 for term, offset, frequency, doc_frequency in terms_reversed:   305     o, fr, df = rd._find_term(term)   306     print offset == o, offset, o   307     print frequency == fr, frequency, fr   308     print doc_frequency == df, doc_frequency, df   309 for term in ("dog", "dingo"):   310     t = rd._find_term(term)   311     print t is None, t   312    313 # (Test term prefix searching.)   314    315 print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]   316 print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]   317 print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]   318 print rd.find_terms("d") == [], rd.find_terms("d"), []   319 rd.close()   320    321 # Test dictionaries with term and position data.   322    323 terms_with_positions = [   324     ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),   325     ("anteater",  [(1, [43, 44])]),   326     ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),   327     ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),   328     ("bulldog",   [(43, [17, 19, 256, 512])]),   329     ("cat",       [(123, [12, 145, 196]), (1200, [113])])   330     ]   331    332 position_dict_tests = [   333     ("badger", 19, [55, 1333]),   334     ("badger", 20, None),   335     ("bull", 6, [128]),   336     ("bull", 26, [1, 3, 5, 7, 9]),   337     ("cat", 111, None),   338     ("cat", 123, [12, 145, 196]),   339     ("cat", 1234, None)   340     ]   341    342 f = open("test", "wb")   343 w = iixr.TermWriter(f)   344 f2 = open("testI", "wb")   345 w2 = iixr.TermIndexWriter(f2)   346 f3 = open("testP", "wb")   347 w3 = iixr.PositionWriter(f3)   348 f4 = open("testPI", "wb")   349 w4 = iixr.PositionIndexWriter(f4)   350 wp = iixr.PositionDictionaryWriter(w3, w4, 2)   351 wd = iixr.TermDictionaryWriter(w, w2, wp, 3)   352 for term, doc_positions in terms_with_positions:   353     wd.write_term_positions(term, doc_positions)   354 wd.close()   355    356 f = open("test", "rb")   357 r = iixr.TermReader(f)   358 f2 = open("testI", "rb")   359 r2 = iixr.TermIndexReader(f2)   360 f3 = open("testP", "rb")   361 r3 = iixr.PositionReader(f3)   362 f4 = open("testPI", "rb")   363 r4 = iixr.PositionIndexReader(f4)   364 rp = iixr.PositionDictionaryReader(r3, r4)   365 rd = iixr.TermDictionaryReader(r, r2, rp)   366 terms_reversed = terms_with_positions[:]   367 terms_reversed.reverse()   368 for term, doc_positions in terms_reversed:   369     dp = list(rd.find_positions(term))   370     print doc_positions == dp, doc_positions, dp   371 for term in ("aaa", "dog", "dingo"):   372     dp = rd.find_positions(term)   373     print dp is None, dp   374    375 # (Test iterators.)   376    377 for term, docnum, positions in position_dict_tests:   378     dp = rd.find_positions(term)   379     pos = dp.from_document(docnum)   380     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   381    382 # (Test sequential access.)   383    384 rd.rewind()   385 for term, doc_positions in terms_with_positions:   386     t, fr, df, dp = rd.read_term()   387     dp = list(dp)   388     print term == t, term, t   389     print doc_positions == dp, doc_positions, dp   390 rd.close()   391    392 # Test high-level index operations (including merging).   393    394 docs = [   395     (1, "The cat sat on the mat"),   396     (2, "Every good boy deserves football"),   397     (13, "One good turn deserves another"),   398     (14, "Every man for himself"),   399     (25, "Red sky at night shepherd's delight"),   400     (36, "She sells sea shells on the sea shore")   401     ]   402    403 doc_tests = [   404     ("Every", 2, [(2, [0]), (14, [0])]),   405     ("good", 2, [(2, [1]), (13, [1])]),   406     ("deserves", 2, [(2, [3]), (13, [3])]),   407     ("sea", 2, [(36, [2, 6])])   408     ]   409    410 position_tests = [   411     ("Every", 14, [0]),   412     ("sea", 36, [2, 6]),   413     ("shells", 1, None),   414     ("shells", 37, None)   415     ]   416    417 index = iixr.Index("test_index")   418 wi = index.get_writer(3, 2, 6)   419 for docnum, text in docs:   420     doc = iixr.Document(docnum)   421     for position, term in enumerate(text.split()):   422         doc.add_position(term, position)   423     doc.add_field(123, text)   424     wi.add_document(doc)   425 wi.close()   426    427 rd = index.get_reader()   428 for term, frequency, doc_positions in doc_tests:   429     dp = list(rd.find_positions(term))   430     print doc_positions == dp, doc_positions, dp   431     fr = rd.get_frequency(term)   432     print frequency == fr, frequency, fr   433 for docnum, text in docs:   434     df = dict(rd.get_fields(docnum))   435     print df[123] == text, text, df[123]   436 for term, docnum, positions in position_tests:   437     dp = rd.find_positions(term)   438     pos = dp.from_document(docnum)   439     print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos   440 index.close()   441    442 # vim: tabstop=4 expandtab shiftwidth=4