1 #!/usr/bin/env python 2 # encoding: iso-8859-1 3 4 from iixr.files import * 5 from iixr.terms import * 6 from iixr.index import * 7 import os, sys 8 9 # Remove old test files. 10 11 for filename in ("test", "testMS", "testNMS", "testP", "testP2"): 12 try: 13 os.remove(filename) 14 except OSError: 15 pass 16 17 try: 18 for dirname in ("test_index",): 19 for filename in os.listdir(dirname): 20 os.remove(os.path.join(dirname, filename)) 21 os.rmdir(dirname) 22 except OSError: 23 pass 24 25 if "clean" in sys.argv: 26 sys.exit(0) 27 28 print "- Test basic data types." 29 30 numbers = [12345678, 0, 1, 127, 128, 255, 256] 31 32 f = open("test", "wb") 33 w = FileWriter(f) 34 w.begin_record() 35 for number in numbers: 36 w.write_number(number) 37 w.end_record() 38 w.close() 39 40 f = open("test", "rb") 41 r = FileReader(f) 42 r.begin_record() 43 for number in numbers: 44 n = r.read_number() 45 print number == n, number, n 46 r.end_record() 47 r.close() 48 49 tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] 50 51 f = open("testMS", "wb") 52 w = FileWriter(f) 53 w.begin_record() 54 w.write_monotonic_sequence(tuples, 2) 55 w.end_record() 56 w.close() 57 58 f = open("testMS", "rb") 59 r = FileReader(f) 60 r.begin_record() 61 for t, t2 in zip(r.read_monotonic_sequence(2), tuples): 62 print t == t2, t, t2 63 r.end_record() 64 r.close() 65 66 tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] 67 68 f = open("testNMS", "wb") 69 w = FileWriter(f) 70 w.begin_record() 71 w.write_delta_sequence(tuples2, 2) 72 w.end_record() 73 w.close() 74 75 f = open("testNMS", "rb") 76 r = FileReader(f) 77 r.begin_record() 78 for t, t2 in zip(r.read_delta_sequence(2), tuples2): 79 print t == t2, t, t2 80 r.end_record() 81 r.close() 82 83 print "- Test positions." 84 85 all_doc_positions = [ 86 [ 87 (123, [1, 3, 5, 15, 25]), 88 (124, [0, 100]), 89 (125, [11, 99, 199]), 90 (130, [77, 78, 80, 82, 89]) 91 ], 92 [ 93 (78, [9]), 94 (196, [10, 11]), 95 (197, [17, 21, 30]) 96 ] 97 ] 98 99 f = open("testP", "wb") 100 w = TermWriter(f) 101 w.begin(0, 0) 102 for doc_positions in all_doc_positions: 103 w.write_positions(doc_positions) 104 w.end_record() 105 w.close() 106 107 f = open("testP", "rb") 108 r = TermReader(f) 109 for doc_positions in all_doc_positions: 110 r.begin_record() 111 dp = r.read_positions() 112 print doc_positions == dp, doc_positions 113 print " ", dp 114 r.close() 115 116 all_doc_positions_seq = [ 117 [ 118 ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), 119 ((124, 1), [(0, 0), (100, 350)]), 120 ((124, 2), [(11, 38), (99, 379), (199, 720)]), 121 ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) 122 ], 123 [ 124 ((78, 1), [(9, 19)]), 125 ((196, 0), [(10, 27), (11, 29)]), 126 ((196, 1), [(17, 46), (21, 52), (30, 60)]) 127 ] 128 ] 129 130 f = open("testP2", "wb") 131 w = TermWriter(f) 132 w.begin(2, 2) 133 for doc_positions in all_doc_positions_seq: 134 w.write_positions(doc_positions) 135 w.end_record() 136 w.close() 137 138 f = open("testP2", "rb") 139 r = TermReader(f) 140 for doc_positions in all_doc_positions_seq: 141 r.begin_record() 142 dp = r.read_positions() 143 print doc_positions == dp, doc_positions 144 print " ", dp 145 r.close() 146 147 print "- Test dictionaries with term and position data." 148 149 terms_with_positions = [ 150 ("aardvark", [(1, [2, 45, 96]), (20, [13])]), 151 ("anteater", [(1, [43, 44])]), 152 ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), 153 (u"bj?rn", [(11, [19, 54])]), 154 ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), 155 ("bulldog", [(43, [17, 19, 256, 512])]), 156 ("cat", [(123, [12, 145, 196]), (1200, [113])]), 157 (u"?", [(15, [384])]), 158 ] 159 160 f = open("test", "wb") 161 w = TermWriter(f) 162 w.begin(0, 0) 163 w.write_terms(terms_with_positions) 164 w.close() 165 166 f = open("test", "rb") 167 r = TermIterator(f) 168 for (term, doc_positions), (t, dp) in zip(terms_with_positions, r): 169 print term == t, term, t 170 print doc_positions == dp, doc_positions 171 print " ", dp 172 r.close() 173 174 f = open("test", "rb") 175 r = TermDataIterator(f) 176 for (term, doc_positions), (t, data) in zip(terms_with_positions, r): 177 print term == t, term, t, data 178 r.close() 179 180 print "- Test high-level index operations." 181 182 docs = [ 183 (1, "The cat sat on the mat"), 184 (2, "Every good boy deserves football"), 185 (13, "One good turn deserves another"), 186 (14, "Every man for himself"), 187 (25, "Red sky at night shepherd's delight"), 188 (36, "She sells sea shells on the sea shore") 189 ] 190 191 index = Index("test_index", 3) 192 wi = index.get_writer() 193 for docnum, text in docs: 194 doc = Document(docnum) 195 for position, term in enumerate(text.split()): 196 doc.add_position(term, position) 197 wi.add_document(doc) 198 wi.close() 199 200 print "- Test merge." 201 202 l1 = list(index.get_reader()) 203 index.merge(3) 204 l2 = list(index.get_reader(1)) 205 206 for (t1, dp1), (t2, dp2) in zip(l1, l2): 207 print t1 == t2, t1, t2 208 print dp1 == dp1, dp1 209 print " ", dp2 210 211 index.close() 212 213 # vim: tabstop=4 expandtab shiftwidth=4