#!/usr/bin/env python

"""
Round-trip checks for the iixr file, field, term, position and index classes.

Each check writes sample data, reads it back and prints the outcome of every
comparison followed by the values that were compared.
"""

from iixr.files import *
from iixr.fields import *
from iixr.terms import *
from iixr.positions import *
from iixr.index import *
import os

# Remove old test files.
# "testPI" is included because the position index tests further down in this
# script create it alongside the other scratch files.

for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
    try:
        os.remove(filename)
    except OSError:
        # Best effort: the file may simply not exist yet.
        pass

# Empty and remove the index directory left behind by a previous run. Any
# failure (for instance, the directory not existing) abandons the clean-up.

try:
    for filename in os.listdir("test_index"):
        os.remove(os.path.join("test_index", filename))
    os.rmdir("test_index")
except OSError:
    pass

# Test basic data types.
# The sample includes zero and the values either side of 127/128 and 255/256
# (presumably to exercise size boundaries in the number encoding; confirm
# against FileWriter).

numbers = [12345678, 0, 1, 127, 128, 255, 256]

f = open("test", "wb")
w = FileWriter(f)
for number in numbers:
    w.write_number(number)
w.close()

f = open("test", "rb")
r = FileReader(f)
for number in numbers:
    n = r.read_number()
    print("%s %s %s" % (number == n, number, n))
r.close()

# Test positions.
# Each inner list holds (document number, positions) pairs for one group.

all_doc_positions = [
    [
        (123, [1, 3, 5, 15, 25]),
        (124, [0, 100]),
        (125, [11, 99, 199]),
        (130, [77, 78, 80, 82, 89])
    ],
    [
        (78, [9]),
        (196, [10, 11]),
        (197, [17, 21, 30])
    ]
]

f = open("testP", "wb")
w = PositionWriter(f)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        w.write_positions(docnum, positions)
    # Reset once per group: the next group starts again from a lower document
    # number (78 after 130).
    w.reset()
w.close()

f = open("testP", "rb")
# NOTE(review): the meaning of the (0, None) arguments is not visible here;
# confirm against PositionIterator.
r = PositionIterator(f, 0, None)
for doc_positions in all_doc_positions:
    for docnum, positions in doc_positions:
        d, p = r.read_positions()
        print("%s %s %s" % (docnum == d, docnum, d))
        print("%s %s %s" % (positions == p, positions, p))
    # Mirror the writer by resetting the reader between groups.
    r.reset()
r.close()

# Test position index files.

# One inner list per term; each entry is (document number, position offset,
# count), matching the arguments given to write_positions below.
indexed_positions = [
    [
        (1234, 0, 100),
        (2345, 700, 100),
        (3456, 1900, 50)
    ],
    [
        (4567, 2800, 20)
    ]
]

# Write every group, remembering where its first entry was written and the
# running total of the counts written for the group.
offsets = []
index_file = open("testPI", "wb")
index_writer = PositionIndexWriter(index_file)
for term_positions in indexed_positions:
    first_offset = None
    total = 0
    index_writer.reset()
    for entry in term_positions:
        docnum, pos_offset, count = entry
        written_at = index_writer.write_positions(docnum, pos_offset, count)
        if first_offset is None:
            first_offset = written_at
        total += count
    offsets.append((first_offset, total))
index_writer.close()

# Read the groups back in the opposite order to that in which they were
# written, so that each lookup has to start from its recorded offset.
index_opener = PositionIndexOpener("testPI")
offsets.reverse()
indexed_positions.reverse()
for location, term_positions in zip(offsets, indexed_positions):
    offset, doc_frequency = location
    found_positions = index_opener.read_term_positions(offset, doc_frequency)
    for expected, found in zip(term_positions, found_positions):
        docnum, pos_offset, count = expected
        dn, po, c = found
        for want, got in ((docnum, dn), (pos_offset, po), (count, c)):
            print("%s %s %s" % (want == got, want, got))
index_opener.close()

# Test position dictionaries.

position_writer = PositionWriter(open("testP", "wb"))
index_writer = PositionIndexWriter(open("testPI", "wb"))
# NOTE(review): the trailing 2 is presumably an indexing interval; confirm
# against PositionDictionaryWriter.
wd = PositionDictionaryWriter(position_writer, index_writer, 2)
offsets = []
for doc_positions in all_doc_positions:
    # The middle element of the result (the frequency) is not needed here.
    result = wd.write_term_positions(doc_positions)
    offset, frequency, doc_frequency = result
    offsets.append((offset, doc_frequency))
wd.close()

position_opener = PositionOpener("testP")
index_opener = PositionIndexOpener("testPI")
rd = PositionDictionaryReader(position_opener, index_opener)
# As above, look the groups up in reverse order of writing.
offsets.reverse()
all_doc_positions.reverse()
for location, doc_positions in zip(offsets, all_doc_positions):
    offset, doc_frequency = location
    dp = list(rd.read_term_positions(offset, doc_frequency))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()

# Test fields.

# (document number, field values) pairs; one document has no fields at all.
# Fields are stored as (number, value) pairs produced by enumerate().
doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"])
]

field_writer = FieldWriter(open("testF", "wb"))
for docnum, fields in doc_fields:
    numbered_fields = list(enumerate(fields))
    field_writer.write_fields(docnum, numbered_fields)
field_writer.close()

field_reader = FieldReader(open("testF", "rb"))
for docnum, fields in doc_fields:
    dn, df = field_reader.read_fields()
    expected = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == df, expected, df))
field_reader.close()

# Test field index files.

# (document number, offset) pairs to round-trip through the field index.
indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765)
]

index_writer = FieldIndexWriter(open("testFI", "wb"))
for entry in indexed_docs:
    index_writer.write_document(*entry)
index_writer.close()

index_reader = FieldIndexReader(open("testFI", "rb"))
for docnum, offset in indexed_docs:
    dn, o = index_reader.read_document()
    for want, got in ((docnum, dn), (offset, o)):
        print("%s %s %s" % (want == got, want, got))
index_reader.close()

# Test field dictionaries.

field_writer = FieldWriter(open("testF", "wb"))
index_writer = FieldIndexWriter(open("testFI", "wb"))
# NOTE(review): the trailing 3 is presumably an indexing interval; confirm
# against FieldDictionaryWriter.
wd = FieldDictionaryWriter(field_writer, index_writer, 3)
for docnum, fields in doc_fields:
    wd.write_fields(docnum, list(enumerate(fields)))
wd.close()

field_reader = FieldReader(open("testF", "rb"))
index_reader = FieldIndexReader(open("testFI", "rb"))
rd = FieldDictionaryReader(field_reader, index_reader)

# Look the documents up in the opposite order to that in which they were
# written.
doc_fields_reversed = doc_fields[::-1]
for docnum, fields in doc_fields_reversed:
    df = dict(rd.get_fields(docnum))
    expected = dict(enumerate(fields))
    print("%s %s %s" % (expected == df, expected, df))

# Document numbers that were never written are expected to yield None.
for docnum in (13579, 246810):
    df = rd.get_fields(docnum)
    print("%s %s" % (df is None, df))

# (Test sequential access.)

# Rewind the field dictionary reader, then read the documents back in the
# order in which they were written.
rd.rewind()
for docnum, fields in doc_fields:
    dn, df = rd.read_fields()
    expected = list(enumerate(fields))
    print("%s %s %s" % (docnum == dn, docnum, dn))
    print("%s %s %s" % (expected == df, expected, df))
rd.close()

# Test terms.

terms = [
    # term      offset      frequency doc_frequency
    ("aardvark", 100000123, 1, 1),
    ("anteater", 100000456, 2, 1),
    ("badger", 100000789, 13, 7),
    ("bull", 1000001234, 59, 17),
    ("bulldog", 1000002345, 99, 80),
    ("cat", 1000003456, 89, 28)
]

term_writer = TermWriter(open("test", "wb"))
for entry in terms:
    # Each entry supplies the term, offset, frequency and document frequency.
    term_writer.write_term(*entry)
term_writer.close()

term_reader = TermReader(open("test", "rb"))
for expected in terms:
    t, o, fr, df = term_reader.read_term()
    # Compare the entry field by field, in the order it was written.
    for want, got in zip(expected, (t, o, fr, df)):
        print("%s %s %s" % (want == got, want, got))
term_reader.close()

# Test terms in index files.

indexed_terms = [
    # term      offset      frequency doc_frequency info_offset
    ("aardvark", 100000123, 1, 1, 200000321),
    ("anteater", 100000456, 2, 1, 200000654),
    ("badger", 100000789, 13, 7, 200000987),
    ("bull", 1000001234, 59, 17, 200004321),
    ("bulldog", 1000002345, 99, 80, 200005432),
    ("cat", 1000003456, 89, 28, 200006543)
]

index_writer = TermIndexWriter(open("test", "wb"))
for entry in indexed_terms:
    # Each entry supplies all five write_term arguments, in order.
    index_writer.write_term(*entry)
index_writer.close()

index_reader = TermIndexReader(open("test", "rb"))
for expected in indexed_terms:
    t, o, fr, df, i = index_reader.read_term()
    # Report every field of the entry: term, offset, frequency, document
    # frequency and info offset.
    for want, got in zip(expected, (t, o, fr, df, i)):
        print("%s %s %s" % (want == got, want, got))
index_reader.close()

# Test dictionaries with only term data.

# Build a term dictionary over four files: terms, term index, positions and
# position index. Only term data is written in this test.
term_writer = TermWriter(open("test", "wb"))
term_index_writer = TermIndexWriter(open("testI", "wb"))
position_writer = PositionWriter(open("testP", "wb"))
position_index_writer = PositionIndexWriter(open("testPI", "wb"))
# NOTE(review): the trailing 2 and 3 are presumably indexing intervals;
# confirm against the dictionary writer classes.
wp = PositionDictionaryWriter(position_writer, position_index_writer, 2)
wd = TermDictionaryWriter(term_writer, term_index_writer, wp, 3)
for entry in terms:
    # Uses the writer's internal method to store term details directly.
    wd._write_term(*entry)
wd.close()

term_reader = TermReader(open("test", "rb"))
term_index_reader = TermIndexReader(open("testI", "rb"))
position_opener = PositionOpener("testP")
position_index_opener = PositionIndexOpener("testPI")
rp = PositionDictionaryReader(position_opener, position_index_opener)
rd = TermDictionaryReader(term_reader, term_index_reader, rp)

# Look the terms up in the opposite order to that in which they were written.
terms_reversed = terms[::-1]
for term, offset, frequency, doc_frequency in terms_reversed:
    o, fr, df = rd._find_term(term)
    for want, got in ((offset, o), (frequency, fr), (doc_frequency, df)):
        print("%s %s %s" % (want == got, want, got))

# Terms that were never written are expected to yield None.
for term in ("dog", "dingo"):
    t = rd._find_term(term)
    print("%s %s" % (t is None, t))

# (Test term prefix searching.)

prefix_tests = (
    ("a", ["aardvark", "anteater"]),
    ("bu", ["bull", "bulldog"]),
    ("c", ["cat"]),
    ("d", []),
)
for prefix, expected in prefix_tests:
    # The search is issued twice per prefix: once for the comparison and once
    # for the value that is displayed.
    print("%s %s %s" % (rd.find_terms(prefix) == expected, rd.find_terms(prefix), expected))
rd.close()

# Test dictionaries with term and position data.

# (term, [(document number, positions), ...]) entries to be written.
terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])])
]

# (term, document number, expected positions) lookups; None marks a document
# in which the term does not occur.
position_dict_tests = [
    ("badger", 19, [55, 1333]),
    ("badger", 20, None),
    ("bull", 6, [128]),
    ("bull", 26, [1, 3, 5, 7, 9]),
    ("cat", 111, None),
    ("cat", 123, [12, 145, 196]),
    ("cat", 1234, None)
]

term_writer = TermWriter(open("test", "wb"))
term_index_writer = TermIndexWriter(open("testI", "wb"))
position_writer = PositionWriter(open("testP", "wb"))
position_index_writer = PositionIndexWriter(open("testPI", "wb"))
# NOTE(review): the trailing 2 and 3 are presumably indexing intervals;
# confirm against the dictionary writer classes.
wp = PositionDictionaryWriter(position_writer, position_index_writer, 2)
wd = TermDictionaryWriter(term_writer, term_index_writer, wp, 3)
for entry in terms_with_positions:
    wd.write_term_positions(*entry)
wd.close()

term_reader = TermReader(open("test", "rb"))
term_index_reader = TermIndexReader(open("testI", "rb"))
position_opener = PositionOpener("testP")
position_index_opener = PositionIndexOpener("testPI")
rp = PositionDictionaryReader(position_opener, position_index_opener)
rd = TermDictionaryReader(term_reader, term_index_reader, rp)

# Look the terms up in the opposite order to that in which they were written.
terms_reversed = terms_with_positions[::-1]
for term, doc_positions in terms_reversed:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))

# Terms that were never written are expected to yield None.
for term in ("aaa", "dog", "dingo"):
    dp = rd.find_positions(term)
    print("%s %s" % (dp is None, dp))

# (Test iterators.)

for term, docnum, positions in position_dict_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    # A missing document must be expected to be missing; otherwise the
    # positions found must equal the expected list.
    if pos is None:
        matched = positions is None
    else:
        matched = positions == list(pos)
    print("%s %s %s" % (matched, positions, pos))

# (Test sequential access.)

# Rewind the term dictionary reader, then read the terms back in the order in
# which they were written.
rd.rewind()
for term, doc_positions in terms_with_positions:
    t, fr, df, found = rd.read_term()
    dp = list(found)
    print("%s %s %s" % (term == t, term, t))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
rd.close()

# Test high-level index operations (including merging).

# (document number, text) pairs; each text is split on whitespace into terms.
docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore")
]

# (term, expected frequency, expected [(document number, positions), ...]).
doc_tests = [
    ("Every", 2, [(2, [0]), (14, [0])]),
    ("good", 2, [(2, [1]), (13, [1])]),
    ("deserves", 2, [(2, [3]), (13, [3])]),
    ("sea", 2, [(36, [2, 6])])
]

# (term, document number, expected positions); None marks a document in
# which the term does not occur.
position_tests = [
    ("Every", 14, [0]),
    ("sea", 36, [2, 6]),
    ("shells", 1, None),
    ("shells", 37, None)
]

index = Index("test_index")
# NOTE(review): the meaning of the (3, 2, 6) arguments is not visible here;
# confirm against Index.get_writer.
wi = index.get_writer(3, 2, 6)
for docnum, text in docs:
    doc = Document(docnum)
    words = text.split()
    for position in range(len(words)):
        doc.add_position(words[position], position)
    # The complete text is also stored as field number 123.
    doc.add_field(123, text)
    wi.add_document(doc)
wi.close()

rd = index.get_reader()

# Check the positions and the frequency recorded for selected terms.
for term, frequency, doc_positions in doc_tests:
    dp = list(rd.find_positions(term))
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
    fr = rd.get_frequency(term)
    print("%s %s %s" % (frequency == fr, frequency, fr))

# Check that the stored field of every document holds the original text.
for docnum, text in docs:
    df = dict(rd.get_fields(docnum))
    print("%s %s %s" % (df[123] == text, text, df[123]))

# Check the per-document position lookups.
for term, docnum, positions in position_tests:
    dp = rd.find_positions(term)
    pos = dp.from_document(docnum)
    # A missing document must be expected to be missing; otherwise the
    # positions found must equal the expected list.
    if pos is None:
        matched = positions is None
    else:
        matched = positions == list(pos)
    print("%s %s %s" % (matched, positions, pos))
index.close()

# vim: tabstop=4 expandtab shiftwidth=4