iixr

Annotated test.py

67:89465c390a46
2009-10-03 Paul Boddie Added a document cache, used when reading fields. Optimised read_number slightly using arrays.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@44 3
from iixr.files import *
paul@44 4
from iixr.fields import *
paul@44 5
from iixr.terms import *
paul@44 6
from iixr.positions import *
paul@44 7
from iixr.index import *
paul@59 8
import os, sys
paul@18 9
paul@18 10
# Remove old test files.
# These are the data files that the tests below create in the current
# directory.  A file that cannot be removed (for example because it does not
# exist yet) is skipped: the OSError is deliberately ignored.
paul@18 11
paul@59 12
for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
paul@18 13
    try:
paul@18 14
        os.remove(filename)
paul@18 15
    except OSError:
paul@18 16
        pass
paul@18 17
paul@18 18
# Remove the index directories left by a previous run, together with the
# files inside them.  Each directory is handled in its own try block so that
# a missing (or otherwise unremovable) directory does not stop the remaining
# ones from being cleaned up; this mirrors the per-file handling above.
for dirname in ("test_index", "test_index2", "test_index3"):
    try:
        for filename in os.listdir(dirname):
            os.remove(os.path.join(dirname, filename))
        os.rmdir(dirname)
    except OSError:
        # Best-effort clean-up: skip this directory and carry on.
        pass
paul@0 25
paul@59 26
# When "clean" appears among the command line arguments, stop here with a
# zero exit status: only the clean-up above is performed and no tests run.
if "clean" in sys.argv:
paul@59 27
    sys.exit(0)
paul@59 28
paul@9 29
# Test basic data types.
# Each number is written with FileWriter.write_number and then read back, in
# the same order, with FileReader.read_number.  Every printed line starts
# with the result of the comparison, so True means the value read back
# matched the value written.
paul@9 30
paul@5 31
numbers = [12345678, 0, 1, 127, 128, 255, 256]
paul@0 32
paul@0 33
f = open("test", "wb")
paul@44 34
w = FileWriter(f)
paul@0 35
for number in numbers:
paul@0 36
    w.write_number(number)
paul@0 37
w.close()
paul@0 38
paul@3 39
# Read the numbers back from the file just written.
f = open("test", "rb")
paul@44 40
r = FileReader(f)
paul@0 41
for number in numbers:
paul@0 42
    n = r.read_number()
paul@0 43
    print number == n, number, n
paul@0 44
r.close()
paul@0 45
paul@9 46
# Test positions.
# all_doc_positions holds two groups of (docnum, positions) pairs.  Each
# group is written with PositionWriter.write_positions and the writer is
# reset after every group; the reader is reset at the same points when the
# data is read back and compared.
paul@9 47
paul@0 48
all_doc_positions = [
paul@0 49
    [
paul@0 50
        (123, [1, 3, 5, 15, 25]),
paul@19 51
        (124, [0, 100]),
paul@19 52
        (125, [11, 99, 199]),
paul@19 53
        (130, [77, 78, 80, 82, 89])
paul@0 54
    ],
paul@0 55
    [
paul@0 56
        (78, [9]),
paul@19 57
        (196, [10, 11]),
paul@19 58
        (197, [17, 21, 30])
paul@0 59
    ]
paul@0 60
    ]
paul@0 61
paul@19 62
f = open("testP", "wb")
paul@44 63
w = PositionWriter(f)
paul@0 64
for doc_positions in all_doc_positions:
paul@0 65
    for docnum, positions in doc_positions:
paul@0 66
        w.write_positions(docnum, positions)
paul@0 67
    w.reset()
paul@0 68
w.close()
paul@0 69
paul@19 70
f = open("testP", "rb")
paul@44 71
# NOTE(review): the meaning of the 0 and None arguments is not visible in
# this file; check PositionIterator before changing them.
r = PositionIterator(f, 0, None)
paul@0 72
for doc_positions in all_doc_positions:
paul@0 73
    for docnum, positions in doc_positions:
paul@0 74
        d, p = r.read_positions()
paul@0 75
        print docnum == d, docnum, d
paul@0 76
        print positions == p, positions, p
paul@0 77
    r.reset()
paul@0 78
r.close()
paul@0 79
paul@19 80
# Test position index files.
# indexed_positions holds one list per term; each entry is a
# (docnum, pos_offset, count) tuple.  While writing, the file offset at which
# each term's entries start is recorded so that the entries can later be
# located again through PositionIndexOpener.read_term_positions.
paul@19 81
paul@19 82
indexed_positions = [
paul@19 83
    [
paul@19 84
        (1234, 0, 100),
paul@19 85
        (2345, 700, 100),
paul@19 86
        (3456, 1900, 50)
paul@19 87
    ],
paul@19 88
    [
paul@19 89
        (4567, 2800, 20)
paul@19 90
    ]
paul@19 91
    ]
paul@19 92
paul@19 93
offsets = []
paul@19 94
f = open("testPI", "wb")
paul@44 95
w = PositionIndexWriter(f)
paul@19 96
for term_positions in indexed_positions:
paul@19 97
    offset = None
paul@19 98
    doc_frequency = 0
paul@19 99
    w.reset()
paul@19 100
    for docnum, pos_offset, count in term_positions:
paul@19 101
        # Remember where the first entry for this term is written.
        if offset is None:
paul@55 102
            offset = w.f.tell()
paul@55 103
        w.write_positions(docnum, pos_offset, count)
paul@19 104
        # NOTE(review): this adds up the per-document counts rather than
        # counting documents; confirm that read_term_positions expects this
        # total.  The zip below stops at the end of term_positions anyway.
        doc_frequency += count
paul@19 105
    offsets.append((offset, doc_frequency))
paul@19 106
w.close()
paul@19 107
paul@44 108
r = PositionIndexOpener("testPI")
paul@19 109
# Both lists are reversed in place, so the terms are looked up in the
# opposite order to the one in which they were written.
offsets.reverse()
paul@19 110
indexed_positions.reverse()
paul@19 111
for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
paul@19 112
    found_positions = r.read_term_positions(offset, doc_frequency)
paul@19 113
    for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
paul@19 114
        print docnum == dn, docnum, dn
paul@19 115
        print pos_offset == po, pos_offset, po
paul@19 116
        print count == c, count, c
paul@19 117
r.close()
paul@19 118
paul@19 119
# Test position dictionaries.
# The position data from all_doc_positions is written through a
# PositionDictionaryWriter, which is given both a position writer and a
# position index writer.  The offset and document frequency returned for
# each group are kept and used to read the group back.
paul@19 120
paul@19 121
f = open("testP", "wb")
paul@44 122
w = PositionWriter(f)
paul@19 123
f2 = open("testPI", "wb")
paul@44 124
w2 = PositionIndexWriter(f2)
paul@44 125
# NOTE(review): the meaning of the final argument (2) is not visible in this
# file; check PositionDictionaryWriter before changing it.
wd = PositionDictionaryWriter(w, w2, 2)
paul@0 126
offsets = []
paul@0 127
for doc_positions in all_doc_positions:
paul@19 128
    offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
paul@19 129
    offsets.append((offset, doc_frequency))
paul@20 130
wd.close()
paul@0 131
paul@44 132
r = PositionOpener("testP")
paul@44 133
r2 = PositionIndexOpener("testPI")
paul@44 134
rd = PositionDictionaryReader(r, r2)
paul@0 135
# Reverse both lists in place so that the groups are read back in the
# opposite order to the one in which they were written.
offsets.reverse()
paul@0 136
all_doc_positions.reverse()
paul@19 137
for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
paul@19 138
    dp = list(rd.read_term_positions(offset, doc_frequency))
paul@0 139
    print doc_positions == dp, doc_positions, dp
paul@20 140
rd.close()
paul@0 141
paul@9 142
# Test fields.
# doc_fields holds (docnum, field values) pairs, including one document with
# no fields.  The values are written as (index, value) pairs produced by
# enumerate, and the same pairs are expected when reading back in order.
paul@9 143
paul@8 144
doc_fields = [
paul@9 145
    (123, ["testing", "fields", "stored", "compressed"]),
paul@9 146
    (456, ["fields", "for a second", "document"]),
paul@9 147
    (789, ["field value"]),
paul@9 148
    (1234, []),
paul@9 149
    (2345, ["abc", "def"]),
paul@9 150
    (3456, ["apple", "banana", "cherry"]),
paul@9 151
    (4567, ["drue", "eple"])
paul@8 152
    ]
paul@8 153
paul@8 154
f = open("testF", "wb")
paul@44 155
w = FieldWriter(f)
paul@9 156
for docnum, fields in doc_fields:
paul@13 157
    w.write_fields(docnum, list(enumerate(fields)))
paul@8 158
w.close()
paul@8 159
paul@8 160
f = open("testF", "rb")
paul@44 161
r = FieldReader(f)
paul@9 162
for docnum, fields in doc_fields:
paul@9 163
    dn, df = r.read_fields()
paul@9 164
    print docnum == dn, docnum, dn
paul@13 165
    print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@8 166
r.close()
paul@8 167
paul@9 168
# Test field index files.
# indexed_docs holds (docnum, offset) pairs.  They are written with
# FieldIndexWriter.write_document and read back in the same order with
# FieldIndexReader.read_document.
paul@9 169
paul@9 170
indexed_docs = [
paul@9 171
    (123, 100000987),
paul@9 172
    (456, 100004321),
paul@9 173
    (789, 100008765)
paul@9 174
    ]
paul@9 175
paul@9 176
f = open("testFI", "wb")
paul@44 177
w = FieldIndexWriter(f)
paul@9 178
for docnum, offset in indexed_docs:
paul@9 179
    w.write_document(docnum, offset)
paul@9 180
w.close()
paul@9 181
paul@9 182
f = open("testFI", "rb")
paul@44 183
r = FieldIndexReader(f)
paul@9 184
for docnum, offset in indexed_docs:
paul@9 185
    dn, o = r.read_document()
paul@9 186
    print docnum == dn, docnum, dn
paul@9 187
    print offset == o, offset, o
paul@9 188
r.close()
paul@9 189
paul@9 190
# Test field dictionaries.
# The documents in doc_fields are written through a FieldDictionaryWriter,
# which is given both a field writer and a field index writer.  They are then
# fetched by document number in reverse order, looked up with document
# numbers that were never written, and finally read sequentially.
paul@9 191
paul@9 192
f = open("testF", "wb")
paul@44 193
w = FieldWriter(f)
paul@9 194
f2 = open("testFI", "wb")
paul@44 195
w2 = FieldIndexWriter(f2)
paul@44 196
# NOTE(review): the meaning of the final argument (3) is not visible in this
# file; check FieldDictionaryWriter before changing it.
wd = FieldDictionaryWriter(w, w2, 3)
paul@9 197
for docnum, fields in doc_fields:
paul@13 198
    wd.write_fields(docnum, list(enumerate(fields)))
paul@9 199
wd.close()
paul@9 200
paul@9 201
f = open("testF", "rb")
paul@44 202
r = FieldReader(f)
paul@9 203
f2 = open("testFI", "rb")
paul@44 204
r2 = FieldIndexReader(f2)
paul@44 205
rd = FieldDictionaryReader(r, r2)
paul@9 206
# Work on a reversed copy so that doc_fields keeps its original order for the
# sequential test below.
doc_fields_reversed = doc_fields[:]
paul@9 207
doc_fields_reversed.reverse()
paul@9 208
for docnum, fields in doc_fields_reversed:
paul@25 209
    df = dict(rd.get_fields(docnum))
paul@25 210
    print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
paul@9 211
# Document numbers that were never written: get_fields is expected to return
# None for them.
for docnum in (13579, 246810):
paul@13 212
    df = rd.get_fields(docnum)
paul@9 213
    print df is None, df
paul@13 214
paul@13 215
# (Test sequential access.)
paul@13 216
paul@13 217
rd.rewind()
paul@13 218
for docnum, fields in doc_fields:
paul@13 219
    dn, df = rd.read_fields()
paul@13 220
    print docnum == dn, docnum, dn
paul@13 221
    print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@9 222
rd.close()
paul@9 223
paul@9 224
# Test terms.
# Each entry in terms is a (term, offset, frequency, doc_frequency) tuple.
# The entries are written with TermWriter.write_term and read back in the
# same order with TermReader.read_term.
paul@9 225
paul@2 226
terms = [
paul@19 227
    # term       offset      frequency  doc_frequency
paul@19 228
    ("aardvark",  100000123,  1,         1),
paul@19 229
    ("anteater",  100000456,  2,         1),
paul@19 230
    ("badger",    100000789, 13,         7),
paul@19 231
    ("bull",     1000001234, 59,        17),
paul@19 232
    ("bulldog",  1000002345, 99,        80),
paul@19 233
    ("cat",      1000003456, 89,        28)
paul@2 234
    ]
paul@2 235
paul@2 236
f = open("test", "wb")
paul@44 237
w = TermWriter(f)
paul@19 238
for term, offset, frequency, doc_frequency in terms:
paul@19 239
    w.write_term(term, offset, frequency, doc_frequency)
paul@2 240
w.close()
paul@2 241
paul@3 242
f = open("test", "rb")
paul@44 243
r = TermReader(f)
paul@19 244
for term, offset, frequency, doc_frequency in terms:
paul@19 245
    t, o, fr, df = r.read_term()
paul@2 246
    print term == t, term, t
paul@2 247
    print offset == o, offset, o
paul@11 248
    print frequency == fr, frequency, fr
paul@19 249
    print doc_frequency == df, doc_frequency, df
paul@2 250
r.close()
paul@2 251
paul@9 252
# Test terms in index files.
# Same data as the plain term test plus an info_offset per term.  The entries
# are written with TermIndexWriter.write_term and read back in the same order
# with TermIndexReader.read_term, which yields the extra value as well.
paul@9 253
paul@3 254
indexed_terms = [
paul@19 255
    # term       offset      frequency  doc_frequency   info_offset
paul@19 256
    ("aardvark",  100000123,  1,         1,             200000321),
paul@19 257
    ("anteater",  100000456,  2,         1,             200000654),
paul@19 258
    ("badger",    100000789, 13,         7,             200000987),
paul@19 259
    ("bull",     1000001234, 59,        17,             200004321),
paul@19 260
    ("bulldog",  1000002345, 99,        80,             200005432),
paul@19 261
    ("cat",      1000003456, 89,        28,             200006543)
paul@3 262
    ]
paul@3 263
paul@3 264
f = open("test", "wb")
paul@44 265
w = TermIndexWriter(f)
paul@19 266
for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19 267
    w.write_term(term, offset, frequency, doc_frequency, info_offset)
paul@3 268
w.close()
paul@3 269
paul@3 270
f = open("test", "rb")
paul@44 271
r = TermIndexReader(f)
paul@19 272
for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19 273
    t, o, fr, df, i = r.read_term()
paul@3 274
    print term == t, term, t
paul@3 275
    print offset == o, offset, o
paul@11 276
    print frequency == fr, frequency, fr
paul@19 277
    print doc_frequency == df, doc_frequency, df
paul@3 278
    print info_offset == i, info_offset, i
paul@3 279
r.close()
paul@3 280
paul@9 281
# Test dictionaries with only term data.
# A TermDictionaryWriter is assembled from a term writer, a term index writer
# and a position dictionary writer.  Only the term records from the terms
# table are written here, using the underscore-prefixed _write_term method
# directly; no position lists are supplied.
paul@9 282
paul@3 283
f = open("test", "wb")
paul@44 284
w = TermWriter(f)
paul@3 285
f2 = open("testI", "wb")
paul@44 286
w2 = TermIndexWriter(f2)
paul@20 287
f3 = open("testP", "wb")
paul@44 288
w3 = PositionWriter(f3)
paul@20 289
f4 = open("testPI", "wb")
paul@44 290
w4 = PositionIndexWriter(f4)
paul@44 291
# NOTE(review): the meaning of the numeric arguments (2 and 3) is not visible
# in this file; check the dictionary writer classes before changing them.
wp = PositionDictionaryWriter(w3, w4, 2)
paul@44 292
wd = TermDictionaryWriter(w, w2, wp, 3)
paul@19 293
for term, offset, frequency, doc_frequency in terms:
paul@19 294
    wd._write_term(term, offset, frequency, doc_frequency)
paul@5 295
wd.close()
paul@3 296
paul@3 297
f = open("test", "rb")
paul@44 298
r = TermReader(f)
paul@3 299
f2 = open("testI", "rb")
paul@44 300
r2 = TermIndexReader(f2)
paul@44 301
r3 = PositionOpener("testP")
paul@44 302
r4 = PositionIndexOpener("testPI")
paul@44 303
rp = PositionDictionaryReader(r3, r4)
paul@44 304
rd = TermDictionaryReader(r, r2, rp)
paul@3 305
# Look the terms up in reverse order, using a copy so that the terms table
# itself is left unchanged.
terms_reversed = terms[:]
paul@3 306
terms_reversed.reverse()
paul@19 307
for term, offset, frequency, doc_frequency in terms_reversed:
paul@19 308
    o, fr, df = rd._find_term(term)
paul@3 309
    print offset == o, offset, o
paul@11 310
    print frequency == fr, frequency, fr
paul@19 311
    print doc_frequency == df, doc_frequency, df
paul@3 312
# Terms that were never written: _find_term is expected to return None.
for term in ("dog", "dingo"):
paul@11 313
    t = rd._find_term(term)
paul@11 314
    print t is None, t
paul@25 315
paul@25 316
# (Test term prefix searching.)
paul@25 317
paul@25 318
print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
paul@25 319
print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
paul@25 320
print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
paul@25 321
print rd.find_terms("d") == [], rd.find_terms("d"), []
paul@5 322
rd.close()
paul@5 323
paul@9 324
# Test dictionaries with term and position data.
# terms_with_positions pairs each term with a list of (docnum, positions)
# entries.  The data is written with TermDictionaryWriter.write_term_positions
# and then checked by term lookup, by per-document lookup and by sequential
# reading.
paul@9 325
paul@5 326
terms_with_positions = [
paul@5 327
    ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),
paul@5 328
    ("anteater",  [(1, [43, 44])]),
paul@5 329
    ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
paul@19 330
    ("bull",      [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
paul@5 331
    ("bulldog",   [(43, [17, 19, 256, 512])]),
paul@5 332
    ("cat",       [(123, [12, 145, 196]), (1200, [113])])
paul@5 333
    ]
paul@5 334
paul@22 335
# Each entry is (term, docnum, expected positions); None is the expected
# result where the term has no entry for that document number above.
position_dict_tests = [
paul@22 336
    ("badger", 19, [55, 1333]),
paul@22 337
    ("badger", 20, None),
paul@22 338
    ("bull", 6, [128]),
paul@22 339
    ("bull", 26, [1, 3, 5, 7, 9]),
paul@22 340
    ("cat", 111, None),
paul@22 341
    ("cat", 123, [12, 145, 196]),
paul@22 342
    ("cat", 1234, None)
paul@22 343
    ]
paul@22 344
paul@5 345
f = open("test", "wb")
paul@44 346
w = TermWriter(f)
paul@5 347
f2 = open("testI", "wb")
paul@44 348
w2 = TermIndexWriter(f2)
paul@5 349
f3 = open("testP", "wb")
paul@44 350
w3 = PositionWriter(f3)
paul@19 351
f4 = open("testPI", "wb")
paul@44 352
w4 = PositionIndexWriter(f4)
paul@44 353
wp = PositionDictionaryWriter(w3, w4, 2)
paul@44 354
wd = TermDictionaryWriter(w, w2, wp, 3)
paul@5 355
for term, doc_positions in terms_with_positions:
paul@5 356
    wd.write_term_positions(term, doc_positions)
paul@5 357
wd.close()
paul@5 358
paul@5 359
f = open("test", "rb")
paul@44 360
r = TermReader(f)
paul@5 361
f2 = open("testI", "rb")
paul@44 362
r2 = TermIndexReader(f2)
paul@44 363
r3 = PositionOpener("testP")
paul@44 364
r4 = PositionIndexOpener("testPI")
paul@44 365
rp = PositionDictionaryReader(r3, r4)
paul@44 366
rd = TermDictionaryReader(r, r2, rp)
paul@5 367
# Look the terms up in reverse order, using a copy so that
# terms_with_positions keeps its order for the sequential test below.
terms_reversed = terms_with_positions[:]
paul@5 368
terms_reversed.reverse()
paul@5 369
for term, doc_positions in terms_reversed:
paul@18 370
    dp = list(rd.find_positions(term))
paul@5 371
    print doc_positions == dp, doc_positions, dp
paul@25 372
# Terms that were never written: the result of find_positions is expected to
# compare equal to an empty list.
for term in ("aaa", "dog", "dingo"):
paul@5 373
    dp = rd.find_positions(term)
paul@61 374
    print dp == [], dp
paul@12 375
paul@22 376
# (Test iterators.)
paul@22 377
paul@22 378
for term, docnum, positions in position_dict_tests:
paul@22 379
    dp = rd.find_positions(term)
paul@22 380
    pos = dp.from_document(docnum)
paul@22 381
    # True when both values are None, or when pos is not None and its items
    # equal the expected positions.
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@22 382
paul@12 383
# (Test sequential access.)
paul@12 384
paul@12 385
rd.rewind()
paul@12 386
for term, doc_positions in terms_with_positions:
paul@19 387
    t, fr, df, dp = rd.read_term()
paul@18 388
    dp = list(dp)
paul@12 389
    print term == t, term, t
paul@12 390
    print doc_positions == dp, doc_positions, dp
paul@5 391
rd.close()
paul@3 392
paul@14 393
# Test high-level index operations (including merging).
# The documents in docs are added to an Index stored in "test_index".  For
# each document, every whitespace-separated word is recorded as a term at its
# word position, and the whole text is stored as field 123.  The index is then
# queried through a reader.
paul@9 394
paul@6 395
docs = [
paul@6 396
    (1, "The cat sat on the mat"),
paul@6 397
    (2, "Every good boy deserves football"),
paul@6 398
    (13, "One good turn deserves another"),
paul@6 399
    (14, "Every man for himself"),
paul@6 400
    (25, "Red sky at night shepherd's delight"),
paul@6 401
    (36, "She sells sea shells on the sea shore")
paul@6 402
    ]
paul@6 403
paul@6 404
# Each entry is (term, expected get_frequency result, expected list of
# (docnum, positions) pairs from find_positions).
doc_tests = [
paul@11 405
    ("Every", 2, [(2, [0]), (14, [0])]),
paul@11 406
    ("good", 2, [(2, [1]), (13, [1])]),
paul@11 407
    ("deserves", 2, [(2, [3]), (13, [3])]),
paul@11 408
    ("sea", 2, [(36, [2, 6])])
paul@6 409
    ]
paul@6 410
paul@21 411
# Each entry is (term, docnum, expected positions from from_document); None
# is expected where the document does not contain the term or does not exist.
position_tests = [
paul@21 412
    ("Every", 14, [0]),
paul@21 413
    ("sea", 36, [2, 6]),
paul@22 414
    ("shells", 1, None),
paul@22 415
    ("shells", 37, None)
paul@21 416
    ]
paul@21 417
paul@60 418
# Each entry is (list of terms, expected find_common_positions result as
# (docnum, positions) pairs).
phrase_tests = [
paul@62 419
    (["good", "boy"], [(2, [1, 2])]),
paul@62 420
    (["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
paul@62 421
    (["sea", "shore"], [(36, [6, 7])])
paul@60 422
    ]
paul@60 423
paul@64 424
# NOTE(review): the meaning of the numeric arguments is not visible in this
# file; check the Index class before changing them.
index = Index("test_index", 3, 2, 3, 6)
paul@64 425
wi = index.get_writer()
paul@6 426
for docnum, text in docs:
paul@44 427
    doc = Document(docnum)
paul@6 428
    for position, term in enumerate(text.split()):
paul@28 429
        doc.add_position(term, position)
paul@28 430
    doc.add_field(123, text)
paul@28 431
    wi.add_document(doc)
paul@6 432
wi.close()
paul@6 433
paul@7 434
rd = index.get_reader()
paul@60 435
paul@60 436
# (Test searching.)
paul@60 437
paul@11 438
for term, frequency, doc_positions in doc_tests:
paul@18 439
    dp = list(rd.find_positions(term))
paul@6 440
    print doc_positions == dp, doc_positions, dp
paul@11 441
    fr = rd.get_frequency(term)
paul@11 442
    print frequency == fr, frequency, fr
paul@60 443
paul@60 444
# (Test fields.)
paul@60 445
paul@10 446
for docnum, text in docs:
paul@25 447
    df = dict(rd.get_fields(docnum))
paul@25 448
    print df[123] == text, text, df[123]
paul@60 449
paul@60 450
# (Test navigation.)
paul@60 451
paul@21 452
for term, docnum, positions in position_tests:
paul@21 453
    dp = rd.find_positions(term)
paul@22 454
    pos = dp.from_document(docnum)
paul@22 455
    # True when both values are None, or when pos is not None and its items
    # equal the expected positions.
    print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@60 456
paul@60 457
# (Test phrases.)
paul@60 458
paul@60 459
# NOTE: this loop rebinds the module-level name "terms" (the table defined
# for the term tests); nothing after this point reads that table.
for terms, results in phrase_tests:
paul@60 460
    res = list(rd.find_common_positions(terms))
paul@60 461
    print results == res, results, res
paul@60 462
paul@7 463
index.close()
paul@6 464
paul@58 465
# Test index updates.
# A second index is filled with the same texts under document numbers shifted
# by 100 and is then updated with the contents of the first index.  Each term
# is afterwards expected to yield the original entries followed by the
# shifted ones, with twice the original frequency.
paul@58 466
paul@58 467
index = Index("test_index")
paul@64 468
index2 = Index("test_index2", 3, 2, 3, 6)
paul@64 469
wi = index2.get_writer()
paul@58 470
for docnum, text in docs:
paul@58 471
paul@58 472
    # Add the same documents but with different numbers.
paul@58 473
paul@58 474
    doc = Document(docnum + 100)
paul@58 475
    for position, term in enumerate(text.split()):
paul@58 476
        doc.add_position(term, position)
paul@58 477
    doc.add_field(123, text)
paul@58 478
    wi.add_document(doc)
paul@58 479
wi.close()
paul@58 480
paul@58 481
index2.update([index])
paul@58 482
index.close()
paul@58 483
paul@58 484
rd = index2.get_reader()
paul@58 485
for term, frequency, doc_positions in doc_tests:
paul@58 486
paul@58 487
    # Add the extra documents to the expected result.
    # A copy is extended so that the lists inside doc_tests stay unchanged
    # for later use.
paul@58 488
paul@59 489
    orig_doc_positions = doc_positions
paul@59 490
    doc_positions = doc_positions[:]
paul@59 491
paul@59 492
    for docnum, positions in orig_doc_positions:
paul@58 493
        doc_positions.append((docnum + 100, positions))
paul@58 494
    frequency *= 2
paul@58 495
paul@58 496
    dp = list(rd.find_positions(term))
paul@58 497
    print doc_positions == dp, doc_positions, dp
paul@58 498
    fr = rd.get_frequency(term)
paul@58 499
    print frequency == fr, frequency, fr
paul@58 500
index2.close()
paul@58 501
paul@59 502
# (Test update of an empty index.)
# No documents are added to the index in "test_index3" before it is updated
# with the contents of "test_index", so the results are expected to match
# doc_tests exactly.
paul@59 503
paul@59 504
index = Index("test_index")
paul@59 505
index3 = Index("test_index3")
paul@59 506
index3.update([index])
paul@59 507
index.close()
paul@59 508
paul@59 509
rd = index3.get_reader()
paul@59 510
for term, frequency, doc_positions in doc_tests:
paul@59 511
    dp = list(rd.find_positions(term))
paul@59 512
    print doc_positions == dp, doc_positions, dp
paul@59 513
    fr = rd.get_frequency(term)
paul@59 514
    print frequency == fr, frequency, fr
paul@59 515
index3.close()
paul@59 516
paul@0 517
# vim: tabstop=4 expandtab shiftwidth=4