iixr

Annotated test.py

9:fe7ed6b96612
2009-08-29 Paul Boddie Added field dictionary and field index readers and writers. Renamed various internal methods. Added document number deltas to field collections in order to support scanning for documents.
paul@0 1
#!/usr/bin/env python
paul@0 2
paul@0 3
import iixr
paul@0 4
paul@9 5
# Test basic data types: write a list of numbers, then read them back and compare.
paul@9 6
paul@5 7
numbers = [12345678, 0, 1, 127, 128, 255, 256]
paul@0 8
paul@0 9
f = open("test", "wb")
paul@0 10
w = iixr.FileWriter(f)
paul@0 11
for number in numbers:
paul@0 12
    w.write_number(number)
paul@0 13
w.close()
paul@0 14
paul@3 15
f = open("test", "rb")
paul@0 16
r = iixr.FileReader(f)
paul@0 17
for number in numbers:
paul@0 18
    n = r.read_number()
paul@0 19
    print number == n, number, n
paul@0 20
r.close()
paul@0 21
paul@9 22
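# Test positions: first sequential writing and reading per document, then access by recorded offset in reverse order.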
paul@9 23
paul@0 24
all_doc_positions = [
paul@0 25
    [
paul@0 26
        (123, [1, 3, 5, 15, 25]),
paul@0 27
        (124, [0, 100])
paul@0 28
    ],
paul@0 29
    [
paul@0 30
        (78, [9]),
paul@0 31
        (196, [10, 11])
paul@0 32
    ]
paul@0 33
    ]
paul@0 34
paul@0 35
f = open("test", "wb")
paul@0 36
w = iixr.PositionWriter(f)
paul@0 37
for doc_positions in all_doc_positions:
paul@0 38
    for docnum, positions in doc_positions:
paul@0 39
        w.write_positions(docnum, positions)
paul@0 40
    w.reset()
paul@0 41
w.close()
paul@0 42
paul@3 43
f = open("test", "rb")
paul@0 44
r = iixr.PositionReader(f)
paul@0 45
for doc_positions in all_doc_positions:
paul@0 46
    for docnum, positions in doc_positions:
paul@0 47
        d, p = r.read_positions()
paul@0 48
        print docnum == d, docnum, d
paul@0 49
        print positions == p, positions, p
paul@0 50
    r.reset()
paul@0 51
r.close()
paul@0 52
paul@0 53
f = open("test", "wb")
paul@0 54
w = iixr.PositionWriter(f)
paul@0 55
offsets = []
paul@0 56
for doc_positions in all_doc_positions:
paul@0 57
    offsets.append(
paul@0 58
        w.write_all_positions(doc_positions)
paul@0 59
        )
paul@0 60
w.close()
paul@0 61
paul@3 62
f = open("test", "rb")
paul@0 63
r = iixr.PositionReader(f)
paul@0 64
offsets.reverse()
paul@0 65
all_doc_positions.reverse()
paul@0 66
for offset, doc_positions in zip(offsets, all_doc_positions):
paul@0 67
    dp = r.read_all_positions(offset)
paul@0 68
    print doc_positions == dp, doc_positions, dp
paul@0 69
r.close()
paul@0 70
paul@9 71
# Test fields: write the field values for each document, then read them back in order.
paul@9 72
paul@8 73
doc_fields = [
paul@9 74
    (123, ["testing", "fields", "stored", "compressed"]),
paul@9 75
    (456, ["fields", "for a second", "document"]),
paul@9 76
    (789, ["field value"]),
paul@9 77
    (1234, []),
paul@9 78
    (2345, ["abc", "def"]),
paul@9 79
    (3456, ["apple", "banana", "cherry"]),
paul@9 80
    (4567, ["drue", "eple"])
paul@8 81
    ]
paul@8 82
paul@8 83
f = open("testF", "wb")
paul@8 84
w = iixr.FieldWriter(f)
paul@9 85
for docnum, fields in doc_fields:
paul@9 86
    w.write_fields(docnum, fields)
paul@8 87
w.close()
paul@8 88
paul@8 89
f = open("testF", "rb")
paul@8 90
r = iixr.FieldReader(f)
paul@9 91
for docnum, fields in doc_fields:
paul@9 92
    dn, df = r.read_fields()
paul@9 93
    print docnum == dn, docnum, dn
paul@8 94
    print fields == df, fields, df
paul@8 95
r.close()
paul@8 96
paul@9 97
# Test field index files: write (document number, offset) pairs, then read them back in order.
paul@9 98
paul@9 99
indexed_docs = [
paul@9 100
    (123, 100000987),
paul@9 101
    (456, 100004321),
paul@9 102
    (789, 100008765)
paul@9 103
    ]
paul@9 104
paul@9 105
f = open("testFI", "wb")
paul@9 106
w = iixr.FieldIndexWriter(f)
paul@9 107
for docnum, offset in indexed_docs:
paul@9 108
    w.write_document(docnum, offset)
paul@9 109
w.close()
paul@9 110
paul@9 111
f = open("testFI", "rb")
paul@9 112
r = iixr.FieldIndexReader(f)
paul@9 113
for docnum, offset in indexed_docs:
paul@9 114
    dn, o = r.read_document()
paul@9 115
    print docnum == dn, docnum, dn
paul@9 116
    print offset == o, offset, o
paul@9 117
r.close()
paul@9 118
paul@9 119
# Test field dictionaries: look up documents in reverse order, then try document numbers that were never written.
paul@9 120
paul@9 121
f = open("testF", "wb")
paul@9 122
w = iixr.FieldWriter(f)
paul@9 123
f2 = open("testFI", "wb")
paul@9 124
w2 = iixr.FieldIndexWriter(f2)
paul@9 125
wd = iixr.FieldDictionaryWriter(w, w2, 3)
paul@9 126
for docnum, fields in doc_fields:
paul@9 127
    wd.write_fields(docnum, fields)
paul@9 128
wd.close()
paul@9 129
paul@9 130
f = open("testF", "rb")
paul@9 131
r = iixr.FieldReader(f)
paul@9 132
f2 = open("testFI", "rb")
paul@9 133
r2 = iixr.FieldIndexReader(f2)
paul@9 134
rd = iixr.FieldDictionaryReader(r, r2)
paul@9 135
doc_fields_reversed = doc_fields[:]
paul@9 136
doc_fields_reversed.reverse()
paul@9 137
for docnum, fields in doc_fields_reversed:
paul@9 138
    df = rd.read_fields(docnum)
paul@9 139
    print fields == df, fields, df
paul@9 140
for docnum in (13579, 246810):
paul@9 141
    df = rd.read_fields(docnum)
paul@9 142
    print df is None, df
paul@9 143
rd.close()
paul@9 144
paul@9 145
# Test terms: write (term, offset) pairs, then read them back in order.
paul@9 146
paul@2 147
terms = [
paul@2 148
    ("aardvark",  100000123),
paul@2 149
    ("anteater",  100000456),
paul@2 150
    ("badger",    100000789),
paul@2 151
    ("bull",     1000001234),
paul@2 152
    ("bulldog",  1000002345),
paul@2 153
    ("cat",      1000003456)
paul@2 154
    ]
paul@2 155
paul@2 156
f = open("test", "wb")
paul@2 157
w = iixr.TermWriter(f)
paul@2 158
for term, offset in terms:
paul@2 159
    w.write_term(term, offset)
paul@2 160
w.close()
paul@2 161
paul@3 162
f = open("test", "rb")
paul@2 163
r = iixr.TermReader(f)
paul@2 164
for term, offset in terms:
paul@2 165
    t, o = r.read_term()
paul@2 166
    print term == t, term, t
paul@2 167
    print offset == o, offset, o
paul@2 168
r.close()
paul@2 169
paul@9 170
# Test terms in index files.
paul@9 171
paul@3 172
indexed_terms = [
paul@3 173
    ("aardvark",  100000123, 200000321),
paul@3 174
    ("anteater",  100000456, 200000654),
paul@3 175
    ("badger",    100000789, 200000987),
paul@3 176
    ("bull",     1000001234, 200004321),
paul@3 177
    ("bulldog",  1000002345, 200005432),
paul@3 178
    ("cat",      1000003456, 200006543)
paul@3 179
    ]
paul@3 180
paul@3 181
f = open("test", "wb")
paul@3 182
w = iixr.TermIndexWriter(f)
paul@3 183
for term, offset, info_offset in indexed_terms:
paul@3 184
    w.write_term(term, offset, info_offset)
paul@3 185
w.close()
paul@3 186
paul@3 187
f = open("test", "rb")
paul@3 188
r = iixr.TermIndexReader(f)
paul@3 189
for term, offset, info_offset in indexed_terms:
paul@3 190
    t, o, i = r.read_term()
paul@3 191
    print term == t, term, t
paul@3 192
    print offset == o, offset, o
paul@3 193
    print info_offset == i, info_offset, i
paul@3 194
r.close()
paul@3 195
paul@9 196
# Test dictionaries with only term data, calling the underscore-prefixed _write_term and _find_term methods directly.
paul@9 197
paul@3 198
f = open("test", "wb")
paul@3 199
w = iixr.TermWriter(f)
paul@3 200
f2 = open("testI", "wb")
paul@3 201
w2 = iixr.TermIndexWriter(f2)
paul@5 202
f3 = open("testP", "wb")
paul@5 203
w3 = iixr.PositionWriter(f3)
paul@5 204
wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
paul@3 205
for term, offset in terms:
paul@9 206
    wd._write_term(term, offset)
paul@5 207
wd.close()
paul@3 208
paul@3 209
f = open("test", "rb")
paul@3 210
r = iixr.TermReader(f)
paul@3 211
f2 = open("testI", "rb")
paul@3 212
r2 = iixr.TermIndexReader(f2)
paul@5 213
f3 = open("testP", "rb")
paul@5 214
r3 = iixr.PositionReader(f3)
paul@5 215
rd = iixr.TermDictionaryReader(r, r2, r3)
paul@3 216
terms_reversed = terms[:]
paul@3 217
terms_reversed.reverse()
paul@3 218
for term, offset in terms_reversed:
paul@9 219
    o = rd._find_term(term)
paul@3 220
    print offset == o, offset, o
paul@3 221
for term in ("dog", "dingo"):
paul@9 222
    o = rd._find_term(term)
paul@3 223
    print o is None, o
paul@5 224
rd.close()
paul@5 225
paul@9 226
# Test dictionaries with term and position data.
paul@9 227
paul@5 228
terms_with_positions = [
paul@5 229
    ("aardvark",  [(1, [2, 45, 96]), (20, [13])]),
paul@5 230
    ("anteater",  [(1, [43, 44])]),
paul@5 231
    ("badger",    [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
paul@5 232
    ("bull",      [(6, [128]), (16, [12])]),
paul@5 233
    ("bulldog",   [(43, [17, 19, 256, 512])]),
paul@5 234
    ("cat",       [(123, [12, 145, 196]), (1200, [113])])
paul@5 235
    ]
paul@5 236
paul@5 237
f = open("test", "wb")
paul@5 238
w = iixr.TermWriter(f)
paul@5 239
f2 = open("testI", "wb")
paul@5 240
w2 = iixr.TermIndexWriter(f2)
paul@5 241
f3 = open("testP", "wb")
paul@5 242
w3 = iixr.PositionWriter(f3)
paul@5 243
wd = iixr.TermDictionaryWriter(w, w2, w3, 3)
paul@5 244
for term, doc_positions in terms_with_positions:
paul@5 245
    wd.write_term_positions(term, doc_positions)
paul@5 246
wd.close()
paul@5 247
paul@5 248
f = open("test", "rb")
paul@5 249
r = iixr.TermReader(f)
paul@5 250
f2 = open("testI", "rb")
paul@5 251
r2 = iixr.TermIndexReader(f2)
paul@5 252
f3 = open("testP", "rb")
paul@5 253
r3 = iixr.PositionReader(f3)
paul@5 254
rd = iixr.TermDictionaryReader(r, r2, r3)
paul@5 255
terms_reversed = terms_with_positions[:]
paul@5 256
terms_reversed.reverse()
paul@5 257
for term, doc_positions in terms_reversed:
paul@5 258
    dp = rd.find_positions(term)
paul@5 259
    print doc_positions == dp, doc_positions, dp
paul@5 260
for term in ("dog", "dingo"):
paul@5 261
    dp = rd.find_positions(term)
paul@5 262
    print dp is None, dp
paul@5 263
rd.close()
paul@3 264
paul@9 265
# Test high-level index operations: index each document's words by position, then look up selected terms.
paul@9 266
paul@6 267
docs = [
paul@6 268
    (1, "The cat sat on the mat"),
paul@6 269
    (2, "Every good boy deserves football"),
paul@6 270
    (13, "One good turn deserves another"),
paul@6 271
    (14, "Every man for himself"),
paul@6 272
    (25, "Red sky at night shepherd's delight"),
paul@6 273
    (36, "She sells sea shells on the sea shore")
paul@6 274
    ]
paul@6 275
paul@6 276
doc_tests = [
paul@6 277
    ("Every", [(2, [0]), (14, [0])]),
paul@6 278
    ("good", [(2, [1]), (13, [1])]),
paul@6 279
    ("deserves", [(2, [3]), (13, [3])]),
paul@6 280
    ("sea", [(36, [2, 6])])
paul@6 281
    ]
paul@6 282
paul@7 283
index = iixr.Index("test_index")
paul@7 284
wi = index.get_writer(3)
paul@6 285
for docnum, text in docs:
paul@6 286
    for position, term in enumerate(text.split()):
paul@6 287
        wi.add_position(term, docnum, position)
paul@6 288
wi.close()
paul@6 289
paul@7 290
rd = index.get_reader()
paul@6 291
for term, doc_positions in doc_tests:
paul@6 292
    dp = rd.find_positions(term)
paul@6 293
    print doc_positions == dp, doc_positions, dp
paul@7 294
index.close()
paul@6 295
paul@0 296
# vim: tabstop=4 expandtab shiftwidth=4