paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | import iixr |
paul@0 | 4 | |
# Test basic data types: write a list of integers to a file, then read them
# back and compare.  The values sit either side of 127/128 and 255/256 -
# presumably byte boundaries of the number encoding (TODO confirm in iixr).

numbers = [12345678, 0, 1, 127, 128, 255, 256]

out_file = open("test", "wb")
writer = iixr.FileWriter(out_file)
for value in numbers:
    writer.write_number(value)
writer.close()

# Read the numbers back in the order they were written.  Each output line
# holds the comparison result followed by the expected and the actual value.

in_file = open("test", "rb")
reader = iixr.FileReader(in_file)
for expected in numbers:
    actual = reader.read_number()
    print("%s %s %s" % (expected == actual, expected, actual))
reader.close()
paul@0 | 21 | |
# Test positions: every inner list is a group of (docnum, positions) pairs.
# The pairs of a group are written one after another and the writer is reset
# once the group is complete; the reader mirrors the same sequence.
# NOTE(review): the placement of reset() after each group (rather than after
# each pair) is inferred from the grouping of the data - confirm.

all_doc_positions = [
    [(123, [1, 3, 5, 15, 25]), (124, [0, 100])],
    [(78, [9]), (196, [10, 11])],
    ]

out_file = open("test", "wb")
writer = iixr.PositionWriter(out_file)
for group in all_doc_positions:
    for docnum, positions in group:
        writer.write_positions(docnum, positions)
    writer.reset()
writer.close()

# Read everything back; two lines are printed per pair, one comparing the
# document number and one comparing the position list.

in_file = open("test", "rb")
reader = iixr.PositionReader(in_file)
for group in all_doc_positions:
    for docnum, positions in group:
        found_docnum, found_positions = reader.read_positions()
        print("%s %s %s" % (docnum == found_docnum, docnum, found_docnum))
        print("%s %s %s" % (positions == found_positions, positions, found_positions))
    reader.reset()
reader.close()
paul@0 | 52 | |
# Test whole-group position access: write each group of (docnum, positions)
# pairs with write_all_positions, remembering the value returned for it, then
# fetch the groups through read_all_positions using those values, visiting
# them in the opposite order to the one in which they were written.

f = open("test", "wb")
w = iixr.PositionWriter(f)
offsets = []
for doc_positions in all_doc_positions:
    offsets.append(w.write_all_positions(doc_positions))
w.close()

f = open("test", "rb")
r = iixr.PositionReader(f)

# Walk the (offset, group) pairs backwards without reversing either list in
# place, so that the shared all_doc_positions fixture keeps its original order
# (the other tests in this file likewise reverse a copy, not the fixture).

for offset, doc_positions in reversed(list(zip(offsets, all_doc_positions))):
    dp = r.read_all_positions(offset)
    print("%s %s %s" % (doc_positions == dp, doc_positions, dp))
r.close()
paul@0 | 70 | |
# Test fields: (docnum, field values) records, including one document with no
# field values at all, are written to a file and then read back in order.

doc_fields = [
    (123, ["testing", "fields", "stored", "compressed"]),
    (456, ["fields", "for a second", "document"]),
    (789, ["field value"]),
    (1234, []),
    (2345, ["abc", "def"]),
    (3456, ["apple", "banana", "cherry"]),
    (4567, ["drue", "eple"]),
    ]

out_file = open("testF", "wb")
writer = iixr.FieldWriter(out_file)
for docnum, fields in doc_fields:
    writer.write_fields(docnum, fields)
writer.close()

# Two lines are printed per record: the document number comparison and the
# field list comparison.

in_file = open("testF", "rb")
reader = iixr.FieldReader(in_file)
for docnum, fields in doc_fields:
    found_docnum, found_fields = reader.read_fields()
    print("%s %s %s" % (docnum == found_docnum, docnum, found_docnum))
    print("%s %s %s" % (fields == found_fields, fields, found_fields))
reader.close()
paul@8 | 96 | |
# Test field index files: (docnum, offset) entries are written with
# FieldIndexWriter and read back in the same order with FieldIndexReader.

indexed_docs = [
    (123, 100000987),
    (456, 100004321),
    (789, 100008765),
    ]

out_file = open("testFI", "wb")
writer = iixr.FieldIndexWriter(out_file)
for docnum, offset in indexed_docs:
    writer.write_document(docnum, offset)
writer.close()

# Each entry produces one line for the document number and one for the offset.

in_file = open("testFI", "rb")
reader = iixr.FieldIndexReader(in_file)
for docnum, offset in indexed_docs:
    found_docnum, found_offset = reader.read_document()
    print("%s %s %s" % (docnum == found_docnum, docnum, found_docnum))
    print("%s %s %s" % (offset == found_offset, offset, found_offset))
reader.close()
paul@9 | 118 | |
# Test field dictionaries: a FieldDictionaryWriter is built over a field
# writer and a field index writer and fed every record from doc_fields.
# NOTE(review): the meaning of the final argument (3) is not visible here -
# presumably an indexing interval; confirm against iixr.

field_file = open("testF", "wb")
field_writer = iixr.FieldWriter(field_file)
index_file = open("testFI", "wb")
index_writer = iixr.FieldIndexWriter(index_file)
dict_writer = iixr.FieldDictionaryWriter(field_writer, index_writer, 3)
for docnum, fields in doc_fields:
    dict_writer.write_fields(docnum, fields)
dict_writer.close()

# Look the documents up by number, last-written first, using a reversed copy
# so that doc_fields itself is left untouched.

field_file = open("testF", "rb")
field_reader = iixr.FieldReader(field_file)
index_file = open("testFI", "rb")
index_reader = iixr.FieldIndexReader(index_file)
dict_reader = iixr.FieldDictionaryReader(field_reader, index_reader)
doc_fields_reversed = doc_fields[::-1]
for docnum, fields in doc_fields_reversed:
    found_fields = dict_reader.read_fields(docnum)
    print("%s %s %s" % (fields == found_fields, fields, found_fields))

# Document numbers that were never written; None is the expected result.

for docnum in (13579, 246810):
    found_fields = dict_reader.read_fields(docnum)
    print("%s %s" % (found_fields is None, found_fields))
dict_reader.close()
paul@9 | 144 | |
# Test terms: (term, offset) entries are written with TermWriter and read back
# in the same order with TermReader.

terms = [
    ("aardvark", 100000123),
    ("anteater", 100000456),
    ("badger", 100000789),
    ("bull", 1000001234),
    ("bulldog", 1000002345),
    ("cat", 1000003456),
    ]

out_file = open("test", "wb")
writer = iixr.TermWriter(out_file)
for term, offset in terms:
    writer.write_term(term, offset)
writer.close()

# Each entry produces one line for the term and one for its offset.

in_file = open("test", "rb")
reader = iixr.TermReader(in_file)
for term, offset in terms:
    found_term, found_offset = reader.read_term()
    print("%s %s %s" % (term == found_term, term, found_term))
    print("%s %s %s" % (offset == found_offset, offset, found_offset))
reader.close()
paul@2 | 169 | |
# Test terms in index files: (term, offset, info offset) entries are written
# with TermIndexWriter and read back in order with TermIndexReader.

indexed_terms = [
    ("aardvark", 100000123, 200000321),
    ("anteater", 100000456, 200000654),
    ("badger", 100000789, 200000987),
    ("bull", 1000001234, 200004321),
    ("bulldog", 1000002345, 200005432),
    ("cat", 1000003456, 200006543),
    ]

out_file = open("test", "wb")
writer = iixr.TermIndexWriter(out_file)
for term, offset, info_offset in indexed_terms:
    writer.write_term(term, offset, info_offset)
writer.close()

# Three lines are printed per entry: term, offset and info offset comparisons.

in_file = open("test", "rb")
reader = iixr.TermIndexReader(in_file)
for term, offset, info_offset in indexed_terms:
    found_term, found_offset, found_info = reader.read_term()
    print("%s %s %s" % (term == found_term, term, found_term))
    print("%s %s %s" % (offset == found_offset, offset, found_offset))
    print("%s %s %s" % (info_offset == found_info, info_offset, found_info))
reader.close()
paul@3 | 195 | |
# Test dictionaries with only term data: a TermDictionaryWriter is built over
# term, term index and position writers, and the (term, offset) entries from
# terms are stored through its private _write_term method.
# NOTE(review): the meaning of the final argument (3) is not visible here -
# presumably an indexing interval; confirm against iixr.

term_file = open("test", "wb")
term_writer = iixr.TermWriter(term_file)
index_file = open("testI", "wb")
index_writer = iixr.TermIndexWriter(index_file)
position_file = open("testP", "wb")
position_writer = iixr.PositionWriter(position_file)
dict_writer = iixr.TermDictionaryWriter(term_writer, index_writer, position_writer, 3)
for term, offset in terms:
    dict_writer._write_term(term, offset)
dict_writer.close()

# Look the terms up last-written first, working on a reversed copy so that
# the terms list itself is left untouched.

term_file = open("test", "rb")
term_reader = iixr.TermReader(term_file)
index_file = open("testI", "rb")
index_reader = iixr.TermIndexReader(index_file)
position_file = open("testP", "rb")
position_reader = iixr.PositionReader(position_file)
dict_reader = iixr.TermDictionaryReader(term_reader, index_reader, position_reader)
terms_reversed = terms[::-1]
for term, offset in terms_reversed:
    found_offset = dict_reader._find_term(term)
    print("%s %s %s" % (offset == found_offset, offset, found_offset))

# Terms that were never written; None is the expected result.

for term in ("dog", "dingo"):
    found_offset = dict_reader._find_term(term)
    print("%s %s" % (found_offset is None, found_offset))
dict_reader.close()
paul@5 | 225 | |
# Test dictionaries with term and position data: every term carries a list of
# (docnum, positions) pairs which is stored with write_term_positions and
# retrieved with find_positions.
# NOTE(review): the meaning of the final writer argument (3) is not visible
# here - presumably an indexing interval; confirm against iixr.

terms_with_positions = [
    ("aardvark", [(1, [2, 45, 96]), (20, [13])]),
    ("anteater", [(1, [43, 44])]),
    ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
    ("bull", [(6, [128]), (16, [12])]),
    ("bulldog", [(43, [17, 19, 256, 512])]),
    ("cat", [(123, [12, 145, 196]), (1200, [113])]),
    ]

term_file = open("test", "wb")
term_writer = iixr.TermWriter(term_file)
index_file = open("testI", "wb")
index_writer = iixr.TermIndexWriter(index_file)
position_file = open("testP", "wb")
position_writer = iixr.PositionWriter(position_file)
dict_writer = iixr.TermDictionaryWriter(term_writer, index_writer, position_writer, 3)
for term, doc_positions in terms_with_positions:
    dict_writer.write_term_positions(term, doc_positions)
dict_writer.close()

# Look the terms up last-written first, working on a reversed copy of the
# fixture.

term_file = open("test", "rb")
term_reader = iixr.TermReader(term_file)
index_file = open("testI", "rb")
index_reader = iixr.TermIndexReader(index_file)
position_file = open("testP", "rb")
position_reader = iixr.PositionReader(position_file)
dict_reader = iixr.TermDictionaryReader(term_reader, index_reader, position_reader)
terms_reversed = terms_with_positions[::-1]
for term, doc_positions in terms_reversed:
    found_positions = dict_reader.find_positions(term)
    print("%s %s %s" % (doc_positions == found_positions, doc_positions, found_positions))

# Terms that were never written; None is the expected result.

for term in ("dog", "dingo"):
    found_positions = dict_reader.find_positions(term)
    print("%s %s" % (found_positions is None, found_positions))
dict_reader.close()
paul@3 | 264 | |
# Test high-level index operations: index a handful of small documents word
# by word, then check the (docnum, positions) results for selected terms.

docs = [
    (1, "The cat sat on the mat"),
    (2, "Every good boy deserves football"),
    (13, "One good turn deserves another"),
    (14, "Every man for himself"),
    (25, "Red sky at night shepherd's delight"),
    (36, "She sells sea shells on the sea shore"),
    ]

# Expected results: term -> list of (docnum, word positions within the text).

doc_tests = [
    ("Every", [(2, [0]), (14, [0])]),
    ("good", [(2, [1]), (13, [1])]),
    ("deserves", [(2, [3]), (13, [3])]),
    ("sea", [(36, [2, 6])]),
    ]

# Terms are the whitespace-separated words of each text; a term's position is
# its zero-based word number within the document.
# NOTE(review): the meaning of the get_writer argument (3) is not visible
# here - confirm against iixr.

index = iixr.Index("test_index")
wi = index.get_writer(3)
for docnum, text in docs:
    words = text.split()
    for position, word in enumerate(words):
        wi.add_position(word, docnum, position)
wi.close()

rd = index.get_reader()
for term, expected in doc_tests:
    found = rd.find_positions(term)
    print("%s %s %s" % (expected == found, expected, found))
index.close()
paul@6 | 295 | |
paul@0 | 296 | # vim: tabstop=4 expandtab shiftwidth=4 |