paul@0 | 1 | #!/usr/bin/env python |
paul@96 | 2 | # encoding: iso-8859-1 |
paul@0 | 3 | |
paul@44 | 4 | from iixr.files import * |
paul@44 | 5 | from iixr.terms import * |
paul@44 | 6 | from iixr.index import * |
paul@59 | 7 | import os, sys |
paul@18 | 8 | |
# Clear away anything left behind by an earlier run of this script.

for leftover in ("test", "testMS", "testNMS", "testP", "testP2"):
    try:
        os.remove(leftover)
    except OSError:
        # Best effort: a file that cannot be removed is simply skipped.
        pass

# Empty and then remove the index directory. The first OSError raised anywhere
# in this section abandons the remainder of it.
try:
    for index_dir in ("test_index",):
        for entry in os.listdir(index_dir):
            entry_path = os.path.join(index_dir, entry)
            os.remove(entry_path)
        os.rmdir(index_dir)
except OSError:
    pass

# With "clean" anywhere on the command line, stop once the clean-up is done.
if "clean" in sys.argv:
    sys.exit(0)
paul@59 | 27 | |
paul@69 | 28 | print "- Test basic data types." |
paul@9 | 29 | |
paul@5 | 30 | numbers = [12345678, 0, 1, 127, 128, 255, 256] |
paul@0 | 31 | |
paul@0 | 32 | f = open("test", "wb") |
paul@44 | 33 | w = FileWriter(f) |
paul@89 | 34 | w.begin_record() |
paul@0 | 35 | for number in numbers: |
paul@0 | 36 | w.write_number(number) |
paul@89 | 37 | w.end_record() |
paul@0 | 38 | w.close() |
paul@0 | 39 | |
paul@3 | 40 | f = open("test", "rb") |
paul@44 | 41 | r = FileReader(f) |
paul@89 | 42 | r.begin_record() |
paul@0 | 43 | for number in numbers: |
paul@0 | 44 | n = r.read_number() |
paul@0 | 45 | print number == n, number, n |
paul@89 | 46 | r.end_record() |
paul@0 | 47 | r.close() |
paul@0 | 48 | |
paul@74 | 49 | tuples = [(0, 0), (1, 3), (2, 5), (3, 9)] |
paul@74 | 50 | |
paul@74 | 51 | f = open("testMS", "wb") |
paul@74 | 52 | w = FileWriter(f) |
paul@89 | 53 | w.begin_record() |
paul@91 | 54 | w.write_monotonic_sequence(tuples, 2) |
paul@89 | 55 | w.end_record() |
paul@74 | 56 | w.close() |
paul@74 | 57 | |
paul@74 | 58 | f = open("testMS", "rb") |
paul@74 | 59 | r = FileReader(f) |
paul@89 | 60 | r.begin_record() |
paul@91 | 61 | for t, t2 in zip(r.read_monotonic_sequence(2), tuples): |
paul@74 | 62 | print t == t2, t, t2 |
paul@89 | 63 | r.end_record() |
paul@74 | 64 | r.close() |
paul@74 | 65 | |
paul@74 | 66 | tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)] |
paul@74 | 67 | |
paul@74 | 68 | f = open("testNMS", "wb") |
paul@74 | 69 | w = FileWriter(f) |
paul@89 | 70 | w.begin_record() |
paul@91 | 71 | w.write_delta_sequence(tuples2, 2) |
paul@89 | 72 | w.end_record() |
paul@74 | 73 | w.close() |
paul@74 | 74 | |
paul@74 | 75 | f = open("testNMS", "rb") |
paul@74 | 76 | r = FileReader(f) |
paul@89 | 77 | r.begin_record() |
paul@91 | 78 | for t, t2 in zip(r.read_delta_sequence(2), tuples2): |
paul@74 | 79 | print t == t2, t, t2 |
paul@89 | 80 | r.end_record() |
paul@74 | 81 | r.close() |
paul@74 | 82 | |
paul@69 | 83 | print "- Test positions." |
paul@9 | 84 | |
paul@0 | 85 | all_doc_positions = [ |
paul@0 | 86 | [ |
paul@0 | 87 | (123, [1, 3, 5, 15, 25]), |
paul@19 | 88 | (124, [0, 100]), |
paul@19 | 89 | (125, [11, 99, 199]), |
paul@19 | 90 | (130, [77, 78, 80, 82, 89]) |
paul@0 | 91 | ], |
paul@0 | 92 | [ |
paul@0 | 93 | (78, [9]), |
paul@19 | 94 | (196, [10, 11]), |
paul@19 | 95 | (197, [17, 21, 30]) |
paul@0 | 96 | ] |
paul@0 | 97 | ] |
paul@0 | 98 | |
paul@19 | 99 | f = open("testP", "wb") |
paul@96 | 100 | w = TermWriter(f) |
paul@91 | 101 | w.begin(0, 0) |
paul@0 | 102 | for doc_positions in all_doc_positions: |
paul@96 | 103 | w.write_positions(doc_positions) |
paul@96 | 104 | w.end_record() |
paul@0 | 105 | w.close() |
paul@0 | 106 | |
paul@19 | 107 | f = open("testP", "rb") |
paul@96 | 108 | r = TermReader(f) |
paul@0 | 109 | for doc_positions in all_doc_positions: |
paul@96 | 110 | r.begin_record() |
paul@96 | 111 | dp = r.read_positions() |
paul@96 | 112 | print doc_positions == dp, doc_positions |
paul@96 | 113 | print " ", dp |
paul@0 | 114 | r.close() |
paul@0 | 115 | |
paul@74 | 116 | all_doc_positions_seq = [ |
paul@74 | 117 | [ |
paul@74 | 118 | ((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]), |
paul@74 | 119 | ((124, 1), [(0, 0), (100, 350)]), |
paul@74 | 120 | ((124, 2), [(11, 38), (99, 379), (199, 720)]), |
paul@74 | 121 | ((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)]) |
paul@74 | 122 | ], |
paul@74 | 123 | [ |
paul@74 | 124 | ((78, 1), [(9, 19)]), |
paul@74 | 125 | ((196, 0), [(10, 27), (11, 29)]), |
paul@74 | 126 | ((196, 1), [(17, 46), (21, 52), (30, 60)]) |
paul@74 | 127 | ] |
paul@74 | 128 | ] |
paul@74 | 129 | |
paul@74 | 130 | f = open("testP2", "wb") |
paul@96 | 131 | w = TermWriter(f) |
paul@91 | 132 | w.begin(2, 2) |
paul@74 | 133 | for doc_positions in all_doc_positions_seq: |
paul@96 | 134 | w.write_positions(doc_positions) |
paul@96 | 135 | w.end_record() |
paul@74 | 136 | w.close() |
paul@74 | 137 | |
paul@74 | 138 | f = open("testP2", "rb") |
paul@96 | 139 | r = TermReader(f) |
paul@74 | 140 | for doc_positions in all_doc_positions_seq: |
paul@96 | 141 | r.begin_record() |
paul@96 | 142 | dp = r.read_positions() |
paul@96 | 143 | print doc_positions == dp, doc_positions |
paul@96 | 144 | print " ", dp |
paul@9 | 145 | r.close() |
paul@9 | 146 | |
paul@69 | 147 | print "- Test dictionaries with term and position data." |
paul@9 | 148 | |
paul@5 | 149 | terms_with_positions = [ |
paul@5 | 150 | ("aardvark", [(1, [2, 45, 96]), (20, [13])]), |
paul@5 | 151 | ("anteater", [(1, [43, 44])]), |
paul@5 | 152 | ("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]), |
paul@96 | 153 | (u"bj?rn", [(11, [19, 54])]), |
paul@19 | 154 | ("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]), |
paul@5 | 155 | ("bulldog", [(43, [17, 19, 256, 512])]), |
paul@96 | 156 | ("cat", [(123, [12, 145, 196]), (1200, [113])]), |
paul@96 | 157 | (u"?", [(15, [384])]), |
paul@22 | 158 | ] |
paul@22 | 159 | |
paul@5 | 160 | f = open("test", "wb") |
paul@44 | 161 | w = TermWriter(f) |
paul@96 | 162 | w.begin(0, 0) |
paul@96 | 163 | w.write_terms(terms_with_positions) |
paul@96 | 164 | w.close() |
paul@5 | 165 | |
paul@5 | 166 | f = open("test", "rb") |
paul@96 | 167 | r = TermIterator(f) |
paul@96 | 168 | for (term, doc_positions), (t, dp) in zip(terms_with_positions, r): |
paul@96 | 169 | print term == t, term, t |
paul@96 | 170 | print doc_positions == dp, doc_positions |
paul@96 | 171 | print " ", dp |
paul@96 | 172 | r.close() |
paul@12 | 173 | |
paul@96 | 174 | f = open("test", "rb") |
paul@96 | 175 | r = TermDataIterator(f) |
paul@96 | 176 | for (term, doc_positions), (t, data) in zip(terms_with_positions, r): |
paul@96 | 177 | print term == t, term, t, data |
paul@96 | 178 | r.close() |
paul@12 | 179 | |
paul@96 | 180 | print "- Test high-level index operations." |
paul@9 | 181 | |
paul@6 | 182 | docs = [ |
paul@6 | 183 | (1, "The cat sat on the mat"), |
paul@6 | 184 | (2, "Every good boy deserves football"), |
paul@6 | 185 | (13, "One good turn deserves another"), |
paul@6 | 186 | (14, "Every man for himself"), |
paul@6 | 187 | (25, "Red sky at night shepherd's delight"), |
paul@6 | 188 | (36, "She sells sea shells on the sea shore") |
paul@6 | 189 | ] |
paul@6 | 190 | |
paul@96 | 191 | index = Index("test_index", 3) |
paul@64 | 192 | wi = index.get_writer() |
paul@6 | 193 | for docnum, text in docs: |
paul@44 | 194 | doc = Document(docnum) |
paul@6 | 195 | for position, term in enumerate(text.split()): |
paul@28 | 196 | doc.add_position(term, position) |
paul@77 | 197 | wi.add_document(doc) |
paul@77 | 198 | wi.close() |
paul@77 | 199 | |
paul@96 | 200 | print "- Test merge." |
paul@77 | 201 | |
paul@96 | 202 | l1 = list(index.get_reader()) |
paul@97 | 203 | index.merge(3) |
paul@96 | 204 | l2 = list(index.get_reader(1)) |
paul@77 | 205 | |
paul@96 | 206 | for (t1, dp1), (t2, dp2) in zip(l1, l2): |
paul@96 | 207 | print t1 == t2, t1, t2 |
paul@96 | 208 | print dp1 == dp1, dp1 |
paul@96 | 209 | print " ", dp2 |
paul@77 | 210 | |
paul@77 | 211 | index.close() |
paul@77 | 212 | |
paul@0 | 213 | # vim: tabstop=4 expandtab shiftwidth=4 |