iixr (annotate test.py in 89465c390a46)

iixr

Annotated test.py

67:89465c390a46

2009-10-03

Paul Boddie

Added a document cache, used when reading fields. Optimised read_number slightly using arrays.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@44	3	from iixr.files import *
paul@44	4	from iixr.fields import *
paul@44	5	from iixr.terms import *
paul@44	6	from iixr.positions import *
paul@44	7	from iixr.index import *
paul@59	8	import os, sys
paul@18	9
# Remove old test files.
#
# Every removal is best-effort: anything that does not exist (for example on
# the first run) is skipped without complaint.

for filename in ("test", "testF", "testFI", "testI", "testP", "testPI"):
    try:
        os.remove(filename)
    except OSError:
        pass

# Each index directory is handled on its own so that a missing directory does
# not stop the remaining directories from being cleaned up.

for dirname in ("test_index", "test_index2", "test_index3"):
    try:
        for filename in os.listdir(dirname):
            os.remove(os.path.join(dirname, filename))
        os.rmdir(dirname)
    except OSError:
        pass

# When "clean" is given on the command line, stop once the files are removed.

if "clean" in sys.argv:
    sys.exit(0)
paul@59	28
paul@9	29	# Test basic data types.
paul@9	30
paul@5	31	numbers = [12345678, 0, 1, 127, 128, 255, 256]
paul@0	32
paul@0	33	f = open("test", "wb")
paul@44	34	w = FileWriter(f)
paul@0	35	for number in numbers:
paul@0	36	w.write_number(number)
paul@0	37	w.close()
paul@0	38
paul@3	39	f = open("test", "rb")
paul@44	40	r = FileReader(f)
paul@0	41	for number in numbers:
paul@0	42	n = r.read_number()
paul@0	43	print number == n, number, n
paul@0	44	r.close()
paul@0	45
paul@9	46	# Test positions.
paul@9	47
paul@0	48	all_doc_positions = [
paul@0	49	[
paul@0	50	(123, [1, 3, 5, 15, 25]),
paul@19	51	(124, [0, 100]),
paul@19	52	(125, [11, 99, 199]),
paul@19	53	(130, [77, 78, 80, 82, 89])
paul@0	54	],
paul@0	55	[
paul@0	56	(78, [9]),
paul@19	57	(196, [10, 11]),
paul@19	58	(197, [17, 21, 30])
paul@0	59	]
paul@0	60	]
paul@0	61
paul@19	62	f = open("testP", "wb")
paul@44	63	w = PositionWriter(f)
paul@0	64	for doc_positions in all_doc_positions:
paul@0	65	for docnum, positions in doc_positions:
paul@0	66	w.write_positions(docnum, positions)
paul@0	67	w.reset()
paul@0	68	w.close()
paul@0	69
paul@19	70	f = open("testP", "rb")
paul@44	71	r = PositionIterator(f, 0, None)
paul@0	72	for doc_positions in all_doc_positions:
paul@0	73	for docnum, positions in doc_positions:
paul@0	74	d, p = r.read_positions()
paul@0	75	print docnum == d, docnum, d
paul@0	76	print positions == p, positions, p
paul@0	77	r.reset()
paul@0	78	r.close()
paul@0	79
paul@19	80	# Test position index files.
paul@19	81
paul@19	82	indexed_positions = [
paul@19	83	[
paul@19	84	(1234, 0, 100),
paul@19	85	(2345, 700, 100),
paul@19	86	(3456, 1900, 50)
paul@19	87	],
paul@19	88	[
paul@19	89	(4567, 2800, 20)
paul@19	90	]
paul@19	91	]
paul@19	92
paul@19	93	offsets = []
paul@19	94	f = open("testPI", "wb")
paul@44	95	w = PositionIndexWriter(f)
paul@19	96	for term_positions in indexed_positions:
paul@19	97	offset = None
paul@19	98	doc_frequency = 0
paul@19	99	w.reset()
paul@19	100	for docnum, pos_offset, count in term_positions:
paul@19	101	if offset is None:
paul@55	102	offset = w.f.tell()
paul@55	103	w.write_positions(docnum, pos_offset, count)
paul@19	104	doc_frequency += count
paul@19	105	offsets.append((offset, doc_frequency))
paul@19	106	w.close()
paul@19	107
paul@44	108	r = PositionIndexOpener("testPI")
paul@19	109	offsets.reverse()
paul@19	110	indexed_positions.reverse()
paul@19	111	for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
paul@19	112	found_positions = r.read_term_positions(offset, doc_frequency)
paul@19	113	for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, found_positions):
paul@19	114	print docnum == dn, docnum, dn
paul@19	115	print pos_offset == po, pos_offset, po
paul@19	116	print count == c, count, c
paul@19	117	r.close()
paul@19	118
paul@19	119	# Test position dictionaries.
paul@19	120
paul@19	121	f = open("testP", "wb")
paul@44	122	w = PositionWriter(f)
paul@19	123	f2 = open("testPI", "wb")
paul@44	124	w2 = PositionIndexWriter(f2)
paul@44	125	wd = PositionDictionaryWriter(w, w2, 2)
paul@0	126	offsets = []
paul@0	127	for doc_positions in all_doc_positions:
paul@19	128	offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
paul@19	129	offsets.append((offset, doc_frequency))
paul@20	130	wd.close()
paul@0	131
paul@44	132	r = PositionOpener("testP")
paul@44	133	r2 = PositionIndexOpener("testPI")
paul@44	134	rd = PositionDictionaryReader(r, r2)
paul@0	135	offsets.reverse()
paul@0	136	all_doc_positions.reverse()
paul@19	137	for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
paul@19	138	dp = list(rd.read_term_positions(offset, doc_frequency))
paul@0	139	print doc_positions == dp, doc_positions, dp
paul@20	140	rd.close()
paul@0	141
paul@9	142	# Test fields.
paul@9	143
paul@8	144	doc_fields = [
paul@9	145	(123, ["testing", "fields", "stored", "compressed"]),
paul@9	146	(456, ["fields", "for a second", "document"]),
paul@9	147	(789, ["field value"]),
paul@9	148	(1234, []),
paul@9	149	(2345, ["abc", "def"]),
paul@9	150	(3456, ["apple", "banana", "cherry"]),
paul@9	151	(4567, ["drue", "eple"])
paul@8	152	]
paul@8	153
paul@8	154	f = open("testF", "wb")
paul@44	155	w = FieldWriter(f)
paul@9	156	for docnum, fields in doc_fields:
paul@13	157	w.write_fields(docnum, list(enumerate(fields)))
paul@8	158	w.close()
paul@8	159
paul@8	160	f = open("testF", "rb")
paul@44	161	r = FieldReader(f)
paul@9	162	for docnum, fields in doc_fields:
paul@9	163	dn, df = r.read_fields()
paul@9	164	print docnum == dn, docnum, dn
paul@13	165	print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@8	166	r.close()
paul@8	167
paul@9	168	# Test field index files.
paul@9	169
paul@9	170	indexed_docs = [
paul@9	171	(123, 100000987),
paul@9	172	(456, 100004321),
paul@9	173	(789, 100008765)
paul@9	174	]
paul@9	175
paul@9	176	f = open("testFI", "wb")
paul@44	177	w = FieldIndexWriter(f)
paul@9	178	for docnum, offset in indexed_docs:
paul@9	179	w.write_document(docnum, offset)
paul@9	180	w.close()
paul@9	181
paul@9	182	f = open("testFI", "rb")
paul@44	183	r = FieldIndexReader(f)
paul@9	184	for docnum, offset in indexed_docs:
paul@9	185	dn, o = r.read_document()
paul@9	186	print docnum == dn, docnum, dn
paul@9	187	print offset == o, offset, o
paul@9	188	r.close()
paul@9	189
paul@9	190	# Test field dictionaries.
paul@9	191
paul@9	192	f = open("testF", "wb")
paul@44	193	w = FieldWriter(f)
paul@9	194	f2 = open("testFI", "wb")
paul@44	195	w2 = FieldIndexWriter(f2)
paul@44	196	wd = FieldDictionaryWriter(w, w2, 3)
paul@9	197	for docnum, fields in doc_fields:
paul@13	198	wd.write_fields(docnum, list(enumerate(fields)))
paul@9	199	wd.close()
paul@9	200
paul@9	201	f = open("testF", "rb")
paul@44	202	r = FieldReader(f)
paul@9	203	f2 = open("testFI", "rb")
paul@44	204	r2 = FieldIndexReader(f2)
paul@44	205	rd = FieldDictionaryReader(r, r2)
paul@9	206	doc_fields_reversed = doc_fields[:]
paul@9	207	doc_fields_reversed.reverse()
paul@9	208	for docnum, fields in doc_fields_reversed:
paul@25	209	df = dict(rd.get_fields(docnum))
paul@25	210	print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
paul@9	211	for docnum in (13579, 246810):
paul@13	212	df = rd.get_fields(docnum)
paul@9	213	print df is None, df
paul@13	214
paul@13	215	# (Test sequential access.)
paul@13	216
paul@13	217	rd.rewind()
paul@13	218	for docnum, fields in doc_fields:
paul@13	219	dn, df = rd.read_fields()
paul@13	220	print docnum == dn, docnum, dn
paul@13	221	print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@9	222	rd.close()
paul@9	223
paul@9	224	# Test terms.
paul@9	225
paul@2	226	terms = [
paul@19	227	# term offset frequency doc_frequency
paul@19	228	("aardvark", 100000123, 1, 1),
paul@19	229	("anteater", 100000456, 2, 1),
paul@19	230	("badger", 100000789, 13, 7),
paul@19	231	("bull", 1000001234, 59, 17),
paul@19	232	("bulldog", 1000002345, 99, 80),
paul@19	233	("cat", 1000003456, 89, 28)
paul@2	234	]
paul@2	235
paul@2	236	f = open("test", "wb")
paul@44	237	w = TermWriter(f)
paul@19	238	for term, offset, frequency, doc_frequency in terms:
paul@19	239	w.write_term(term, offset, frequency, doc_frequency)
paul@2	240	w.close()
paul@2	241
paul@3	242	f = open("test", "rb")
paul@44	243	r = TermReader(f)
paul@19	244	for term, offset, frequency, doc_frequency in terms:
paul@19	245	t, o, fr, df = r.read_term()
paul@2	246	print term == t, term, t
paul@2	247	print offset == o, offset, o
paul@11	248	print frequency == fr, frequency, fr
paul@19	249	print doc_frequency == df, doc_frequency, df
paul@2	250	r.close()
paul@2	251
paul@9	252	# Test terms in index files.
paul@9	253
paul@3	254	indexed_terms = [
paul@19	255	# term offset frequency doc_frequency info_offset
paul@19	256	("aardvark", 100000123, 1, 1, 200000321),
paul@19	257	("anteater", 100000456, 2, 1, 200000654),
paul@19	258	("badger", 100000789, 13, 7, 200000987),
paul@19	259	("bull", 1000001234, 59, 17, 200004321),
paul@19	260	("bulldog", 1000002345, 99, 80, 200005432),
paul@19	261	("cat", 1000003456, 89, 28, 200006543)
paul@3	262	]
paul@3	263
paul@3	264	f = open("test", "wb")
paul@44	265	w = TermIndexWriter(f)
paul@19	266	for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19	267	w.write_term(term, offset, frequency, doc_frequency, info_offset)
paul@3	268	w.close()
paul@3	269
paul@3	270	f = open("test", "rb")
paul@44	271	r = TermIndexReader(f)
paul@19	272	for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19	273	t, o, fr, df, i = r.read_term()
paul@3	274	print term == t, term, t
paul@3	275	print offset == o, offset, o
paul@11	276	print frequency == fr, frequency, fr
paul@19	277	print doc_frequency == df, doc_frequency, df
paul@3	278	print info_offset == i, info_offset, i
paul@3	279	r.close()
paul@3	280
paul@9	281	# Test dictionaries with only term data.
paul@9	282
paul@3	283	f = open("test", "wb")
paul@44	284	w = TermWriter(f)
paul@3	285	f2 = open("testI", "wb")
paul@44	286	w2 = TermIndexWriter(f2)
paul@20	287	f3 = open("testP", "wb")
paul@44	288	w3 = PositionWriter(f3)
paul@20	289	f4 = open("testPI", "wb")
paul@44	290	w4 = PositionIndexWriter(f4)
paul@44	291	wp = PositionDictionaryWriter(w3, w4, 2)
paul@44	292	wd = TermDictionaryWriter(w, w2, wp, 3)
paul@19	293	for term, offset, frequency, doc_frequency in terms:
paul@19	294	wd._write_term(term, offset, frequency, doc_frequency)
paul@5	295	wd.close()
paul@3	296
paul@3	297	f = open("test", "rb")
paul@44	298	r = TermReader(f)
paul@3	299	f2 = open("testI", "rb")
paul@44	300	r2 = TermIndexReader(f2)
paul@44	301	r3 = PositionOpener("testP")
paul@44	302	r4 = PositionIndexOpener("testPI")
paul@44	303	rp = PositionDictionaryReader(r3, r4)
paul@44	304	rd = TermDictionaryReader(r, r2, rp)
paul@3	305	terms_reversed = terms[:]
paul@3	306	terms_reversed.reverse()
paul@19	307	for term, offset, frequency, doc_frequency in terms_reversed:
paul@19	308	o, fr, df = rd._find_term(term)
paul@3	309	print offset == o, offset, o
paul@11	310	print frequency == fr, frequency, fr
paul@19	311	print doc_frequency == df, doc_frequency, df
paul@3	312	for term in ("dog", "dingo"):
paul@11	313	t = rd._find_term(term)
paul@11	314	print t is None, t
paul@25	315
paul@25	316	# (Test term prefix searching.)
paul@25	317
paul@25	318	print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
paul@25	319	print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
paul@25	320	print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
paul@25	321	print rd.find_terms("d") == [], rd.find_terms("d"), []
paul@5	322	rd.close()
paul@5	323
paul@9	324	# Test dictionaries with term and position data.
paul@9	325
paul@5	326	terms_with_positions = [
paul@5	327	("aardvark", [(1, [2, 45, 96]), (20, [13])]),
paul@5	328	("anteater", [(1, [43, 44])]),
paul@5	329	("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
paul@19	330	("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
paul@5	331	("bulldog", [(43, [17, 19, 256, 512])]),
paul@5	332	("cat", [(123, [12, 145, 196]), (1200, [113])])
paul@5	333	]
paul@5	334
paul@22	335	position_dict_tests = [
paul@22	336	("badger", 19, [55, 1333]),
paul@22	337	("badger", 20, None),
paul@22	338	("bull", 6, [128]),
paul@22	339	("bull", 26, [1, 3, 5, 7, 9]),
paul@22	340	("cat", 111, None),
paul@22	341	("cat", 123, [12, 145, 196]),
paul@22	342	("cat", 1234, None)
paul@22	343	]
paul@22	344
paul@5	345	f = open("test", "wb")
paul@44	346	w = TermWriter(f)
paul@5	347	f2 = open("testI", "wb")
paul@44	348	w2 = TermIndexWriter(f2)
paul@5	349	f3 = open("testP", "wb")
paul@44	350	w3 = PositionWriter(f3)
paul@19	351	f4 = open("testPI", "wb")
paul@44	352	w4 = PositionIndexWriter(f4)
paul@44	353	wp = PositionDictionaryWriter(w3, w4, 2)
paul@44	354	wd = TermDictionaryWriter(w, w2, wp, 3)
paul@5	355	for term, doc_positions in terms_with_positions:
paul@5	356	wd.write_term_positions(term, doc_positions)
paul@5	357	wd.close()
paul@5	358
paul@5	359	f = open("test", "rb")
paul@44	360	r = TermReader(f)
paul@5	361	f2 = open("testI", "rb")
paul@44	362	r2 = TermIndexReader(f2)
paul@44	363	r3 = PositionOpener("testP")
paul@44	364	r4 = PositionIndexOpener("testPI")
paul@44	365	rp = PositionDictionaryReader(r3, r4)
paul@44	366	rd = TermDictionaryReader(r, r2, rp)
paul@5	367	terms_reversed = terms_with_positions[:]
paul@5	368	terms_reversed.reverse()
paul@5	369	for term, doc_positions in terms_reversed:
paul@18	370	dp = list(rd.find_positions(term))
paul@5	371	print doc_positions == dp, doc_positions, dp
paul@25	372	for term in ("aaa", "dog", "dingo"):
paul@5	373	dp = rd.find_positions(term)
paul@61	374	print dp == [], dp
paul@12	375
paul@22	376	# (Test iterators.)
paul@22	377
paul@22	378	for term, docnum, positions in position_dict_tests:
paul@22	379	dp = rd.find_positions(term)
paul@22	380	pos = dp.from_document(docnum)
paul@22	381	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@22	382
paul@12	383	# (Test sequential access.)
paul@12	384
paul@12	385	rd.rewind()
paul@12	386	for term, doc_positions in terms_with_positions:
paul@19	387	t, fr, df, dp = rd.read_term()
paul@18	388	dp = list(dp)
paul@12	389	print term == t, term, t
paul@12	390	print doc_positions == dp, doc_positions, dp
paul@5	391	rd.close()
paul@3	392
paul@14	393	# Test high-level index operations (including merging).
paul@9	394
paul@6	395	docs = [
paul@6	396	(1, "The cat sat on the mat"),
paul@6	397	(2, "Every good boy deserves football"),
paul@6	398	(13, "One good turn deserves another"),
paul@6	399	(14, "Every man for himself"),
paul@6	400	(25, "Red sky at night shepherd's delight"),
paul@6	401	(36, "She sells sea shells on the sea shore")
paul@6	402	]
paul@6	403
paul@6	404	doc_tests = [
paul@11	405	("Every", 2, [(2, [0]), (14, [0])]),
paul@11	406	("good", 2, [(2, [1]), (13, [1])]),
paul@11	407	("deserves", 2, [(2, [3]), (13, [3])]),
paul@11	408	("sea", 2, [(36, [2, 6])])
paul@6	409	]
paul@6	410
paul@21	411	position_tests = [
paul@21	412	("Every", 14, [0]),
paul@21	413	("sea", 36, [2, 6]),
paul@22	414	("shells", 1, None),
paul@22	415	("shells", 37, None)
paul@21	416	]
paul@21	417
paul@60	418	phrase_tests = [
paul@62	419	(["good", "boy"], [(2, [1, 2])]),
paul@62	420	(["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
paul@62	421	(["sea", "shore"], [(36, [6, 7])])
paul@60	422	]
paul@60	423
paul@64	424	index = Index("test_index", 3, 2, 3, 6)
paul@64	425	wi = index.get_writer()
paul@6	426	for docnum, text in docs:
paul@44	427	doc = Document(docnum)
paul@6	428	for position, term in enumerate(text.split()):
paul@28	429	doc.add_position(term, position)
paul@28	430	doc.add_field(123, text)
paul@28	431	wi.add_document(doc)
paul@6	432	wi.close()
paul@6	433
paul@7	434	rd = index.get_reader()
paul@60	435
paul@60	436	# (Test searching.)
paul@60	437
paul@11	438	for term, frequency, doc_positions in doc_tests:
paul@18	439	dp = list(rd.find_positions(term))
paul@6	440	print doc_positions == dp, doc_positions, dp
paul@11	441	fr = rd.get_frequency(term)
paul@11	442	print frequency == fr, frequency, fr
paul@60	443
paul@60	444	# (Test fields.)
paul@60	445
paul@10	446	for docnum, text in docs:
paul@25	447	df = dict(rd.get_fields(docnum))
paul@25	448	print df[123] == text, text, df[123]
paul@60	449
paul@60	450	# (Test navigation.)
paul@60	451
paul@21	452	for term, docnum, positions in position_tests:
paul@21	453	dp = rd.find_positions(term)
paul@22	454	pos = dp.from_document(docnum)
paul@22	455	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@60	456
paul@60	457	# (Test phrases.)
paul@60	458
paul@60	459	for terms, results in phrase_tests:
paul@60	460	res = list(rd.find_common_positions(terms))
paul@60	461	print results == res, results, res
paul@60	462
paul@7	463	index.close()
paul@6	464
paul@58	465	# Test index updates.
paul@58	466
paul@58	467	index = Index("test_index")
paul@64	468	index2 = Index("test_index2", 3, 2, 3, 6)
paul@64	469	wi = index2.get_writer()
paul@58	470	for docnum, text in docs:
paul@58	471
paul@58	472	# Add the same documents but with different numbers.
paul@58	473
paul@58	474	doc = Document(docnum + 100)
paul@58	475	for position, term in enumerate(text.split()):
paul@58	476	doc.add_position(term, position)
paul@58	477	doc.add_field(123, text)
paul@58	478	wi.add_document(doc)
paul@58	479	wi.close()
paul@58	480
paul@58	481	index2.update([index])
paul@58	482	index.close()
paul@58	483
paul@58	484	rd = index2.get_reader()
paul@58	485	for term, frequency, doc_positions in doc_tests:
paul@58	486
paul@58	487	# Add the extra documents to the expected result.
paul@58	488
paul@59	489	orig_doc_positions = doc_positions
paul@59	490	doc_positions = doc_positions[:]
paul@59	491
paul@59	492	for docnum, positions in orig_doc_positions:
paul@58	493	doc_positions.append((docnum + 100, positions))
paul@58	494	frequency *= 2
paul@58	495
paul@58	496	dp = list(rd.find_positions(term))
paul@58	497	print doc_positions == dp, doc_positions, dp
paul@58	498	fr = rd.get_frequency(term)
paul@58	499	print frequency == fr, frequency, fr
paul@58	500	index2.close()
paul@58	501
paul@59	502	# (Test update of an empty index.)
paul@59	503
paul@59	504	index = Index("test_index")
paul@59	505	index3 = Index("test_index3")
paul@59	506	index3.update([index])
paul@59	507	index.close()
paul@59	508
paul@59	509	rd = index3.get_reader()
paul@59	510	for term, frequency, doc_positions in doc_tests:
paul@59	511	dp = list(rd.find_positions(term))
paul@59	512	print doc_positions == dp, doc_positions, dp
paul@59	513	fr = rd.get_frequency(term)
paul@59	514	print frequency == fr, frequency, fr
paul@59	515	index3.close()
paul@59	516
paul@0	517	# vim: tabstop=4 expandtab shiftwidth=4