iixr (annotate test.py in 74e2e30aabea)

iixr

Annotated test.py

93:74e2e30aabea

2011-02-11

Paul Boddie

Introduced read and write caches in order to investigate performance changes.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@44	3	from iixr.files import *
paul@44	4	from iixr.fields import *
paul@44	5	from iixr.terms import *
paul@44	6	from iixr.positions import *
paul@44	7	from iixr.index import *
paul@59	8	import os, sys
paul@18	9
# Remove old test files.

def remove_test_files(filenames):

    """
    Remove each of the given 'filenames', ignoring any which cannot be removed
    (for example, because they do not exist).
    """

    for filename in filenames:
        try:
            os.remove(filename)
        except OSError:
            pass

def remove_test_dirs(dirnames):

    """
    Remove each of the given 'dirnames' together with the files found directly
    inside them. Failures are handled separately for each directory so that a
    missing or unremovable directory does not prevent the remaining directories
    from being cleaned up.
    """

    for dirname in dirnames:
        try:
            for filename in os.listdir(dirname):
                os.remove(os.path.join(dirname, filename))
            os.rmdir(dirname)
        except OSError:
            pass

remove_test_files(("test", "testMS", "testNMS", "testF", "testFI", "testI", "testP", "testP2", "testPI"))
remove_test_dirs(("test_index", "test_index2", "test_index3", "test_indexT"))

# Permit the script to be used just for cleaning up.

if "clean" in sys.argv:
    sys.exit(0)
paul@59	28
paul@69	29	print "- Test basic data types."
paul@9	30
paul@5	31	numbers = [12345678, 0, 1, 127, 128, 255, 256]
paul@0	32
paul@0	33	f = open("test", "wb")
paul@44	34	w = FileWriter(f)
paul@89	35	w.begin_record()
paul@0	36	for number in numbers:
paul@0	37	w.write_number(number)
paul@89	38	w.end_record()
paul@0	39	w.close()
paul@0	40
paul@3	41	f = open("test", "rb")
paul@44	42	r = FileReader(f)
paul@89	43	r.begin_record()
paul@0	44	for number in numbers:
paul@0	45	n = r.read_number()
paul@0	46	print number == n, number, n
paul@89	47	r.end_record()
paul@0	48	r.close()
paul@0	49
paul@74	50	tuples = [(0, 0), (1, 3), (2, 5), (3, 9)]
paul@74	51
paul@74	52	f = open("testMS", "wb")
paul@74	53	w = FileWriter(f)
paul@89	54	w.begin_record()
paul@91	55	w.write_monotonic_sequence(tuples, 2)
paul@89	56	w.end_record()
paul@74	57	w.close()
paul@74	58
paul@74	59	f = open("testMS", "rb")
paul@74	60	r = FileReader(f)
paul@89	61	r.begin_record()
paul@91	62	for t, t2 in zip(r.read_monotonic_sequence(2), tuples):
paul@74	63	print t == t2, t, t2
paul@89	64	r.end_record()
paul@74	65	r.close()
paul@74	66
paul@74	67	tuples2 = [(0, 0), (1, 3), (2, 1), (3, 2), (4, 0)]
paul@74	68
paul@74	69	f = open("testNMS", "wb")
paul@74	70	w = FileWriter(f)
paul@89	71	w.begin_record()
paul@91	72	w.write_delta_sequence(tuples2, 2)
paul@89	73	w.end_record()
paul@74	74	w.close()
paul@74	75
paul@74	76	f = open("testNMS", "rb")
paul@74	77	r = FileReader(f)
paul@89	78	r.begin_record()
paul@91	79	for t, t2 in zip(r.read_delta_sequence(2), tuples2):
paul@74	80	print t == t2, t, t2
paul@89	81	r.end_record()
paul@74	82	r.close()
paul@74	83
paul@69	84	print "- Test positions."
paul@9	85
paul@0	86	all_doc_positions = [
paul@0	87	[
paul@0	88	(123, [1, 3, 5, 15, 25]),
paul@19	89	(124, [0, 100]),
paul@19	90	(125, [11, 99, 199]),
paul@19	91	(130, [77, 78, 80, 82, 89])
paul@0	92	],
paul@0	93	[
paul@0	94	(78, [9]),
paul@19	95	(196, [10, 11]),
paul@19	96	(197, [17, 21, 30])
paul@0	97	]
paul@0	98	]
paul@0	99
paul@19	100	f = open("testP", "wb")
paul@44	101	w = PositionWriter(f)
paul@91	102	w.begin(0, 0)
paul@0	103	for doc_positions in all_doc_positions:
paul@91	104	w.reset()
paul@0	105	for docnum, positions in doc_positions:
paul@0	106	w.write_positions(docnum, positions)
paul@0	107	w.close()
paul@0	108
paul@19	109	f = open("testP", "rb")
paul@68	110	r = PositionReader(f)
paul@0	111	for doc_positions in all_doc_positions:
paul@91	112	r.reset()
paul@0	113	for docnum, positions in doc_positions:
paul@0	114	d, p = r.read_positions()
paul@0	115	print docnum == d, docnum, d
paul@0	116	print positions == p, positions, p
paul@0	117	r.close()
paul@0	118
paul@74	119	all_doc_positions_seq = [
paul@74	120	[
paul@74	121	((123, 0), [(1, 5), (3, 9), (5, 15), (15, 45), (25, 70)]),
paul@74	122	((124, 1), [(0, 0), (100, 350)]),
paul@74	123	((124, 2), [(11, 38), (99, 379), (199, 720)]),
paul@74	124	((130, 0), [(77, 286), (78, 290), (80, 300), (82, 304), (89, 316)])
paul@74	125	],
paul@74	126	[
paul@74	127	((78, 1), [(9, 19)]),
paul@74	128	((196, 0), [(10, 27), (11, 29)]),
paul@74	129	((196, 1), [(17, 46), (21, 52), (30, 60)])
paul@74	130	]
paul@74	131	]
paul@74	132
paul@74	133	f = open("testP2", "wb")
paul@74	134	w = PositionWriter(f)
paul@91	135	w.begin(2, 2)
paul@74	136	for doc_positions in all_doc_positions_seq:
paul@91	137	w.reset()
paul@74	138	for docnum, positions in doc_positions:
paul@74	139	w.write_positions(docnum, positions)
paul@74	140	w.close()
paul@74	141
paul@74	142	f = open("testP2", "rb")
paul@74	143	r = PositionReader(f)
paul@74	144	for doc_positions in all_doc_positions_seq:
paul@91	145	r.reset()
paul@74	146	for docnum, positions in doc_positions:
paul@74	147	d, p = r.read_positions()
paul@89	148	print docnum == d, docnum, d
paul@89	149	print positions == p, positions, p
paul@74	150	r.close()
paul@74	151
paul@69	152	print "- Test position index files."
paul@19	153
paul@19	154	indexed_positions = [
paul@19	155	[
paul@19	156	(1234, 0, 100),
paul@19	157	(2345, 700, 100),
paul@19	158	(3456, 1900, 50)
paul@19	159	],
paul@19	160	[
paul@19	161	(4567, 2800, 20)
paul@19	162	]
paul@19	163	]
paul@19	164
paul@19	165	offsets = []
paul@19	166	f = open("testPI", "wb")
paul@44	167	w = PositionIndexWriter(f)
paul@91	168	w.begin(0)
paul@19	169	for term_positions in indexed_positions:
paul@19	170	offset = None
paul@19	171	doc_frequency = 0
paul@19	172	w.reset()
paul@19	173	for docnum, pos_offset, count in term_positions:
paul@19	174	if offset is None:
paul@88	175	offset = w.tell()
paul@55	176	w.write_positions(docnum, pos_offset, count)
paul@19	177	doc_frequency += count
paul@19	178	offsets.append((offset, doc_frequency))
paul@19	179	w.close()
paul@19	180
paul@69	181	r = PositionIndexIterator(PositionIndexReader(open("testPI", "rb")))
paul@19	182	offsets.reverse()
paul@19	183	indexed_positions.reverse()
paul@19	184	for (offset, doc_frequency), term_positions in zip(offsets, indexed_positions):
paul@68	185	r.seek(offset, doc_frequency)
paul@68	186	for (docnum, pos_offset, count), (dn, po, c) in zip(term_positions, r):
paul@19	187	print docnum == dn, docnum, dn
paul@19	188	print pos_offset == po, pos_offset, po
paul@19	189	print count == c, count, c
paul@69	190	r.reader.close()
paul@19	191
paul@69	192	print "- Test position dictionaries."
paul@19	193
paul@19	194	f = open("testP", "wb")
paul@44	195	w = PositionWriter(f)
paul@19	196	f2 = open("testPI", "wb")
paul@44	197	w2 = PositionIndexWriter(f2)
paul@44	198	wd = PositionDictionaryWriter(w, w2, 2)
paul@0	199	offsets = []
paul@0	200	for doc_positions in all_doc_positions:
paul@19	201	offset, frequency, doc_frequency = wd.write_term_positions(doc_positions)
paul@19	202	offsets.append((offset, doc_frequency))
paul@20	203	wd.close()
paul@0	204
paul@68	205	r = PositionReader(open("testP", "rb"))
paul@68	206	r2 = PositionIndexReader(open("testPI", "rb"))
paul@44	207	rd = PositionDictionaryReader(r, r2)
paul@0	208	offsets.reverse()
paul@0	209	all_doc_positions.reverse()
paul@19	210	for (offset, doc_frequency), doc_positions in zip(offsets, all_doc_positions):
paul@69	211	it = rd.read_term_positions(offset, doc_frequency)
paul@69	212	dp = list(it)
paul@0	213	print doc_positions == dp, doc_positions, dp
paul@20	214	rd.close()
paul@0	215
paul@69	216	print "- Test fields."
paul@9	217
paul@8	218	doc_fields = [
paul@9	219	(123, ["testing", "fields", "stored", "compressed"]),
paul@9	220	(456, ["fields", "for a second", "document"]),
paul@9	221	(789, ["field value"]),
paul@9	222	(1234, []),
paul@9	223	(2345, ["abc", "def"]),
paul@9	224	(3456, ["apple", "banana", "cherry"]),
paul@9	225	(4567, ["drue", "eple"])
paul@8	226	]
paul@8	227
paul@8	228	f = open("testF", "wb")
paul@44	229	w = FieldWriter(f)
paul@91	230	w.begin(0)
paul@91	231	w.reset()
paul@9	232	for docnum, fields in doc_fields:
paul@13	233	w.write_fields(docnum, list(enumerate(fields)))
paul@8	234	w.close()
paul@8	235
paul@8	236	f = open("testF", "rb")
paul@44	237	r = FieldReader(f)
paul@91	238	r.reset()
paul@9	239	for docnum, fields in doc_fields:
paul@9	240	dn, df = r.read_fields()
paul@9	241	print docnum == dn, docnum, dn
paul@13	242	print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@8	243	r.close()
paul@8	244
paul@69	245	print "- Test field index files."
paul@9	246
paul@9	247	indexed_docs = [
paul@9	248	(123, 100000987),
paul@9	249	(456, 100004321),
paul@9	250	(789, 100008765)
paul@9	251	]
paul@9	252
paul@9	253	f = open("testFI", "wb")
paul@44	254	w = FieldIndexWriter(f)
paul@91	255	w.begin(0)
paul@91	256	w.reset()
paul@9	257	for docnum, offset in indexed_docs:
paul@9	258	w.write_document(docnum, offset)
paul@9	259	w.close()
paul@9	260
paul@9	261	f = open("testFI", "rb")
paul@44	262	r = FieldIndexReader(f)
paul@91	263	r.reset()
paul@9	264	for docnum, offset in indexed_docs:
paul@9	265	dn, o = r.read_document()
paul@9	266	print docnum == dn, docnum, dn
paul@9	267	print offset == o, offset, o
paul@9	268	r.close()
paul@9	269
paul@69	270	print "- Test field dictionaries."
paul@9	271
paul@9	272	f = open("testF", "wb")
paul@44	273	w = FieldWriter(f)
paul@9	274	f2 = open("testFI", "wb")
paul@44	275	w2 = FieldIndexWriter(f2)
paul@44	276	wd = FieldDictionaryWriter(w, w2, 3)
paul@9	277	for docnum, fields in doc_fields:
paul@13	278	wd.write_fields(docnum, list(enumerate(fields)))
paul@9	279	wd.close()
paul@9	280
paul@9	281	f = open("testF", "rb")
paul@44	282	r = FieldReader(f)
paul@9	283	f2 = open("testFI", "rb")
paul@44	284	r2 = FieldIndexReader(f2)
paul@44	285	rd = FieldDictionaryReader(r, r2)
paul@9	286	doc_fields_reversed = doc_fields[:]
paul@9	287	doc_fields_reversed.reverse()
paul@9	288	for docnum, fields in doc_fields_reversed:
paul@25	289	df = dict(rd.get_fields(docnum))
paul@25	290	print dict(enumerate(fields)) == df, dict(enumerate(fields)), df
paul@9	291	for docnum in (13579, 246810):
paul@13	292	df = rd.get_fields(docnum)
paul@9	293	print df is None, df
paul@13	294
paul@69	295	print "- (Test sequential access.)"
paul@13	296
paul@13	297	rd.rewind()
paul@13	298	for docnum, fields in doc_fields:
paul@13	299	dn, df = rd.read_fields()
paul@13	300	print docnum == dn, docnum, dn
paul@13	301	print list(enumerate(fields)) == df, list(enumerate(fields)), df
paul@9	302	rd.close()
paul@9	303
paul@69	304	print "- Test terms."
paul@9	305
paul@2	306	terms = [
paul@19	307	# term offset frequency doc_frequency
paul@19	308	("aardvark", 100000123, 1, 1),
paul@19	309	("anteater", 100000456, 2, 1),
paul@19	310	("badger", 100000789, 13, 7),
paul@19	311	("bull", 1000001234, 59, 17),
paul@19	312	("bulldog", 1000002345, 99, 80),
paul@19	313	("cat", 1000003456, 89, 28)
paul@2	314	]
paul@2	315
paul@2	316	f = open("test", "wb")
paul@44	317	w = TermWriter(f)
paul@91	318	w.reset()
paul@19	319	for term, offset, frequency, doc_frequency in terms:
paul@19	320	w.write_term(term, offset, frequency, doc_frequency)
paul@2	321	w.close()
paul@2	322
paul@3	323	f = open("test", "rb")
paul@44	324	r = TermReader(f)
paul@91	325	r.reset()
paul@19	326	for term, offset, frequency, doc_frequency in terms:
paul@19	327	t, o, fr, df = r.read_term()
paul@2	328	print term == t, term, t
paul@2	329	print offset == o, offset, o
paul@11	330	print frequency == fr, frequency, fr
paul@19	331	print doc_frequency == df, doc_frequency, df
paul@2	332	r.close()
paul@2	333
paul@69	334	print "- Test terms in index files."
paul@9	335
paul@3	336	indexed_terms = [
paul@19	337	# term offset frequency doc_frequency info_offset
paul@19	338	("aardvark", 100000123, 1, 1, 200000321),
paul@19	339	("anteater", 100000456, 2, 1, 200000654),
paul@19	340	("badger", 100000789, 13, 7, 200000987),
paul@19	341	("bull", 1000001234, 59, 17, 200004321),
paul@19	342	("bulldog", 1000002345, 99, 80, 200005432),
paul@19	343	("cat", 1000003456, 89, 28, 200006543)
paul@3	344	]
paul@3	345
paul@3	346	f = open("test", "wb")
paul@44	347	w = TermIndexWriter(f)
paul@91	348	w.reset()
paul@19	349	for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19	350	w.write_term(term, offset, frequency, doc_frequency, info_offset)
paul@3	351	w.close()
paul@3	352
paul@3	353	f = open("test", "rb")
paul@44	354	r = TermIndexReader(f)
paul@91	355	r.reset()
paul@19	356	for term, offset, frequency, doc_frequency, info_offset in indexed_terms:
paul@19	357	t, o, fr, df, i = r.read_term()
paul@3	358	print term == t, term, t
paul@3	359	print offset == o, offset, o
paul@11	360	print frequency == fr, frequency, fr
paul@19	361	print doc_frequency == df, doc_frequency, df
paul@3	362	print info_offset == i, info_offset, i
paul@3	363	r.close()
paul@3	364
paul@69	365	print "- Test dictionaries with only term data."
paul@9	366
paul@3	367	f = open("test", "wb")
paul@44	368	w = TermWriter(f)
paul@3	369	f2 = open("testI", "wb")
paul@44	370	w2 = TermIndexWriter(f2)
paul@20	371	f3 = open("testP", "wb")
paul@44	372	w3 = PositionWriter(f3)
paul@20	373	f4 = open("testPI", "wb")
paul@44	374	w4 = PositionIndexWriter(f4)
paul@44	375	wp = PositionDictionaryWriter(w3, w4, 2)
paul@44	376	wd = TermDictionaryWriter(w, w2, wp, 3)
paul@19	377	for term, offset, frequency, doc_frequency in terms:
paul@19	378	wd._write_term(term, offset, frequency, doc_frequency)
paul@5	379	wd.close()
paul@3	380
paul@3	381	f = open("test", "rb")
paul@44	382	r = TermReader(f)
paul@3	383	f2 = open("testI", "rb")
paul@44	384	r2 = TermIndexReader(f2)
paul@68	385	r3 = PositionReader(open("testP", "rb"))
paul@68	386	r4 = PositionIndexReader(open("testPI", "rb"))
paul@44	387	rp = PositionDictionaryReader(r3, r4)
paul@44	388	rd = TermDictionaryReader(r, r2, rp)
paul@3	389	terms_reversed = terms[:]
paul@3	390	terms_reversed.reverse()
paul@19	391	for term, offset, frequency, doc_frequency in terms_reversed:
paul@19	392	o, fr, df = rd._find_term(term)
paul@3	393	print offset == o, offset, o
paul@11	394	print frequency == fr, frequency, fr
paul@19	395	print doc_frequency == df, doc_frequency, df
paul@3	396	for term in ("dog", "dingo"):
paul@11	397	t = rd._find_term(term)
paul@11	398	print t is None, t
paul@25	399
paul@69	400	print "- (Test term prefix searching.)"
paul@25	401
paul@25	402	print rd.find_terms("a") == ["aardvark", "anteater"], rd.find_terms("a"), ["aardvark", "anteater"]
paul@25	403	print rd.find_terms("bu") == ["bull", "bulldog"], rd.find_terms("bu"), ["bull", "bulldog"]
paul@25	404	print rd.find_terms("c") == ["cat"], rd.find_terms("c"), ["cat"]
paul@25	405	print rd.find_terms("d") == [], rd.find_terms("d"), []
paul@5	406	rd.close()
paul@5	407
paul@69	408	print "- Test dictionaries with term and position data."
paul@9	409
paul@5	410	terms_with_positions = [
paul@5	411	("aardvark", [(1, [2, 45, 96]), (20, [13])]),
paul@5	412	("anteater", [(1, [43, 44])]),
paul@5	413	("badger", [(7, [2, 22, 196]), (19, [55, 1333]), (21, [0])]),
paul@19	414	("bull", [(6, [128]), (16, [12]), (26, [1, 3, 5, 7, 9]), (36, [2, 4, 6, 8, 10])]),
paul@5	415	("bulldog", [(43, [17, 19, 256, 512])]),
paul@5	416	("cat", [(123, [12, 145, 196]), (1200, [113])])
paul@5	417	]
paul@5	418
paul@22	419	position_dict_tests = [
paul@22	420	("badger", 19, [55, 1333]),
paul@22	421	("badger", 20, None),
paul@22	422	("bull", 6, [128]),
paul@22	423	("bull", 26, [1, 3, 5, 7, 9]),
paul@22	424	("cat", 111, None),
paul@22	425	("cat", 123, [12, 145, 196]),
paul@22	426	("cat", 1234, None)
paul@22	427	]
paul@22	428
paul@5	429	f = open("test", "wb")
paul@44	430	w = TermWriter(f)
paul@5	431	f2 = open("testI", "wb")
paul@44	432	w2 = TermIndexWriter(f2)
paul@5	433	f3 = open("testP", "wb")
paul@44	434	w3 = PositionWriter(f3)
paul@19	435	f4 = open("testPI", "wb")
paul@44	436	w4 = PositionIndexWriter(f4)
paul@44	437	wp = PositionDictionaryWriter(w3, w4, 2)
paul@44	438	wd = TermDictionaryWriter(w, w2, wp, 3)
paul@5	439	for term, doc_positions in terms_with_positions:
paul@5	440	wd.write_term_positions(term, doc_positions)
paul@5	441	wd.close()
paul@5	442
paul@5	443	f = open("test", "rb")
paul@44	444	r = TermReader(f)
paul@5	445	f2 = open("testI", "rb")
paul@44	446	r2 = TermIndexReader(f2)
paul@68	447	r3 = PositionReader(open("testP", "rb"))
paul@68	448	r4 = PositionIndexReader(open("testPI", "rb"))
paul@44	449	rp = PositionDictionaryReader(r3, r4)
paul@44	450	rd = TermDictionaryReader(r, r2, rp)
paul@5	451	terms_reversed = terms_with_positions[:]
paul@5	452	terms_reversed.reverse()
paul@5	453	for term, doc_positions in terms_reversed:
paul@18	454	dp = list(rd.find_positions(term))
paul@5	455	print doc_positions == dp, doc_positions, dp
paul@25	456	for term in ("aaa", "dog", "dingo"):
paul@5	457	dp = rd.find_positions(term)
paul@61	458	print dp == [], dp
paul@12	459
paul@69	460	print "- (Test iterators.)"
paul@22	461
paul@22	462	for term, docnum, positions in position_dict_tests:
paul@22	463	dp = rd.find_positions(term)
paul@22	464	pos = dp.from_document(docnum)
paul@22	465	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@22	466
paul@69	467	print "- (Test sequential access.)"
paul@12	468
paul@12	469	rd.rewind()
paul@12	470	for term, doc_positions in terms_with_positions:
paul@19	471	t, fr, df, dp = rd.read_term()
paul@18	472	dp = list(dp)
paul@12	473	print term == t, term, t
paul@12	474	print doc_positions == dp, doc_positions, dp
paul@5	475	rd.close()
paul@3	476
paul@69	477	print "- Test high-level index operations (including merging)."
paul@9	478
paul@6	479	docs = [
paul@6	480	(1, "The cat sat on the mat"),
paul@6	481	(2, "Every good boy deserves football"),
paul@6	482	(13, "One good turn deserves another"),
paul@6	483	(14, "Every man for himself"),
paul@6	484	(25, "Red sky at night shepherd's delight"),
paul@6	485	(36, "She sells sea shells on the sea shore")
paul@6	486	]
paul@6	487
paul@6	488	doc_tests = [
paul@11	489	("Every", 2, [(2, [0]), (14, [0])]),
paul@11	490	("good", 2, [(2, [1]), (13, [1])]),
paul@11	491	("deserves", 2, [(2, [3]), (13, [3])]),
paul@11	492	("sea", 2, [(36, [2, 6])])
paul@6	493	]
paul@6	494
paul@21	495	position_tests = [
paul@21	496	("Every", 14, [0]),
paul@21	497	("sea", 36, [2, 6]),
paul@22	498	("shells", 1, None),
paul@22	499	("shells", 37, None)
paul@21	500	]
paul@21	501
paul@60	502	phrase_tests = [
paul@62	503	(["good", "boy"], [(2, [1, 2])]),
paul@62	504	(["on", "the"], [(1, [3, 4]), (36, [4, 5])]),
paul@62	505	(["sea", "shore"], [(36, [6, 7])])
paul@60	506	]
paul@60	507
paul@64	508	index = Index("test_index", 3, 2, 3, 6)
paul@64	509	wi = index.get_writer()
paul@6	510	for docnum, text in docs:
paul@44	511	doc = Document(docnum)
paul@6	512	for position, term in enumerate(text.split()):
paul@28	513	doc.add_position(term, position)
paul@28	514	doc.add_field(123, text)
paul@28	515	wi.add_document(doc)
paul@6	516	wi.close()
paul@6	517
paul@7	518	rd = index.get_reader()
paul@60	519
paul@69	520	print "- (Test searching.)"
paul@60	521
paul@11	522	for term, frequency, doc_positions in doc_tests:
paul@18	523	dp = list(rd.find_positions(term))
paul@6	524	print doc_positions == dp, doc_positions, dp
paul@11	525	fr = rd.get_frequency(term)
paul@11	526	print frequency == fr, frequency, fr
paul@60	527
paul@69	528	print "- (Test fields.)"
paul@60	529
paul@10	530	for docnum, text in docs:
paul@25	531	df = dict(rd.get_fields(docnum))
paul@25	532	print df[123] == text, text, df[123]
paul@60	533
paul@69	534	print "- (Test navigation.)"
paul@60	535
paul@21	536	for term, docnum, positions in position_tests:
paul@21	537	dp = rd.find_positions(term)
paul@22	538	pos = dp.from_document(docnum)
paul@22	539	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@60	540
paul@69	541	print "- (Test phrases.)"
paul@60	542
paul@60	543	for terms, results in phrase_tests:
paul@60	544	res = list(rd.find_common_positions(terms))
paul@60	545	print results == res, results, res
paul@60	546
paul@7	547	index.close()
paul@6	548
paul@77	549	docs2 = [
paul@77	550	((1, 0), "The cat sat on the mat"),
paul@77	551	((1, 2), "Every good boy deserves football"),
paul@77	552	((13, 1), "One good turn deserves another"),
paul@77	553	((14, 0), "Every man for himself"),
paul@77	554	((14, 25), "Red sky at night shepherd's delight"),
paul@77	555	((36, 12), "She sells sea shells on the sea shore")
paul@77	556	]
paul@77	557
paul@77	558	doc_tests2 = [
paul@77	559	("Every", 2, [((1, 2), [(0, 0)]), ((14, 0), [(0, 0)])]),
paul@77	560	("good", 2, [((1, 2), [(1, 6)]), ((13, 1), [(1, 4)])]),
paul@77	561	("deserves", 2, [((1, 2), [(3, 15)]), ((13, 1), [(3, 14)])]),
paul@77	562	("sea", 2, [((36, 12), [(2, 10), (6, 28)])])
paul@77	563	]
paul@77	564
paul@77	565	position_tests2 = [
paul@77	566	("Every", (14, 0), [(0, 0)]),
paul@77	567	("sea", (36, 12), [(2, 10), (6, 28)]),
paul@77	568	("shells", (1, 0), None),
paul@77	569	("shells", (37, 0), None)
paul@77	570	]
paul@77	571
paul@77	572	phrase_tests2 = [
paul@77	573	(["good", "boy"], [((1, 2), [(1, 6), (2, 11)])]),
paul@77	574	(["on", "the"], [((1, 0), [(3, 12), (4, 15)]), ((36, 12), [(4, 21), (5, 24)])]),
paul@77	575	(["sea", "shore"], [((36, 12), [(6, 28), (7, 32)])])
paul@77	576	]
paul@77	577
paul@77	578	index = Index("test_indexT", 3, 2, 3, 6)
paul@77	579	wi = index.get_writer()
paul@77	580	for docnum, text in docs2:
paul@77	581	doc = Document(docnum)
paul@77	582	offset = 0
paul@77	583	for position, term in enumerate(text.split()):
paul@77	584	doc.add_position(term, (position, offset))
paul@77	585	offset += len(term) + 1 # assume one space after the term
paul@77	586	doc.add_field(123, text)
paul@77	587	wi.add_document(doc)
paul@77	588	wi.close()
paul@77	589
paul@77	590	rd = index.get_reader()
paul@77	591
paul@77	592	print "- (Test searching.)"
paul@77	593
paul@77	594	for term, frequency, doc_positions in doc_tests2:
paul@77	595	dp = list(rd.find_positions(term))
paul@77	596	print doc_positions == dp, doc_positions, dp
paul@77	597	fr = rd.get_frequency(term)
paul@77	598	print frequency == fr, frequency, fr
paul@77	599
paul@77	600	print "- (Test fields.)"
paul@77	601
paul@77	602	for docnum, text in docs2:
paul@77	603	df = dict(rd.get_fields(docnum))
paul@77	604	print df[123] == text, text, df[123]
paul@77	605
paul@77	606	print "- (Test navigation.)"
paul@77	607
paul@77	608	for term, docnum, positions in position_tests2:
paul@77	609	dp = rd.find_positions(term)
paul@77	610	pos = dp.from_document(docnum)
paul@77	611	print positions is None and pos is None or pos is not None and positions == list(pos), positions, pos
paul@77	612
paul@77	613	print "- (Test phrases.)"
paul@77	614
paul@77	615	for terms, results in phrase_tests2:
paul@77	616	res = list(rd.find_common_positions(terms))
paul@77	617	print results == res, results, res
paul@77	618
paul@77	619	index.close()
paul@77	620
paul@69	621	print "- Test index updates."
paul@58	622
paul@58	623	index = Index("test_index")
paul@64	624	index2 = Index("test_index2", 3, 2, 3, 6)
paul@64	625	wi = index2.get_writer()
paul@58	626	for docnum, text in docs:
paul@58	627
paul@58	628	# Add the same documents but with different numbers.
paul@58	629
paul@58	630	doc = Document(docnum + 100)
paul@58	631	for position, term in enumerate(text.split()):
paul@58	632	doc.add_position(term, position)
paul@58	633	doc.add_field(123, text)
paul@58	634	wi.add_document(doc)
paul@58	635	wi.close()
paul@58	636
paul@58	637	index2.update([index])
paul@58	638	index.close()
paul@58	639
paul@58	640	rd = index2.get_reader()
paul@58	641	for term, frequency, doc_positions in doc_tests:
paul@58	642
paul@58	643	# Add the extra documents to the expected result.
paul@58	644
paul@59	645	orig_doc_positions = doc_positions
paul@59	646	doc_positions = doc_positions[:]
paul@59	647
paul@59	648	for docnum, positions in orig_doc_positions:
paul@58	649	doc_positions.append((docnum + 100, positions))
paul@58	650	frequency *= 2
paul@58	651
paul@58	652	dp = list(rd.find_positions(term))
paul@58	653	print doc_positions == dp, doc_positions, dp
paul@58	654	fr = rd.get_frequency(term)
paul@58	655	print frequency == fr, frequency, fr
paul@58	656	index2.close()
paul@58	657
paul@69	658	print "- (Test update of an empty index.)"
paul@59	659
paul@59	660	index = Index("test_index")
paul@59	661	index3 = Index("test_index3")
paul@59	662	index3.update([index])
paul@59	663	index.close()
paul@59	664
paul@59	665	rd = index3.get_reader()
paul@59	666	for term, frequency, doc_positions in doc_tests:
paul@59	667	dp = list(rd.find_positions(term))
paul@59	668	print doc_positions == dp, doc_positions, dp
paul@59	669	fr = rd.get_frequency(term)
paul@59	670	print frequency == fr, frequency, fr
paul@59	671	index3.close()
paul@59	672
paul@0	673	# vim: tabstop=4 expandtab shiftwidth=4