iixr (annotate iixr/terms.py in 892f694ba76f)

iixr

Annotated iixr/terms.py

100:892f694ba76f

2011-02-15

Paul Boddie

Fixed the itermerge optimisation for single element collections of iterators. Removed the common iterator support in the terms module since the features are really only relevant for the combined iterator. Fixed the seeking and state resetting logic in the combined iterator, retaining the last returned values only in this class. Introduced usage of the combined iterator as index reader only where a single pair of files is present.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing term information.
paul@44	5
paul@89	6	Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@96	21	from iixr.data import *
paul@44	22	from iixr.files import *
paul@97	23	from itermerge import itermerge
paul@44	24	from os.path import commonprefix # to find common string prefixes
paul@97	25	from bisect import bisect_right, insort_right
paul@97	26	import operator
paul@44	27
paul@44	28	class TermWriter(FileWriter):
paul@44	29
paul@44	30	"Writing term information to files."
paul@44	31
paul@96	32	def begin(self, docnum_size, position_size):
paul@96	33
paul@96	34	"Begin writing to the file."
paul@96	35
paul@96	36	self.write_numbers((docnum_size, position_size))
paul@90	37	self.end_record()
paul@96	38
paul@96	39	self.data_start = self.tell()
paul@96	40	self.docnum_size = docnum_size
paul@96	41	self.position_size = position_size
paul@96	42	self.subtractor = get_subtractor(docnum_size)
paul@44	43	self.last_term = ""
paul@44	44
paul@96	45	def write_terms(self, terms):
paul@44	46
paul@44	47	"""
paul@96	48	Write the 'terms' to the term information file, with each term's details
paul@96	49	stored in a separate record.
paul@44	50	"""
paul@44	51
paul@96	52	if hasattr(terms, "items"):
paul@96	53	terms = terms.items()
paul@96	54	terms.sort()
paul@96	55
paul@96	56	for term, doc_positions in terms:
paul@96	57	if not doc_positions:
paul@96	58	continue
paul@96	59
paul@96	60	if hasattr(doc_positions, "items"):
paul@96	61	doc_positions = doc_positions.items()
paul@96	62
paul@96	63	docnum, positions = doc_positions[0]
paul@96	64
paul@96	65	if not positions:
paul@96	66	continue
paul@96	67
paul@96	68	# Start the writing, if appropriate.
paul@96	69
paul@96	70	if self.data_start is None:
paul@96	71	self.begin(sizeof(docnum), sizeof(positions[0]))
paul@96	72
paul@96	73	# Write each term and document positions.
paul@96	74
paul@96	75	self.write_term(term, doc_positions)
paul@96	76	self.end_record()
paul@96	77
paul@96	78	# Methods requiring an open record.
paul@96	79
paul@96	80	def write_term(self, term, doc_positions):
paul@96	81
paul@96	82	"""
paul@96	83	Write the given 'term', its document frequency (number of documents in
paul@96	84	which it appears), and 'doc_positions' to the term information file.
paul@96	85	"""
paul@96	86
paul@96	87	self.write_term_only(term)
paul@96	88
paul@96	89	# Write the document frequency and the term positions.
paul@96	90
paul@96	91	self.write_positions(doc_positions)
paul@96	92
paul@96	93	def write_term_plus_remaining(self, term, data):
paul@96	94
paul@96	95	"Write the given 'term' and the document position 'data'."
paul@96	96
paul@96	97	self.write_term_only(term)
paul@96	98	self.write_remaining(data)
paul@96	99
paul@96	100	def write_term_only(self, term):
paul@96	101
paul@96	102	"Write only the given 'term'."
paul@96	103
paul@75	104	if term <= self.last_term:
paul@75	105	raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
paul@75	106
paul@44	107	# Write the prefix length and term suffix.
paul@44	108
paul@44	109	common = len(commonprefix([self.last_term, term]))
paul@44	110	suffix = term[common:]
paul@44	111
paul@44	112	self.write_number(common)
paul@44	113	self.write_string(suffix)
paul@44	114
paul@96	115	self.last_term = term
paul@96	116
paul@96	117	def write_positions(self, doc_positions):
paul@96	118
paul@96	119	"Write the given 'doc_positions' to the file."
paul@96	120
paul@96	121	# Make sure that the positions are sorted.
paul@96	122
paul@96	123	doc_positions.sort()
paul@96	124
paul@44	125	# Write the document frequency.
paul@44	126
paul@96	127	self.write_number(len(doc_positions))
paul@96	128
paul@96	129	last_docnum = None
paul@96	130
paul@96	131	for docnum, positions in doc_positions:
paul@96	132
paul@96	133	# Store the first document number as it is.
paul@96	134
paul@96	135	if last_docnum is None:
paul@96	136	docnum_seq = docnum
paul@96	137
paul@96	138	# Reject out-of-order documents.
paul@96	139
paul@96	140	elif docnum < last_docnum:
paul@96	141	raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
paul@44	142
paul@96	143	# Calculate an ongoing delta.
paul@96	144
paul@96	145	else:
paul@96	146	docnum_seq = self.subtractor(docnum, last_docnum)
paul@96	147
paul@96	148	# Write the document number and positions.
paul@96	149
paul@96	150	self.write_sequence_value(docnum_seq, self.docnum_size)
paul@96	151	self.write_monotonic_sequence(positions, self.position_size)
paul@96	152
paul@96	153	last_docnum = docnum
paul@96	154
paul@96	155	# Write a terminating byte to indicate that no more document pages
paul@96	156	# exist.
paul@96	157
paul@96	158	self.write_byte(0)
paul@44	159
paul@44	160	class TermReader(FileReader):
paul@44	161
paul@44	162	"Reading term information from files."
paul@44	163
paul@96	164	def begin(self):
paul@96	165
paul@96	166	"Begin reading from the file."
paul@96	167
paul@96	168	self.begin_record()
paul@96	169	try:
paul@96	170	self.docnum_size, self.position_size = self.read_numbers(2)
paul@96	171	except EOFError:
paul@96	172	self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
paul@96	173
paul@96	174	self.data_start = self.tell()
paul@96	175	self.adder = get_adder(self.docnum_size)
paul@44	176	self.last_term = ""
paul@96	177
paul@96	178	def get_sizes(self):
paul@96	179	return self.docnum_size, self.position_size
paul@96	180
paul@96	181	# Methods requiring an open record.
paul@44	182
paul@44	183	def read_term(self):
paul@44	184
paul@96	185	"Read a term and its document positions from the term information file."
paul@96	186
paul@96	187	# Read the term.
paul@96	188
paul@96	189	self.read_term_only()
paul@96	190
paul@96	191	# Read the document frequency and the term positions.
paul@96	192
paul@96	193	positions = self.read_positions()
paul@96	194
paul@96	195	return self.last_term, positions
paul@96	196
paul@96	197	def read_term_plus_remaining(self):
paul@96	198
paul@44	199	"""
paul@96	200	Read a term and the unprocessed document position data.
paul@44	201	"""
paul@44	202
paul@96	203	self.read_term_only()
paul@96	204	return self.last_term, self.read_remaining()
paul@96	205
paul@96	206	def read_term_only(self):
paul@96	207
paul@96	208	"Read a term only."
paul@96	209
paul@44	210	# Read the prefix length and term suffix.
paul@44	211
paul@44	212	common = self.read_number()
paul@44	213	suffix = self.read_string()
paul@44	214
paul@44	215	self.last_term = self.last_term[:common] + suffix
paul@96	216	return self.last_term
paul@44	217
paul@96	218	def read_positions(self):
paul@44	219
paul@96	220	"Read document positions from the term information file."
paul@44	221
paul@96	222	doc_positions = []
paul@44	223
paul@96	224	while 1:
paul@44	225
paul@96	226	# Read the document frequency.
paul@44	227
paul@96	228	npositions = self.read_number()
paul@91	229
paul@96	230	last_docnum = None
paul@96	231	i = 0
paul@96	232	while i < npositions:
paul@44	233
paul@96	234	# Read the document number.
paul@44	235
paul@96	236	docnum = self.read_sequence_value(self.docnum_size)
paul@96	237	if last_docnum is not None:
paul@96	238	docnum = self.adder(docnum, last_docnum)
paul@44	239
paul@96	240	# Read the positions.
paul@44	241
paul@96	242	positions = self.read_monotonic_sequence(self.position_size)
paul@96	243	doc_positions.append((docnum, positions))
paul@44	244
paul@96	245	last_docnum = docnum
paul@96	246	i += 1
paul@44	247
paul@96	248	# Read a terminating byte to discover whether more document pages
paul@96	249	# exist.
paul@44	250
paul@96	251	if not self.read_byte():
paul@96	252	break
paul@44	253
paul@96	254	return doc_positions
paul@81	255
paul@97	256	# Indexes covering the information files.
paul@97	257
paul@97	258	class TermIndexWriter(FileWriter):
paul@97	259
paul@97	260	"Writing term index information to files."
paul@97	261
paul@97	262	def begin(self):
paul@97	263
paul@97	264	"Begin writing to the file."
paul@97	265
paul@97	266	self.data_start = self.tell()
paul@97	267	self.last_term = ""
paul@97	268	self.last_offset = 0
paul@97	269
paul@97	270	def write_term(self, term, offset):
paul@97	271
paul@97	272	"Write the given 'term' and 'offset'."
paul@97	273
paul@97	274	if term <= self.last_term:
paul@97	275	raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
paul@97	276
paul@97	277	# Write the prefix length and term suffix.
paul@97	278
paul@97	279	common = len(commonprefix([self.last_term, term]))
paul@97	280	suffix = term[common:]
paul@97	281
paul@97	282	self.write_number(common)
paul@97	283	self.write_string(suffix)
paul@97	284
paul@97	285	# Write the offset delta.
paul@97	286
paul@97	287	self.write_number(offset - self.last_offset)
paul@97	288
paul@97	289	self.last_term = term
paul@97	290	self.last_offset = offset
paul@97	291
paul@97	292	class TermIndexReader(FileReader):
paul@58	293
paul@97	294	"Reading term index information to files."
paul@97	295
paul@97	296	def begin(self):
paul@97	297
paul@97	298	"Begin reading from the file."
paul@97	299
paul@97	300	self.data_start = self.tell()
paul@97	301	self.last_term = ""
paul@97	302	self.last_offset = 0
paul@97	303
paul@97	304	def read_term(self):
paul@97	305
paul@97	306	"Read a term and an offset from the file."
paul@97	307
paul@97	308	# Read the prefix length and term suffix.
paul@97	309
paul@97	310	common = self.read_number()
paul@97	311	suffix = self.read_string()
paul@97	312
paul@97	313	self.last_term = self.last_term[:common] + suffix
paul@97	314
paul@97	315	# Read the offset delta.
paul@97	316
paul@97	317	self.last_offset += self.read_number()
paul@97	318	return self.last_term, self.last_offset
paul@97	319
paul@100	320	# External reading classes.
paul@99	321
paul@100	322	class TermIterator(TermReader):
paul@99	323
paul@100	324	"An iterator over terms and positions read from a file."
paul@44	325
paul@44	326	def __iter__(self):
paul@44	327	return self
paul@44	328
paul@99	329	def next(self):
paul@44	330	try:
paul@96	331	self.begin_record()
paul@44	332	return self.read_term()
paul@44	333	except EOFError:
paul@44	334	raise StopIteration
paul@44	335
paul@100	336	class TermDataIterator(TermReader):
paul@90	337
paul@96	338	"An iterator over terms and unprocessed document positions data."
paul@90	339
paul@100	340	def __iter__(self):
paul@100	341	return self
paul@72	342
paul@100	343	def next(self):
paul@44	344	try:
paul@96	345	self.begin_record()
paul@96	346	return self.read_term_plus_remaining()
paul@44	347	except EOFError:
paul@96	348	raise StopIteration
paul@44	349
paul@97	350	class TermIndexIterator(TermIndexReader):
paul@97	351
paul@97	352	"An iterator over terms and offsets read from a file."
paul@97	353
paul@97	354	def __iter__(self):
paul@97	355	return self
paul@97	356
paul@97	357	def next(self):
paul@97	358	try:
paul@97	359	self.begin_record()
paul@97	360	return self.read_term()
paul@97	361	except EOFError:
paul@97	362	raise StopIteration
paul@97	363
paul@100	364	class CombinedIterator:
paul@97	365
paul@97	366	"An iterator providing index and information file access."
paul@97	367
paul@97	368	def __init__(self, reader, index_reader):
paul@97	369	self.reader = reader
paul@97	370	self.index_reader = index_reader
paul@97	371	self.records = list(index_reader)
paul@98	372	self.terms = [t for t, dp in self.records]
paul@98	373
paul@100	374	# Cache the last term and positions.
paul@100	375
paul@100	376	self.last_term_returned = None
paul@100	377	self.last_positions_returned = None
paul@100	378
paul@97	379	def go_to_term(self, term):
paul@97	380
paul@98	381	"""
paul@98	382	Return the 'term' and positions or nearest following term and positions.
paul@98	383	"""
paul@98	384
paul@100	385	if self.last_term_returned is not None and self.last_term_returned == term:
paul@99	386	return self.last_term_returned, self.last_positions_returned
paul@98	387
paul@97	388	# Get the record providing a term less than or equal to the requested
paul@97	389	# term, getting the first entry if no such records exist.
paul@97	390
paul@98	391	after = bisect_right(self.terms, term)
paul@98	392	before = max(0, after - 1)
paul@98	393
paul@98	394	t, offset = self.records[before]
paul@98	395	terms_after = self.terms[after:]
paul@97	396
paul@97	397	# Seek to the corresponding record in the information file.
paul@98	398	# Only do this if the term is more quickly reached by seeking.
paul@97	399
paul@100	400	if term <= t or (
paul@100	401	self.last_term_returned is None or term < self.last_term_returned or
paul@100	402	self.last_term_returned < t or terms_after and terms_after[0] <= self.last_term_returned
paul@100	403	):
paul@98	404
paul@98	405	self.reader.seek(offset)
paul@100	406
paul@100	407	# Skip the term information, overwrite the reader's state.
paul@100	408
paul@100	409	self.reader.begin_record()
paul@100	410	self.reader.read_term_only()
paul@99	411	self.reader.last_term = t
paul@97	412
paul@97	413	# Where the found term is equal or greater, just read the positions for
paul@97	414	# the index entry.
paul@97	415
paul@97	416	if t >= term:
paul@97	417
paul@100	418	# Get the positions.
paul@97	419
paul@99	420	self.last_term_returned, self.last_positions_returned = t, self.reader.read_positions()
paul@99	421	return self.last_term_returned, self.last_positions_returned
paul@97	422
paul@97	423	# Where the found term is less, use the information file to find the
paul@97	424	# term or the one after.
paul@97	425
paul@97	426	else:
paul@97	427
paul@100	428	# Scan for the term.
paul@97	429
paul@97	430	while t < term:
paul@100	431	t, dp = self.next() # remembers the term and positions
paul@97	432
paul@97	433	return t, dp
paul@97	434
paul@100	435	def __iter__(self):
paul@100	436	return self
paul@100	437
paul@100	438	def next(self):
paul@100	439	self.last_term_returned, self.last_positions_returned = record = self.reader.next()
paul@100	440	return record
paul@97	441
paul@97	442	def close(self):
paul@97	443	if self.reader is not None:
paul@97	444	self.reader.close()
paul@97	445	self.reader = None
paul@97	446	if self.index_reader is not None:
paul@97	447	self.index_reader.close()
paul@97	448	self.index_reader = None
paul@97	449
paul@97	450	class MultipleReader(itermerge):
paul@97	451
paul@97	452	"Accessing many term readers at once."
paul@97	453
paul@97	454	def __init__(self, readers, combine=None):
paul@97	455
paul@97	456	"""
paul@97	457	Initialise a master index reader using underlying 'readers' and a
paul@97	458	'combine' function which knows how to combine position information from
paul@97	459	different sources.
paul@97	460	"""
paul@97	461
paul@97	462	self.readers = readers
paul@97	463	self.combine = combine or operator.add
paul@97	464
paul@97	465	# Initialise this object as an iterator over the readers.
paul@97	466
paul@97	467	itermerge.__init__(self, self.readers)
paul@97	468	self.next_value = None
paul@97	469
paul@97	470	def get_sizes(self):
paul@97	471
paul@97	472	# Readers must have compatible sizes.
paul@97	473
paul@97	474	if self.readers:
paul@97	475	return self.readers[0].get_sizes()
paul@97	476	else:
paul@97	477	return 0, 0
paul@97	478
paul@97	479	def go_to_term(self, term):
paul@100	480
paul@100	481	# Refresh the iterators to provide sought values.
paul@100	482
paul@97	483	self.iters = []
paul@97	484	for reader in self.readers:
paul@97	485	try:
paul@97	486	insort_right(self.iters, (reader.go_to_term(term), reader.next))
paul@97	487	except StopIteration:
paul@97	488	pass
paul@100	489
paul@97	490	self.next_value = None
paul@97	491	return self.next()
paul@97	492
paul@97	493	def next(self):
paul@100	494
paul@100	495	# Return any prematurely read value.
paul@100	496
paul@97	497	if self.next_value is not None:
paul@97	498	term, positions = self.next_value
paul@100	499
paul@100	500	# Otherwise, get the next value.
paul@100	501
paul@97	502	else:
paul@97	503	term, positions = itermerge.next(self)
paul@97	504
paul@100	505	# Look at the next item to see if it has positions for the current term.
paul@97	506
paul@97	507	try:
paul@97	508	t, p = itermerge.next(self)
paul@97	509	while t == term:
paul@97	510	positions = self.combine(positions, p)
paul@97	511	t, p = itermerge.next(self)
paul@97	512	self.next_value = t, p
paul@97	513
paul@97	514	# Where an item could not be fetched, cause future requests to fail.
paul@97	515
paul@97	516	except StopIteration:
paul@97	517	self.next_value = None
paul@97	518
paul@97	519	return term, positions
paul@97	520
paul@97	521	def close(self):
paul@97	522	for reader in self.readers:
paul@97	523	reader.close()
paul@97	524	self.readers = []
paul@97	525
paul@44	526	# vim: tabstop=4 expandtab shiftwidth=4