iixr (annotate iixr/terms.py in 6542c54d115b)

iixr

Annotated iixr/terms.py

96:6542c54d115b

2011-02-13

Paul Boddie

Removed numerous classes, simplifying the package and focusing on combined term and position files which can be merged using fewer processing operations.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	Specific classes for storing term information.
paul@44	5
paul@89	6	Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@96	21	from iixr.data import *
paul@44	22	from iixr.files import *
paul@60	23	from iixr.phrases import PhraseIterator
paul@44	24	from os.path import commonprefix # to find common string prefixes
paul@44	25
paul@44	26	class TermWriter(FileWriter):
paul@44	27
paul@44	28	"Writing term information to files."
paul@44	29
paul@96	30	def begin(self, docnum_size, position_size):
paul@96	31
paul@96	32	"Begin writing to the file."
paul@96	33
paul@96	34	self.write_numbers((docnum_size, position_size))
paul@90	35	self.end_record()
paul@96	36
paul@96	37	self.data_start = self.tell()
paul@96	38	self.docnum_size = docnum_size
paul@96	39	self.position_size = position_size
paul@96	40	self.subtractor = get_subtractor(docnum_size)
paul@44	41	self.last_term = ""
paul@44	42
paul@96	43	def write_terms(self, terms):
paul@44	44
paul@44	45	"""
paul@96	46	Write the 'terms' to the term information file, with each term's details
paul@96	47	stored in a separate record.
paul@44	48	"""
paul@44	49
paul@96	50	if hasattr(terms, "items"):
paul@96	51	terms = terms.items()
paul@96	52	terms.sort()
paul@96	53
paul@96	54	for term, doc_positions in terms:
paul@96	55	if not doc_positions:
paul@96	56	continue
paul@96	57
paul@96	58	if hasattr(doc_positions, "items"):
paul@96	59	doc_positions = doc_positions.items()
paul@96	60
paul@96	61	docnum, positions = doc_positions[0]
paul@96	62
paul@96	63	if not positions:
paul@96	64	continue
paul@96	65
paul@96	66	# Start the writing, if appropriate.
paul@96	67
paul@96	68	if self.data_start is None:
paul@96	69	self.begin(sizeof(docnum), sizeof(positions[0]))
paul@96	70
paul@96	71	# Write each term and document positions.
paul@96	72
paul@96	73	self.write_term(term, doc_positions)
paul@96	74	self.end_record()
paul@96	75
paul@96	76	# Methods requiring an open record.
paul@96	77
paul@96	78	def write_term(self, term, doc_positions):
paul@96	79
paul@96	80	"""
paul@96	81	Write the given 'term', its document frequency (number of documents in
paul@96	82	which it appears), and 'doc_positions' to the term information file.
paul@96	83	"""
paul@96	84
paul@96	85	self.write_term_only(term)
paul@96	86
paul@96	87	# Write the document frequency and the term positions.
paul@96	88
paul@96	89	self.write_positions(doc_positions)
paul@96	90
paul@96	91	def write_term_plus_remaining(self, term, data):
paul@96	92
paul@96	93	"Write the given 'term' and the document position 'data'."
paul@96	94
paul@96	95	self.write_term_only(term)
paul@96	96	self.write_remaining(data)
paul@96	97
paul@96	98	def write_term_only(self, term):
paul@96	99
paul@96	100	"Write only the given 'term'."
paul@96	101
paul@75	102	if term <= self.last_term:
paul@75	103	raise ValueError, "Term %r precedes the previous term %r." % (term, self.last_term)
paul@75	104
paul@44	105	# Write the prefix length and term suffix.
paul@44	106
paul@44	107	common = len(commonprefix([self.last_term, term]))
paul@44	108	suffix = term[common:]
paul@44	109
paul@44	110	self.write_number(common)
paul@44	111	self.write_string(suffix)
paul@44	112
paul@96	113	self.last_term = term
paul@96	114
paul@96	115	def write_positions(self, doc_positions):
paul@96	116
paul@96	117	"Write the given 'doc_positions' to the file."
paul@96	118
paul@96	119	# Make sure that the positions are sorted.
paul@96	120
paul@96	121	doc_positions.sort()
paul@96	122
paul@44	123	# Write the document frequency.
paul@44	124
paul@96	125	self.write_number(len(doc_positions))
paul@96	126
paul@96	127	last_docnum = None
paul@96	128
paul@96	129	for docnum, positions in doc_positions:
paul@96	130
paul@96	131	# Store the first document number as it is.
paul@96	132
paul@96	133	if last_docnum is None:
paul@96	134	docnum_seq = docnum
paul@96	135
paul@96	136	# Reject out-of-order documents.
paul@96	137
paul@96	138	elif docnum < last_docnum:
paul@96	139	raise ValueError, "Document number %r is less than previous number %r." % (docnum, last_docnum)
paul@44	140
paul@96	141	# Calculate an ongoing delta.
paul@96	142
paul@96	143	else:
paul@96	144	docnum_seq = self.subtractor(docnum, last_docnum)
paul@96	145
paul@96	146	# Write the document number and positions.
paul@96	147
paul@96	148	self.write_sequence_value(docnum_seq, self.docnum_size)
paul@96	149	self.write_monotonic_sequence(positions, self.position_size)
paul@96	150
paul@96	151	last_docnum = docnum
paul@96	152
paul@96	153	# Write a terminating byte to indicate that no more document pages
paul@96	154	# exist.
paul@96	155
paul@96	156	self.write_byte(0)
paul@44	157
paul@44	158	class TermReader(FileReader):
paul@44	159
paul@44	160	"Reading term information from files."
paul@44	161
paul@96	162	def begin(self):
paul@96	163
paul@96	164	"Begin reading from the file."
paul@96	165
paul@96	166	self.begin_record()
paul@96	167	try:
paul@96	168	self.docnum_size, self.position_size = self.read_numbers(2)
paul@96	169	except EOFError:
paul@96	170	self.docnum_size, self.position_size = 0, 0 # NOTE: No positions!
paul@96	171
paul@96	172	self.data_start = self.tell()
paul@96	173	self.adder = get_adder(self.docnum_size)
paul@44	174	self.last_term = ""
paul@96	175
paul@96	176	def get_sizes(self):
paul@96	177	return self.docnum_size, self.position_size
paul@96	178
paul@96	179	# Methods requiring an open record.
paul@44	180
paul@44	181	def read_term(self):
paul@44	182
paul@96	183	"Read a term and its document positions from the term information file."
paul@96	184
paul@96	185	# Read the term.
paul@96	186
paul@96	187	self.read_term_only()
paul@96	188
paul@96	189	# Read the document frequency and the term positions.
paul@96	190
paul@96	191	positions = self.read_positions()
paul@96	192
paul@96	193	return self.last_term, positions
paul@96	194
paul@96	195	def read_term_plus_remaining(self):
paul@96	196
paul@44	197	"""
paul@96	198	Read a term and the unprocessed document position data.
paul@44	199	"""
paul@44	200
paul@96	201	self.read_term_only()
paul@96	202	return self.last_term, self.read_remaining()
paul@96	203
paul@96	204	def read_term_only(self):
paul@96	205
paul@96	206	"Read a term only."
paul@96	207
paul@44	208	# Read the prefix length and term suffix.
paul@44	209
paul@44	210	common = self.read_number()
paul@44	211	suffix = self.read_string()
paul@44	212
paul@44	213	self.last_term = self.last_term[:common] + suffix
paul@96	214	return self.last_term
paul@44	215
paul@96	216	def read_positions(self):
paul@44	217
paul@96	218	"Read document positions from the term information file."
paul@44	219
paul@96	220	doc_positions = []
paul@44	221
paul@96	222	while 1:
paul@44	223
paul@96	224	# Read the document frequency.
paul@44	225
paul@96	226	npositions = self.read_number()
paul@91	227
paul@96	228	last_docnum = None
paul@96	229	i = 0
paul@96	230	while i < npositions:
paul@44	231
paul@96	232	# Read the document number.
paul@44	233
paul@96	234	docnum = self.read_sequence_value(self.docnum_size)
paul@96	235	if last_docnum is not None:
paul@96	236	docnum = self.adder(docnum, last_docnum)
paul@44	237
paul@96	238	# Read the positions.
paul@44	239
paul@96	240	positions = self.read_monotonic_sequence(self.position_size)
paul@96	241	doc_positions.append((docnum, positions))
paul@44	242
paul@96	243	last_docnum = docnum
paul@96	244	i += 1
paul@44	245
paul@96	246	# Read a terminating byte to discover whether more document pages
paul@96	247	# exist.
paul@44	248
paul@96	249	if not self.read_byte():
paul@96	250	break
paul@44	251
paul@96	252	return doc_positions
paul@81	253
paul@96	254	class TermIterator(TermReader):
paul@58	255
paul@96	256	"An iterator over terms and positions read from a file."
paul@44	257
paul@44	258	def __iter__(self):
paul@44	259	return self
paul@44	260
paul@44	261	def next(self):
paul@44	262	try:
paul@96	263	self.begin_record()
paul@44	264	return self.read_term()
paul@44	265	except EOFError:
paul@44	266	raise StopIteration
paul@44	267
paul@96	268	class TermDataIterator(TermReader):
paul@90	269
paul@96	270	"An iterator over terms and unprocessed document positions data."
paul@90	271
paul@96	272	def __iter__(self):
paul@96	273	return self
paul@72	274
paul@96	275	def next(self):
paul@44	276	try:
paul@96	277	self.begin_record()
paul@96	278	return self.read_term_plus_remaining()
paul@44	279	except EOFError:
paul@96	280	raise StopIteration
paul@44	281
paul@44	282	# vim: tabstop=4 expandtab shiftwidth=4