iixr (annotate iixr/index.py in 6542c54d115b)

iixr

Annotated iixr/index.py

96:6542c54d115b

2011-02-13

Paul Boddie

Removed numerous classes, simplifying the package and focusing on combined term and position files which can be merged using fewer processing operations.

paul@44	1	#!/usr/bin/env python
paul@44	2
paul@44	3	"""
paul@44	4	High-level classes.
paul@44	5
paul@85	6	Copyright (C) 2009, 2010, 2011 Paul Boddie <paul@boddie.org.uk>
paul@44	7
paul@44	8	This program is free software; you can redistribute it and/or modify it under
paul@44	9	the terms of the GNU General Public License as published by the Free Software
paul@44	10	Foundation; either version 3 of the License, or (at your option) any later
paul@44	11	version.
paul@44	12
paul@44	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@44	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@44	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@44	16
paul@44	17	You should have received a copy of the GNU General Public License along
paul@44	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@44	19	"""
paul@44	20
paul@44	21	from iixr.filesystem import *
paul@96	22	from itermerge import itermerge
paul@76	23	from os import mkdir # index discovery
paul@44	24	from os.path import exists
paul@96	25	import operator
paul@44	26
paul@44	27	# Constants.
paul@44	28
# Default number of documents added to a writer before its accumulated terms
# are flushed to a term partition (used as the default 'flush_interval').
FLUSH_INTERVAL = 10000

# Default value for the 'open_partitions' setting of an index.
OPEN_PARTITIONS = 20
paul@44	31
paul@44	32	# High-level classes.
paul@44	33
class Document:

    "A container of document information."

    def __init__(self, docnum):

        """
        Initialise an empty document identified by 'docnum'. The 'terms'
        attribute maps each term to the list of positions recorded for it.
        """

        self.docnum = docnum
        self.terms = {}

    def add_position(self, term, position):

        """
        Add a position entry for the given 'term', indicating the given
        'position'. Positions are kept in the order in which they are added.
        """

        self.terms.setdefault(term, []).append(position)
paul@44	50
class IndexWriter:

    "Building term information and writing it to the term dictionary."

    def __init__(self, pathname, flush_interval):

        """
        Initialise a writer for the index at 'pathname'. Accumulated terms are
        flushed once 'flush_interval' documents have been added; a false value
        (such as 0 or None) disables this automatic flushing.
        """

        self.pathname = pathname
        self.flush_interval = flush_interval

        # Start writing at the partition following those already present.

        self.term_partition = get_next_partition(get_term_partitions(self.pathname))

        # Mapping of the form {term: {docnum: positions}}.

        self.terms = {}

        # Number of documents added since the last automatic flush.

        self.doc_counter = 0

    def add_document(self, doc):

        """
        Add the given document 'doc', updating the document counter and flushing
        terms and fields if appropriate.
        """

        docnum = doc.docnum

        for term, positions in doc.terms.items():
            self.terms.setdefault(term, {})[docnum] = positions

        self.doc_counter += 1

        # Only flush automatically where a flush interval has been given.

        if self.flush_interval and self.doc_counter >= self.flush_interval:
            self.flush_terms()
            self.doc_counter = 0

    def get_term_writer(self):

        "Return a term writer for the current partition."

        return get_term_writer(self.pathname, self.term_partition)

    def flush_terms(self):

        "Flush terms into the current term partition."

        # Write the terms, always closing the writer afterwards.

        term_writer = self.get_term_writer()
        try:
            term_writer.write_terms(self.terms)
        finally:
            term_writer.close()

        # Only reached after a successful write: start afresh with a new
        # partition.

        self.terms = {}
        self.term_partition += 1

    def close(self):

        """
        Flush any remaining terms. A flush is also performed, even without any
        terms, where no term partitions exist for the index.
        """

        if self.terms or not get_term_partitions(self.pathname):
            self.flush_terms()
paul@96	106
class IndexReader(itermerge):

    "Accessing the term dictionaries."

    def __init__(self, pathname, get_reader=None, combine=None):

        """
        Open a reader for each term partition found for 'pathname' and
        initialise this object as a merging iterator over those readers.

        The optional 'get_reader' is called as get_reader(pathname, partition)
        to open each partition, with get_term_reader being used by default.
        The optional 'combine' is called with the data of two entries having
        the same term, with operator.add being used by default.
        """

        open_reader = get_reader or get_term_reader
        self.combine = combine or operator.add

        # Initialise the underlying term partition readers, taking the
        # partitions in order. Should a partition fail to open, close the
        # readers opened so far instead of leaking them.

        self.readers = []
        try:
            for partition in sorted(get_term_partitions(pathname)):
                self.readers.append(open_reader(pathname, partition))
        except BaseException:
            self.close()
            raise

        # Initialise this object as an iterator over the readers.

        itermerge.__init__(self, self.readers)

        # An item fetched ahead of time by next, or None if there is none.

        self.next_value = None

    def get_sizes(self):

        """
        Return the sizes reported by the first reader, or (0, 0) where there
        are no readers.
        """

        # Readers must have compatible sizes.

        if self.readers:
            return self.readers[0].get_sizes()
        else:
            return 0, 0

    def next(self):

        """
        Return the next (term, data) pair, where the data of consecutive
        entries having the same term has been merged using the 'combine'
        function. StopIteration from the underlying iterator is propagated
        when no more entries are available.
        """

        # Use any item fetched in advance by the previous call.

        if self.next_value is not None:
            term, positions = self.next_value
        else:
            term, positions = itermerge.next(self)

        # Look at the next item to see if it has positions for the current
        # term.

        try:
            t, p = itermerge.next(self)
            while t == term:
                positions = self.combine(positions, p)
                t, p = itermerge.next(self)

            # Retain the first item belonging to a different term.

            self.next_value = t, p

        # Where an item could not be fetched, cause future requests to fail.

        except StopIteration:
            self.next_value = None

        return term, positions

    def close(self):

        "Close all underlying readers and forget them."

        for reader in self.readers:
            reader.close()
        self.readers = []
paul@44	164
class Index:

    "An inverted index solution encapsulating the various components."

    def __init__(self, pathname, flush_interval=FLUSH_INTERVAL,
        open_partitions=OPEN_PARTITIONS):

        """
        Initialise an index residing at 'pathname'. The 'flush_interval' is
        passed to the writer when it is created; 'open_partitions' is merely
        recorded by this class. No files are accessed until a reader or
        writer is requested.
        """

        self.pathname = pathname
        self.flush_interval = flush_interval
        self.open_partitions = open_partitions
        self.reader = None
        self.writer = None

    def get_writer(self):

        "Return a writer, creating the index directory if necessary."

        if self.writer is None:
            self._ensure_directory()
            self.writer = IndexWriter(self.pathname, self.flush_interval)
        return self.writer

    def _ensure_directory(self):

        "Create the index directory unless something already exists there."

        # Attempt the creation and only then test for existence, avoiding a
        # race between the test and the creation.

        try:
            mkdir(self.pathname)
        except OSError:
            if not exists(self.pathname):
                raise

    def get_reader(self, refresh=0):

        """
        Return a reader for the index. Where 'refresh' is true, any existing
        reader is closed and a new one is opened. An OSError is raised if the
        index path does not exist.
        """

        if refresh and self.reader is not None:
            self.reader.close()
            self.reader = None

        if self.reader is None:
            if not exists(self.pathname):
                raise OSError("Index path %r does not exist." % self.pathname)
            self.reader = IndexReader(self.pathname)
        return self.reader

    def merge(self):

        """
        Merge the partitions in the index, writing the combined terms to a
        "merged" partition, removing the existing partitions and renaming the
        merged partition to partition 0.
        """

        # NOTE(review): any reader or writer already handed out by this object
        # is left untouched here - confirm whether callers are expected to
        # refresh them after merging.

        reader = IndexReader(self.pathname, get_term_data_reader, self.merge_data)
        try:
            writer = get_term_writer(self.pathname, "merged")
            try:
                writer.begin(*reader.get_sizes())

                # NOTE(review): each term is assumed to be written as its own
                # record, hence end_record being called for every term.

                for term, data in reader:
                    writer.write_term_plus_remaining(term, data)
                    writer.end_record()
            finally:
                writer.close()

        # Close the reader even if the writer could not be opened or closed.

        finally:
            reader.close()

        for partition in get_term_partitions(self.pathname):
            remove_term_files(self.pathname, partition)

        rename_term_files(self.pathname, "merged", 0)

    def merge_data(self, a, b):

        """
        Merge 'a' and 'b', modifying the data to permit concatenation. A new
        sequence is returned, leaving 'a' and 'b' unchanged.
        """

        # Modify the record to indicate a continuation of the data.
        # NOTE(review): this presumes that 'a' is not empty; with an empty 'a'
        # the index -1 would refer to the last element of 'b'.

        c = a + b
        c[len(a) - 1] = 1
        return c

    def close(self):

        "Close and forget any reader and writer obtained from this index."

        if self.reader is not None:
            self.reader.close()
            self.reader = None
        if self.writer is not None:
            self.writer.close()
            self.writer = None
paul@44	244
paul@44	245	# vim: tabstop=4 expandtab shiftwidth=4