simplex (annotate simplex/__init_

simplex

Annotated simplex/init.py

22:e0ed0702667c

2011-10-03

Paul Boddie

Introduced an abstraction for an indexed file.

paul@0	1	#!/usr/bin/env python
paul@0	2
paul@0	3	"""
paul@0	4	Simple indexing of sorted files.
paul@0	5
paul@1	6	Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
paul@0	7
paul@0	8	This program is free software; you can redistribute it and/or modify it under
paul@0	9	the terms of the GNU General Public License as published by the Free Software
paul@0	10	Foundation; either version 3 of the License, or (at your option) any later
paul@0	11	version.
paul@0	12
paul@0	13	This program is distributed in the hope that it will be useful, but WITHOUT ANY
paul@0	14	WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
paul@0	15	PARTICULAR PURPOSE. See the GNU General Public License for more details.
paul@0	16
paul@0	17	You should have received a copy of the GNU General Public License along
paul@0	18	with this program. If not, see <http://www.gnu.org/licenses/>.
paul@0	19
paul@0	20	Motivations
paul@0	21	-----------
paul@0	22
paul@0	23	When searching files, the cost to scan an entire file typically exceeds the
paul@0	24	cost to navigate to the appropriate region and to scan a smaller portion of the
paul@0	25	file, especially for large files. At the same time, exotic data structures
paul@0	26	encouraging multiple seeks and reads are likely to waste time compared to just
paul@0	27	performing a single read operation, even if that operation involves a larger
paul@0	28	quantity of data, at least for storage with hard disk access characteristics.
paul@0	29	"""
paul@0	30
paul@6	31	from simplex.readers import *
paul@20	32	from simplex.indexers import *
paul@0	33	import bisect
paul@0	34
paul@22	35	class IndexedFile:
paul@22	36
paul@22	37	"An indexed file referring to a sorted file."
paul@22	38
paul@22	39	def __init__(self, reader, index_reader, get_key):
paul@22	40	self.reader = reader
paul@22	41	self.index_reader = index_reader
paul@22	42	self.get_key = get_key
paul@22	43
paul@22	44	def find(self, term):
paul@22	45	return find_with_index(self.reader, self.get_key, self.index_reader, term)
paul@22	46
paul@17	47	def find_with_index(reader, get_key, l, term):
paul@0	48
paul@0	49	"""
paul@17	50	In the resource whose 'reader' provides records, using a 'get_key' operation
paul@17	51	to yield the key for such records, and using the given index list 'l', find
paul@17	52	the given 'term', returning a record employing the term or None if no such
paul@4	53	record was found.
paul@0	54	"""
paul@0	55
paul@0	56	i = bisect.bisect_left(l, (term, None))
paul@3	57
paul@3	58	try:
paul@3	59	found, pos = l[i]
paul@3	60	except IndexError:
paul@3	61	return None
paul@0	62
paul@0	63	# Since the index is more coarse than the underlying file, the bisect left
paul@0	64	# operation will most likely point to an index entry for later records than
paul@0	65	# the desired one.
paul@0	66
paul@0	67	if found > term:
paul@0	68	i = max(0, i - 1)
paul@0	69	found, pos = l[i]
paul@0	70
paul@4	71	reader.seek(pos)
paul@17	72	return find_in_file(reader, get_key, term)
paul@0	73
paul@17	74	def find_in_file(reader, get_key, term):
paul@0	75
paul@0	76	"""
paul@17	77	In the resource whose 'reader' provides records, using a 'get_key' operation
paul@17	78	to yield the key for such records, find the given 'term', returning a record
paul@4	79	employing the term or None if no such record was found.
paul@0	80	"""
paul@0	81
paul@16	82	for record in reader:
paul@17	83	key = get_key(record)
paul@7	84	if term == key:
paul@1	85	return record
paul@0	86
paul@7	87	# Short-circuit failed searches.
paul@7	88
paul@7	89	elif term < key:
paul@7	90	return None
paul@7	91
paul@0	92	return None
paul@0	93
paul@3	94	def groups(l, length):
paul@3	95
paul@3	96	"Split 'l' into groups of the given 'length'."
paul@3	97
paul@3	98	if length <= 0:
paul@3	99	raise ValueError, "Groups must be greater than zero."
paul@3	100
paul@3	101	i = 0
paul@3	102	g = []
paul@3	103
paul@3	104	while i < len(l):
paul@3	105	g.append(l[i:i+length])
paul@3	106	i += length
paul@3	107
paul@3	108	return g
paul@3	109
paul@0	110	# vim: tabstop=4 expandtab shiftwidth=4

simplex

Annotated simplex/__init__.py

Annotated simplex/init.py