1 #!/usr/bin/env python 2 3 """ 4 Simple indexing of sorted files. 5 6 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> 7 8 This program is free software; you can redistribute it and/or modify it under 9 the terms of the GNU General Public License as published by the Free Software 10 Foundation; either version 3 of the License, or (at your option) any later 11 version. 12 13 This program is distributed in the hope that it will be useful, but WITHOUT ANY 14 WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A 15 PARTICULAR PURPOSE. See the GNU General Public License for more details. 16 17 You should have received a copy of the GNU General Public License along 18 with this program. If not, see <http://www.gnu.org/licenses/>. 19 20 Motivations 21 ----------- 22 23 When searching files, the cost to scan an entire file typically exceeds the 24 cost to navigate to the appropriate region and to scan a smaller portion of the 25 file, especially for large files. At the same time, exotic data structures 26 encouraging multiple seeks and reads are likely to waste time compared to just 27 performing a single read operation, even if that operation involves a larger 28 quantity of data, at least for storage with hard disk access characteristics. 29 """ 30 31 from simplex.readers import * 32 from simplex.indexers import * 33 import bisect 34 35 class IndexedFile: 36 37 "An indexed file referring to a sorted file." 38 39 def __init__(self, reader, index, get_key): 40 self.reader = reader 41 self.index = index 42 self.get_key = get_key 43 44 def find(self, term): 45 return find_with_index(self.reader, self.get_key, self.index, term) 46 47 def find_with_index(reader, get_key, l, term): 48 49 """ 50 In the resource whose 'reader' provides records, using a 'get_key' operation 51 to yield the key for such records, and using the given index list 'l', find 52 the given 'term', returning a record employing the term or None if no such 53 record was found. 54 """ 55 56 i = bisect.bisect_left(l, (term, None)) 57 58 try: 59 found, pos = l[i] 60 except IndexError: 61 return None 62 63 # Since the index is more coarse than the underlying file, the bisect left 64 # operation will most likely point to an index entry for later records than 65 # the desired one. 66 67 if found > term: 68 i = max(0, i - 1) 69 found, pos = l[i] 70 71 reader.seek(pos) 72 73 if found == term: 74 return reader.next() 75 else: 76 return find_in_file(reader, get_key, term) 77 78 def find_in_file(reader, get_key, term): 79 80 """ 81 In the resource whose 'reader' provides records, using a 'get_key' operation 82 to yield the key for such records, find the given 'term', returning a record 83 employing the term or None if no such record was found. 84 """ 85 86 for record in reader: 87 key = get_key(record) 88 if term == key: 89 return record 90 91 # Short-circuit failed searches. 92 93 elif term < key: 94 return None 95 96 return None 97 98 def groups(l, length): 99 100 "Split 'l' into groups of the given 'length'." 101 102 if length <= 0: 103 raise ValueError, "Groups must be greater than zero." 104 105 i = 0 106 g = [] 107 108 while i < len(l): 109 g.append(l[i:i+length]) 110 i += length 111 112 return g 113 114 # vim: tabstop=4 expandtab shiftwidth=4