paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | """ |
paul@0 | 4 | Simple indexing of sorted files. |
paul@0 | 5 | |
paul@1 | 6 | Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> |
paul@0 | 7 | |
paul@0 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@0 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@0 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@0 | 11 | version. |
paul@0 | 12 | |
paul@0 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@0 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@0 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@0 | 16 | |
paul@0 | 17 | You should have received a copy of the GNU General Public License along |
paul@0 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@0 | 19 | |
paul@0 | 20 | Motivations |
paul@0 | 21 | ----------- |
paul@0 | 22 | |
paul@0 | 23 | When searching files, the cost to scan an entire file typically exceeds the |
paul@0 | 24 | cost to navigate to the appropriate region and to scan a smaller portion of the |
paul@0 | 25 | file, especially for large files. At the same time, exotic data structures |
paul@0 | 26 | encouraging multiple seeks and reads are likely to waste time compared to just |
paul@0 | 27 | performing a single read operation, even if that operation involves a larger |
paul@0 | 28 | quantity of data, at least for storage with hard disk access characteristics. |
paul@0 | 29 | """ |
paul@0 | 30 | |
paul@6 | 31 | from simplex.readers import * |
paul@20 | 32 | from simplex.indexers import * |
paul@0 | 33 | import bisect |
paul@0 | 34 | |
paul@22 | 35 | class IndexedFile: |
paul@22 | 36 | |
paul@22 | 37 | "An indexed file referring to a sorted file." |
paul@22 | 38 | |
paul@22 | 39 | def __init__(self, reader, index_reader, get_key): |
paul@22 | 40 | self.reader = reader |
paul@22 | 41 | self.index_reader = index_reader |
paul@22 | 42 | self.get_key = get_key |
paul@22 | 43 | |
paul@22 | 44 | def find(self, term): |
paul@22 | 45 | return find_with_index(self.reader, self.get_key, self.index_reader, term) |
paul@22 | 46 | |
paul@17 | 47 | def find_with_index(reader, get_key, l, term): |
paul@0 | 48 | |
paul@0 | 49 | """ |
paul@17 | 50 | In the resource whose 'reader' provides records, using a 'get_key' operation |
paul@17 | 51 | to yield the key for such records, and using the given index list 'l', find |
paul@17 | 52 | the given 'term', returning a record employing the term or None if no such |
paul@4 | 53 | record was found. |
paul@0 | 54 | """ |
paul@0 | 55 | |
paul@0 | 56 | i = bisect.bisect_left(l, (term, None)) |
paul@3 | 57 | |
paul@3 | 58 | try: |
paul@3 | 59 | found, pos = l[i] |
paul@3 | 60 | except IndexError: |
paul@3 | 61 | return None |
paul@0 | 62 | |
paul@0 | 63 | # Since the index is more coarse than the underlying file, the bisect left |
paul@0 | 64 | # operation will most likely point to an index entry for later records than |
paul@0 | 65 | # the desired one. |
paul@0 | 66 | |
paul@0 | 67 | if found > term: |
paul@0 | 68 | i = max(0, i - 1) |
paul@0 | 69 | found, pos = l[i] |
paul@0 | 70 | |
paul@4 | 71 | reader.seek(pos) |
paul@17 | 72 | return find_in_file(reader, get_key, term) |
paul@0 | 73 | |
paul@17 | 74 | def find_in_file(reader, get_key, term): |
paul@0 | 75 | |
paul@0 | 76 | """ |
paul@17 | 77 | In the resource whose 'reader' provides records, using a 'get_key' operation |
paul@17 | 78 | to yield the key for such records, find the given 'term', returning a record |
paul@4 | 79 | employing the term or None if no such record was found. |
paul@0 | 80 | """ |
paul@0 | 81 | |
paul@16 | 82 | for record in reader: |
paul@17 | 83 | key = get_key(record) |
paul@7 | 84 | if term == key: |
paul@1 | 85 | return record |
paul@0 | 86 | |
paul@7 | 87 | # Short-circuit failed searches. |
paul@7 | 88 | |
paul@7 | 89 | elif term < key: |
paul@7 | 90 | return None |
paul@7 | 91 | |
paul@0 | 92 | return None |
paul@0 | 93 | |
paul@3 | 94 | def groups(l, length): |
paul@3 | 95 | |
paul@3 | 96 | "Split 'l' into groups of the given 'length'." |
paul@3 | 97 | |
paul@3 | 98 | if length <= 0: |
paul@3 | 99 | raise ValueError, "Groups must be greater than zero." |
paul@3 | 100 | |
paul@3 | 101 | i = 0 |
paul@3 | 102 | g = [] |
paul@3 | 103 | |
paul@3 | 104 | while i < len(l): |
paul@3 | 105 | g.append(l[i:i+length]) |
paul@3 | 106 | i += length |
paul@3 | 107 | |
paul@3 | 108 | return g |
paul@3 | 109 | |
paul@0 | 110 | # vim: tabstop=4 expandtab shiftwidth=4 |