paul@0 | 1 | #!/usr/bin/env python |
paul@0 | 2 | |
paul@0 | 3 | """ |
paul@0 | 4 | Simple indexing of sorted files. |
paul@0 | 5 | |
paul@1 | 6 | Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> |
paul@0 | 7 | |
paul@0 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@0 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@0 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@0 | 11 | version. |
paul@0 | 12 | |
paul@0 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@0 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@0 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@0 | 16 | |
paul@0 | 17 | You should have received a copy of the GNU General Public License along |
paul@0 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@0 | 19 | |
paul@0 | 20 | Motivations |
paul@0 | 21 | ----------- |
paul@0 | 22 | |
paul@0 | 23 | When searching files, the cost to scan an entire file typically exceeds the |
paul@0 | 24 | cost to navigate to the appropriate region and to scan a smaller portion of the |
paul@0 | 25 | file, especially for large files. At the same time, exotic data structures |
paul@0 | 26 | encouraging multiple seeks and reads are likely to waste time compared to just |
paul@0 | 27 | performing a single read operation, even if that operation involves a larger |
paul@0 | 28 | quantity of data, at least for storage with hard disk access characteristics. |
paul@0 | 29 | """ |
paul@0 | 30 | |
paul@6 | 31 | from simplex.readers import * |
paul@20 | 32 | from simplex.indexers import * |
paul@0 | 33 | import bisect |
paul@0 | 34 | |
paul@17 | 35 | def find_with_index(reader, get_key, l, term): |
paul@0 | 36 | |
paul@0 | 37 | """ |
paul@17 | 38 | In the resource whose 'reader' provides records, using a 'get_key' operation |
paul@17 | 39 | to yield the key for such records, and using the given index list 'l', find |
paul@17 | 40 | the given 'term', returning a record employing the term or None if no such |
paul@4 | 41 | record was found. |
paul@0 | 42 | """ |
paul@0 | 43 | |
paul@0 | 44 | i = bisect.bisect_left(l, (term, None)) |
paul@3 | 45 | |
paul@3 | 46 | try: |
paul@3 | 47 | found, pos = l[i] |
paul@3 | 48 | except IndexError: |
paul@3 | 49 | return None |
paul@0 | 50 | |
paul@0 | 51 | # Since the index is more coarse than the underlying file, the bisect left |
paul@0 | 52 | # operation will most likely point to an index entry for later records than |
paul@0 | 53 | # the desired one. |
paul@0 | 54 | |
paul@0 | 55 | if found > term: |
paul@0 | 56 | i = max(0, i - 1) |
paul@0 | 57 | found, pos = l[i] |
paul@0 | 58 | |
paul@4 | 59 | reader.seek(pos) |
paul@17 | 60 | return find_in_file(reader, get_key, term) |
paul@0 | 61 | |
paul@17 | 62 | def find_in_file(reader, get_key, term): |
paul@0 | 63 | |
paul@0 | 64 | """ |
paul@17 | 65 | In the resource whose 'reader' provides records, using a 'get_key' operation |
paul@17 | 66 | to yield the key for such records, find the given 'term', returning a record |
paul@4 | 67 | employing the term or None if no such record was found. |
paul@0 | 68 | """ |
paul@0 | 69 | |
paul@16 | 70 | for record in reader: |
paul@17 | 71 | key = get_key(record) |
paul@7 | 72 | if term == key: |
paul@1 | 73 | return record |
paul@0 | 74 | |
paul@7 | 75 | # Short-circuit failed searches. |
paul@7 | 76 | |
paul@7 | 77 | elif term < key: |
paul@7 | 78 | return None |
paul@7 | 79 | |
paul@0 | 80 | return None |
paul@0 | 81 | |
paul@3 | 82 | def groups(l, length): |
paul@3 | 83 | |
paul@3 | 84 | "Split 'l' into groups of the given 'length'." |
paul@3 | 85 | |
paul@3 | 86 | if length <= 0: |
paul@3 | 87 | raise ValueError, "Groups must be greater than zero." |
paul@3 | 88 | |
paul@3 | 89 | i = 0 |
paul@3 | 90 | g = [] |
paul@3 | 91 | |
paul@3 | 92 | while i < len(l): |
paul@3 | 93 | g.append(l[i:i+length]) |
paul@3 | 94 | i += length |
paul@3 | 95 | |
paul@3 | 96 | return g |
paul@3 | 97 | |
paul@0 | 98 | # vim: tabstop=4 expandtab shiftwidth=4 |