paul@20 | 1 | #!/usr/bin/env python |
paul@20 | 2 | |
paul@20 | 3 | """ |
paul@20 | 4 | Indexing classes. |
paul@20 | 5 | |
paul@20 | 6 | Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk> |
paul@20 | 7 | |
paul@20 | 8 | This program is free software; you can redistribute it and/or modify it under |
paul@20 | 9 | the terms of the GNU General Public License as published by the Free Software |
paul@20 | 10 | Foundation; either version 3 of the License, or (at your option) any later |
paul@20 | 11 | version. |
paul@20 | 12 | |
paul@20 | 13 | This program is distributed in the hope that it will be useful, but WITHOUT ANY |
paul@20 | 14 | WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A |
paul@20 | 15 | PARTICULAR PURPOSE. See the GNU General Public License for more details. |
paul@20 | 16 | |
paul@20 | 17 | You should have received a copy of the GNU General Public License along |
paul@20 | 18 | with this program. If not, see <http://www.gnu.org/licenses/>. |
paul@20 | 19 | """ |
paul@20 | 20 | |
class Indexer:

    """
    A periodic indexer: records are presented one at a time and an index
    entry is emitted for every 'interval'-th record, starting with the first.
    """

    def __init__(self, output, get_key, interval):

        """
        Prepare to index a resource. Entries are appended to 'output' (any
        object providing an 'append' method), 'get_key' is called on each
        record to obtain its key, and 'interval' is the number of records
        separating consecutive index entries.
        """

        # Collaborators supplied by the caller.

        self.get_key = get_key
        self.output = output
        self.interval = interval

        # Running totals: the number of records seen, and the accumulated
        # len() of those records.

        self.pos = 0
        self.count = 0

        # The key shared by the current run of records, and the position at
        # which that run began.

        self.current_key = None
        self.start_pos = 0

    def append(self, record):

        """
        Take note of 'record', emitting an index entry to the output if one is
        due for this record.
        """

        key = self.get_key(record)

        # An index entry always refers to the first record of a run of equal
        # keys. If it referred to a later record in the run, a seek using the
        # index would skip the earlier records bearing the same key.

        starts_new_run = key != self.current_key

        if starts_new_run:
            self.start_pos = self.pos
            self.current_key = key

        # Emit an entry for every 'interval'-th record presented.

        entry_due = self.count % self.interval == 0

        if entry_due:
            entry = (self.current_key, self.start_pos)
            self.output.append(entry)

        # Advance past this record.

        self.count += 1
        self.pos += len(record)
paul@20 | 68 | |
def make_index(reader, get_key, interval, output=None):

    """
    Index a resource whose 'reader' provides records when iterated over, using
    a 'get_key' operation to yield the key for such records, creating an index
    entry for a record after a given number of records, defined by 'interval',
    have been read since the last entry was produced.

    Either append index entries to the given 'output' sequence, or populate a
    new list if 'output' is None. The sequence that received the entries is
    returned.
    """

    # Test explicitly for None: an empty list is false in a boolean context,
    # so "output or []" would discard an empty sequence supplied by the caller
    # and leave it unpopulated.

    if output is None:
        entries = []
    else:
        entries = output

    indexer = Indexer(entries, get_key, interval)

    for record in reader:
        indexer.append(record)

    return entries
paul@20 | 88 | |
paul@20 | 89 | # vim: tabstop=4 expandtab shiftwidth=4 |