1.1 --- a/simplex.py Sat Oct 01 01:14:13 2011 +0200
1.2 +++ /dev/null Thu Jan 01 00:00:00 1970 +0000
1.3 @@ -1,149 +0,0 @@
1.4 -#!/usr/bin/env python
1.5 -
1.6 -"""
1.7 -Simple indexing of sorted files.
1.8 -
1.9 -Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
1.10 -
1.11 -This program is free software; you can redistribute it and/or modify it under
1.12 -the terms of the GNU General Public License as published by the Free Software
1.13 -Foundation; either version 3 of the License, or (at your option) any later
1.14 -version.
1.15 -
1.16 -This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 -WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 -PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 -
1.20 -You should have received a copy of the GNU General Public License along
1.21 -with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 -
1.23 -Motivations
1.24 ------------
1.25 -
1.26 -When searching files, the cost to scan an entire file typically exceeds the
1.27 -cost to navigate to the appropriate region and to scan a smaller portion of the
1.28 -file, especially for large files. At the same time, exotic data structures
1.29 -encouraging multiple seeks and reads are likely to waste time compared to just
1.30 -performing a single read operation, even if that operation involves a larger
1.31 -quantity of data, at least for storage with hard disk access characteristics.
1.32 -"""
1.33 -
1.34 -import bisect
1.35 -
1.36 -class TextFile:
1.37 -
1.38 - "A wrapper around text files."
1.39 -
1.40 - def __init__(self, f):
1.41 - self.f = f
1.42 -
1.43 - def seek(self, pos):
1.44 - self.f.seek(pos)
1.45 -
1.46 - def get_records(self):
1.47 - return self.f.xreadlines()
1.48 -
1.49 -class DelimitedRecord:
1.50 -
1.51 - "An accessor using a delimiter to split a record."
1.52 -
1.53 - def __init__(self, keys=None, delimiter=None):
1.54 - self.keys = keys or [0]
1.55 - self.delimiter = delimiter
1.56 -
1.57 - def get_key(self, record):
1.58 - values = record.split(self.delimiter)
1.59 - return [values[key] for key in self.keys]
1.60 -
1.61 -def make_index(reader, accessor, interval):
1.62 -
1.63 - """
1.64 - Index a resource whose 'reader' provides records and whose 'accessor' can
1.65 - yield the key for such records, creating an index entry for a record after a
1.66 - given number of records, defined by 'interval', have been read since the
1.67 - last entry was produced.
1.68 - """
1.69 -
1.70 - l = []
1.71 - pos = 0
1.72 -
1.73 - current_key = None
1.74 - start_pos = 0
1.75 -
1.76 - for i, record in enumerate(reader.get_records()):
1.77 - key = accessor.get_key(record)
1.78 -
1.79 - # Where duplicate keys are permitted, the first record employing the key
1.80 - # must be available as an index entry. Otherwise, records preceding the
1.81 - # one referenced by the entry may have the same key and be missed when
1.82 - # seeking using the index.
1.83 -
1.84 - if key != current_key:
1.85 - current_key = key
1.86 - start_pos = pos
1.87 -
1.88 - if i % interval == 0:
1.89 - l.append((current_key, start_pos))
1.90 -
1.91 - pos += len(record)
1.92 -
1.93 - return l
1.94 -
1.95 -def find_with_index(reader, accessor, l, term):
1.96 -
1.97 - """
1.98 - Find in the resource whose 'reader' provides records and whose 'accessor'
1.99 - can yield the key for such records, using the given index list 'l', the
1.100 - given 'term', returning a record employing the term or None if no such
1.101 - record was found.
1.102 - """
1.103 -
1.104 - i = bisect.bisect_left(l, (term, None))
1.105 -
1.106 - try:
1.107 - found, pos = l[i]
1.108 - except IndexError:
1.109 - return None
1.110 -
1.111 - # Since the index is more coarse than the underlying file, the bisect left
1.112 - # operation will most likely point to an index entry for later records than
1.113 - # the desired one.
1.114 -
1.115 - if found > term:
1.116 - i = max(0, i - 1)
1.117 - found, pos = l[i]
1.118 -
1.119 - reader.seek(pos)
1.120 - return find_in_file(reader, accessor, term)
1.121 -
1.122 -def find_in_file(reader, accessor, term):
1.123 -
1.124 - """
1.125 - Find in the resource whose 'reader' provides records and whose 'accessor'
1.126 - can yield the key for such records, the given 'term', returning a record
1.127 - employing the term or None if no such record was found.
1.128 - """
1.129 -
1.130 - for record in reader.get_records():
1.131 - if term == accessor.get_key(record):
1.132 - return record
1.133 -
1.134 - return None
1.135 -
1.136 -def groups(l, length):
1.137 -
1.138 - "Split 'l' into groups of the given 'length'."
1.139 -
1.140 - if length <= 0:
1.141 - raise ValueError, "Groups must be greater than zero."
1.142 -
1.143 - i = 0
1.144 - g = []
1.145 -
1.146 - while i < len(l):
1.147 - g.append(l[i:i+length])
1.148 - i += length
1.149 -
1.150 - return g
1.151 -
1.152 -# vim: tabstop=4 expandtab shiftwidth=4
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/simplex/__init__.py Sat Oct 01 15:07:17 2011 +0200
2.3 @@ -0,0 +1,125 @@
2.4 +#!/usr/bin/env python
2.5 +
2.6 +"""
2.7 +Simple indexing of sorted files.
2.8 +
2.9 +Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
2.10 +
2.11 +This program is free software; you can redistribute it and/or modify it under
2.12 +the terms of the GNU General Public License as published by the Free Software
2.13 +Foundation; either version 3 of the License, or (at your option) any later
2.14 +version.
2.15 +
2.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
2.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
2.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
2.19 +
2.20 +You should have received a copy of the GNU General Public License along
2.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
2.22 +
2.23 +Motivations
2.24 +-----------
2.25 +
2.26 +When searching files, the cost to scan an entire file typically exceeds the
2.27 +cost to navigate to the appropriate region and to scan a smaller portion of the
2.28 +file, especially for large files. At the same time, exotic data structures
2.29 +encouraging multiple seeks and reads are likely to waste time compared to just
2.30 +performing a single read operation, even if that operation involves a larger
2.31 +quantity of data, at least for storage with hard disk access characteristics.
2.32 +"""
2.33 +
2.34 +from simplex.readers import *
2.35 +import bisect
2.36 +
2.37 +def make_index(reader, accessor, interval):
2.38 +
2.39 + """
2.40 + Index a resource whose 'reader' provides records and whose 'accessor' can
2.41 + yield the key for such records, creating an index entry for a record after a
2.42 + given number of records, defined by 'interval', have been read since the
2.43 + last entry was produced.
2.44 + """
2.45 +
2.46 + l = []
2.47 + pos = 0
2.48 +
2.49 + current_key = None
2.50 + start_pos = 0
2.51 +
2.52 + for i, record in enumerate(reader.get_records()):
2.53 + key = accessor.get_key(record)
2.54 +
2.55 + # Where duplicate keys are permitted, the first record employing the key
2.56 + # must be available as an index entry. Otherwise, records preceding the
2.57 + # one referenced by the entry may have the same key and be missed when
2.58 + # seeking using the index.
2.59 +
2.60 + if key != current_key:
2.61 + current_key = key
2.62 + start_pos = pos
2.63 +
2.64 + if i % interval == 0:
2.65 + l.append((current_key, start_pos))
2.66 +
2.67 + pos += len(record)
2.68 +
2.69 + return l
2.70 +
2.71 +def find_with_index(reader, accessor, l, term):
2.72 +
2.73 + """
2.74 + Find in the resource whose 'reader' provides records and whose 'accessor'
2.75 + can yield the key for such records, using the given index list 'l', the
2.76 + given 'term', returning a record employing the term or None if no such
2.77 + record was found.
2.78 + """
2.79 +
2.80 + i = bisect.bisect_left(l, (term, None))
2.81 +
2.82 + try:
2.83 + found, pos = l[i]
2.84 + except IndexError:
2.85 + return None
2.86 +
2.87 + # Since the index is more coarse than the underlying file, the bisect left
2.88 + # operation will most likely point to an index entry for later records than
2.89 + # the desired one.
2.90 +
2.91 + if found > term:
2.92 + i = max(0, i - 1)
2.93 + found, pos = l[i]
2.94 +
2.95 + reader.seek(pos)
2.96 + return find_in_file(reader, accessor, term)
2.97 +
2.98 +def find_in_file(reader, accessor, term):
2.99 +
2.100 + """
2.101 + Find in the resource whose 'reader' provides records and whose 'accessor'
2.102 + can yield the key for such records, the given 'term', returning a record
2.103 + employing the term or None if no such record was found.
2.104 + """
2.105 +
2.106 + for record in reader.get_records():
2.107 + if term == accessor.get_key(record):
2.108 + return record
2.109 +
2.110 + return None
2.111 +
2.112 +def groups(l, length):
2.113 +
2.114 + "Split 'l' into groups of the given 'length'."
2.115 +
2.116 + if length <= 0:
2.117 + raise ValueError, "Groups must be greater than zero."
2.118 +
2.119 + i = 0
2.120 + g = []
2.121 +
2.122 + while i < len(l):
2.123 + g.append(l[i:i+length])
2.124 + i += length
2.125 +
2.126 + return g
2.127 +
2.128 +# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/simplex/readers.py Sat Oct 01 15:07:17 2011 +0200
3.3 @@ -0,0 +1,54 @@
3.4 +#!/usr/bin/env python
3.5 +
3.6 +"""
3.7 +Reader and accessor classes for indexing.
3.8 +
3.9 +Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
3.10 +
3.11 +This program is free software; you can redistribute it and/or modify it under
3.12 +the terms of the GNU General Public License as published by the Free Software
3.13 +Foundation; either version 3 of the License, or (at your option) any later
3.14 +version.
3.15 +
3.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
3.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
3.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
3.19 +
3.20 +You should have received a copy of the GNU General Public License along
3.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
3.22 +"""
3.23 +
3.24 +class TextFile:
3.25 +
3.26 + "A wrapper around text files."
3.27 +
3.28 + def __init__(self, f):
3.29 + self.f = f
3.30 +
3.31 + def seek(self, pos):
3.32 + self.f.seek(pos)
3.33 +
3.34 + def get_records(self):
3.35 + return self.f.xreadlines()
3.36 +
3.37 +class DelimitedRecord:
3.38 +
3.39 + "An accessor using a delimiter to split a record."
3.40 +
3.41 + def __init__(self, keys=None, delimiter=None):
3.42 +
3.43 + """
3.44 + Initialise the accessor using a sequence of 'keys' indicating the
3.45 + columns in each record that provide the values in the eventual compound
3.46 + key provided by each record, along with a 'delimiter' indicating how
3.47 + such columns are identified.
3.48 + """
3.49 +
3.50 + self.keys = keys or [0]
3.51 + self.delimiter = delimiter
3.52 +
3.53 + def get_key(self, record):
3.54 + values = record.split(self.delimiter)
3.55 + return [values[key] for key in self.keys]
3.56 +
3.57 +# vim: tabstop=4 expandtab shiftwidth=4