1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/simplex.py Fri Sep 30 00:44:45 2011 +0200
1.3 @@ -0,0 +1,104 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Simple indexing of sorted files.
1.8 +
1.9 +Copyright (C) 2009, 2010 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU General Public License as published by the Free Software
1.13 +Foundation; either version 3 of the License, or (at your option) any later
1.14 +version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
1.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
1.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
1.19 +
1.20 +You should have received a copy of the GNU General Public License along
1.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.22 +
1.23 +Motivations
1.24 +-----------
1.25 +
1.26 +When searching files, the cost to scan an entire file typically exceeds the
1.27 +cost to navigate to the appropriate region and to scan a smaller portion of the
1.28 +file, especially for large files. At the same time, exotic data structures
1.29 +encouraging multiple seeks and reads are likely to waste time compared to just
1.30 +performing a single read operation, even if that operation involves a larger
1.31 +quantity of data, at least for storage with hard disk access characteristics.
1.32 +
1.33 +Potential Improvements
1.34 +----------------------
1.35 +
1.36 +Ideally, the acquisition of records should be done more generally than just
1.37 +reading lines, and the selection of matches should involve more than just
1.38 +selecting the first column.
1.39 +"""
1.40 +
1.41 +import bisect
1.42 +
1.43 +def index_by_lines(f, interval):
1.44 +
1.45 + """
1.46 + Index a file 'f', creating an index entry for a line after a given number,
1.47 + defined by 'interval', has been read since the last entry.
1.48 + """
1.49 +
1.50 + l = []
1.51 + pos = 0
1.52 +
1.53 + for i, line in enumerate(f.xreadlines()):
1.54 + columns = line.split("\t")
1.55 + if i % interval == 0:
1.56 + l.append((columns[0], pos))
1.57 + pos += len(line)
1.58 +
1.59 + return l
1.60 +
1.61 +def find_line_with_index(f, l, term):
1.62 +
1.63 + """
1.64 + Find in file 'f', using the given index list 'l', the given 'term',
1.65 + returning a line employing the term or None if no such line was found.
1.66 + """
1.67 +
1.68 + i = bisect.bisect_left(l, (term, None))
1.69 + found, pos = l[i]
1.70 +
1.71 + # Since the index is more coarse than the underlying file, the bisect left
1.72 + # operation will most likely point to an index entry for later records than
1.73 + # the desired one.
1.74 +
1.75 + if found > term:
1.76 + i = max(0, i - 1)
1.77 + found, pos = l[i]
1.78 +
1.79 + f.seek(pos)
1.80 + return find_line_in_file(f, term)
1.81 +
1.82 +def find_line_in_file(f, term):
1.83 +
1.84 + """
1.85 + Find in file 'f' the given 'term', returning a line employing the term or
1.86 + None if no such line was found.
1.87 + """
1.88 +
1.89 + for line in f.xreadlines():
1.90 + columns = line.split("\t")
1.91 + if term == columns[0]:
1.92 + return line
1.93 +
1.94 + return None
1.95 +
1.96 +class Index:
1.97 +
1.98 + "An index abstraction."
1.99 +
1.100 + def __init__(self, entries, f):
1.101 + self.entries = entries
1.102 + self.f = f
1.103 +
1.104 + def find(self, term):
1.105 + return find_line_with_index(self.f, self.entries, term)
1.106 +
1.107 +# vim: tabstop=4 expandtab shiftwidth=4