1.1 --- a/simplex.py Sat Oct 01 00:40:38 2011 +0200
1.2 +++ b/simplex.py Sat Oct 01 00:56:16 2011 +0200
1.3 @@ -26,13 +26,6 @@
1.4 encouraging multiple seeks and reads are likely to waste time compared to just
1.5 performing a single read operation, even if that operation involves a larger
1.6 quantity of data, at least for storage with hard disk access characteristics.
1.7 -
1.8 -Potential Improvements
1.9 -----------------------
1.10 -
1.11 -Ideally, the acquisition of records should be done more generally than just
1.12 -reading lines, and the selection of matches should involve more than just
1.13 -selecting the first column.
1.14 """
1.15
1.16 import bisect
1.17 @@ -56,11 +49,13 @@
1.18 values = record.split(self.delimiter)
1.19 return [values[key] for key in self.keys]
1.20
1.21 -def index_file(f, interval):
1.22 +def make_index(reader, accessor, interval):
1.23
1.24 """
1.25 - Index a file 'f', creating an index entry for a record after a given number,
1.26 - defined by 'interval', have been read since the last entry.
1.27 + Index a resource whose 'reader' provides records and whose 'accessor' can
1.28 + yield the key for such records, creating an index entry for a record after a
1.29 + given number of records, defined by 'interval', have been read since the
1.30 + last entry was produced.
1.31 """
1.32
1.33 l = []
1.34 @@ -69,8 +64,8 @@
1.35 current_key = None
1.36 start_pos = 0
1.37
1.38 - for i, record in enumerate(f.get_records()):
1.39 - key = f.get_key(record)
1.40 + for i, record in enumerate(reader.get_records()):
1.41 + key = accessor.get_key(record)
1.42
1.43 # Where duplicate keys are permitted, the first record employing the key
1.44 # must be available as an index entry. Otherwise, records preceding the
1.45 @@ -88,11 +83,13 @@
1.46
1.47 return l
1.48
1.49 -def find_with_index(f, l, term):
1.50 +def find_with_index(reader, accessor, l, term):
1.51
1.52 """
1.53 - Find in file 'f', using the given index list 'l', the given 'term',
1.54 - returning a record employing the term or None if no such record was found.
1.55 + Find the given 'term' in the resource whose 'reader' provides records and
1.56 + whose 'accessor' can yield the key for such records, using the given index
1.57 + list 'l', returning a record employing the term or None if no such record
1.58 + was found.
1.59 """
1.60
1.61 i = bisect.bisect_left(l, (term, None))
1.62 @@ -110,33 +107,23 @@
1.63 i = max(0, i - 1)
1.64 found, pos = l[i]
1.65
1.66 - f.seek(pos)
1.67 - return find_in_file(f, term)
1.68 + reader.seek(pos)
1.69 + return find_in_file(reader, accessor, term)
1.70
1.71 -def find_in_file(f, term):
1.72 +def find_in_file(reader, accessor, term):
1.73
1.74 """
1.75 - Find in file 'f' the given 'term', returning a record employing the term or
1.76 - None if no such record was found.
1.77 + Find the given 'term' in the resource whose 'reader' provides records and
1.78 + whose 'accessor' can yield the key for such records, returning a record
1.79 + employing the term or None if no such record was found.
1.80 """
1.81
1.82 - for record in f.get_records():
1.83 - if term == f.get_key(record):
1.84 + for record in reader.get_records():
1.85 + if term == accessor.get_key(record):
1.86 return record
1.87
1.88 return None
1.89
1.90 -class Index:
1.91 -
1.92 - "An index abstraction."
1.93 -
1.94 - def __init__(self, entries, f):
1.95 - self.entries = entries
1.96 - self.f = f
1.97 -
1.98 - def find(self, term):
1.99 - return find_with_index(self.f, self.entries, term)
1.100 -
1.101 def groups(l, length):
1.102
1.103 "Split 'l' into groups of the given 'length'."
2.1 --- a/test_indexed.py Sat Oct 01 00:40:38 2011 +0200
2.2 +++ b/test_indexed.py Sat Oct 01 00:56:16 2011 +0200
2.3 @@ -13,17 +13,17 @@
2.4 sys.exit(1)
2.5
2.6 f = open(filename)
2.7 -tf = TextFile(f, keys)
2.8 +reader = TextFile(f, keys)
2.9 try:
2.10 t = time.time()
2.11 - l = index_file(tf, int(interval))
2.12 + l = make_index(reader, reader, int(interval))
2.13 print "Indexed in %s seconds." % (time.time() - t)
2.14
2.15 # Now use the index.
2.16
2.17 for term in terms:
2.18 t = time.time()
2.19 - line = find_with_index(tf, l, term)
2.20 + line = find_with_index(reader, reader, l, term)
2.21 if line:
2.22 print "Found (at %s seconds)...\n%s" % (time.time() - t, line)
2.23
3.1 --- a/test_scan.py Sat Oct 01 00:40:38 2011 +0200
3.2 +++ b/test_scan.py Sat Oct 01 00:56:16 2011 +0200
3.3 @@ -13,13 +13,13 @@
3.4 sys.exit(1)
3.5
3.6 f = open(filename)
3.7 -tf = TextFile(f, keys)
3.8 +reader = TextFile(f, keys)
3.9 try:
3.10 for term in terms:
3.11 - tf.seek(0)
3.12 + reader.seek(0)
3.13
3.14 t = time.time()
3.15 - line = find_in_file(tf, term)
3.16 + line = find_in_file(reader, reader, term)
3.17 if line:
3.18 print "Found (at %s seconds)...\n%s" % (time.time() - t, line)
3.19 finally: