1.1 --- a/simplex/__init__.py Sat Oct 01 17:59:20 2011 +0200
1.2 +++ b/simplex/__init__.py Sat Oct 01 19:33:58 2011 +0200
1.3 @@ -29,6 +29,10 @@
1.4 """
1.5
1.6 from simplex.readers import *
1.7 +from simplex.iterators import *
1.8 +from simplex.accessors import *
1.9 +from simplex.state import *
1.10 +
1.11 import bisect
1.12
1.13 def make_index(reader, interval):
1.14 @@ -45,7 +49,7 @@
1.15 current_key = None
1.16 start_pos = 0
1.17
1.18 - for i, (key, record) in enumerate(reader.get_records()):
1.19 + for i, (key, record) in enumerate(reader):
1.20
1.21 # Where duplicate keys are permitted, the first record employing the key
1.22 # must be available as an index entry. Otherwise, records preceding the
1.23 @@ -97,7 +101,7 @@
1.24 found.
1.25 """
1.26
1.27 - for key, record in reader.get_records():
1.28 + for key, record in reader:
1.29 if term == key:
1.30 return record
1.31
2.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
2.2 +++ b/simplex/accessors.py Sat Oct 01 19:33:58 2011 +0200
2.3 @@ -0,0 +1,63 @@
2.4 +#!/usr/bin/env python
2.5 +
2.6 +"""
2.7 +Accessor classes for indexing.
2.8 +
2.9 +Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
2.10 +
2.11 +This program is free software; you can redistribute it and/or modify it under
2.12 +the terms of the GNU General Public License as published by the Free Software
2.13 +Foundation; either version 3 of the License, or (at your option) any later
2.14 +version.
2.15 +
2.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
2.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
2.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
2.19 +
2.20 +You should have received a copy of the GNU General Public License along
2.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
2.22 +"""
2.23 +
2.24 +class DelimitedRecord:
2.25 +
2.26 + "An accessor using a delimiter to split a record."
2.27 +
2.28 + def __init__(self, keys=None, delimiter=None, numeric=0):
2.29 +
2.30 + """
2.31 + Initialise the accessor using a sequence of 'keys' indicating the
2.32 + columns in each record that provide the values in the eventual compound
2.33 + key provided by each record, along with a 'delimiter' indicating how
2.34 + such columns are identified. If 'numeric' is set to a true value, keys
2.35 + will be interpreted as numbers.
2.36 + """
2.37 +
2.38 + self.keys = keys or [0] # a missing or empty 'keys' selects column 0 only
2.39 + self.delimiter = delimiter # handed unchanged to split() in get_key
2.40 + self.numeric = numeric # also decides whether get_sort_command adds -n
2.41 +
2.42 + # Define a conversion method: convert_numeric when 'numeric' is true, otherwise the identity function.
2.43 +
2.44 + self.convert = numeric and self.convert_numeric or (lambda x: x)
2.45 +
2.46 + def convert_numeric(self, term): # 'term' is the sequence of column values built by get_key
2.47 + return map(int, term) # applies int() to every value in 'term'
2.48 +
2.49 + def get_key(self, record): # returns the compound key for 'record'
2.50 + values = record.split(self.delimiter) # assuming 'record' is a string, a None delimiter splits on runs of whitespace
2.51 + return self.convert([values[key] for key in self.keys]) # pick the configured columns in 'keys' order, then convert them
2.52 +
2.53 + def get_sort_command(self):
2.54 +
2.55 + """
2.56 + Return the Unix sort command invocation required to produce the ordering
2.57 + described by this instance.
2.58 + """
2.59 +
2.60 + return "sort%s%s%s" % ( # the three slots are: field separator, numeric flag, key fields
2.61 + self.delimiter and (" -t $'%s'" % repr(self.delimiter)[1:-1]) or "", # repr()[1:-1] drops the surrounding quotes; NOTE(review): a single-quote delimiter would end $'...' early - confirm callers never pass one
2.62 + self.numeric and " -n" or "", # -n is emitted only when 'numeric' is true
2.63 + "".join([(" -k %d,%d" % (key + 1, key + 1)) for key in self.keys]) # column indexes here are 0-based, sort fields are 1-based; one -k start,end pair per key
2.64 + )
2.65 +
2.66 +# vim: tabstop=4 expandtab shiftwidth=4
3.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
3.2 +++ b/simplex/iterators.py Sat Oct 01 19:33:58 2011 +0200
3.3 @@ -0,0 +1,61 @@
3.4 +#!/usr/bin/env python
3.5 +
3.6 +"""
3.7 +Iterator classes for indexing.
3.8 +
3.9 +Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
3.10 +
3.11 +This program is free software; you can redistribute it and/or modify it under
3.12 +the terms of the GNU General Public License as published by the Free Software
3.13 +Foundation; either version 3 of the License, or (at your option) any later
3.14 +version.
3.15 +
3.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
3.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
3.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
3.19 +
3.20 +You should have received a copy of the GNU General Public License along
3.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
3.22 +"""
3.23 +
3.24 +class Iterator:
3.25 +
3.26 + "An iterator over records employing record accessors."
3.27 +
3.28 + def __init__(self, accessor):
3.29 + self.accessor = accessor # must provide get_key(record); see next()
3.30 + self.records = None # supplied later through set_records()
3.31 + self.iterator = None # created by __iter__()
3.32 +
3.33 + def set_records(self, records): # 'records' is whatever iterable __iter__() should walk
3.34 + self.records = records
3.35 +
3.36 + def __iter__(self):
3.37 + self.iterator = iter(self.records) # every call starts a fresh pass over the current records
3.38 + return self # this object is its own iterator
3.39 +
3.40 + def next(self):
3.41 + if self.iterator is None: # allow next() to be called without a prior iter()
3.42 + iter(self)
3.43 + record = self.iterator.next() # the underlying iterator's end-of-data exception propagates to the caller
3.44 + return self.accessor.get_key(record), record # each item is a (key, record) pair
3.45 +
3.46 +class StatefulIterator(Iterator):
3.47 +
3.48 + "An iterator over records maintaining state."
3.49 +
3.50 + def __init__(self, accessor, state):
3.51 + Iterator.__init__(self, accessor)
3.52 + self.state = state # must provide reset() and update(key); see the methods below
3.53 +
3.54 + def __iter__(self):
3.55 + Iterator.__iter__(self) # restart the underlying record iterator first
3.56 + self.state.reset() # every new pass also starts from a reset state
3.57 + return self
3.58 +
3.59 + def next(self):
3.60 + key, record = Iterator.next(self) # the key as produced by the accessor
3.61 + self.key = self.state.update(key) # the state transforms the key; the result is also kept on the instance
3.62 + return self.key, record # a (transformed key, record) pair
3.63 +
3.64 +# vim: tabstop=4 expandtab shiftwidth=4
4.1 --- a/simplex/readers.py Sat Oct 01 17:59:20 2011 +0200
4.2 +++ b/simplex/readers.py Sat Oct 01 19:33:58 2011 +0200
4.3 @@ -1,7 +1,7 @@
4.4 #!/usr/bin/env python
4.5
4.6 """
4.7 -Reader and accessor classes for indexing.
4.8 +Reader classes for indexing.
4.9
4.10 Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
4.11
4.12 @@ -22,9 +22,9 @@
4.13
4.14 "A generic file wrapper."
4.15
4.16 - def __init__(self, f, accessor):
4.17 + def __init__(self, f, iterator):
4.18 self.f = f
4.19 - self.accessor = accessor
4.20 + self.iterator = iterator
4.21
4.22 def seek(self, pos):
4.23 self.f.seek(pos)
4.24 @@ -33,62 +33,8 @@
4.25
4.26 "A wrapper around text files."
4.27
4.28 - def get_records(self):
4.29 - return Iterator(self.f.xreadlines(), self.accessor)
4.30 -
4.31 -class Iterator:
4.32 -
4.33 - "An iterator over records employing record accessors."
4.34 -
4.35 - def __init__(self, records, accessor):
4.36 - self.records = records
4.37 - self.accessor = accessor
4.38 - self.iterator = None
4.39 -
4.40 def __iter__(self):
4.41 - self.iterator = iter(self.records)
4.42 - return self
4.43 -
4.44 - def next(self):
4.45 - if self.iterator is None:
4.46 - iter(self)
4.47 - record = self.iterator.next()
4.48 - return self.accessor.get_key(record), record
4.49 -
4.50 -class DelimitedRecord:
4.51 -
4.52 - "An accessor using a delimiter to split a record."
4.53 -
4.54 - def __init__(self, keys=None, delimiter=None, numeric=0):
4.55 -
4.56 - """
4.57 - Initialise the accessor using a sequence of 'keys' indicating the
4.58 - columns in each record that provide the values in the eventual compound
4.59 - key provided by each record, along with a 'delimiter' indicating how
4.60 - such columns are identified. If 'numeric' is set to a true value, keys
4.61 - will be interpreted as numbers.
4.62 - """
4.63 -
4.64 - self.keys = keys or [0]
4.65 - self.delimiter = delimiter
4.66 - self.numeric = numeric
4.67 -
4.68 - # Define a conversion method.
4.69 -
4.70 - self.convert = numeric and self.convert_numeric or (lambda x: x)
4.71 -
4.72 - def convert_numeric(self, term):
4.73 - return map(int, term)
4.74 -
4.75 - def get_key(self, record):
4.76 - values = record.split(self.delimiter)
4.77 - return self.convert([values[key] for key in self.keys])
4.78 -
4.79 - def get_sort_command(self):
4.80 - return "sort%s%s%s" % (
4.81 - self.delimiter and (" -t $'%s'" % repr(self.delimiter)[1:-1]) or "",
4.82 - self.numeric and " -n" or "",
4.83 - "".join([(" -k %d,%d" % (key + 1, key + 1)) for key in self.keys])
4.84 - )
4.85 + self.iterator.set_records(self.f.xreadlines())
4.86 + return self.iterator
4.87
4.88 # vim: tabstop=4 expandtab shiftwidth=4
5.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
5.2 +++ b/simplex/state.py Sat Oct 01 19:33:58 2011 +0200
5.3 @@ -0,0 +1,53 @@
5.4 +#!/usr/bin/env python
5.5 +
5.6 +"""
5.7 +State management classes for iterators.
5.8 +
5.9 +Copyright (C) 2011 Paul Boddie <paul@boddie.org.uk>
5.10 +
5.11 +This program is free software; you can redistribute it and/or modify it under
5.12 +the terms of the GNU General Public License as published by the Free Software
5.13 +Foundation; either version 3 of the License, or (at your option) any later
5.14 +version.
5.15 +
5.16 +This program is distributed in the hope that it will be useful, but WITHOUT ANY
5.17 +WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS FOR A
5.18 +PARTICULAR PURPOSE. See the GNU General Public License for more details.
5.19 +
5.20 +You should have received a copy of the GNU General Public License along
5.21 +with this program. If not, see <http://www.gnu.org/licenses/>.
5.22 +"""
5.23 +
5.24 +from os.path import commonprefix
5.25 +
5.26 +class CommonPrefixState:
5.27 +
5.28 + "A class whose instances maintain common prefix state."
5.29 +
5.30 + def __init__(self, initial=""):
5.31 + self.initial = initial # remembered so that reset() can restore it
5.32 + self.reset() # establishes self.value
5.33 +
5.34 + def reset(self):
5.35 + self.value = self.initial # return to the value given at construction
5.36 +
5.37 +class CommonPrefixDecoder(CommonPrefixState):
5.38 +
5.39 + "A class whose instances decode common prefix information."
5.40 +
5.41 + def update(self, common_plus_suffix): # expects a two-item (common, suffix) sequence
5.42 + common, suffix = common_plus_suffix # 'common' is used as a slice bound below
5.43 + self.value = self.value[:common] + suffix # keep the first 'common' items of the previous value, then append the suffix
5.44 + return self.value # the reconstructed value, also retained for the next call
5.45 +
5.46 +class CommonPrefixEncoder(CommonPrefixState):
5.47 +
5.48 + "A class whose instances encode common prefix information."
5.49 +
5.50 + def update(self, value):
5.51 + common = len(commonprefix((self.value, value))) # length of the prefix shared by the previous value and the new one
5.52 + suffix = value[common:] # the part of 'value' after the shared prefix
5.53 + self.value = value # remember the full value for the next call
5.54 + return common, suffix # a (common length, suffix) pair
5.55 +
5.56 +# vim: tabstop=4 expandtab shiftwidth=4
6.1 --- a/test_indexed.py Sat Oct 01 17:59:20 2011 +0200
6.2 +++ b/test_indexed.py Sat Oct 01 19:33:58 2011 +0200
6.3 @@ -14,7 +14,7 @@
6.4
6.5 f = open(filename)
6.6 accessor = DelimitedRecord(keys, numeric=(numeric == "true"))
6.7 -reader = TextFile(f, accessor)
6.8 +reader = TextFile(f, Iterator(accessor))
6.9 try:
6.10 t = time.time()
6.11 l = make_index(reader, int(interval))
7.1 --- a/test_scan.py Sat Oct 01 17:59:20 2011 +0200
7.2 +++ b/test_scan.py Sat Oct 01 19:33:58 2011 +0200
7.3 @@ -14,7 +14,7 @@
7.4
7.5 f = open(filename)
7.6 accessor = DelimitedRecord(keys, numeric=(numeric == "true"))
7.7 -reader = TextFile(f, accessor)
7.8 +reader = TextFile(f, Iterator(accessor))
7.9 try:
7.10 for term in terms:
7.11 reader.seek(0)