1.1 --- a/imiptools/text.py Fri May 26 23:58:06 2017 +0200
1.2 +++ b/imiptools/text.py Thu Jun 01 23:26:38 2017 +0200
1.3 @@ -19,47 +19,305 @@
1.4 this program. If not, see <http://www.gnu.org/licenses/>.
1.5 """
1.6
1.7 +from imiptools.filesys import fix_permissions
1.8 +from os.path import isfile
1.9 import codecs
1.10 import re
1.11
1.12 -# Parsing of lines to obtain functions and arguments.
1.13 +def have_table(obj, filename):
1.14 +
1.15 + "Return whether 'obj' is a table using the given 'filename'."
1.16 +
1.17 + return hasattr(obj, "get_filename") and obj.get_filename() == filename
1.18 +
1.19 +class FileTable:
1.20 +
1.21 + "A file-based data table."
1.22 +
1.23 + def __init__(self, filename, mutable=True,
1.24 + in_defaults=None, out_defaults=None,
1.25 + in_converter=None, out_converter=None,
1.26 + tab_separated=True, headers=False):
1.27 +
1.28 + """
1.29 + Open the table from the file having the given 'filename'. If 'mutable'
1.30 + is given as a true value (as is the default), the table can be modified.
1.31 +
1.32 + The 'in_defaults' is a list of (index, value) tuples indicating the
1.33 + default value where a column either does not exist or provides an empty
1.34 + value. The 'out_defaults' is a corresponding list used to serialise
1.35 + missing and empty values.
1.36 +
1.37 + The 'in_converter' is a callable accepting a tuple of values and
1.38 + returning an object. The corresponding 'out_converter' accepts an object
1.39 + and returns a tuple of values.
1.40 +
1.41 + If 'tab_separated' is specified and is a false value, line parsing using
1.42 + the imiptools.text.parse_line function will be performed instead of
1.43 + splitting each line of the file using tab characters as separators.
1.44 +
1.45 + If 'headers' is specified and is not false, the first line in the table
1.46 + will provide header value information.
1.47 + """
1.48 +
1.49 + self.filename = filename
1.50 + self.mutable = mutable
1.51 + self.in_defaults = in_defaults
1.52 + self.out_defaults = out_defaults
1.53 + self.in_converter = in_converter
1.54 + self.out_converter = out_converter
1.55 + self.tab_separated = tab_separated
1.56 +
1.57 + # Obtain the items. In subsequent implementations, the items could be
1.58 + # retrieved dynamically.
1.59 +
1.60 + items = []
1.61 +
1.62 + if isfile(filename):
1.63 + for item in get_table(filename, in_defaults, tab_separated):
1.64 + if self.in_converter:
1.65 + item = self.in_converter(item)
1.66 + items.append(item)
1.67 +
1.68 + # Obtain header values and separate them from the rest of the data.
1.69 +
1.70 + self.table = items[headers and 1 or 0:]
1.71 + self.header_values = headers and items and items[0] or []
1.72 + self.headers = headers
1.73 +
1.74 + def get_filename(self):
1.75 + return self.filename
1.76 +
1.77 + def get_header_values(self):
1.78 + return self.header_values
1.79 +
1.80 + def set_header_values(self, values):
1.81 + self.header_values = values
1.82 +
1.83 + def close(self):
1.84
1.85 -line_pattern_str = (
1.86 - r"(?:"
1.87 - r"(?:'(.*?)')" # single-quoted text
1.88 - r"|"
1.89 - r'(?:"(.*?)")' # double-quoted text
1.90 - r"|"
1.91 - r"([^\s]+)" # non-whitespace characters
1.92 - r")+"
1.93 - r"(?:\s+|$)" # optional trailing whitespace before line end
1.94 - )
1.95 + "Write any modifications and close the table."
1.96 +
1.97 + if self.mutable:
1.98 + f = codecs.open(self.filename, "wb", encoding="utf-8")
1.99 + try:
1.100 + sep = self.tab_separated and "\t" or " "
1.101 +
1.102 + # Include any headers in the output.
1.103 +
1.104 + if self.headers:
1.105 + self.table.insert(0, self.header_values)
1.106 +
1.107 + for item in self.table:
1.108 + if self.out_converter:
1.109 + item = self.out_converter(item)
1.110 +
1.111 + # Insert defaults for empty columns.
1.112 +
1.113 + if self.out_defaults:
1.114 + item = set_defaults(list(item), self.out_defaults)
1.115 +
1.116 + # Separate the columns and write to the file.
1.117 +
1.118 + print >>f, sep.join(item)
1.119 +
1.120 + # Remove the headers from the items in case the table is
1.121 + # accessed again.
1.122 +
1.123 + if self.headers:
1.124 + del self.table[0]
1.125 +
1.126 + finally:
1.127 + f.close()
1.128 + fix_permissions(self.filename)
1.129 +
1.130 + # General collection methods.
1.131
1.132 -line_pattern = re.compile(line_pattern_str)
1.133 + def __nonzero__(self):
1.134 + return bool(self.table)
1.135 +
1.136 + # List emulation methods.
1.137 +
1.138 + def __iadd__(self, other):
1.139 + for value in other:
1.140 + self.append(value)
1.141 + return self
1.142 +
1.143 + def __iter__(self):
1.144 + return iter(self.table)
1.145 +
1.146 + def __len__(self):
1.147 + return len(self.table)
1.148
1.149 -def parse_line(text):
1.150 + def __delitem__(self, i):
1.151 + del self.table[i]
1.152 +
1.153 + def __delslice__(self, start, end):
1.154 + del self.table[start:end]
1.155 +
1.156 + def __getitem__(self, i):
1.157 + return self.table[i]
1.158 +
1.159 + def __getslice__(self, start, end):
1.160 + return self.table[start:end]
1.161 +
1.162 + def __setitem__(self, i, value):
1.163 + self.table[i] = value
1.164 +
1.165 + def __setslice__(self, start, end, values):
1.166 + self.table[start:end] = values
1.167 +
1.168 + def append(self, value):
1.169 + self.table.append(value)
1.170
1.171 - """
1.172 - Parse the given 'text', returning a list of words separated by whitespace in
1.173 - the input, where whitespace may occur inside words if quoted using single or
1.174 - double quotes.
1.175 + def insert(self, i, value):
1.176 + self.table.insert(i, value)
1.177 +
1.178 + def remove(self, value):
1.179 + self.table.remove(value)
1.180 +
1.181 + # Dictionary emulation methods (even though this is not a mapping).
1.182 +
1.183 + def clear(self):
1.184 + del self.table[:]
1.185 +
1.186 + # Additional modification methods.
1.187 +
1.188 + def replaceall(self, values):
1.189 + self.table[:] = values
1.190 +
1.191 +class FileTableDict(FileTable):
1.192 +
1.193 + "A file-based table acting as a dictionary."
1.194 +
1.195 + def __init__(self, filename, mutable=True,
1.196 + in_defaults=None, out_defaults=None,
1.197 + in_converter=None, out_converter=None,
1.198 + tab_separated=True, headers=False):
1.199 +
1.200 + FileTable.__init__(self, filename, mutable, in_defaults, out_defaults,
1.201 + in_converter, out_converter, tab_separated, headers)
1.202 + self.mapping = dict(self.table)
1.203 +
1.204 + def close(self):
1.205 + self.table = self.mapping.items()
1.206 + FileTable.close(self)
1.207 +
1.208 + # General collection methods.
1.209
1.210 - Hello world -> ['Hello', 'world']
1.211 - Hello ' world' -> ['Hello', ' world']
1.212 - Hello' 'world -> ["'Hello'", "'world']
1.213 - """
1.214 + def __nonzero__(self):
1.215 + return bool(self.mapping)
1.216 +
1.217 + # List emulation methods.
1.218 +
1.219 + def __iter__(self):
1.220 + return iter(self.mapping)
1.221 +
1.222 + def __len__(self):
1.223 + return len(self.mapping)
1.224 +
1.225 + def append(self, value):
1.226 + key, value = value
1.227 + self.mapping[key] = value
1.228 +
1.229 + def insert(self, i, value):
1.230 + self.append(value)
1.231 +
1.232 + def remove(self, value):
1.233 + key, value = value
1.234 + del self.mapping[key]
1.235 +
1.236 + # Unimplemented methods.
1.237 +
1.238 + def __delslice__(self, start, end):
1.239 + raise NotImplementedError, "__delslice__"
1.240 +
1.241 + def __getslice__(self, start, end):
1.242 + raise NotImplementedError, "__getslice__"
1.243 +
1.244 + def __setslice__(self, start, end, values):
1.245 + raise NotImplementedError, "__setslice__"
1.246 +
1.247 + # Dictionary emulation methods.
1.248 +
1.249 + def clear(self):
1.250 + self.mapping.clear()
1.251
1.252 - parts = []
1.253 + def get(self, i, default=None):
1.254 + return self.mapping.get(i, default)
1.255 +
1.256 + def keys(self):
1.257 + return self.mapping.keys()
1.258 +
1.259 + def items(self):
1.260 + return self.mapping.items()
1.261 +
1.262 + def update(self, other):
1.263 + self.mapping.update(other)
1.264 +
1.265 + def values(self):
1.266 + return self.mapping.values()
1.267 +
1.268 + def __delitem__(self, i):
1.269 + del self.mapping[i]
1.270
1.271 - # Match the components of each part.
1.272 + def __getitem__(self, i):
1.273 + return self.mapping[i]
1.274 +
1.275 + def __setitem__(self, i, value):
1.276 + if self.mutable:
1.277 + self.mapping[i] = value
1.278 +
1.279 + # Additional modification methods.
1.280
1.281 - for match in line_pattern.finditer(text):
1.282 + def replaceall(self, values):
1.283 + self.mapping = {}
1.284 + self.mapping.update(dict(values))
1.285 +
1.286 + def updateall(self, mapping):
1.287 + self.mapping = {}
1.288 + self.mapping.update(mapping)
1.289 +
1.290 +def first(t):
1.291 + return t[0]
1.292
1.293 - # Combine the components by traversing the matching groups.
1.294 +def tuplevalue(v):
1.295 + return (v,)
1.296 +
1.297 +class FileTableSingle(FileTable):
1.298 +
1.299 + "A file-based table providing single value items."
1.300 +
1.301 + def __iter__(self):
1.302 + return iter(self[:])
1.303 +
1.304 + def __getitem__(self, i):
1.305 + return self.table[i][0]
1.306 +
1.307 + def __getslice__(self, start, end):
1.308 + return map(first, self.table[start:end])
1.309 +
1.310 + def __setitem__(self, i, value):
1.311 + self.table[i] = [(value,)]
1.312
1.313 - parts.append(reduce(lambda a, b: (a or "") + (b or ""), match.groups()))
1.314 + def __setslice__(self, start, end, values):
1.315 + self.table[start:end] = map(tuplevalue, values)
1.316 +
1.317 + def append(self, value):
1.318 + self.table.append((value,))
1.319 +
1.320 + def insert(self, i, value):
1.321 + self.table.insert(i, (value,))
1.322
1.323 - return parts
1.324 + def remove(self, value):
1.325 + self.table.remove((value,))
1.326 +
1.327 + # Additional modification methods.
1.328 +
1.329 + def replaceall(self, values):
1.330 + self.table[:] = map(tuplevalue, values)
1.331 +
1.332 +
1.333
1.334 # Parsing of tabular files.
1.335
1.336 @@ -129,4 +387,45 @@
1.337
1.338 return l
1.339
1.340 +
1.341 +
1.342 +# Parsing of lines to obtain functions and arguments.
1.343 +
1.344 +line_pattern_str = (
1.345 + r"(?:"
1.346 + r"(?:'(.*?)')" # single-quoted text
1.347 + r"|"
1.348 + r'(?:"(.*?)")' # double-quoted text
1.349 + r"|"
1.350 + r"([^\s]+)" # non-whitespace characters
1.351 + r")+"
1.352 + r"(?:\s+|$)" # optional trailing whitespace before line end
1.353 + )
1.354 +
1.355 +line_pattern = re.compile(line_pattern_str)
1.356 +
1.357 +def parse_line(text):
1.358 +
1.359 + """
1.360 + Parse the given 'text', returning a list of words separated by whitespace in
1.361 + the input, where whitespace may occur inside words if quoted using single or
1.362 + double quotes.
1.363 +
1.364 + Hello world -> ['Hello', 'world']
1.365 + Hello ' world' -> ['Hello', ' world']
1.366 + Hello' 'world -> ["Hello'", "'world"]
1.367 + """
1.368 +
1.369 + parts = []
1.370 +
1.371 + # Match the components of each part.
1.372 +
1.373 + for match in line_pattern.finditer(text):
1.374 +
1.375 + # Combine the components by traversing the matching groups.
1.376 +
1.377 + parts.append(reduce(lambda a, b: (a or "") + (b or ""), match.groups()))
1.378 +
1.379 + return parts
1.380 +
1.381 # vim: tabstop=4 expandtab shiftwidth=4