#!/usr/bin/env python

"""
Parsing of textual content.

Copyright (C) 2014, 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.
"""

from imiptools.filesys import fix_permissions
from os.path import isfile
import codecs
import re

def have_table(obj, filename):

    "Return whether 'obj' is a table using the given 'filename'."

    return hasattr(obj, "get_filename") and obj.get_filename() == filename

class FileTable:

    """
    A file-based data table.

    The table is read completely when the object is created and is held as a
    list of items in the 'table' attribute. Modifications are only written
    back to the file when 'close' is called on a mutable table.
    """

    def __init__(self, filename, mutable=True,
        in_defaults=None, out_defaults=None,
        in_converter=None, out_converter=None,
        tab_separated=True, headers=False):

        """
        Open the table from the file having the given 'filename'. If 'mutable'
        is given as a true value (as is the default), the table can be modified.

        The 'in_defaults' is a list of (index, value) tuples indicating the
        default value where a column either does not exist or provides an empty
        value. The 'out_defaults' is a corresponding list used to serialise
        missing and empty values.

        The 'in_converter' is a callable accepting a tuple of values and
        returning an object. The corresponding 'out_converter' accepts an object
        and returns a tuple of values.

        If 'tab_separated' is specified and is a false value, line parsing using
        the imiptools.text.parse_line function will be performed instead of
        splitting each line of the file using tab characters as separators.

        If 'headers' is specified and is not false, the first line in the table
        will provide header value information.
        """

        self.filename = filename
        self.mutable = mutable
        self.in_defaults = in_defaults
        self.out_defaults = out_defaults
        self.in_converter = in_converter
        self.out_converter = out_converter
        self.tab_separated = tab_separated

        # Obtain the items. In subsequent implementations, the items could be
        # retrieved dynamically. A missing file yields an empty table.

        items = []

        if isfile(filename):
            for item in get_table(filename, in_defaults, tab_separated):

                # The converter is applied to every line, including any header
                # line.

                if self.in_converter:
                    item = self.in_converter(item)
                items.append(item)

        # Obtain header values and separate them from the rest of the data.

        if headers and items:
            self.table = items[1:]
            self.header_values = items[0] or []
        else:
            self.table = items
            self.header_values = []

        self.headers = headers

    def get_filename(self):

        "Return the filename of the file providing this table."

        return self.filename

    def get_header_values(self):

        "Return the header values, or an empty list if none are defined."

        return self.header_values

    def set_header_values(self, values):

        "Set the header 'values' to be written when the table is closed."

        self.header_values = values

    def close(self):

        """
        Write any modifications and close the table. Nothing is written if the
        table is not mutable.
        """

        if self.mutable:
            f = codecs.open(self.filename, "wb", encoding="utf-8")
            try:
                sep = self.tab_separated and "\t" or " "

                # Include any headers in the output. A separate list of rows is
                # built so that the table itself never contains the headers,
                # even if writing fails part-way through.

                rows = self.table

                if self.headers:
                    rows = [self.header_values] + list(self.table)

                for item in rows:
                    if self.out_converter:
                        item = self.out_converter(item)

                    # Insert defaults for empty columns.

                    if self.out_defaults:
                        item = set_defaults(list(item), self.out_defaults)

                    # Separate the columns and write to the file.

                    f.write(sep.join(item) + "\n")

            finally:
                f.close()

                # NOTE(review): fix_permissions is provided by
                # imiptools.filesys; it is applied to the written file whether
                # or not writing succeeded.

                fix_permissions(self.filename)

    # General collection methods.

    def __nonzero__(self):
        return bool(self.table)

    # Truth testing name used by interpreters that do not call __nonzero__.

    __bool__ = __nonzero__

    # List emulation methods.

    def __iadd__(self, other):
        for value in other:
            self.append(value)
        return self

    def __iter__(self):
        return iter(self.table)

    def __len__(self):
        return len(self.table)

    def __delitem__(self, i):
        del self.table[i]

    def __delslice__(self, start, end):
        del self.table[start:end]

    def __getitem__(self, i):
        return self.table[i]

    def __getslice__(self, start, end):
        return self.table[start:end]

    def __setitem__(self, i, value):
        self.table[i] = value

    def __setslice__(self, start, end, values):
        self.table[start:end] = values

    def append(self, value):
        self.table.append(value)

    def insert(self, i, value):
        self.table.insert(i, value)

    def remove(self, value):
        self.table.remove(value)

    # Dictionary emulation methods (even though this is not a mapping).

    def clear(self):
        del self.table[:]

    # Additional modification methods.

    def replaceall(self, values):

        "Replace the entire contents of the table with the given 'values'."

        self.table[:] = values

class FileTableDict(FileTable):

    """
    A file-based table acting as a dictionary.

    Each item in the underlying table must be a (key, value) pair. The pairs
    are held in the 'mapping' attribute, which replaces the table contents when
    the table is closed.
    """

    def __init__(self, filename, mutable=True,
        in_defaults=None, out_defaults=None,
        in_converter=None, out_converter=None,
        tab_separated=True, headers=False):

        """
        Open the table as described for FileTable, building a mapping from the
        (key, value) items read from the file.
        """

        FileTable.__init__(self, filename, mutable, in_defaults, out_defaults,
            in_converter, out_converter, tab_separated, headers)
        self.mapping = dict(self.table)

    def close(self):

        "Write the mapping as a table of (key, value) items and close."

        # A real list is needed by the superclass method.

        self.table = list(self.mapping.items())
        FileTable.close(self)

    # General collection methods.

    def __nonzero__(self):
        return bool(self.mapping)

    __bool__ = __nonzero__

    # List emulation methods.

    def __iter__(self):
        return iter(self.mapping)

    def __len__(self):
        return len(self.mapping)

    def append(self, value):

        "Store the given 'value', being a (key, value) pair, in the mapping."

        key, value = value
        self.mapping[key] = value

    def insert(self, i, value):

        "Store the (key, value) pair 'value', ignoring the position 'i'."

        self.append(value)

    def remove(self, value):

        "Remove the key of the given (key, value) pair 'value'."

        key, value = value
        del self.mapping[key]

    # Unimplemented methods.

    def __delslice__(self, start, end):
        raise NotImplementedError("__delslice__")

    def __getslice__(self, start, end):
        raise NotImplementedError("__getslice__")

    def __setslice__(self, start, end, values):
        raise NotImplementedError("__setslice__")

    # Dictionary emulation methods.

    def clear(self):
        self.mapping.clear()

    def get(self, i, default=None):
        return self.mapping.get(i, default)

    def keys(self):
        return self.mapping.keys()

    def items(self):
        return self.mapping.items()

    def update(self, other):
        self.mapping.update(other)

    def values(self):
        return self.mapping.values()

    def __delitem__(self, i):
        del self.mapping[i]

    def __getitem__(self, i):
        return self.mapping[i]

    def __setitem__(self, i, value):

        # Unlike the other modification methods, assignment is silently ignored
        # when the table is not mutable.

        if self.mutable:
            self.mapping[i] = value

    # Additional modification methods.

    def replaceall(self, values):

        "Replace the mapping using 'values', a collection of (key, value) pairs."

        self.mapping = {}
        self.mapping.update(dict(values))

    def updateall(self, mapping):

        "Replace the mapping with a copy of the entries of 'mapping'."

        self.mapping = {}
        self.mapping.update(mapping)

def first(t):

    "Return the first element of 't'."

    return t[0]

def tuplevalue(v):

    "Return 'v' wrapped in a single-element tuple."

    return (v,)

class FileTableSingle(FileTable):

    """
    A file-based table providing single value items.

    Items are stored in the table as single-element tuples but are presented
    to, and accepted from, callers as plain values.
    """

    def __iter__(self):

        # Iterate over a snapshot of the values.

        return iter([first(t) for t in self.table])

    def __getitem__(self, i):

        # Slice objects yield a list of values, not the first stored tuple.

        if isinstance(i, slice):
            return [first(t) for t in self.table[i]]
        return self.table[i][0]

    def __getslice__(self, start, end):
        return [first(t) for t in self.table[start:end]]

    def __setitem__(self, i, value):

        # Store items in the same single-element tuple form used by append,
        # insert and the file reader, so that retrieval yields the plain value.

        if isinstance(i, slice):
            self.table[i] = [tuplevalue(v) for v in value]
        else:
            self.table[i] = (value,)

    def __setslice__(self, start, end, values):
        self.table[start:end] = [tuplevalue(v) for v in values]

    def append(self, value):
        self.table.append((value,))

    def insert(self, i, value):
        self.table.insert(i, (value,))

    def remove(self, value):
        self.table.remove((value,))

    # Additional modification methods.

    def replaceall(self, values):

        "Replace the entire contents of the table with the given 'values'."

        self.table[:] = [tuplevalue(v) for v in values]



# Parsing of tabular files.

def set_defaults(t, empty_defaults):

    """
    In the list 't', replace values that are empty or absent with defaults
    provided by the 'empty_defaults' collection whose entries are of the form
    (index, value).

    The list is extended with None values where an index lies beyond its end.
    The list is modified in place and is also returned.
    """

    for i, default in empty_defaults:
        if i >= len(t):
            t += [None] * (i - len(t) + 1)
        if not t[i]:
            t[i] = default
    return t

def get_table(filename, empty_defaults=None, tab_separated=True):

    """
    From the file having the given 'filename', return a list of tuples
    representing the file's contents. The file is read as UTF-8 text.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    f = codecs.open(filename, "rb", encoding="utf-8")
    try:
        return get_table_from_stream(f, empty_defaults, tab_separated)
    finally:
        f.close()

def get_table_from_stream(f, empty_defaults=None, tab_separated=True):

    """
    Return a list of tuples representing the contents of the stream 'f'.

    The 'empty_defaults' is a list of (index, value) tuples indicating the
    default value where a column either does not exist or provides an empty
    value.

    If 'tab_separated' is specified and is a false value, line parsing using
    the imiptools.text.parse_line function will be performed instead of
    splitting each line of the file using tab characters as separators.
    """

    rows = []

    for line in f.readlines():

        # Only spaces and line endings are removed: tab characters at either
        # end of a line are kept because they delimit empty columns.

        line = line.strip(" \r\n")

        if tab_separated:
            t = line.split("\t")
        else:
            t = parse_line(line)

        if empty_defaults:
            t = set_defaults(t, empty_defaults)
        rows.append(tuple(t))

    return rows



# Parsing of lines to obtain functions and arguments.

line_pattern_str = (
    r"(?:"
    r"(?:'(.*?)')"          # single-quoted text
    r"|"
    r'(?:"(.*?)")'          # double-quoted text
    r"|"
    r"([^\s]+)"             # non-whitespace characters
    r")+"
    r"(?:\s+|$)"            # optional trailing whitespace before line end
    )

line_pattern = re.compile(line_pattern_str)

def parse_line(text):

    """
    Parse the given 'text', returning a list of words separated by whitespace in
    the input, where whitespace may occur inside words if quoted using single or
    double quotes.

    Hello world     -> ['Hello', 'world']
    Hello ' world'  -> ['Hello', ' world']
    Hello' 'world   -> ["Hello'", "'world"]

    As the last example shows, a quote character is only treated as quoting
    where a complete quoted region begins at the current position in a word;
    otherwise it is consumed as an ordinary non-whitespace character.
    """

    parts = []

    # Match the components of each part.

    for match in line_pattern.finditer(text):

        # Combine the components by joining the matching groups in group order,
        # treating groups that did not participate as empty.
        # NOTE: the groups occur inside a repetition, so each one holds only
        # its last match within a word.

        parts.append("".join([group or "" for group in match.groups()]))

    return parts

# vim: tabstop=4 expandtab shiftwidth=4