imip-agent

imiptools/text.py

1267:7819b77d9330
2017-09-15 Paul Boddie Introduced a tentative means of classifying periods for suitable operations upon updating an event.
     1 #!/usr/bin/env python     2      3 """     4 Parsing of textual content.     5      6 Copyright (C) 2014, 2015, 2016, 2017 Paul Boddie <paul@boddie.org.uk>     7      8 This program is free software; you can redistribute it and/or modify it under     9 the terms of the GNU General Public License as published by the Free Software    10 Foundation; either version 3 of the License, or (at your option) any later    11 version.    12     13 This program is distributed in the hope that it will be useful, but WITHOUT    14 ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS    15 FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more    16 details.    17     18 You should have received a copy of the GNU General Public License along with    19 this program.  If not, see <http://www.gnu.org/licenses/>.    20 """    21     22 from imiptools.filesys import fix_permissions    23 from os.path import isfile    24 import codecs    25 import re    26     27 def have_table(obj, filename):    28     29     "Return whether 'obj' is a table using the given 'filename'."    30     31     return hasattr(obj, "get_filename") and obj.get_filename() == filename    32     33 class FileTable:    34     35     "A file-based data table."    36     37     def __init__(self, filename, mutable=True,    38                  in_defaults=None, out_defaults=None,    39                  in_converter=None, out_converter=None,    40                  tab_separated=True, headers=False):    41     42         """    43         Open the table from the file having the given 'filename'. If 'mutable'    44         is given as a true value (as is the default), the table can be modified.    45     46         The 'in_defaults' is a list of (index, value) tuples indicating the    47         default value where a column either does not exist or provides an empty    48         value. The 'out_defaults' is a corresponding list used to serialise    49         missing and empty values.    50     51         The 'in_converter' is a callable accepting a tuple of values and    52         returning an object. The corresponding 'out_converter' accepts an object    53         and returns a tuple of values.    54     55         If 'tab_separated' is specified and is a false value, line parsing using    56         the imiptools.text.parse_line function will be performed instead of    57         splitting each line of the file using tab characters as separators.    58     59         If 'headers' is specified and is not false, the first line in the table    60         will provide header value information.    61         """    62     63         self.filename = filename    64         self.mutable = mutable    65         self.in_defaults = in_defaults    66         self.out_defaults = out_defaults    67         self.in_converter = in_converter    68         self.out_converter = out_converter    69         self.tab_separated = tab_separated    70     71         # Obtain the items. In subsequent implementations, the items could be    72         # retrieved dynamically.    73     74         items = []    75     76         if isfile(filename):    77             for item in get_table(filename, in_defaults, tab_separated):    78                 if self.in_converter:    79                     item = self.in_converter(item)    80                 items.append(item)    81     82         # Obtain header values and separate them from the rest of the data.    83     84         self.table = items[headers and 1 or 0:]    85         self.header_values = headers and items and items[0] or []    86         self.headers = headers    87     88     def get_filename(self):    89         return self.filename    90     91     def get_header_values(self):    92         return self.header_values    93     94     def set_header_values(self, values):    95         self.header_values = values    96     97     def close(self):    98     99         "Write any modifications and close the table."   100    101         if self.mutable:   102             f = codecs.open(self.filename, "wb", encoding="utf-8")   103             try:   104                 sep = self.tab_separated and "\t" or " "   105    106                 # Include any headers in the output.   107    108                 if self.headers:   109                     self.table.insert(0, self.header_values)   110    111                 for item in self.table:   112                     if self.out_converter:   113                         item = self.out_converter(item)   114    115                     # Insert defaults for empty columns.   116    117                     if self.out_defaults:   118                         item = set_defaults(list(item), self.out_defaults)   119    120                     # Separate the columns and write to the file.   121    122                     print >>f, sep.join(item)   123    124                 # Remove the headers from the items in case the table is   125                 # accessed again.   126    127                 if self.headers:   128                     del self.table[0]   129    130             finally:   131                 f.close()   132                 fix_permissions(self.filename)   133    134     # General collection methods.   135    136     def __nonzero__(self):   137         return bool(self.table)   138    139     # List emulation methods.   140    141     def __iadd__(self, other):   142         for value in other:   143             self.append(value)   144         return self   145    146     def __iter__(self):   147         return iter(self.table)   148    149     def __len__(self):   150         return len(self.table)   151    152     def __delitem__(self, i):   153         del self.table[i]   154    155     def __delslice__(self, start, end):   156         del self.table[start:end]   157    158     def __getitem__(self, i):   159         return self.table[i]   160    161     def __getslice__(self, start, end):   162         return self.table[start:end]   163    164     def __setitem__(self, i, value):   165         self.table[i] = value   166    167     def __setslice__(self, start, end, values):   168         self.table[start:end] = values   169    170     def append(self, value):   171         self.table.append(value)   172    173     def insert(self, i, value):   174         self.table.insert(i, value)   175    176     def remove(self, value):   177         self.table.remove(value)   178    179     # Dictionary emulation methods (even though this is not a mapping).   180    181     def clear(self):   182         del self.table[:]   183    184     # Additional modification methods.   185    186     def replaceall(self, values):   187         self.table[:] = values   188    189 class FileTableDict(FileTable):   190    191     "A file-based table acting as a dictionary."   192    193     def __init__(self, filename, mutable=True,   194                  in_defaults=None, out_defaults=None,   195                  in_converter=None, out_converter=None,   196                  tab_separated=True, headers=False):   197    198         FileTable.__init__(self, filename, mutable, in_defaults, out_defaults,   199                            in_converter, out_converter, tab_separated, headers)   200         self.mapping = dict(self.table)   201    202     def close(self):   203         self.table = self.mapping.items()   204         FileTable.close(self)   205    206     # General collection methods.   207    208     def __nonzero__(self):   209         return bool(self.mapping)   210    211     # List emulation methods.   212    213     def __iter__(self):   214         return iter(self.mapping)   215    216     def __len__(self):   217         return len(self.mapping)   218    219     def append(self, value):   220         key, value = value   221         self.mapping[key] = value   222    223     def insert(self, i, value):   224         self.append(value)   225    226     def remove(self, value):   227         key, value = value   228         del self.mapping[key]   229    230     # Unimplemented methods.   231    232     def __delslice__(self, start, end):   233         raise NotImplementedError, "__delslice__"   234    235     def __getslice__(self, start, end):   236         raise NotImplementedError, "__getslice__"   237    238     def __setslice__(self, start, end, values):   239         raise NotImplementedError, "__setslice__"   240    241     # Dictionary emulation methods.   242    243     def clear(self):   244         self.mapping.clear()   245    246     def get(self, i, default=None):   247         return self.mapping.get(i, default)   248    249     def keys(self):   250         return self.mapping.keys()   251    252     def items(self):   253         return self.mapping.items()   254    255     def update(self, other):   256         self.mapping.update(other)   257    258     def values(self):   259         return self.mapping.values()   260    261     def __delitem__(self, i):   262         del self.mapping[i]   263    264     def __getitem__(self, i):   265         return self.mapping[i]   266    267     def __setitem__(self, i, value):   268         if self.mutable:   269             self.mapping[i] = value   270    271     # Additional modification methods.   272    273     def replaceall(self, values):   274         self.mapping = {}   275         self.mapping.update(dict(values))   276    277     def updateall(self, mapping):   278         self.mapping = {}   279         self.mapping.update(mapping)   280    281 def first(t):   282     return t[0]   283    284 def tuplevalue(v):   285     return (v,)   286    287 class FileTableSingle(FileTable):   288    289     "A file-based table providing single value items."   290    291     def __iter__(self):   292         return iter(self[:])   293    294     def __getitem__(self, i):   295         return self.table[i][0]   296    297     def __getslice__(self, start, end):   298         return map(first, self.table[start:end])   299    300     def __setitem__(self, i, value):   301         self.table[i] = [(value,)]   302    303     def __setslice__(self, start, end, values):   304         self.table[start:end] = map(tuplevalue, values)   305    306     def append(self, value):   307         self.table.append((value,))   308    309     def insert(self, i, value):   310         self.table.insert(i, (value,))   311    312     def remove(self, value):   313         self.table.remove((value,))   314    315     # Additional modification methods.   316    317     def replaceall(self, values):   318         self.table[:] = map(tuplevalue, values)   319    320    321    322 # Parsing of tabular files.   323    324 def set_defaults(t, empty_defaults):   325    326     """   327     In the list 't', replace values that are empty or absent with defaults   328     provided by the 'empty_defaults' collection whose entries are of the form   329     (index, value).   330     """   331    332     for i, default in empty_defaults:   333         if i >= len(t):   334             t += [None] * (i - len(t) + 1)   335         if not t[i]:   336             t[i] = default   337     return t   338    339 def get_table(filename, empty_defaults=None, tab_separated=True):   340    341     """   342     From the file having the given 'filename', return a list of tuples   343     representing the file's contents.   344    345     The 'empty_defaults' is a list of (index, value) tuples indicating the   346     default value where a column either does not exist or provides an empty   347     value.   348    349     If 'tab_separated' is specified and is a false value, line parsing using   350     the imiptools.text.parse_line function will be performed instead of   351     splitting each line of the file using tab characters as separators.   352     """   353    354     f = codecs.open(filename, "rb", encoding="utf-8")   355     try:   356         return get_table_from_stream(f, empty_defaults, tab_separated)   357     finally:   358         f.close()   359    360 def get_table_from_stream(f, empty_defaults=None, tab_separated=True):   361    362     """   363     Return a list of tuples representing the contents of the stream 'f'.   364    365     The 'empty_defaults' is a list of (index, value) tuples indicating the   366     default value where a column either does not exist or provides an empty   367     value.   368    369     If 'tab_separated' is specified and is a false value, line parsing using   370     the imiptools.text.parse_line function will be performed instead of   371     splitting each line of the file using tab characters as separators.   372     """   373    374     l = []   375    376     for line in f.readlines():   377         line = line.strip(" \r\n")   378    379         if tab_separated:   380             t = line.split("\t")   381         else:   382             t = parse_line(line)   383    384         if empty_defaults:   385             t = set_defaults(t, empty_defaults)   386         l.append(tuple(t))   387    388     return l   389    390    391    392 # Parsing of lines to obtain functions and arguments.   393    394 line_pattern_str = (   395                    r"(?:"   396                    r"(?:'(.*?)')" # single-quoted text   397                    r"|"   398                    r'(?:"(.*?)")' # double-quoted text   399                    r"|"   400                    r"([^\s]+)"    # non-whitespace characters   401                    r")+"   402                    r"(?:\s+|$)"   # optional trailing whitespace before line end   403                    )   404    405 line_pattern = re.compile(line_pattern_str)   406    407 def parse_line(text):   408    409     """   410     Parse the given 'text', returning a list of words separated by whitespace in   411     the input, where whitespace may occur inside words if quoted using single or   412     double quotes.   413    414     Hello world     -> ['Hello', 'world']   415     Hello ' world'  -> ['Hello', ' world']   416     Hello' 'world   -> ["'Hello'", "'world']   417     """   418    419     parts = []   420    421     # Match the components of each part.   422    423     for match in line_pattern.finditer(text):   424    425         # Combine the components by traversing the matching groups.   426    427         parts.append(reduce(lambda a, b: (a or "") + (b or ""), match.groups()))   428    429     return parts   430    431 # vim: tabstop=4 expandtab shiftwidth=4