#!/usr/bin/env python
# vContent (file vContent.py at 007ab02ae5c9)

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64
import binascii
import codecs
import quopri

# Tokenisation help.

import re

# Compatibility helpers.

# The text type produced when decoding quoted-printable values: 'unicode'
# where that built-in exists, 'str' otherwise.

try:
    _text_type = unicode
except NameError:
    _text_type = str

# The base64 encoder is called 'encodebytes' in newer library versions and
# 'encodestring' in older ones; use whichever is provided.

_base64_encode = getattr(base64, "encodebytes", None) or base64.encodestring

def _native_ascii(data):

    """
    Return 'data' unchanged if it is already a native string, otherwise decode
    it as ASCII. The encoders used here only emit ASCII characters but may
    return byte strings which cannot be mixed with native strings.
    """

    if isinstance(data, str):
        return data
    return data.decode("ascii")

# Configuration.

default_encoding = "utf-8"

# Exceptions.

class ParseError(Exception):

    "An error indicating that the parsed content is structurally invalid."

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        The line number is advanced on every call, including calls which
        return an empty string at the end of the file.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()

        # Sanity check for broken lines (\r instead of \r\n or \n): unless
        # CR-terminated lines are permitted, keep appending input until the
        # text no longer ends with a bare CR or the input is exhausted.

        line = self.f.readline()
        while line.endswith("\r") and not self.non_standard_newline:
            s = self.f.readline()
            if not s:
                break
            line += s

        # Where CR-terminated lines are permitted, present them with a
        # trailing LF as well.

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"
        else:
            return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is
        returned when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text
        self.start = 0 # position from which the next search begins

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets appearing between double quotes are ignored. The double quote
        characters themselves are retained in the returned string.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        # Continue any subsequent search after the target just found.

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'line_number' attribute and the 'get_content_line' and 'close' methods
        (as provided by Reader objects).
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more
        content is available.
        """

        return self.parse_content_line()

    # Support both spellings of the iterator protocol method.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        Parameters given without a value are recorded with a value of None.
        StopIteration is raised at the end of the input, and ValueError is
        raised with the line number and content line as arguments if the
        content line has no value part.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, replacing any earlier one of the same
            # name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as text decoded using the
        CHARSET parameter (or ISO-8859-1 if absent), base64 values are
        returned as the decoded binary data, and all other values are returned
        with only quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return _text_type(quopri.decodestring(value), charset or "iso-8859-1")
        elif encoding == "BASE64":
            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the 'startComponent', 'endComponent' and
    'handleProperty' methods invoked by the 'parse' method.
    """

    def __init__(self):

        "Initialise the parser."

        self.names = [] # names of the components currently open

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', using an instance of 'parser_cls'
        (or StreamParser if not specified) to obtain the content items.

        ParseError is raised if an END declaration does not match the most
        recent unmatched BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":
                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the component
        at the top of the active component stack. Where no component is
        active, the given 'component' is not attached to anything.
        """

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Where 'value' is false (such as None or an empty string), a
        new empty list is used instead.
        """

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first component
        retained on the component stack. IndexError is raised if the content
        did not begin any component.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        NOTE: continuation lines start with a single space, so a line length
        of less than two leaves no room for content on such lines.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length
        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, starting a continuation line (CR, LF and
        a single space) whenever the current line becomes full.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")

                # The continuation line already contains the leading space.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the stream writer with the given 'f' stream object, which
        must provide the 'write' and 'end_line' methods (as provided by Writer
        objects).
        """

        self.f = f

    def append(self, record):

        "Write the given 'record', a (name, parameters, value) tuple."

        self.write(*record)

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters without values are produced by the parser for
            # content such as "TEL;HOME:..." and are written in the same form.

            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        Quoted-printable values are first converted using the CHARSET
        parameter (or ISO-8859-1 if absent). Newline characters in the result
        are always quoted.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = _native_ascii(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":
            value = _native_ascii(_base64_encode(value))

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary and leaving 'parameters' unchanged.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted. Parameters without values (None)
            # are passed through untouched.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it is an input stream. Otherwise, treat it as
    a filename and open the file for reading using the given 'encoding' (or
    the default encoding if not specified).
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it provides a 'write' attribute. Otherwise,
    treat it as a filename and open the file for writing using the given
    'encoding' (or the default encoding if not specified).
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class (Parser by
    default) which is instantiated to build the result.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams, leaving streams supplied by the caller open.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class (StreamParser
    by default) which is instantiated with the reader.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). The underlying stream is not
    closed by this function; the 'close' method of the returned object can be
    used for this purpose.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class (StreamWriter
    by default) which is instantiated to provide the result.

    IOError is raised if neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

# vim: tabstop=4 expandtab shiftwidth=4