Created a vContent module based on essential parts of the RDFCalendar.Parsers module.

     1.1 --- /dev/null	Thu Jan 01 00:00:00 1970 +0000
     1.2 +++ b/vContent.py	Thu Oct 16 21:55:30 2008 +0200
     1.3 @@ -0,0 +1,319 @@
     1.4 +#!/usr/bin/env python
     1.5 +
     1.6 +"""
     1.7 +Parsing of vCard, vCalendar and iCalendar files.
     1.8 +
     1.9 +Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
    1.10 +
    1.11 +This program is free software; you can redistribute it and/or modify it under
    1.12 +the terms of the GNU Lesser General Public License as published by the Free
    1.13 +Software Foundation; either version 3 of the License, or (at your option) any
    1.14 +later version.
    1.15 +
    1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
    1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
    1.18 +FOR A PARTICULAR PURPOSE.  See the GNU Lesser General Public License for more
    1.19 +details.
    1.20 +
    1.21 +You should have received a copy of the GNU Lesser General Public License along
    1.22 +with this program.  If not, see <http://www.gnu.org/licenses/>.
    1.23 +
    1.24 +--------
    1.25 +
    1.26 +References:
    1.27 +
    1.28 +RFC 2445: Internet Calendaring and Scheduling Core Object Specification
    1.29 +          (iCalendar)
    1.30 +          http://rfc.net/rfc2445.html
    1.31 +
    1.32 +RFC 2425: A MIME Content-Type for Directory Information
    1.33 +          http://rfc.net/rfc2425.html
    1.34 +
    1.35 +RFC 2426: vCard MIME Directory Profile
    1.36 +          http://rfc.net/rfc2426.html
    1.37 +"""
    1.38 +
    1.39 +# Encoding-related imports.
    1.40 +
    1.41 +import base64, quopri
    1.42 +
    1.43 +# Simple reader class.
    1.44 +
    1.45 +class Reader:
    1.46 +
    1.47 +    "A simple class wrapping a file, providing simple pushback capabilities."
    1.48 +
    1.49 +    def __init__(self, f, non_standard_newline=0):
    1.50 +
    1.51 +        """
    1.52 +        Initialise the object with the file 'f'. If 'non_standard_newline' is
    1.53 +        set to a true value (unlike the default), lines ending with CR will be
    1.54 +        treated as complete lines.
    1.55 +        """
    1.56 +
    1.57 +        self.f = f
    1.58 +        self.non_standard_newline = non_standard_newline
    1.59 +        self.lines = []
    1.60 +        self.line_number = 0
    1.61 +
    1.62 +    def pushback(self, line):
    1.63 +
    1.64 +        """
    1.65 +        Push the given 'line' back so that the next line read is actually the
    1.66 +        given 'line' and not the next line from the underlying file.
    1.67 +        """
    1.68 +
    1.69 +        self.lines.append(line)
    1.70 +        self.line_number -= 1
    1.71 +
    1.72 +    def readline(self):
    1.73 +
    1.74 +        """
    1.75 +        If no pushed-back lines exist, read a line directly from the file.
    1.76 +        Otherwise, read from the list of pushed-back lines.
    1.77 +        """
    1.78 +
    1.79 +        self.line_number += 1
    1.80 +        if self.lines:
    1.81 +            return self.lines.pop()
    1.82 +        else:
    1.83 +            # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
    1.84 +            line = self.f.readline()
    1.85 +            while line.endswith("\r") and not self.non_standard_newline:
    1.86 +                line += self.f.readline()
    1.87 +            if line.endswith("\r") and self.non_standard_newline:
    1.88 +                return line + "\n"
    1.89 +            else:
    1.90 +                return line
    1.91 +
    1.92 +    def read_until(self, targets):
    1.93 +
    1.94 +        """
    1.95 +        Read from the stream until one of the 'targets' is seen. Return the
    1.96 +        string from the current position up to the target found, along with the
    1.97 +        target string, using a tuple of the form (string, target). If no target
    1.98 +        was found, return the entire string together with a target of None.
    1.99 +        """
   1.100 +
   1.101 +        indexes = {}
   1.102 +
   1.103 +        # Remember the entire text read and the index of the current line in
   1.104 +        # that text.
   1.105 +
   1.106 +        lines = []
   1.107 +
   1.108 +        line = self.readline()
   1.109 +        lines.append(line)
   1.110 +        start = 0
   1.111 +
   1.112 +        while indexes == {} and line != "":
   1.113 +            for target in targets:
   1.114 +                index = line.find(target)
   1.115 +
   1.116 +                # Always choose the first matching target.
   1.117 +
   1.118 +                if index != -1 and not indexes.has_key(start + index):
   1.119 +                    indexes[start + index] = target
   1.120 +
   1.121 +            start += len(line)
   1.122 +            line = self.readline()
   1.123 +            lines.append(line)
   1.124 +
   1.125 +        text = "".join(lines)
   1.126 +
   1.127 +        if indexes:
   1.128 +            min_index = reduce(min, indexes.keys())
   1.129 +            target = indexes[min_index]
   1.130 +
   1.131 +            # Skip the target.
   1.132 +            # Since the end of the buffer should always be a newline, ignore the
   1.133 +            # last element.
   1.134 +
   1.135 +            lines = text[min_index + len(target):].split("\n")[:]
   1.136 +            if not lines[-1]:
   1.137 +                del lines[-1]
   1.138 +            lines.reverse()
   1.139 +
   1.140 +            for line in lines:
   1.141 +                self.pushback(line + "\n")
   1.142 +
   1.143 +            return text[:min_index], target
   1.144 +        else:
   1.145 +            return text, None
   1.146 +
   1.147 +class StreamParser:
   1.148 +
   1.149 +    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."
   1.150 +
   1.151 +    def __init__(self, f):
   1.152 +
   1.153 +        "Initialise the parser for the given file 'f'."
   1.154 +
   1.155 +        self.f = f
   1.156 +
   1.157 +    def __iter__(self):
   1.158 +
   1.159 +        "Return self as the iterator."
   1.160 +
   1.161 +        return self
   1.162 +
   1.163 +    def next(self):
   1.164 +
   1.165 +        """
   1.166 +        Return the next content item in the file as a tuple of the form
   1.167 +        (name, parameters, values).
   1.168 +        """
   1.169 +
   1.170 +        return self.parse_content_line()
   1.171 +
   1.172 +    def parse_content_line(self):
   1.173 +
   1.174 +        """
   1.175 +        Return the name, parameters and a list containing value information for
   1.176 +        the current content line in the file being parsed.
   1.177 +        """
   1.178 +
   1.179 +        f = self.f
   1.180 +
   1.181 +        parameters = {}
   1.182 +        name, sep = f.read_until([";", ":"])
   1.183 +
   1.184 +        name = name.strip()
   1.185 +
   1.186 +        if not name and sep is None:
   1.187 +            raise StopIteration
   1.188 +
   1.189 +        while sep == ";":
   1.190 +
   1.191 +            # Find the actual modifier.
   1.192 +
   1.193 +            parameter_name, sep = f.read_until(["=", ";", ":"])
   1.194 +            parameter_name = parameter_name.strip()
   1.195 +
   1.196 +            if sep == "=":
   1.197 +                parameter_value, sep = f.read_until([";", ":"])
   1.198 +                parameter_value = parameter_value.strip()
   1.199 +            else:
   1.200 +                parameter_value = None
   1.201 +
   1.202 +            # Append a key, value tuple to the parameters list.
   1.203 +
   1.204 +            parameters[parameter_name] = parameter_value
   1.205 +
   1.206 +        # Get the value content.
   1.207 +
   1.208 +        if sep != ":":
   1.209 +            raise ValueError, f.line_number
   1.210 +
   1.211 +        # Strip all appropriate whitespace from the right end of each line.
   1.212 +        # For subsequent lines, remove the first whitespace character.
   1.213 +        # See section 4.1 of the iCalendar specification.
   1.214 +
   1.215 +        line = f.readline()
   1.216 +        value_lines = [line.rstrip("\r\n")]
   1.217 +        line = f.readline()
   1.218 +        while line != "" and line[0] in [" ", "\t"]:
   1.219 +            value_lines.append(line.rstrip("\r\n")[1:])
   1.220 +            line = f.readline()
   1.221 +
   1.222 +        # Since one line too many will have been read, push the line back into the
   1.223 +        # file.
   1.224 +
   1.225 +        f.pushback(line)
   1.226 +
   1.227 +        # Decode the value.
   1.228 +
   1.229 +        value = self.decode("".join(value_lines), parameters.get("ENCODING"))
   1.230 +
   1.231 +        return name, parameters, value
   1.232 +
   1.233 +    def decode(self, value, encoding):
   1.234 +
   1.235 +        "Decode the 'value' with the given 'encoding'."
   1.236 +
   1.237 +        # NOTE: Assuming ISO 8869-1 for the character set.
   1.238 +
   1.239 +        if encoding == "QUOTED-PRINTABLE":
   1.240 +            return unicode(quopri.decodestring(value), "iso-8859-1")
   1.241 +        elif encoding == "BASE64":
   1.242 +            return base64.decodestring(value)
   1.243 +        else:
   1.244 +            # NOTE: Introducing newline conversions.
   1.245 +            # Replace quoted characters (see 4.3.11 in RFC 2445).
   1.246 +
   1.247 +            return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")
   1.248 +
   1.249 +class Parser:
   1.250 +
   1.251 +    "A parser for content in vCard/vCalendar/iCalendar-like formats."
   1.252 +
   1.253 +    def __init__(self):
   1.254 +
   1.255 +        "Initialise the parser."
   1.256 +
   1.257 +        self.elements = [] # also known as components
   1.258 +        self.document = []
   1.259 +        self.current = self.document
   1.260 +
   1.261 +    def parse(self, f):
   1.262 +
   1.263 +        "Parse the contents of the file 'f'."
   1.264 +
   1.265 +        parser = StreamParser(f)
   1.266 +
   1.267 +        for name, parameters, value in parser:
   1.268 +
   1.269 +            # Add new elements/components to the current position in the
   1.270 +            # document, recording the element as the active element.
   1.271 +
   1.272 +            if name == "BEGIN":
   1.273 +                children = []
   1.274 +                element = (value, parameters, children)
   1.275 +                self.elements.append(element)
   1.276 +                self.current.append(element)
   1.277 +                self.current = children
   1.278 +
   1.279 +            # End elements by removing them from the active element stack and
   1.280 +            # making the next element's children the current position for new
   1.281 +            # content.
   1.282 +
   1.283 +            elif name == "END":
   1.284 +                start_element = self.elements.pop()
   1.285 +                start_value, start_parameters, children = start_element
   1.286 +                if start_value != value:
   1.287 +                    raise ParseError, "Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
   1.288 +                        start_value, value, f.line_number)
   1.289 +                if self.elements:
   1.290 +                    parent_value, parent_parameters, children = self.elements[-1]
   1.291 +                    self.current = children
   1.292 +                else:
   1.293 +                    self.current = self.document
   1.294 +
   1.295 +            else:
   1.296 +                self.current.append((name, parameters, value))
   1.297 +
   1.298 +        return self.document
   1.299 +
   1.300 +# Public functions.
   1.301 +
   1.302 +def parse(f, non_standard_newline=0):
   1.303 +
   1.304 +    """
   1.305 +    Parse the resource data found through the use of the file object 'f', which
   1.306 +    should provide Unicode data, and put the resource information in the given
   1.307 +    'store'. (The codecs module can be used to open files or to wrap streams in
   1.308 +    order to provide Unicode data.)
   1.309 +
   1.310 +    The optional 'non_standard_newline' can be set to a true value (unlike the
   1.311 +    default) in order to attempt to process files with CR as the end of line
   1.312 +    character.
   1.313 +
   1.314 +    As a result of parsing the resource, the root node of the imported resource
   1.315 +    is returned.
   1.316 +    """
   1.317 +
   1.318 +    reader = Reader(f, non_standard_newline=non_standard_newline)
   1.319 +    parser = Parser()
   1.320 +    return parser.parse(reader)
   1.321 +
   1.322 +# vim: tabstop=4 expandtab shiftwidth=4
2008-10-16	Paul Boddie	raw files shortlog changelog graph	Created a vContent module based on essential parts of the RDFCalendar.Parsers module.
			vContent.py (file)