1.1 --- /dev/null Thu Jan 01 00:00:00 1970 +0000
1.2 +++ b/vContent.py Thu Oct 16 21:55:30 2008 +0200
1.3 @@ -0,0 +1,319 @@
1.4 +#!/usr/bin/env python
1.5 +
1.6 +"""
1.7 +Parsing of vCard, vCalendar and iCalendar files.
1.8 +
1.9 +Copyright (C) 2005, 2006, 2007, 2008 Paul Boddie <paul@boddie.org.uk>
1.10 +
1.11 +This program is free software; you can redistribute it and/or modify it under
1.12 +the terms of the GNU Lesser General Public License as published by the Free
1.13 +Software Foundation; either version 3 of the License, or (at your option) any
1.14 +later version.
1.15 +
1.16 +This program is distributed in the hope that it will be useful, but WITHOUT
1.17 +ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
1.18 +FOR A PARTICULAR PURPOSE. See the GNU Lesser General Public License for more
1.19 +details.
1.20 +
1.21 +You should have received a copy of the GNU Lesser General Public License along
1.22 +with this program. If not, see <http://www.gnu.org/licenses/>.
1.23 +
1.24 +--------
1.25 +
1.26 +References:
1.27 +
1.28 +RFC 2445: Internet Calendaring and Scheduling Core Object Specification
1.29 + (iCalendar)
1.30 + http://rfc.net/rfc2445.html
1.31 +
1.32 +RFC 2425: A MIME Content-Type for Directory Information
1.33 + http://rfc.net/rfc2425.html
1.34 +
1.35 +RFC 2426: vCard MIME Directory Profile
1.36 + http://rfc.net/rfc2426.html
1.37 +"""
1.38 +
1.39 +# Encoding-related imports.
1.40 +
1.41 +import base64, quopri
1.42 +
1.43 +# Simple reader class.
1.44 +
class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, kept as a stack: the last line pushed is the next
        # line read.

        self.lines = []

        # Incremented for each line read and decremented for each line pushed
        # back.

        self.line_number = 0

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        Return the line read, or an empty string if the end of the file has
        been reached.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # Where CR is accepted as a line ending, normalise the line so that it
        # also ends with a newline character.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # NOTE: Sanity check for broken lines (\r instead of \r\n or \n).
        # Join such fragments to the text which follows them, stopping at the
        # end of the file so that a trailing CR cannot cause an endless loop.

        while line.endswith("\r"):
            continuation = self.f.readline()
            if not continuation:
                break
            line += continuation

        return line

    def read_until(self, targets):

        """
        Read from the stream until one of the 'targets' is seen. Return the
        string from the current position up to the target found, along with the
        target string, using a tuple of the form (string, target). If no target
        was found, return the entire string together with a target of None.

        Any text read beyond the target is pushed back, line by line, so that
        it is returned by subsequent read operations.
        """

        # Map positions in the text read so far to the targets found there.

        indexes = {}

        # Remember the entire text read and the index of the current line in
        # that text.

        lines = []

        line = self.readline()
        lines.append(line)
        start = 0

        while not indexes and line != "":
            for target in targets:
                index = line.find(target)

                # Always choose the first matching target.

                if index != -1 and (start + index) not in indexes:
                    indexes[start + index] = target

            start += len(line)
            line = self.readline()
            lines.append(line)

        text = "".join(lines)

        if not indexes:
            return text, None

        min_index = min(indexes)
        target = indexes[min_index]

        # Skip the target.
        # Since the end of the buffer should always be a newline, ignore the
        # last element.

        remaining = text[min_index + len(target):].split("\n")
        if not remaining[-1]:
            del remaining[-1]

        # Push the lines back in reverse order so that they are read again in
        # their original order.

        remaining.reverse()

        for line in remaining:
            self.pushback(line + "\n")

        return text[:min_index], target
1.146 +
class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'read_until', 'readline' and 'pushback' methods along with a
        'line_number' attribute.
        """

        self.f = f

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Support the iterator protocol of newer Python versions.

    __next__ = next

    def parse_content_line(self):

        """
        Return the name, parameters and a list containing value information for
        the current content line in the file being parsed.

        StopIteration is raised when no more content is available. ValueError,
        carrying the current line number, is raised if the content line has no
        value separator.
        """

        f = self.f

        parameters = {}
        name, sep = f.read_until([";", ":"])

        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = f.read_until(["=", ";", ":"])
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = f.read_until([";", ":"])
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the value for the named parameter.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(f.line_number)

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        line = f.readline()
        value_lines = [line.rstrip("\r\n")]
        line = f.readline()
        while line != "" and line[0] in (" ", "\t"):
            value_lines.append(line.rstrip("\r\n")[1:])
            line = f.readline()

        # Since one line too many will have been read, push the line back into the
        # file.

        f.pushback(line)

        # Decode the value.

        value = self.decode("".join(value_lines), parameters.get("ENCODING"))

        return name, parameters, value

    def decode(self, value, encoding):

        """
        Decode the 'value' with the given 'encoding', which may be None if no
        encoding was specified. Encoding names are compared without regard to
        case.

        Quoted-printable values are returned as Unicode text, base64 values are
        returned as a byte string, and all other values are returned with their
        quoted characters replaced.
        """

        encoding = encoding and encoding.upper()

        # NOTE: Assuming ISO 8859-1 for the character set.

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode("iso-8859-1")

        # vCard 3.0 (RFC 2426) uses "b" to indicate base64 encoding.

        elif encoding in ("BASE64", "B"):
            return base64.b64decode(value)

        else:
            # NOTE: Introducing newline conversions.
            # Replace quoted characters (see 4.3.11 in RFC 2445).
            # Splitting on escaped backslashes first ensures that a backslash
            # produced by "\\" is never mistaken for the start of another
            # escape sequence.

            parts = value.replace("\r", "").split("\\\\")
            return "\\".join([
                part.replace("\\N", "\n").replace("\\n", "\n").replace("\\,", ",").replace("\\;", ";")
                for part in parts
                ])
1.248 +
class ParseError(Exception):

    "An error in the structure of the content being parsed."

class Parser:

    "A parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        # The stack of elements which have been started but not yet ended.

        self.elements = [] # also known as components

        # The top-level content of the document.

        self.document = []

        # The list to which new content is currently being added.

        self.current = self.document

    def parse(self, f):

        """
        Parse the contents of the file 'f', which must be usable with
        StreamParser and provide a 'line_number' attribute.

        Return the document: a list of (name, parameters, value) tuples where,
        for elements introduced by BEGIN, the name is the element name and the
        value is a list of the element's children.

        ParseError is raised if an END declaration does not match the
        corresponding BEGIN declaration, or has no such declaration at all.
        """

        parser = StreamParser(f)

        for name, parameters, value in parser:

            # Add new elements/components to the current position in the
            # document, recording the element as the active element.

            if name == "BEGIN":
                children = []
                element = (value, parameters, children)
                self.elements.append(element)
                self.current.append(element)
                self.current = children

            # End elements by removing them from the active element stack and
            # making the next element's children the current position for new
            # content.

            elif name == "END":
                if not self.elements:
                    raise ParseError("END declaration (%r) without a BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_value, start_parameters, children = self.elements.pop()

                if start_value != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_value, value, f.line_number))

                if self.elements:
                    parent_value, parent_parameters, children = self.elements[-1]
                    self.current = children
                else:
                    self.current = self.document

            # Add all other content to the current position in the document.

            else:
                self.current.append((name, parameters, value))

        return self.document
1.299 +
1.300 +# Public functions.
1.301 +
def parse(f, non_standard_newline=0):

    """
    Parse the resource data available from the file object 'f', which should
    provide Unicode data, and return the parsed document. (The codecs module can
    be used to open files or to wrap streams in order to provide Unicode data.)

    If the optional 'non_standard_newline' is set to a true value (it is false
    by default), an attempt is made to process files which use CR as the end of
    line character.

    The result is the root of the parsed resource as produced by the Parser
    class.
    """

    # Wrap the file to provide the pushback support needed when parsing.

    return Parser().parse(Reader(f, non_standard_newline=non_standard_newline))
1.321 +
1.322 +# vim: tabstop=4 expandtab shiftwidth=4