#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri
import binascii
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Exceptions.

class ParseError(Exception):

    "An error raised where the structure of the parsed content is invalid."

    pass

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.

        The 'line_number' attribute holds the number of the line which is about
        to be read.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader, closing the underlying file."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        An empty string is returned at the end of the file.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        line = self.f.readline()

        # Where CR is accepted as a line ending, complete such lines so that
        # callers always see a conventional terminator.

        if self.non_standard_newline:
            if line.endswith("\r"):
                return line + "\n"
            return line

        # Sanity check for broken lines (\r instead of \r\n or \n): join the
        # following physical line to the broken one.

        while line.endswith("\r"):
            continuation = self.f.readline()

            # At the end of the file nothing more can be joined. Without this
            # test, a file ending in a lone CR would never be left.

            if not continuation:
                break

            line += continuation

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        where no more content lines are available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        """
        Initialise the content line with the given 'text', positioning the
        search at the start of the text.
        """

        self.text = text
        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Double quotes delimit regions within which other targets are ignored.
        The current position is moved beyond the target found, or to the end of
        the text where no target was found.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                break

            # Where a double quote matches, toggle the region state.

            if match.group() == '"':
                in_quoted_region = not in_quoted_region

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                self.start = match.end()
                return text[start:match.start()], match.group()

            # Otherwise, keep looking beyond the match.

            pos = match.end()

        # Where no more input can provide the targets, return a special result.

        self.start = length
        return text[start:], None

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'get_content_line' method and 'line_number' attribute of reader objects.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values).
        """

        return self.parse_content_line()

    # Support the iteration protocol of newer Python versions.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        StopIteration is raised where no more content is available. ValueError
        is raised with the line number and content line as arguments where the
        line does not provide a value introduced by a colon.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, with None indicating the absence of a value.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as Unicode text using the CHARSET
        parameter or, in its absence, ISO-8859-1. Base64 values are returned as
        the decoded binary data. Other values are returned with only the quoted
        characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # This is the operation underlying base64.decodestring, which is
            # not provided by newer Python versions.

            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', this being a reader object. The
        optional 'parser_cls' indicates the stream parser class to be used
        instead of StreamParser.

        The startComponent, endComponent and handleProperty methods, provided
        by subclasses, are called for the content found.

        ParseError is raised where END declarations do not correspond to the
        BEGIN declarations previously found.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # An END without any open component is also a mismatch.

                if not self.names:
                    raise ParseError("END declaration (%r) without a BEGIN declaration at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content. The new
        component is returned.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection. The ended component is returned, or None where
        the stack is empty.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children. The object made
        for the property is returned.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the component at
        the top of the stack. Where the stack is empty, nothing is done.
        """

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value', producing a tuple of the form (name, parameters, value) where
        an empty list replaces an absent or empty value.
        """

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first component
        recorded. IndexError is raised where no component was found.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        ValueError is raised for a 'line_length' less than 2, since continuation
        lines need room for the leading space and at least one character.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length

        # A continuation line already holds one character, so that shorter
        # lines would never let the write method make any progress.

        if self.line_length < 2:
            raise ValueError("A line length of at least 2 is required, not %r." % (self.line_length,))

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, starting a continuation line, introduced
        by CR LF and a space, wherever the line length would be exceeded.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")

                # The continuation line already holds the leading space.

                self.char_offset = 1
                i += space
                remaining -= space
            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line, doing nothing where the line is empty."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the stream writer with the given 'f' stream object, which
        must provide the 'write' and 'end_line' methods of writer objects.
        """

        self.f = f

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parsed parameters lacking a value are represented using None.

            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        Quoted-printable values are encoded using the CHARSET parameter or, in
        its absence, ISO-8859-1. Base64 values must be provided in a form
        accepted by the base64 module.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = self._as_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))
        elif encoding == "BASE64":

            # Newer Python versions provide encodebytes and not encodestring.

            encode = getattr(base64, "encodebytes", None) or base64.encodestring
            value = self._as_text(encode(value))

        return self.encode_content(value)

    def _as_text(self, encoded):

        """
        Return the 'encoded' output of an encoding operation as a plain string,
        converting the ASCII byte strings produced by newer Python versions.
        """

        if isinstance(encoded, str):
            return encoded
        return encoded.decode("ascii")

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Values of None are retained unchanged.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it is an input stream. Otherwise, open
    the file it names for reading using the given 'encoding' or, in its absence,
    the default encoding.
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' itself if it provides a 'write' attribute.
    Otherwise, open the file it names for writing using the given 'encoding' or,
    in its absence, the default encoding.
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.
def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated,
    without arguments, in order to parse the resource instead of Parser.

    As a result of parsing the resource, the root node of the imported resource
    is returned.

    A file opened here for a filename is closed before returning. A stream
    supplied by the caller is left open.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class instantiated,
    with the reader as its argument, instead of StreamParser.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value).

    No stream is closed by this function, since the content is read only as the
    iterator is consumed.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class instantiated,
    with the underlying writer as its argument, instead of StreamWriter.

    IOError is raised where neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:

        # The call form of raise is accepted by all Python versions.

        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

# vim: tabstop=4 expandtab shiftwidth=4