#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013,
              2014 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri
import binascii
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

class ParseError(Exception):

    "General parsing errors."

    pass

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline

        # Pushed-back lines, used as a stack: the last line pushed is the next
        # line read.

        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader, closing the underlying file."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        An empty string is returned when the underlying file is exhausted.
        """

        self.line_number += 1

        if self.lines:
            return self.lines.pop()

        # Sanity check for broken lines (\r instead of \r\n or \n): unless CR
        # is accepted as a line ending, join the following physical line onto
        # a line ending with a bare CR.

        line = self.f.readline()
        while line.endswith("\r") and not self.non_standard_newline:
            s = self.f.readline()
            if not s:
                break
            line += s

        # Where CR is accepted as a line ending, normalise it so that callers
        # always see a terminating newline.

        if line.endswith("\r") and self.non_standard_newline:
            return line + "\n"

        return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        when no more content is available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        "Initialise the content line with the given 'text'."

        self.text = text

        # The position from which the next search or remainder begins.

        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets occurring between double quotes are ignored; the quotes
        themselves remain part of the returned string.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        # Continue any later search immediately after the target found.

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'get_content_line', 'close' and 'line_number' features of Reader.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised at the end of the
        content.
        """

        return self.parse_content_line()

    def __next__(self):

        "Support the iterator protocol of Python 3 by delegating to 'next'."

        return self.next()

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        Parameters without a value are recorded with a value of None.
        StopIteration is raised when no more content is available, and
        ValueError is raised with (line number, content line) details when a
        content line has no value separator.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter value against the parameter name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Quoted-printable values are returned as Unicode text, decoded using the
        CHARSET parameter or ISO-8859-1 if absent. Base64 values are returned
        as a byte string. Other values are returned with only quoted characters
        replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")

        # Use binascii directly: this is what base64.decodestring employed, but
        # that function is absent from recent Python versions.

        elif encoding == "BASE64":
            return binascii.a2b_base64(value)
        else:
            return value

class ParserBase:

    "An abstract parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser."

        # The names of the components which have been started but not ended.

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', a reader object, using a stream
        parser of the given 'parser_cls' (or StreamParser if omitted), invoking
        the 'startComponent', 'endComponent' and 'handleProperty' methods
        provided by subclasses.

        ParseError is raised where an END declaration does not correspond to
        the most recent unfinished BEGIN declaration.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # An END without any open component cannot be matched.

                if not self.names:
                    raise ParseError("Unexpected END declaration (%r) at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        "Attach the given 'component' to its parent."

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Where 'value' is None, an empty list of children is provided
        instead.
        """

        # Only a missing value indicates a component: an empty property value
        # must remain a string and not become a list of children.

        if value is None:
            value = []

        return (name, parameters, value)

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the root component as a
        tuple of the form (name, parameters, children). IndexError is raised if
        the file provides no component at all.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        ValueError is raised if the resulting line length is less than 2, since
        continuation lines need room for the leading space and at least one
        character of content.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length

        # A shorter line length would prevent folding from making progress.

        if self.line_length < 2:
            raise ValueError("Line length must be at least 2: %r" % (self.line_length,))

        # The number of characters already written on the current line.

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, folding the output so that no physical
        line exceeds the line length. Continuation lines start with a single
        space.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset

            # Fill the current line and start a continuation line.

            if remaining > space:
                write(text[i:i + space])
                write("\r\n ")

                # The leading space occupies the first column of the new line.

                self.char_offset = 1
                i += space
                remaining -= space

            # Write everything that is left on the current line.

            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line, if anything has been written on it."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the stream writer with the given 'f' stream object, which
        must provide the 'write' and 'end_line' methods of Writer.
        """

        self.f = f

    def append(self, record):

        "Write the given 'record', a tuple of the form (name, parameters, value)."

        self.write(*record)

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters without values are provided by the parser with None.

            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        Quoted-printable values are encoded using the CHARSET parameter or
        ISO-8859-1 if absent. Base64 values must be provided as byte strings.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = self._to_text(quopri.encodestring(value.encode(charset or "iso-8859-1")))

        elif encoding == "BASE64":

            # base64.encodestring is absent from recent Python versions, where
            # base64.encodebytes provides the same line-wrapped output.

            encode = getattr(base64, "encodebytes", None) or base64.encodestring
            value = self._to_text(encode(value))

        return self.encode_content(value)

    def _to_text(self, data):

        "Return the ASCII-only encoded 'data' as a string suitable for writing."

        if isinstance(data, str):
            return data
        return data.decode("ascii")

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Values of None are preserved.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' is a stream, having a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it is a stream. Otherwise, treat it as a
    filename and open the file for reading using the given 'encoding' or the
    default encoding.
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it is a stream, having a 'write' attribute.
    Otherwise, treat it as a filename and open the file for writing using the
    given 'encoding' or the default encoding.
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

def items_to_dict(items):

    """
    Return the given 'items' as a dictionary mapping names to lists of tuples
    of the form (value, attributes). Values which are lists of items are
    themselves converted to dictionaries.
    """

    d = {}
    for name, attr, value in items:
        if name not in d:
            d[name] = []
        if isinstance(value, list):
            d[name].append((items_to_dict(value), attr))
        else:
            d[name].append((value, attr))
    return d

def dict_to_items(d):

    """
    Return 'd' converted to a list of items suitable for serialisation using
    iterwrite.

    Each value in 'd' is either a list of (value, attributes) tuples, as
    produced by items_to_dict, or a single such tuple, as produced by to_dict.
    """

    items = []
    for name, value in d.items():

        # Treat a single (value, attributes) tuple like a one-element list.

        if isinstance(value, list):
            entries = value
        else:
            entries = [value]

        for v, a in entries:

            # Only dictionaries describe nested items; other values are plain
            # property values to be retained as they are.

            if isinstance(v, dict):
                items.append((name, a, dict_to_items(v)))
            else:
                items.append((name, a, v))
    return items

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the parser
    instead of Parser.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams, leaving streams supplied by the caller open.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the stream
    parser instead of StreamParser.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). Any file opened here is not
    closed by this function.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class of the returned
    writer instead of StreamWriter.

    IOError is raised if neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

def to_dict(node):

    """
    Return the 'node' converted to a dictionary representation mapping the node
    name to a single tuple of the form (value, attributes).
    """

    name, attr, items = node

    # Convert lists of child items, even empty ones, to dictionaries.

    if isinstance(items, list):
        content = items_to_dict(items)
    else:
        content = items

    return {name : (content, attr)}

def to_node(d):

    "Return 'd' converted to a items-based representation."

    return dict_to_items(d)[0]

# vim: tabstop=4 expandtab shiftwidth=4