#!/usr/bin/env python

"""
Parsing of vCard, vCalendar and iCalendar files.

Copyright (C) 2005, 2006, 2007, 2008, 2009, 2011, 2013 Paul Boddie <paul@boddie.org.uk>

This program is free software; you can redistribute it and/or modify it under
the terms of the GNU General Public License as published by the Free Software
Foundation; either version 3 of the License, or (at your option) any later
version.

This program is distributed in the hope that it will be useful, but WITHOUT
ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or FITNESS
FOR A PARTICULAR PURPOSE.  See the GNU General Public License for more
details.

You should have received a copy of the GNU General Public License along with
this program.  If not, see <http://www.gnu.org/licenses/>.

--------

References:

RFC 5545: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc5545

RFC 2445: Internet Calendaring and Scheduling Core Object Specification
          (iCalendar)
          http://tools.ietf.org/html/rfc2445

RFC 2425: A MIME Content-Type for Directory Information
          http://tools.ietf.org/html/rfc2425

RFC 2426: vCard MIME Directory Profile
          http://tools.ietf.org/html/rfc2426
"""

try:
    set
except NameError:
    from sets import Set as set

# Encoding-related imports.

import base64, quopri
import codecs

# Tokenisation help.

import re

# Configuration.

default_encoding = "utf-8"

# Exceptions.

class ParseError(Exception):

    "An error in the structure of the content being parsed."

# Reader and parser classes.

class Reader:

    "A simple class wrapping a file, providing simple pushback capabilities."

    def __init__(self, f, non_standard_newline=0):

        """
        Initialise the object with the file 'f'. If 'non_standard_newline' is
        set to a true value (unlike the default), lines ending with CR will be
        treated as complete lines.
        """

        self.f = f
        self.non_standard_newline = non_standard_newline
        self.lines = []
        self.line_number = 1 # about to read line 1

    def close(self):

        "Close the reader."

        self.f.close()

    def pushback(self, line):

        """
        Push the given 'line' back so that the next line read is actually the
        given 'line' and not the next line from the underlying file.

        Pushed-back lines are kept on a stack: the most recently pushed line
        is the first to be read again.
        """

        self.lines.append(line)
        self.line_number -= 1

    def readline(self):

        """
        If no pushed-back lines exist, read a line directly from the file.
        Otherwise, read from the list of pushed-back lines.

        An empty string is returned when the underlying file is exhausted.
        """

        self.line_number += 1
        if self.lines:
            return self.lines.pop()
        else:
            # Sanity check for broken lines (\r instead of \r\n or \n).
            # Unless CR is accepted as a line ending, keep appending whatever
            # the file provides next until the line no longer ends with CR or
            # the file is exhausted.

            line = self.f.readline()
            while line.endswith("\r") and not self.non_standard_newline:
                s = self.f.readline()
                if not s:
                    break
                line += s

            # Where CR is accepted as a line ending, normalise it by adding LF.

            if line.endswith("\r") and self.non_standard_newline:
                return line + "\n"
            else:
                return line

    def read_content_line(self):

        """
        Read an entire content line, itself potentially consisting of many
        physical lines of text, returning a string. An empty string is returned
        when no more content lines are available.
        """

        # Skip blank lines.

        line = self.readline()
        while line:
            line_stripped = line.rstrip("\r\n")
            if not line_stripped:
                line = self.readline()
            else:
                break
        else:
            return ""

        # Strip all appropriate whitespace from the right end of each line.
        # For subsequent lines, remove the first whitespace character.
        # See section 4.1 of the iCalendar specification.

        lines = [line_stripped]

        line = self.readline()
        while line.startswith(" ") or line.startswith("\t"):
            lines.append(line[1:].rstrip("\r\n"))
            line = self.readline()

        # Since one line too many will have been read, push the line back into
        # the file.

        if line:
            self.pushback(line)

        return "".join(lines)

    def get_content_line(self):

        "Return a content line object for the current line."

        return ContentLine(self.read_content_line())

class ContentLine:

    "A content line which can be searched."

    SEPARATORS = re.compile('[;:"]')
    SEPARATORS_PLUS_EQUALS = re.compile('[=;:"]')

    def __init__(self, text):

        """
        Initialise the content line with the given 'text', positioning the
        search cursor at the start of the text.
        """

        self.text = text
        self.start = 0

    def __repr__(self):
        return "ContentLine(%r)" % self.text

    def get_remaining(self):

        "Get the remaining text from the content line."

        return self.text[self.start:]

    def search(self, targets):

        """
        Find one of the 'targets' in the text, returning the string from the
        current position up to the target found, along with the target string,
        using a tuple of the form (string, target). If no target was found,
        return the entire string together with a target of None.

        Targets appearing between double quotes are ignored. The current
        position is advanced beyond the target found, or to the end of the text
        if no target was found.

        The 'targets' parameter must be a regular expression object or an object
        compatible with the API of such objects.
        """

        text = self.text
        start = pos = self.start
        length = len(text)

        # Remember the first target.

        first = None
        first_pos = None
        in_quoted_region = 0

        # Process the text, looking for the targets.

        while pos < length:
            match = targets.search(text, pos)

            # Where nothing matches, end the search.

            if match is None:
                pos = length

            # Where a double quote matches, toggle the region state.

            elif match.group() == '"':
                in_quoted_region = not in_quoted_region
                pos = match.end()

            # Where something else matches outside a region, stop searching.

            elif not in_quoted_region:
                first = match.group()
                first_pos = match.start()
                break

            # Otherwise, keep looking for the end of the region.

            else:
                pos = match.end()

        # Where no more input can provide the targets, return a special result.

        else:
            self.start = length
            return text[start:], None

        self.start = match.end()
        return text[start:first_pos], first

class StreamParser:

    "A stream parser for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the parser for the given file 'f', which must provide the
        'get_content_line' and 'close' methods and the 'line_number' attribute
        in the manner of Reader objects.
        """

        self.f = f

    def close(self):

        "Close the reader."

        self.f.close()

    def __iter__(self):

        "Return self as the iterator."

        return self

    def next(self):

        """
        Return the next content item in the file as a tuple of the form
        (name, parameters, values). StopIteration is raised when no more content
        is available.
        """

        return self.parse_content_line()

    # Support the iterator protocol of Python 3 as well as that of Python 2.

    __next__ = next

    def decode_content(self, value):

        "Decode the given 'value', replacing quoted characters."

        return value.replace("\r", "").replace("\\N", "\n").replace("\\n", "\n")

    # Internal methods.

    def parse_content_line(self):

        """
        Return the name, parameters and value information for the current
        content line in the file being parsed.

        Parameters are returned as a dictionary mapping names to values, with
        parameters lacking a value being mapped to None. Values are not
        unquoted: any double quotes remain part of a parameter value.

        StopIteration is raised where no more content is available. ValueError
        is raised with the line number and content line as arguments where the
        content line has no value part.
        """

        f = self.f
        line_number = f.line_number
        line = f.get_content_line()

        # Read the property name.

        name, sep = line.search(line.SEPARATORS)
        name = name.strip()

        if not name and sep is None:
            raise StopIteration

        # Read the parameters.

        parameters = {}

        while sep == ";":

            # Find the actual modifier.

            parameter_name, sep = line.search(line.SEPARATORS_PLUS_EQUALS)
            parameter_name = parameter_name.strip()

            if sep == "=":
                parameter_value, sep = line.search(line.SEPARATORS)
                parameter_value = parameter_value.strip()
            else:
                parameter_value = None

            # Record the parameter, replacing any earlier parameter having the
            # same name.

            parameters[parameter_name] = parameter_value

        # Get the value content.

        if sep != ":":
            raise ValueError(line_number, line)

        # Obtain and decode the value.

        value = self.decode(name, parameters, line.get_remaining())

        return name, parameters, value

    def decode(self, name, parameters, value):

        """
        Decode using 'name' and 'parameters' the given 'value'.

        Where the ENCODING parameter is QUOTED-PRINTABLE, a Unicode string is
        returned, decoded using the CHARSET parameter or ISO-8859-1 if no such
        parameter is given. Where the ENCODING parameter is BASE64, the decoded
        byte string is returned. Otherwise, the value is returned with only
        quoted characters replaced.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        value = self.decode_content(value)

        if encoding == "QUOTED-PRINTABLE":
            return quopri.decodestring(value).decode(charset or "iso-8859-1")
        elif encoding == "BASE64":

            # The decodestring function is absent from recent Python versions,
            # whereas b64decode is available throughout.

            return base64.b64decode(value)
        else:
            return value

class ParserBase:

    """
    An abstract parser for content in vCard/vCalendar/iCalendar-like formats.
    Subclasses must provide the 'startComponent', 'endComponent' and
    'handleProperty' methods invoked by the 'parse' method.
    """

    def __init__(self):

        "Initialise the parser."

        self.names = []

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', this being a Reader or compatible
        object. The optional 'parser_cls' indicates the class used to obtain
        content items from the file, StreamParser being the default.

        ParseError is raised where BEGIN and END declarations do not match.
        """

        parser = (parser_cls or StreamParser)(f)

        for name, parameters, value in parser:

            if name == "BEGIN":
                self.names.append(value)
                self.startComponent(value, parameters)

            elif name == "END":

                # Detect END declarations having no corresponding BEGIN.

                if not self.names:
                    raise ParseError("END declaration (%r) without BEGIN at line %d." % (
                        value, f.line_number))

                start_name = self.names.pop()
                if start_name != value:
                    raise ParseError("Mismatch in BEGIN and END declarations (%r and %r) at line %d." % (
                        start_name, value, f.line_number))

                self.endComponent(value)

            else:
                self.handleProperty(name, parameters, value)

class Parser(ParserBase):

    "A SAX-like parser for vCard/vCalendar/iCalendar-like formats."

    def __init__(self):

        "Initialise the parser with an empty stack of active components."

        ParserBase.__init__(self)
        self.components = []

    def startComponent(self, name, parameters):

        """
        Add the component with the given 'name' and 'parameters', recording an
        empty list of children as part of the component's content.
        """

        component = self.handleProperty(name, parameters)
        self.components.append(component)
        return component

    def endComponent(self, name):

        """
        End the component with the given 'name' by removing it from the active
        component stack. If only one component exists on the stack, retain it
        for later inspection.
        """

        if len(self.components) > 1:
            return self.components.pop()

        # Or return the only element.

        elif self.components:
            return self.components[0]

    def handleProperty(self, name, parameters, value=None):

        """
        Record the property with the given 'name', 'parameters' and optional
        'value' as part of the current component's children.
        """

        component = self.makeComponent(name, parameters, value)
        self.attachComponent(component)
        return component

    # Component object construction/manipulation methods.

    def attachComponent(self, component):

        """
        Attach the given 'component' to its parent, this being the component at
        the top of the active component stack. Where the stack is empty, the
        component is not attached to anything.
        """

        if self.components:
            component_name, component_parameters, component_children = self.components[-1]
            component_children.append(component)

    def makeComponent(self, name, parameters, value=None):

        """
        Make a component object from the given 'name', 'parameters' and optional
        'value'. Where 'value' is omitted or otherwise false, an empty list is
        used in its place.
        """

        return (name, parameters, value or [])

    # Public methods.

    def parse(self, f, parser_cls=None):

        """
        Parse the contents of the file 'f', returning the first top-level
        component as a tuple of the form (name, parameters, children).

        IndexError is raised where the file provides no components.
        """

        ParserBase.parse(self, f, parser_cls)
        return self.components[0]

# Writer classes.

class Writer:

    "A simple class wrapping a file, providing simple output capabilities."

    default_line_length = 76

    def __init__(self, write, line_length=None):

        """
        Initialise the object with the given 'write' operation. If 'line_length'
        is set, the length of written lines will conform to the specified value
        instead of the default value.

        ValueError is raised where the line length is less than 2, since folded
        lines need room for the leading space and at least one character.
        """

        self._write = write
        self.line_length = line_length or self.default_line_length

        # Line folding could never make progress with shorter lines.

        if self.line_length < 2:
            raise ValueError("A line length of at least 2 is required.")

        self.char_offset = 0

    def write(self, text):

        """
        Write the 'text' to the file, folding the output where the line length
        would otherwise be exceeded.
        """

        write = self._write
        line_length = self.line_length

        i = 0
        remaining = len(text)

        while remaining:
            space = line_length - self.char_offset
            if remaining > space:

                # Fill the current line and start a continuation line, whose
                # leading space occupies the first position.

                write(text[i:i + space])
                write("\r\n ")
                self.char_offset = 1
                i += space
                remaining -= space
            else:
                write(text[i:])
                self.char_offset += remaining
                i += remaining
                remaining = 0

    def end_line(self):

        "End the current content line."

        if self.char_offset > 0:
            self.char_offset = 0
            self._write("\r\n")

class StreamWriter:

    "A stream writer for content in vCard/vCalendar/iCalendar-like formats."

    def __init__(self, f):

        """
        Initialise the stream writer with the given 'f' stream object, which
        must provide the 'write' and 'end_line' methods in the manner of Writer
        objects.
        """

        self.f = f

    def append(self, record):

        "Write the given 'record', a tuple of the form (name, parameters, value)."

        self.write(*record)

    def write(self, name, parameters, value):

        """
        Write a content line, serialising the given 'name', 'parameters' and
        'value' information.
        """

        self.write_content_line(name, self.encode_parameters(parameters), self.encode_value(name, parameters, value))

    # Internal methods.

    def write_content_line(self, name, encoded_parameters, encoded_value):

        """
        Write a content line for the given 'name', 'encoded_parameters' and
        'encoded_value' information. Parameters having a value of None are
        written using only their names.
        """

        f = self.f

        f.write(name)
        for param_name, param_value in encoded_parameters.items():
            f.write(";")
            f.write(param_name)

            # Parameters without values are produced by the parser for content
            # such as "TEL;HOME:..." and are written in the same form.

            if param_value is not None:
                f.write("=")
                f.write(param_value)
        f.write(":")
        f.write(encoded_value)
        f.end_line()

    def encode_quoted_parameter_value(self, value):

        "Encode the given 'value' by enclosing it in double quotes."

        return '"%s"' % value

    def encode_value(self, name, parameters, value):

        """
        Encode using 'name' and 'parameters' the given 'value' so that the
        resulting encoded form employs any specified character encodings.

        Where the ENCODING parameter is QUOTED-PRINTABLE, 'value' must be a
        Unicode string and is encoded using the CHARSET parameter or ISO-8859-1
        if no such parameter is given. Where the ENCODING parameter is BASE64,
        'value' must be a byte string.
        """

        encoding = parameters.get("ENCODING")
        charset = parameters.get("CHARSET")

        if encoding == "QUOTED-PRINTABLE":
            value = quopri.encodestring(value.encode(charset or "iso-8859-1"))

            # Python 3 produces bytes which must become text before writing.

            if not isinstance(value, str):
                value = value.decode("ascii")

        elif encoding == "BASE64":

            # The encodestring function is absent from recent Python versions,
            # encodebytes being its replacement.

            encode = getattr(base64, "encodebytes", None) or base64.encodestring
            value = encode(value)

            if not isinstance(value, str):
                value = value.decode("ascii")

        return self.encode_content(value)

    # Overrideable methods.

    def encode_parameters(self, parameters):

        """
        Encode the given 'parameters' according to the vCalendar specification,
        returning a new dictionary. Parameters having a value of None are
        retained without modification.
        """

        encoded_parameters = {}

        for param_name, param_value in parameters.items():

            # Basic format support merely involves quoting values which seem to
            # need it. Other more specific formats may define exactly which
            # parameters should be quoted.

            if param_value is not None and ContentLine.SEPARATORS.search(param_value):
                param_value = self.encode_quoted_parameter_value(param_value)

            encoded_parameters[param_name] = param_value

        return encoded_parameters

    def encode_content(self, value):

        "Encode the given 'value', quoting characters."

        return value.replace("\n", "\\n")

# Utility functions.

def is_input_stream(stream_or_string):

    "Return whether 'stream_or_string' provides a 'read' attribute."

    return hasattr(stream_or_string, "read")

def get_input_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it is an input stream. Otherwise, treat it as
    a filename and open the indicated file for reading using the given
    'encoding' or the default encoding.
    """

    if is_input_stream(stream_or_string):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, encoding=(encoding or default_encoding))

def get_output_stream(stream_or_string, encoding=None):

    """
    Return 'stream_or_string' if it provides a 'write' attribute. Otherwise,
    treat it as a filename and open the indicated file for writing using the
    given 'encoding' or the default encoding.
    """

    if hasattr(stream_or_string, "write"):
        return stream_or_string
    else:
        return codecs.open(stream_or_string, "w", encoding=(encoding or default_encoding))

# Public functions.

def parse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the parser
    building the result, Parser being the default.

    As a result of parsing the resource, the root node of the imported resource
    is returned.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)

    # Parse using the reader.

    try:
        parser = (parser_cls or Parser)()
        return parser.parse(reader)

    # Close any opened streams, leaving streams supplied by the caller open.

    finally:
        if not is_input_stream(stream_or_string):
            reader.close()

def iterparse(stream_or_string, encoding=None, non_standard_newline=0, parser_cls=None):

    """
    Parse the resource data found through the use of the 'stream_or_string',
    which is either a stream providing Unicode data (the codecs module can be
    used to open files or to wrap streams in order to provide Unicode data) or a
    filename identifying a file to be parsed.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be parsed.

    The optional 'non_standard_newline' can be set to a true value (unlike the
    default) in order to attempt to process files with CR as the end of line
    character.

    The optional 'parser_cls' can be used to specify the class of the returned
    parser, StreamParser being the default.

    An iterator is returned which provides event tuples describing parsing
    events of the form (name, parameters, value). Any file opened by this
    function remains open until the 'close' method of the iterator is called.
    """

    stream = get_input_stream(stream_or_string, encoding)
    reader = Reader(stream, non_standard_newline)
    parser = (parser_cls or StreamParser)(reader)
    return parser

def iterwrite(stream_or_string=None, write=None, encoding=None, line_length=None, writer_cls=None):

    """
    Return a writer which will either send data to the resource found through
    the use of 'stream_or_string' or using the given 'write' operation.

    The 'stream_or_string' parameter may be either a stream accepting Unicode
    data (the codecs module can be used to open files or to wrap streams in
    order to accept Unicode data) or a filename identifying a file to be
    written.

    The optional 'encoding' can be used to specify the character encoding used
    by the file to be written.

    The optional 'line_length' can be used to specify how long lines should be
    in the resulting data.

    The optional 'writer_cls' can be used to specify the class of the returned
    writer, StreamWriter being the default.

    IOError is raised where neither 'stream_or_string' nor 'write' is given.
    """

    if stream_or_string:
        stream = get_output_stream(stream_or_string, encoding)
        _writer = Writer(stream.write, line_length)
    elif write:
        _writer = Writer(write, line_length)
    else:
        raise IOError("No stream, filename or write operation specified.")

    return (writer_cls or StreamWriter)(_writer)

# vim: tabstop=4 expandtab shiftwidth=4