#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.4.3"

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile
import urllib # for parseURI in HTML mode
import xml.dom # for getElementById

class Implementation(object):

    "Contains an abstraction over the DOM implementation."

    def createDocumentType(self, localName, publicId, systemId):

        "Return a new DocumentType holding the given name and identifiers."

        return DocumentType(localName, publicId, systemId)

    def createDocument(self, namespaceURI, localName, doctype):

        """
        Create a new low-level document using the given 'namespaceURI',
        'localName' and 'doctype', returning it wrapped in a Document which
        refers to this implementation.
        """

        return Document(Node_createDocument(namespaceURI, localName, doctype), self)

    # Wrapping of documents.

    def adoptDocument(self, node):

        "Wrap the given low-level document 'node' in a Document object."

        return Document(node, self)

    # Factory functions.

    def get_node(self, _node, context_node):

        """
        Return a wrapper for the low-level '_node', using 'context_node' to
        supply the owner document (and, for attributes, the owner element).
        Document nodes are not wrapped again: the owner document of the
        'context_node' is returned instead.
        """

        node_type = Node_nodeType(_node)
        if node_type == context_node.DOCUMENT_NODE:
            return context_node.ownerDocument
        elif node_type == context_node.ATTRIBUTE_NODE:
            return Attribute(_node, self, context_node.ownerDocument, context_node)
        else:
            return Node(_node, self, context_node.ownerDocument)

    def get_node_or_none(self, _node, context_node):

        "As get_node, but return None where the low-level '_node' is None."

        if _node is None:
            return None
        else:
            return self.get_node(_node, context_node)

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node, impl):

        """
        Initialise the map for the given element 'node' (a Node wrapper whose
        attributes are exposed) and the implementation 'impl' used when
        wrapping attribute nodes.
        """

        self.node = node
        self.impl = impl

    def getNamedItem(self, name):
        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):
        return self.node.getAttributeNodeNS(ns, localName)

    # The set and remove operations return whatever the corresponding lookup
    # produced beforehand, or None where that lookup raised KeyError.

    def setNamedItem(self, node):
        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):
        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):
        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):
        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Store the attribute 'node' under 'name', raising KeyError if 'name'
        is not the node name of the given 'node'.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            raise KeyError(name)

    def __delitem__(self, name):

        "Remove the attribute having the given 'name' from the element."

        # Previously a silent no-op; deletion is delegated to the DOM-style
        # removal method so that "del attributes[name]" has an effect.

        self.removeNamedItem(name)

    def values(self):

        "Return a list of Attribute wrappers for the element's attributes."

        owner = self.node

        # The owner element is supplied so that each attribute's parentNode
        # is available, as it is for attributes from getAttributeNode.

        return [Attribute(_node, self.impl, owner.ownerDocument, owner)
            for _node in Node_attributes(owner.as_native_node()).values()]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples for the attributes."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index', as for list indexing."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# XPath support.

def _xpath_string_literal(s):

    """
    Return an XPath 1.0 expression evaluating to the string 's'. XPath 1.0
    string literals provide no escaping mechanism, so a string containing both
    kinds of quote character is built from separately quoted pieces using the
    concat function.
    """

    if "'" not in s:
        return "'%s'" % s
    if '"' not in s:
        return '"%s"' % s

    # Apostrophe-free pieces are single-quoted and joined by a double-quoted
    # apostrophe; at least one apostrophe is present here, so concat always
    # receives three or more arguments.

    return "concat(%s)" % ", \"'\", ".join(["'%s'" % part for part in s.split("'")])

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, impl=None, ownerDocument=None):

        """
        Initialise the wrapper around the low-level 'node'. The optional
        'impl' specifies the implementation used to wrap related nodes (the
        default implementation is used if omitted); the optional
        'ownerDocument' is the document wrapper to which the node belongs.
        """

        self._node = node
        self.impl = impl or default_impl
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the low-level node wrapped by this object."

        return self._node

    # Property accessors, mostly delegating to the macrolib functions.

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self, self.impl)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _textContent(self):
        return Node_textContent(self._node)

    def _nodeValue(self):

        # Node types listed in null_value_node_types never provide a value.

        if self.nodeType in null_value_node_types:
            return None
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return self.impl.get_node_or_none(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self.impl.get_node_or_none(Node_previousSibling(self._node), self)

    def _nextSibling(self):
        return self.impl.get_node_or_none(Node_nextSibling(self._node), self)

    def _doctype(self):

        # NOTE: Presumably Node_doctype yields None for documents without a
        # NOTE: document type (to be confirmed); None is passed through here
        # NOTE: rather than being handed to the node factory.

        return self.impl.get_node_or_none(Node_doctype(self._node), self)

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of a
        document type node, or None for other nodes or where no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of a
        document type node, or None for other nodes or where no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()

        # In a PUBLIC declaration, the system identifier is the second quoted
        # value after the keyword, so the public identifier must be skipped.

        public = declaration.find("PUBLIC")
        if public != -1:
            opening = declaration.find('"', public)
            if opening == -1:
                return None
            closing = declaration.find('"', opening + 1)
            if closing == -1:
                return None
            return self._findIdValue(declaration, closing + 1)

        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following the 'identifier'
        keyword in the 'declaration' text, or None if either is missing.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the first double-quoted value found in the 'declaration' text
        at or after index 'i', or None if no complete quoted value exists.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    # Attribute access, delegated to the corresponding macrolib functions.

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):
        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)

    def getAttributeNode(self, localName):
        return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    # Node creation.

    def createElementNS(self, ns, name):
        return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)

    def createElement(self, name):
        return self.impl.get_node(Node_createElement(self._node, name), self)

    def createAttributeNS(self, ns, name):

        """
        Return a new attribute wrapper for the given namespace 'ns' and
        qualified 'name'. The attribute is created on a temporary element.
        """

        tmp = self.createElement("tmp")

        # The implementation belongs to the Attribute wrapper, not to the
        # low-level creation function (compare with createAttribute).

        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.impl)

    def createAttribute(self, name):

        """
        Return a new attribute wrapper for the given 'name'. The attribute is
        created on a temporary element.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.impl)

    def createTextNode(self, value):
        return self.impl.get_node(Node_createTextNode(self._node, value), self)

    def createComment(self, value):
        return self.impl.get_node(Node_createComment(self._node, value), self)

    def createCDATASection(self, value):
        return self.impl.get_node(Node_createCDATASection(self._node, value), self)

    # Tree manipulation. Objects providing as_native_node are unwrapped before
    # being passed to the low-level functions; other objects are passed as
    # they are.

    def importNode(self, node, deep):

        """
        Import the given 'node', returning a wrapper for the result. Objects
        without as_native_node are handed to Node_importNode_DOM instead of
        Node_importNode.
        """

        if hasattr(node, "as_native_node"):
            return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)
        else:
            return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    def insertBefore(self, tmp, oldNode):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)

    def replaceChild(self, tmp, oldNode):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)
        else:
            return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)

    def appendChild(self, tmp):
        if hasattr(tmp, "as_native_node"):
            return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)
        else:
            return self.impl.get_node(Node_appendChild(self._node, tmp), self)

    def removeChild(self, tmp):
        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    # Element retrieval, implemented using XPath queries.

    def getElementById(self, identifier):

        """
        Return the first descendant element whose xml:id attribute equals the
        given 'identifier', or None if there is no such element.
        """

        # The identifier is converted to a proper XPath literal so that quote
        # characters within it cannot terminate the literal early.

        nodes = self.xpath(".//*[@xml:id=" + _xpath_string_literal(identifier) + "]",
            namespaces={"xml" : xml.dom.XML_NAMESPACE})
        if nodes:
            return nodes[0]
        else:
            return None

    def getElementsByTagName(self, tagName):
        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):
        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        """
        Merge each run of adjacent text nodes amongst the children of this
        node into a single text node.
        """

        text_nodes = []

        # childNodes provides a new list upon each access, so removing nodes
        # from the tree does not disturb this iteration.

        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given adjacent 'text_nodes' with a single new text node
        holding their concatenated values: all but the last are removed, and
        the last is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    textContent = property(_textContent)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.

    entities = {}
    notations = {}

    def isSameNode(self, other):
        return self == other

    def __hash__(self):
        return hash(self.localName)

    def __eq__(self, other):
        return isinstance(other, Node) and libxml2mod.xmlXPathCmpNodes(self._node, other._node) == 0

    def __ne__(self, other):
        return not (self == other)

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' with this node as context, using
        the optional 'variables' and 'namespaces' (a mapping from prefixes to
        namespace URIs, as used by getElementById). Plain string results are
        converted using to_unicode, results having a length are returned as a
        NodeList of wrapped nodes, and other results are returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([self.impl.get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):
        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):
        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):
        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, impl, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute as for Node, additionally recording the
        optional 'ownerElement' to which the attribute belongs.
        """

        Node.__init__(self, node, impl, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):

        # The owner element recorded upon initialisation acts as the parent.

        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class _Document:

    """
    An abstract class providing document-level housekeeping and distinct
    functionality.
    """

    def __init__(self, node, impl):

        """
        Initialise the document with the low-level document 'node' and the
        implementation 'impl'.
        """

        self._node = node
        self.implementation = self.impl = impl

    def _documentElement(self):
        return self.xpath("*")[0]

    def _ownerDocument(self):
        return self

    def __del__(self):

        # The low-level document is released along with this wrapper.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    documentElement = property(_documentElement)
    ownerDocument = property(_ownerDocument)

class Document(_Document, Node):

    """
    A generic document class. Specialised document classes should inherit from
    the _Document class and their own variation of Node.
    """

    pass

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Create a document type object using the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Create a document using the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). The optional parameters described below should be
    provided as keyword arguments.

    If the optional 'html' parameter is set to a true value, the content to be
    parsed will be treated as being HTML rather than XML. If the optional
    'htmlencoding' is specified, HTML parsing will be performed with the
    document encoding assumed to be that specified.

    If the optional 'unfinished' parameter is set to a true value, unfinished
    documents will be parsed, even though such documents may be missing content
    such as closing tags.

    A document object is returned by this function.
    """

    impl = impl or default_impl

    # Anything providing a read method is treated as a stream.

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html=html, htmlencoding=htmlencoding, unfinished=unfinished, impl=impl)
    else:
        return parseFile(stream_or_string, html=html, htmlencoding=htmlencoding, unfinished=unfinished, impl=impl)

def parseFile(filename, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse the file having the given 'filename'. The optional parameters
    described below should be provided as keyword arguments.

    If the optional 'html' parameter is set to a true value, the content to be
    parsed will be treated as being HTML rather than XML. If the optional
    'htmlencoding' is specified, HTML parsing will be performed with the
    document encoding assumed to be that specified.

    If the optional 'unfinished' parameter is set to a true value, unfinished
    documents will be parsed, even though such documents may be missing content
    such as closing tags.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseFile(filename, html=html, htmlencoding=htmlencoding, unfinished=unfinished))

def parseString(s, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse the content of the given string 's'. The optional parameters described
    below should be provided as keyword arguments.

    If the optional 'html' parameter is set to a true value, the content to be
    parsed will be treated as being HTML rather than XML. If the optional
    'htmlencoding' is specified, HTML parsing will be performed with the
    document encoding assumed to be that specified.

    If the optional 'unfinished' parameter is set to a true value, unfinished
    documents will be parsed, even though such documents may be missing content
    such as closing tags.

    A document object is returned by this function.
    """

    impl = impl or default_impl
    return impl.adoptDocument(Node_parseString(s, html=html, htmlencoding=htmlencoding, unfinished=unfinished))

def parseURI(uri, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse the content found at the given 'uri'. The optional parameters
    described below should be provided as keyword arguments.

    If the optional 'html' parameter is set to a true value, the content to be
    parsed will be treated as being HTML rather than XML. If the optional
    'htmlencoding' is specified, HTML parsing will be performed with the
    document encoding assumed to be that specified.

    If the optional 'unfinished' parameter is set to a true value, unfinished
    documents will be parsed, even though such documents may be missing content
    such as closing tags.

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    retrieve either kind of document using Python's own modules for this purpose
    (such as urllib), open a stream and pass it to the parse function:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()

    A document object is returned by this function.
    """

    if html:
        f = urllib.urlopen(uri)
        try:
            return parse(f, html=html, htmlencoding=htmlencoding, unfinished=unfinished, impl=impl)
        finally:
            f.close()
    else:
        impl = impl or default_impl
        return impl.adoptDocument(Node_parseURI(uri, html=html, htmlencoding=htmlencoding, unfinished=unfinished))

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.
    """

    impl = impl or default_impl

    if not nodes:
        return []

    # The document of the first node serves as owner document for all nodes.

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(node, impl, doc) for node in nodes]

def getDOMImplementation():

    "Return the default DOM implementation."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4