#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005, 2006 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301, USA
"""

__version__ = "0.3.6"

# NOTE: The star import supplies the Node_* helper functions as well as the
# NOTE: 'xml', 'libxml2mod' and 'to_unicode' names used below.

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map for the element wrapper 'node'."

        self.node = node

    def getNamedItem(self, name):

        "Return the attribute node of the wrapped element having 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node having namespace 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the attribute 'node' on the wrapped element, returning the
        previously found attribute node with the same name (or None if the
        lookup raised KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the attribute 'node' on the wrapped element using its namespace
        and local name, returning the previously found attribute node (or None
        if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having 'name' from the wrapped element, returning
        the attribute node found beforehand (or None if the lookup raised
        KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute having namespace 'ns' and 'localName' from the
        wrapped element, returning the attribute node found beforehand (or
        None if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):
        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Store the attribute 'node' under 'name'. A KeyError is raised if
        'name' is not the node name of 'node'.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            # Call syntax is accepted by both Python 2 and Python 3.
            raise KeyError(name)

    def __delitem__(self, name):

        """
        Remove the attribute having 'name' from the wrapped element. The
        removal is delegated to removeAttribute and is keyed by name, matching
        __getitem__.
        """

        self.node.removeAttribute(name)

    def values(self):

        "Return a list of Attribute wrappers for the element's attributes."

        element = self.node
        native_attributes = Node_attributes(element.as_native_node())

        # The element is passed as the owner so that parentNode works on the
        # results, just as it does for getAttributeNode.

        return [Attribute(_node, element.ownerDocument, element) for _node in native_attributes.values()]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples, one per attribute."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        "Return a list of ((namespaceURI, localName), attribute) tuples."

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):
        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at 'index' using normal list indexing."

        return self[index]

    def _length(self):
        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, ownerDocument=None):

        """
        Initialise the wrapper with the native 'node' and the optional
        'ownerDocument' wrapper.
        """

        self._node = node
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the native node wrapped by this object."

        return self._node

    # Internal helpers.

    def _unwrap(self, node):

        """
        Return the native node for 'node' if it is a wrapper (anything
        providing as_native_node); otherwise return 'node' unchanged.
        """

        if hasattr(node, "as_native_node"):
            return node.as_native_node()
        return node

    def _wrap_or_none(self, _node):

        """
        Wrap the native '_node' in a Node belonging to this node's document,
        or return None where there is no native node.
        """

        # NOTE(review): This assumes that the macrolib accessors return None
        # NOTE(review): for a missing sibling or doctype - to be confirmed.

        if _node is None:
            return None
        return Node(_node, self.ownerDocument)

    # Property implementations.

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        # NOTE: Consider a generator instead.

        return NodeList([Node(_node, self.ownerDocument) for _node in Node_childNodes(self._node)])

    def _attributes(self):
        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _nodeValue(self):
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):
        return get_node(Node_parentNode(self._node), self)

    def _previousSibling(self):
        return self._wrap_or_none(Node_previousSibling(self._node))

    def _nextSibling(self):
        return self._wrap_or_none(Node_nextSibling(self._node))

    def _doctype(self):
        return self._wrap_or_none(Node_doctype(self._node))

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of a
        document type node, or None for other node types or where no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of a
        document type node, or None for other node types or where no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        if self._findId(declaration, "PUBLIC"):

            # With a public identifier present, the system identifier is the
            # quoted literal following it. Both quotes of the public literal
            # are known to exist because _findId produced a value above.

            public_start = declaration.find('"', declaration.find("PUBLIC"))
            public_end = declaration.find('"', public_start + 1)
            return self._findIdValue(declaration, public_end + 1)
        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following 'identifier' in the
        'declaration' string, or None if either cannot be found.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findIdValue(self, declaration, i):

        """
        Return the text between the first pair of double quotes found at or
        after position 'i' in 'declaration', or None if there is no such pair.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return declaration[q+1:q2]

    # Attribute access.

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):

        "Return an Attribute wrapper, owned by this node, for the attribute."

        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.ownerDocument, self)

    def getAttributeNode(self, localName):

        "Return an Attribute wrapper, owned by this node, for the attribute."

        return Attribute(Node_getAttributeNode(self._node, localName), self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    # Node creation.

    def createElementNS(self, ns, name):
        return Node(Node_createElementNS(self._node, ns, name), self.ownerDocument)

    def createElement(self, name):
        return Node(Node_createElement(self._node, name), self.ownerDocument)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute wrapper for an attribute with namespace 'ns'
        and the given 'name', created on a temporary element.
        """

        tmp = self.createElement("tmp")

        # The owner document is recorded as for the other create* methods.

        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.ownerDocument)

    def createAttribute(self, name):

        """
        Return a new Attribute wrapper for an attribute with the given 'name',
        created on a temporary element.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.ownerDocument)

    def createTextNode(self, value):
        return Node(Node_createTextNode(self._node, value), self.ownerDocument)

    def createComment(self, value):
        return Node(Node_createComment(self._node, value), self.ownerDocument)

    def importNode(self, node, deep):

        """
        Import 'node' and return a wrapper for the result. Wrappers providing
        as_native_node are imported natively; any other object is handed to
        Node_importNode_DOM. The 'deep' flag is passed through unchanged.
        """

        if hasattr(node, "as_native_node"):
            return Node(Node_importNode(self._node, node.as_native_node(), deep), self.ownerDocument)
        else:
            return Node(Node_importNode_DOM(self._node, node, deep), self.ownerDocument)

    def cloneNode(self, deep):
        # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).
        return self.importNode(self, deep)

    # Tree manipulation.

    def insertBefore(self, tmp, oldNode):

        """
        Insert 'tmp' before the child 'oldNode', returning a wrapper for the
        result. Where 'oldNode' is None, 'tmp' is appended instead, as
        described by the DOM specification.
        """

        if oldNode is None:
            return self.appendChild(tmp)
        return Node(Node_insertBefore(self._node, self._unwrap(tmp), oldNode.as_native_node()), self.ownerDocument)

    def replaceChild(self, tmp, oldNode):

        "Replace the child 'oldNode' with 'tmp', returning a wrapper for the result."

        return Node(Node_replaceChild(self._node, self._unwrap(tmp), oldNode.as_native_node()), self.ownerDocument)

    def appendChild(self, tmp):

        "Append 'tmp' to this node, returning a wrapper for the result."

        return Node(Node_appendChild(self._node, self._unwrap(tmp)), self.ownerDocument)

    def removeChild(self, tmp):

        "Remove the child 'tmp' from this node, returning 'tmp' as supplied."

        Node_removeChild(self._node, self._unwrap(tmp))
        return tmp

    def getElementsByTagName(self, tagName):

        "Return the elements below this node having the given 'tagName'."

        # A leading "." makes the search relative to this node; a plain "//"
        # is an absolute path and would search the entire document.

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the elements below this node having the given 'namespaceURI'
        and 'localName'.
        """

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        "Merge each run of adjacent child text nodes into a single text node."

        text_nodes = []

        # childNodes yields a new list on each access, so removing children
        # below does not disturb this iteration.

        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the adjacent 'text_nodes' with one new text node holding their
        concatenated values: all but the last are removed, and the last is
        replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: As class attributes, these dictionaries are shared by all nodes.

    entities = {}
    notations = {}

    #def isSameNode(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    #def __eq__(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' with this node as context, using
        the optional 'variables' and 'namespaces'. String results are passed
        through to_unicode, results having a length are returned as a NodeList
        of wrappers, and anything else is returned as it is.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):

        "Return the serialised form of this node (see the toString function)."

        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):

        "Write the serialised form of this node to 'stream'."

        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):

        "Write the serialised form of this node using the toFile function."

        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute with the native 'node', the optional
        'ownerDocument' and the optional 'ownerElement' wrapper.
        """

        Node.__init__(self, node, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):
        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node):

        # Node.__init__ is not used here since it assigns ownerDocument, which
        # is a read-only property in this class.

        self._node = node

    def _ownerDocument(self):
        return self

    def _parentNode(self):
        return None

    def __del__(self):

        # Free the native document when this wrapper is discarded.

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)
    parentNode = property(_parentNode)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Factory functions.
def get_node(_node, context_node):

    """
    Return a wrapper for the native '_node', using 'context_node' to supply
    the owner document. Document nodes yield the context node's owner
    document, attribute nodes yield an Attribute whose owner element is the
    'context_node', and all other nodes yield a Node. Where '_node' is None,
    None is returned.
    """

    # NOTE(review): The None case presumably arises for nodes without a
    # NOTE(review): parent - to be confirmed against the macrolib functions.

    if _node is None:
        return None

    # The node type is obtained once and used for both comparisons.

    node_type = Node_nodeType(_node)
    if node_type == context_node.DOCUMENT_NODE:
        return context_node.ownerDocument
    elif node_type == context_node.ATTRIBUTE_NODE:
        return Attribute(_node, context_node.ownerDocument, context_node)
    else:
        return Node(_node, context_node.ownerDocument)

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    """
    Return a DocumentType object holding the given 'localName', 'publicId'
    and 'systemId'.
    """

    return DocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    """
    Return a new Document wrapping the native document created for the given
    'namespaceURI', 'localName' and 'doctype'.
    """

    return Document(Node_createDocument(namespaceURI, localName, doctype))

def parse(stream_or_string, html=0):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). If the optional 'html' parameter is set to a true
    value, the content to be parsed will be treated as being HTML rather than
    XML.

    A document object is returned by this function.
    """

    # Anything providing a read method is treated as a stream and is read in
    # its entirety before being parsed.

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html)
    else:
        return parseFile(stream_or_string, html)

def parseFile(filename, html=0):

    """
    Parse the file having the given 'filename'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    return Document(Node_parseFile(filename, html))

def parseString(s, html=0):

    """
    Parse the content of the given string 's'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    return Document(Node_parseString(s, html))

def parseURI(uri, html=0):

    """
    Parse the content found at the given 'uri'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    The parseURI does not currently work with HTML. Use parse with a stream
    object instead. For example:

    d = parse(urllib.urlopen("http://www.python.org"), html=1)

    A document object is returned by this function.
    """

    return Document(Node_parseURI(uri, html))

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.
    """

    if len(nodes) == 0:
        return []

    # A single Document wrapper, obtained from the first node, is shared by
    # all of the returned wrappers.

    doc = Document(libxml2mod.doc(nodes[0]))
    return [Node(node, doc) for node in nodes]

# vim: tabstop=4 expandtab shiftwidth=4