#!/usr/bin/env python

"""
DOM wrapper around libxml2, specifically the libxml2mod Python extension module.

Copyright (C) 2003, 2004, 2005 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

__version__ = "0.3.5"

# NOTE: The wildcard import is relied upon for every name used below that is
# NOTE: not defined in this file: the Node_* helper functions, 'xml',
# NOTE: 'libxml2mod' and 'to_unicode'.

from libxml2dom.macrolib import *
from libxml2dom.macrolib import \
    createDocument as Node_createDocument, \
    parseString as Node_parseString, parseURI as Node_parseURI, \
    parseFile as Node_parseFile, \
    toString as Node_toString, toStream as Node_toStream, \
    toFile as Node_toFile

# Attribute and node list wrappers.

class NamedNodeMap(object):

    """
    A wrapper around Node objects providing DOM and dictionary convenience
    methods.
    """

    def __init__(self, node):

        "Initialise the map for the wrapped 'node' whose attributes it exposes."

        self.node = node

    def getNamedItem(self, name):

        "Return the attribute node having the given 'name'."

        return self.node.getAttributeNode(name)

    def getNamedItemNS(self, ns, localName):

        "Return the attribute node having the given 'ns' and 'localName'."

        return self.node.getAttributeNodeNS(ns, localName)

    def setNamedItem(self, node):

        """
        Set the given attribute 'node' on the wrapped node, returning whatever
        was previously found under the same node name (or None if the lookup
        raised KeyError).
        """

        try:
            old = self.getNamedItem(node.nodeName)
        except KeyError:
            old = None
        self.node.setAttributeNode(node)
        return old

    def setNamedItemNS(self, node):

        """
        Set the given attribute 'node' on the wrapped node, returning whatever
        was previously found under the same namespace and local name (or None
        if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(node.namespaceURI, node.localName)
        except KeyError:
            old = None
        self.node.setAttributeNodeNS(node)
        return old

    def removeNamedItem(self, name):

        """
        Remove the attribute having the given 'name', returning whatever was
        found under that name beforehand (or None if the lookup raised
        KeyError).
        """

        try:
            old = self.getNamedItem(name)
        except KeyError:
            old = None
        self.node.removeAttribute(name)
        return old

    def removeNamedItemNS(self, ns, localName):

        """
        Remove the attribute having the given 'ns' and 'localName', returning
        whatever was found beforehand (or None if the lookup raised KeyError).
        """

        try:
            old = self.getNamedItemNS(ns, localName)
        except KeyError:
            old = None
        self.node.removeAttributeNS(ns, localName)
        return old

    # Dictionary emulation methods.

    def __getitem__(self, name):

        "Return the attribute node having the given 'name'."

        return self.getNamedItem(name)

    def __setitem__(self, name, node):

        """
        Set the attribute 'node' under the given 'name'. A KeyError is raised
        if 'name' does not match the node name of 'node'.
        """

        if name == node.nodeName:
            self.setNamedItem(node)
        else:
            # The call form of raise is valid in both Python 2 and Python 3.
            raise KeyError(name)

    def __delitem__(self, name):

        """
        Remove the attribute having the given 'name', using the same kind of
        key as __getitem__ and __setitem__.
        """

        self.removeNamedItem(name)

    def values(self):

        """
        Return a list of Attribute objects for the attributes of the wrapped
        node, each one recording the wrapped node as its owner element.
        """

        owner = self.node
        return [
            Attribute(_node, owner.ownerDocument, owner)
            for _node in Node_attributes(owner.as_native_node()).values()
            ]

    def keys(self):

        "Return a list of (namespaceURI, localName) tuples for the attributes."

        return [(attr.namespaceURI, attr.localName) for attr in self.values()]

    def items(self):

        """
        Return a list of ((namespaceURI, localName), attribute) tuples for the
        attributes.
        """

        return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]

    def __repr__(self):
        return str(self)

    def __str__(self):
        return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])

    def _length(self):

        "Return the number of attributes."

        return len(self.values())

    length = property(_length)

class NodeList(list):

    "A wrapper around node lists."

    def item(self, index):

        "Return the node at the given 'index' (using list indexing rules)."

        return self[index]

    def _length(self):

        "Return the number of nodes in the list."

        return len(self)

    length = property(_length)

# Node classes.

class Node(object):

    """
    A DOM-style wrapper around libxml2mod objects.
    """

    ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE
    COMMENT_NODE = xml.dom.Node.COMMENT_NODE
    DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE
    DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE
    ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE
    ENTITY_NODE = xml.dom.Node.ENTITY_NODE
    ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE
    NOTATION_NODE = xml.dom.Node.NOTATION_NODE
    PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE
    TEXT_NODE = xml.dom.Node.TEXT_NODE

    def __init__(self, node, ownerDocument=None):

        """
        Initialise the wrapper with the given low-level 'node' and the optional
        'ownerDocument' wrapper.
        """

        self._node = node
        self.ownerDocument = ownerDocument

    def as_native_node(self):

        "Return the low-level node wrapped by this object."

        return self._node

    def _wrap_or_none(self, _node):

        """
        Return a Node wrapper for the low-level '_node', sharing this node's
        owner document, or None if '_node' is None. This prevents a wrapper
        around nothing from being handed out where the DOM expects "no node".
        """

        if _node is None:
            return None
        return Node(_node, self.ownerDocument)

    def _nodeType(self):
        return Node_nodeType(self._node)

    def _childNodes(self):

        "Return a new NodeList of wrappers for the child nodes."

        # NOTE: Consider a generator instead.

        return NodeList([Node(_node, self.ownerDocument) for _node in Node_childNodes(self._node)])

    def _attributes(self):

        "Return a new NamedNodeMap over this node."

        return NamedNodeMap(self)

    def _namespaceURI(self):
        return Node_namespaceURI(self._node)

    def _nodeValue(self):
        return Node_nodeValue(self._node)

    def _setNodeValue(self, value):
        Node_setNodeValue(self._node, value)

    def _prefix(self):
        return Node_prefix(self._node)

    def _nodeName(self):
        return Node_nodeName(self._node)

    def _tagName(self):
        return Node_tagName(self._node)

    def _localName(self):
        return Node_localName(self._node)

    def _parentNode(self):

        """
        Return the wrapper for the parent node as produced by get_node, or None
        if no parent is reported.
        """

        # NOTE(review): Node_parentNode is assumed to yield None for a node
        # NOTE(review): without a parent; confirm against macrolib.

        parent = Node_parentNode(self._node)
        if parent is None:
            return None
        return get_node(parent, self)

    def _previousSibling(self):

        "Return the previous sibling, or None if none is reported."

        # NOTE(review): Node_previousSibling is assumed to yield None when
        # NOTE(review): there is no such sibling; confirm against macrolib.

        return self._wrap_or_none(Node_previousSibling(self._node))

    def _nextSibling(self):

        "Return the next sibling, or None if none is reported."

        # NOTE(review): Node_nextSibling is assumed to yield None when there
        # NOTE(review): is no such sibling; confirm against macrolib.

        return self._wrap_or_none(Node_nextSibling(self._node))

    def _doctype(self):

        "Return the document type node, or None if none is reported."

        return self._wrap_or_none(Node_doctype(self._node))

    def _publicId(self):

        """
        Return the public identifier found in the serialised form of this
        node, or None if this is not a document type node or if no such
        identifier is found.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        return self._findId(declaration, "PUBLIC")

    def _systemId(self):

        """
        Return the system identifier found in the serialised form of this
        node, or None if this is not a document type node or if no such
        identifier is found.

        Where a PUBLIC declaration is present, the system identifier is the
        quoted literal following the public identifier; otherwise it is the
        quoted literal following SYSTEM.
        """

        # NOTE: To be fixed when the libxml2mod API has been figured out.
        if self.nodeType != self.DOCUMENT_TYPE_NODE:
            return None
        declaration = self.toString()
        public_at = declaration.find("PUBLIC")
        if public_at != -1:
            public_span = self._findQuoted(declaration, public_at)
            if public_span is not None:
                # Start searching after the closing quote of the public
                # identifier; searching from the start of the declaration
                # would merely find the public identifier again.
                return self._findIdValue(declaration, public_span[1] + 1)
        return self._findId(declaration, "SYSTEM")

    # NOTE: To be removed when the libxml2mod API has been figured out.

    def _findId(self, declaration, identifier):

        """
        Return the first double-quoted value following the first occurrence of
        'identifier' in 'declaration', or None if either is missing.
        """

        i = declaration.find(identifier)
        if i == -1:
            return None
        return self._findIdValue(declaration, i)

    def _findQuoted(self, declaration, i):

        """
        Return a (start, end) tuple of slice indexes delimiting the content of
        the first double-quoted literal found in 'declaration' at or after
        index 'i', or None if no complete literal is found. Only double quotes
        are recognised.
        """

        q = declaration.find('"', i)
        if q == -1:
            return None
        q2 = declaration.find('"', q + 1)
        if q2 == -1:
            return None
        return q + 1, q2

    def _findIdValue(self, declaration, i):

        """
        Return the content of the first double-quoted literal found in
        'declaration' at or after index 'i', or None if there is none.
        """

        span = self._findQuoted(declaration, i)
        if span is None:
            return None
        start, end = span
        return declaration[start:end]

    def hasAttributeNS(self, ns, localName):
        return Node_hasAttributeNS(self._node, ns, localName)

    def hasAttribute(self, name):
        return Node_hasAttribute(self._node, name)

    def getAttributeNS(self, ns, localName):
        return Node_getAttributeNS(self._node, ns, localName)

    def getAttribute(self, name):
        return Node_getAttribute(self._node, name)

    def getAttributeNodeNS(self, ns, localName):

        """
        Return an Attribute wrapper, owned by this node, for the attribute
        having the given 'ns' and 'localName'.
        """

        return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.ownerDocument, self)

    def getAttributeNode(self, localName):

        """
        Return an Attribute wrapper, owned by this node, for the attribute
        having the given 'localName'.
        """

        return Attribute(Node_getAttributeNode(self._node, localName), self.ownerDocument, self)

    def setAttributeNS(self, ns, name, value):
        Node_setAttributeNS(self._node, ns, name, value)

    def setAttribute(self, name, value):
        Node_setAttribute(self._node, name, value)

    def setAttributeNodeNS(self, node):
        Node_setAttributeNodeNS(self._node, node._node)

    def setAttributeNode(self, node):
        Node_setAttributeNode(self._node, node._node)

    def removeAttributeNS(self, ns, localName):
        Node_removeAttributeNS(self._node, ns, localName)

    def removeAttribute(self, name):
        Node_removeAttribute(self._node, name)

    def createElementNS(self, ns, name):

        "Return a new wrapped element having the given 'ns' and 'name'."

        return Node(Node_createElementNS(self._node, ns, name), self.ownerDocument)

    def createElement(self, name):

        "Return a new wrapped element having the given 'name'."

        return Node(Node_createElement(self._node, name), self.ownerDocument)

    def createAttributeNS(self, ns, name):

        """
        Return a new Attribute having the given 'ns' and 'name'. The attribute
        is created on a temporary element and, like the results of the other
        create methods, shares this node's owner document.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttributeNS(tmp._node, ns, name), self.ownerDocument)

    def createAttribute(self, name):

        """
        Return a new Attribute having the given 'name'. The attribute is
        created on a temporary element and, like the results of the other
        create methods, shares this node's owner document.
        """

        tmp = self.createElement("tmp")
        return Attribute(Node_createAttribute(tmp._node, name), self.ownerDocument)

    def createTextNode(self, value):

        "Return a new wrapped text node having the given 'value'."

        return Node(Node_createTextNode(self._node, value), self.ownerDocument)

    def createComment(self, value):

        "Return a new wrapped comment node having the given 'value'."

        return Node(Node_createComment(self._node, value), self.ownerDocument)

    def importNode(self, node, deep):

        """
        Import the given 'node', passing on the 'deep' flag, and return the
        wrapped result. Objects providing 'as_native_node' are unwrapped first;
        anything else is handed to Node_importNode_DOM.
        """

        if hasattr(node, "as_native_node"):
            return Node(Node_importNode(self._node, node.as_native_node(), deep), self.ownerDocument)
        else:
            return Node(Node_importNode_DOM(self._node, node, deep), self.ownerDocument)

    def insertBefore(self, tmp, oldNode):

        """
        Insert 'tmp' before the wrapped 'oldNode' and return the wrapped
        result. The 'tmp' object is unwrapped if it provides 'as_native_node'.
        """

        if hasattr(tmp, "as_native_node"):
            return Node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self.ownerDocument)
        else:
            return Node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self.ownerDocument)

    def replaceChild(self, tmp, oldNode):

        """
        Replace the wrapped 'oldNode' with 'tmp' and return the wrapped result.
        The 'tmp' object is unwrapped if it provides 'as_native_node'.
        """

        if hasattr(tmp, "as_native_node"):
            return Node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self.ownerDocument)
        else:
            return Node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self.ownerDocument)

    def appendChild(self, tmp):

        """
        Append 'tmp' to this node and return the wrapped result. The 'tmp'
        object is unwrapped if it provides 'as_native_node'.
        """

        if hasattr(tmp, "as_native_node"):
            return Node(Node_appendChild(self._node, tmp.as_native_node()), self.ownerDocument)
        else:
            return Node(Node_appendChild(self._node, tmp), self.ownerDocument)

    def removeChild(self, tmp):

        """
        Remove 'tmp' from this node. The 'tmp' object is unwrapped if it
        provides 'as_native_node'. Nothing is returned.
        """

        if hasattr(tmp, "as_native_node"):
            Node_removeChild(self._node, tmp.as_native_node())
        else:
            Node_removeChild(self._node, tmp)

    def getElementsByTagName(self, tagName):

        """
        Return the elements beneath this node having the given 'tagName'.
        """

        # A relative path is required: a plain "//" prefix is an absolute
        # location path which selects from the whole document regardless of
        # the node on which the query is made.

        return self.xpath(".//" + tagName)

    def getElementsByTagNameNS(self, namespaceURI, localName):

        """
        Return the elements beneath this node having the given 'namespaceURI'
        and 'localName'.
        """

        # See getElementsByTagName for the reason behind the relative path.

        return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})

    def normalize(self):

        """
        Merge each run of adjacent text nodes amongst the children of this node
        into a single text node.
        """

        # The childNodes property builds a fresh list, so children can be
        # removed or replaced while the loop is in progress.

        text_nodes = []
        for node in self.childNodes:
            if node.nodeType == node.TEXT_NODE:
                text_nodes.append(node)
            elif text_nodes:
                self._normalize(text_nodes)
                text_nodes = []
        if text_nodes:
            self._normalize(text_nodes)

    def _normalize(self, text_nodes):

        """
        Replace the given non-empty run of 'text_nodes' with a single new text
        node holding their concatenated values: all but the last are removed,
        and the last is replaced by the new node.
        """

        texts = []
        for text_node in text_nodes[:-1]:
            texts.append(text_node.nodeValue)
            self.removeChild(text_node)
        texts.append(text_nodes[-1].nodeValue)
        self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])

    childNodes = property(_childNodes)
    value = data = nodeValue = property(_nodeValue, _setNodeValue)
    name = nodeName = property(_nodeName)
    tagName = property(_tagName)
    namespaceURI = property(_namespaceURI)
    prefix = property(_prefix)
    localName = property(_localName)
    parentNode = property(_parentNode)
    nodeType = property(_nodeType)
    attributes = property(_attributes)
    previousSibling = property(_previousSibling)
    nextSibling = property(_nextSibling)
    doctype = property(_doctype)
    publicId = property(_publicId)
    systemId = property(_systemId)

    # NOTE: To be fixed - these being doctype-specific values.
    # NOTE: As class attributes, these dictionaries are shared by all
    # NOTE: instances which do not assign their own.

    entities = {}
    notations = {}

    #def isSameNode(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    #def __eq__(self, other):
    #    return self._node.nodePath() == other._node.nodePath()

    # 4DOM extensions to the usual PyXML API.
    # NOTE: To be finished.

    def xpath(self, expr, variables=None, namespaces=None):

        """
        Evaluate the XPath expression 'expr' using this node as the context,
        passing on the optional 'variables' and 'namespaces'.

        A 'str' result is converted using to_unicode; any other result having
        a length is returned as a NodeList of wrappers produced by get_node;
        anything else is returned unchanged.
        """

        result = Node_xpath(self._node, expr, variables, namespaces)
        if isinstance(result, str):
            return to_unicode(result)
        elif hasattr(result, "__len__"):
            return NodeList([get_node(_node, self) for _node in result])
        else:
            return result

    # Convenience methods.

    def toString(self, encoding=None, prettyprint=0):

        "Return the result of the module-level toString function for this node."

        return toString(self, encoding, prettyprint)

    def toStream(self, stream, encoding=None, prettyprint=0):

        "Invoke the module-level toStream function for this node."

        toStream(self, stream, encoding, prettyprint)

    def toFile(self, f, encoding=None, prettyprint=0):

        "Invoke the module-level toFile function for this node."

        toFile(self, f, encoding, prettyprint)

# Attribute nodes.

class Attribute(Node):

    "A class providing attribute access."

    def __init__(self, node, ownerDocument=None, ownerElement=None):

        """
        Initialise the attribute with the given low-level 'node', along with
        the optional 'ownerDocument' and 'ownerElement' wrappers.
        """

        Node.__init__(self, node, ownerDocument)
        self.ownerElement = ownerElement

    def _parentNode(self):

        "Return the owner element recorded for this attribute."

        return self.ownerElement

    parentNode = property(_parentNode)

# Document housekeeping mechanisms.

class Document(Node):

    "A class providing document-level housekeeping."

    def __init__(self, node):

        "Initialise the document with the given low-level document 'node'."

        # Node.__init__ is not used here: it would assign to ownerDocument,
        # which is a read-only property in this class.

        self._node = node

    def _ownerDocument(self):

        "Return this document, which acts as its own owner document."

        return self

    def _parentNode(self):

        "Return None, documents having no parent."

        return None

    def __del__(self):

        "Free the low-level document when this wrapper is discarded."

        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    ownerDocument = property(_ownerDocument)
    parentNode = property(_parentNode)

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):

        """
        Initialise the document type with the given 'localName' (also
        available as the name), 'publicId' and 'systemId'.
        """

        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Factory functions.

def get_node(_node, context_node):

    """
    Return a wrapper for the low-level '_node' chosen according to its node
    type, taking the owner document from the given 'context_node'.

    For a document node, the owner document of 'context_node' is itself
    returned; for an attribute node, an Attribute is returned with
    'context_node' as its owner element; otherwise, a Node is returned.
    """

    node_type = Node_nodeType(_node)
    if node_type == context_node.DOCUMENT_NODE:
        return context_node.ownerDocument
    elif node_type == context_node.ATTRIBUTE_NODE:
        return Attribute(_node, context_node.ownerDocument, context_node)
    else:
        return Node(_node, context_node.ownerDocument)

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    """
    Return a DocumentType object holding the given 'localName', 'publicId' and
    'systemId'.
    """

    return DocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    """
    Return a Document wrapping a new low-level document created from the given
    'namespaceURI', 'localName' and 'doctype'.
    """

    return Document(Node_createDocument(namespaceURI, localName, doctype))

def parse(stream_or_string, html=0):

    """
    Parse the given 'stream_or_string', where the supplied object can either be
    a stream (such as a file or stream object), or a string (containing the
    filename of a document). If the optional 'html' parameter is set to a true
    value, the content to be parsed will be treated as being HTML rather than
    XML.

    A document object is returned by this function.
    """

    # Anything providing a 'read' attribute is treated as a stream and is read
    # completely before being parsed.

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html)
    else:
        return parseFile(stream_or_string, html)

def parseFile(filename, html=0):

    """
    Parse the file having the given 'filename'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    return Document(Node_parseFile(filename, html))

def parseString(s, html=0):

    """
    Parse the content of the given string 's'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    A document object is returned by this function.
    """

    return Document(Node_parseString(s, html))

def parseURI(uri, html=0):

    """
    Parse the content found at the given 'uri'. If the optional 'html' parameter
    is set to a true value, the content to be parsed will be treated as being
    HTML rather than XML.

    The parseURI does not currently work with HTML. Use parse with a stream
    object instead. For example:

    d = parse(urllib.urlopen("http://www.python.org"), html=1)

    A document object is returned by this function.
    """

    return Document(Node_parseURI(uri, html))

def toString(node, encoding=None, prettyprint=0):

    """
    Return a string containing the serialised form of the given 'node' and its
    children. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    return Node_toString(node.as_native_node(), encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to the given
    'stream'. The optional 'encoding' can be used to override the default
    character encoding used in the serialisation. The optional 'prettyprint'
    indicates whether the serialised form is prettyprinted or not (the default
    setting).
    """

    Node_toStream(node.as_native_node(), stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Write the serialised form of the given 'node' and its children to a file
    having the given 'filename'. The optional 'encoding' can be used to override
    the default character encoding used in the serialisation. The optional
    'prettyprint' indicates whether the serialised form is prettyprinted or not
    (the default setting).
    """

    Node_toFile(node.as_native_node(), filename, encoding, prettyprint)

def adoptNodes(nodes):

    """
    A special utility method which adopts the given low-level 'nodes' and which
    returns a list of high-level equivalents. This is currently experimental and
    should not be casually used.
    """

    if not nodes:
        return []

    # All of the nodes share a single Document wrapper obtained from the first
    # node, so that document remains referenced while any result is in use.

    doc = Document(libxml2mod.doc(nodes[0]))
    return [Node(node, doc) for node in nodes]

# vim: tabstop=4 expandtab shiftwidth=4