#!/usr/bin/env python

"""
DOM macros for virtual libxml2mod node methods and properties.

Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

import xml.dom

# Try the conventional import first.

try:
    import libxml2mod
except ImportError:
    from libxmlmods import libxml2mod

# NOTE: libxml2 seems to use UTF-8 throughout.

def from_unicode(s):

    """
    Return 's' in a form suitable for passing to libxml2mod. Unicode objects
    are encoded as UTF-8 byte strings. Any other value is returned unchanged,
    provided that 'unicode(s)' succeeds; otherwise a TypeError is raised.
    """

    if isinstance(s, unicode):
        return s.encode("utf-8")

    # The string might contain non-ASCII characters, thus upsetting libxml2
    # as it encounters a non-UTF-8 string.
    try:
        unicode(s)
    except UnicodeError:
        raise TypeError("Please use Unicode for non-ASCII data.")
    return s

def to_unicode(s):

    """
    Return 's' decoded from UTF-8 if it is a plain string; any other value
    (including None) is returned unchanged.
    """

    if isinstance(s, str):
        return unicode(s, encoding="utf-8")
    return s

def check_document(node, tmp):

    """
    Check that 'tmp' may be placed alongside 'node': if 'tmp' has an owner
    document which is not that of 'node', raise xml.dom.WrongDocumentErr (a
    DOMException whose code is WRONG_DOCUMENT_ERR).
    """

    tmp_document = Node_ownerDocument(tmp)

    # NOTE(review): this is an identity test on whatever objects libxml2mod
    # NOTE(review): returns for the documents - confirm that the same
    # NOTE(review): document always yields the same object.
    if tmp_document is not None and Node_ownerDocument(node) is not tmp_document:
        # xml.dom.DOMException refuses direct instantiation (it raises
        # RuntimeError), so the specific subclass must be raised instead.
        raise xml.dom.WrongDocumentErr()

def get_ns(ns):

    """
    Return the namespace URI held by the namespace definition node 'ns' as
    Unicode, or None for the empty namespace.
    """

    out_ns = to_unicode(libxml2mod.xmlNodeGetContent(ns))
    # Detect "" and produce None as the empty namespace.
    if out_ns:
        return out_ns
    return None

def _get_prefix_and_localName(name):

    """
    Split the qualified 'name' at its colon, returning the prefix and local
    name. An unprefixed name yields None as the prefix. A name with more than
    one colon yields (None, None).
    """

    parts = name.split(":")
    if len(parts) == 1:
        return None, name
    if len(parts) == 2:
        return parts
    # NOTE: Should raise an exception.
    return None, None

def _find_namespace_for_prefix(node, prefix):

    "Find the namespace definition node in the given 'node' for 'prefix'."

    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        if libxml2mod.name(current) == prefix:
            return current
        current = libxml2mod.next(current)
    return None

def _find_namespace(node, ns, prefix):

    """
    Find the namespace definition node in the given 'node' for the given 'ns'
    and 'prefix'. The definitions made on 'node' are searched first, then the
    namespace that 'node' itself uses. None is returned if nothing matches.
    """

    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        if _check_namespace(current, ns, prefix):
            return current
        current = libxml2mod.next(current)

    # Fall back to the namespace actually employed by the node.
    node_ns = libxml2mod.xmlNodeGetNs(node)
    if node_ns is not None and _check_namespace(node_ns, ns, prefix):
        return node_ns
    return None

def _check_namespace(current, ns, prefix):

    """
    Check the 'current' namespace definition node against 'ns' and 'prefix',
    returning 1 for a match and 0 otherwise. A 'prefix' of None matches any
    prefix.
    """

    current_ns = get_ns(current)
    current_prefix = libxml2mod.name(current)
    if ns == current_ns and (prefix is None or prefix == current_prefix):
        return 1
    return 0

def _make_namespace(node, ns, prefix, set_default=0):

    """
    Make a new namespace definition node within the given 'node' for 'ns',
    'prefix', setting the default namespace on 'node' when 'prefix' is None and
    'set_default' is set to a true value (unlike the default value for that
    parameter). Return the new definition, or None if none was made.
    """

    if prefix is not None or set_default:
        return libxml2mod.xmlNewNs(node, ns, prefix)
    return None

def _get_invented_prefix(node, ns):

    """
    Return a prefix of the form "NS0", "NS1", ... which is not used by any
    namespace definition made on 'node'. The 'ns' argument is not consulted.
    """

    # Collect the prefixes already defined on the node.
    prefixes = []
    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        prefixes.append(libxml2mod.name(current))
        current = libxml2mod.next(current)

    # Choose the first numbered prefix which is still free.
    i = 0
    while 1:
        prefix = "NS%d" % i
        if prefix not in prefixes:
            return prefix
        i += 1

# Mapping from the node type labels reported by libxml2mod.type to the DOM
# node type constants.

_nodeTypes = {
    "attribute" : xml.dom.Node.ATTRIBUTE_NODE,
    "cdata" : xml.dom.Node.CDATA_SECTION_NODE,
    "comment" : xml.dom.Node.COMMENT_NODE,
    "document_xml" : xml.dom.Node.DOCUMENT_NODE,
    "document_html" : xml.dom.Node.DOCUMENT_NODE,
    "doctype" : xml.dom.Node.DOCUMENT_TYPE_NODE,
    "dtd" : xml.dom.Node.DOCUMENT_TYPE_NODE, # NOTE: Needs verifying.
    "element" : xml.dom.Node.ELEMENT_NODE,
    "entity" : xml.dom.Node.ENTITY_NODE,
    "entity_ref" : xml.dom.Node.ENTITY_REFERENCE_NODE,
    "notation" : xml.dom.Node.NOTATION_NODE,
    "pi" : xml.dom.Node.PROCESSING_INSTRUCTION_NODE,
    "text" : xml.dom.Node.TEXT_NODE
    }

# Mapping from DOM node type constants back to a label. Several labels share
# a constant, in which case only one of them is retained.

_reverseNodeTypes = {}
for label, value in _nodeTypes.items():
    _reverseNodeTypes[value] = label

def Node_ownerDocument(node):

    "Return the document of 'node' as reported by libxml2mod.doc."

    return libxml2mod.doc(node)

def Node_nodeType(node):

    """
    Return the DOM node type constant for 'node'. A KeyError is raised for
    libxml2mod type labels which have no entry in the node type mapping.
    """

    return _nodeTypes[libxml2mod.type(node)]

def Node_childNodes(node):

    "Return a list of the children of 'node', leaving out doctype nodes."

    # NOTE: Consider a generator instead.

    child_nodes = []
    child = libxml2mod.children(node)
    while child is not None:
        # Remove doctypes.
        if Node_nodeType(child) != xml.dom.Node.DOCUMENT_TYPE_NODE:
            child_nodes.append(child)
        child = libxml2mod.next(child)
    return child_nodes

def Node_attributes(node):

    """
    Return a dictionary mapping (namespace URI, name) tuples to the attribute
    nodes of 'node'. Attributes without a namespace use None as the URI.
    Namespace declarations (xmlns attributes) are not included.
    """

    attributes = {}

    # Include normal attributes.

    current = libxml2mod.properties(node)
    while current is not None:
        ns = libxml2mod.xmlNodeGetNs(current)
        if ns is not None:
            attributes[(get_ns(ns), libxml2mod.name(current))] = current
        else:
            attributes[(None, libxml2mod.name(current))] = current
        current = libxml2mod.next(current)

    # Include xmlns attributes.

    #current = libxml2mod.xmlNodeGetNsDefs(node)
    #while current is not None:
    #    ns = get_ns(current)
    #    prefix = libxml2mod.name(current)
    #    attributes[(xml.dom.XMLNS_NAMESPACE, "xmlns:" + prefix)] = ns # NOTE: Need a real node here.
    #    current = libxml2mod.next(current)

    return attributes

def Node_namespaceURI(node):

    "Return the namespace URI of 'node', or None if it has no namespace."

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is not None:
        return get_ns(ns)
    return None

def Node_nodeValue(node):

    "Return the content of 'node' as Unicode."

    return to_unicode(libxml2mod.xmlNodeGetContent(node))

# NOTE: This is not properly exposed in the libxml2macro interface as the
# NOTE: writable form of nodeValue.

def Node_setNodeValue(node, value):

    "Set the content of 'node' to 'value'."

    # NOTE: Cannot set attribute node values.
    libxml2mod.xmlNodeSetContent(node, from_unicode(value))

# NOTE: Verify this. The data attribute should only really exist for text,
# NOTE: character data, processing instructions and comments.

# The 'data' and 'textContent' properties are served by the same accessor as
# 'nodeValue'.

Node_data = Node_nodeValue

Node_textContent = Node_nodeValue

def Node_prefix(node):

    "Return the namespace prefix of 'node' as Unicode, or None if it has none."

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is not None:
        return to_unicode(libxml2mod.name(ns))
    return None

def Node_nodeName(node):

    "Return the name of 'node', qualified by its prefix if it has one."

    prefix = Node_prefix(node)
    local_name = Node_localName(node)
    if prefix is not None:
        return prefix + ":" + local_name
    return local_name

def Node_tagName(node):

    "Return the node name of 'node' if it is an element, or None otherwise."

    if libxml2mod.type(node) == "element":
        return Node_nodeName(node)
    return None

def Node_localName(node):

    "Return the unqualified name of 'node' as Unicode."

    return to_unicode(libxml2mod.name(node))

def Node_parentNode(node):

    "Return the parent of 'node', or None if 'node' is an XML document."

    if libxml2mod.type(node) == "document_xml":
        return None
    return libxml2mod.parent(node)

def Node_previousSibling(node):

    "Return the node preceding 'node', or None if there is no such node."

    return libxml2mod.prev(node)

def Node_nextSibling(node):

    "Return the node following 'node', or None if there is no such node."

    return libxml2mod.next(node)

def Node_doctype(node):

    "Return the result of libxml2mod.xmlGetIntSubset for 'node'."

    return libxml2mod.xmlGetIntSubset(node)

def Node_hasAttributeNS(node, ns, localName):

    """
    Return whether 'node' has an attribute with the given 'ns' and
    'localName', or a namespace definition for 'ns' using 'localName' as its
    prefix.
    """

    return Node_getAttributeNS(node, ns, localName) is not None or \
        _find_namespace(node, ns, localName) is not None

def Node_hasAttribute(node, name):

    "Return whether 'node' has an attribute with the given 'name'."

    return Node_getAttribute(node, name) is not None

def Node_getAttributeNS(node, ns, localName):

    """
    Return the value of the attribute of 'node' having the given 'ns' and
    'localName', or None if there is none. Where 'ns' is the xmlns namespace,
    'localName' is treated as a prefix and the namespace URI declared for it
    on 'node' is returned instead.
    """

    if ns == xml.dom.XMLNS_NAMESPACE:
        ns_def = _find_namespace_for_prefix(node, localName)
        if ns_def is not None:
            return get_ns(ns_def)
        return None
    return to_unicode(libxml2mod.xmlGetNsProp(node, localName, ns))

def Node_getAttribute(node, name):

    "Return the value of the attribute of 'node' having the given 'name'."

    return to_unicode(libxml2mod.xmlGetProp(node, name))

def Node_getAttributeNodeNS(node, ns, localName):

    """
    Return the attribute node of 'node' for the given 'ns' and 'localName'.
    A KeyError is raised if no such attribute exists.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(ns, localName)]

def Node_getAttributeNode(node, name):

    """
    Return the attribute node of 'node' for the given 'name' in no namespace.
    A KeyError is raised if no such attribute exists.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(None, name)]

def Node_setAttributeNS(node, ns, name, value):

    """
    Set the attribute with the qualified 'name' in namespace 'ns' on 'node'
    to 'value'. Setting xmlns:prefix in the xmlns namespace makes a namespace
    declaration instead, unless a matching one is already present.
    """

    ns, name, value = map(from_unicode, [ns, name, value])
    prefix, localName = _get_prefix_and_localName(name)

    # Detect setting of xmlns:localName=value, looking for cases where
    # x:attr=value have caused the definition of xmlns:x=y (as a declaration
    # with prefix=x, ns=y).
    if prefix == "xmlns" and ns == xml.dom.XMLNS_NAMESPACE:
        if _find_namespace(node, value, localName):
            return
        _make_namespace(node, value, localName, set_default=0)

    # For non-xmlns attributes, we find or make a namespace declaration and then
    # set an attribute.
    elif ns is not None:
        # Look for a suitable namespace.
        new_ns = _find_namespace(node, ns, prefix)
        # Create a declaration if no suitable one was found.
        if new_ns is None:
            # Invent a prefix for unprefixed attributes with namespaces.
            if prefix is None:
                prefix = _get_invented_prefix(node, ns)
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
        libxml2mod.xmlSetNsProp(node, new_ns, localName, value)

    else:
        # NOTE: Needs verifying: what should happen to the namespace?
        # NOTE: This also catches the case where None is the element's
        # NOTE: namespace and is also used for the attribute.
        libxml2mod.xmlSetNsProp(node, None, localName, value)

def Node_setAttribute(node, name, value):

    "Set the attribute with the given 'name' on 'node' to 'value'."

    name, value = map(from_unicode, [name, value])

    libxml2mod.xmlSetProp(node, name, value)

def Node_setAttributeNodeNS(node, attr):

    """
    Copy the namespace, name and value of the attribute node 'attr' onto
    'node' as a namespaced attribute.
    """

    # NOTE: Not actually putting the node on the element.
    Node_setAttributeNS(node, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

def Node_setAttributeNode(node, attr):

    "Copy the name and value of the attribute node 'attr' onto 'node'."

    # NOTE: Not actually putting the node on the element.
    Node_setAttribute(node, Node_nodeName(attr), Node_nodeValue(attr))

def Node_removeAttributeNS(node, ns, localName):

    """
    Remove the attribute with the given 'ns' and 'localName' from 'node'.
    A KeyError is raised (by the attribute lookup) if it does not exist.
    """

    attr = Node_getAttributeNodeNS(node, ns, localName)
    libxml2mod.xmlUnsetNsProp(node, libxml2mod.xmlNodeGetNs(attr), libxml2mod.name(attr))

def Node_removeAttribute(node, name):

    "Remove the attribute with the given 'name' from 'node'."

    name = from_unicode(name)
    libxml2mod.xmlUnsetProp(node, name)

def Node_createElementNS(node, ns, name):

    """
    Return a new element with the qualified 'name' in the namespace 'ns'.
    The 'node' argument is not consulted; the element is not attached to
    anything.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    new_node = libxml2mod.xmlNewNode(localName)

    # If the namespace is not empty, set the declaration.
    if ns is not None:
        new_ns = _find_namespace(new_node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, ns, prefix, set_default=1)
        libxml2mod.xmlSetNs(new_node, new_ns)

    # If the namespace is empty, set a "null" declaration.
    elif prefix is not None:
        new_ns = _find_namespace(new_node, "", prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, "", prefix)
        libxml2mod.xmlSetNs(new_node, new_ns)

    else:
        libxml2mod.xmlSetNs(new_node, None)
        Node_setAttribute(new_node, "xmlns", "")

    return new_node

def Node_createElement(node, name):

    """
    Return a new element with the given 'name'. The 'node' argument is not
    consulted.
    """

    name = from_unicode(name)

    return libxml2mod.xmlNewNode(name)

def Node_createAttributeNS(node, ns, name):

    """
    Return a new attribute on 'node' with the qualified 'name' in the
    namespace 'ns', finding or making a namespace declaration on 'node' where
    'ns' is not None.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    # NOTE: Does it make sense to set the namespace if it is empty?
    if ns is not None:
        new_ns = _find_namespace(node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
    else:
        new_ns = None
    return libxml2mod.xmlNewNsProp(node, new_ns, localName, None)

def Node_createAttribute(node, name):

    "Return a new attribute on 'node' with the given 'name' and no namespace."

    name = from_unicode(name)

    # NOTE: xmlNewProp does not seem to work.
    return Node_createAttributeNS(node, None, name)

def Node_createTextNode(node, value):

    """
    Return a new text node containing 'value'. The 'node' argument is not
    consulted.
    """

    value = from_unicode(value)

    return libxml2mod.xmlNewText(value)

def Node_createComment(node, value):

    """
    Return a new comment node containing 'value'. The 'node' argument is not
    consulted.
    """

    value = from_unicode(value)

    return libxml2mod.xmlNewComment(value)

def Node_createCDATASection(node, value):

    """
    Return a new CDATA section containing 'value', created using the document
    of 'node'.
    """

    value = from_unicode(value)

    # The length is that of the encoded (byte) string.
    return libxml2mod.xmlNewCDataBlock(Node_ownerDocument(node), value, len(value))

def Node_insertBefore(node, tmp, oldNode):

    """
    Insert 'tmp' immediately before 'oldNode', after checking 'tmp' against
    the document of 'node'. The result of the libxml2mod call is returned.
    """

    check_document(node, tmp)
    return libxml2mod.xmlAddPrevSibling(oldNode, tmp)

def Node_replaceChild(node, tmp, oldNode):

    """
    Replace 'oldNode' with 'tmp', after checking 'tmp' against the document of
    'node'. The result of the libxml2mod call is returned.
    """

    check_document(node, tmp)
    return libxml2mod.xmlReplaceNode(oldNode, tmp)

def Node_appendChild(node, tmp):

    """
    Add 'tmp' as a child of 'node', after checking 'tmp' against the document
    of 'node'. The result of the libxml2mod call is returned.
    """

    check_document(node, tmp)
    return libxml2mod.xmlAddChild(node, tmp)

def Node_removeChild(node, child):

    "Unlink 'child' from its surroundings. The 'node' argument is not consulted."

    libxml2mod.xmlUnlinkNode(child)

def Node_importNode(node, other, deep):

    """
    Return a copy of the libxml2mod node 'other' made using 'node' as the
    creation context. Elements (with their attributes), text nodes, comments
    and CDATA sections are supported; with a true 'deep' value, the children
    of elements are imported recursively. Any other node type causes
    xml.dom.NotSupportedErr (a DOMException) to be raised.
    """

    node_type = Node_nodeType(other)

    if node_type == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, Node_namespaceURI(other), Node_tagName(other))
        for attr in Node_attributes(other).values():
            Node_setAttributeNS(imported_element, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

        if deep:
            for child in Node_childNodes(other):
                imported_child = Node_importNode(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif node_type == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, Node_nodeValue(other))

    elif node_type == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, Node_data(other))

    elif node_type == xml.dom.Node.CDATA_SECTION_NODE:
        return Node_createCDATASection(node, Node_data(other))

    # xml.dom.DOMException refuses direct instantiation (it raises
    # RuntimeError), so the specific subclass is raised instead. The message
    # names the node type by its label rather than showing the raw node.
    raise xml.dom.NotSupportedErr("Node type '%s' (%d) not supported." % (_reverseNodeTypes[node_type], node_type))

def Node_importNode_DOM(node, other, deep):

    """
    Return a copy of 'other', an object offering the standard DOM node
    attributes, made using 'node' as the creation context. Elements (with
    their attributes), text nodes, comments and CDATA sections are supported;
    with a true 'deep' value, the children of elements are imported
    recursively. Any other node type causes xml.dom.NotSupportedErr (a
    DOMException) to be raised.
    """

    node_type = other.nodeType

    if node_type == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, other.namespaceURI, other.tagName)
        for attr in other.attributes.values():
            Node_setAttributeNS(imported_element, attr.namespaceURI, attr.nodeName, attr.nodeValue)

        if deep:
            for child in other.childNodes:
                imported_child = Node_importNode_DOM(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif node_type == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, other.nodeValue)

    elif node_type == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, other.data)

    elif node_type == xml.dom.Node.CDATA_SECTION_NODE:
        return Node_createCDATASection(node, other.data)

    # Foreign DOM nodes may have types with no libxml2mod label (document
    # fragments, for example): fall back to the number so that the intended
    # exception is raised rather than a KeyError.
    label = _reverseNodeTypes.get(node_type, node_type)

    # xml.dom.DOMException refuses direct instantiation (it raises
    # RuntimeError), so the specific subclass is raised instead.
    raise xml.dom.NotSupportedErr("Node type '%s' (%d) not supported." % (label, node_type))

def Node_xpath(node, expr, variables=None, namespaces=None):

    """
    Evaluate the XPath expression 'expr' with 'node' as the context node and
    return the libxml2mod result. The optional 'namespaces' dictionary maps
    prefixes to namespace URIs to be registered for the evaluation. The
    'variables' argument is accepted but not currently used.
    """

    expr = from_unicode(expr)

    context = libxml2mod.xmlXPathNewContext(Node_ownerDocument(node) or node)

    # Release the context even if registration or evaluation fails.
    try:
        libxml2mod.xmlXPathSetContextNode(context, node)
        # NOTE: Discover namespaces from the node.
        # NOTE: Work out how to specify paths without having to use prefixes on
        # NOTE: names all the time.
        for prefix, ns in (namespaces or {}).items():
            libxml2mod.xmlXPathRegisterNs(context, prefix, ns)
        # NOTE: No such functions are exposed in current versions of libxml2.
        #for (prefix, ns), value in (variables or {}).items():
        #    value = from_unicode(value)
        #    libxml2mod.xmlXPathRegisterVariableNS(context, prefix, ns, value)
        return libxml2mod.xmlXPathEval(expr, context)
    finally:
        libxml2mod.xmlXPathFreeContext(context)

# Exceptions.

class LSException(Exception):

    "DOM Level 3 Load/Save exception."

    # Exception codes, supplied as the argument when raising.
    PARSE_ERR = 81
    SERIALIZE_ERR = 82

# Utility functions.

def createDocument(namespaceURI, localName, doctype):

    """
    Return a new document. If 'localName' is not None, a root element with
    that name in 'namespaceURI' is created and added. If 'doctype' is not
    None, an internal subset is created from its localName, publicId and
    systemId attributes.
    """

    # NOTE: Fixed to use version 1.0 only.
    d = libxml2mod.xmlNewDoc("1.0")
    if localName is not None:
        # NOTE: Verify that this is always what should occur.
        root = Node_createElementNS(d, namespaceURI, localName)
        Node_appendChild(d, root)
    if doctype is not None:
        libxml2mod.xmlCreateIntSubset(d, doctype.localName, doctype.publicId, doctype.systemId)
    return d

def _parse_xml_context(context, unfinished):

    """
    Configure the XML parser 'context', parse with it and return the resulting
    document. LSException (with PARSE_ERR) is raised if there is no context to
    parse with, or if the document is not well-formed and 'unfinished' is not
    set to a true value.
    """

    # A missing context means that the input could not be opened at all: report
    # it as a parsing failure rather than handing None to the parser calls.
    if context is None:
        raise LSException(LSException.PARSE_ERR)

    # NOTE: Switching off validation and remote DTD resolution.
    libxml2mod.xmlParserSetPedantic(context, 0)
    libxml2mod.xmlParserSetValidate(context, 0)
    libxml2mod.xmlCtxtUseOptions(context, XML_PARSE_NOERROR | XML_PARSE_NOWARNING | XML_PARSE_NONET)
    libxml2mod.xmlParseDocument(context)
    doc = libxml2mod.xmlParserGetDoc(context)
    if unfinished or libxml2mod.xmlParserGetWellFormed(context):
        return doc
    raise LSException(LSException.PARSE_ERR)

def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0):

    """
    Parse 'stream_or_string' and return the document. An object with a 'read'
    attribute is read in full and parsed as a string; anything else is passed
    to parseFile. The remaining arguments are passed on unchanged.
    """

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html=html, htmlencoding=htmlencoding, unfinished=unfinished)
    return parseFile(stream_or_string, html=html, htmlencoding=htmlencoding, unfinished=unfinished)

def parseFile(s, html=0, htmlencoding=None, unfinished=0):

    """
    Parse the file named by 's' and return the document. With a true 'html'
    value, the HTML reader is used with 'htmlencoding'. Otherwise the file is
    parsed as XML, raising LSException if it cannot be opened, or if it is not
    well-formed and 'unfinished' is not set.
    """

    if not html:
        return _parse_xml_context(libxml2mod.xmlCreateFileParserCtxt(s), unfinished)
    return libxml2mod.htmlReadFile(s, htmlencoding, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseString(s, html=0, htmlencoding=None, unfinished=0):

    """
    Parse the string 's' and return the document. With a true 'html' value,
    the HTML reader is used with 'htmlencoding'. Otherwise the string is parsed
    as XML, raising LSException if it is not well-formed and 'unfinished' is
    not set.
    """

    if not html:
        return _parse_xml_context(libxml2mod.xmlCreateMemoryParserCtxt(s, len(s)), unfinished)

    # NOTE: URL given as None.
    html_url = None
    return libxml2mod.htmlReadMemory(s, len(s), html_url, htmlencoding,
        HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseURI(uri, html=0, htmlencoding=None, unfinished=0):

    """
    Parse the XML resource at 'uri' and return the document, raising
    LSException if it cannot be opened, or if it is not well-formed and
    'unfinished' is not set. HTML is not supported: a true 'html' value causes
    NotImplementedError to be raised.
    """

    if not html:
        return _parse_xml_context(libxml2mod.xmlCreateURLParserCtxt(uri, 0), unfinished)
    raise NotImplementedError("parseURI does not yet support HTML")

def toString(node, encoding=None, prettyprint=0):

    "Return 'node' serialised using the given 'encoding' and 'prettyprint'."

    return libxml2mod.serializeNode(node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    "Write the serialised form of 'node' to 'stream' using its 'write' method."

    stream.write(toString(node, encoding, prettyprint))

def toFile(node, f, encoding=None, prettyprint=0):

    "Save 'node' to 'f' using libxml2mod.saveNodeTo."

    libxml2mod.saveNodeTo(node, f, encoding, prettyprint)

# libxml2mod constants.

# Parser option flags, combined using bitwise "or" when configuring the XML
# and HTML parsers. Each option occupies a single bit.

HTML_PARSE_NOERROR = 1 << 5     # 32
HTML_PARSE_NOWARNING = 1 << 6   # 64
HTML_PARSE_NONET = 1 << 11      # 2048
XML_PARSE_NOERROR = 1 << 5      # 32
XML_PARSE_NOWARNING = 1 << 6    # 64
XML_PARSE_NONET = 1 << 11       # 2048

# vim: tabstop=4 expandtab shiftwidth=4