#!/usr/bin/env python

"""
DOM macros for virtual libxml2mod node methods and properties.

Copyright (C) 2003, 2004, 2005, 2006 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

import xml.dom

# Try the conventional import first.

try:
    import libxml2mod
except ImportError:
    from libxmlmods import libxml2mod

# NOTE: libxml2 seems to use UTF-8 throughout.

def from_unicode(s):

    """
    Return 's' in a form to be passed to libxml2mod: Unicode objects are encoded
    as UTF-8 and any other value is returned unchanged. A TypeError is raised
    if 'unicode(s)' fails for such a value.
    """

    if isinstance(s, unicode):
        return s.encode("utf-8")

    # The string might contain non-ASCII characters, thus upsetting libxml2
    # as it encounters a non-UTF-8 string.
    try:
        unicode(s)
    except UnicodeError:
        # The call form of raise is valid on both Python 2 and Python 3.
        raise TypeError("Please use Unicode for non-ASCII data.")
    return s

def to_unicode(s):

    """
    Return the plain string 's' decoded from UTF-8 as a Unicode object; any
    other kind of value is returned unchanged.
    """

    if isinstance(s, str):
        return unicode(s, encoding="utf-8")
    return s

def get_ns(ns):

    """
    Return the content of the namespace definition node 'ns' as Unicode, or
    None if that content is empty.
    """

    out_ns = to_unicode(libxml2mod.xmlNodeGetContent(ns))
    # Detect "" and produce None as the empty namespace.
    return out_ns or None

def _get_prefix_and_localName(name):

    """
    Split the qualified 'name' at ":" and return a (prefix, localName) tuple,
    with None as the prefix for names without a colon. Names containing more
    than one colon yield (None, None).
    """

    parts = name.split(":")
    if len(parts) == 1:
        return None, name
    elif len(parts) == 2:
        # Always hand back a tuple, like the other branches.
        return tuple(parts)
    else:
        # NOTE: Should raise an exception.
        return None, None

def _find_namespace_for_prefix(node, prefix):

    "Find the namespace definition node in the given 'node' for 'prefix'."

    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        if libxml2mod.name(current) == prefix:
            return current
        current = libxml2mod.next(current)
    return None

def _find_namespace(node, ns, prefix):

    """
    Find the namespace definition node in the given 'node' for the given 'ns'
    and 'prefix'. The definitions on 'node' are searched first, followed by the
    namespace of 'node' itself; None is returned if nothing matches.
    """

    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        if _check_namespace(current, ns, prefix):
            return current
        current = libxml2mod.next(current)

    # Fall back to the namespace employed by the node itself.
    node_ns = libxml2mod.xmlNodeGetNs(node)
    if node_ns is not None and _check_namespace(node_ns, ns, prefix):
        return node_ns
    return None

def _check_namespace(current, ns, prefix):

    """
    Check the 'current' namespace definition node against 'ns' and 'prefix',
    returning a true value if they match. A 'prefix' of None matches any prefix.
    """

    current_ns = get_ns(current)
    current_prefix = libxml2mod.name(current)
    return ns == current_ns and (prefix is None or prefix == current_prefix)

def _make_namespace(node, ns, prefix, set_default=0):

    """
    Make a new namespace definition node within the given 'node' for 'ns',
    'prefix', setting the default namespace on 'node' when 'prefix' is None and
    'set_default' is set to a true value (unlike the default value for that
    parameter). Return the new definition node, or None if none was made.
    """

    if prefix is not None or set_default:
        return libxml2mod.xmlNewNs(node, ns, prefix)
    return None

def _get_invented_prefix(node, ns):

    """
    Return the first prefix of the form "NS0", "NS1", ... which is not used by
    any of the namespace definitions on 'node'. The 'ns' parameter is not
    consulted.
    """

    prefixes = []
    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        prefixes.append(libxml2mod.name(current))
        current = libxml2mod.next(current)

    i = 0
    while 1:
        prefix = "NS%d" % i
        if prefix not in prefixes:
            return prefix
        i += 1

# A mapping from libxml2mod node type labels to DOM node type constants.

_nodeTypes = {
    "attribute" : xml.dom.Node.ATTRIBUTE_NODE,
    "comment" : xml.dom.Node.COMMENT_NODE,
    "document_xml" : xml.dom.Node.DOCUMENT_NODE,
    "document_html" : xml.dom.Node.DOCUMENT_NODE,
    "doctype" : xml.dom.Node.DOCUMENT_TYPE_NODE,
    "dtd" : xml.dom.Node.DOCUMENT_TYPE_NODE, # NOTE: Needs verifying.
    "element" : xml.dom.Node.ELEMENT_NODE,
    "entity" : xml.dom.Node.ENTITY_NODE,
    "entity_ref" : xml.dom.Node.ENTITY_REFERENCE_NODE,
    "notation" : xml.dom.Node.NOTATION_NODE,
    "pi" : xml.dom.Node.PROCESSING_INSTRUCTION_NODE,
    "text" : xml.dom.Node.TEXT_NODE
    }

# The reverse mapping, from DOM node type constants to labels. Where several
# labels share a constant, only one of them is retained.

_reverseNodeTypes = {}
for label, value in _nodeTypes.items():
    _reverseNodeTypes[value] = label

def Node_ownerDocument(node):

    "Return the document of 'node', or 'node' itself if there is none."

    return libxml2mod.doc(node) or node

def Node_nodeType(node):

    "Return the DOM node type constant for 'node'."

    return _nodeTypes[libxml2mod.type(node)]

def Node_childNodes(node):

    "Return a list of the children of 'node', leaving out document type nodes."

    # NOTE: Consider a generator instead.

    child_nodes = []
    child = libxml2mod.children(node)
    while child is not None:
        # Remove doctypes.
        if Node_nodeType(child) != xml.dom.Node.DOCUMENT_TYPE_NODE:
            child_nodes.append(child)
        child = libxml2mod.next(child)
    return child_nodes

def Node_attributes(node):

    """
    Return a dictionary mapping (namespace, name) tuples to the attribute nodes
    of 'node', with None as the namespace of attributes which have none.
    """

    attributes = {}

    # Include normal attributes.

    current = libxml2mod.properties(node)
    while current is not None:
        ns = libxml2mod.xmlNodeGetNs(current)
        if ns is not None:
            ns_uri = get_ns(ns)
        else:
            ns_uri = None
        attributes[(ns_uri, libxml2mod.name(current))] = current
        current = libxml2mod.next(current)

    # Include xmlns attributes.

    #current = libxml2mod.xmlNodeGetNsDefs(node)
    #while current is not None:
    #    ns = get_ns(current)
    #    prefix = libxml2mod.name(current)
    #    attributes[(xml.dom.XMLNS_NAMESPACE, "xmlns:" + prefix)] = ns # NOTE: Need a real node here.
    #    current = libxml2mod.next(current)

    return attributes

def Node_namespaceURI(node):

    "Return the namespace of 'node', or None if it has no namespace."

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is None:
        return None
    return get_ns(ns)

def Node_nodeValue(node):

    "Return the content of 'node' as Unicode."

    return to_unicode(libxml2mod.xmlNodeGetContent(node))

# NOTE: This is not properly exposed in the libxml2macro interface as the
# NOTE: writable form of nodeValue.

def Node_setNodeValue(node, value):

    "Set the content of 'node' to 'value'."

    # NOTE: Cannot set attribute node values.
    libxml2mod.xmlNodeSetContent(node, from_unicode(value))

# NOTE: Verify this.
Node_data = Node_nodeValue

def Node_prefix(node):

    "Return the namespace prefix of 'node' as Unicode, or None if it has no namespace."

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is None:
        return None
    return to_unicode(libxml2mod.name(ns))

def Node_nodeName(node):

    "Return the name of 'node', qualified by its prefix if it has one."

    prefix = Node_prefix(node)
    if prefix is not None:
        return prefix + ":" + Node_localName(node)
    return Node_localName(node)

def Node_tagName(node):

    "Return the qualified name of the element 'node', or None for other nodes."

    if libxml2mod.type(node) == "element":
        return Node_nodeName(node)
    return None

def Node_localName(node):

    "Return the name of 'node' as Unicode, without any prefix."

    return to_unicode(libxml2mod.name(node))

def Node_parentNode(node):

    "Return the parent of 'node', or None if 'node' is an XML document."

    if libxml2mod.type(node) == "document_xml":
        return None
    return libxml2mod.parent(node)

def Node_previousSibling(node):

    "Return the node before 'node', or None if there is no such node."

    return libxml2mod.prev(node)

def Node_nextSibling(node):

    "Return the node after 'node', or None if there is no such node."

    return libxml2mod.next(node)

def Node_doctype(node):

    "Return the internal subset of the document 'node'."

    return libxml2mod.xmlGetIntSubset(node)

def Node_hasAttributeNS(node, ns, localName):

    """
    Return whether 'node' has an attribute with the given 'ns' and 'localName',
    or a namespace definition for 'ns' with 'localName' as its prefix.
    """

    return Node_getAttributeNS(node, ns, localName) is not None or \
        _find_namespace(node, ns, localName) is not None

def Node_hasAttribute(node, name):

    "Return whether 'node' has an attribute with the given 'name'."

    return Node_getAttribute(node, name) is not None

def Node_getAttributeNS(node, ns, localName):

    """
    Return the value of the attribute on 'node' with the given 'ns' and
    'localName'. For the xmlns namespace, the namespace defined on 'node' for
    the prefix 'localName' is returned instead, or None if it is not defined.
    """

    if ns == xml.dom.XMLNS_NAMESPACE:
        ns_def = _find_namespace_for_prefix(node, localName)
        if ns_def is None:
            return None
        return get_ns(ns_def)
    return to_unicode(libxml2mod.xmlGetNsProp(node, localName, ns))

def Node_getAttribute(node, name):

    "Return the value of the attribute on 'node' with the given 'name'."

    return to_unicode(libxml2mod.xmlGetProp(node, name))

def Node_getAttributeNodeNS(node, ns, localName):

    """
    Return the attribute node on 'node' with the given 'ns' and 'localName'. A
    KeyError is raised if there is no such attribute.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(ns, localName)]

def Node_getAttributeNode(node, name):

    """
    Return the attribute node on 'node' with the given 'name' and no
    namespace. A KeyError is raised if there is no such attribute.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(None, name)]

def Node_setAttributeNS(node, ns, name, value):

    """
    Set the attribute on 'node' with the given 'ns' and qualified 'name' to
    'value', finding or making a namespace definition on 'node' as required.
    Setting an xmlns:prefix attribute makes a namespace definition instead.
    """

    ns, name, value = map(from_unicode, [ns, name, value])
    prefix, localName = _get_prefix_and_localName(name)

    # Detect setting of xmlns:localName=value, looking for cases where
    # x:attr=value have caused the definition of xmlns:x=y (as a declaration
    # with prefix=x, ns=y).
    if prefix == "xmlns" and ns == xml.dom.XMLNS_NAMESPACE:
        if _find_namespace(node, value, localName):
            return
        _make_namespace(node, value, localName, set_default=0)

    # For non-xmlns attributes, we find or make a namespace declaration and then
    # set an attribute.
    elif ns is not None:
        # Look for a suitable namespace.
        new_ns = _find_namespace(node, ns, prefix)
        # Create a declaration if no suitable one was found.
        if new_ns is None:
            # Invent a prefix for unprefixed attributes with namespaces.
            if prefix is None:
                prefix = _get_invented_prefix(node, ns)
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
        libxml2mod.xmlSetNsProp(node, new_ns, localName, value)

    else:
        # NOTE: Needs verifying: what should happen to the namespace?
        # NOTE: This also catches the case where None is the element's
        # NOTE: namespace and is also used for the attribute.
        libxml2mod.xmlSetNsProp(node, None, localName, value)

def Node_setAttribute(node, name, value):

    "Set the attribute on 'node' with the given 'name' to 'value'."

    name, value = map(from_unicode, [name, value])

    libxml2mod.xmlSetProp(node, name, value)

def Node_setAttributeNodeNS(node, attr):

    "Set an attribute on 'node' using the namespace, name and value of 'attr'."

    # NOTE: Not actually putting the node on the element.
    Node_setAttributeNS(node, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

def Node_setAttributeNode(node, attr):

    "Set an attribute on 'node' using the name and value of 'attr'."

    # NOTE: Not actually putting the node on the element.
    Node_setAttribute(node, Node_nodeName(attr), Node_nodeValue(attr))

def Node_removeAttributeNS(node, ns, localName):

    """
    Remove the attribute with the given 'ns' and 'localName' from 'node'. A
    KeyError is raised if there is no such attribute.
    """

    attr = Node_getAttributeNodeNS(node, ns, localName)
    libxml2mod.xmlUnsetNsProp(node, libxml2mod.xmlNodeGetNs(attr), libxml2mod.name(attr))

def Node_removeAttribute(node, name):

    "Remove the attribute with the given 'name' from 'node'."

    name = from_unicode(name)
    libxml2mod.xmlUnsetProp(node, name)

def Node_createElementNS(node, ns, name):

    """
    Return a new element with the given 'ns' and qualified 'name', carrying a
    suitable namespace declaration. The 'node' parameter is not consulted.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    new_node = libxml2mod.xmlNewNode(localName)

    # If the namespace is not empty, set the declaration.
    if ns is not None:
        new_ns = _find_namespace(new_node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, ns, prefix, set_default=1)
        libxml2mod.xmlSetNs(new_node, new_ns)

    # If the namespace is empty, set a "null" declaration.
    elif prefix is not None:
        new_ns = _find_namespace(new_node, "", prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, "", prefix)
        libxml2mod.xmlSetNs(new_node, new_ns)

    else:
        libxml2mod.xmlSetNs(new_node, None)
        Node_setAttribute(new_node, "xmlns", "")

    return new_node

def Node_createElement(node, name):

    "Return a new element with the given 'name'. The 'node' parameter is not consulted."

    name = from_unicode(name)

    return libxml2mod.xmlNewNode(name)

def Node_createAttributeNS(node, ns, name):

    """
    Return a new attribute node made on 'node' with the given 'ns' and qualified
    'name', finding or making a namespace definition on 'node' if 'ns' is not
    None.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    # NOTE: Does it make sense to set the namespace if it is empty?
    if ns is not None:
        new_ns = _find_namespace(node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
    else:
        new_ns = None
    return libxml2mod.xmlNewNsProp(node, new_ns, localName, None)

def Node_createAttribute(node, name):

    "Return a new attribute node made on 'node' with the given 'name' and no namespace."

    name = from_unicode(name)

    # NOTE: xmlNewProp does not seem to work.
    return Node_createAttributeNS(node, None, name)

def Node_createTextNode(node, value):

    "Return a new text node containing 'value'. The 'node' parameter is not consulted."

    value = from_unicode(value)

    return libxml2mod.xmlNewText(value)

def Node_createComment(node, value):

    "Return a new comment node containing 'value'. The 'node' parameter is not consulted."

    value = from_unicode(value)

    return libxml2mod.xmlNewComment(value)

def Node_insertBefore(node, tmp, oldNode):

    """
    Insert 'tmp' before 'oldNode'. Where 'oldNode' is None, 'tmp' is appended
    to the children of 'node', as the DOM specifies for a null reference node.
    """

    if oldNode is None:
        return libxml2mod.xmlAddChild(node, tmp)
    return libxml2mod.xmlAddPrevSibling(oldNode, tmp)

def Node_replaceChild(node, tmp, oldNode):

    "Replace 'oldNode' with 'tmp'. The 'node' parameter is not consulted."

    return libxml2mod.xmlReplaceNode(oldNode, tmp)

def Node_appendChild(node, tmp):

    "Add 'tmp' as a child of 'node'."

    return libxml2mod.xmlAddChild(node, tmp)

def Node_removeChild(node, child):

    "Unlink 'child'. The 'node' parameter is not consulted."

    libxml2mod.xmlUnlinkNode(child)

def Node_importNode(node, other, deep):

    """
    Return a copy of the libxml2mod node 'other', made using the creation
    functions of this module, copying the children of elements if 'deep' is
    true. Only elements, text nodes and comments are supported; a ValueError is
    raised for other kinds of node.
    """

    node_type = Node_nodeType(other)

    if node_type == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, Node_namespaceURI(other), Node_tagName(other))
        for attr in Node_attributes(other).values():
            Node_setAttributeNS(imported_element, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

        if deep:
            for child in Node_childNodes(other):
                imported_child = Node_importNode(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif node_type == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, Node_nodeValue(other))

    elif node_type == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, Node_data(other))

    # Report the node type label, not the raw node object, in the same way as
    # Node_importNode_DOM.
    raise ValueError("Node type '%s' (%d) not supported." % (_reverseNodeTypes[node_type], node_type))

def Node_importNode_DOM(node, other, deep):

    """
    Return a copy of the DOM node object 'other', made using the creation
    functions of this module, copying the children of elements if 'deep' is
    true. Only elements, text nodes and comments are supported; a ValueError is
    raised for other kinds of node.
    """

    if other.nodeType == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, other.namespaceURI, other.tagName)
        for attr in other.attributes.values():
            Node_setAttributeNS(imported_element, attr.namespaceURI, attr.nodeName, attr.nodeValue)

        if deep:
            for child in other.childNodes:
                imported_child = Node_importNode_DOM(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif other.nodeType == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, other.nodeValue)

    elif other.nodeType == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, other.data)

    # Foreign DOM nodes may have types with no libxml2mod label (CDATA sections,
    # for example), so avoid a KeyError which would hide the intended error.
    raise ValueError("Node type '%s' (%d) not supported." % (
        _reverseNodeTypes.get(other.nodeType, "unknown"), other.nodeType))

def Node_xpath(node, expr, variables=None, namespaces=None):

    """
    Evaluate the XPath expression 'expr' with 'node' as the context node and
    return the result. The optional 'namespaces' dictionary maps prefixes to
    namespaces for use in 'expr'; 'variables' is currently not used.
    """

    expr = from_unicode(expr)

    context = libxml2mod.xmlXPathNewContext(Node_ownerDocument(node))
    try:
        libxml2mod.xmlXPathSetContextNode(context, node)
        # NOTE: Discover namespaces from the node.
        # NOTE: Work out how to specify paths without having to use prefixes on
        # NOTE: names all the time.
        for prefix, ns in (namespaces or {}).items():
            libxml2mod.xmlXPathRegisterNs(context, prefix, ns)
        # NOTE: No such functions are exposed in current versions of libxml2.
        #for (prefix, ns), value in (variables or {}).items():
        #    value = from_unicode(value)
        #    libxml2mod.xmlXPathRegisterVariableNS(context, prefix, ns, value)
        return libxml2mod.xmlXPathEval(expr, context)
    finally:
        # Free the context even if registration or evaluation fails.
        libxml2mod.xmlXPathFreeContext(context)

# Utility functions.

def createDocument(namespaceURI, localName, doctype):

    """
    Return a new document. If 'localName' is not None, a root element with the
    given 'namespaceURI' and 'localName' is added; if 'doctype' is not None, an
    internal subset is created from its localName, publicId and systemId.
    """

    # NOTE: Fixed to use version 1.0 only.
    d = libxml2mod.xmlNewDoc("1.0")
    if localName is not None:
        # NOTE: Verify that this is always what should occur.
        root = Node_createElementNS(d, namespaceURI, localName)
        Node_appendChild(d, root)
    if doctype is not None:
        libxml2mod.xmlCreateIntSubset(d, doctype.localName, doctype.publicId, doctype.systemId)
    return d

def _parse_xml_with_context(context):

    """
    Configure the XML parser 'context', parse the document and return it.
    """

    # NOTE: Switching off validation and remote DTD resolution.
    libxml2mod.xmlParserSetPedantic(context, 0)
    libxml2mod.xmlParserSetValidate(context, 0)
    libxml2mod.xmlCtxtUseOptions(context, XML_PARSE_NOERROR | XML_PARSE_NOWARNING | XML_PARSE_NONET)
    libxml2mod.xmlParseDocument(context)
    return libxml2mod.xmlParserGetDoc(context)

def parse(stream_or_string, html=0):

    """
    Parse 'stream_or_string' and return the document. An object with a 'read'
    attribute is read and parsed as a string; anything else is treated as a
    filename. HTML parsing is employed if 'html' is true.
    """

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html)
    return parseFile(stream_or_string, html)

def parseFile(s, html=0):

    "Parse the file named 's' and return the document, as HTML if 'html' is true."

    if not html:
        return _parse_xml_with_context(libxml2mod.xmlCreateFileParserCtxt(s))
    return libxml2mod.htmlReadFile(s, None,
        HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseString(s, html=0):

    "Parse the string 's' and return the document, as HTML if 'html' is true."

    if not html:
        return _parse_xml_with_context(libxml2mod.xmlCreateMemoryParserCtxt(s, len(s)))

    # NOTE: URL given as None.
    html_url = None
    return libxml2mod.htmlReadMemory(s, len(s), html_url, None,
        HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseURI(uri, html=0):

    """
    Parse the resource at 'uri' and return the document. HTML is not supported:
    a NotImplementedError is raised if 'html' is true.
    """

    if not html:
        return _parse_xml_with_context(libxml2mod.xmlCreateURLParserCtxt(uri, 0))
    raise NotImplementedError("parseURI does not yet support HTML")

def toString(node, encoding=None, prettyprint=0):

    "Return 'node' serialised using the given 'encoding' and 'prettyprint' setting."

    return libxml2mod.serializeNode(node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    "Write the serialised form of 'node' to 'stream'."

    stream.write(toString(node, encoding, prettyprint))

def toFile(node, f, encoding=None, prettyprint=0):

    "Save the serialised form of 'node' to 'f'."

    libxml2mod.saveNodeTo(node, f, encoding, prettyprint)

# libxml2mod constants.

HTML_PARSE_NOERROR = 32
HTML_PARSE_NOWARNING = 64
HTML_PARSE_NONET = 2048
XML_PARSE_NOERROR = 32
XML_PARSE_NOWARNING = 64
XML_PARSE_NONET = 2048

# vim: tabstop=4 expandtab shiftwidth=4