#!/usr/bin/env python

"""
DOM macros for virtual libxml2mod node methods and properties.

Copyright (C) 2003, 2004, 2005 Paul Boddie <paul@boddie.org.uk>

This library is free software; you can redistribute it and/or
modify it under the terms of the GNU Lesser General Public
License as published by the Free Software Foundation; either
version 2.1 of the License, or (at your option) any later version.

This library is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU
Lesser General Public License for more details.

You should have received a copy of the GNU Lesser General Public
License along with this library; if not, write to the Free Software
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA
"""

import xml.dom
import libxml2mod

# NOTE: libxml2 seems to use UTF-8 throughout.

def from_unicode(s):

    """
    Return 's' in a form suitable for libxml2mod: Unicode objects are encoded
    as UTF-8, whereas other values are returned unchanged provided that they
    can be converted to Unicode using the default codec. A TypeError is raised
    for values which cannot be converted in this way.
    """

    if isinstance(s, unicode):
        return s.encode("utf-8")
    else:
        # The string might contain non-ASCII characters, thus upsetting libxml2
        # as it encounters a non-UTF-8 string.
        try:
            unicode(s)
        except UnicodeError:
            raise TypeError("Please use Unicode for non-ASCII data.")
        return s

def to_unicode(s):

    """
    Return the plain string 's' decoded from UTF-8 as a Unicode object; any
    other kind of value is returned unchanged.
    """

    if isinstance(s, str):
        return unicode(s, encoding="utf-8")
    else:
        return s

def get_ns(ns):

    """
    Return the content of the namespace definition node 'ns' as Unicode, or
    None if that content is empty.
    """

    out_ns = to_unicode(libxml2mod.xmlNodeGetContent(ns))
    # Detect "" and produce None as the empty namespace.
    if out_ns:
        return out_ns
    else:
        return None

def _get_prefix_and_localName(name):

    """
    Split the qualified 'name' at its colon and return a (prefix, localName)
    tuple. Names without a colon yield (None, name); names with more than one
    colon yield (None, None).
    """

    parts = name.split(":")
    if len(parts) == 1:
        return None, name
    elif len(parts) == 2:
        # Return a tuple here, too, so that all branches agree on the type.
        return parts[0], parts[1]
    else:
        # NOTE: Should raise an exception.
        return None, None

def _find_namespace_for_prefix(node, prefix):

    "Find the namespace definition node in the given 'node' for 'prefix'."

    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        if libxml2mod.name(current) == prefix:
            return current
        current = libxml2mod.next(current)
    return None

def _find_namespace(node, ns, prefix):

    """
    Find the namespace definition node in the given 'node' for the given 'ns'
    and 'prefix'. The definitions made on 'node' are searched first, followed
    by the namespace of 'node' itself; None is returned if nothing matches.
    """

    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        if _check_namespace(current, ns, prefix):
            return current
        current = libxml2mod.next(current)

    # Fall back to the namespace actually employed by the node.

    node_ns = libxml2mod.xmlNodeGetNs(node)
    if node_ns is not None and _check_namespace(node_ns, ns, prefix):
        return node_ns
    return None

def _check_namespace(current, ns, prefix):

    """
    Check the 'current' namespace definition node against 'ns' and 'prefix',
    returning a true value if the namespace matches and if 'prefix' is either
    None or equal to the prefix of the definition.
    """

    current_ns = get_ns(current)
    current_prefix = libxml2mod.name(current)
    return ns == current_ns and (prefix is None or prefix == current_prefix)

def _make_namespace(node, ns, prefix, set_default=0):

    """
    Make a new namespace definition node within the given 'node' for 'ns',
    'prefix', setting the default namespace on 'node' when 'prefix' is None and
    'set_default' is set to a true value (unlike the default value for that
    parameter). Return the new definition node, or None if none was made.
    """

    if prefix is not None or set_default:
        new_ns = libxml2mod.xmlNewNs(node, ns, prefix)
    else:
        new_ns = None
    return new_ns

def _get_invented_prefix(node, ns):

    """
    Return a prefix of the form "NS0", "NS1", ... which is not used by any of
    the namespace definitions made on 'node'. The 'ns' parameter is not
    consulted.
    """

    prefixes = []
    current = libxml2mod.xmlNodeGetNsDefs(node)
    while current is not None:
        prefixes.append(libxml2mod.name(current))
        current = libxml2mod.next(current)

    i = 0
    while 1:
        prefix = "NS%d" % i
        if prefix not in prefixes:
            return prefix
        i += 1

# Mapping from libxml2mod node type labels to DOM node type constants.
# NOTE: "cdata" and "fragment" are believed to be the labels reported for CDATA
# NOTE: sections and document fragments; without entries for them,
# NOTE: Node_nodeType raises KeyError for such nodes.

_nodeTypes = {
    "attribute" : xml.dom.Node.ATTRIBUTE_NODE,
    "cdata" : xml.dom.Node.CDATA_SECTION_NODE,
    "comment" : xml.dom.Node.COMMENT_NODE,
    "document_xml" : xml.dom.Node.DOCUMENT_NODE,
    "document_html" : xml.dom.Node.DOCUMENT_NODE,
    "doctype" : xml.dom.Node.DOCUMENT_TYPE_NODE,
    "dtd" : xml.dom.Node.DOCUMENT_TYPE_NODE, # NOTE: Needs verifying.
    "element" : xml.dom.Node.ELEMENT_NODE,
    "entity" : xml.dom.Node.ENTITY_NODE,
    "entity_ref" : xml.dom.Node.ENTITY_REFERENCE_NODE,
    "fragment" : xml.dom.Node.DOCUMENT_FRAGMENT_NODE,
    "notation" : xml.dom.Node.NOTATION_NODE,
    "pi" : xml.dom.Node.PROCESSING_INSTRUCTION_NODE,
    "text" : xml.dom.Node.TEXT_NODE
    }

# Mapping from DOM node type constants back to libxml2mod labels. Where several
# labels share a constant, only one of them is retained.

_reverseNodeTypes = {}
for label, value in _nodeTypes.items():
    _reverseNodeTypes[value] = label

def Node_ownerDocument(node):

    """
    Return the document of 'node', or 'node' itself where libxml2mod provides
    no document for it.
    """

    return libxml2mod.doc(node) or node

def Node_nodeType(node):

    """
    Return the DOM node type constant for 'node'. A KeyError is raised for
    libxml2mod node types which have no entry in the node type table.
    """

    return _nodeTypes[libxml2mod.type(node)]

def Node_childNodes(node):

    "Return a list of the children of 'node', omitting document type nodes."

    # NOTE: Consider a generator instead.

    child_nodes = []
    child = libxml2mod.children(node)
    while child is not None:
        # Remove doctypes.
        if Node_nodeType(child) != xml.dom.Node.DOCUMENT_TYPE_NODE:
            child_nodes.append(child)
        child = libxml2mod.next(child)
    return child_nodes

def Node_attributes(node):

    """
    Return a dictionary mapping (namespace, name) tuples to the attribute nodes
    of 'node', where the namespace is None for attributes without one.
    """

    attributes = {}

    # Include normal attributes.

    current = libxml2mod.properties(node)
    while current is not None:
        ns = libxml2mod.xmlNodeGetNs(current)
        if ns is not None:
            uri = get_ns(ns)
        else:
            uri = None
        attributes[(uri, libxml2mod.name(current))] = current
        current = libxml2mod.next(current)

    # Include xmlns attributes.

    #current = libxml2mod.xmlNodeGetNsDefs(node)
    #while current is not None:
    #    ns = get_ns(current)
    #    prefix = libxml2mod.name(current)
    #    attributes[(xml.dom.XMLNS_NAMESPACE, "xmlns:" + prefix)] = ns # NOTE: Need a real node here.
    #    current = libxml2mod.next(current)

    return attributes

def Node_namespaceURI(node):

    "Return the namespace of 'node' as Unicode, or None if it has none."

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is not None:
        return get_ns(ns)
    else:
        return None

def Node_nodeValue(node):

    "Return the content of 'node' as Unicode."

    return to_unicode(libxml2mod.xmlNodeGetContent(node))

# NOTE: This is not properly exposed in the libxml2macro interface as the
# NOTE: writable form of nodeValue.

def Node_setNodeValue(node, value):

    "Set the content of 'node' to the given 'value'."

    # NOTE: Cannot set attribute node values.
    libxml2mod.xmlNodeSetContent(node, from_unicode(value))

# NOTE: Verify this.
Node_data = Node_nodeValue

def Node_prefix(node):

    "Return the namespace prefix of 'node' as Unicode, or None if it has no namespace."

    ns = libxml2mod.xmlNodeGetNs(node)
    if ns is not None:
        return to_unicode(libxml2mod.name(ns))
    else:
        return None

def Node_nodeName(node):

    "Return the name of 'node', qualified by its prefix if it has one."

    prefix = Node_prefix(node)
    if prefix is not None:
        return prefix + ":" + Node_localName(node)
    else:
        return Node_localName(node)

def Node_tagName(node):

    "Return the qualified name of 'node' if it is an element, or None otherwise."

    if libxml2mod.type(node) == "element":
        return Node_nodeName(node)
    else:
        return None

def Node_localName(node):

    "Return the unqualified name of 'node' as Unicode."

    return to_unicode(libxml2mod.name(node))

def Node_parentNode(node):

    "Return the parent of 'node', or None for document nodes."

    # Both kinds of document are treated as having no parent, consistent with
    # their both being DOM document nodes.

    if libxml2mod.type(node) in ("document_xml", "document_html"):
        return None
    else:
        return libxml2mod.parent(node)

def Node_previousSibling(node):

    "Return the node preceding 'node', or None if there is no such node."

    return libxml2mod.prev(node)

def Node_nextSibling(node):

    "Return the node following 'node', or None if there is no such node."

    return libxml2mod.next(node)

def Node_doctype(node):

    "Return the internal subset of the document 'node'."

    return libxml2mod.xmlGetIntSubset(node)

def Node_hasAttributeNS(node, ns, localName):

    """
    Return whether 'node' has an attribute with the given 'ns' and 'localName',
    or a namespace definition for 'ns' using 'localName' as its prefix.
    """

    return Node_getAttributeNS(node, ns, localName) is not None or \
        _find_namespace(node, ns, localName) is not None

def Node_hasAttribute(node, name):

    "Return whether 'node' has an attribute with the given 'name'."

    return Node_getAttribute(node, name) is not None

def Node_getAttributeNS(node, ns, localName):

    """
    Return the value of the attribute of 'node' with the given 'ns' and
    'localName', or None if there is no such attribute. For the xmlns
    namespace, the namespace defined on 'node' for the prefix 'localName' is
    returned instead.
    """

    if ns == xml.dom.XMLNS_NAMESPACE:
        ns_def = _find_namespace_for_prefix(node, localName)
        if ns_def is not None:
            return get_ns(ns_def)
        else:
            return None
    else:
        return to_unicode(libxml2mod.xmlGetNsProp(node, localName, ns))

def Node_getAttribute(node, name):

    "Return the value of the attribute of 'node' with the given 'name'."

    return to_unicode(libxml2mod.xmlGetProp(node, name))

def Node_getAttributeNodeNS(node, ns, localName):

    """
    Return the attribute node of 'node' with the given 'ns' and 'localName',
    raising KeyError if there is no such attribute.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(ns, localName)]

def Node_getAttributeNode(node, name):

    """
    Return the attribute node of 'node' with the given 'name' and no namespace,
    raising KeyError if there is no such attribute.
    """

    # NOTE: Needs verifying.
    return Node_attributes(node)[(None, name)]

def Node_setAttributeNS(node, ns, name, value):

    """
    Set the attribute on 'node' with the given 'ns' and qualified 'name' to
    'value', making a namespace definition on 'node' where no suitable one
    exists. Setting an xmlns:prefix attribute only makes a namespace
    definition.
    """

    ns, name, value = map(from_unicode, [ns, name, value])
    prefix, localName = _get_prefix_and_localName(name)

    # Detect setting of xmlns:localName=value, looking for cases where
    # x:attr=value have caused the definition of xmlns:x=y (as a declaration
    # with prefix=x, ns=y).
    if prefix == "xmlns" and ns == xml.dom.XMLNS_NAMESPACE:
        if _find_namespace(node, value, localName):
            return
        new_ns = _make_namespace(node, value, localName, set_default=0)
    # For non-xmlns attributes, we find or make a namespace declaration and then
    # set an attribute.
    elif ns is not None:
        # Look for a suitable namespace.
        new_ns = _find_namespace(node, ns, prefix)
        # Create a declaration if no suitable one was found.
        if new_ns is None:
            # Invent a prefix for unprefixed attributes with namespaces.
            if prefix is None:
                prefix = _get_invented_prefix(node, ns)
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
        libxml2mod.xmlSetNsProp(node, new_ns, localName, value)
    else:
        # NOTE: Needs verifying: what should happen to the namespace?
        # NOTE: This also catches the case where None is the element's
        # NOTE: namespace and is also used for the attribute.
        libxml2mod.xmlSetNsProp(node, None, localName, value)

def Node_setAttribute(node, name, value):

    "Set the attribute on 'node' with the given 'name' to 'value'."

    name, value = map(from_unicode, [name, value])

    libxml2mod.xmlSetProp(node, name, value)

def Node_setAttributeNodeNS(node, attr):

    """
    Set an attribute on 'node' using the namespace, name and value of the
    attribute node 'attr'.
    """

    # NOTE: Not actually putting the node on the element.
    Node_setAttributeNS(node, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

def Node_setAttributeNode(node, attr):

    "Set an attribute on 'node' using the name and value of the attribute node 'attr'."

    # NOTE: Not actually putting the node on the element.
    Node_setAttribute(node, Node_nodeName(attr), Node_nodeValue(attr))

def Node_removeAttributeNS(node, ns, localName):

    """
    Remove the attribute of 'node' with the given 'ns' and 'localName'. Nothing
    is done if 'node' has no such attribute.
    """

    # Look the attribute up without raising KeyError for absent attributes.

    attr = Node_attributes(node).get((ns, localName))
    if attr is not None:
        libxml2mod.xmlUnsetNsProp(node, libxml2mod.xmlNodeGetNs(attr), libxml2mod.name(attr))

def Node_removeAttribute(node, name):

    "Remove the attribute of 'node' with the given 'name'."

    name = from_unicode(name)
    libxml2mod.xmlUnsetProp(node, name)

def Node_createElementNS(node, ns, name):

    """
    Return a new element with the given 'ns' and qualified 'name', carrying the
    namespace definition it requires. The 'node' parameter is not consulted.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    new_node = libxml2mod.xmlNewNode(localName)

    # If the namespace is not empty, set the declaration.
    if ns is not None:
        new_ns = _find_namespace(new_node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, ns, prefix, set_default=1)
        libxml2mod.xmlSetNs(new_node, new_ns)
    # If the namespace is empty, set a "null" declaration.
    elif prefix is not None:
        new_ns = _find_namespace(new_node, "", prefix)
        if new_ns is None:
            new_ns = _make_namespace(new_node, "", prefix)
        libxml2mod.xmlSetNs(new_node, new_ns)
    else:
        libxml2mod.xmlSetNs(new_node, None)
        Node_setAttribute(new_node, "xmlns", "")
    return new_node

def Node_createElement(node, name):

    """
    Return a new element with the given 'name'. The 'node' parameter is not
    consulted.
    """

    name = from_unicode(name)

    new_node = libxml2mod.xmlNewNode(name)
    return new_node

def Node_createAttributeNS(node, ns, name):

    """
    Return a new attribute node made on 'node' with the given 'ns' and
    qualified 'name', and with no value.
    """

    ns, name = map(from_unicode, [ns, name])

    prefix, localName = _get_prefix_and_localName(name)
    # NOTE: Does it make sense to set the namespace if it is empty?
    if ns is not None:
        new_ns = _find_namespace(node, ns, prefix)
        if new_ns is None:
            new_ns = _make_namespace(node, ns, prefix, set_default=0)
    else:
        new_ns = None
    new_node = libxml2mod.xmlNewNsProp(node, new_ns, localName, None)
    return new_node

def Node_createAttribute(node, name):

    "Return a new attribute node made on 'node' with the given 'name' and no namespace."

    name = from_unicode(name)

    # NOTE: xmlNewProp does not seem to work.
    return Node_createAttributeNS(node, None, name)

def Node_createTextNode(node, value):

    """
    Return a new text node containing 'value'. The 'node' parameter is not
    consulted.
    """

    value = from_unicode(value)

    return libxml2mod.xmlNewText(value)

def Node_createComment(node, value):

    """
    Return a new comment node containing 'value'. The 'node' parameter is not
    consulted.
    """

    value = from_unicode(value)

    return libxml2mod.xmlNewComment(value)

def Node_insertBefore(node, tmp, oldNode):

    "Add 'tmp' as the sibling immediately preceding 'oldNode'."

    return libxml2mod.xmlAddPrevSibling(oldNode, tmp)

def Node_replaceChild(node, tmp, oldNode):

    "Replace 'oldNode' with 'tmp'."

    return libxml2mod.xmlReplaceNode(oldNode, tmp)

def Node_appendChild(node, tmp):

    "Add 'tmp' as a child of 'node'."

    return libxml2mod.xmlAddChild(node, tmp)

def Node_removeChild(node, child):

    "Unlink 'child' from its position in the document."

    libxml2mod.xmlUnlinkNode(child)

def Node_importNode(node, other, deep):

    """
    Return a copy of the libxml2mod node 'other' made using 'node', copying
    the descendants of 'other' as well if 'deep' is set to a true value.
    Elements, text nodes and comments are supported; ValueError is raised for
    other node types.
    """

    node_type = Node_nodeType(other)

    if node_type == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, Node_namespaceURI(other), Node_tagName(other))
        for attr in Node_attributes(other).values():
            Node_setAttributeNS(imported_element, Node_namespaceURI(attr), Node_nodeName(attr), Node_nodeValue(attr))

        if deep:
            for child in Node_childNodes(other):
                imported_child = Node_importNode(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif node_type == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, Node_nodeValue(other))

    elif node_type == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, Node_data(other))

    # Report the node type label rather than the node object itself.

    raise ValueError("Node type '%s' (%d) not supported." % (libxml2mod.type(other), node_type))

def Node_importNode_DOM(node, other, deep):

    """
    Return a copy of the DOM node object 'other' made using 'node', copying the
    descendants of 'other' as well if 'deep' is set to a true value. Elements,
    text nodes and comments are supported; ValueError is raised for other node
    types.
    """

    if other.nodeType == xml.dom.Node.ELEMENT_NODE:
        imported_element = Node_createElementNS(node, other.namespaceURI, other.tagName)
        for attr in other.attributes.values():
            Node_setAttributeNS(imported_element, attr.namespaceURI, attr.nodeName, attr.nodeValue)

        if deep:
            for child in other.childNodes:
                imported_child = Node_importNode_DOM(node, child, deep)
                if imported_child:
                    Node_appendChild(imported_element, imported_child)

        return imported_element

    elif other.nodeType == xml.dom.Node.TEXT_NODE:
        return Node_createTextNode(node, other.nodeValue)

    elif other.nodeType == xml.dom.Node.COMMENT_NODE:
        return Node_createComment(node, other.data)

    # Node types without a known label must still produce the ValueError and
    # not a KeyError from the label lookup.

    raise ValueError("Node type '%s' (%d) not supported." % (
        _reverseNodeTypes.get(other.nodeType, "unknown"), other.nodeType))

def Node_xpath(node, expr, variables=None, namespaces=None):

    """
    Evaluate the XPath expression 'expr' with 'node' as the context node and
    return the result. The optional 'namespaces' dictionary maps prefixes to
    namespaces for use in 'expr'; 'variables' is currently ignored.
    """

    expr = from_unicode(expr)

    context = libxml2mod.xmlXPathNewContext(Node_ownerDocument(node))

    # Free the context even if registration or evaluation fails.

    try:
        libxml2mod.xmlXPathSetContextNode(context, node)
        # NOTE: Discover namespaces from the node.
        # NOTE: Work out how to specify paths without having to use prefixes on
        # NOTE: names all the time.
        for prefix, ns in (namespaces or {}).items():
            libxml2mod.xmlXPathRegisterNs(context, prefix, ns)
        # NOTE: No such functions are exposed in current versions of libxml2.
        #for (prefix, ns), value in (variables or {}).items():
        #    value = from_unicode(value)
        #    libxml2mod.xmlXPathRegisterVariableNS(context, prefix, ns, value)
        return libxml2mod.xmlXPathEval(expr, context)
    finally:
        libxml2mod.xmlXPathFreeContext(context)

# Utility functions.

def createDocument(namespaceURI, localName, doctype):

    """
    Return a new document. If 'localName' is not None, a root element with the
    given 'namespaceURI' and 'localName' is added; if 'doctype' is not None, an
    internal subset is created from its localName, publicId and systemId
    attributes.
    """

    # NOTE: Fixed to use version 1.0 only.
    d = libxml2mod.xmlNewDoc("1.0")
    if localName is not None:
        # NOTE: Verify that this is always what should occur.
        root = Node_createElementNS(d, namespaceURI, localName)
        Node_appendChild(d, root)
    if doctype is not None:
        libxml2mod.xmlCreateIntSubset(d, doctype.localName, doctype.publicId, doctype.systemId)
    return d

def parse(stream_or_string, html=0):

    """
    Parse the given 'stream_or_string', reading the document from it if it
    provides a read method and treating it as a filename otherwise. The
    document is parsed as HTML if 'html' is set to a true value.
    """

    if hasattr(stream_or_string, "read"):
        stream = stream_or_string
        return parseString(stream.read(), html)
    else:
        return parseFile(stream_or_string, html)

def _parse_xml_context(context):

    """
    Configure the given XML parser 'context', parse the document it refers to,
    and return the resulting document, freeing the context in all cases.
    """

    # NOTE: Switching off validation and remote DTD resolution.
    try:
        libxml2mod.xmlParserSetPedantic(context, 0)
        libxml2mod.xmlParserSetValidate(context, 0)
        libxml2mod.xmlCtxtUseOptions(context, XML_PARSE_NOERROR | XML_PARSE_NOWARNING | XML_PARSE_NONET)
        libxml2mod.xmlParseDocument(context)
        return libxml2mod.xmlParserGetDoc(context)
    finally:
        # The context is not released automatically.
        libxml2mod.xmlFreeParserCtxt(context)

def parseFile(s, html=0):

    """
    Parse the file with the filename 's' and return the document, treating the
    file as HTML if 'html' is set to a true value. IOError is raised if no XML
    parser can be created for the file.
    """

    if not html:
        context = libxml2mod.xmlCreateFileParserCtxt(s)
        if context is None:
            raise IOError("Unable to create a parser for file %r." % (s,))
        return _parse_xml_context(context)
    else:
        return libxml2mod.htmlReadFile(s, None, HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseString(s, html=0):

    """
    Parse the document text 's' and return the document, treating the text as
    HTML if 'html' is set to a true value. ValueError is raised if no XML
    parser can be created for the text.
    """

    if not html:
        context = libxml2mod.xmlCreateMemoryParserCtxt(s, len(s))
        if context is None:
            raise ValueError("Unable to create a parser for the given string.")
        return _parse_xml_context(context)
    else:
        # NOTE: URL given as None.
        html_url = None
        return libxml2mod.htmlReadMemory(s, len(s), html_url, None,
            HTML_PARSE_NOERROR | HTML_PARSE_NOWARNING | HTML_PARSE_NONET)

def parseURI(uri, html=0):

    """
    Parse the XML resource at 'uri' and return the document. IOError is raised
    if no parser can be created for the resource; NotImplementedError is raised
    if 'html' is set to a true value.
    """

    if not html:
        context = libxml2mod.xmlCreateURLParserCtxt(uri, 0)
        if context is None:
            raise IOError("Unable to create a parser for URI %r." % (uri,))
        return _parse_xml_context(context)
    else:
        raise NotImplementedError("parseURI does not yet support HTML")

def toString(node, encoding=None, prettyprint=0):

    "Return 'node' serialised using the given 'encoding' and 'prettyprint' setting."

    return libxml2mod.serializeNode(node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    "Write the serialised form of 'node' to 'stream'."

    stream.write(toString(node, encoding, prettyprint))

def toFile(node, f, encoding=None, prettyprint=0):

    "Save the serialised form of 'node' to 'f'."

    libxml2mod.saveNodeTo(node, f, encoding, prettyprint)

# libxml2mod constants.

HTML_PARSE_NOERROR = 32
HTML_PARSE_NOWARNING = 64
HTML_PARSE_NONET = 2048
XML_PARSE_NOERROR = 32
XML_PARSE_NOWARNING = 64
XML_PARSE_NONET = 2048

# vim: tabstop=4 expandtab shiftwidth=4