libxml2dom (file libxml2dom/__init_

     1 #!/usr/bin/env python     2      3 """     4 DOM wrapper around libxml2, specifically the libxml2mod Python extension module.     5      6 Copyright (C) 2003, 2004, 2005, 2006, 2007 Paul Boddie <paul@boddie.org.uk>     7      8 This library is free software; you can redistribute it and/or     9 modify it under the terms of the GNU Lesser General Public    10 License as published by the Free Software Foundation; either    11 version 2.1 of the License, or (at your option) any later version.    12     13 This library is distributed in the hope that it will be useful,    14 but WITHOUT ANY WARRANTY; without even the implied warranty of    15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the GNU    16 Lesser General Public License for more details.    17     18 You should have received a copy of the GNU Lesser General Public    19 License along with this library; if not, write to the Free Software    20 Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA  02110-1301, USA    21 """    22     23 __version__ = "0.4.3"    24     25 from libxml2dom.macrolib import *    26 from libxml2dom.macrolib import \    27     createDocument as Node_createDocument, \    28     parseString as Node_parseString, parseURI as Node_parseURI, \    29     parseFile as Node_parseFile, \    30     toString as Node_toString, toStream as Node_toStream, \    31     toFile as Node_toFile    32 import urllib # for parseURI in HTML mode    33 import xml.dom # for getElementById    34     35 class Implementation(object):    36     37     "Contains an abstraction over the DOM implementation."    38     39     def createDocumentType(self, localName, publicId, systemId):    40         return DocumentType(localName, publicId, systemId)    41     42     def createDocument(self, namespaceURI, localName, doctype):    43         return Document(Node_createDocument(namespaceURI, localName, doctype), self)    44     45     # Wrapping of documents.    
46     47     def adoptDocument(self, node):    48         return Document(node, self)    49     50     # Factory functions.    51     52     def get_node(self, _node, context_node):    53         if Node_nodeType(_node) == context_node.DOCUMENT_NODE:    54             return context_node.ownerDocument    55         elif Node_nodeType(_node) == context_node.ATTRIBUTE_NODE:    56             return Attribute(_node, self, context_node.ownerDocument, context_node)    57         else:    58             return Node(_node, self, context_node.ownerDocument)    59     60     def get_node_or_none(self, _node, context_node):    61         if _node is None:    62             return None    63         else:    64             return self.get_node(_node, context_node)    65     66 # Attribute and node list wrappers.    67     68 class NamedNodeMap(object):    69     70     """    71     A wrapper around Node objects providing DOM and dictionary convenience    72     methods.    73     """    74     75     def __init__(self, node, impl):    76         self.node = node    77         self.impl = impl    78     79     def getNamedItem(self, name):    80         return self.node.getAttributeNode(name)    81     82     def getNamedItemNS(self, ns, localName):    83         return self.node.getAttributeNodeNS(ns, localName)    84     85     def setNamedItem(self, node):    86         try:    87             old = self.getNamedItem(node.nodeName)    88         except KeyError:    89             old = None    90         self.node.setAttributeNode(node)    91         return old    92     93     def setNamedItemNS(self, node):    94         try:    95             old = self.getNamedItemNS(node.namespaceURI, node.localName)    96         except KeyError:    97             old = None    98         self.node.setAttributeNodeNS(node)    99         return old   100    101     def removeNamedItem(self, name):   102         try:   103             old = self.getNamedItem(name)   104         except 
KeyError:   105             old = None   106         self.node.removeAttribute(name)   107         return old   108    109     def removeNamedItemNS(self, ns, localName):   110         try:   111             old = self.getNamedItemNS(ns, localName)   112         except KeyError:   113             old = None   114         self.node.removeAttributeNS(ns, localName)   115         return old   116    117     # Dictionary emulation methods.   118    119     def __getitem__(self, name):   120         return self.getNamedItem(name)   121    122     def __setitem__(self, name, node):   123         if name == node.nodeName:   124             self.setNamedItem(node)   125         else:   126             raise KeyError, name   127    128     def __delitem__(self, name):   129         # NOTE: To be implemented.   130         pass   131    132     def values(self):   133         return [Attribute(_node, self.impl, self.node.ownerDocument) for _node in Node_attributes(self.node.as_native_node()).values()]   134    135     def keys(self):   136         return [(attr.namespaceURI, attr.localName) for attr in self.values()]   137    138     def items(self):   139         return [((attr.namespaceURI, attr.localName), attr) for attr in self.values()]   140    141     def __repr__(self):   142         return str(self)   143    144     def __str__(self):   145         return "{%s}" % ",\n".join(["%s : %s" % (repr(key), repr(value)) for key, value in self.items()])   146    147     def _length(self):   148         return len(self.values())   149    150     length = property(_length)   151    152 class NodeList(list):   153    154     "A wrapper around node lists."   155    156     def item(self, index):   157         return self[index]   158    159     def _length(self):   160         return len(self)   161    162     length = property(_length)   163    164 # Node classes.   165    166 class Node(object):   167    168     """   169     A DOM-style wrapper around libxml2mod objects.   
170     """   171    172     ATTRIBUTE_NODE = xml.dom.Node.ATTRIBUTE_NODE   173     COMMENT_NODE = xml.dom.Node.COMMENT_NODE   174     DOCUMENT_NODE = xml.dom.Node.DOCUMENT_NODE   175     DOCUMENT_TYPE_NODE = xml.dom.Node.DOCUMENT_TYPE_NODE   176     ELEMENT_NODE = xml.dom.Node.ELEMENT_NODE   177     ENTITY_NODE = xml.dom.Node.ENTITY_NODE   178     ENTITY_REFERENCE_NODE = xml.dom.Node.ENTITY_REFERENCE_NODE   179     NOTATION_NODE = xml.dom.Node.NOTATION_NODE   180     PROCESSING_INSTRUCTION_NODE = xml.dom.Node.PROCESSING_INSTRUCTION_NODE   181     TEXT_NODE = xml.dom.Node.TEXT_NODE   182    183     def __init__(self, node, impl=None, ownerDocument=None):   184         self._node = node   185         self.impl = impl or default_impl   186         self.ownerDocument = ownerDocument   187    188     def as_native_node(self):   189         return self._node   190    191     def _nodeType(self):   192         return Node_nodeType(self._node)   193    194     def _childNodes(self):   195    196         # NOTE: Consider a generator instead.   
197    198         return NodeList([self.impl.get_node(_node, self) for _node in Node_childNodes(self._node)])   199    200     def _attributes(self):   201         return NamedNodeMap(self, self.impl)   202    203     def _namespaceURI(self):   204         return Node_namespaceURI(self._node)   205    206     def _textContent(self):   207         return Node_textContent(self._node)   208    209     def _nodeValue(self):   210         if self.nodeType in null_value_node_types:   211             return None   212         return Node_nodeValue(self._node)   213    214     def _setNodeValue(self, value):   215         Node_setNodeValue(self._node, value)   216    217     def _prefix(self):   218         return Node_prefix(self._node)   219    220     def _nodeName(self):   221         return Node_nodeName(self._node)   222    223     def _tagName(self):   224         return Node_tagName(self._node)   225    226     def _localName(self):   227         return Node_localName(self._node)   228    229     def _parentNode(self):   230         return self.impl.get_node_or_none(Node_parentNode(self._node), self)   231    232     def _previousSibling(self):   233         return self.impl.get_node_or_none(Node_previousSibling(self._node), self)   234    235     def _nextSibling(self):   236         return self.impl.get_node_or_none(Node_nextSibling(self._node), self)   237    238     def _doctype(self):   239         return self.impl.get_node(Node_doctype(self._node), self)   240    241     def _publicId(self):   242         # NOTE: To be fixed when the libxml2mod API has been figured out.   243         if self.nodeType != self.DOCUMENT_TYPE_NODE:   244             return None   245         declaration = self.toString()   246         return self._findId(declaration, "PUBLIC")   247    248     def _systemId(self):   249         # NOTE: To be fixed when the libxml2mod API has been figured out.   
250         if self.nodeType != self.DOCUMENT_TYPE_NODE:   251             return None   252         declaration = self.toString()   253         if self._findId(declaration, "PUBLIC"):   254             return self._findIdValue(declaration, 0)   255         return self._findId(declaration, "SYSTEM")   256    257     # NOTE: To be removed when the libxml2mod API has been figured out.   258    259     def _findId(self, declaration, identifier):   260         i = declaration.find(identifier)   261         if i == -1:   262             return None   263         return self._findIdValue(declaration, i)   264    265     def _findIdValue(self, declaration, i):   266         q = declaration.find('"', i)   267         if q == -1:   268             return None   269         q2 = declaration.find('"', q + 1)   270         if q2 == -1:   271             return None   272         return declaration[q+1:q2]   273    274     def hasAttributeNS(self, ns, localName):   275         return Node_hasAttributeNS(self._node, ns, localName)   276    277     def hasAttribute(self, name):   278         return Node_hasAttribute(self._node, name)   279    280     def getAttributeNS(self, ns, localName):   281         return Node_getAttributeNS(self._node, ns, localName)   282    283     def getAttribute(self, name):   284         return Node_getAttribute(self._node, name)   285    286     def getAttributeNodeNS(self, ns, localName):   287         return Attribute(Node_getAttributeNodeNS(self._node, ns, localName), self.impl, self.ownerDocument, self)   288    289     def getAttributeNode(self, localName):   290         return Attribute(Node_getAttributeNode(self._node, localName), self.impl, self.ownerDocument, self)   291    292     def setAttributeNS(self, ns, name, value):   293         Node_setAttributeNS(self._node, ns, name, value)   294    295     def setAttribute(self, name, value):   296         Node_setAttribute(self._node, name, value)   297    298     def setAttributeNodeNS(self, 
node):   299         Node_setAttributeNodeNS(self._node, node._node)   300    301     def setAttributeNode(self, node):   302         Node_setAttributeNode(self._node, node._node)   303    304     def removeAttributeNS(self, ns, localName):   305         Node_removeAttributeNS(self._node, ns, localName)   306    307     def removeAttribute(self, name):   308         Node_removeAttribute(self._node, name)   309    310     def createElementNS(self, ns, name):   311         return self.impl.get_node(Node_createElementNS(self._node, ns, name), self)   312    313     def createElement(self, name):   314         return self.impl.get_node(Node_createElement(self._node, name), self)   315    316     def createAttributeNS(self, ns, name):   317         tmp = self.createElement("tmp")   318         return Attribute(Node_createAttributeNS(tmp._node, self.impl, ns, name))   319    320     def createAttribute(self, name):   321         tmp = self.createElement("tmp")   322         return Attribute(Node_createAttribute(tmp._node, name), self.impl)   323    324     def createTextNode(self, value):   325         return self.impl.get_node(Node_createTextNode(self._node, value), self)   326    327     def createComment(self, value):   328         return self.impl.get_node(Node_createComment(self._node, value), self)   329    330     def createCDATASection(self, value):   331         return self.impl.get_node(Node_createCDATASection(self._node, value), self)   332    333     def importNode(self, node, deep):   334         if hasattr(node, "as_native_node"):   335             return self.impl.get_node(Node_importNode(self._node, node.as_native_node(), deep), self)   336         else:   337             return self.impl.get_node(Node_importNode_DOM(self._node, node, deep), self)   338    339     def cloneNode(self, deep):   340         # This takes advantage of the ubiquity of importNode (in spite of the DOM specification).   
341         return self.importNode(self, deep)   342    343     def insertBefore(self, tmp, oldNode):   344         if hasattr(tmp, "as_native_node"):   345             return self.impl.get_node(Node_insertBefore(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)   346         else:   347             return self.impl.get_node(Node_insertBefore(self._node, tmp, oldNode.as_native_node()), self)   348    349     def replaceChild(self, tmp, oldNode):   350         if hasattr(tmp, "as_native_node"):   351             return self.impl.get_node(Node_replaceChild(self._node, tmp.as_native_node(), oldNode.as_native_node()), self)   352         else:   353             return self.impl.get_node(Node_replaceChild(self._node, tmp, oldNode.as_native_node()), self)   354    355     def appendChild(self, tmp):   356         if hasattr(tmp, "as_native_node"):   357             return self.impl.get_node(Node_appendChild(self._node, tmp.as_native_node()), self)   358         else:   359             return self.impl.get_node(Node_appendChild(self._node, tmp), self)   360    361     def removeChild(self, tmp):   362         if hasattr(tmp, "as_native_node"):   363             Node_removeChild(self._node, tmp.as_native_node())   364         else:   365             Node_removeChild(self._node, tmp)   366    367     def getElementById(self, identifier):   368         nodes = self.xpath(".//*[@xml:id='" + identifier.replace("'", "&apos;") + "']",   369             namespaces={"xml" : xml.dom.XML_NAMESPACE})   370         if nodes:   371             return nodes[0]   372         else:   373             return None   374    375     def getElementsByTagName(self, tagName):   376         return self.xpath(".//" + tagName)   377    378     def getElementsByTagNameNS(self, namespaceURI, localName):   379         return self.xpath(".//ns:" + localName, namespaces={"ns" : namespaceURI})   380    381     def normalize(self):   382         text_nodes = []   383         for node in 
self.childNodes:   384             if node.nodeType == node.TEXT_NODE:   385                 text_nodes.append(node)   386             elif len(text_nodes) != 0:   387                 self._normalize(text_nodes)   388                 text_nodes = []   389         if len(text_nodes) != 0:   390             self._normalize(text_nodes)   391    392     def _normalize(self, text_nodes):   393         texts = []   394         for text_node in text_nodes[:-1]:   395             texts.append(text_node.nodeValue)   396             self.removeChild(text_node)   397         texts.append(text_nodes[-1].nodeValue)   398         self.replaceChild(self.ownerDocument.createTextNode("".join(texts)), text_nodes[-1])   399    400     childNodes = property(_childNodes)   401     value = data = nodeValue = property(_nodeValue, _setNodeValue)   402     textContent = property(_textContent)   403     name = nodeName = property(_nodeName)   404     tagName = property(_tagName)   405     namespaceURI = property(_namespaceURI)   406     prefix = property(_prefix)   407     localName = property(_localName)   408     parentNode = property(_parentNode)   409     nodeType = property(_nodeType)   410     attributes = property(_attributes)   411     previousSibling = property(_previousSibling)   412     nextSibling = property(_nextSibling)   413     doctype = property(_doctype)   414     publicId = property(_publicId)   415     systemId = property(_systemId)   416    417     # NOTE: To be fixed - these being doctype-specific values.   
418    419     entities = {}   420     notations = {}   421    422     def isSameNode(self, other):   423         return self == other   424    425     def __hash__(self):   426         return hash(self.localName)   427    428     def __eq__(self, other):   429         return isinstance(other, Node) and libxml2mod.xmlXPathCmpNodes(self._node, other._node) == 0   430    431     def __ne__(self, other):   432         return not (self == other)   433    434     # 4DOM extensions to the usual PyXML API.   435     # NOTE: To be finished.   436    437     def xpath(self, expr, variables=None, namespaces=None):   438         result = Node_xpath(self._node, expr, variables, namespaces)   439         if isinstance(result, str):   440             return to_unicode(result)   441         elif hasattr(result, "__len__"):   442             return NodeList([self.impl.get_node(_node, self) for _node in result])   443         else:   444             return result   445    446     # Convenience methods.   447    448     def toString(self, encoding=None, prettyprint=0):   449         return toString(self, encoding, prettyprint)   450    451     def toStream(self, stream, encoding=None, prettyprint=0):   452         toStream(self, stream, encoding, prettyprint)   453    454     def toFile(self, f, encoding=None, prettyprint=0):   455         toFile(self, f, encoding, prettyprint)   456    457 # Attribute nodes.   458    459 class Attribute(Node):   460    461     "A class providing attribute access."   462    463     def __init__(self, node, impl, ownerDocument=None, ownerElement=None):   464         Node.__init__(self, node, impl, ownerDocument)   465         self.ownerElement = ownerElement   466    467     def _parentNode(self):   468         return self.ownerElement   469    470     parentNode = property(_parentNode)   471    472 # Document housekeeping mechanisms.   

class _Document:

    """
    An abstract class providing document-level housekeeping and distinct
    functionality.
    """

    def __init__(self, node, impl):

        "Wrap the low-level document 'node' using the implementation 'impl'."

        self._node = node
        self.implementation = self.impl = impl

    def _documentElement(self):

        """
        Return the first element child of the document, or None where the
        document has no element children.
        """

        # Avoid indexing an empty result, which would raise IndexError.

        elements = self.xpath("*")
        if elements:
            return elements[0]
        return None

    def _ownerDocument(self):
        # A document acts as its own owner document.
        return self

    def __del__(self):
        # The low-level document is freed when this wrapper is collected.
        #print "Freeing document", self._node
        libxml2mod.xmlFreeDoc(self._node)

    documentElement = property(_documentElement)
    ownerDocument = property(_ownerDocument)

class Document(_Document, Node):

    """
    A generic document class. Specialised document classes should inherit from
    the _Document class and their own variation of Node.
    """

    pass

class DocumentType(object):

    "A class providing a container for document type information."

    def __init__(self, localName, publicId, systemId):
        self.name = self.localName = localName
        self.publicId = publicId
        self.systemId = systemId

        # NOTE: Nothing is currently provided to support the following
        # NOTE: attributes.

        self.entities = {}
        self.notations = {}

# Constants.

# Node types for which Node reports a nodeValue of None.

null_value_node_types = [
    Node.DOCUMENT_NODE, Node.DOCUMENT_TYPE_NODE, Node.ELEMENT_NODE,
    Node.ENTITY_NODE, Node.ENTITY_REFERENCE_NODE, Node.NOTATION_NODE
    ]

# Utility functions.

def createDocumentType(localName, publicId, systemId):

    "Create a document type object using the default implementation."

    return default_impl.createDocumentType(localName, publicId, systemId)

def createDocument(namespaceURI, localName, doctype):

    "Create a document using the default implementation."

    return default_impl.createDocument(namespaceURI, localName, doctype)

def parse(stream_or_string, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse a document from 'stream_or_string' and return a document object.

    An object providing a 'read' attribute is treated as a stream whose entire
    content is read and parsed; any other object is treated as the filename of
    the document to be parsed. The remaining parameters should be given as
    keyword arguments:

    html            a true value causes the content to be parsed as HTML
                    rather than XML
    htmlencoding    the document encoding to be assumed when parsing HTML
    unfinished      a true value permits the parsing of unfinished documents
                    (those missing content such as closing tags)
    impl            the implementation used to wrap the parsed document (the
                    default implementation is used if omitted)
    """

    impl = impl or default_impl

    # Anything which cannot be read is taken to be a filename.

    if not hasattr(stream_or_string, "read"):
        return parseFile(stream_or_string, html=html, htmlencoding=htmlencoding, unfinished=unfinished, impl=impl)

    return parseString(stream_or_string.read(), html=html, htmlencoding=htmlencoding, unfinished=unfinished, impl=impl)

def parseFile(filename, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse the file with the given 'filename' and return a document object.

    The remaining parameters should be given as keyword arguments:

    html            a true value causes the content to be parsed as HTML
                    rather than XML
    htmlencoding    the document encoding to be assumed when parsing HTML
    unfinished      a true value permits the parsing of unfinished documents
                    (those missing content such as closing tags)
    impl            the implementation used to wrap the parsed document (the
                    default implementation is used if omitted)
    """

    impl = impl or default_impl
    native_doc = Node_parseFile(filename, html=html, htmlencoding=htmlencoding, unfinished=unfinished)
    return impl.adoptDocument(native_doc)

def parseString(s, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse the content of the string 's' and return a document object.

    The remaining parameters should be given as keyword arguments:

    html            a true value causes the content to be parsed as HTML
                    rather than XML
    htmlencoding    the document encoding to be assumed when parsing HTML
    unfinished      a true value permits the parsing of unfinished documents
                    (those missing content such as closing tags)
    impl            the implementation used to wrap the parsed document (the
                    default implementation is used if omitted)
    """

    impl = impl or default_impl
    native_doc = Node_parseString(s, html=html, htmlencoding=htmlencoding, unfinished=unfinished)
    return impl.adoptDocument(native_doc)

def parseURI(uri, html=0, htmlencoding=None, unfinished=0, impl=None):

    """
    Parse the content found at the given 'uri' and return a document object.

    The remaining parameters should be given as keyword arguments:

    html            a true value causes the content to be parsed as HTML
                    rather than XML
    htmlencoding    the document encoding to be assumed when parsing HTML
    unfinished      a true value permits the parsing of unfinished documents
                    (those missing content such as closing tags)
    impl            the implementation used to wrap the parsed document (the
                    default implementation is used if omitted)

    XML documents are retrieved using libxml2's own network capabilities; HTML
    documents are retrieved using the urllib module provided by Python. To
    retrieve either kind of document using Python's own modules for this
    purpose (such as urllib), open a stream and pass it to the parse function:

    f = urllib.urlopen(uri)
    try:
        doc = libxml2dom.parse(f, html)
    finally:
        f.close()
    """

    # XML retrieval is delegated to the low-level parsing function.

    if not html:
        impl = impl or default_impl
        return impl.adoptDocument(Node_parseURI(uri, html=html, htmlencoding=htmlencoding, unfinished=unfinished))

    # HTML is fetched as a stream which is closed once parsing has finished.

    f = urllib.urlopen(uri)
    try:
        return parse(f, html=html, htmlencoding=htmlencoding, unfinished=unfinished, impl=impl)
    finally:
        f.close()

def toString(node, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' and its children, returning the result as a
    string. The optional 'encoding' overrides the default character encoding
    of the serialisation; the optional 'prettyprint' indicates whether the
    result is prettyprinted (it is not by default).
    """

    native_node = node.as_native_node()
    return Node_toString(native_node, encoding, prettyprint)

def toStream(node, stream, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' and its children, writing the result to the
    given 'stream'. The optional 'encoding' overrides the default character
    encoding of the serialisation; the optional 'prettyprint' indicates
    whether the result is prettyprinted (it is not by default).
    """

    native_node = node.as_native_node()
    Node_toStream(native_node, stream, encoding, prettyprint)

def toFile(node, filename, encoding=None, prettyprint=0):

    """
    Serialise the given 'node' and its children, writing the result to a file
    with the given 'filename'. The optional 'encoding' overrides the default
    character encoding of the serialisation; the optional 'prettyprint'
    indicates whether the result is prettyprinted (it is not by default).
    """

    native_node = node.as_native_node()
    Node_toFile(native_node, filename, encoding, prettyprint)

def adoptNodes(nodes, impl=None):

    """
    A special utility function which wraps the given low-level 'nodes',
    returning a list of high-level Node objects. The document obtained from
    the first node is adopted and used as the owner document of every node in
    the result. This is currently experimental and should not be casually
    used.
    """

    impl = impl or default_impl

    if len(nodes) == 0:
        return []

    # The document is adopted before any of the nodes are wrapped.

    doc = impl.adoptDocument(libxml2mod.doc(nodes[0]))
    return [Node(node, impl, doc) for node in nodes]

def getDOMImplementation():

    "Return the default DOM implementation."

    return default_impl

# Single instance of the implementation.

default_impl = Implementation()

# vim: tabstop=4 expandtab shiftwidth=4
libxml2dom

libxml2dom/__init__.py

libxml2dom/init.py