diff options
Diffstat (limited to 'python/libxml.py')
-rw-r--r-- | python/libxml.py | 628 |
1 files changed, 628 insertions, 0 deletions
# Reconstructed from the diff of python/libxml.py
# (diff --git a/python/libxml.py b/python/libxml.py, new file mode 100644,
# index 0000000..0c3ba6c, hunk @@ -0,0 +1,628 @@).
#
# The code below sticks to the syntax subset shared by Python 2.2+ and
# Python 3 (print with a single parenthesised argument, raise Cls(arg), ...).
import libxml2mod
import types

#
# Errors raised by the wrappers when some tree handling failed.
#
class treeError(Exception):
    """Error carrying a message string in the `msg` attribute."""
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class parserError(Exception):
    """Error carrying a message string in the `msg` attribute.

    Raised by SAXCallback.error() and SAXCallback.fatalError()."""
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class uriError(Exception):
    """Error carrying a message string in the `msg` attribute."""
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class xpathError(Exception):
    """Error carrying a message string in the `msg` attribute."""
    def __init__(self, msg):
        Exception.__init__(self, msg)
        self.msg = msg
    def __str__(self):
        return self.msg

class ioWrapper:
    """Adapter exposing io_close/io_flush/io_read/io_write on top of a
    Python file-like object.

    Every io_* method returns -1 when no underlying object is attached
    (never attached, or already closed through io_close())."""
    def __init__(self, _obj):
        self.__io = _obj
        self._o = None

    def io_close(self):
        """Close and detach the underlying object; 0 on success, -1 if
        there is nothing to close."""
        if self.__io is None:
            return -1
        self.__io.close()
        self.__io = None
        return 0

    def io_flush(self):
        """Flush the underlying object; 0 on success, -1 if detached."""
        if self.__io is None:
            return -1
        self.__io.flush()
        return 0

    def io_read(self, len = -1):
        """Read `len` units from the underlying object, or everything
        when `len` is negative. Returns -1 if detached."""
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.read()
        return self.__io.read(len)

    def io_write(self, str, len = -1):
        """Write `str` (or only its first `len` items when `len` is not
        negative) to the underlying object. Returns -1 if detached,
        otherwise whatever the object's write() returns."""
        if self.__io is None:
            return -1
        if len < 0:
            return self.__io.write(str)
        # write() on file objects takes a single argument; the original
        # write(str, len) call raised TypeError, so truncate explicitly.
        return self.__io.write(str[0:len])

class ioReadWrapper(ioWrapper):
    """ioWrapper bound to a parser input buffer created through
    libxml2mod.xmlCreateInputBuffer()."""
    def __init__(self, _obj, enc = ""):
        ioWrapper.__init__(self, _obj)
        self._o = libxml2mod.xmlCreateInputBuffer(self, enc)

    def __del__(self):
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None

    def close(self):
        """Close the Python object and free the parser input buffer."""
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlFreeParserInputBuffer(self._o)
        self._o = None

class ioWriteWrapper(ioWrapper):
    """ioWrapper bound to an output buffer.

    Depending on the type of `_obj` the constructor either leaves the
    wrapper empty (string), creates a new output buffer around `_obj`
    (old-style class instance), or adopts `_obj` itself as the output
    buffer (anything else)."""
    def __init__(self, _obj, enc = ""):
#        print "ioWriteWrapper.__init__", _obj
        if isinstance(_obj, str):
            print("write io from a string")
            # Initialise the base class so that io_close()/io_flush() and
            # __del__ find their attributes; the original only set the
            # misspelled attribute `o` and left the wrapper half-built.
            ioWrapper.__init__(self, None)
            self.o = None  # historical misspelling of _o, kept for compatibility
        # types.InstanceType only exists on Python 2 (old-style instances)
        elif type(_obj) == getattr(types, "InstanceType", None):
            print("write io from instance of %s" % (_obj.__class__))
            ioWrapper.__init__(self, _obj)
            self._o = libxml2mod.xmlCreateOutputBuffer(self, enc)
        else:
            file = libxml2mod.outputBufferGetPythonFile(_obj)
            if file is not None:
                ioWrapper.__init__(self, file)
            else:
                ioWrapper.__init__(self, _obj)
            self._o = _obj

    def __del__(self):
#        print "__del__"
        self.io_close()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def flush(self):
        # NOTE(review): this closes the output buffer rather than merely
        # flushing it, exactly like close() below -- looks like a
        # copy/paste; behaviour kept as is, confirm before changing.
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

    def close(self):
        # NOTE(review): only flushes the Python object (io_flush, not
        # io_close) before closing the output buffer; behaviour kept.
        self.io_flush()
        if self._o is not None:
            libxml2mod.xmlOutputBufferClose(self._o)
        self._o = None

#
# Example of a class to handle SAX events
#
class SAXCallback:
    """Base class for SAX handlers"""
    def startDocument(self):
        """called at the start of the document"""
        pass

    def endDocument(self):
        """called at the end of the document"""
        pass

    def startElement(self, tag, attrs):
        """called at the start of every element, tag is the name of
           the element, attrs is a dictionary of the element's attributes"""
        pass

    def endElement(self, tag):
        """called at the end of every element, tag is the name of
           the element"""
        pass

    def characters(self, data):
        """called when character data have been read, data is the string
           containing the data, multiple consecutive characters() callback
           are possible."""
        pass

    def cdataBlock(self, data):
        """called when CDATA section have been read, data is the string
           containing the data, multiple consecutive cdataBlock() callback
           are possible."""
        pass

    def reference(self, name):
        """called when an entity reference has been found"""
        pass

    def ignorableWhitespace(self, data):
        """called when potentially ignorable white spaces have been found"""
        pass

    def processingInstruction(self, target, data):
        """called when a PI has been found, target contains the PI name and
           data is the associated data in the PI"""
        pass

    def comment(self, content):
        """called when a comment has been found, content contains the comment"""
        pass

    def externalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def internalSubset(self, name, externalID, systemID):
        """called when a DOCTYPE declaration has been found, name is the
           DTD name and externalID, systemID are the DTD public and system
           identifier for that DTD if available"""
        pass

    def entityDecl(self, name, type, externalID, systemID, content):
        """called when an ENTITY declaration has been found, name is the
           entity name and externalID, systemID are the entity public and
           system identifier for that entity if available, type indicates
           the entity type, and content reports its string content"""
        pass

    def notationDecl(self, name, externalID, systemID):
        """called when an NOTATION declaration has been found, name is the
           notation name and externalID, systemID are the notation public and
           system identifier for that notation if available"""
        pass

    def attributeDecl(self, elem, name, type, defi, defaultValue, nameList):
        """called when an ATTRIBUTE definition has been found"""
        pass

    def elementDecl(self, name, type, content):
        """called when an ELEMENT definition has been found"""
        pass

    # This method used to be a second `entityDecl`, which silently replaced
    # the five-argument entityDecl defined above.
    def unparsedEntityDecl(self, name, publicId, systemID, notationName):
        """called when an unparsed ENTITY declaration has been found,
           name is the entity name and publicId, systemID are the entity
           public and system identifier for that entity if available,
           and notationName indicate the associated NOTATION"""
        pass

    def warning(self, msg):
        """print the warning message"""
        print(msg)

    def error(self, msg):
        """raise a parserError carrying msg"""
        raise parserError(msg)

    def fatalError(self, msg):
        """raise a parserError carrying msg"""
        raise parserError(msg)

#
# This class is the ancestor of all the Node classes. It provides
# the basic functionalities shared by all nodes (and handle
# gracefully the exception), like name, navigation in the tree,
# doc reference, content access and serializing to a string or URI
#
class xmlCore:
    def __init__(self, _obj=None):
        # _o holds the low-level object handed to libxml2mod, or None
        self._o = _obj

    def get_parent(self):
        ret = libxml2mod.parent(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)
    def get_children(self):
        ret = libxml2mod.children(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)
    def get_last(self):
        ret = libxml2mod.last(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)
    def get_next(self):
        ret = libxml2mod.next(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)
    def get_properties(self):
        ret = libxml2mod.properties(self._o)
        if ret is None:
            return None
        return xmlAttr(_obj=ret)
    def get_prev(self):
        ret = libxml2mod.prev(self._o)
        if ret is None:
            return None
        return xmlNode(_obj=ret)
    def get_content(self):
        return libxml2mod.xmlNodeGetContent(self._o)
    getContent = get_content  # why is this duplicate naming needed ?
    def get_name(self):
        return libxml2mod.name(self._o)
    def get_type(self):
        return libxml2mod.type(self._o)
    def get_doc(self):
        ret = libxml2mod.doc(self._o)
        if ret is None:
            # a document node is its own document
            if self.type in ["document_xml", "document_html"]:
                return xmlDoc(_obj=self._o)
            else:
                return None
        return xmlDoc(_obj=ret)
    #
    # Those are common attributes to nearly all type of nodes
    # defined as python2 properties
    #
    import sys
    # version_info compares correctly for any release number, unlike
    # float(sys.version[0:3]) which depends on the version string layout
    if sys.version_info < (2, 2):
        def __getattr__(self, attr):
            # no property() before Python 2.2: emulate the attributes
            if attr == "parent":
                return self.get_parent()
            elif attr == "properties":
                return self.get_properties()
            elif attr == "children":
                return self.get_children()
            elif attr == "last":
                return self.get_last()
            elif attr == "next":
                return self.get_next()
            elif attr == "prev":
                return self.get_prev()
            elif attr == "content":
                return self.get_content()
            elif attr == "name":
                return self.get_name()
            elif attr == "type":
                return self.get_type()
            elif attr == "doc":
                return self.get_doc()
            raise AttributeError(attr)
    else:
        parent = property(get_parent, None, None, "Parent node")
        children = property(get_children, None, None, "First child node")
        last = property(get_last, None, None, "Last sibling node")
        next = property(get_next, None, None, "Next sibling node")
        prev = property(get_prev, None, None, "Previous sibling node")
        properties = property(get_properties, None, None, "List of properies")
        content = property(get_content, None, None, "Content of this node")
        name = property(get_name, None, None, "Node name")
        type = property(get_type, None, None, "Node type")
        doc = property(get_doc, None, None, "The document this node belongs to")

    #
    # Serialization routines, the optional arguments have the following
    # meaning:
    #     encoding: string to ask saving in a specific encoding
    #     format: if 1 the serializer is asked to indent the output
    #
    def serialize(self, encoding = None, format = 0):
        return libxml2mod.serializeNode(self._o, encoding, format)
    def saveTo(self, file, encoding = None, format = 0):
        return libxml2mod.saveNodeTo(self._o, file, encoding, format)

    #
    # Canonicalization routines:
    #
    #   nodes: the node set (tuple or list) to be included in the
    #     canonized image or None if all document nodes should be
    #     included.
    #   exclusive: the exclusive flag (0 - non-exclusive
    #     canonicalization; otherwise - exclusive canonicalization)
    #   prefixes: the list of inclusive namespace prefixes (strings),
    #     or None if there is no inclusive namespaces (only for
    #     exclusive canonicalization, ignored otherwise)
    #   with_comments: include comments in the result (!=0) or not
    #     (==0)
    def c14nMemory(self,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            # a real list (map() is lazy on Python 3) of low-level objects
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocDumpMemory(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0)
    def c14nSaveTo(self,
                   file,
                   nodes=None,
                   exclusive=0,
                   prefixes=None,
                   with_comments=0):
        if nodes:
            nodes = [n._o for n in nodes]
        return libxml2mod.xmlC14NDocSaveTo(
            self.get_doc()._o,
            nodes,
            exclusive != 0,
            prefixes,
            with_comments != 0,
            file)

    #
    # Selecting nodes using XPath, a bit slow because the context
    # is allocated/freed every time but convenient.
    #
    def xpathEval(self, expr):
        doc = self.doc
        if doc is None:
            return None
        ctxt = doc.xpathNewContext()
        try:
            ctxt.setContextNode(self)
            res = ctxt.xpathEval(expr)
        finally:
            # free the context even when the evaluation raises
            ctxt.xpathFreeContext()
        return res

#    #
#    # Selecting nodes using XPath, faster because the context
#    # is allocated just once per xmlDoc.
#    #
#    # Removed: DV memleaks c.f. #126735
#    #
#    def xpathEval2(self, expr):
#        doc = self.doc
#        if doc == None:
#            return None
#        try:
#            doc._ctxt.setContextNode(self)
#        except:
#            doc._ctxt = doc.xpathNewContext()
#            doc._ctxt.setContextNode(self)
#        res = doc._ctxt.xpathEval(expr)
#        return res
    def xpathEval2(self, expr):
        return self.xpathEval(expr)

    # support for python2 iterators
    def walk_depth_first(self):
        return xmlCoreDepthFirstItertor(self)
    def walk_breadth_first(self):
        return xmlCoreBreadthFirstItertor(self)
    __iter__ = walk_depth_first

    def free(self):
        try:
            # best effort: release a context cached on the document, if any
            self.doc._ctxt.xpathFreeContext()
        except Exception:
            pass
        libxml2mod.xmlFreeDoc(self._o)


#
# implements the depth-first iterator for libxml2 DOM tree
#
class xmlCoreDepthFirstItertor:
    def __init__(self, node):
        self.node = node
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.children
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.next
    __next__ = next  # Python 3 spelling of the iterator protocol

#
# implements the breadth-first iterator for libxml2 DOM tree
#
class xmlCoreBreadthFirstItertor:
    def __init__(self, node):
        self.node = node
        self.parents = []
    def __iter__(self):
        return self
    def next(self):
        while 1:
            if self.node:
                ret = self.node
                self.parents.append(self.node)
                self.node = self.node.next
                return ret
            try:
                parent = self.parents.pop()
            except IndexError:
                raise StopIteration
            self.node = parent.children
    __next__ = next  # Python 3 spelling of the iterator protocol

#
# converters to present a nicer view of the XPath returns
#
def nodeWrap(o):
    # TODO try to cast to the most appropriate node class
    # Dispatch on the node *type*: the strings tested below are the values
    # get_type()/get_doc() deal with ("document_xml", ...), not node names.
    name = libxml2mod.type(o)
    if name == "element" or name == "text":
        return xmlNode(_obj=o)
    if name == "attribute":
        return xmlAttr(_obj=o)
    if name[0:8] == "document":
        return xmlDoc(_obj=o)
    # was name[0:8] == "namespace": an 8 character slice never matched
    if name[0:9] == "namespace":
        return xmlNs(_obj=o)
    if name == "elem_decl":
        return xmlElement(_obj=o)
    if name == "attribute_decl":
        # was misspelled xmlAtribute
        return xmlAttribute(_obj=o)
    if name == "entity_decl":
        return xmlEntity(_obj=o)
    if name == "dtd":
        return xmlDtd(_obj=o)
    return xmlNode(_obj=o)

def xpathObjectRet(o):
    # lists and tuples become lists of wrapped nodes; anything else is
    # returned untouched
    if type(o) == type([]) or type(o) == type(()):
        return [nodeWrap(x) for x in o]
    return o

#
# register an XPath function
#
def registerXPathFunction(ctxt, name, ns_uri, f):
    ret = libxml2mod.xmlRegisterXPathFunction(ctxt, name, ns_uri, f)
    return ret

#
# For the xmlTextReader parser configuration
#
PARSER_LOADDTD=1
PARSER_DEFAULTATTRS=2
PARSER_VALIDATE=3
PARSER_SUBST_ENTITIES=4

#
# For the error callback severities
#
PARSER_SEVERITY_VALIDITY_WARNING=1
PARSER_SEVERITY_VALIDITY_ERROR=2
PARSER_SEVERITY_WARNING=3
PARSER_SEVERITY_ERROR=4

#
# register the libxml2 error handler
#
def registerErrorHandler(f, ctx):
    """Register a Python written function to for error reporting.
       The function is called back as f(ctx, error). """
    import sys
    if 'libxslt' not in sys.modules:
        # normal behaviour when libxslt is not imported
        ret = libxml2mod.xmlRegisterErrorHandler(f,ctx)
    else:
        # when libxslt is already imported, one must
        # use libxslt's error handler instead
        import libxslt
        ret = libxslt.registerErrorHandler(f,ctx)
    return ret

class parserCtxtCore:

    def __init__(self, _obj=None):
        # _o holds the low-level parser context, or None
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeParserCtxt(self._o)
        self._o = None

    def setErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,reserved).

           @reserved is currently always None."""
        libxml2mod.xmlParserCtxtSetErrorHandler(self._o,f,arg)

    def getErrorHandler(self):
        """Return (f,arg) as previously registered with setErrorHandler
           or (None,None)."""
        return libxml2mod.xmlParserCtxtGetErrorHandler(self._o)

    def addLocalCatalog(self, uri):
        """Register a local catalog with the parser"""
        return libxml2mod.addLocalCatalog(self._o, uri)


def _xmlTextReaderErrorFunc(f_arg,msg,severity,locator):
    """Intermediate callback to wrap the locator"""
    # f_arg is the (f, arg) pair registered by SetErrorHandler; it is
    # unpacked here because tuple parameters are not valid on Python 3
    f, arg = f_arg
    return f(arg,msg,severity,xmlTextReaderLocator(locator))

class xmlTextReaderCore:

    def __init__(self, _obj=None):
        self.input = None
        # _o holds the low-level text reader, or None
        self._o = _obj

    def __del__(self):
        if self._o is not None:
            libxml2mod.xmlFreeTextReader(self._o)
        self._o = None

    def SetErrorHandler(self,f,arg):
        """Register an error handler that will be called back as
           f(arg,msg,severity,locator)."""
        if f is None:
            libxml2mod.xmlTextReaderSetErrorHandler(
                self._o,None,None)
        else:
            # the (f,arg) pair travels as the callback argument so that
            # _xmlTextReaderErrorFunc can wrap the locator before calling f
            libxml2mod.xmlTextReaderSetErrorHandler(
                self._o,_xmlTextReaderErrorFunc,(f,arg))

    def GetErrorHandler(self):
        """Return (f,arg) as previously registered with SetErrorHandler
           or (None,None)."""
        f,arg = libxml2mod.xmlTextReaderGetErrorHandler(self._o)
        if f is None:
            return None,None
        else:
            # assert f is _xmlTextReaderErrorFunc
            # arg is the (f,arg) pair stored by SetErrorHandler
            return arg


# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING
#
# Everything before this line comes from libxml.py
# Everything after this line is automatically generated
#
# WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING WARNING