diff options
Diffstat (limited to 'HTMLtree.c')
-rw-r--r-- | HTMLtree.c | 1177 |
1 files changed, 1177 insertions, 0 deletions
diff --git a/HTMLtree.c b/HTMLtree.c new file mode 100644 index 0000000..5724540 --- /dev/null +++ b/HTMLtree.c @@ -0,0 +1,1177 @@ +/* + * HTMLtree.c : implementation of access function for an HTML tree. + * + * See Copyright for the status of this software. + * + * daniel@veillard.com + */ + + +#define IN_LIBXML +#include "libxml.h" +#ifdef LIBXML_HTML_ENABLED + +#include <string.h> /* for memset() only ! */ + +#ifdef HAVE_CTYPE_H +#include <ctype.h> +#endif +#ifdef HAVE_STDLIB_H +#include <stdlib.h> +#endif + +#include <libxml/xmlmemory.h> +#include <libxml/HTMLparser.h> +#include <libxml/HTMLtree.h> +#include <libxml/entities.h> +#include <libxml/valid.h> +#include <libxml/xmlerror.h> +#include <libxml/parserInternals.h> +#include <libxml/globals.h> +#include <libxml/uri.h> + +/************************************************************************ + * * + * Getting/Setting encoding meta tags * + * * + ************************************************************************/ + +/** + * htmlGetMetaEncoding: + * @doc: the document + * + * Encoding definition lookup in the Meta tags + * + * Returns the current encoding as flagged in the HTML source + */ +const xmlChar * +htmlGetMetaEncoding(htmlDocPtr doc) { + htmlNodePtr cur; + const xmlChar *content; + const xmlChar *encoding; + + if (doc == NULL) + return(NULL); + cur = doc->children; + + /* + * Search the html + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrEqual(cur->name, BAD_CAST"html")) + break; + if (xmlStrEqual(cur->name, BAD_CAST"head")) + goto found_head; + if (xmlStrEqual(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(NULL); + cur = cur->children; + + /* + * Search the head + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrEqual(cur->name, BAD_CAST"head")) + break; + if (xmlStrEqual(cur->name, BAD_CAST"meta")) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(NULL); +found_head: + cur = cur->children; + + /* + * Search the meta elements + */ +found_meta: + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrEqual(cur->name, BAD_CAST"meta")) { + xmlAttrPtr attr = cur->properties; + int http; + const xmlChar *value; + + content = NULL; + http = 0; + while (attr != NULL) { + if ((attr->children != NULL) && + (attr->children->type == XML_TEXT_NODE) && + (attr->children->next == NULL)) { + value = attr->children->content; + if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) + && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) + http = 1; + else if ((value != NULL) + && (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) + content = value; + if ((http != 0) && (content != NULL)) + goto found_content; + } + attr = attr->next; + } + } + } + cur = cur->next; + } + return(NULL); + +found_content: + encoding = xmlStrstr(content, BAD_CAST"charset="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"Charset="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"CHARSET="); + if (encoding != NULL) { + encoding += 8; + } else { + encoding = xmlStrstr(content, BAD_CAST"charset ="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"Charset ="); + if (encoding == NULL) + encoding = xmlStrstr(content, BAD_CAST"CHARSET ="); + if (encoding != NULL) + encoding += 9; + } + if (encoding != NULL) { + while ((*encoding == ' ') || (*encoding == '\t')) encoding++; + } + return(encoding); +} + +/** + * htmlSetMetaEncoding: + * @doc: the document + * @encoding: the encoding string + * + * Sets the current encoding in the Meta tags + * NOTE: this will not change the document content encoding, just + * the META flag associated. + * + * Returns 0 in case of success and -1 in case of error + */ +int +htmlSetMetaEncoding(htmlDocPtr doc, const xmlChar *encoding) { + htmlNodePtr cur, meta; + const xmlChar *content; + char newcontent[100]; + + + if (doc == NULL) + return(-1); + + if (encoding != NULL) { + snprintf(newcontent, sizeof(newcontent), "text/html; charset=%s", + encoding); + newcontent[sizeof(newcontent) - 1] = 0; + } + + cur = doc->children; + + /* + * Search the html + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrcasecmp(cur->name, BAD_CAST"html") == 0) + break; + if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) + goto found_head; + if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(-1); + cur = cur->children; + + /* + * Search the head + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrcasecmp(cur->name, BAD_CAST"head") == 0) + break; + if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) + goto found_meta; + } + cur = cur->next; + } + if (cur == NULL) + return(-1); +found_head: + if (cur->children == NULL) { + if (encoding == NULL) + return(0); + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddChild(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + return(0); + } + cur = cur->children; + +found_meta: + if (encoding != NULL) { + /* + * Create a new Meta element with the right attributes + */ + + meta = xmlNewDocNode(doc, NULL, BAD_CAST"meta", NULL); + xmlAddPrevSibling(cur, meta); + xmlNewProp(meta, BAD_CAST"http-equiv", BAD_CAST"Content-Type"); + xmlNewProp(meta, BAD_CAST"content", BAD_CAST newcontent); + } + + /* + * Search and destroy all the remaining the meta elements carrying + * encoding informations + */ + while (cur != NULL) { + if ((cur->type == XML_ELEMENT_NODE) && (cur->name != NULL)) { + if (xmlStrcasecmp(cur->name, BAD_CAST"meta") == 0) { + xmlAttrPtr attr = cur->properties; + int http; + const xmlChar *value; + + content = NULL; + http = 0; + while (attr != NULL) { + if ((attr->children != NULL) && + (attr->children->type == XML_TEXT_NODE) && + (attr->children->next == NULL)) { + value = attr->children->content; + if ((!xmlStrcasecmp(attr->name, BAD_CAST"http-equiv")) + && (!xmlStrcasecmp(value, BAD_CAST"Content-Type"))) + http = 1; + else + { + if ((value != NULL) && + (!xmlStrcasecmp(attr->name, BAD_CAST"content"))) + content = value; + } + if ((http != 0) && (content != NULL)) + break; + } + attr = attr->next; + } + if ((http != 0) && (content != NULL)) { + meta = cur; + cur = cur->next; + xmlUnlinkNode(meta); + xmlFreeNode(meta); + continue; + } + + } + } + cur = cur->next; + } + return(0); +} + +/** + * booleanHTMLAttrs: + * + * These are the HTML attributes which will be output + * in minimized form, i.e. <option selected="selected"> will be + * output as <option selected>, as per XSLT 1.0 16.2 "HTML Output Method" + * + */ +static const char* htmlBooleanAttrs[] = { + "checked", "compact", "declare", "defer", "disabled", "ismap", + "multiple", "nohref", "noresize", "noshade", "nowrap", "readonly", + "selected", NULL +}; + + +/** + * htmlIsBooleanAttr: + * @name: the name of the attribute to check + * + * Determine if a given attribute is a boolean attribute. + * + * returns: false if the attribute is not boolean, true otherwise. + */ +int +htmlIsBooleanAttr(const xmlChar *name) +{ + int i = 0; + + while (htmlBooleanAttrs[i] != NULL) { + if (xmlStrcasecmp((const xmlChar *)htmlBooleanAttrs[i], name) == 0) + return 1; + i++; + } + return 0; +} + +#ifdef LIBXML_OUTPUT_ENABLED +/************************************************************************ + * * + * Output error handlers * + * * + ************************************************************************/ +/** + * htmlSaveErrMemory: + * @extra: extra informations + * + * Handle an out of memory condition + */ +static void +htmlSaveErrMemory(const char *extra) +{ + __xmlSimpleError(XML_FROM_OUTPUT, XML_ERR_NO_MEMORY, NULL, NULL, extra); +} + +/** + * htmlSaveErr: + * @code: the error number + * @node: the location of the error. + * @extra: extra informations + * + * Handle an out of memory condition + */ +static void +htmlSaveErr(int code, xmlNodePtr node, const char *extra) +{ + const char *msg = NULL; + + switch(code) { + case XML_SAVE_NOT_UTF8: + msg = "string is not in UTF-8"; + break; + case XML_SAVE_CHAR_INVALID: + msg = "invalid character value"; + break; + case XML_SAVE_UNKNOWN_ENCODING: + msg = "unknown encoding %s"; + break; + case XML_SAVE_NO_DOCTYPE: + msg = "HTML has no DOCTYPE"; + break; + default: + msg = "unexpected error number"; + } + __xmlSimpleError(XML_FROM_OUTPUT, code, node, msg, extra); +} + +/************************************************************************ + * * + * Dumping HTML tree content to a simple buffer * + * * + ************************************************************************/ + +static int +htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, + int format); + +/** + * htmlNodeDumpFormat: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the current node + * @format: should formatting spaces been added + * + * Dump an HTML node, recursive behaviour,children are printed too. + * + * Returns the number of byte written or -1 in case of error + */ +static int +htmlNodeDumpFormat(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur, + int format) { + unsigned int use; + int ret; + xmlOutputBufferPtr outbuf; + + if (cur == NULL) { + return (-1); + } + if (buf == NULL) { + return (-1); + } + outbuf = (xmlOutputBufferPtr) xmlMalloc(sizeof(xmlOutputBuffer)); + if (outbuf == NULL) { + htmlSaveErrMemory("allocating HTML output buffer"); + return (-1); + } + memset(outbuf, 0, (size_t) sizeof(xmlOutputBuffer)); + outbuf->buffer = buf; + outbuf->encoder = NULL; + outbuf->writecallback = NULL; + outbuf->closecallback = NULL; + outbuf->context = NULL; + outbuf->written = 0; + + use = buf->use; + htmlNodeDumpFormatOutput(outbuf, doc, cur, NULL, format); + xmlFree(outbuf); + ret = buf->use - use; + return (ret); +} + +/** + * htmlNodeDump: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the current node + * + * Dump an HTML node, recursive behaviour,children are printed too, + * and formatting returns are added. + * + * Returns the number of byte written or -1 in case of error + */ +int +htmlNodeDump(xmlBufferPtr buf, xmlDocPtr doc, xmlNodePtr cur) { + xmlInitParser(); + + return(htmlNodeDumpFormat(buf, doc, cur, 1)); +} + +/** + * htmlNodeDumpFileFormat: + * @out: the FILE pointer + * @doc: the document + * @cur: the current node + * @encoding: the document encoding + * @format: should formatting spaces been added + * + * Dump an HTML node, recursive behaviour,children are printed too. + * + * TODO: if encoding == NULL try to save in the doc encoding + * + * returns: the number of byte written or -1 in case of failure. + */ +int +htmlNodeDumpFileFormat(FILE *out, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding, int format) { + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + int ret; + + xmlInitParser(); + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != XML_CHAR_ENCODING_UTF8) { + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) + return(-1); + } + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + + /* + * save the content to a temp buffer. + */ + buf = xmlOutputBufferCreateFile(out, handler); + if (buf == NULL) return(0); + + htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); + + ret = xmlOutputBufferClose(buf); + return(ret); +} + +/** + * htmlNodeDumpFile: + * @out: the FILE pointer + * @doc: the document + * @cur: the current node + * + * Dump an HTML node, recursive behaviour,children are printed too, + * and formatting returns are added. + */ +void +htmlNodeDumpFile(FILE *out, xmlDocPtr doc, xmlNodePtr cur) { + htmlNodeDumpFileFormat(out, doc, cur, NULL, 1); +} + +/** + * htmlDocDumpMemory: + * @cur: the document + * @mem: OUT: the memory pointer + * @size: OUT: the memory length + * + * Dump an HTML document in memory and return the xmlChar * and it's size. + * It's up to the caller to free the memory. + */ +void +htmlDocDumpMemory(xmlDocPtr cur, xmlChar**mem, int *size) { + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + const char *encoding; + + xmlInitParser(); + + if (cur == NULL) { + *mem = NULL; + *size = 0; + return; + } + + encoding = (const char *) htmlGetMetaEncoding(cur); + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != cur->charset) { + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + /* + * Not supported yet + */ + *mem = NULL; + *size = 0; + return; + } + + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) { + *mem = NULL; + *size = 0; + return; + } + } + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + + buf = xmlAllocOutputBuffer(handler); + if (buf == NULL) { + *mem = NULL; + *size = 0; + return; + } + + htmlDocContentDumpOutput(buf, cur, NULL); + xmlOutputBufferFlush(buf); + if (buf->conv != NULL) { + *size = buf->conv->use; + *mem = xmlStrndup(buf->conv->content, *size); + } else { + *size = buf->buffer->use; + *mem = xmlStrndup(buf->buffer->content, *size); + } + (void)xmlOutputBufferClose(buf); +} + + +/************************************************************************ + * * + * Dumping HTML tree content to an I/O output buffer * + * * + ************************************************************************/ + +void xmlNsListDumpOutput(xmlOutputBufferPtr buf, xmlNsPtr cur); + +/** + * htmlDtdDumpOutput: + * @buf: the HTML buffer output + * @doc: the document + * @encoding: the encoding string + * + * TODO: check whether encoding is needed + * + * Dump the HTML document DTD, if any. + */ +static void +htmlDtdDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + const char *encoding ATTRIBUTE_UNUSED) { + xmlDtdPtr cur = doc->intSubset; + + if (cur == NULL) { + htmlSaveErr(XML_SAVE_NO_DOCTYPE, (xmlNodePtr) doc, NULL); + return; + } + xmlOutputBufferWriteString(buf, "<!DOCTYPE "); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + if (cur->ExternalID != NULL) { + xmlOutputBufferWriteString(buf, " PUBLIC "); + xmlBufferWriteQuotedString(buf->buffer, cur->ExternalID); + if (cur->SystemID != NULL) { + xmlOutputBufferWriteString(buf, " "); + xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); + } + } else if (cur->SystemID != NULL) { + xmlOutputBufferWriteString(buf, " SYSTEM "); + xmlBufferWriteQuotedString(buf->buffer, cur->SystemID); + } + xmlOutputBufferWriteString(buf, ">\n"); +} + +/** + * htmlAttrDumpOutput: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the attribute pointer + * @encoding: the encoding string + * + * Dump an HTML attribute + */ +static void +htmlAttrDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, + const char *encoding ATTRIBUTE_UNUSED) { + xmlChar *value; + + /* + * TODO: The html output method should not escape a & character + * occurring in an attribute value immediately followed by + * a { character (see Section B.7.1 of the HTML 4.0 Recommendation). + */ + + if (cur == NULL) { + return; + } + xmlOutputBufferWriteString(buf, " "); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); + xmlOutputBufferWriteString(buf, ":"); + } + xmlOutputBufferWriteString(buf, (const char *)cur->name); + if ((cur->children != NULL) && (!htmlIsBooleanAttr(cur->name))) { + value = xmlNodeListGetString(doc, cur->children, 0); + if (value) { + xmlOutputBufferWriteString(buf, "="); + if ((cur->ns == NULL) && (cur->parent != NULL) && + (cur->parent->ns == NULL) && + ((!xmlStrcasecmp(cur->name, BAD_CAST "href")) || + (!xmlStrcasecmp(cur->name, BAD_CAST "action")) || + (!xmlStrcasecmp(cur->name, BAD_CAST "src")))) { + xmlChar *escaped; + xmlChar *tmp = value; + + while (IS_BLANK_CH(*tmp)) tmp++; + + escaped = xmlURIEscapeStr(tmp, BAD_CAST"@/:=?;#%&,+"); + if (escaped != NULL) { + xmlBufferWriteQuotedString(buf->buffer, escaped); + xmlFree(escaped); + } else { + xmlBufferWriteQuotedString(buf->buffer, value); + } + } else { + xmlBufferWriteQuotedString(buf->buffer, value); + } + xmlFree(value); + } else { + xmlOutputBufferWriteString(buf, "=\"\""); + } + } +} + +/** + * htmlAttrListDumpOutput: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the first attribute pointer + * @encoding: the encoding string + * + * Dump a list of HTML attributes + */ +static void +htmlAttrListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, xmlAttrPtr cur, const char *encoding) { + if (cur == NULL) { + return; + } + while (cur != NULL) { + htmlAttrDumpOutput(buf, doc, cur, encoding); + cur = cur->next; + } +} + + + +/** + * htmlNodeListDumpOutput: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the first node + * @encoding: the encoding string + * @format: should formatting spaces been added + * + * Dump an HTML node list, recursive behaviour,children are printed too. + */ +static void +htmlNodeListDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding, int format) { + if (cur == NULL) { + return; + } + while (cur != NULL) { + htmlNodeDumpFormatOutput(buf, doc, cur, encoding, format); + cur = cur->next; + } +} + +/** + * htmlNodeDumpFormatOutput: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the current node + * @encoding: the encoding string + * @format: should formatting spaces been added + * + * Dump an HTML node, recursive behaviour,children are printed too. + */ +void +htmlNodeDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding, int format) { + const htmlElemDesc * info; + + xmlInitParser(); + + if (cur == NULL) { + return; + } + /* + * Special cases. + */ + if (cur->type == XML_DTD_NODE) + return; + if (cur->type == XML_HTML_DOCUMENT_NODE) { + htmlDocContentDumpOutput(buf, (xmlDocPtr) cur, encoding); + return; + } + if (cur->type == HTML_TEXT_NODE) { + if (cur->content != NULL) { + if (((cur->name == (const xmlChar *)xmlStringText) || + (cur->name != (const xmlChar *)xmlStringTextNoenc)) && + ((cur->parent == NULL) || + ((xmlStrcasecmp(cur->parent->name, BAD_CAST "script")) && + (xmlStrcasecmp(cur->parent->name, BAD_CAST "style"))))) { + xmlChar *buffer; + + buffer = xmlEncodeEntitiesReentrant(doc, cur->content); + if (buffer != NULL) { + xmlOutputBufferWriteString(buf, (const char *)buffer); + xmlFree(buffer); + } + } else { + xmlOutputBufferWriteString(buf, (const char *)cur->content); + } + } + return; + } + if (cur->type == HTML_COMMENT_NODE) { + if (cur->content != NULL) { + xmlOutputBufferWriteString(buf, "<!--"); + xmlOutputBufferWriteString(buf, (const char *)cur->content); + xmlOutputBufferWriteString(buf, "-->"); + } + return; + } + if (cur->type == HTML_PI_NODE) { + if (cur->name == NULL) + return; + xmlOutputBufferWriteString(buf, "<?"); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + if (cur->content != NULL) { + xmlOutputBufferWriteString(buf, " "); + xmlOutputBufferWriteString(buf, (const char *)cur->content); + } + xmlOutputBufferWriteString(buf, ">"); + return; + } + if (cur->type == HTML_ENTITY_REF_NODE) { + xmlOutputBufferWriteString(buf, "&"); + xmlOutputBufferWriteString(buf, (const char *)cur->name); + xmlOutputBufferWriteString(buf, ";"); + return; + } + if (cur->type == HTML_PRESERVE_NODE) { + if (cur->content != NULL) { + xmlOutputBufferWriteString(buf, (const char *)cur->content); + } + return; + } + + /* + * Get specific HTML info for that node. + */ + if (cur->ns == NULL) + info = htmlTagLookup(cur->name); + else + info = NULL; + + xmlOutputBufferWriteString(buf, "<"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); + xmlOutputBufferWriteString(buf, ":"); + } + xmlOutputBufferWriteString(buf, (const char *)cur->name); + if (cur->nsDef) + xmlNsListDumpOutput(buf, cur->nsDef); + if (cur->properties != NULL) + htmlAttrListDumpOutput(buf, doc, cur->properties, encoding); + + if ((info != NULL) && (info->empty)) { + xmlOutputBufferWriteString(buf, ">"); + if ((format) && (!info->isinline) && (cur->next != NULL)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && + (cur->parent != NULL) && + (cur->parent->name != NULL) && + (cur->parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + return; + } + if (((cur->type == XML_ELEMENT_NODE) || (cur->content == NULL)) && + (cur->children == NULL)) { + if ((info != NULL) && (info->saveEndTag != 0) && + (xmlStrcmp(BAD_CAST info->name, BAD_CAST "html")) && + (xmlStrcmp(BAD_CAST info->name, BAD_CAST "body"))) { + xmlOutputBufferWriteString(buf, ">"); + } else { + xmlOutputBufferWriteString(buf, "></"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); + xmlOutputBufferWriteString(buf, ":"); + } + xmlOutputBufferWriteString(buf, (const char *)cur->name); + xmlOutputBufferWriteString(buf, ">"); + } + if ((format) && (cur->next != NULL) && + (info != NULL) && (!info->isinline)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && + (cur->parent != NULL) && + (cur->parent->name != NULL) && + (cur->parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + return; + } + xmlOutputBufferWriteString(buf, ">"); + if ((cur->type != XML_ELEMENT_NODE) && + (cur->content != NULL)) { + /* + * Uses the OutputBuffer property to automatically convert + * invalids to charrefs + */ + + xmlOutputBufferWriteString(buf, (const char *) cur->content); + } + if (cur->children != NULL) { + if ((format) && (info != NULL) && (!info->isinline) && + (cur->children->type != HTML_TEXT_NODE) && + (cur->children->type != HTML_ENTITY_REF_NODE) && + (cur->children != cur->last) && + (cur->name != NULL) && + (cur->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + htmlNodeListDumpOutput(buf, doc, cur->children, encoding, format); + if ((format) && (info != NULL) && (!info->isinline) && + (cur->last->type != HTML_TEXT_NODE) && + (cur->last->type != HTML_ENTITY_REF_NODE) && + (cur->children != cur->last) && + (cur->name != NULL) && + (cur->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } + xmlOutputBufferWriteString(buf, "</"); + if ((cur->ns != NULL) && (cur->ns->prefix != NULL)) { + xmlOutputBufferWriteString(buf, (const char *)cur->ns->prefix); + xmlOutputBufferWriteString(buf, ":"); + } + xmlOutputBufferWriteString(buf, (const char *)cur->name); + xmlOutputBufferWriteString(buf, ">"); + if ((format) && (info != NULL) && (!info->isinline) && + (cur->next != NULL)) { + if ((cur->next->type != HTML_TEXT_NODE) && + (cur->next->type != HTML_ENTITY_REF_NODE) && + (cur->parent != NULL) && + (cur->parent->name != NULL) && + (cur->parent->name[0] != 'p')) /* p, pre, param */ + xmlOutputBufferWriteString(buf, "\n"); + } +} + +/** + * htmlNodeDumpOutput: + * @buf: the HTML buffer output + * @doc: the document + * @cur: the current node + * @encoding: the encoding string + * + * Dump an HTML node, recursive behaviour,children are printed too, + * and formatting returns/spaces are added. + */ +void +htmlNodeDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr doc, + xmlNodePtr cur, const char *encoding) { + htmlNodeDumpFormatOutput(buf, doc, cur, encoding, 1); +} + +/** + * htmlDocContentDumpFormatOutput: + * @buf: the HTML buffer output + * @cur: the document + * @encoding: the encoding string + * @format: should formatting spaces been added + * + * Dump an HTML document. + */ +void +htmlDocContentDumpFormatOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, + const char *encoding, int format) { + int type; + + xmlInitParser(); + + /* + * force to output the stuff as HTML, especially for entities + */ + type = cur->type; + cur->type = XML_HTML_DOCUMENT_NODE; + if (cur->intSubset != NULL) { + htmlDtdDumpOutput(buf, cur, NULL); + } + if (cur->children != NULL) { + htmlNodeListDumpOutput(buf, cur, cur->children, encoding, format); + } + xmlOutputBufferWriteString(buf, "\n"); + cur->type = (xmlElementType) type; +} + +/** + * htmlDocContentDumpOutput: + * @buf: the HTML buffer output + * @cur: the document + * @encoding: the encoding string + * + * Dump an HTML document. Formating return/spaces are added. + */ +void +htmlDocContentDumpOutput(xmlOutputBufferPtr buf, xmlDocPtr cur, + const char *encoding) { + htmlDocContentDumpFormatOutput(buf, cur, encoding, 1); +} + +/************************************************************************ + * * + * Saving functions front-ends * + * * + ************************************************************************/ + +/** + * htmlDocDump: + * @f: the FILE* + * @cur: the document + * + * Dump an HTML document to an open FILE. + * + * returns: the number of byte written or -1 in case of failure. + */ +int +htmlDocDump(FILE *f, xmlDocPtr cur) { + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + const char *encoding; + int ret; + + xmlInitParser(); + + if (cur == NULL) { + return(-1); + } + + encoding = (const char *) htmlGetMetaEncoding(cur); + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != cur->charset) { + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + /* + * Not supported yet + */ + return(-1); + } + + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) + return(-1); + } + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + + buf = xmlOutputBufferCreateFile(f, handler); + if (buf == NULL) return(-1); + htmlDocContentDumpOutput(buf, cur, NULL); + + ret = xmlOutputBufferClose(buf); + return(ret); +} + +/** + * htmlSaveFile: + * @filename: the filename (or URL) + * @cur: the document + * + * Dump an HTML document to a file. If @filename is "-" the stdout file is + * used. + * returns: the number of byte written or -1 in case of failure. + */ +int +htmlSaveFile(const char *filename, xmlDocPtr cur) { + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + const char *encoding; + int ret; + + xmlInitParser(); + + encoding = (const char *) htmlGetMetaEncoding(cur); + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != cur->charset) { + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + /* + * Not supported yet + */ + return(-1); + } + + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) + return(-1); + } + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + + /* + * save the content to a temp buffer. + */ + buf = xmlOutputBufferCreateFilename(filename, handler, cur->compression); + if (buf == NULL) return(0); + + htmlDocContentDumpOutput(buf, cur, NULL); + + ret = xmlOutputBufferClose(buf); + return(ret); +} + +/** + * htmlSaveFileFormat: + * @filename: the filename + * @cur: the document + * @format: should formatting spaces been added + * @encoding: the document encoding + * + * Dump an HTML document to a file using a given encoding. + * + * returns: the number of byte written or -1 in case of failure. + */ +int +htmlSaveFileFormat(const char *filename, xmlDocPtr cur, + const char *encoding, int format) { + xmlOutputBufferPtr buf; + xmlCharEncodingHandlerPtr handler = NULL; + int ret; + + xmlInitParser(); + + if (encoding != NULL) { + xmlCharEncoding enc; + + enc = xmlParseCharEncoding(encoding); + if (enc != cur->charset) { + if (cur->charset != XML_CHAR_ENCODING_UTF8) { + /* + * Not supported yet + */ + return(-1); + } + + handler = xmlFindCharEncodingHandler(encoding); + if (handler == NULL) + return(-1); + htmlSetMetaEncoding(cur, (const xmlChar *) encoding); + } + } else { + htmlSetMetaEncoding(cur, (const xmlChar *) "UTF-8"); + } + + /* + * Fallback to HTML or ASCII when the encoding is unspecified + */ + if (handler == NULL) + handler = xmlFindCharEncodingHandler("HTML"); + if (handler == NULL) + handler = xmlFindCharEncodingHandler("ascii"); + + /* + * save the content to a temp buffer. + */ + buf = xmlOutputBufferCreateFilename(filename, handler, 0); + if (buf == NULL) return(0); + + htmlDocContentDumpFormatOutput(buf, cur, encoding, format); + + ret = xmlOutputBufferClose(buf); + return(ret); +} + +/** + * htmlSaveFileEnc: + * @filename: the filename + * @cur: the document + * @encoding: the document encoding + * + * Dump an HTML document to a file using a given encoding + * and formatting returns/spaces are added. + * + * returns: the number of byte written or -1 in case of failure. + */ +int +htmlSaveFileEnc(const char *filename, xmlDocPtr cur, const char *encoding) { + return(htmlSaveFileFormat(filename, cur, encoding, 1)); +} + +#endif /* LIBXML_OUTPUT_ENABLED */ + +#endif /* LIBXML_HTML_ENABLED */ |