summaryrefslogtreecommitdiff
path: root/HTMLparser.c
diff options
context:
space:
mode:
author Aron Xu <aron@debian.org> 2012-05-25 04:03:35 +0000
committer Aron Xu <aron@debian.org> 2012-05-25 04:03:35 +0000
commit d7372d053bbd1d58216fbb04d1771ffa4cc3e624 (patch)
tree 62b661911406394bbeaca8951d660bb6d8aac0de /HTMLparser.c
parent 2d1849b271fa8697b88d07ba7d78dc83591e1363 (diff)
download libxml2-d7372d053bbd1d58216fbb04d1771ffa4cc3e624.tar.gz
Imported Upstream version 2.8.0+dfsg1 upstream/2.8.0+dfsg1
Diffstat (limited to 'HTMLparser.c')
-rw-r--r-- HTMLparser.c 131
1 files changed, 109 insertions, 22 deletions
diff --git a/HTMLparser.c b/HTMLparser.c
index 4d43b93..66ff17b 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -727,7 +727,7 @@ static const char* const map_contents[] = { BLOCK, "area", NULL } ;
static const char* const name_attr[] = { "name", NULL } ;
static const char* const action_attr[] = { "action", NULL } ;
static const char* const blockli_elt[] = { BLOCK, "li", NULL } ;
-static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", NULL } ;
+static const char* const meta_attrs[] = { I18N, "http-equiv", "name", "scheme", "charset", NULL } ;
static const char* const content_attr[] = { "content", NULL } ;
static const char* const type_attr[] = { "type", NULL } ;
static const char* const noframes_content[] = { "body", FLOW MODIFIER, NULL } ;
@@ -1080,7 +1080,7 @@ static const char * const htmlStartClose[] = {
"menu", "p", "head", "ul", NULL,
"p", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", FONTSTYLE, NULL,
"div", "p", "head", NULL,
-"noscript", "p", "head", NULL,
+"noscript", "p", NULL,
"center", "font", "b", "i", "p", "head", NULL,
"a", "a", NULL,
"caption", "p", NULL,
@@ -3435,34 +3435,26 @@ htmlParseAttribute(htmlParserCtxtPtr ctxt, xmlChar **value) {
}
/**
- * htmlCheckEncoding:
+ * htmlCheckEncodingDirect:
* @ctxt: an HTML parser context
* @attvalue: the attribute value
*
- * Checks an http-equiv attribute from a Meta tag to detect
+ * Checks an attribute value to detect
* the encoding
* If a new encoding is detected the parser is switched to decode
* it and pass UTF8
*/
static void
-htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
- const xmlChar *encoding;
+htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) {
- if ((ctxt == NULL) || (attvalue == NULL))
+ if ((ctxt == NULL) || (encoding == NULL) ||
+ (ctxt->options & HTML_PARSE_IGNORE_ENC))
return;
/* do not change encoding */
if (ctxt->input->encoding != NULL)
return;
- encoding = xmlStrcasestr(attvalue, BAD_CAST"charset=");
- if (encoding != NULL) {
- encoding += 8;
- } else {
- encoding = xmlStrcasestr(attvalue, BAD_CAST"charset =");
- if (encoding != NULL)
- encoding += 9;
- }
if (encoding != NULL) {
xmlCharEncoding enc;
xmlCharEncodingHandlerPtr handler;
@@ -3500,7 +3492,9 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
xmlSwitchToEncoding(ctxt, handler);
ctxt->charset = XML_CHAR_ENCODING_UTF8;
} else {
- ctxt->errNo = XML_ERR_UNSUPPORTED_ENCODING;
+ htmlParseErr(ctxt, XML_ERR_UNSUPPORTED_ENCODING,
+ "htmlCheckEncoding: unknown encoding %s\n",
+ encoding, NULL);
}
}
@@ -3533,6 +3527,38 @@ htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
}
/**
+ * htmlCheckEncoding:
+ * @ctxt: an HTML parser context
+ * @attvalue: the content attribute value of a Meta http-equiv tag
+ *
+ * Looks for a "charset" token in @attvalue, skips any blanks after it
+ * and, when an '=' follows, hands the text after the '=' to
+ * htmlCheckEncodingDirect() so the parser can switch encodings.
+ * Does nothing when @attvalue is NULL or carries no charset.
+ */
+static void
+htmlCheckEncoding(htmlParserCtxtPtr ctxt, const xmlChar *attvalue) {
+    const xmlChar *encoding;
+
+    if (!attvalue)
+        return;
+
+    encoding = xmlStrcasestr(attvalue, BAD_CAST"charset");
+    if (encoding != NULL) {
+        encoding += 7;
+    }
+    /*
+     * skip blanks between "charset" and '=', staying after the token
+     */
+    while (encoding && IS_BLANK_CH(*encoding))
+        encoding ++;
+    if (encoding && *encoding == '=') {
+        encoding ++;
+        htmlCheckEncodingDirect(ctxt, encoding);
+    }
+}
+
+/**
* htmlCheckMeta:
* @ctxt: an HTML parser context
* @atts: the attributes values
@@ -3556,6 +3582,8 @@ htmlCheckMeta(htmlParserCtxtPtr ctxt, const xmlChar **atts) {
if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"http-equiv"))
&& (!xmlStrcasecmp(value, BAD_CAST"Content-Type")))
http = 1;
+ else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"charset")))
+ htmlCheckEncodingDirect(ctxt, value);
else if ((value != NULL) && (!xmlStrcasecmp(att, BAD_CAST"content")))
content = value;
att = atts[i++];
@@ -3885,6 +3913,7 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
if ((oldname != NULL) && (xmlStrEqual(oldname, name))) {
if ((ctxt->sax != NULL) && (ctxt->sax->endElement != NULL))
ctxt->sax->endElement(ctxt->userData, name);
+ htmlNodeInfoPop(ctxt);
htmlnamePop(ctxt);
ret = 1;
} else {
@@ -5173,6 +5202,8 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int avail = 0;
xmlChar cur, next;
+ htmlParserNodeInfo node_info;
+
#ifdef DEBUG_PUSH
switch (ctxt->instate) {
case XML_PARSER_EOF:
@@ -5312,10 +5343,23 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
avail = in->length - (in->cur - in->base);
else
avail = in->buf->buffer->use - (in->cur - in->base);
- if (avail < 2)
+ /*
+ * no chars in buffer
+ */
+ if (avail < 1)
goto done;
+ /*
+ * not enough chars in buffer
+ */
+ if (avail < 2) {
+ if (!terminate)
+ goto done;
+ else
+ next = ' ';
+ } else {
+ next = in->cur[1];
+ }
cur = in->cur[0];
- next = in->cur[1];
if ((cur == '<') && (next == '!') &&
(in->cur[2] == '-') && (in->cur[3] == '-')) {
if ((!terminate) &&
@@ -5465,8 +5509,22 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
int failed;
const htmlElemDesc * info;
- if (avail < 2)
+ /*
+ * no chars in buffer
+ */
+ if (avail < 1)
goto done;
+ /*
+ * not enough chars in buffer
+ */
+ if (avail < 2) {
+ if (!terminate)
+ goto done;
+ else
+ next = ' ';
+ } else {
+ next = in->cur[1];
+ }
cur = in->cur[0];
if (cur != '<') {
ctxt->instate = XML_PARSER_CONTENT;
@@ -5476,7 +5534,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
#endif
break;
}
- if (in->cur[1] == '/') {
+ if (next == '/') {
ctxt->instate = XML_PARSER_END_TAG;
ctxt->checkIndex = 0;
#ifdef DEBUG_PUSH
@@ -5489,6 +5547,14 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
(htmlParseLookupSequence(ctxt, '>', 0, 0, 0, 1) < 0))
goto done;
+ /* Capture start position */
+ if (ctxt->record_info) {
+ node_info.begin_pos = ctxt->input->consumed +
+ (CUR_PTR - ctxt->input->base);
+ node_info.begin_line = ctxt->input->line;
+ }
+
+
failed = htmlParseStartTag(ctxt);
name = ctxt->name;
if ((failed == -1) ||
@@ -5538,6 +5604,9 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
htmlnamePop(ctxt);
}
+ if (ctxt->record_info)
+ htmlNodeInfoPush(ctxt, &node_info);
+
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -5554,6 +5623,10 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) {
ctxt->sax->endElement(ctxt->userData, name);
htmlnamePop(ctxt);
}
+
+ if (ctxt->record_info)
+ htmlNodeInfoPush(ctxt, &node_info);
+
ctxt->instate = XML_PARSER_CONTENT;
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext,
@@ -6537,6 +6610,14 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
ctxt->options |= HTML_PARSE_NODEFDTD;
options -= HTML_PARSE_NODEFDTD;
}
+ if (options & HTML_PARSE_IGNORE_ENC) {
+ ctxt->options |= HTML_PARSE_IGNORE_ENC;
+ options -= HTML_PARSE_IGNORE_ENC;
+ }
+ if (options & HTML_PARSE_NOIMPLIED) {
+ ctxt->options |= HTML_PARSE_NOIMPLIED;
+ options -= HTML_PARSE_NOIMPLIED;
+ }
ctxt->dictNames = 0;
return (options);
}
@@ -6730,8 +6811,11 @@ htmlReadIO(xmlInputReadCallback ioread, xmlInputCloseCallback ioclose,
input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
XML_CHAR_ENCODING_NONE);
- if (input == NULL)
+ if (input == NULL) {
+ if (ioclose != NULL)
+ ioclose(ioctx);
return (NULL);
+ }
ctxt = htmlNewParserCtxt();
if (ctxt == NULL) {
xmlFreeParserInputBuffer(input);
@@ -6930,8 +7014,11 @@ htmlCtxtReadIO(htmlParserCtxtPtr ctxt, xmlInputReadCallback ioread,
input = xmlParserInputBufferCreateIO(ioread, ioclose, ioctx,
XML_CHAR_ENCODING_NONE);
- if (input == NULL)
+ if (input == NULL) {
+ if (ioclose != NULL)
+ ioclose(ioctx);
return (NULL);
+ }
stream = xmlNewIOInputStream(ctxt, input, XML_CHAR_ENCODING_NONE);
if (stream == NULL) {
xmlFreeParserInputBuffer(input);