summaryrefslogtreecommitdiff
path: root/HTMLparser.c
diff options
context:
space:
mode:
Diffstat (limited to 'HTMLparser.c')
-rw-r--r--HTMLparser.c102
1 files changed, 79 insertions, 23 deletions
diff --git a/HTMLparser.c b/HTMLparser.c
index c6115d0..d11ae08 100644
--- a/HTMLparser.c
+++ b/HTMLparser.c
@@ -192,16 +192,16 @@ htmlnamePop(htmlParserCtxtPtr ctxt)
const xmlChar *ret;
if (ctxt->nameNr <= 0)
- return (0);
+ return (NULL);
ctxt->nameNr--;
if (ctxt->nameNr < 0)
- return (0);
+ return (NULL);
if (ctxt->nameNr > 0)
ctxt->name = ctxt->nameTab[ctxt->nameNr - 1];
else
ctxt->name = NULL;
ret = ctxt->nameTab[ctxt->nameNr];
- ctxt->nameTab[ctxt->nameNr] = 0;
+ ctxt->nameTab[ctxt->nameNr] = NULL;
return (ret);
}
@@ -964,7 +964,6 @@ NULL
static const char *htmlNoContentElements[] = {
"html",
"head",
- "body",
NULL
};
@@ -2042,6 +2041,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
unsigned int i;
int j;
xmlNodePtr lastChild;
+ xmlDtdPtr dtd;
for (j = 0;j < len;j++)
if (!(IS_BLANK_CH(str[j]))) return(0);
@@ -2054,8 +2054,17 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) {
return(1);
if (xmlStrEqual(ctxt->name, BAD_CAST"head"))
return(1);
- if (xmlStrEqual(ctxt->name, BAD_CAST"body"))
- return(1);
+
+ /* Only strip CDATA children of the body tag for strict HTML DTDs */
+ if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) {
+ dtd = xmlGetIntSubset(ctxt->myDoc);
+ if (dtd != NULL && dtd->ExternalID != NULL) {
+ if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") ||
+ !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN"))
+ return(1);
+ }
+ }
+
if (ctxt->node == NULL) return(0);
lastChild = xmlGetLastChild(ctxt->node);
while ((lastChild) && (lastChild->type == XML_COMMENT_NODE))
@@ -2627,12 +2636,12 @@ htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) {
*/
static void
htmlParseScript(htmlParserCtxtPtr ctxt) {
- xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1];
+ xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5];
int nbchar = 0;
- xmlChar cur;
+ int cur,l;
SHRINK;
- cur = CUR;
+ cur = CUR_CHAR(l);
while (IS_CHAR_CH(cur)) {
if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') &&
(NXT(3) == '-')) {
@@ -2648,20 +2657,39 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
}
nbchar = 0;
htmlParseComment(ctxt);
- cur = CUR;
+ cur = CUR_CHAR(l);
continue;
} else if ((cur == '<') && (NXT(1) == '/')) {
- /*
- * One should break here, the specification is clear:
- * Authors should therefore escape "</" within the content.
- * Escape mechanisms are specific to each scripting or
- * style sheet language.
- */
- if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
- ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
- break; /* while */
+ /*
+ * One should break here, the specification is clear:
+ * Authors should therefore escape "</" within the content.
+ * Escape mechanisms are specific to each scripting or
+ * style sheet language.
+ *
+ * In recovery mode, only break if end tag match the
+ * current tag, effectively ignoring all tags inside the
+ * script/style block and treating the entire block as
+ * CDATA.
+ */
+ if (ctxt->recovery) {
+ if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2,
+ xmlStrlen(ctxt->name)) == 0)
+ {
+ break; /* while */
+ } else {
+ htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH,
+ "Element %s embbeds close tag\n",
+ ctxt->name, NULL);
+ }
+ } else {
+ if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) ||
+ ((NXT(2) >= 'a') && (NXT(2) <= 'z')))
+ {
+ break; /* while */
+ }
+ }
}
- buf[nbchar++] = cur;
+ COPY_BUF(l,buf,nbchar,cur);
if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) {
if (ctxt->sax->cdataBlock!= NULL) {
/*
@@ -2673,9 +2701,10 @@ htmlParseScript(htmlParserCtxtPtr ctxt) {
}
nbchar = 0;
}
- NEXT;
- cur = CUR;
+ NEXTL(l);
+ cur = CUR_CHAR(l);
}
+
if (!(IS_CHAR_CH(cur))) {
htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR,
"Invalid char in CDATA 0x%X\n", cur);
@@ -2743,6 +2772,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) {
}
}
if (nbchar != 0) {
+ buf[nbchar] = 0;
+
/*
* Ok the segment is to be consumed as chars.
*/
@@ -3578,6 +3609,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt)
if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) {
htmlParseErr(ctxt, XML_ERR_GT_REQUIRED,
"End tag : expected '>'\n", NULL, NULL);
+ if (ctxt->recovery) {
+ /*
+ * We're not at the ending > !!
+ * Error, unless in recover mode where we search forwards
+ * until we find a >
+ */
+ while (CUR != '\0' && CUR != '>') NEXT;
+ NEXT;
+ }
} else
NEXT;
@@ -5176,10 +5216,18 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size,
(ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) {
int base = ctxt->input->base - ctxt->input->buf->buffer->content;
int cur = ctxt->input->cur - ctxt->input->base;
+ int res;
- xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
+ res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk);
+ if (res < 0) {
+ ctxt->errNo = XML_PARSER_EOF;
+ ctxt->disableSAX = 1;
+ return (XML_PARSER_EOF);
+ }
ctxt->input->base = ctxt->input->buf->buffer->content + base;
ctxt->input->cur = ctxt->input->base + cur;
+ ctxt->input->end =
+ &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use];
#ifdef DEBUG_PUSH
xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size);
#endif
@@ -5777,6 +5825,14 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options)
ctxt->options |= XML_PARSE_NOBLANKS;
} else
ctxt->keepBlanks = 1;
+ if (options & HTML_PARSE_RECOVER) {
+ ctxt->recovery = 1;
+ } else
+ ctxt->recovery = 0;
+ if (options & HTML_PARSE_COMPACT) {
+ ctxt->options |= HTML_PARSE_COMPACT;
+ options -= HTML_PARSE_COMPACT;
+ }
ctxt->dictNames = 0;
return (options);
}