diff options
Diffstat (limited to 'HTMLparser.c')
-rw-r--r-- | HTMLparser.c | 102 |
1 files changed, 79 insertions, 23 deletions
diff --git a/HTMLparser.c b/HTMLparser.c index c6115d0..d11ae08 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -192,16 +192,16 @@ htmlnamePop(htmlParserCtxtPtr ctxt) const xmlChar *ret; if (ctxt->nameNr <= 0) - return (0); + return (NULL); ctxt->nameNr--; if (ctxt->nameNr < 0) - return (0); + return (NULL); if (ctxt->nameNr > 0) ctxt->name = ctxt->nameTab[ctxt->nameNr - 1]; else ctxt->name = NULL; ret = ctxt->nameTab[ctxt->nameNr]; - ctxt->nameTab[ctxt->nameNr] = 0; + ctxt->nameTab[ctxt->nameNr] = NULL; return (ret); } @@ -964,7 +964,6 @@ NULL static const char *htmlNoContentElements[] = { "html", "head", - "body", NULL }; @@ -2042,6 +2041,7 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { unsigned int i; int j; xmlNodePtr lastChild; + xmlDtdPtr dtd; for (j = 0;j < len;j++) if (!(IS_BLANK_CH(str[j]))) return(0); @@ -2054,8 +2054,17 @@ static int areBlanks(htmlParserCtxtPtr ctxt, const xmlChar *str, int len) { return(1); if (xmlStrEqual(ctxt->name, BAD_CAST"head")) return(1); - if (xmlStrEqual(ctxt->name, BAD_CAST"body")) - return(1); + + /* Only strip CDATA children of the body tag for strict HTML DTDs */ + if (xmlStrEqual(ctxt->name, BAD_CAST "body") && ctxt->myDoc != NULL) { + dtd = xmlGetIntSubset(ctxt->myDoc); + if (dtd != NULL && dtd->ExternalID != NULL) { + if (!xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4.01//EN") || + !xmlStrcasecmp(dtd->ExternalID, BAD_CAST "-//W3C//DTD HTML 4//EN")) + return(1); + } + } + if (ctxt->node == NULL) return(0); lastChild = xmlGetLastChild(ctxt->node); while ((lastChild) && (lastChild->type == XML_COMMENT_NODE)) @@ -2627,12 +2636,12 @@ htmlParsePubidLiteral(htmlParserCtxtPtr ctxt) { */ static void htmlParseScript(htmlParserCtxtPtr ctxt) { - xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 1]; + xmlChar buf[HTML_PARSER_BIG_BUFFER_SIZE + 5]; int nbchar = 0; - xmlChar cur; + int cur,l; SHRINK; - cur = CUR; + cur = CUR_CHAR(l); while (IS_CHAR_CH(cur)) { if ((cur == '<') && (NXT(1) == '!') && (NXT(2) == '-') && (NXT(3) == '-')) { @@ -2648,20 +2657,39 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { } nbchar = 0; htmlParseComment(ctxt); - cur = CUR; + cur = CUR_CHAR(l); continue; } else if ((cur == '<') && (NXT(1) == '/')) { - /* - * One should break here, the specification is clear: - * Authors should therefore escape "</" within the content. - * Escape mechanisms are specific to each scripting or - * style sheet language. - */ - if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || - ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) - break; /* while */ + /* + * One should break here, the specification is clear: + * Authors should therefore escape "</" within the content. + * Escape mechanisms are specific to each scripting or + * style sheet language. + * + * In recovery mode, only break if end tag match the + * current tag, effectively ignoring all tags inside the + * script/style block and treating the entire block as + * CDATA. + */ + if (ctxt->recovery) { + if (xmlStrncasecmp(ctxt->name, ctxt->input->cur+2, + xmlStrlen(ctxt->name)) == 0) + { + break; /* while */ + } else { + htmlParseErr(ctxt, XML_ERR_TAG_NAME_MISMATCH, + "Element %s embbeds close tag\n", + ctxt->name, NULL); + } + } else { + if (((NXT(2) >= 'A') && (NXT(2) <= 'Z')) || + ((NXT(2) >= 'a') && (NXT(2) <= 'z'))) + { + break; /* while */ + } + } } - buf[nbchar++] = cur; + COPY_BUF(l,buf,nbchar,cur); if (nbchar >= HTML_PARSER_BIG_BUFFER_SIZE) { if (ctxt->sax->cdataBlock!= NULL) { /* @@ -2673,9 +2701,10 @@ htmlParseScript(htmlParserCtxtPtr ctxt) { } nbchar = 0; } - NEXT; - cur = CUR; + NEXTL(l); + cur = CUR_CHAR(l); } + if (!(IS_CHAR_CH(cur))) { htmlParseErrInt(ctxt, XML_ERR_INVALID_CHAR, "Invalid char in CDATA 0x%X\n", cur); @@ -2743,6 +2772,8 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) { } } if (nbchar != 0) { + buf[nbchar] = 0; + /* * Ok the segment is to be consumed as chars. */ @@ -3578,6 +3609,15 @@ htmlParseEndTag(htmlParserCtxtPtr ctxt) if ((!IS_CHAR_CH(CUR)) || (CUR != '>')) { htmlParseErr(ctxt, XML_ERR_GT_REQUIRED, "End tag : expected '>'\n", NULL, NULL); + if (ctxt->recovery) { + /* + * We're not at the ending > !! + * Error, unless in recover mode where we search forwards + * until we find a > + */ + while (CUR != '\0' && CUR != '>') NEXT; + NEXT; + } } else NEXT; @@ -5176,10 +5216,18 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { int base = ctxt->input->base - ctxt->input->buf->buffer->content; int cur = ctxt->input->cur - ctxt->input->base; + int res; - xmlParserInputBufferPush(ctxt->input->buf, size, chunk); + res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); + if (res < 0) { + ctxt->errNo = XML_PARSER_EOF; + ctxt->disableSAX = 1; + return (XML_PARSER_EOF); + } ctxt->input->base = ctxt->input->buf->buffer->content + base; ctxt->input->cur = ctxt->input->base + cur; + ctxt->input->end = + &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif @@ -5777,6 +5825,14 @@ htmlCtxtUseOptions(htmlParserCtxtPtr ctxt, int options) ctxt->options |= XML_PARSE_NOBLANKS; } else ctxt->keepBlanks = 1; + if (options & HTML_PARSE_RECOVER) { + ctxt->recovery = 1; + } else + ctxt->recovery = 0; + if (options & HTML_PARSE_COMPACT) { + ctxt->options |= HTML_PARSE_COMPACT; + options -= HTML_PARSE_COMPACT; + } ctxt->dictNames = 0; return (options); } |