diff options
author | Aron Xu <aron@debian.org> | 2012-09-18 01:15:22 +0800 |
---|---|---|
committer | Aron Xu <aron@debian.org> | 2012-09-18 01:15:22 +0800 |
commit | f660f9d2924c7549bc87e7f9b4ece7c9727b3682 (patch) | |
tree | 816b3ef0e89d1601803b5dc1b90b50ade0567043 /HTMLparser.c | |
parent | d7372d053bbd1d58216fbb04d1771ffa4cc3e624 (diff) | |
download | libxml2-f660f9d2924c7549bc87e7f9b4ece7c9727b3682.tar.gz |
Imported Upstream version 2.9.0upstream/2.9.0
Diffstat (limited to 'HTMLparser.c')
-rw-r--r-- | HTMLparser.c | 137 |
1 files changed, 89 insertions, 48 deletions
diff --git a/HTMLparser.c b/HTMLparser.c index 66ff17b..a533f37 100644 --- a/HTMLparser.c +++ b/HTMLparser.c @@ -44,6 +44,9 @@ #include <libxml/globals.h> #include <libxml/uri.h> +#include "buf.h" +#include "enc.h" + #define HTML_MAX_NAMELEN 1000 #define HTML_PARSER_BIG_BUFFER_SIZE 1000 #define HTML_PARSER_BUFFER_SIZE 100 @@ -1082,7 +1085,7 @@ static const char * const htmlStartClose[] = { "div", "p", "head", NULL, "noscript", "p", NULL, "center", "font", "b", "i", "p", "head", NULL, -"a", "a", NULL, +"a", "a", "head", NULL, "caption", "p", NULL, "colgroup", "caption", "colgroup", "col", "p", NULL, "col", "caption", "col", "p", NULL, @@ -1100,6 +1103,43 @@ static const char * const htmlStartClose[] = { "option", "option", NULL, "fieldset", "legend", "p", "head", "h1", "h2", "h3", "h4", "h5", "h6", "pre", "listing", "xmp", "a", NULL, +/* most tags in in FONTSTYLE, PHRASE and SPECIAL should close <head> */ +"tt", "head", NULL, +"i", "head", NULL, +"b", "head", NULL, +"u", "head", NULL, +"s", "head", NULL, +"strike", "head", NULL, +"big", "head", NULL, +"small", "head", NULL, + +"em", "head", NULL, +"strong", "head", NULL, +"dfn", "head", NULL, +"code", "head", NULL, +"samp", "head", NULL, +"kbd", "head", NULL, +"var", "head", NULL, +"cite", "head", NULL, +"abbr", "head", NULL, +"acronym", "head", NULL, + +/* "a" */ +"img", "head", NULL, +/* "applet" */ +/* "embed" */ +/* "object" */ +"font", "head", NULL, +/* "basefont" */ +"br", "head", NULL, +/* "script" */ +"map", "head", NULL, +"q", "head", NULL, +"sub", "head", NULL, +"sup", "head", NULL, +"span", "head", NULL, +"bdo", "head", NULL, +"iframe", "head", NULL, NULL }; @@ -2941,9 +2981,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) { */ if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { if (areBlanks(ctxt, buf, nbchar)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, - buf, nbchar); + if (ctxt->keepBlanks) { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, nbchar); + } else { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, + buf, nbchar); + } } else { htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) @@ -2974,8 +3019,14 @@ htmlParseCharData(htmlParserCtxtPtr ctxt) { */ if ((ctxt->sax != NULL) && (!ctxt->disableSAX)) { if (areBlanks(ctxt, buf, nbchar)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace(ctxt->userData, buf, nbchar); + if (ctxt->keepBlanks) { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters(ctxt->userData, buf, nbchar); + } else { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace(ctxt->userData, + buf, nbchar); + } } else { htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) @@ -3509,19 +3560,14 @@ htmlCheckEncodingDirect(htmlParserCtxtPtr ctxt, const xmlChar *encoding) { * convert as much as possible to the parser reading buffer. */ processed = ctxt->input->cur - ctxt->input->base; - xmlBufferShrink(ctxt->input->buf->buffer, processed); - nbchars = xmlCharEncInFunc(ctxt->input->buf->encoder, - ctxt->input->buf->buffer, - ctxt->input->buf->raw); + xmlBufShrink(ctxt->input->buf->buffer, processed); + nbchars = xmlCharEncInput(ctxt->input->buf); if (nbchars < 0) { htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, "htmlCheckEncoding: encoder error\n", NULL, NULL); } - ctxt->input->base = - ctxt->input->cur = ctxt->input->buf->buffer->content; - ctxt->input->end = - &ctxt->input->base[ctxt->input->buf->buffer->use]; + xmlBufResetInput(ctxt->input->buf->buffer, ctxt->input); } } } @@ -4906,9 +4952,7 @@ htmlCreateMemoryParserCtxt(const char *buffer, int size) { input->filename = NULL; input->buf = buf; - input->base = input->buf->buffer->content; - input->cur = input->buf->buffer->content; - input->end = &input->buf->buffer->content[input->buf->buffer->use]; + xmlBufResetInput(buf->buffer, input); inputPush(ctxt, input); return(ctxt); @@ -5025,8 +5069,8 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, buf = in->base; len = in->length; } else { - buf = in->buf->buffer->content; - len = in->buf->buffer->use; + buf = xmlBufContent(in->buf->buffer); + len = xmlBufUse(in->buf->buffer); } /* take into account the sequence length */ @@ -5118,13 +5162,13 @@ htmlParseLookupSequence(htmlParserCtxtPtr ctxt, xmlChar first, * @stop: Array of chars, which stop the lookup. * @stopLen: Length of stop-Array * - * Try to find if any char of the stop-Array is available in the input + * Try to find if any char of the stop-Array is available in the input * stream. * This function has a side effect of (possibly) incrementing ctxt->checkIndex * to avoid rescanning sequences of bytes, it DOES change the state of the * parser, do not use liberally. * - * Returns the index to the current parsing point if a stopChar + * Returns the index to the current parsing point if a stopChar * is available, -1 otherwise. */ static int @@ -5152,8 +5196,8 @@ htmlParseLookupChars(htmlParserCtxtPtr ctxt, const xmlChar * stop, buf = in->base; len = in->length; } else { - buf = in->buf->buffer->content; - len = in->buf->buffer->use; + buf = xmlBufContent(in->buf->buffer); + len = xmlBufUse(in->buf->buffer); } for (; base < len; base++) { @@ -5264,7 +5308,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); if ((avail == 0) && (terminate)) { htmlAutoCloseOnEnd(ctxt); if ((ctxt->nameNr == 0) && (ctxt->instate != XML_PARSER_EOF)) { @@ -5300,7 +5344,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); } if ((ctxt->sax) && (ctxt->sax->setDocumentLocator)) ctxt->sax->setDocumentLocator(ctxt->userData, @@ -5342,7 +5386,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); /* * no chars in buffer */ @@ -5415,7 +5459,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); if (avail < 2) goto done; cur = in->cur[0]; @@ -5456,7 +5500,7 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if (in->buf == NULL) avail = in->length - (in->cur - in->base); else - avail = in->buf->buffer->use - (in->cur - in->base); + avail = xmlBufUse(in->buf->buffer) - (in->cur - in->base); if (avail < 1) goto done; cur = in->cur[0]; @@ -5654,9 +5698,15 @@ htmlParseTryOrFinish(htmlParserCtxtPtr ctxt, int terminate) { if ((cur != '<') && (cur != '&')) { if (ctxt->sax != NULL) { if (IS_BLANK_CH(cur)) { - if (ctxt->sax->ignorableWhitespace != NULL) - ctxt->sax->ignorableWhitespace( - ctxt->userData, &cur, 1); + if (ctxt->keepBlanks) { + if (ctxt->sax->characters != NULL) + ctxt->sax->characters( + ctxt->userData, &cur, 1); + } else { + if (ctxt->sax->ignorableWhitespace != NULL) + ctxt->sax->ignorableWhitespace( + ctxt->userData, &cur, 1); + } } else { htmlCheckParagraph(ctxt); if (ctxt->sax->characters != NULL) @@ -5979,8 +6029,8 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, } if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && (ctxt->input->buf != NULL) && (ctxt->instate != XML_PARSER_EOF)) { - int base = ctxt->input->base - ctxt->input->buf->buffer->content; - int cur = ctxt->input->cur - ctxt->input->base; + size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); + size_t cur = ctxt->input->cur - ctxt->input->base; int res; res = xmlParserInputBufferPush(ctxt->input->buf, size, chunk); @@ -5989,10 +6039,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, ctxt->disableSAX = 1; return (XML_PARSER_EOF); } - ctxt->input->base = ctxt->input->buf->buffer->content + base; - ctxt->input->cur = ctxt->input->base + cur; - ctxt->input->end = - &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; + xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif @@ -6008,7 +6055,7 @@ htmlParseChunk(htmlParserCtxtPtr ctxt, const char *chunk, int size, (in->raw != NULL)) { int nbchars; - nbchars = xmlCharEncInFunc(in->encoder, in->buffer, in->raw); + nbchars = xmlCharEncInput(in); if (nbchars < 0) { htmlParseErr(ctxt, XML_ERR_INVALID_ENCODING, "encoder error\n", NULL, NULL); @@ -6107,24 +6154,18 @@ htmlCreatePushParserCtxt(htmlSAXHandlerPtr sax, void *user_data, inputStream->filename = (char *) xmlCanonicPath((const xmlChar *) filename); inputStream->buf = buf; - inputStream->base = inputStream->buf->buffer->content; - inputStream->cur = inputStream->buf->buffer->content; - inputStream->end = - &inputStream->buf->buffer->content[inputStream->buf->buffer->use]; + xmlBufResetInput(buf->buffer, inputStream); inputPush(ctxt, inputStream); if ((size > 0) && (chunk != NULL) && (ctxt->input != NULL) && (ctxt->input->buf != NULL)) { - int base = ctxt->input->base - ctxt->input->buf->buffer->content; - int cur = ctxt->input->cur - ctxt->input->base; + size_t base = xmlBufGetInputBase(ctxt->input->buf->buffer, ctxt->input); + size_t cur = ctxt->input->cur - ctxt->input->base; xmlParserInputBufferPush(ctxt->input->buf, size, chunk); - ctxt->input->base = ctxt->input->buf->buffer->content + base; - ctxt->input->cur = ctxt->input->base + cur; - ctxt->input->end = - &ctxt->input->buf->buffer->content[ctxt->input->buf->buffer->use]; + xmlBufSetInputBaseCur(ctxt->input->buf->buffer, ctxt->input, base, cur); #ifdef DEBUG_PUSH xmlGenericError(xmlGenericErrorContext, "HPP: pushed %d\n", size); #endif |