diff options
Diffstat (limited to 'parserInternals.c')
-rw-r--r-- | parserInternals.c | 130 |
1 files changed, 108 insertions, 22 deletions
diff --git a/parserInternals.c b/parserInternals.c index 758c6b3..ff20435 100644 --- a/parserInternals.c +++ b/parserInternals.c @@ -494,20 +494,26 @@ xmlNextChar(xmlParserCtxtPtr ctxt) if (c & 0x80) { if (c == 0xC0) goto encoding_error; - if (cur[1] == 0) + if (cur[1] == 0) { xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { unsigned int val; - if (cur[2] == 0) + if (cur[2] == 0) { xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) + if (cur[3] == 0) { xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; @@ -640,18 +646,24 @@ xmlCurrentChar(xmlParserCtxtPtr ctxt, int *len) { if (c & 0x80) { if (((c & 0x40) == 0) || (c == 0xC0)) goto encoding_error; - if (cur[1] == 0) + if (cur[1] == 0) { xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } if ((cur[1] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xe0) == 0xe0) { - if (cur[2] == 0) + if (cur[2] == 0) { xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } if ((cur[2] & 0xc0) != 0x80) goto encoding_error; if ((c & 0xf0) == 0xf0) { - if (cur[3] == 0) + if (cur[3] == 0) { xmlParserInputGrow(ctxt->input, INPUT_CHUNK); + cur = ctxt->input->cur; + } if (((c & 0xf8) != 0xf0) || ((cur[3] & 0xc0) != 0x80)) goto encoding_error; @@ -933,6 +945,17 @@ xmlCopyChar(int len ATTRIBUTE_UNUSED, xmlChar *out, int val) { * * ************************************************************************/ +/* defined in encoding.c, not public */ +int +xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in, int len); + +static int +xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt, + xmlCharEncodingHandlerPtr handler, int len); +static int +xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, + xmlCharEncodingHandlerPtr handler, int len); /** * xmlSwitchEncoding: * @ctxt: the parser context @@ -947,6 +970,7 @@ int xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) { xmlCharEncodingHandlerPtr handler; + int len = -1; if (ctxt == NULL) return(-1); switch (enc) { @@ -990,9 +1014,33 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) (ctxt->input->cur[2] == 0xBF)) { ctxt->input->cur += 3; } - break ; - default: - break; + len = 90; + break; + case XML_CHAR_ENCODING_UCS2: + len = 90; + break; + case XML_CHAR_ENCODING_UCS4BE: + case XML_CHAR_ENCODING_UCS4LE: + case XML_CHAR_ENCODING_UCS4_2143: + case XML_CHAR_ENCODING_UCS4_3412: + len = 180; + break; + case XML_CHAR_ENCODING_EBCDIC: + case XML_CHAR_ENCODING_8859_1: + case XML_CHAR_ENCODING_8859_2: + case XML_CHAR_ENCODING_8859_3: + case XML_CHAR_ENCODING_8859_4: + case XML_CHAR_ENCODING_8859_5: + case XML_CHAR_ENCODING_8859_6: + case XML_CHAR_ENCODING_8859_7: + case XML_CHAR_ENCODING_8859_8: + case XML_CHAR_ENCODING_8859_9: + case XML_CHAR_ENCODING_ASCII: + case XML_CHAR_ENCODING_2022_JP: + case XML_CHAR_ENCODING_SHIFT_JIS: + case XML_CHAR_ENCODING_EUC_JP: + len = 45; + break; } handler = xmlGetCharEncodingHandler(enc); if (handler == NULL) { @@ -1083,7 +1131,7 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) if (handler == NULL) return(-1); ctxt->charset = XML_CHAR_ENCODING_UTF8; - return(xmlSwitchToEncoding(ctxt, handler)); + return(xmlSwitchToEncodingInt(ctxt, handler, len)); } /** @@ -1091,15 +1139,16 @@ xmlSwitchEncoding(xmlParserCtxtPtr ctxt, xmlCharEncoding enc) * @ctxt: the parser context * @input: the input stream * @handler: the encoding handler + * @len: the number of bytes to convert for the first line or -1 * * change the input functions when discovering the character encoding * of a given entity. * * Returns 0 in case of success, -1 otherwise */ -int -xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, - xmlCharEncodingHandlerPtr handler) +static int +xmlSwitchInputEncodingInt(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, + xmlCharEncodingHandlerPtr handler, int len) { int nbchars; @@ -1196,9 +1245,10 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, * parsed with the autodetected encoding * into the parser reading buffer. */ - nbchars = xmlCharEncFirstLine(input->buf->encoder, - input->buf->buffer, - input->buf->raw); + nbchars = xmlCharEncFirstLineInt(input->buf->encoder, + input->buf->buffer, + input->buf->raw, + len); } if (nbchars < 0) { xmlErrInternal(ctxt, @@ -1224,8 +1274,9 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, } /** - * xmlSwitchToEncoding: + * xmlSwitchInputEncoding: * @ctxt: the parser context + * @input: the input stream * @handler: the encoding handler * * change the input functions when discovering the character encoding @@ -1234,13 +1285,32 @@ xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, * Returns 0 in case of success, -1 otherwise */ int -xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) -{ +xmlSwitchInputEncoding(xmlParserCtxtPtr ctxt, xmlParserInputPtr input, + xmlCharEncodingHandlerPtr handler) { + return(xmlSwitchInputEncodingInt(ctxt, input, handler, -1)); +} + +/** + * xmlSwitchToEncodingInt: + * @ctxt: the parser context + * @handler: the encoding handler + * @len: the lenght to convert or -1 + * + * change the input functions when discovering the character encoding + * of a given entity, and convert only @len bytes of the output, this + * is needed on auto detect to allows any declared encoding later to + * convert the actual content after the xmlDecl + * + * Returns 0 in case of success, -1 otherwise + */ +static int +xmlSwitchToEncodingInt(xmlParserCtxtPtr ctxt, + xmlCharEncodingHandlerPtr handler, int len) { int ret = 0; if (handler != NULL) { if (ctxt->input != NULL) { - ret = xmlSwitchInputEncoding(ctxt, ctxt->input, handler); + ret = xmlSwitchInputEncodingInt(ctxt, ctxt->input, handler, len); } else { xmlErrInternal(ctxt, "xmlSwitchToEncoding : no input\n", NULL); @@ -1250,11 +1320,27 @@ xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) * The parsing is now done in UTF8 natively */ ctxt->charset = XML_CHAR_ENCODING_UTF8; - } else + } else return(-1); return(ret); } +/** + * xmlSwitchToEncoding: + * @ctxt: the parser context + * @handler: the encoding handler + * + * change the input functions when discovering the character encoding + * of a given entity. + * + * Returns 0 in case of success, -1 otherwise + */ +int +xmlSwitchToEncoding(xmlParserCtxtPtr ctxt, xmlCharEncodingHandlerPtr handler) +{ + return (xmlSwitchToEncodingInt(ctxt, handler, -1)); +} + /************************************************************************ * * * Commodity functions to handle entities processing * @@ -2067,7 +2153,7 @@ xmlKeepBlanksDefault(int val) { int old = xmlKeepBlanksDefaultValue; xmlKeepBlanksDefaultValue = val; - xmlIndentTreeOutput = !val; + if (!val) xmlIndentTreeOutput = 1; return(old); } |