diff options
Diffstat (limited to 'encoding.c')
-rw-r--r-- | encoding.c | 492 |
1 files changed, 7 insertions, 485 deletions
@@ -24,7 +24,6 @@ #include "libxml.h" #include <string.h> -#include <limits.h> #ifdef HAVE_CTYPE_H #include <ctype.h> @@ -45,9 +44,6 @@ #include <libxml/globals.h> #include <libxml/xmlerror.h> -#include "buf.h" -#include "enc.h" - static xmlCharEncodingHandlerPtr xmlUTF16LEHandler = NULL; static xmlCharEncodingHandlerPtr xmlUTF16BEHandler = NULL; @@ -1517,8 +1513,6 @@ xmlGetCharEncodingHandler(xmlCharEncoding enc) { if (handler != NULL) return(handler); handler = xmlFindCharEncodingHandler("EBCDIC-US"); if (handler != NULL) return(handler); - handler = xmlFindCharEncodingHandler("IBM-037"); - if (handler != NULL) return(handler); break; case XML_CHAR_ENCODING_UCS4BE: handler = xmlFindCharEncodingHandler("ISO-10646-UCS-4"); @@ -1831,7 +1825,7 @@ xmlIconvWrapper(iconv_t cd, unsigned char *out, int *outlen, /************************************************************************ * * - * ICU based generic conversion functions * + * ICU based generic conversion functions * * * ************************************************************************/ @@ -1903,6 +1897,9 @@ xmlUconvWrapper(uconv_t *cd, int toUnicode, unsigned char *out, int *outlen, * The real API used by libxml for on-the-fly conversion * * * ************************************************************************/ +int +xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, + xmlBufferPtr in, int len); /** * xmlCharEncFirstLineInt: @@ -1949,7 +1946,7 @@ xmlCharEncFirstLineInt(xmlCharEncodingHandler *handler, xmlBufferPtr out, toconv = 180; } if (toconv * 2 >= written) { - xmlBufferGrow(out, toconv * 2); + xmlBufferGrow(out, toconv); written = out->size - out->use - 1; } @@ -2032,252 +2029,6 @@ xmlCharEncFirstLine(xmlCharEncodingHandler *handler, xmlBufferPtr out, } /** - * xmlCharEncFirstLineInput: - * @input: a parser input buffer - * @len: number of bytes to convert for the first line, or -1 - * - * Front-end for the encoding handler input function, but handle only - * the very first line. Point is that this is based on autodetection - * of the encoding and once that first line is converted we may find - * out that a different decoder is needed to process the input. - * - * Returns the number of byte written if success, or - * -1 general error - * -2 if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want), or - */ -int -xmlCharEncFirstLineInput(xmlParserInputBufferPtr input, int len) -{ - int ret = -2; - size_t written; - size_t toconv; - int c_in; - int c_out; - xmlBufPtr in; - xmlBufPtr out; - - if ((input == NULL) || (input->encoder == NULL) || - (input->buffer == NULL) || (input->raw == NULL)) - return (-1); - out = input->buffer; - in = input->raw; - - toconv = xmlBufUse(in); - if (toconv == 0) - return (0); - written = xmlBufAvail(out) - 1; /* count '\0' */ - /* - * echo '<?xml version="1.0" encoding="UCS4"?>' | wc -c => 38 - * 45 chars should be sufficient to reach the end of the encoding - * declaration without going too far inside the document content. - * on UTF-16 this means 90bytes, on UCS4 this means 180 - * The actual value depending on guessed encoding is passed as @len - * if provided - */ - if (len >= 0) { - if (toconv > (unsigned int) len) - toconv = len; - } else { - if (toconv > 180) - toconv = 180; - } - if (toconv * 2 >= written) { - xmlBufGrow(out, toconv * 2); - written = xmlBufAvail(out) - 1; - } - if (written > 360) - written = 360; - - c_in = toconv; - c_out = written; - if (input->encoder->input != NULL) { - ret = input->encoder->input(xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - } -#ifdef LIBXML_ICONV_ENABLED - else if (input->encoder->iconv_in != NULL) { - ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), - &c_out, xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - if (ret == -1) - ret = -3; - } -#endif /* LIBXML_ICONV_ENABLED */ -#ifdef LIBXML_ICU_ENABLED - else if (input->encoder->uconv_in != NULL) { - ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), - &c_out, xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - if (ret == -1) - ret = -3; - } -#endif /* LIBXML_ICU_ENABLED */ - switch (ret) { - case 0: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input\n", - c_in, c_out); -#endif - break; - case -1: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, (int)xmlBufUse(in)); -#endif - break; - case -3: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, (int)xmlBufUse(in)); -#endif - break; - case -2: { - char buf[50]; - const xmlChar *content = xmlBufContent(in); - - snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", - content[0], content[1], - content[2], content[3]); - buf[49] = 0; - xmlEncodingErr(XML_I18N_CONV_FAILED, - "input conversion failed due to input error, bytes %s\n", - buf); - } - } - /* - * Ignore when input buffer is not on a boundary - */ - if (ret == -3) ret = 0; - if (ret == -1) ret = 0; - return(ret); -} - -/** - * xmlCharEncInput: - * @input: a parser input buffer - * @flush: try to flush all the raw buffer - * - * Generic front-end for the encoding handler on parser input - * - * Returns the number of byte written if success, or - * -1 general error - * -2 if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want), or - */ -int -xmlCharEncInput(xmlParserInputBufferPtr input, int flush) -{ - int ret = -2; - size_t written; - size_t toconv; - int c_in; - int c_out; - xmlBufPtr in; - xmlBufPtr out; - - if ((input == NULL) || (input->encoder == NULL) || - (input->buffer == NULL) || (input->raw == NULL)) - return (-1); - out = input->buffer; - in = input->raw; - - toconv = xmlBufUse(in); - if (toconv == 0) - return (0); - if ((toconv > 64 * 1024) && (flush == 0)) - toconv = 64 * 1024; - written = xmlBufAvail(out); - if (written > 0) - written--; /* count '\0' */ - if (toconv * 2 >= written) { - xmlBufGrow(out, toconv * 2); - written = xmlBufAvail(out); - if (written > 0) - written--; /* count '\0' */ - } - if ((written > 128 * 1024) && (flush == 0)) - written = 128 * 1024; - - c_in = toconv; - c_out = written; - if (input->encoder->input != NULL) { - ret = input->encoder->input(xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - } -#ifdef LIBXML_ICONV_ENABLED - else if (input->encoder->iconv_in != NULL) { - ret = xmlIconvWrapper(input->encoder->iconv_in, xmlBufEnd(out), - &c_out, xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - if (ret == -1) - ret = -3; - } -#endif /* LIBXML_ICONV_ENABLED */ -#ifdef LIBXML_ICU_ENABLED - else if (input->encoder->uconv_in != NULL) { - ret = xmlUconvWrapper(input->encoder->uconv_in, 1, xmlBufEnd(out), - &c_out, xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - if (ret == -1) - ret = -3; - } -#endif /* LIBXML_ICU_ENABLED */ - switch (ret) { - case 0: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input\n", - c_in, c_out); -#endif - break; - case -1: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, (int)xmlBufUse(in)); -#endif - break; - case -3: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of input, %d left\n", - c_in, c_out, (int)xmlBufUse(in)); -#endif - break; - case -2: { - char buf[50]; - const xmlChar *content = xmlBufContent(in); - - snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", - content[0], content[1], - content[2], content[3]); - buf[49] = 0; - xmlEncodingErr(XML_I18N_CONV_FAILED, - "input conversion failed due to input error, bytes %s\n", - buf); - } - } - /* - * Ignore when input buffer is not on a boundary - */ - if (ret == -3) - ret = 0; - return (c_out? c_out : ret); -} - -/** * xmlCharEncInFunc: * @handler: char encoding transformation data structure * @out: an xmlBuffer for the output. @@ -2385,235 +2136,6 @@ xmlCharEncInFunc(xmlCharEncodingHandler * handler, xmlBufferPtr out, } /** - * xmlCharEncOutput: - * @output: a parser output buffer - * @init: is this an initialization call without data - * - * Generic front-end for the encoding handler on parser output - * a first call with @init == 1 has to be made first to initiate the - * output in case of non-stateless encoding needing to initiate their - * state or the output (like the BOM in UTF16). - * In case of UTF8 sequence conversion errors for the given encoder, - * the content will be automatically remapped to a CharRef sequence. - * - * Returns the number of byte written if success, or - * -1 general error - * -2 if the transcoding fails (for *in is not valid utf8 string or - * the result of transformation can't fit into the encoding we want), or - */ -int -xmlCharEncOutput(xmlOutputBufferPtr output, int init) -{ - int ret = -2; - size_t written; - size_t writtentot = 0; - size_t toconv; - int c_in; - int c_out; - xmlBufPtr in; - xmlBufPtr out; - int charref_len = 0; - - if ((output == NULL) || (output->encoder == NULL) || - (output->buffer == NULL) || (output->conv == NULL)) - return (-1); - out = output->conv; - in = output->buffer; - -retry: - - written = xmlBufAvail(out); - if (written > 0) - written--; /* count '\0' */ - - /* - * First specific handling of the initialization call - */ - if (init) { - c_in = 0; - c_out = written; - if (output->encoder->output != NULL) { - ret = output->encoder->output(xmlBufEnd(out), &c_out, - NULL, &c_in); - if (ret > 0) /* Gennady: check return value */ - xmlBufAddLen(out, c_out); - } -#ifdef LIBXML_ICONV_ENABLED - else if (output->encoder->iconv_out != NULL) { - ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), - &c_out, NULL, &c_in); - xmlBufAddLen(out, c_out); - } -#endif /* LIBXML_ICONV_ENABLED */ -#ifdef LIBXML_ICU_ENABLED - else if (output->encoder->uconv_out != NULL) { - ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), - &c_out, NULL, &c_in); - xmlBufAddLen(out, c_out); - } -#endif /* LIBXML_ICU_ENABLED */ -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "initialized encoder\n"); -#endif - return(0); - } - - /* - * Conversion itself. - */ - toconv = xmlBufUse(in); - if (toconv == 0) - return (0); - if (toconv > 64 * 1024) - toconv = 64 * 1024; - if (toconv * 4 >= written) { - xmlBufGrow(out, toconv * 4); - written = xmlBufAvail(out) - 1; - } - if (written > 256 * 1024) - written = 256 * 1024; - - c_in = toconv; - c_out = written; - if (output->encoder->output != NULL) { - ret = output->encoder->output(xmlBufEnd(out), &c_out, - xmlBufContent(in), &c_in); - if (c_out > 0) { - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - writtentot += c_out; - } - } -#ifdef LIBXML_ICONV_ENABLED - else if (output->encoder->iconv_out != NULL) { - ret = xmlIconvWrapper(output->encoder->iconv_out, xmlBufEnd(out), - &c_out, xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - writtentot += c_out; - if (ret == -1) { - if (c_out > 0) { - /* - * Can be a limitation of iconv - */ - charref_len = 0; - goto retry; - } - ret = -3; - } - } -#endif /* LIBXML_ICONV_ENABLED */ -#ifdef LIBXML_ICU_ENABLED - else if (output->encoder->uconv_out != NULL) { - ret = xmlUconvWrapper(output->encoder->uconv_out, 0, xmlBufEnd(out), - &c_out, xmlBufContent(in), &c_in); - xmlBufShrink(in, c_in); - xmlBufAddLen(out, c_out); - writtentot += c_out; - if (ret == -1) { - if (c_out > 0) { - /* - * Can be a limitation of uconv - */ - charref_len = 0; - goto retry; - } - ret = -3; - } - } -#endif /* LIBXML_ICU_ENABLED */ - else { - xmlEncodingErr(XML_I18N_NO_OUTPUT, - "xmlCharEncOutFunc: no output function !\n", NULL); - return(-1); - } - - if (ret >= 0) output += ret; - - /* - * Attempt to handle error cases - */ - switch (ret) { - case 0: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "converted %d bytes to %d bytes of output\n", - c_in, c_out); -#endif - break; - case -1: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "output conversion failed by lack of space\n"); -#endif - break; - case -3: -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext,"converted %d bytes to %d bytes of output %d left\n", - c_in, c_out, (int) xmlBufUse(in)); -#endif - break; - case -2: { - int len = (int) xmlBufUse(in); - xmlChar *content = xmlBufContent(in); - int cur; - - cur = xmlGetUTF8Char(content, &len); - if ((charref_len != 0) && (c_out < charref_len)) { - /* - * We attempted to insert a character reference and failed. - * Undo what was written and skip the remaining charref. - */ - xmlBufErase(out, c_out); - writtentot -= c_out; - xmlBufShrink(in, charref_len - c_out); - charref_len = 0; - - ret = -1; - break; - } else if (cur > 0) { - xmlChar charref[20]; - -#ifdef DEBUG_ENCODING - xmlGenericError(xmlGenericErrorContext, - "handling output conversion error\n"); - xmlGenericError(xmlGenericErrorContext, - "Bytes: 0x%02X 0x%02X 0x%02X 0x%02X\n", - content[0], content[1], - content[2], content[3]); -#endif - /* - * Removes the UTF8 sequence, and replace it by a charref - * and continue the transcoding phase, hoping the error - * did not mangle the encoder state. - */ - charref_len = snprintf((char *) &charref[0], sizeof(charref), - "&#%d;", cur); - xmlBufShrink(in, len); - xmlBufAddHead(in, charref, -1); - - goto retry; - } else { - char buf[50]; - - snprintf(&buf[0], 49, "0x%02X 0x%02X 0x%02X 0x%02X", - content[0], content[1], - content[2], content[3]); - buf[49] = 0; - xmlEncodingErr(XML_I18N_CONV_FAILED, - "output conversion failed due to conv error, bytes %s\n", - buf); - if (xmlBufGetAllocationScheme(in) != XML_BUFFER_ALLOC_IMMUTABLE) - content[0] = ' '; - } - break; - } - } - return(ret); -} - -/** * xmlCharEncOutFunc: * @handler: char enconding transformation data structure * @out: an xmlBuffer for the output. @@ -2676,7 +2198,7 @@ retry: else if (handler->uconv_out != NULL) { ret = xmlUconvWrapper(handler->uconv_out, 0, &out->content[out->use], - &written, NULL, &toconv); + &written, NULL, &toconv); out->use += written; out->content[out->use] = 0; } @@ -3097,7 +2619,7 @@ UTF8ToISO8859x(unsigned char* out, int *outlen, c2 = c2 & 0x3F; d = d & 0x0F; d = xlattable [48 + c2 + xlattable [48 + c1 + - xlattable [32 + d] * 64] * 64]; + xlattable [32 + d] * 64] * 64]; if (d == 0) { /* not in character set */ *outlen = out - outstart; |