diff options
Diffstat (limited to 'usr/src/common/smbsrv/smb_utf8.c')
| -rw-r--r-- | usr/src/common/smbsrv/smb_utf8.c | 418 |
1 files changed, 418 insertions, 0 deletions
diff --git a/usr/src/common/smbsrv/smb_utf8.c b/usr/src/common/smbsrv/smb_utf8.c new file mode 100644 index 0000000000..704f01877e --- /dev/null +++ b/usr/src/common/smbsrv/smb_utf8.c @@ -0,0 +1,418 @@ +/* + * CDDL HEADER START + * + * The contents of this file are subject to the terms of the + * Common Development and Distribution License (the "License"). + * You may not use this file except in compliance with the License. + * + * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE + * or http://www.opensolaris.org/os/licensing. + * See the License for the specific language governing permissions + * and limitations under the License. + * + * When distributing Covered Code, include this CDDL HEADER in each + * file and include the License file at usr/src/OPENSOLARIS.LICENSE. + * If applicable, add the following below this CDDL HEADER, with the + * fields enclosed by brackets "[]" replaced with your own identifying + * information: Portions Copyright [yyyy] [name of copyright owner] + * + * CDDL HEADER END + */ +/* + * Copyright 2007 Sun Microsystems, Inc. All rights reserved. + * Use is subject to license terms. + */ + +/* + * Multibyte/wide-char conversion routines. Wide-char encoding provides + * a fixed size character encoding that maps to the Unicode 16-bit + * (UCS-2) character set standard. Multibyte or UCS transformation + * format (UTF) encoding is a variable length character encoding scheme + * that is compatible with existing ASCII characters and guarantees that + * the resultant strings do not contain embedded null characters. Both + * types of encoding provide a null terminator: single byte for UTF-8 + * and a wide-char null for Unicode. See RFC 2044. + * + * The table below illustrates the UTF-8 encoding scheme. The letter x + * indicates bits available for encoding the character value. 
+ * + * UCS-2 UTF-8 octet sequence (binary) + * 0x0000-0x007F 0xxxxxxx + * 0x0080-0x07FF 110xxxxx 10xxxxxx + * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx + * + * RFC 2044 + * UTF-8,a transformation format of UNICODE and ISO 10646 + * F. Yergeau + * Alis Technologies + * October 1996 + */ + +#pragma ident "%Z%%M% %I% %E% SMI" + +#ifdef _KERNEL +#include <sys/types.h> +#include <sys/sunddi.h> +#else +#include <stdio.h> +#include <stdlib.h> +#include <assert.h> +#include <strings.h> +#endif +#include <smbsrv/smb_i18n.h> +#include <smbsrv/string.h> + +int mbtowc_verbose = 0; +int mbtowc_announce = 0; + +/* + * mbstowcs + * + * The mbstowcs() function converts a multibyte character string + * mbstring into a wide character string wcstring. No more than + * nwchars wide characters are stored. A terminating null wide + * character is appended if there is room. + * + * Returns the number of wide characters converted, not counting + * any terminating null wide character. Returns -1 if an invalid + * multibyte character is encountered. + */ +size_t +mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars) +{ + int len; + mts_wchar_t *start = wcstring; + + while (nwchars--) { + len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX); + if (len < 0) { + *wcstring = 0; + return ((size_t)-1); + } + + if (*mbstring == 0) + break; + + ++wcstring; + mbstring += len; + } + + return (wcstring - start); +} + + +/* + * mbtowc + * + * The mbtowc() function converts a multibyte character mbchar into + * a wide character and stores the result in the object pointed to + * by wcharp. Up to nbytes bytes are examined. + * + * If mbchar is NULL, mbtowc() returns zero to indicate that shift + * states are not supported. If mbchar is valid, returns the number + * of bytes processed in mbchar. If mbchar is invalid, returns -1. 
+ */ +int /*ARGSUSED*/ +mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes) +{ + unsigned char mbyte; + mts_wchar_t wide_char; + int count; + int bytes_left; + + if (mbchar == 0) + return (0); /* shift states not supported */ + + /* 0xxxxxxx -> 1 byte ASCII encoding */ + if (((mbyte = *mbchar++) & 0x80) == 0) { + if (wcharp) + *wcharp = (mts_wchar_t)mbyte; + + return (mbyte ? 1 : 0); + } + + /* 10xxxxxx -> invalid first byte */ + if ((mbyte & 0x40) == 0) { + if (mbtowc_verbose || mbtowc_announce == 0) { + mbtowc_announce = 1; + } + return (-1); + } + + wide_char = mbyte; + if ((mbyte & 0x20) == 0) { + wide_char &= 0x1f; + bytes_left = 1; + } else if ((mbyte & 0x10) == 0) { + wide_char &= 0x0f; + bytes_left = 2; + } else { + if (mbtowc_verbose || mbtowc_announce == 0) { + mbtowc_announce = 1; + } + return (-1); + } + + count = 1; + while (bytes_left--) { + if (((mbyte = *mbchar++) & 0xc0) != 0x80) { + if (mbtowc_verbose || mbtowc_announce == 0) { + mbtowc_announce = 1; + } + return (-1); + } + + count++; + wide_char = (wide_char << 6) | (mbyte & 0x3f); + } + + if (wcharp) + *wcharp = wide_char; + + return (count); +} + + +/* + * wctomb + * + * The wctomb() function converts a wide character wchar into a multibyte + * character and stores the result in mbchar. The object pointed to by + * mbchar must be large enough to accommodate the multibyte character. + * + * Returns the numberof bytes written to mbchar. 
+ */ +int +mts_wctomb(char *mbchar, mts_wchar_t wchar) +{ +#ifdef UTF8_DEBUG + char *start = mbchar; +#endif + + if ((wchar & ~0x7f) == 0) { + *mbchar = (char)wchar; + return (1); + } + + if ((wchar & ~0x7ff) == 0) { + *mbchar++ = (wchar >> 6) | 0xc0; + *mbchar = (wchar & 0x3f) | 0x80; + return (2); + } + + *mbchar++ = (wchar >> 12) | 0xe0; + *mbchar++ = ((wchar >> 6) & 0x3f) | 0x80; + *mbchar = (wchar & 0x3f) | 0x80; + return (3); +} + + +/* + * wcstombs + * + * The wcstombs() function converts a wide character string wcstring + * into a multibyte character string mbstring. Up to nbytes bytes are + * stored in mbstring. Partial multibyte characters at the end of the + * string are not stored. The multibyte character string is null + * terminated if there is room. + * + * Returns the number of bytes converted, not counting the terminating + * null byte. + */ +size_t +mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes) +{ + char *start = mbstring; + const mts_wchar_t *wcp = wcstring; + mts_wchar_t wide_char; + char buf[4]; + size_t len; + + if ((mbstring == 0) || (wcstring == 0)) + return (0); + + while (nbytes > MTS_MB_CHAR_MAX) { + wide_char = *wcp++; + len = mts_wctomb(mbstring, wide_char); + + if (wide_char == 0) + /*LINTED E_PTRDIFF_OVERFLOW*/ + return (mbstring - start); + + mbstring += len; + nbytes -= len; + } + + while (wide_char && nbytes) { + wide_char = *wcp++; + if ((len = mts_wctomb(buf, wide_char)) > nbytes) { + *mbstring = 0; + break; + } + + bcopy(buf, mbstring, len); + mbstring += len; + nbytes -= len; + } + + /*LINTED E_PTRDIFF_OVERFLOW*/ + return (mbstring - start); +} + + +/* + * Returns the number of bytes that would be written if the multi- + * byte string mbs was converted to a wide character string, not + * counting the terminating null wide character. 
+ */ +size_t +mts_wcequiv_strlen(const char *mbs) +{ + mts_wchar_t wide_char; + size_t bytes; + size_t len = 0; + + while (*mbs) { + bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); + if (bytes == ((size_t)-1)) + return ((size_t)-1); + + len += sizeof (mts_wchar_t); + mbs += bytes; + } + + return (len); +} + + +/* + * Returns the number of bytes that would be written if the multi- + * byte string mbs was converted to a single byte character string, + * not counting the terminating null character. + */ +size_t +mts_sbequiv_strlen(const char *mbs) +{ + mts_wchar_t wide_char; + size_t nbytes; + size_t len = 0; + + while (*mbs) { + nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX); + if (nbytes == ((size_t)-1)) + return ((size_t)-1); + + if (wide_char & 0xFF00) + len += sizeof (mts_wchar_t); + else + ++len; + + mbs += nbytes; + } + + return (len); +} + + +/* + * stombs + * + * Convert a regular null terminated string 'string' to a UTF-8 encoded + * null terminated multi-byte string 'mbstring'. Only full converted + * UTF-8 characters will be written 'mbstring'. If a character will not + * fit within the remaining buffer space or 'mbstring' will overflow + * max_mblen, the conversion process will be terminated and 'mbstring' + * will be null terminated. + * + * Returns the number of bytes written to 'mbstring', excluding the + * terminating null character. + * + * If either mbstring or string is a null pointer, -1 is returned. 
+ */ +int +mts_stombs(char *mbstring, char *string, int max_mblen) +{ + char *start = mbstring; + unsigned char *p = (unsigned char *)string; + int space_left = max_mblen; + int len; + mts_wchar_t wide_char; + char buf[4]; + + if (!mbstring || !string) + return (-1); + + while (*p && space_left > 2) { + wide_char = *p++; + len = mts_wctomb(mbstring, wide_char); + mbstring += len; + space_left -= len; + } + + if (*p) { + wide_char = *p; + if ((len = mts_wctomb(buf, wide_char)) < 2) { + *mbstring = *buf; + mbstring += len; + space_left -= len; + } + } + + *mbstring = '\0'; + + /*LINTED E_PTRDIFF_OVERFLOW*/ + return (mbstring - start); +} + + +/* + * mbstos + * + * Convert a null terminated multi-byte string 'mbstring' to a regular + * null terminated string 'string'. A 1-byte character in 'mbstring' + * maps to a 1-byte character in 'string'. A 2-byte character in + * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null. + * Otherwise the upper byte null will be discarded to ensure that the + * output stream does not contain embedded null characters. + * + * If the input stream contains invalid multi-byte characters, a value + * of -1 will be returned. Otherwise the length of 'string', excluding + * the terminating null character, is returned. + * + * If either mbstring or string is a null pointer, -1 is returned. + */ +int +mts_mbstos(char *string, const char *mbstring) +{ + mts_wchar_t wc; + unsigned char *start = (unsigned char *)string; + int len; + + if (string == 0 || mbstring == 0) + return (-1); + + while (*mbstring) { + if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) { + *string = 0; + return (-1); + } + + if (wc & 0xFF00) { + /*LINTED E_BAD_PTR_CAST_ALIGN*/ + *((mts_wchar_t *)string) = wc; + string += sizeof (mts_wchar_t); + } + else + { + *string = (unsigned char)wc; + string++; + } + + mbstring += len; + } + + *string = 0; + + /*LINTED E_PTRDIFF_OVERFLOW*/ + return ((unsigned char *)string - start); +} |
