summaryrefslogtreecommitdiff
path: root/usr/src/common/smbsrv/smb_utf8.c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/common/smbsrv/smb_utf8.c')
-rw-r--r--usr/src/common/smbsrv/smb_utf8.c418
1 files changed, 418 insertions, 0 deletions
diff --git a/usr/src/common/smbsrv/smb_utf8.c b/usr/src/common/smbsrv/smb_utf8.c
new file mode 100644
index 0000000000..704f01877e
--- /dev/null
+++ b/usr/src/common/smbsrv/smb_utf8.c
@@ -0,0 +1,418 @@
+/*
+ * CDDL HEADER START
+ *
+ * The contents of this file are subject to the terms of the
+ * Common Development and Distribution License (the "License").
+ * You may not use this file except in compliance with the License.
+ *
+ * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
+ * or http://www.opensolaris.org/os/licensing.
+ * See the License for the specific language governing permissions
+ * and limitations under the License.
+ *
+ * When distributing Covered Code, include this CDDL HEADER in each
+ * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
+ * If applicable, add the following below this CDDL HEADER, with the
+ * fields enclosed by brackets "[]" replaced with your own identifying
+ * information: Portions Copyright [yyyy] [name of copyright owner]
+ *
+ * CDDL HEADER END
+ */
+/*
+ * Copyright 2007 Sun Microsystems, Inc. All rights reserved.
+ * Use is subject to license terms.
+ */
+
+/*
+ * Multibyte/wide-char conversion routines. Wide-char encoding provides
+ * a fixed size character encoding that maps to the Unicode 16-bit
+ * (UCS-2) character set standard. Multibyte or UCS transformation
+ * format (UTF) encoding is a variable length character encoding scheme
+ * that is compatible with existing ASCII characters and guarantees that
+ * the resultant strings do not contain embedded null characters. Both
+ * types of encoding provide a null terminator: single byte for UTF-8
+ * and a wide-char null for Unicode. See RFC 2044.
+ *
+ * The table below illustrates the UTF-8 encoding scheme. The letter x
+ * indicates bits available for encoding the character value.
+ *
+ * UCS-2 UTF-8 octet sequence (binary)
+ * 0x0000-0x007F 0xxxxxxx
+ * 0x0080-0x07FF 110xxxxx 10xxxxxx
+ * 0x0800-0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
+ *
+ * RFC 2044
+ * UTF-8, a transformation format of UNICODE and ISO 10646
+ * F. Yergeau
+ * Alis Technologies
+ * October 1996
+ */
+
+#pragma ident "%Z%%M% %I% %E% SMI"
+
+#ifdef _KERNEL
+#include <sys/types.h>
+#include <sys/sunddi.h>
+#else
+#include <stdio.h>
+#include <stdlib.h>
+#include <assert.h>
+#include <strings.h>
+#endif
+#include <smbsrv/smb_i18n.h>
+#include <smbsrv/string.h>
+
+int mbtowc_verbose = 0;	/* non-zero: every rejected sequence sets announce */
+int mbtowc_announce = 0;	/* set to 1 once mts_mbtowc() has rejected input */
+
+/*
+ * mts_mbstowcs
+ *
+ * Convert the null terminated multibyte (UTF-8) string mbstring into
+ * wide characters stored in wcstring. At most nwchars wide characters
+ * are written; a terminating wide null is stored only when the end of
+ * mbstring is reached before that limit has been used up.
+ *
+ * Returns the number of wide characters stored, not counting any
+ * terminating wide null. If mts_mbtowc() rejects a sequence, a wide
+ * null is written at the current output position and (size_t)-1 is
+ * returned.
+ */
+size_t
+mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars)
+{
+	mts_wchar_t *out = wcstring;
+	size_t room;
+	int consumed;
+
+	for (room = nwchars; room != 0; room--) {
+		consumed = mts_mbtowc(out, mbstring, MTS_MB_CHAR_MAX);
+		if (consumed < 0) {
+			*out = 0;
+			return ((size_t)-1);
+		}
+
+		/* End of input: mts_mbtowc() already stored the wide null. */
+		if (*mbstring == 0)
+			break;
+
+		out++;
+		mbstring += consumed;
+	}
+
+	return (out - wcstring);
+}
+
+
+/*
+ * mts_mbtowc
+ *
+ * Decode the single UTF-8 character at mbchar into a wide character.
+ * When wcharp is non-null the decoded value is stored there.
+ *
+ * Returns 0 if mbchar is a null pointer (shift states are not
+ * supported) or if it points at a null byte, -1 if the lead byte or a
+ * continuation byte is malformed or the lead byte announces a sequence
+ * longer than three bytes, and otherwise the number of bytes consumed
+ * (1 to 3).
+ *
+ * NOTE: nbytes is accepted for interface compatibility but is not
+ * consulted. Decoding still stops at a null byte, because a null byte
+ * is never a valid continuation byte. Overlong encodings are not
+ * rejected.
+ *
+ * Each rejected sequence sets mbtowc_announce to 1 when it is still
+ * zero, or unconditionally when mbtowc_verbose is non-zero.
+ */
+int /*ARGSUSED*/
+mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes)
+{
+	const unsigned char *src = (const unsigned char *)mbchar;
+	mts_wchar_t value;
+	unsigned char lead;
+	int trailing;
+	int i;
+
+	if (mbchar == 0)
+		return (0); /* shift states not supported */
+
+	lead = src[0];
+
+	/* 0xxxxxxx -> 1 byte ASCII encoding */
+	if (lead < 0x80) {
+		if (wcharp != 0)
+			*wcharp = (mts_wchar_t)lead;
+
+		return ((lead != 0) ? 1 : 0);
+	}
+
+	if ((lead & 0xe0) == 0xc0) {
+		/* 110xxxxx -> one continuation byte follows */
+		value = lead & 0x1f;
+		trailing = 1;
+	} else if ((lead & 0xf0) == 0xe0) {
+		/* 1110xxxx -> two continuation bytes follow */
+		value = lead & 0x0f;
+		trailing = 2;
+	} else {
+		/* 10xxxxxx or 1111xxxx -> not a usable first byte */
+		goto invalid;
+	}
+
+	for (i = 1; i <= trailing; i++) {
+		/* every continuation byte must look like 10xxxxxx */
+		if ((src[i] & 0xc0) != 0x80)
+			goto invalid;
+
+		value = (value << 6) | (src[i] & 0x3f);
+	}
+
+	if (wcharp != 0)
+		*wcharp = value;
+
+	return (trailing + 1);
+
+invalid:
+	if (mbtowc_verbose || mbtowc_announce == 0)
+		mbtowc_announce = 1;
+
+	return (-1);
+}
+
+
+/*
+ * mts_wctomb
+ *
+ * Encode the wide character wchar as UTF-8 and store the result in
+ * mbchar. The object pointed to by mbchar must be large enough to hold
+ * the encoded character (up to three bytes). No null terminator is
+ * appended.
+ *
+ * If mbchar is a null pointer, returns zero to indicate that shift
+ * states are not supported, matching mts_mbtowc(). Otherwise returns
+ * the number of bytes written to mbchar: 1 for values up to 0x7f
+ * (including the wide null), 2 for values up to 0x7ff, 3 for anything
+ * else.
+ */
+int
+mts_wctomb(char *mbchar, mts_wchar_t wchar)
+{
+	if (mbchar == 0)
+		return (0); /* shift states not supported */
+
+	/* 0x0000-0x007F -> 0xxxxxxx */
+	if ((wchar & ~0x7f) == 0) {
+		*mbchar = (char)wchar;
+		return (1);
+	}
+
+	/* 0x0080-0x07FF -> 110xxxxx 10xxxxxx */
+	if ((wchar & ~0x7ff) == 0) {
+		*mbchar++ = (char)((wchar >> 6) | 0xc0);
+		*mbchar = (char)((wchar & 0x3f) | 0x80);
+		return (2);
+	}
+
+	/*
+	 * 0x0800-0xFFFF -> 1110xxxx 10xxxxxx 10xxxxxx
+	 * The lead nibble is masked so that the first byte stays a
+	 * well-formed 1110xxxx even if mts_wchar_t should be wider than
+	 * 16 bits; for a 16-bit type the mask changes nothing.
+	 */
+	*mbchar++ = (char)(((wchar >> 12) & 0x0f) | 0xe0);
+	*mbchar++ = (char)(((wchar >> 6) & 0x3f) | 0x80);
+	*mbchar = (char)((wchar & 0x3f) | 0x80);
+	return (3);
+}
+
+
+/*
+ * mts_wcstombs
+ *
+ * Convert the null terminated wide character string wcstring into a
+ * multibyte (UTF-8) string stored in mbstring. Up to nbytes bytes are
+ * stored. A character whose encoding does not fit in the remaining
+ * space is not stored, not even partially, and ends the conversion.
+ * The result is null terminated if there is room, i.e. unless the
+ * converted characters fill mbstring exactly.
+ *
+ * Returns the number of bytes converted, not counting the terminating
+ * null byte. Returns 0 without touching mbstring if either pointer is
+ * null or nbytes is 0.
+ */
+size_t
+mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes)
+{
+	char *start = mbstring;
+	mts_wchar_t wide_char;
+	char buf[4];
+	size_t len;
+
+	if ((mbstring == 0) || (wcstring == 0))
+		return (0);
+
+	/* The loop condition guarantees room for at least one more byte. */
+	while (nbytes != 0) {
+		wide_char = *wcstring++;
+
+		if (wide_char == 0) {
+			/* Terminate, but do not count the terminator. */
+			*mbstring = '\0';
+			break;
+		}
+
+		/*
+		 * Encode into scratch space first so that a character
+		 * that does not fit is never partially stored.
+		 */
+		len = (size_t)mts_wctomb(buf, wide_char);
+		if (len > nbytes) {
+			*mbstring = '\0';
+			break;
+		}
+
+		bcopy(buf, mbstring, len);
+		mbstring += len;
+		nbytes -= len;
+	}
+
+	/*LINTED E_PTRDIFF_OVERFLOW*/
+	return (mbstring - start);
+}
+
+
+/*
+ * mts_wcequiv_strlen
+ *
+ * Compute the size in bytes of the wide character string that the
+ * multibyte string mbs would convert to: sizeof (mts_wchar_t) for every
+ * character in mbs, not counting the terminating wide null.
+ *
+ * Returns (size_t)-1 if mts_mbtowc() rejects any character of mbs.
+ */
+size_t
+mts_wcequiv_strlen(const char *mbs)
+{
+	const char *cursor;
+	mts_wchar_t decoded;
+	size_t consumed;
+	size_t nchars = 0;
+
+	for (cursor = mbs; *cursor != 0; cursor += consumed) {
+		consumed = mts_mbtowc(&decoded, cursor, MTS_MB_CHAR_MAX);
+		if (consumed == ((size_t)-1))
+			return ((size_t)-1);
+
+		nchars++;
+	}
+
+	return (nchars * sizeof (mts_wchar_t));
+}
+
+
+/*
+ * mts_sbequiv_strlen
+ *
+ * Compute the size in bytes of the single byte character string that
+ * the multibyte string mbs would convert to, not counting the
+ * terminating null character. A character whose wide value has a
+ * non-zero upper byte counts as sizeof (mts_wchar_t) bytes; every other
+ * character counts as one byte.
+ *
+ * Returns (size_t)-1 if mts_mbtowc() rejects any character of mbs.
+ */
+size_t
+mts_sbequiv_strlen(const char *mbs)
+{
+	const char *cursor;
+	mts_wchar_t decoded;
+	size_t consumed;
+	size_t total = 0;
+
+	for (cursor = mbs; *cursor != 0; cursor += consumed) {
+		consumed = mts_mbtowc(&decoded, cursor, MTS_MB_CHAR_MAX);
+		if (consumed == ((size_t)-1))
+			return ((size_t)-1);
+
+		total += ((decoded & 0xFF00) != 0) ? sizeof (mts_wchar_t) : 1;
+	}
+
+	return (total);
+}
+
+
+/*
+ * mts_stombs
+ *
+ * Convert a regular null terminated string 'string' to a UTF-8 encoded
+ * null terminated multi-byte string 'mbstring'. Each input byte is
+ * treated as one character with a value of 0x00-0xFF, so it encodes to
+ * either one or two bytes.
+ *
+ * max_mblen is treated as the total size of 'mbstring' in bytes,
+ * including the terminating null. Only fully converted UTF-8
+ * characters are written: conversion stops at the first character that
+ * would not leave room for the terminator, and 'mbstring' is then null
+ * terminated. Nothing at or beyond mbstring[max_mblen] is written.
+ *
+ * Returns the number of bytes written to 'mbstring', excluding the
+ * terminating null character. If max_mblen is not positive, nothing is
+ * written (not even a terminator) and 0 is returned.
+ *
+ * If either mbstring or string is a null pointer, -1 is returned.
+ */
+int
+mts_stombs(char *mbstring, char *string, int max_mblen)
+{
+	char *start = mbstring;
+	unsigned char *p = (unsigned char *)string;
+	int space_left = max_mblen;
+	int len;
+	char buf[4];
+
+	if (!mbstring || !string)
+		return (-1);
+
+	/* Without room for even the terminator nothing may be written. */
+	if (space_left <= 0)
+		return (0);
+
+	while (*p != 0) {
+		len = mts_wctomb(buf, (mts_wchar_t)*p);
+
+		/* Always keep one byte in reserve for the terminator. */
+		if (len >= space_left)
+			break;
+
+		bcopy(buf, mbstring, (size_t)len);
+		mbstring += len;
+		space_left -= len;
+		p++;
+	}
+
+	*mbstring = '\0';
+
+	/*LINTED E_PTRDIFF_OVERFLOW*/
+	return (mbstring - start);
+}
+
+
+/*
+ * mts_mbstos
+ *
+ * Convert a null terminated multi-byte string 'mbstring' to a regular
+ * null terminated string 'string'. A character whose wide value fits
+ * in one byte is stored as that single byte. A character whose wide
+ * value has a non-zero upper byte is stored as the sizeof (mts_wchar_t)
+ * bytes of that value, in host byte order.
+ *
+ * The caller must provide a 'string' buffer large enough for the
+ * result; no bound is checked here. mts_sbequiv_strlen() computes the
+ * required length, excluding the terminating null.
+ *
+ * NOTE: a wide value with a non-zero upper byte and a zero lower byte
+ * (e.g. 0x0100) still places a zero byte in the output.
+ *
+ * If the input stream contains invalid multi-byte characters, 'string'
+ * is null terminated at the current position and -1 is returned.
+ * Otherwise the length of 'string', excluding the terminating null
+ * character, is returned.
+ *
+ * If either mbstring or string is a null pointer, -1 is returned.
+ */
+int
+mts_mbstos(char *string, const char *mbstring)
+{
+	mts_wchar_t wc;
+	char *start = string;
+	int len;
+
+	if (string == 0 || mbstring == 0)
+		return (-1);
+
+	while (*mbstring) {
+		if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
+			*string = 0;
+			return (-1);
+		}
+
+		if (wc & 0xFF00) {
+			/*
+			 * The output position may be at an odd address
+			 * after single byte characters, so copy the wide
+			 * value bytewise rather than storing through a
+			 * possibly misaligned mts_wchar_t pointer.
+			 */
+			bcopy(&wc, string, sizeof (wc));
+			string += sizeof (wc);
+		} else {
+			*string = (unsigned char)wc;
+			string++;
+		}
+
+		mbstring += len;
+	}
+
+	*string = 0;
+
+	/*LINTED E_PTRDIFF_OVERFLOW*/
+	return (string - start);
+}