6740240 ssh: password prompt is garbled on ja_JP.PCK/ja_JP.eucJP locale

author: Nobutomo Nakano <Nobutomo.Nakano@Sun.COM> 2009-05-11 12:11:53 -0700
committer: Nobutomo Nakano <Nobutomo.Nakano@Sun.COM> 2009-05-11 12:11:53 -0700
commit: 6f786ace10b9c0c7c5515e525fb660fbccfda6a3 (patch)
tree: a455902edb891743770d97ebb458821963392a26 /usr/src/cmd/ssh/libssh/common/g11n.c
parent: 1c7408c96b4914bcab86c14af29f1af003397b16 (diff)
download: illumos-gate-6f786ace10b9c0c7c5515e525fb660fbccfda6a3.tar.gz
1 files changed, 238 insertions, 351 deletions
diff --git a/usr/src/cmd/ssh/libssh/common/g11n.c b/usr/src/cmd/ssh/libssh/common/g11n.c
index 2ea20d9467..ac35a1c8c5 100644
--- a/usr/src/cmd/ssh/libssh/common/g11n.c
+++ b/usr/src/cmd/ssh/libssh/common/g11n.c
@@ -18,17 +18,16 @@
  *
  * CDDL HEADER END
  *
- * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
+ * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
  * Use is subject to license terms.
  */
 
-#pragma ident	"%Z%%M%	%I%	%E% SMI"
-
 #include <errno.h>
 #include <locale.h>
 #include <langinfo.h>
 #include <iconv.h>
 #include <ctype.h>
+#include <wctype.h>
 #include <strings.h>
 #include <string.h>
 #include <stdio.h>
@@ -36,6 +35,8 @@
 #include "includes.h"
 #include "xmalloc.h"
 #include "xlist.h"
+#include "compat.h"
+#include "log.h"
 
 #ifdef MIN
 #undef MIN
@@ -48,14 +49,17 @@
 /* two-char country code, '-' and two-char region code */
 #define	LANGTAG_MAX	5
 
-static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf,
-    uint_t len, uint_t *outlen, int *err, uchar_t **err_str);
-
 static int locale_cmp(const void *d1, const void *d2);
 static char *g11n_locale2langtag(char *locale);
 
-uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str);
-uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str);
+static char *do_iconv(iconv_t cd, const char *s, uint_t *lenp, char **err_str);
+
+/*
+ * native_codeset records the codeset of the default system locale.
+ * It is used to convert the contents of files (e.g. /etc/issue) that are
+ * expected to be in the codeset of the default system locale.
+ */
+static char *native_codeset;
 
 /*
  * Convert locale string name into a language tag. The caller is responsible for
@@ -213,6 +217,13 @@ g11n_setlocale(int category, const char *locale)
 {
 	char *curr;
 
+	if (native_codeset == NULL) {
+		/* set default locale, and record current codeset */
+		(void) setlocale(LC_ALL, "");
+		curr = nl_langinfo(CODESET);
+		native_codeset = xstrdup(curr);
+	}
+
 	/* we have one text domain - always set it */
 	(void) textdomain(TEXT_DOMAIN);
 
@@ -662,398 +673,274 @@ err:
 	return (result);
 }
 
-
 /*
- * Functions for validating ASCII and UTF-8 strings
+ * Functions for converting to UTF-8 from the local codeset and
+ * converting from UTF-8 to the local codeset.
  *
- * The error_str parameter is an optional pointer to a char variable
- * where to store a string suitable for use with error() or fatal() or
- * friends.
- *
- * The return value is 0 if success, EILSEQ or EINVAL.
+ * The error_str parameter is a pointer to a char pointer variable in
+ * which to store a string suitable for use with error(), fatal() or friends.
+ * It also serves as the error indicator when NULL is returned.
  *
+ * If conversion isn't necessary, *error_str is set to NULL, and
+ * NULL is returned.
+ * If a conversion error occurred, *error_str points to an error message,
+ * and NULL is returned.
  */
-uint_t
-g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str)
+char *
+g11n_convert_from_utf8(const char *str, uint_t *lenp, char **error_str)
 {
-	uchar_t *p;
+	static char *last_codeset;
+	static iconv_t cd = (iconv_t)-1;
+	char	*codeset;
 
-	for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++)
-		;
+	*error_str = NULL;
 
-	if (len && ((p - (uchar_t *)str) != len))
-		return (EILSEQ);
+	codeset = nl_langinfo(CODESET);
 
-	return (0);
-}
-
-uint_t
-g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str)
-{
-	uchar_t *p;
-	uint_t c, l;
-
-	if (len == 0)
-		len = strlen((const char *)str);
-
-	for (p = (uchar_t *)str; p && (p - str < len) && *p; ) {
-		/* 8-bit chars begin a UTF-8 sequence */
-		if (*p & 0x80) {
-			/* get sequence length and sanity check first byte */
-			if (*p < 0xc0)
-				return (EILSEQ);
-			else if (*p < 0xe0)
-				l = 2;
-			else if (*p < 0xf0)
-				l = 3;
-			else if (*p < 0xf8)
-				l = 4;
-			else if (*p < 0xfc)
-				l = 5;
-			else if (*p < 0xfe)
-				l = 6;
-			else
-				return (EILSEQ);
-
-			if ((p + l - str) >= len)
-				return (EILSEQ);
-
-			/* overlong detection - build codepoint */
-			c = *p & 0x3f;
-			/* shift c bits from first byte */
-			c = c << (6 * (l - 1));
-
-			if (l > 1) {
-				if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80))
-					c = c | ((*(p + 1) & 0x3f) <<
-					    (6 * (l - 2)));
-				else
-					return (EILSEQ);
-
-				if (c < 0x80)
-					return (EILSEQ);
-			}
-
-			if (l > 2) {
-				if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80))
-					c = c | ((*(p + 2) & 0x3f) <<
-					    (6 * (l - 3)));
-				else
-					return (EILSEQ);
+	if (strcmp(codeset, "UTF-8") == 0)
+		return (NULL);
 
-				if (c < 0x800)
-					return (EILSEQ);
-			}
+	if (last_codeset == NULL || strcmp(codeset, last_codeset) != 0) {
+		if (last_codeset != NULL) {
+			xfree(last_codeset);
+			last_codeset = NULL;
+		}
+		if (cd != (iconv_t)-1)
+			(void) iconv_close(cd);
 
-			if (l > 3) {
-				if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80))
-					c = c | ((*(p + 3) & 0x3f) <<
-					    (6 * (l - 4)));
-				else
-					return (EILSEQ);
+		if ((cd = iconv_open(codeset, "UTF-8")) == (iconv_t)-1) {
+			*error_str = gettext("Cannot convert UTF-8 "
+			    "strings to the local codeset");
+			return (NULL);
+		}
+		last_codeset = xstrdup(codeset);
+	}
+	return (do_iconv(cd, str, lenp, error_str));
+}
 
-				if (c < 0x10000)
-					return (EILSEQ);
-			}
+char *
+g11n_convert_to_utf8(const char *str, uint_t *lenp,
+    int native, char **error_str)
+{
+	static char *last_codeset;
+	static iconv_t cd = (iconv_t)-1;
+	char	*codeset;
 
-			if (l > 4) {
-				if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80))
-					c = c | ((*(p + 4) & 0x3f) <<
-					    (6 * (l - 5)));
-				else
-					return (EILSEQ);
+	*error_str = NULL;
 
-				if (c < 0x200000)
-					return (EILSEQ);
-			}
+	if (native)
+		codeset = native_codeset;
+	else
+		codeset = nl_langinfo(CODESET);
 
-			if (l > 5) {
-				if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80))
-					c = c | (*(p + 5) & 0x3f);
-				else
-					return (EILSEQ);
+	if (strcmp(codeset, "UTF-8") == 0)
+		return (NULL);
 
-				if (c < 0x4000000)
-					return (EILSEQ);
-			}
+	if (last_codeset == NULL || strcmp(codeset, last_codeset) != 0) {
+		if (last_codeset != NULL) {
+			xfree(last_codeset);
+			last_codeset = NULL;
+		}
+		if (cd != (iconv_t)-1)
+			(void) iconv_close(cd);
 
-			/*
-			 * check for UTF-16 surrogates ifs other illegal
-			 * UTF-8 * points
-			 */
-			if (((c <= 0xdfff) && (c >= 0xd800)) ||
-			    (c == 0xfffe) || (c == 0xffff))
-				return (EILSEQ);
-			p += l;
+		if ((cd = iconv_open("UTF-8", codeset)) == (iconv_t)-1) {
+			*error_str = gettext("Cannot convert the "
+			    "local codeset strings to UTF-8");
+			return (NULL);
 		}
-		/* 7-bit chars are fine */
-		else
-			p++;
+		last_codeset = xstrdup(codeset);
 	}
-	return (0);
+	return (do_iconv(cd, str, lenp, error_str));
 }
 
 /*
- * Functions for converting to ASCII or UTF-8 from the local codeset
- * Functions for converting from ASCII or UTF-8 to the local codeset
- *
- * The error_str parameter is an optional pointer to a char variable
- * where to store a string suitable for use with error() or fatal() or
- * friends.
- *
- * The err parameter is an optional pointer to an integer where 0
- * (success) or EILSEQ or EINVAL will be stored (failure).
- *
- * These functions return NULL if the conversion fails.
+ * Wrapper around iconv()
  *
+ * The caller is responsible for freeing the result. NULL is returned when
+ * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
+ * The caller must ensure that the input string isn't a NULL pointer.
  */
-uchar_t *
-g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str)
+static char *
+do_iconv(iconv_t cd, const char *str, uint_t *lenp, char **err_str)
 {
-	static uint_t initialized = 0;
-	static uint_t do_convert = 0;
-	iconv_t cd;
-	int err;
-
-	if (!initialized) {
-		/*
-		 * iconv_open() fails if the to/from codesets are the
-		 * same, and there are aliases of codesets to boot...
-		 */
-		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
-		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
-		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
-			initialized = 1;
-			do_convert = 0;
-		} else {
-			cd = iconv_open(nl_langinfo(CODESET), "646");
-			if (cd == (iconv_t)-1) {
-				if (err_ptr)
-					*err_ptr = errno;
-				if (error_str)
-					*error_str = (uchar_t *)"Cannot "
-					    "convert ASCII strings to the local"
-					    " codeset";
+	int	ilen, olen;
+	size_t	ileft, oleft;
+	char	*ostr, *optr;
+	const char *istr;
+
+	ilen = *lenp;
+	olen = ilen + 1;
+
+	ostr = NULL;
+	for (;;) {
+		olen *= 2;
+		oleft = olen;
+		ostr = optr = xrealloc(ostr, olen);
+		istr = (const char *)str;
+		if ((ileft = ilen) == 0)
+			break;
+
+		if (iconv(cd, &istr, &ileft, &optr, &oleft) != (size_t)-1) {
+			/* success: generate reset sequence */
+			if (iconv(cd, NULL, NULL,
+			    &optr, &oleft) == (size_t)-1 && errno == E2BIG) {
+				continue;
 			}
-			initialized = 1;
-			do_convert = 1;
+			break;
 		}
-	}
-
-	if (!do_convert) {
-		if ((err = g11n_validate_ascii(str, 0, error_str))) {
-			if (err_ptr)
-				*err_ptr = err;
+		/* failed */
+		if (errno != E2BIG) {
+			oleft = olen;
+			(void) iconv(cd, NULL, NULL, &ostr, &oleft);
+			xfree(ostr);
+			*err_str = gettext("Codeset conversion failed");
 			return (NULL);
-		} else
-			return ((uchar_t *)xstrdup(str));
-	}
-
-	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
-}
-
-uchar_t *
-g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
-{
-	static uint_t initialized = 0;
-	static uint_t do_convert = 0;
-	iconv_t cd;
-	int err;
-
-	if (!initialized) {
-		/*
-		 * iconv_open() fails if the to/from codesets are the
-		 * same, and there are aliases of codesets to boot...
-		 */
-		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
-		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
-			initialized = 1;
-			do_convert = 0;
-		} else {
-			cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
-			if (cd == (iconv_t)-1) {
-				if (err_ptr)
-					*err_ptr = errno;
-				if (error_str)
-					*error_str = (uchar_t *)"Cannot "
-					    "convert UTF-8 strings to the "
-					    "local codeset";
-			}
-			initialized = 1;
-			do_convert = 1;
 		}
 	}
+	olen = optr - ostr;
+	optr = xmalloc(olen + 1);
+	(void) memcpy(optr, ostr, olen);
+	xfree(ostr);
 
-	if (!do_convert) {
-		if ((err = g11n_validate_utf8(str, 0, error_str))) {
-			if (err_ptr)
-				*err_ptr = err;
-			return (NULL);
-		} else
-			return ((uchar_t *)xstrdup((char *)str));
-	}
+	optr[olen] = '\0';
+	*lenp = olen;
 
-	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
+	return (optr);
 }
 
+/*
+ * A filter for output string. Control and unprintable characters
+ * are converted into visible form (eg "\ooo").
+ */
 char *
-g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str)
+g11n_filter_string(char *s)
 {
-	static uint_t initialized = 0;
-	static uint_t do_convert = 0;
-	iconv_t cd;
-
-	if (!initialized) {
-		/*
-		 * iconv_open() fails if the to/from codesets are the
-		 * same, and there are aliases of codesets to boot...
-		 */
-		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
-		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
-		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
-			initialized = 1;
-			do_convert = 0;
-		} else {
-			cd = iconv_open("646", nl_langinfo(CODESET));
-			if (cd == (iconv_t)-1) {
-				if (err_ptr)
-					*err_ptr = errno;
-				if (error_str)
-					*error_str = (uchar_t *)"Cannot "
-					    "convert UTF-8 strings to the "
-					    "local codeset";
-			}
-			initialized = 1;
-			do_convert = 1;
+	int	mb_cur_max = MB_CUR_MAX;
+	int	mblen, len;
+	char	*os = s;
+	wchar_t	wc;
+	char	*obuf, *op;
+
+	/* every input byte may be expanded into the four-byte form \ooo */
+	obuf = op = xmalloc(strlen(s) * 4 + 1);
+
+	while (*s != '\0') {
+		mblen = mbtowc(&wc, s, mb_cur_max);
+		if (mblen <= 0) {
+			mblen = 1;
+			wc = (unsigned char)*s;
 		}
-	}
-
-	if (!do_convert)
-		return (xstrdup((char *)str));
-
-	return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
-}
-
-uchar_t *
-g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
-{
-	static uint_t initialized = 0;
-	static uint_t do_convert = 0;
-	iconv_t cd;
-
-	if (!initialized) {
-		/*
-		 * iconv_open() fails if the to/from codesets are the
-		 * same, and there are aliases of codesets to boot...
-		 */
-		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
-		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
-			initialized = 1;
-			do_convert = 0;
+		if (!iswprint(wc) &&
+		    wc != L'\n' && wc != L'\r' && wc != L'\t') {
+			/*
+			 * control chars which need to be replaced
+			 * with safe character sequence.
+			 */
+			while (mblen != 0) {
+				op += sprintf(op, "\\%03o",
+				    (unsigned char)*s++);
+				mblen--;
+			}
 		} else {
-			cd = iconv_open("UTF-8", nl_langinfo(CODESET));
-			if (cd == (iconv_t)-1) {
-				if (err_ptr)
-					*err_ptr = errno;
-				if (error_str)
-					*error_str = (uchar_t *)"Cannot "
-					    "convert UTF-8 strings to the "
-					    "local codeset";
+			while (mblen != 0) {
+				*op++ = *s++;
+				mblen--;
 			}
-			initialized = 1;
-			do_convert = 1;
 		}
 	}
-
-	if (!do_convert)
-		return ((uchar_t *)xstrdup((char *)str));
-
-	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
+	*op = '\0';
+	len = op - obuf + 1;
+	op = xrealloc(os, len);
+	(void) memcpy(op, obuf, len);
+	xfree(obuf);
+	return (op);
 }
 
-
 /*
- * Wrapper around iconv()
+ * Once a langtag has been negotiated, the server needs to map it to a
+ * system locale. That is done based on the locales supported on the server
+ * side. We know (for the locales supported on Solaris) what the langtag is
+ * mapped to. However, from the client's point of view, there is no way to
+ * know exactly what locale (encoding) will be used.
  *
- * The caller is responsible for freeing the result and for handling
- * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
+ * With the fix for SSH_BUG_STRING_ENCODING, it is guaranteed that UTF-8
+ * characters always come over the wire, so this is no longer a problem
+ * as long as both sides have the fix. However, if the server side doesn't
+ * have the fix, the client can't safely perform code conversion since the
+ * incoming character encoding is unknown.
+ *
+ * To alleviate this situation, we take an empirical approach to find
+ * encoding from langtag.
+ *
+ * If langtag has a subtag, we can directly map the langtag to UTF-8 locale
+ * (eg en-US can be mapped to en_US.UTF-8) with a few exceptions.
+ * Certain xx_YY locales don't support UTF-8 encoding (probably due to lack
+ * of L10N support ..). Those are:
+ *
+ * 	no_NO, no_NY, sr_SP, sr_YU
+ *
+ * They all use ISO8859-X encoding.
+ *
+ * Some of the "xx" langtags can be mapped to "xx.UTF-8", but others
+ * cannot, so for those we need to use "xx" itself as the locale name.
+ * Those locales are:
+ *
+ * ar, ca, cs, da, et, fi, he, hu, ja, lt, lv, nl, no, pt, sh, th, tr
+ *
+ * Their encodings vary. They could be ISO8859-X or EUC or something else.
+ * So we don't perform code conversion for these langtags.
  */
-static uchar_t *
-do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len,
-    uint_t *outlen, int *err, uchar_t **err_str)
-{
-	size_t inbytesleft, outbytesleft, converted_size;
-	char *outbuf;
-	uchar_t *converted;
-	const char *inbuf;
-	uint_t mul = 0;
+static const char *non_utf8_langtag[] = {
+	"no-NO", "no-NY", "sr-SP", "sr-YU",
+	"ar", "ca", "cs", "da", "et", "fi", "he", "hu", "ja",
+	"lt", "lv", "nl", "no", "pt", "sh", "th", "tr", NULL};
 
-	if (!buf || !(*(char *)buf))
-		return (NULL);
+void
+g11n_test_langtag(const char *lang, int server)
+{
+	const char	**lp;
 
-	if (len == 0)
-		len = strlen(buf);
-
-	/* reset conversion descriptor */
-	/* XXX Do we need initial shift sequences for UTF-8??? */
-	(void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
-	inbuf = (const char *) buf;
-
-	if (mul_ptr)
-		mul = *mul_ptr;
-
-	converted_size = (len << mul);
-	outbuf = (char *)xmalloc(converted_size + 1); /* for null */
-	converted = (uchar_t *)outbuf;
-	outbytesleft = len;
-
-	do {
-		if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
-		    (size_t)-1) {
-			if (errno == E2BIG) {
-				/* UTF-8 codepoints are at most 8 bytes long */
-				if (mul > 2) {
-					if (err_str)
-						*err_str = (uchar_t *)
-						    "Conversion to UTF-8 failed"
-						    " due to preposterous space"
-						    " requirements";
-					if (err)
-						*err = EILSEQ;
-					return (NULL);
-				}
+	if (datafellows & SSH_BUG_LOCALES_NOT_LANGTAGS) {
+		/*
+		 * We negotiated with real locale name (not lang tag).
+		 * We shouldn't expect UTF-8, thus shouldn't do code
+		 * conversion.
+		 */
+		datafellows |= SSH_BUG_STRING_ENCODING;
+		return;
+	}
 
-				/*
-				 * re-alloc output and ensure that the outbuf
-				 * and outbytesleft values are adjusted
-				 */
-				converted = xrealloc(converted,
-				    converted_size << 1 + 1);
-				outbuf = (char *)converted + converted_size -
-				    outbytesleft;
-				converted_size = (len << ++(mul));
-				outbytesleft = converted_size - outbytesleft;
-			} else {
-				/*
-				 * let the caller deal with iconv() errors,
-				 * probably by calling fatal(); xfree() does
-				 * not set errno
-				 */
-				if (err)
-					*err = errno;
-				xfree(converted);
-				return (NULL);
-			}
+	if (datafellows & SSH_BUG_STRING_ENCODING) {
+		if (server) {
+			/*
+			 * Whatever bug exists in the client side, server
+			 * side has nothing to do, since server has no way
+			 * to know what actual encoding is used on the client
+			 * side. For example, even if we negotiated with
+			 * en_US, client locale could be en_US.ISO8859-X or
+			 * en_US.UTF-8.
+			 */
+			return;
 		}
-	} while (inbytesleft);
-
-	*outbuf = '\0'; /* ensure null-termination */
-	if (outlen)
-		*outlen = converted_size - outbytesleft;
-	if (mul_ptr)
-		*mul_ptr = mul;
-
-	return (converted);
+		/*
+		 * We are on the client side. We'll check with known
+		 * locales to see if non-UTF8 characters could come in.
+		 */
+		for (lp = non_utf8_langtag; *lp != NULL; lp++) {
+			if (strcmp(lang, *lp) == 0)
+				break;
+		}
+		if (*lp == NULL) {
+			debug2("Server is expected to use UTF-8 locale");
+			datafellows &= ~SSH_BUG_STRING_ENCODING;
+		} else {
+			/*
+			 * Server is expected to use non-UTF8 encoding.
+			 */
+			debug2("Enforcing no code conversion: %s", lang);
+		}
+	}
 }
 
 /*
author	Nobutomo Nakano <Nobutomo.Nakano@Sun.COM>	2009-05-11 12:11:53 -0700
committer	Nobutomo Nakano <Nobutomo.Nakano@Sun.COM>	2009-05-11 12:11:53 -0700
commit	6f786ace10b9c0c7c5515e525fb660fbccfda6a3 (patch)
tree	a455902edb891743770d97ebb458821963392a26 /usr/src/cmd/ssh/libssh/common/g11n.c
parent	1c7408c96b4914bcab86c14af29f1af003397b16 (diff)
download	illumos-gate-6f786ace10b9c0c7c5515e525fb660fbccfda6a3.tar.gz