summaryrefslogtreecommitdiff
path: root/usr/src/man/man3c/c16rtomb.3c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/man/man3c/c16rtomb.3c')
-rw-r--r--usr/src/man/man3c/c16rtomb.3c285
1 files changed, 285 insertions, 0 deletions
diff --git a/usr/src/man/man3c/c16rtomb.3c b/usr/src/man/man3c/c16rtomb.3c
new file mode 100644
index 0000000000..33c6189dd3
--- /dev/null
+++ b/usr/src/man/man3c/c16rtomb.3c
@@ -0,0 +1,285 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2020 Robert Mustacchi
+.\"
+.Dd April 23, 2020
+.Dt C16RTOMB 3C
+.Os
+.Sh NAME
+.Nm c16rtomb ,
+.Nm c32rtomb ,
+.Nm wcrtomb ,
+.Nm wcrtomb_l
+.Nd convert wide-characters to character sequences
+.Sh SYNOPSIS
+.In uchar.h
+.Ft size_t
+.Fo c16rtomb
+.Fa "char *restrict str"
+.Fa "char16_t c16"
+.Fa "mbstate_t *restrict ps"
+.Fc
+.Ft size_t
+.Fo c32rtomb
+.Fa "char *restrict str"
+.Fa "char32_t c32"
+.Fa "mbstate_t *restrict ps"
+.Fc
+.In wchar.h
+.Ft size_t
+.Fo wcrtomb
+.Fa "char *restrict str"
+.Fa "wchar_t wc"
+.Fa "mbstate_t *restrict ps"
+.Fc
+.In wchar.h
+.In xlocale.h
+.Ft size_t
+.Fo wcrtomb_l
+.Fa "char *restrict str"
+.Fa "wchar_t wc"
+.Fa "mbstate_t *restrict ps"
+.Fa "locale_t loc"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn c16rtomb ,
+.Fn c32rtomb ,
+.Fn wcrtomb ,
+and
+.Fn wcrtomb_l
+functions convert wide-character sequences into a series of multi-byte
+characters.
+The functions work in the following formats:
+.Bl -tag -width wcrtomb_l
+.It Fn c16rtomb
+A UTF-16 code sequence, where every code point is represented by one or
+two
+.Vt char16_t .
+The UTF-16 encoding will encode certain Unicode code points as a pair of
+two 16-bit code sequences, commonly referred to as a surrogate pair.
+.It Fn c32rtomb
+A UTF-32 code sequence, where every code point is represented by a
+single
+.Vt char32_t .
+It is illegal to pass reserved Unicode code points.
+.It Fn wcrtomb , Fn wcrtomb_l
+Wide characters, being a 32-bit value where every code point is
+represented by a single
+.Vt wchar_t .
+While the
+.Vt wchar_t
+and
+.Vt char32_t
+are different types, in this implementation, they are similar encodings.
+.El
+.Pp
+The functions all work by looking at the passed in wide-character
+.Po
+.Fa c16 ,
+.Fa c32 ,
+.Fa wc
+.Pc
+and appending it to the current conversion state,
+.Fa ps .
+Once a valid code point, based on the current locale, is found, then it
+will be converted into a series of characters that are stored in
+.Fa str .
+Up to
+.Dv MB_CUR_MAX
+bytes will be stored in
+.Fa str .
+It is the caller's responsibility to ensure that there is sufficient
+space in
+.Fa str .
+.Pp
+The functions are all influenced by the
+.Dv LC_CTYPE
+category of the current locale for determining what is considered a
+valid character.
+For example, in the
+.Sy C
+locale,
+only ASCII characters are recognized, while in a
+.Sy UTF-8
+based locale like
+.Sy en_US.UTF-8 ,
+all valid Unicode code points are recognized and will be converted into
+the corresponding multi-byte sequence.
+The
+.Fn wcrtomb_l
+function uses the locale passed in
+.Fa loc
+rather than the locale of the current thread.
+.Pp
+The
+.Fa ps
+argument represents a multi-byte conversion state which can be used
+across multiple calls to a given function
+.Pq but not mixed between functions .
+This allows for characters to be consumed from subsequent buffers, e.g.
+different values of
+.Fa str .
+The functions may be called from multiple threads as long as they use
+unique values for
+.Fa ps .
+If
+.Fa ps
+is
+.Dv NULL ,
+then a function-specific buffer will be used for the conversion state;
+however, this is shared between all threads and its use is not
+recommended.
+.Pp
+The functions all have a special behavior when
+.Dv NULL
+is passed for
+.Fa str .
+They instead will treat it as though the NULL wide-character was
+passed in
+.Fa c16 ,
+.Fa c32 ,
+or
+.Fa wc
+and an internal buffer
+.Pq buf
+will be used to write out the results of the
+conversion.
+In other words, the functions would be called as:
+.Bd -literal -offset indent
+c16rtomb(buf, L'\\0', ps)
+c32rtomb(buf, L'\\0', ps)
+wcrtomb(buf, L'\\0', ps)
+wcrtomb_l(buf, L'\\0', ps, loc)
+.Ed
+.Ss Locale Details
+Not all locales in the system are Unicode based locales.
+For example, ISO 8859 family locales have code points with values that
+do not match their counterparts in Unicode.
+When using these functions with non-Unicode based locales, the code
+points returned will be those determined by the locale.
+They will not be converted from the corresponding Unicode code point.
+For example, if using the Euro sign in ISO 8859-15, these functions
+will not encode the Unicode value 0x20ac into the ISO 8859-15 value
+0xa4.
+.Pp
+Regardless of the locale, the characters returned will be encoded as
+though the code point were the corresponding value in Unicode.
+This means that when using UTF-16, if the corresponding code point were
+in the range for surrogate pairs, then the
+.Fn c16rtomb
+function will expect to receive that code point in that fashion.
+.Pp
+This behavior of the
+.Fn c16rtomb
+and
+.Fn c32rtomb
+functions should not be relied upon, is not portable, and is subject to
+change for non-Unicode locales.
+.Sh RETURN VALUES
+Upon successful completion, the
+.Fn c16rtomb ,
+.Fn c32rtomb ,
+.Fn wcrtomb ,
+and
+.Fn wcrtomb_l
+functions return the number of bytes stored in
+.Fa str .
+Otherwise,
+.Sy (size_t)-1
+is returned to indicate an encoding error and
+.Va errno
+is set.
+.Sh EXAMPLES
+.Sy Example 1
+Converting a UTF-32 character into a multi-byte character sequence.
+.Bd -literal
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <stdio.h>
+#include <uchar.h>
+
+int
+main(void)
+{
+ mbstate_t mbs;
+ size_t ret;
+ char buf[MB_CUR_MAX];
+ char32_t val = 0x5149;
+ const char *uchar_exp = "\exe5\ex85\ex89";
+
+ (void) memset(&mbs, 0, sizeof (mbs));
+ (void) setlocale(LC_CTYPE, "en_US.UTF-8");
+ ret = c32rtomb(buf, val, &mbs);
+ if (ret != strlen(uchar_exp)) {
+ errx(EXIT_FAILURE, "failed to convert string, got %zd",
+ ret);
+ }
+
+ if (strncmp(buf, uchar_exp, ret) != 0) {
+ errx(EXIT_FAILURE, "converted char32_t does not match "
+ "expected value");
+ }
+
+ return (0);
+}
+.Ed
+.Sh ERRORS
+The
+.Fn c16rtomb ,
+.Fn c32rtomb ,
+.Fn wcrtomb ,
+and
+.Fn wcrtomb_l
+functions will fail if:
+.Bl -tag -width Er
+.It Er EINVAL
+The conversion state in
+.Fa ps
+is invalid.
+.It Er EILSEQ
+An invalid character sequence has been detected.
+.El
+.Sh MT-LEVEL
+The
+.Fn c16rtomb ,
+.Fn c32rtomb ,
+.Fn wcrtomb ,
+and
+.Fn wcrtomb_l
+functions are
+.Sy MT-Safe
+as long as different
+.Vt mbstate_t
+structures are passed in
+.Fa ps .
+If
+.Fa ps
+is
+.Dv NULL
+or different threads use the same value for
+.Fa ps ,
+then the functions are
+.Sy Unsafe .
+.Sh INTERFACE STABILITY
+.Sy Committed
+.Sh SEE ALSO
+.Xr mbrtoc16 3C ,
+.Xr mbrtoc32 3C ,
+.Xr mbrtowc 3C ,
+.Xr newlocale 3C ,
+.Xr setlocale 3C ,
+.Xr uselocale 3C ,
+.Xr uchar.h 3HEAD ,
+.Xr environ 5