diff options
Diffstat (limited to 'usr/src/man/man3c/mbrtoc16.3c')
-rw-r--r-- | usr/src/man/man3c/mbrtoc16.3c | 397 |
1 files changed, 397 insertions, 0 deletions
diff --git a/usr/src/man/man3c/mbrtoc16.3c b/usr/src/man/man3c/mbrtoc16.3c new file mode 100644 index 0000000000..d1b3ab478b --- /dev/null +++ b/usr/src/man/man3c/mbrtoc16.3c @@ -0,0 +1,397 @@ +.\" +.\" This file and its contents are supplied under the terms of the +.\" Common Development and Distribution License ("CDDL"), version 1.0. +.\" You may only use this file in accordance with the terms of version +.\" 1.0 of the CDDL. +.\" +.\" A full copy of the text of the CDDL should have accompanied this +.\" source. A copy of the CDDL is also available via the Internet at +.\" http://www.illumos.org/license/CDDL. +.\" +.\" +.\" Copyright 2020 Robert Mustacchi +.\" +.Dd April 23, 2020 +.Dt MBRTOC16 3C +.Os +.Sh NAME +.Nm mbrtoc16 , +.Nm mbrtoc32 , +.Nm mbrtowc , +.Nm mbrtowc_l +.Nd convert characters to wide characters +.Sh SYNOPSIS +.In wchar.h +.Ft size_t +.Fo mbrtowc +.Fa "wchar_t *restrict pwc" +.Fa "const char *restrict str" +.Fa "size_t len" +.Fa "mstate_t *restrict ps" +.Fc +.In wchar.h +.In xlocale.h +.Ft size_t +.Fo mbrtowc +.Fa "wchar_t *restrict pwc" +.Fa "const char *restrict str" +.Fa "size_t len" +.Fa "mstate_t *restrict ps" +.Fa "locale_t loc" +.Fc +.In uchar.h +.Ft size_t +.Fo mbrtoc16 +.Fa "char16_t *restrict p16c" +.Fa "const char *restrict str" +.Fa "size_t len" +.Fa "mbstate_t *restrict ps" +.Fc +.Ft size_t +.Fo mbrtoc32 +.Fa "char32_t *restrict p32c" +.Fa "const char *restrict str" +.Fa "size_t len" +.Fa "mbstate_t *restrict ps" +.Fc +.Sh DESCRIPTION +The +.Fn mbrtoc16 , +.Fn mbrtoc32 , +.Fn mbrtowc , +and +.Fn mbrtowc_l +functions convert character sequences, which may contain multi-byte +characters, into different character formats. +The functions work in the following formats: +.Bl -tag -width mbrtowc_l +.It Fn mbrtoc16 +A UTF-16 code sequence, where every code point is represented by one or +two +.Vt char16_t . +The UTF-16 encoding will encode certain Unicode code points as a pair of +two 16-bit code sequences, commonly referred to as a surrogate pair. +.It Fn mbrtoc32 +A UTF-32 code sequence, where every code point is represented by a +single +.Vt char32_t . +.It Fn mbrtowc , Fn mbrtowc_l +Wide characters, being a 32-bit value where every code point is +represented by a single +.Vt wchar_t . +While the +.Vt wchar_t +and +.Vt char32_t +are different types, in this implementation, they are similar encodings. +.El +.Pp +The functions consume up to +.Fa len +characters from the string +.Fa str +and accumulate them in +.Fa ps +until a valid character is found, which is influenced by +the +.Dv LC_CTYPE +category of the current locale. +For example, in the +.Sy C +locale, only ASCII characters are recognized, while in a +.Sy UTF-8 +based locale like +.Sy en_US.UTF-8 , +UTF-8 multi-byte character sequences that represent Unicode code points +are recognized. +The +.Fn mbrtowc_l +function uses the locale passed in +.Fa loc +rather than the locale of the current thread. +.Pp +When a valid character sequence has been found, it is converted to +either a 16-bit character sequence for +.Fn mbrtoc16 +or a 32-bit character sequence for +.Fn mbrtoc32 +and will be stored in +.Fa p16c +and +.Fa p32c +respectively. +.Pp +The +.Fa ps +argument represents a multi-byte conversion state which can be used +across multiple calls to a given function +.Pq but not mixed between functions . +These allow for characters to be consumed from subsequent buffers, e.g. +different values of +.Fa str . +The functions may be called from multiple threads as long as they use +unique values for +.Fa ps . +If +.Fa ps +is +.Dv NULL , +then a function-specific buffer will be used for the conversion state; +however, this is stored between all threads and its use is not +recommended. +.Pp +When using these functions, more than one character may be output for a +given set of consumed input characters. +An example of this is when a given code point is represented as a set of +surrogate pairs in UTF-16, which require two 16-bit characters to +represent a code point. +When this occurs, the functions return the special return value +.Sy -3 . +.Pp +The functions all have a special behavior when +.Dv NULL +is passed for +.Fa str . +They instead will treat it as though +.Fa pwc , +.Fa p16c , +or +.Fa p32c +were +.Dv NULL , +.Fa str +had been passed as the empty string, "" and the length, +.Fa len , +would appear as the value 1. +In other words, the functions would be called as: +.Bd -literal -offset indent +mbrtowc(NULL, "", 1, ps) +mbrtowc_l(NULL, "", 1, ps) +mbrtoc16(NULL, "", 1, ps) +mbrtoc32(NULL, "", 1, ps) +.Ed +.Ss Locale Details +Not all locales in the system are Unicode based locales. +For example, ISO 8859 family locales have code points with values that +do not match their counterparts in Unicode. +When using these functions with non-Unicode based locales, the code +points returned will be those determined by the locale. +They will not be converted to the corresponding Unicode code point. +For example, if using the Euro sign in ISO 8859-15, these functions +might return the code point 0xa4 and not the Unicode value 0x20ac. +.Pp +Regardless of the locale, the characters returned will be encoded as +though the code point were the corresponding value in Unicode. +This means that if a locale returns a value that would be a surrogate +pair in the UTF-16 encoding, it will still be encoded as a UTF-16 +character. +.Pp +This behavior of the +.Fn mbrtoc16 +and +.Fn mbrtoc32 +functions should not be relied upon, is not portable, and subject to +change for non-Unicode locales. +.Sh RETURN VALUES +The +.Fn mbrtoc16 , +.Fn mbrtoc32 , +.Fn mbrtowc , +and +.Fn mbrtowc_l +functions return the following values: +.Bl -tag -width (size_t)-3 +.It Sy 0 +.Fa len +or fewer bytes of +.Fa str +were consumed and the null wide character was written into the wide +character buffer +.Po +.Fa pwc , +.Fa p16c , +.Fa p32c +.Pc . +.It Sy between 1 and len +The specified number of bytes were consumed and a single character was +written into the wide character buffer +.Po +.Fa pwc , +.Fa p16c , +.Fa p32c +.Pc . +.It Sy (size_t)-1 +An encoding error has occurred. +The next +.Fa len +bytes of +.Fa str +do not contribute to a valid character. +.Va errno +has been set to +.Er EILSEQ . +No data was written into the wide character buffer +.Po +.Fa pwc , +.Fa p16c , +.Fa p32c +.Pc . +.It Sy (size_t)-2 +.Fa len +bytes of +.Fa str +were consumed, but a complete multi-byte character sequence has not been +found and no data was written into the wide character buffer +.Po +.Fa pwc , +.Fa p16c , +.Fa p32c +.Pc . +.It Sy (size_t)-3 +A character has been written into the wide character buffer +.Po +.Fa pwc , +.Fa p16c , +.Fa p32c +.Pc . +This character was from a previous call (such as another part of a +UTF-16 surrogate pair) and no input was consumed. +This is limited to the +.Fn mbrtoc16 +and +.Fn mbrtoc32 +functions. +.El +.Sh EXAMPLES +.Sy Example 1 +Using the +.Fn mbrtoc32 +function to convert a multibyte string. +.Bd -literal +#include <locale.h> +#include <stdlib.h> +#include <string.h> +#include <err.h> +#include <stdio.h> +#include <uchar.h> + +int +main(void) +{ + mbstate_t mbs; + char32_t out; + size_t ret; + const char *uchar_str = "\exe5\ex85\ex89"; + + (void) memset(&mbs, 0, sizeof (mbs)); + (void) setlocale(LC_CTYPE, "en_US.UTF-8"); + ret = mbrtoc32(&out, uchar_str, strlen(uchar_str), &mbs); + if (ret != strlen(uchar_str)) { + errx(EXIT_FAILURE, "failed to convert string, got %zd", + ret); + } + + (void) printf("Converted %zu bytes into UTF-32 character " + "0x%x\n", ret, out); + return (0); +} +.Ed +.Pp +When compiled and run, this produces: +.Bd -literal -offset indent +$ ./a.out +Converted 3 bytes into UTF-32 character 0x5149 +.Ed +.Pp +.Sy Example 2 +Handling surrogate pairs from the +.Fn mbrtoc16 +function. +.Bd -literal +#include <locale.h> +#include <stdlib.h> +#include <string.h> +#include <err.h> +#include <stdio.h> +#include <uchar.h> + +int +main(void) +{ + mbstate_t mbs; + char16_t first, second; + size_t ret; + const char *uchar_str = "\exf0\ex9f\ex92\exa9"; + + (void) memset(&mbs, '\0', sizeof (mbs)); + (void) setlocale(LC_CTYPE, "en_US.UTF-8"); + ret = mbrtoc16(&first, uchar_str, strlen(uchar_str), &mbs); + if (ret != strlen(uchar_str)) { + errx(EXIT_FAILURE, "failed to convert string, got %zd", + ret); + } + + ret = mbrtoc16(&second, "", 0, &mbs); + if (ret != (size_t)-3) { + errx(EXIT_FAILURE, "didn't get second surrogate pair, " + "got %zd", ret); + } + + (void) printf("UTF-16 surrogates: 0x%x 0x%x\n", first, second); + return (0); +} +.Ed +.Pp +When compiled and run, this produces: +.Bd -literal -offset indent +$ ./a.out +UTF-16 surrogates: 0xd83d 0xdca9 +.Ed +.Sh ERRORS +The +.Fn mbrtoc16 , +.Fn mbrtoc32 , +.Fn mbrtowc , +and +.Fn mbrtowc_l +functions will fail if: +.Bl -tag -width Er +.It Er EINVAL +The conversion state in +.Fa ps +is invalid. +.It Er EILSEQ +An invalid character sequence has been detected. +.El +.Sh MT-LEVEL +The +.Fn mbrtoc16 , +.Fn mbrtoc32 , +.Fn mbrtowc , +and +.Fn mbrtowc_l +functions are +.Sy MT-Safe +as long as different +.Vt mbstate_t +structures are passed in +.Fa ps . +If +.Fa ps +is +.Dv NULL +or different threads use the same value for +.Fa ps , +then the functions are +.Sy Unsafe . +.Sh INTERFACE STABILITY +.Sy Committed +.Sh SEE ALSO +.Xr c16rtomb 3C , +.Xr c32rtomb 3C , +.Xr newlocale 3C , +.Xr setlocale 3C , +.Xr uselocale 3C , +.Xr wcrtomb 3C , +.Xr uchar.h 3HEAD , +.Xr environ 5 |