summaryrefslogtreecommitdiff
path: root/usr/src/man/man3c/mbrtoc16.3c
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src/man/man3c/mbrtoc16.3c')
-rw-r--r--usr/src/man/man3c/mbrtoc16.3c397
1 files changed, 397 insertions, 0 deletions
diff --git a/usr/src/man/man3c/mbrtoc16.3c b/usr/src/man/man3c/mbrtoc16.3c
new file mode 100644
index 0000000000..d1b3ab478b
--- /dev/null
+++ b/usr/src/man/man3c/mbrtoc16.3c
@@ -0,0 +1,397 @@
+.\"
+.\" This file and its contents are supplied under the terms of the
+.\" Common Development and Distribution License ("CDDL"), version 1.0.
+.\" You may only use this file in accordance with the terms of version
+.\" 1.0 of the CDDL.
+.\"
+.\" A full copy of the text of the CDDL should have accompanied this
+.\" source. A copy of the CDDL is also available via the Internet at
+.\" http://www.illumos.org/license/CDDL.
+.\"
+.\"
+.\" Copyright 2020 Robert Mustacchi
+.\"
+.Dd April 23, 2020
+.Dt MBRTOC16 3C
+.Os
+.Sh NAME
+.Nm mbrtoc16 ,
+.Nm mbrtoc32 ,
+.Nm mbrtowc ,
+.Nm mbrtowc_l
+.Nd convert characters to wide characters
+.Sh SYNOPSIS
+.In wchar.h
+.Ft size_t
+.Fo mbrtowc
+.Fa "wchar_t *restrict pwc"
+.Fa "const char *restrict str"
+.Fa "size_t len"
+.Fa "mbstate_t *restrict ps"
+.Fc
+.In wchar.h
+.In xlocale.h
+.Ft size_t
+.Fo mbrtowc_l
+.Fa "wchar_t *restrict pwc"
+.Fa "const char *restrict str"
+.Fa "size_t len"
+.Fa "mbstate_t *restrict ps"
+.Fa "locale_t loc"
+.Fc
+.In uchar.h
+.Ft size_t
+.Fo mbrtoc16
+.Fa "char16_t *restrict p16c"
+.Fa "const char *restrict str"
+.Fa "size_t len"
+.Fa "mbstate_t *restrict ps"
+.Fc
+.Ft size_t
+.Fo mbrtoc32
+.Fa "char32_t *restrict p32c"
+.Fa "const char *restrict str"
+.Fa "size_t len"
+.Fa "mbstate_t *restrict ps"
+.Fc
+.Sh DESCRIPTION
+The
+.Fn mbrtoc16 ,
+.Fn mbrtoc32 ,
+.Fn mbrtowc ,
+and
+.Fn mbrtowc_l
+functions convert character sequences, which may contain multi-byte
+characters, into different character formats.
+The functions work in the following formats:
+.Bl -tag -width mbrtowc_l
+.It Fn mbrtoc16
+A UTF-16 code sequence, where every code point is represented by one or
+two
+.Vt char16_t .
+The UTF-16 encoding will encode certain Unicode code points as a pair of
+two 16-bit code sequences, commonly referred to as a surrogate pair.
+.It Fn mbrtoc32
+A UTF-32 code sequence, where every code point is represented by a
+single
+.Vt char32_t .
+.It Fn mbrtowc , Fn mbrtowc_l
+Wide characters, being a 32-bit value where every code point is
+represented by a single
+.Vt wchar_t .
+While the
+.Vt wchar_t
+and
+.Vt char32_t
+are different types, in this implementation, they are similar encodings.
+.El
+.Pp
+The functions consume up to
+.Fa len
+bytes from the string
+.Fa str
+and accumulate them in
+.Fa ps
+until a valid character is found, which is influenced by
+the
+.Dv LC_CTYPE
+category of the current locale.
+For example, in the
+.Sy C
+locale, only ASCII characters are recognized, while in a
+.Sy UTF-8
+based locale like
+.Sy en_US.UTF-8 ,
+UTF-8 multi-byte character sequences that represent Unicode code points
+are recognized.
+The
+.Fn mbrtowc_l
+function uses the locale passed in
+.Fa loc
+rather than the locale of the current thread.
+.Pp
+When a valid character sequence has been found, it is converted to
+either a 16-bit character sequence for
+.Fn mbrtoc16
+or a 32-bit character sequence for
+.Fn mbrtoc32
+and will be stored in
+.Fa p16c
+and
+.Fa p32c
+respectively.
+.Pp
+The
+.Fa ps
+argument represents a multi-byte conversion state which can be used
+across multiple calls to a given function
+.Pq but not mixed between functions .
+These allow for characters to be consumed from subsequent buffers, e.g.
+different values of
+.Fa str .
+The functions may be called from multiple threads as long as they use
+unique values for
+.Fa ps .
+If
+.Fa ps
+is
+.Dv NULL ,
+then a function-specific buffer will be used for the conversion state;
+however, this buffer is shared between all threads and its use is not
+recommended.
+.Pp
+When using these functions, more than one character may be output for a
+given set of consumed input characters.
+An example of this is when a given code point is represented as a
+surrogate pair in UTF-16, which requires two 16-bit characters to
+represent a code point.
+When this occurs, the functions return the special return value
+.Sy -3 .
+.Pp
+The functions all have a special behavior when
+.Dv NULL
+is passed for
+.Fa str .
+They instead will treat it as though
+.Fa pwc ,
+.Fa p16c ,
+or
+.Fa p32c
+were
+.Dv NULL ,
+.Fa str
+had been passed as the empty string, "" and the length,
+.Fa len ,
+would appear as the value 1.
+In other words, the functions would be called as:
+.Bd -literal -offset indent
+mbrtowc(NULL, "", 1, ps)
+mbrtowc_l(NULL, "", 1, ps, loc)
+mbrtoc16(NULL, "", 1, ps)
+mbrtoc32(NULL, "", 1, ps)
+.Ed
+.Ss Locale Details
+Not all locales in the system are Unicode based locales.
+For example, ISO 8859 family locales have code points with values that
+do not match their counterparts in Unicode.
+When using these functions with non-Unicode based locales, the code
+points returned will be those determined by the locale.
+They will not be converted to the corresponding Unicode code point.
+For example, if using the Euro sign in ISO 8859-15, these functions
+might return the code point 0xa4 and not the Unicode value 0x20ac.
+.Pp
+Regardless of the locale, the characters returned will be encoded as
+though the code point were the corresponding value in Unicode.
+This means that if a locale returns a value that would be a surrogate
+pair in the UTF-16 encoding, it will still be encoded as a UTF-16
+character.
+.Pp
+This behavior of the
+.Fn mbrtoc16
+and
+.Fn mbrtoc32
+functions should not be relied upon, is not portable, and is subject to
+change for non-Unicode locales.
+.Sh RETURN VALUES
+The
+.Fn mbrtoc16 ,
+.Fn mbrtoc32 ,
+.Fn mbrtowc ,
+and
+.Fn mbrtowc_l
+functions return the following values:
+.Bl -tag -width (size_t)-3
+.It Sy 0
+.Fa len
+or fewer bytes of
+.Fa str
+were consumed and the null wide character was written into the wide
+character buffer
+.Po
+.Fa pwc ,
+.Fa p16c ,
+.Fa p32c
+.Pc .
+.It Sy between 1 and len
+The specified number of bytes were consumed and a single character was
+written into the wide character buffer
+.Po
+.Fa pwc ,
+.Fa p16c ,
+.Fa p32c
+.Pc .
+.It Sy (size_t)-1
+An encoding error has occurred.
+The next
+.Fa len
+bytes of
+.Fa str
+do not contribute to a valid character.
+.Va errno
+has been set to
+.Er EILSEQ .
+No data was written into the wide character buffer
+.Po
+.Fa pwc ,
+.Fa p16c ,
+.Fa p32c
+.Pc .
+.It Sy (size_t)-2
+.Fa len
+bytes of
+.Fa str
+were consumed, but a complete multi-byte character sequence has not been
+found and no data was written into the wide character buffer
+.Po
+.Fa pwc ,
+.Fa p16c ,
+.Fa p32c
+.Pc .
+.It Sy (size_t)-3
+A character has been written into the wide character buffer
+.Po
+.Fa pwc ,
+.Fa p16c ,
+.Fa p32c
+.Pc .
+This character was from a previous call (such as another part of a
+UTF-16 surrogate pair) and no input was consumed.
+This is limited to the
+.Fn mbrtoc16
+and
+.Fn mbrtoc32
+functions.
+.El
+.Sh EXAMPLES
+.Sy Example 1
+Using the
+.Fn mbrtoc32
+function to convert a multibyte string.
+.Bd -literal
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <stdio.h>
+#include <uchar.h>
+
+int
+main(void)
+{
+ mbstate_t mbs;
+ char32_t out;
+ size_t ret;
+ const char *uchar_str = "\exe5\ex85\ex89";
+
+ (void) memset(&mbs, 0, sizeof (mbs));
+ (void) setlocale(LC_CTYPE, "en_US.UTF-8");
+ ret = mbrtoc32(&out, uchar_str, strlen(uchar_str), &mbs);
+ if (ret != strlen(uchar_str)) {
+ errx(EXIT_FAILURE, "failed to convert string, got %zd",
+ ret);
+ }
+
+ (void) printf("Converted %zu bytes into UTF-32 character "
+ "0x%x\n", ret, out);
+ return (0);
+}
+.Ed
+.Pp
+When compiled and run, this produces:
+.Bd -literal -offset indent
+$ ./a.out
+Converted 3 bytes into UTF-32 character 0x5149
+.Ed
+.Pp
+.Sy Example 2
+Handling surrogate pairs from the
+.Fn mbrtoc16
+function.
+.Bd -literal
+#include <locale.h>
+#include <stdlib.h>
+#include <string.h>
+#include <err.h>
+#include <stdio.h>
+#include <uchar.h>
+
+int
+main(void)
+{
+ mbstate_t mbs;
+ char16_t first, second;
+ size_t ret;
+ const char *uchar_str = "\exf0\ex9f\ex92\exa9";
+
+ (void) memset(&mbs, '\0', sizeof (mbs));
+ (void) setlocale(LC_CTYPE, "en_US.UTF-8");
+ ret = mbrtoc16(&first, uchar_str, strlen(uchar_str), &mbs);
+ if (ret != strlen(uchar_str)) {
+ errx(EXIT_FAILURE, "failed to convert string, got %zd",
+ ret);
+ }
+
+ ret = mbrtoc16(&second, "", 0, &mbs);
+ if (ret != (size_t)-3) {
+ errx(EXIT_FAILURE, "didn't get second surrogate pair, "
+ "got %zd", ret);
+ }
+
+ (void) printf("UTF-16 surrogates: 0x%x 0x%x\n", first, second);
+ return (0);
+}
+.Ed
+.Pp
+When compiled and run, this produces:
+.Bd -literal -offset indent
+$ ./a.out
+UTF-16 surrogates: 0xd83d 0xdca9
+.Ed
+.Sh ERRORS
+The
+.Fn mbrtoc16 ,
+.Fn mbrtoc32 ,
+.Fn mbrtowc ,
+and
+.Fn mbrtowc_l
+functions will fail if:
+.Bl -tag -width Er
+.It Er EINVAL
+The conversion state in
+.Fa ps
+is invalid.
+.It Er EILSEQ
+An invalid character sequence has been detected.
+.El
+.Sh MT-LEVEL
+The
+.Fn mbrtoc16 ,
+.Fn mbrtoc32 ,
+.Fn mbrtowc ,
+and
+.Fn mbrtowc_l
+functions are
+.Sy MT-Safe
+as long as different
+.Vt mbstate_t
+structures are passed in
+.Fa ps .
+If
+.Fa ps
+is
+.Dv NULL
+or different threads use the same value for
+.Fa ps ,
+then the functions are
+.Sy Unsafe .
+.Sh INTERFACE STABILITY
+.Sy Committed
+.Sh SEE ALSO
+.Xr c16rtomb 3C ,
+.Xr c32rtomb 3C ,
+.Xr newlocale 3C ,
+.Xr setlocale 3C ,
+.Xr uselocale 3C ,
+.Xr wcrtomb 3C ,
+.Xr uchar.h 3HEAD ,
+.Xr environ 5