summaryrefslogtreecommitdiff
path: root/srclib/apr/misc/win32/utf8.c
diff options
context:
space:
mode:
authorStefan Fritsch <sf@sfritsch.de>2011-12-27 19:42:03 +0100
committerStefan Fritsch <sf@sfritsch.de>2011-12-27 19:42:03 +0100
commit80db94fff6a9620fb469ee911347ed973e3f7735 (patch)
tree35ccde4018b7e6b84103e5e85dc1085ef9e7d6c2 /srclib/apr/misc/win32/utf8.c
downloadapache2-80db94fff6a9620fb469ee911347ed973e3f7735.tar.gz
Upstream tarball 2.2.3upstream/2.2.3
Diffstat (limited to 'srclib/apr/misc/win32/utf8.c')
-rw-r--r--srclib/apr/misc/win32/utf8.c254
1 files changed, 254 insertions, 0 deletions
diff --git a/srclib/apr/misc/win32/utf8.c b/srclib/apr/misc/win32/utf8.c
new file mode 100644
index 00000000..d77e656d
--- /dev/null
+++ b/srclib/apr/misc/win32/utf8.c
@@ -0,0 +1,254 @@
+/* Copyright 2000-2005 The Apache Software Foundation or its licensors, as
+ * applicable.
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "apr.h"
+#include "apr_private.h"
+#include "apr_errno.h"
+#include "apr_arch_utf8.h"
+
+/* Implement the design principal specified by RFC 2718 2.2.5
+ * Guidelines for new URL Schemes - within the APR.
+ *
+ * Since many architectures support unicode, and UCS2 is the most
+ * efficient storage used by those archictures, these functions
+ * exist to validate a UCS string. It is up to the operating system
+ * to determine the validitity of the string in the context of it's
+ * native language support. File systems that support filename
+ * characters of 0x80-0xff but have no support of Unicode will find
+ * this function useful only for validating the character sequences
+ * and rejecting poorly encoded strings, if RFC 2718 2.2.5 naming is
+ * desired.
+ *
+ * from RFC 2279 UTF-8, a transformation format of ISO 10646
+ *
+ * UCS-4 range (hex.) UTF-8 octet sequence (binary)
+ * 1:2 0000 0000-0000 007F 0xxxxxxx
+ * 2:2 0000 0080-0000 07FF 110XXXXx 10xxxxxx
+ * 3:2 0000 0800-0000 FFFF 1110XXXX 10Xxxxxx 10xxxxxx
+ * 4:4 0001 0000-001F FFFF 11110zXX 10XXxxxx 10xxxxxx 10xxxxxx
+ * inv 0020 0000-03FF FFFF 111110XX 10XXXxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ * inv 0400 0000-7FFF FFFF 1111110X 10XXXXxx 10xxxxxx 10xxxxxx 10xxxxxx 10xxxxxx
+ *
+ * One of the X values must be one for the encoding length to be legit.
+ * Neither the z bit, nor the final two forms, are used for ucs-2
+ *
+ * "Pairs of UCS-2 values between D800 and DFFF (surrogate pairs in
+ * Unicode parlance), being actually UCS-4 characters transformed
+ * through UTF-16, need special treatment: the UTF-16 transformation
+ * must be undone, yielding a UCS-4 character that is then transformed
+ * as above."
+ *
+ * from RFC2781 UTF-16: the compressed ISO 10646 encoding bitmask
+ *
+ * U' = U - 0x10000
+ * U' = 000000000000yyyyyyyyyyxxxxxxxxxx
+ * W1 = 110110yyyyyyyyyy
+ * W2 = 110111xxxxxxxxxx
+ *
+ * apr_conv_utf8_to_ucs2 out bytes:sizeof(in) * 1 <= Req <= sizeof(in) * 2
+ *
+ * apr_conv_ucs2_to_utf8 out words:sizeof(in) / 2 <= Req <= sizeof(in) * 3 / 2
+ */
+
+APR_DECLARE(apr_status_t) apr_conv_utf8_to_ucs2(const char *in,
+ apr_size_t *inbytes,
+ apr_wchar_t *out,
+ apr_size_t *outwords)
+{
+ apr_int64_t newch, mask;
+ apr_size_t expect, eating;
+ int ch;
+
+ while (*inbytes && *outwords)
+ {
+ ch = (unsigned char)(*in++);
+ if (!(ch & 0200)) {
+ /* US-ASCII-7 plain text
+ */
+ --*inbytes;
+ --*outwords;
+ *(out++) = ch;
+ }
+ else
+ {
+ if ((ch & 0300) != 0300) {
+ /* Multibyte Continuation is out of place
+ */
+ return APR_EINVAL;
+ }
+ else
+ {
+ /* Multibyte Sequence Lead Character
+ *
+ * Compute the expected bytes while adjusting
+ * or lead byte and leading zeros mask.
+ */
+ mask = 0340;
+ expect = 1;
+ while ((ch & mask) == mask) {
+ mask |= mask >> 1;
+ if (++expect > 3) /* (truly 5 for ucs-4) */
+ return APR_EINVAL;
+ }
+ newch = ch & ~mask;
+ eating = expect + 1;
+ if (*inbytes <= expect)
+ return APR_INCOMPLETE;
+ /* Reject values of excessive leading 0 bits
+ * utf-8 _demands_ the shortest possible byte length
+ */
+ if (expect == 1) {
+ if (!(newch & 0036))
+ return APR_EINVAL;
+ }
+ else {
+ /* Reject values of excessive leading 0 bits
+ */
+ if (!newch && !((unsigned char)*in & 0077 & (mask << 1)))
+ return APR_EINVAL;
+ if (expect == 2) {
+ /* Reject values D800-DFFF when not utf16 encoded
+ * (may not be an appropriate restriction for ucs-4)
+ */
+ if (newch == 0015 && ((unsigned char)*in & 0040))
+ return APR_EINVAL;
+ }
+ else if (expect == 3) {
+ /* Short circuit values > 110000
+ */
+ if (newch > 4)
+ return APR_EINVAL;
+ if (newch == 4 && ((unsigned char)*in & 0060))
+ return APR_EINVAL;
+ }
+ }
+ /* Where the boolean (expect > 2) is true, we will need
+ * an extra word for the output.
+ */
+ if (*outwords < (apr_size_t)(expect > 2) + 1)
+ break; /* buffer full */
+ while (expect--)
+ {
+ /* Multibyte Continuation must be legal */
+ if (((ch = (unsigned char)*(in++)) & 0300) != 0200)
+ return APR_EINVAL;
+ newch <<= 6;
+ newch |= (ch & 0077);
+ }
+ *inbytes -= eating;
+ /* newch is now a true ucs-4 character
+ *
+ * now we need to fold to ucs-2
+ */
+ if (newch < 0x10000)
+ {
+ --*outwords;
+ *(out++) = (apr_wchar_t) newch;
+ }
+ else
+ {
+ *outwords -= 2;
+ newch -= 0x10000;
+ *(out++) = (apr_wchar_t) (0xD800 | (newch >> 10));
+ *(out++) = (apr_wchar_t) (0xDC00 | (newch & 0x03FF));
+ }
+ }
+ }
+ }
+ /* Buffer full 'errors' aren't errors, the client must inspect both
+ * the inbytes and outwords values
+ */
+ return APR_SUCCESS;
+}
+
+APR_DECLARE(apr_status_t) apr_conv_ucs2_to_utf8(const apr_wchar_t *in,
+ apr_size_t *inwords,
+ char *out,
+ apr_size_t *outbytes)
+{
+ apr_int64_t newch, require;
+ apr_size_t need;
+ char *invout;
+ int ch;
+
+ while (*inwords && *outbytes)
+ {
+ ch = (unsigned short)(*in++);
+ if (ch < 0x80)
+ {
+ --*inwords;
+ --*outbytes;
+ *(out++) = (unsigned char) ch;
+ }
+ else
+ {
+ if ((ch & 0xFC00) == 0xDC00) {
+ /* Invalid Leading ucs-2 Multiword Continuation Character
+ */
+ return APR_EINVAL;
+ }
+ if ((ch & 0xFC00) == 0xD800) {
+ /* Leading ucs-2 Multiword Character
+ */
+ if (*inwords < 2) {
+ /* Missing ucs-2 Multiword Continuation Character
+ */
+ return APR_INCOMPLETE;
+ }
+ if (((unsigned short)(*in) & 0xFC00) != 0xDC00) {
+ /* Invalid ucs-2 Multiword Continuation Character
+ */
+ return APR_EINVAL;
+ }
+ newch = (ch & 0x03FF) << 10 | ((unsigned short)(*in++) & 0x03FF);
+ newch += 0x10000;
+ }
+ else {
+ /* ucs-2 Single Word Character
+ */
+ newch = ch;
+ }
+ /* Determine the absolute minimum utf-8 bytes required
+ */
+ require = newch >> 11;
+ need = 1;
+ while (require)
+ require >>= 5, ++need;
+ if (need >= *outbytes)
+ break; /* Insufficient buffer */
+ *inwords -= (need > 2) + 1;
+ *outbytes -= need + 1;
+ /* Compute the utf-8 characters in last to first order,
+ * calculating the lead character length bits along the way.
+ */
+ ch = 0200;
+ out += need + 1;
+ invout = out;
+ while (need--) {
+ ch |= ch >> 1;
+ *(--invout) = (unsigned char)(0200 | (newch & 0077));
+ newch >>= 6;
+ }
+ /* Compute the lead utf-8 character and move the dest offset
+ */
+ *(--invout) = (unsigned char)(ch | newch);
+ }
+ }
+ /* Buffer full 'errors' aren't errors, the client must inspect both
+ * the inwords and outbytes values
+ */
+ return APR_SUCCESS;
+}