summary | refs | log | tree | commit | diff
path: root/ext/mbstring/php_unicode.c
diff options
context:
space:
mode:
Diffstat (limited to 'ext/mbstring/php_unicode.c')
-rw-r--r-- ext/mbstring/php_unicode.c 313
1 files changed, 313 insertions, 0 deletions
diff --git a/ext/mbstring/php_unicode.c b/ext/mbstring/php_unicode.c
new file mode 100644
index 000000000..42ecaa8d5
--- /dev/null
+++ b/ext/mbstring/php_unicode.c
@@ -0,0 +1,313 @@
+/*
+ +----------------------------------------------------------------------+
+ | PHP Version 5 |
+ +----------------------------------------------------------------------+
+ | Copyright (c) 1997-2004 The PHP Group |
+ +----------------------------------------------------------------------+
+ | This source file is subject to version 3.0 of the PHP license, |
+ | that is bundled with this package in the file LICENSE, and is |
+ | available through the world-wide-web at the following url: |
+ | http://www.php.net/license/3_0.txt. |
+ | If you did not receive a copy of the PHP license and are unable to |
+ | obtain it through the world-wide-web, please send a note to |
+ | license@php.net so we can mail you a copy immediately. |
+ +----------------------------------------------------------------------+
+ | Author: Wez Furlong (wez@thebrainroom.com) |
+ +----------------------------------------------------------------------+
+
+ Based on code from ucdata-2.5, which has the following Copyright:
+
+ Copyright 2001 Computing Research Labs, New Mexico State University
+
+ Permission is hereby granted, free of charge, to any person obtaining a
+ copy of this software and associated documentation files (the "Software"),
+ to deal in the Software without restriction, including without limitation
+ the rights to use, copy, modify, merge, publish, distribute, sublicense,
+ and/or sell copies of the Software, and to permit persons to whom the
+ Software is furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+*/
+
+#ifdef HAVE_CONFIG_H
+#include "config.h"
+#endif
+
+#include "php.h"
+#include "php_ini.h"
+
+#if HAVE_MBSTRING
+
+/* include case folding data generated from the official UnicodeData.txt file */
+#include "mbstring.h"
+#include "php_unicode.h"
+#include "unicode_data.h"
+
/*
 * Single-bit masks indexed by bit position: masks32[i] has only bit i set,
 * for i in 0..31.  The table is never written to, so it is declared const.
 */
static const unsigned long masks32[32] = {
	0x00000001UL, 0x00000002UL, 0x00000004UL, 0x00000008UL,
	0x00000010UL, 0x00000020UL, 0x00000040UL, 0x00000080UL,
	0x00000100UL, 0x00000200UL, 0x00000400UL, 0x00000800UL,
	0x00001000UL, 0x00002000UL, 0x00004000UL, 0x00008000UL,
	0x00010000UL, 0x00020000UL, 0x00040000UL, 0x00080000UL,
	0x00100000UL, 0x00200000UL, 0x00400000UL, 0x00800000UL,
	0x01000000UL, 0x02000000UL, 0x04000000UL, 0x08000000UL,
	0x10000000UL, 0x20000000UL, 0x40000000UL, 0x80000000UL
};
+
+
+static int prop_lookup(unsigned long code, unsigned long n)
+{
+ long l, r, m;
+
+ /*
+ * There is an extra node on the end of the offsets to allow this routine
+ * to work right. If the index is 0xffff, then there are no nodes for the
+ * property.
+ */
+ if ((l = _ucprop_offsets[n]) == 0xffff)
+ return 0;
+
+ /*
+ * Locate the next offset that is not 0xffff. The sentinel at the end of
+ * the array is the max index value.
+ */
+ for (m = 1; n + m < _ucprop_size && _ucprop_offsets[n + m] == 0xffff; m++)
+ ;
+
+ r = _ucprop_offsets[n + m] - 1;
+
+ while (l <= r) {
+ /*
+ * Determine a "mid" point and adjust to make sure the mid point is at
+ * the beginning of a range pair.
+ */
+ m = (l + r) >> 1;
+ m -= (m & 1);
+ if (code > _ucprop_ranges[m + 1])
+ l = m + 2;
+ else if (code < _ucprop_ranges[m])
+ r = m - 2;
+ else if (code >= _ucprop_ranges[m] && code <= _ucprop_ranges[m + 1])
+ return 1;
+ }
+ return 0;
+
+}
+
+MBSTRING_API int php_unicode_is_prop(unsigned long code, unsigned long mask1,
+ unsigned long mask2)
+{
+ unsigned long i;
+
+ if (mask1 == 0 && mask2 == 0)
+ return 0;
+
+ for (i = 0; mask1 && i < 32; i++) {
+ if ((mask1 & masks32[i]) && prop_lookup(code, i))
+ return 1;
+ }
+
+ for (i = 32; mask2 && i < _ucprop_size; i++) {
+ if ((mask2 & masks32[i & 31]) && prop_lookup(code, i))
+ return 1;
+ }
+
+ return 0;
+}
+
+static unsigned long case_lookup(unsigned long code, long l, long r, int field)
+{
+ long m;
+
+ /*
+ * Do the binary search.
+ */
+ while (l <= r) {
+ /*
+ * Determine a "mid" point and adjust to make sure the mid point is at
+ * the beginning of a case mapping triple.
+ */
+ m = (l + r) >> 1;
+ m -= (m % 3);
+ if (code > _uccase_map[m])
+ l = m + 3;
+ else if (code < _uccase_map[m])
+ r = m - 3;
+ else if (code == _uccase_map[m])
+ return _uccase_map[m + field];
+ }
+
+ return code;
+}
+
+MBSTRING_API unsigned long php_unicode_toupper(unsigned long code)
+{
+ int field;
+ long l, r;
+
+ if (php_unicode_is_upper(code))
+ return code;
+
+ if (php_unicode_is_lower(code)) {
+ /*
+ * The character is lower case.
+ */
+ field = 2;
+ l = _uccase_len[0];
+ r = (l + _uccase_len[1]) - 3;
+ } else {
+ /*
+ * The character is title case.
+ */
+ field = 1;
+ l = _uccase_len[0] + _uccase_len[1];
+ r = _uccase_size - 3;
+ }
+ return case_lookup(code, l, r, field);
+}
+
+MBSTRING_API unsigned long php_unicode_tolower(unsigned long code)
+{
+ int field;
+ long l, r;
+
+ if (php_unicode_is_lower(code))
+ return code;
+
+ if (php_unicode_is_upper(code)) {
+ /*
+ * The character is upper case.
+ */
+ field = 1;
+ l = 0;
+ r = _uccase_len[0] - 3;
+ } else {
+ /*
+ * The character is title case.
+ */
+ field = 2;
+ l = _uccase_len[0] + _uccase_len[1];
+ r = _uccase_size - 3;
+ }
+ return case_lookup(code, l, r, field);
+}
+
+MBSTRING_API unsigned long php_unicode_totitle(unsigned long code)
+{
+ int field;
+ long l, r;
+
+ if (php_unicode_is_title(code))
+ return code;
+
+ /*
+ * The offset will always be the same for converting to title case.
+ */
+ field = 2;
+
+ if (php_unicode_is_upper(code)) {
+ /*
+ * The character is upper case.
+ */
+ l = 0;
+ r = _uccase_len[0] - 3;
+ } else {
+ /*
+ * The character is lower case.
+ */
+ l = _uccase_len[0];
+ r = (l + _uccase_len[1]) - 3;
+ }
+ return case_lookup(code, l, r, field);
+
+}
+
+
/*
 * Assemble the four bytes at `ptr` (most significant byte first) into one
 * unsigned value in the range 0..0xFFFFFFFF.
 *
 * Each byte is widened to unsigned long BEFORE it is shifted.  Shifting the
 * promoted int instead would move a first byte >= 0x80 into the sign bit
 * (undefined behaviour) and yield a negative int that sign-extends when it
 * is passed on as an unsigned long.
 *
 * Note: `ptr` is evaluated four times; do not pass an expression with side
 * effects.
 */
#define BE_ARY_TO_UINT32(ptr) (\
	((unsigned long)((const unsigned char *)(ptr))[0] << 24) |\
	((unsigned long)((const unsigned char *)(ptr))[1] << 16) |\
	((unsigned long)((const unsigned char *)(ptr))[2] <<  8) |\
	((unsigned long)((const unsigned char *)(ptr))[3]      ) )

/*
 * Store the low 32 bits of `val` into the four bytes at `ptr`, most
 * significant byte first.
 *
 * `val` is evaluated exactly once, before any byte is written, so it may
 * safely read from the same four bytes.  `ptr` is evaluated four times.
 * The do { } while (0) wrapper makes the macro behave as a single statement
 * (it must be followed by a semicolon, as every statement is).
 */
#define UINT32_TO_BE_ARY(ptr, val) do { \
	unsigned long be_value_ = (unsigned long)(val); \
	((unsigned char *)(ptr))[0] = (unsigned char)((be_value_ >> 24) & 0xff); \
	((unsigned char *)(ptr))[1] = (unsigned char)((be_value_ >> 16) & 0xff); \
	((unsigned char *)(ptr))[2] = (unsigned char)((be_value_ >>  8) & 0xff); \
	((unsigned char *)(ptr))[3] = (unsigned char)((be_value_      ) & 0xff); \
} while (0)
+
+MBSTRING_API char *php_unicode_convert_case(int case_mode, char *srcstr, size_t srclen, size_t *ret_len,
+ char *src_encoding TSRMLS_DC)
+{
+ char *unicode, *newstr;
+ size_t unicode_len;
+ unsigned char *unicode_ptr;
+ size_t i;
+
+ unicode = php_mb_convert_encoding(srcstr, srclen, "UCS-4BE", src_encoding, &unicode_len TSRMLS_CC);
+ if (unicode == NULL)
+ return NULL;
+
+ unicode_ptr = unicode;
+
+ switch(case_mode) {
+ case PHP_UNICODE_CASE_UPPER:
+ for (i = 0; i < unicode_len; i+=4) {
+ UINT32_TO_BE_ARY(&unicode_ptr[i],
+ php_unicode_toupper(BE_ARY_TO_UINT32(&unicode_ptr[i])));
+ }
+ break;
+
+ case PHP_UNICODE_CASE_LOWER:
+ for (i = 0; i < unicode_len; i+=4) {
+ UINT32_TO_BE_ARY(&unicode_ptr[i],
+ php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i])));
+ }
+ break;
+
+ case PHP_UNICODE_CASE_TITLE: {
+ int mode = 0;
+
+ for (i = 0; i < unicode_len; i+=4) {
+ int res = php_unicode_is_prop(
+ BE_ARY_TO_UINT32(&unicode_ptr[i]),
+ UC_MN|UC_ME|UC_CF|UC_LM|UC_SK|UC_LU|UC_LL|UC_LT, 0);
+ if (mode) {
+ if (res) {
+ UINT32_TO_BE_ARY(&unicode_ptr[i],
+ php_unicode_tolower(BE_ARY_TO_UINT32(&unicode_ptr[i])));
+ } else {
+ mode = 0;
+ }
+ } else {
+ if (res) {
+ mode = 1;
+ UINT32_TO_BE_ARY(&unicode_ptr[i],
+ php_unicode_totitle(BE_ARY_TO_UINT32(&unicode_ptr[i])));
+ }
+ }
+ }
+ } break;
+
+ }
+
+ newstr = php_mb_convert_encoding(unicode, unicode_len, src_encoding, "UCS-4BE", ret_len TSRMLS_CC);
+ efree(unicode);
+
+ return newstr;
+}
+
+
+#endif /* HAVE_MBSTRING */
+
+/*
+ * Local variables:
+ * tab-width: 4
+ * c-basic-offset: 4
+ * End:
+ * vim600: sw=4 ts=4 fdm=marker
+ * vim<600: sw=4 ts=4
+ */