summaryrefslogtreecommitdiff
path: root/usr/src
diff options
context:
space:
mode:
Diffstat (limited to 'usr/src')
-rw-r--r--usr/src/cmd/localedef/collate.c308
-rw-r--r--usr/src/cmd/localedef/ctype.c14
-rw-r--r--usr/src/cmd/localedef/localedef.h1
-rw-r--r--usr/src/cmd/localedef/wide.c94
-rw-r--r--usr/src/lib/libc/port/locale/collate.c146
-rw-r--r--usr/src/lib/libc/port/locale/collate.h19
6 files changed, 428 insertions, 154 deletions
diff --git a/usr/src/cmd/localedef/collate.c b/usr/src/cmd/localedef/collate.c
index dbcda9886c..91356346bd 100644
--- a/usr/src/cmd/localedef/collate.c
+++ b/usr/src/cmd/localedef/collate.c
@@ -33,6 +33,67 @@
#include "collate.h"
/*
+ * Design notes.
+ *
+ * It will be extremely helpful to the reader if they have access to
+ * the localedef and locale file format specifications available.
+ * Latest versions of these are available from www.opengroup.org.
+ *
+ * The design for the collation code is a bit complex. The goal is a
+ * single collation database as described in collate.h (in
+ * libc/port/locale). However, there are some other tidbits:
+ *
+ * a) The substitution entries are now a directly indexable array. A
+ * priority elsewhere in the table is taken as an index into the
+ * substitution table if it has a high bit (COLLATE_SUBST_PRIORITY)
+ * set. (The bit is cleared and the result is the index into the
+ * table.
+ *
+ * b) We eliminate duplicate entries into the substitution table.
+ * This saves a lot of space.
+ *
+ * c) The priorities for each level are "compressed", so that each
+ * sorting level has consecutively numbered priorities starting at 1.
+ * (O is reserved for the ignore priority.) This means sort levels
+ * which only have a few distinct priorities can represent the
+ * priority level in fewer bits, which makes the strxfrm output
+ * smaller.
+ *
+ * d) We record the total number of priorities so that strxfrm can
+ * figure out how many bytes to expand a numeric priority into.
+ *
+ * e) For the UNDEFINED pass (the last pass), we record the maximum
+ * number of bits needed to uniquely prioritize these entries, so that
+ * the last pass can also use smaller strxfrm output when possible.
+ *
+ * f) Priorities with the sign bit set are verboten. This works out
+ * because no active character set needs that bit to carry significant
+ * information once the character is in wide form.
+ *
+ * To process the entire data to make the database, we actually run
+ * multiple passes over the data.
+ *
+ * The first pass, which is done at parse time, identifies elements,
+ * substitutions, and such, and records them in priority order. As
+ * some priorities can refer to other priorities, using forward
+ * references, we use a table of references indicating whether the
+ * priority's value has been resolved, or whether it is still a
+ * reference.
+ *
+ * The second pass walks over all the items in priority order, noting
+ * that they are used directly, and not just an indirect reference.
+ * This is done by creating a "weight" structure for the item. The
+ * weights are stashed in an AVL tree sorted by relative "priority".
+ *
+ * The third pass walks over all the weight structures, in priority
+ * order, and assigns a new monotonically increasing (per sort level)
+ * weight value to them. These are the values that will actually be
+ * written to the file.
+ *
+ * The fourth pass just writes the data out.
+ */
+
+/*
* In order to resolve the priorities, we create a table of priorities.
* Entries in the table can be in one of three states.
*
@@ -48,8 +109,6 @@
* this is used for forward references. A collating-symbol can never
* have this value.
*
- * The "self" field is a reference to or own index in the pri list.
- *
* The "pass" field is used during final resolution to aid in detection
* of referencing loops. (For example <A> depends on <B>, but <B> has its
* priority dependent on <A>.)
@@ -60,15 +119,19 @@ typedef enum {
REFER /* priority is a reference (index) */
} res_t;
+typedef struct weight {
+ int32_t pri;
+ int opt;
+ avl_node_t avl;
+} weight_t;
+
typedef struct priority {
res_t res;
int32_t pri;
- int32_t self;
int pass;
int lineno;
} collpri_t;
-
#define NUM_WT collinfo.directive_count
/*
@@ -110,7 +173,7 @@ struct collelem {
*/
typedef struct collchar {
wchar_t wc;
- wchar_t ref[COLL_WEIGHTS_MAX];
+ int32_t ref[COLL_WEIGHTS_MAX];
avl_node_t avl;
} collchar_t;
@@ -124,6 +187,7 @@ typedef struct {
int32_t key;
int32_t ref[COLLATE_STR_LEN];
avl_node_t avl;
+ avl_node_t avl_ref;
} subst_t;
static avl_tree_t collsyms;
@@ -132,6 +196,9 @@ static avl_tree_t elem_by_symbol;
static avl_tree_t elem_by_expand;
static avl_tree_t collchars;
static avl_tree_t substs[COLL_WEIGHTS_MAX];
+static avl_tree_t substs_ref[COLL_WEIGHTS_MAX];
+static avl_tree_t weights[COLL_WEIGHTS_MAX];
+static int32_t nweight[COLL_WEIGHTS_MAX];
/*
* This is state tracking for the ellipsis token. Note that we start
@@ -152,6 +219,7 @@ static int32_t ellipsis_weights[COLL_WEIGHTS_MAX];
* We keep a running tally of weights.
*/
static int nextpri = 1;
+static int nextsubst[COLL_WEIGHTS_MAX] = { 0 };
/*
* This array collects up the weights for each level.
@@ -191,7 +259,6 @@ new_pri(void)
prilist[i].res = UNKNOWN;
prilist[i].pri = 0;
prilist[i].pass = 0;
- prilist[i].self = i;
}
}
return (numpri++);
@@ -267,6 +334,15 @@ resolve_pri(int32_t ref)
}
static int
+weight_compare(const void *n1, const void *n2)
+{
+ int32_t k1 = ((const weight_t *)n1)->pri;
+ int32_t k2 = ((const weight_t *)n2)->pri;
+
+ return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
+}
+
+static int
collsym_compare(const void *n1, const void *n2)
{
const collsym_t *c1 = n1;
@@ -328,6 +404,17 @@ subst_compare(const void *n1, const void *n2)
return (k1 < k2 ? -1 : k1 > k2 ? 1 : 0);
}
+static int
+subst_compare_ref(const void *n1, const void *n2)
+{
+ int32_t *c1 = ((subst_t *)n1)->ref;
+ int32_t *c2 = ((subst_t *)n2)->ref;
+ int rv;
+
+ rv = wcscmp((wchar_t *)c1, (wchar_t *)c2);
+ return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0);
+}
+
void
init_collate(void)
{
@@ -347,9 +434,15 @@ init_collate(void)
avl_create(&collchars, collchar_compare, sizeof (collchar_t),
offsetof(collchar_t, avl));
- for (i = 0; i < COLL_WEIGHTS_MAX; i++)
+ for (i = 0; i < COLL_WEIGHTS_MAX; i++) {
avl_create(&substs[i], subst_compare, sizeof (subst_t),
offsetof(subst_t, avl));
+ avl_create(&substs_ref[i], subst_compare_ref,
+ sizeof (subst_t), offsetof(subst_t, avl_ref));
+ avl_create(&weights[i], weight_compare, sizeof (weight_t),
+ offsetof(weight_t, avl));
+ nweight[i] = 1;
+ }
(void) memset(&collinfo, 0, sizeof (collinfo));
@@ -796,37 +889,52 @@ add_order_ellipsis(void)
void
add_order_subst(void)
{
+ subst_t srch;
subst_t *s;
avl_index_t where;
int i;
- if ((s = calloc(sizeof (*s), 1)) == NULL) {
- errf(_("out of memory"));
- return;
+ (void) memset(&srch, 0, sizeof (srch));
+ for (i = 0; i < curr_subst; i++) {
+ srch.ref[i] = subst_weights[i];
+ subst_weights[i] = 0;
}
- s->key = new_pri();
+ s = avl_find(&substs_ref[curr_weight], &srch, &where);
- /*
- * We use a self reference for our key, but we set a high bit
- * to indicate that this is a substitution reference. This
- * will expedite table lookups later, and prevent table
- * lookups for situations that don't require it. (In short,
- * its a big win, because we can skip a lot of binary
- * searching.)
- */
- set_pri(s->key, (nextpri | COLLATE_SUBST_PRIORITY), RESOLVED);
+ if (s == NULL) {
+ if ((s = calloc(sizeof (*s), 1)) == NULL) {
+ errf(_("out of memory"));
+ return;
+ }
+ s->key = new_pri();
- for (i = 0; i < curr_subst; i++) {
- s->ref[i] = subst_weights[i];
- subst_weights[i] = 0;
+ /*
+ * We use a self reference for our key, but we set a
+ * high bit to indicate that this is a substitution
+ * reference. This will expedite table lookups later,
+ * and prevent table lookups for situations that don't
+ * require it. (In short, its a big win, because we
+ * can skip a lot of binary searching.)
+ */
+ set_pri(s->key,
+ (nextsubst[curr_weight] | COLLATE_SUBST_PRIORITY),
+ RESOLVED);
+ nextsubst[curr_weight] += 1;
+
+ for (i = 0; i < curr_subst; i++) {
+ s->ref[i] = srch.ref[i];
+ }
+
+ avl_insert(&substs_ref[curr_weight], s, where);
+
+ if (avl_find(&substs[curr_weight], s, &where) != NULL) {
+ INTERR;
+ return;
+ }
+ avl_insert(&substs[curr_weight], s, where);
}
curr_subst = 0;
- if (avl_find(&substs[curr_weight], s, &where) != NULL) {
- INTERR;
- return;
- }
- avl_insert(&substs[curr_weight], s, where);
/*
* We are using the current (unique) priority as a search key
@@ -884,6 +992,65 @@ add_subst_symbol(char *ptr)
}
void
+add_weight(int32_t ref, int pass)
+{
+ weight_t srch;
+ weight_t *w;
+ avl_index_t where;
+
+ srch.pri = resolve_pri(ref);
+
+ /* No translation of ignores */
+ if (srch.pri == 0)
+ return;
+
+ /* Substitution priorities are not weights */
+ if (srch.pri & COLLATE_SUBST_PRIORITY)
+ return;
+
+ if (avl_find(&weights[pass], &srch, &where) != NULL)
+ return;
+
+ if ((w = calloc(sizeof (*w), 1)) == NULL) {
+ errf(_("out of memory"));
+ return;
+ }
+ w->pri = srch.pri;
+ avl_insert(&weights[pass], w, where);
+}
+
+void
+add_weights(int32_t *refs)
+{
+ int i;
+ for (i = 0; i < NUM_WT; i++) {
+ add_weight(refs[i], i);
+ }
+}
+
+int32_t
+get_weight(int32_t ref, int pass)
+{
+ weight_t srch;
+ weight_t *w;
+ int32_t pri;
+
+ pri = resolve_pri(ref);
+ if (pri & COLLATE_SUBST_PRIORITY) {
+ return (pri);
+ }
+ if (pri <= 0) {
+ return (pri);
+ }
+ srch.pri = pri;
+ if ((w = avl_find(&weights[pass], &srch, NULL)) == NULL) {
+ INTERR;
+ return (-1);
+ }
+ return (w->opt);
+}
+
+void
dump_collate(void)
{
FILE *f;
@@ -894,18 +1061,56 @@ dump_collate(void)
collchar_t *cc;
subst_t *sb;
char vers[COLLATE_STR_LEN];
- collate_char_pri_t char_pri[UCHAR_MAX + 1];
- collate_large_pri_t *large;
+ collate_char_t chars[UCHAR_MAX + 1];
+ collate_large_t *large;
collate_subst_t *subst[COLL_WEIGHTS_MAX];
- collate_chain_pri_t *chain;
+ collate_chain_t *chain;
- (void) memset(&char_pri, 0, sizeof (char_pri));
+ /*
+ * We have to run throught a preliminary pass to identify all the
+ * weights that we use for each sorting level.
+ */
+ for (i = 0; i < NUM_WT; i++) {
+ add_weight(pri_ignore, i);
+ }
+ for (i = 0; i < NUM_WT; i++) {
+ for (sb = avl_first(&substs[i]); sb;
+ sb = AVL_NEXT(&substs[i], sb)) {
+ for (j = 0; sb->ref[j]; j++) {
+ add_weight(sb->ref[j], i);
+ }
+ }
+ }
+ for (ce = avl_first(&elem_by_expand);
+ ce != NULL;
+ ce = AVL_NEXT(&elem_by_expand, ce)) {
+ add_weights(ce->ref);
+ }
+ for (cc = avl_first(&collchars); cc; cc = AVL_NEXT(&collchars, cc)) {
+ add_weights(cc->ref);
+ }
+
+ /*
+ * Now we walk the entire set of weights, removing the gaps
+ * in the weights. This gives us optimum usage. The walk
+ * occurs in priority.
+ */
+ for (i = 0; i < NUM_WT; i++) {
+ weight_t *w;
+ for (w = avl_first(&weights[i]); w;
+ w = AVL_NEXT(&weights[i], w)) {
+ w->opt = nweight[i];
+ nweight[i] += 1;
+ }
+ }
+
+ (void) memset(&chars, 0, sizeof (chars));
(void) memset(vers, 0, COLLATE_STR_LEN);
- (void) snprintf(vers, sizeof (vers), COLLATE_VERSION);
+ (void) strlcpy(vers, COLLATE_VERSION, sizeof (vers));
/*
* We need to make sure we arrange for the UNDEFINED field
- * to show up.
+ * to show up. Also, set the total weight counts.
*/
for (i = 0; i < NUM_WT; i++) {
if (resolve_pri(pri_undefined[i]) == -1) {
@@ -913,8 +1118,10 @@ dump_collate(void)
/* they collate at the end of everything else */
collinfo.undef_pri[i] = COLLATE_MAX_PRIORITY;
}
+ collinfo.pri_count[i] = nweight[i];
}
+ collinfo.pri_count[NUM_WT] = max_wide();
collinfo.undef_pri[NUM_WT] = COLLATE_MAX_PRIORITY;
collinfo.directive[NUM_WT] = DIRECTIVE_UNDEFINED;
@@ -924,26 +1131,26 @@ dump_collate(void)
for (i = 0; i <= UCHAR_MAX; i++) {
if ((cc = get_collchar(i, 0)) != NULL) {
for (j = 0; j < NUM_WT; j++) {
- char_pri[i].pri[j] = resolve_pri(cc->ref[j]);
+ chars[i].pri[j] = get_weight(cc->ref[j], j);
}
} else {
for (j = 0; j < NUM_WT; j++) {
- char_pri[i].pri[j] =
- resolve_pri(pri_undefined[j]);
+ chars[i].pri[j] =
+ get_weight(pri_undefined[j], j);
}
/*
* Per POSIX, for undefined characters, we
* also have to add a last item, which is the
* character code.
*/
- char_pri[i].pri[NUM_WT] = i;
+ chars[i].pri[NUM_WT] = i;
}
}
/*
* Substitution tables
*/
- for (i = 0; i < collinfo.directive_count; i++) {
+ for (i = 0; i < NUM_WT; i++) {
collate_subst_t *st = NULL;
n = collinfo.subst_count[i] = avl_numnodes(&substs[i]);
if ((st = calloc(sizeof (collate_subst_t) * n, 1)) == NULL) {
@@ -957,8 +1164,11 @@ dump_collate(void)
/* by definition these resolve! */
INTERR;
}
+ if (st[n].key != (n | COLLATE_SUBST_PRIORITY)) {
+ INTERR;
+ }
for (j = 0; sb->ref[j]; j++) {
- st[n].pri[j] = resolve_pri(sb->ref[j]);
+ st[n].pri[j] = get_weight(sb->ref[j], i);
}
n++;
}
@@ -967,11 +1177,12 @@ dump_collate(void)
subst[i] = st;
}
+
/*
* Chains, i.e. collating elements
*/
collinfo.chain_count = avl_numnodes(&elem_by_expand);
- chain = calloc(sizeof (collate_chain_pri_t), collinfo.chain_count);
+ chain = calloc(sizeof (collate_chain_t), collinfo.chain_count);
if (chain == NULL) {
errf(_("out of memory"));
return;
@@ -981,7 +1192,7 @@ dump_collate(void)
ce = AVL_NEXT(&elem_by_expand, ce), n++) {
(void) wsncpy(chain[n].str, ce->expand, COLLATE_STR_LEN);
for (i = 0; i < NUM_WT; i++) {
- chain[n].pri[i] = resolve_pri(ce->ref[i]);
+ chain[n].pri[i] = get_weight(ce->ref[i], i);
}
}
if (n != collinfo.chain_count)
@@ -990,8 +1201,7 @@ dump_collate(void)
/*
* Large (> UCHAR_MAX) character priorities
*/
- large = calloc(sizeof (collate_large_pri_t) * avl_numnodes(&collchars),
- 1);
+ large = calloc(sizeof (collate_large_t) * avl_numnodes(&collchars), 1);
if (large == NULL) {
errf(_("out of memory"));
return;
@@ -1004,7 +1214,7 @@ dump_collate(void)
if (cc->wc <= UCHAR_MAX)
continue;
for (j = 0; j < NUM_WT; j++) {
- if ((pri = resolve_pri(cc->ref[j])) < 0) {
+ if ((pri = get_weight(cc->ref[j], j)) < 0) {
undef = 1;
}
if (undef && (pri >= 0)) {
@@ -1016,7 +1226,7 @@ dump_collate(void)
}
if (!undef) {
large[i].val = cc->wc;
- collinfo.large_pri_count = i++;
+ collinfo.large_count = i++;
}
}
@@ -1028,7 +1238,7 @@ dump_collate(void)
if ((wr_category(vers, COLLATE_STR_LEN, f) < 0) ||
(wr_category(&collinfo, sizeof (collinfo), f) < 0) ||
- (wr_category(&char_pri, sizeof (char_pri), f) < 0)) {
+ (wr_category(&chars, sizeof (chars), f) < 0)) {
return;
}
@@ -1038,11 +1248,11 @@ dump_collate(void)
return;
}
}
- sz = sizeof (collate_chain_pri_t) * collinfo.chain_count;
+ sz = sizeof (collate_chain_t) * collinfo.chain_count;
if (wr_category(chain, sz, f) < 0) {
return;
}
- sz = sizeof (collate_large_pri_t) * collinfo.large_pri_count;
+ sz = sizeof (collate_large_t) * collinfo.large_count;
if (wr_category(large, sz, f) < 0) {
return;
}
diff --git a/usr/src/cmd/localedef/ctype.c b/usr/src/cmd/localedef/ctype.c
index c7ff53b049..e7eb103701 100644
--- a/usr/src/cmd/localedef/ctype.c
+++ b/usr/src/cmd/localedef/ctype.c
@@ -244,6 +244,20 @@ dump_ctype(void)
ctn->ctype |= _ISXDIGIT;
if (strchr(" \t", (char)wc))
ctn->ctype |= _ISBLANK;
+
+ /*
+ * Technically these settings are only
+ * required for the C locale. However, it
+ * turns out that because of the historical
+ * version of isprint(), we need them for all
+ * locales as well. Note that these are not
+ * necessarily valid punctation characters in
+ * the current language, but ispunct() needs
+ * to return TRUE for them.
+ */
+ if (strchr("!\"'#$%&()*+,-./:;<=>?@[\\]^_`{|}~",
+ (char)wc))
+ ctn->ctype |= _ISPUNCT;
}
/*
diff --git a/usr/src/cmd/localedef/localedef.h b/usr/src/cmd/localedef/localedef.h
index 0af69d3f06..c7ae5ae1b3 100644
--- a/usr/src/cmd/localedef/localedef.h
+++ b/usr/src/cmd/localedef/localedef.h
@@ -134,6 +134,7 @@ int to_mbs(char *, wchar_t);
char *to_mb_string(const wchar_t *);
void set_wide_encoding(const char *);
const char *get_wide_encoding(void);
+int max_wide(void);
#define _(x) gettext(x)
#define INTERR errf(_("internal fault (%s:%d)"), __FILE__, __LINE__)
diff --git a/usr/src/cmd/localedef/wide.c b/usr/src/cmd/localedef/wide.c
index a3ec14d53a..4d43c80daf 100644
--- a/usr/src/cmd/localedef/wide.c
+++ b/usr/src/cmd/localedef/wide.c
@@ -44,6 +44,7 @@ static int tomb_mbs(char *, wchar_t);
static int (*_towide)(wchar_t *, const char *, int) = towide_none;
static int (*_tomb)(char *, wchar_t) = tomb_none;
static const char *_encoding = "NONE";
+static int _nbits = 7;
/*
* Table of supported encodings. We only bother to list the multibyte
@@ -53,45 +54,71 @@ static struct {
const char *name;
/* the name that the underlying libc implemenation uses */
const char *cname;
+ /* the maximum number of bits required for priorities */
+ int nbits;
int (*towide)(wchar_t *, const char *, int);
int (*tomb)(char *, wchar_t);
} mb_encodings[] = {
- { "UTF-8", "UTF-8", towide_utf8, tomb_utf8 },
- { "UTF8", "UTF-8", towide_utf8, tomb_utf8 },
- { "utf8", "UTF-8", towide_utf8, tomb_utf8 },
- { "utf-8", "UTF-8", towide_utf8, tomb_utf8 },
-
- { "EUC-CN", "EUC-CN", towide_euccn, tomb_mbs },
- { "eucCN", "EUC-CN", towide_euccn, tomb_mbs },
-
- { "EUC-JP", "EUC-JP", towide_eucjp, tomb_mbs },
- { "eucJP", "EUC-JP", towide_eucjp, tomb_mbs },
+ /*
+ * UTF8 values max out at 0x1fffff (although in theory there could
+ * be later extensions, but it won't happen.) This means we only need
+ * 21 bits to be able to encode the entire range of priorities.
+ */
+ { "UTF-8", "UTF-8", 21, towide_utf8, tomb_utf8 },
+ { "UTF8", "UTF-8", 21, towide_utf8, tomb_utf8 },
+ { "utf8", "UTF-8", 21, towide_utf8, tomb_utf8 },
+ { "utf-8", "UTF-8", 21, towide_utf8, tomb_utf8 },
- { "EUC-KR", "EUC-KR", towide_euckr, tomb_mbs },
- { "eucKR", "EUC-KR", towide_euckr, tomb_mbs },
+ { "EUC-CN", "EUC-CN", 16, towide_euccn, tomb_mbs },
+ { "eucCN", "EUC-CN", 16, towide_euccn, tomb_mbs },
+ /*
+ * Becuase the 3-byte form of EUC-JP use the same leading byte,
+ * only 17 bits required to provide unique priorities. (The low
+ * bit of that first byte is set.) By setting this value low,
+ * we can get by with only 3 bytes in the strxfrm expansion.
+ */
+ { "EUC-JP", "EUC-JP", 17, towide_eucjp, tomb_mbs },
+ { "eucJP", "EUC-JP", 17, towide_eucjp, tomb_mbs },
- { "EUC-TW", "EUC-TW", towide_euctw, tomb_mbs },
- { "eucTW", "EUC-TW", towide_euctw, tomb_mbs },
+ { "EUC-KR", "EUC-KR", 16, towide_euckr, tomb_mbs },
+ { "eucKR", "EUC-KR", 16, towide_euckr, tomb_mbs },
+ /*
+ * EUC-TW uses 2 bytes most of the time, but 4 bytes if the
+ * high order byte is 0x8E. However, with 4 byte encodings,
+ * the third byte will be A0-B0. So we only need to consider
+ * the lower order 24 bits for collation.
+ */
+ { "EUC-TW", "EUC-TW", 24, towide_euctw, tomb_mbs },
+ { "eucTW", "EUC-TW", 24, towide_euctw, tomb_mbs },
- { "MS_Kanji", "MSKanji", towide_mskanji, tomb_mbs },
- { "MSKanji", "MSKanji", towide_mskanji, tomb_mbs },
- { "PCK", "MSKanji", towide_mskanji, tomb_mbs },
- { "SJIS", "MSKanji", towide_mskanji, tomb_mbs },
- { "Shift_JIS", "MSKanji", towide_mskanji, tomb_mbs },
+ { "MS_Kanji", "MSKanji", 16, towide_mskanji, tomb_mbs },
+ { "MSKanji", "MSKanji", 16, towide_mskanji, tomb_mbs },
+ { "PCK", "MSKanji", 16, towide_mskanji, tomb_mbs },
+ { "SJIS", "MSKanji", 16, towide_mskanji, tomb_mbs },
+ { "Shift_JIS", "MSKanji", 16, towide_mskanji, tomb_mbs },
- { "BIG5", "BIG5", towide_big5, tomb_mbs },
- { "big5", "BIG5", towide_big5, tomb_mbs },
- { "Big5", "BIG5", towide_big5, tomb_mbs },
+ { "BIG5", "BIG5", 16, towide_big5, tomb_mbs },
+ { "big5", "BIG5", 16, towide_big5, tomb_mbs },
+ { "Big5", "BIG5", 16, towide_big5, tomb_mbs },
- { "GBK", "GBK", towide_gbk, tomb_mbs },
+ { "GBK", "GBK", 16, towide_gbk, tomb_mbs },
- { "GB18030", "GB18030", towide_gb18030, tomb_mbs },
+ /*
+ * GB18030 can get away with just 31 bits. This is because the
+ * high order bit is always set for 4 byte values, and the
+ * at least one of the other bits in that 4 byte value will
+ * be non-zero.
+ */
+ { "GB18030", "GB18030", 31, towide_gb18030, tomb_mbs },
- { "GB2312", "GB2312", towide_gb2312, tomb_mbs },
+ /*
+ * This should probably be an aliase for euc-cn, or vice versa.
+ */
+ { "GB2312", "GB2312", 16, towide_gb2312, tomb_mbs },
- { "ASCII", "ASCII", towide_none, tomb_none },
- { "US-ASCII", "ASCII", towide_none, tomb_none },
- { "646", "ASCII", towide_none, tomb_none },
+ { "ASCII", "ASCII", 7, towide_none, tomb_none },
+ { "US-ASCII", "ASCII", 7, towide_none, tomb_none },
+ { "646", "ASCII", 7, towide_none, tomb_none },
{ NULL, NULL },
};
@@ -511,7 +538,7 @@ towide_euc_impl(wchar_t *wc, const char *mb, int n,
*
* Code set 0 (ASCII): 0x21-0x7E
* Code set 1 (CNS 11643-1992 Plane 1): 0xA1A1-0xFEFE
- * Code set 2 (CNS 11643-1992 Planes 1-16): 0x8EA1A1A1-0x8EB0FEFE
+ * Code set 2: unused
* Code set 3: unused
*/
int
@@ -620,12 +647,15 @@ set_wide_encoding(const char *encoding)
_towide = towide_none;
_tomb = tomb_none;
_encoding = "NONE";
+ _nbits = 8;
for (i = 0; mb_encodings[i].name; i++) {
if (strcasecmp(encoding, mb_encodings[i].name) == 0) {
_towide = mb_encodings[i].towide;
_tomb = mb_encodings[i].tomb;
_encoding = mb_encodings[i].cname;
+ _nbits = mb_encodings[i].nbits;
+ break;
}
}
}
@@ -635,3 +665,9 @@ get_wide_encoding(void)
{
return (_encoding);
}
+
+int
+max_wide(void)
+{
+ return ((int)((1U << _nbits) - 1));
+}
diff --git a/usr/src/lib/libc/port/locale/collate.c b/usr/src/lib/libc/port/locale/collate.c
index 5ce0b847e5..5428f7aa94 100644
--- a/usr/src/lib/libc/port/locale/collate.c
+++ b/usr/src/lib/libc/port/locale/collate.c
@@ -46,16 +46,23 @@
#include "setlocale.h"
#include "ldpart.h"
-static collate_subst_t *subst_table[COLL_WEIGHTS_MAX];
-static collate_char_pri_t *char_pri_table;
-static collate_large_pri_t *large_pri_table;
-static collate_chain_pri_t *chain_pri_table;
-static char *cache = NULL;
-static size_t cachesz;
-static char collate_encoding[ENCODING_LEN + 1];
+/*
+ * See the comments in usr/src/cmd/localedef/collate.c for further
+ * information. It would also be very helpful to have a copy of the
+ * POSIX standard for collation (in the locale format manual page)
+ * handy (www.opengroup.org).
+ */
+
+static collate_subst_t *subst_table[COLL_WEIGHTS_MAX];
+static collate_char_t *char_pri_table;
+static collate_large_t *large_pri_table;
+static collate_chain_t *chain_pri_table;
+static char *cache = NULL;
+static size_t cachesz;
+static char collate_encoding[ENCODING_LEN + 1];
/* Exposed externally to other parts of libc. */
-collate_info_t *_collate_info;
+collate_info_t *_collate_info;
int _collate_load_error = 1;
@@ -126,9 +133,9 @@ _collate_load_tables(const char *encoding)
return (_LDP_ERROR);
}
- i = (sizeof (collate_char_pri_t) * (UCHAR_MAX + 1)) +
- (sizeof (collate_chain_pri_t) * chains) +
- (sizeof (collate_large_pri_t) * info->large_pri_count);
+ i = (sizeof (collate_char_t) * (UCHAR_MAX + 1)) +
+ (sizeof (collate_chain_t) * chains) +
+ (sizeof (collate_large_t) * info->large_count);
for (z = 0; z < (info->directive_count); z++) {
i += sizeof (collate_subst_t) * info->subst_count[z];
}
@@ -139,7 +146,7 @@ _collate_load_tables(const char *encoding)
}
char_pri_table = (void *)TMP;
- TMP += sizeof (collate_char_pri_t) * (UCHAR_MAX + 1);
+ TMP += sizeof (collate_char_t) * (UCHAR_MAX + 1);
for (z = 0; z < info->directive_count; z++) {
if (info->subst_count[z] > 0) {
@@ -152,10 +159,10 @@ _collate_load_tables(const char *encoding)
if (chains > 0) {
chain_pri_table = (void *)TMP;
- TMP += chains * sizeof (collate_chain_pri_t);
+ TMP += chains * sizeof (collate_chain_t);
} else
chain_pri_table = NULL;
- if (info->large_pri_count > 0)
+ if (info->large_count > 0)
large_pri_table = (void *)TMP;
else
large_pri_table = NULL;
@@ -173,49 +180,39 @@ _collate_load_tables(const char *encoding)
return (_LDP_LOADED);
}
-/*
- * Note: for performance reasons, we have expanded bsearch here. This avoids
- * function call overhead with each comparison.
- */
-
static int32_t *
substsearch(const wchar_t key, int pass)
{
- int low;
- int high;
- int next, compar;
collate_subst_t *p;
- collate_subst_t *tab;
int n = _collate_info->subst_count[pass];
if (n == 0)
return (NULL);
- tab = subst_table[pass];
- low = 0;
- high = n - 1;
- while (low <= high) {
- next = (low + high) / 2;
- p = tab + next;
- compar = key - p->key;
- if (compar == 0)
- return (p->pri);
- if (compar > 0)
- low = next + 1;
- else
- high = next - 1;
- }
- return (NULL);
+ if (pass >= _collate_info->directive_count)
+ return (NULL);
+
+ if (!(key & COLLATE_SUBST_PRIORITY))
+ return (NULL);
+
+ p = subst_table[pass] + (key & ~COLLATE_SUBST_PRIORITY);
+ assert(p->key == key);
+ return (p->pri);
}
-static collate_chain_pri_t *
+/*
+ * Note: for performance reasons, we have expanded bsearch here. This avoids
+ * function call overhead with each comparison.
+ */
+
+static collate_chain_t *
chainsearch(const wchar_t *key, int *len)
{
int low;
int high;
int next, compar, l;
- collate_chain_pri_t *p;
- collate_chain_pri_t *tab;
+ collate_chain_t *p;
+ collate_chain_t *tab;
if (_collate_info->chain_count == 0)
return (NULL);
@@ -244,16 +241,16 @@ chainsearch(const wchar_t *key, int *len)
return (NULL);
}
-static collate_large_pri_t *
+static collate_large_t *
largesearch(const wchar_t key)
{
int low = 0;
- int high = _collate_info->large_pri_count - 1;
+ int high = _collate_info->large_count - 1;
int next, compar;
- collate_large_pri_t *p;
- collate_large_pri_t *tab = large_pri_table;
+ collate_large_t *p;
+ collate_large_t *tab = large_pri_table;
- if (_collate_info->large_pri_count == 0)
+ if (_collate_info->large_count == 0)
return (NULL);
while (low <= high) {
@@ -271,15 +268,26 @@ largesearch(const wchar_t key)
}
void
-_collate_lookup(const wchar_t *t, int *len, int *pri, int which,
- int **state)
+_collate_lookup(const wchar_t *t, int *len, int *pri, int which, int **state)
{
- collate_chain_pri_t *p2;
- collate_large_pri_t *match;
+ collate_chain_t *p2;
+ collate_large_t *match;
+ collate_info_t *info = _collate_info;
int p, l;
int *sptr;
/*
+ * If this is the "last" pass for the UNDEFINED, then
+ * we just return the priority itself.
+ */
+ if (which >= info->directive_count) {
+ *pri = *t;
+ *len = 1;
+ *state = NULL;
+ return;
+ }
+
+ /*
* If we have remaining substitution data from a previous
* call, consume it first.
*/
@@ -312,7 +320,7 @@ _collate_lookup(const wchar_t *t, int *len, int *pri, int which,
*/
*pri = char_pri_table[*t].pri[which];
- } else if ((_collate_info->large_pri_count > 0) &&
+ } else if ((info->large_count > 0) &&
((match = largesearch(*t)) != NULL)) {
/*
@@ -324,11 +332,11 @@ _collate_lookup(const wchar_t *t, int *len, int *pri, int which,
/*
* Character lacks a specific definition.
*/
- if (_collate_info->directive[which] & DIRECTIVE_UNDEFINED) {
+ if (info->directive[which] & DIRECTIVE_UNDEFINED) {
/* Mask off sign bit to prevent ordering confusion. */
*pri = (*t & COLLATE_MAX_PRIORITY);
} else {
- *pri = _collate_info->undef_pri[which];
+ *pri = info->undef_pri[which];
}
/* No substitutions for undefined characters! */
return;
@@ -346,8 +354,7 @@ _collate_lookup(const wchar_t *t, int *len, int *pri, int which,
* to be substituted be unique for that level. The localedef
* code ensures this for us.
*/
- if ((*pri & COLLATE_SUBST_PRIORITY) &&
- (sptr = substsearch(*pri, which)) != NULL) {
+ if ((sptr = substsearch(*pri, which)) != NULL) {
if ((*pri = *sptr) != 0) {
sptr++;
*state = *sptr ? sptr : NULL;
@@ -489,14 +496,21 @@ fail:
#define XFRM_MASK ((1 << XFRM_SHIFT) - 1)
#define XFRM_SEP ('.') /* chosen to be less than XFRM_OFFSET */
-static void
-xfrm(unsigned char *p, int pri)
+static int
+xfrm(unsigned char *p, int pri, int pass)
{
- int i;
- for (i = 0; i < XFRM_BYTES; i++) {
- p[i] = (pri & XFRM_MASK) + XFRM_OFFSET;
+ /* we use unsigned to ensure zero fill on right shift */
+ uint32_t val = (uint32_t)_collate_info->pri_count[pass];
+ int nc = 0;
+
+ while (val) {
+ *p = (pri & XFRM_MASK) + XFRM_OFFSET;
pri >>= XFRM_SHIFT;
+ val >>= XFRM_SHIFT;
+ p++;
+ nc++;
}
+ return (nc);
}
size_t
@@ -569,10 +583,9 @@ _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
pri = COLLATE_MAX_PRIORITY;
}
+ b = xfrm(buf, pri, pass);
+ want += b;
if (room) {
- b = XFRM_BYTES;
- xfrm(buf, pri);
-
while (b) {
b--;
if (room)
@@ -580,7 +593,6 @@ _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
room--;
}
}
- want += XFRM_BYTES;
need = want;
}
} else {
@@ -594,9 +606,10 @@ _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
}
continue;
}
+
+ b = xfrm(buf, pri, pass);
+ want += b;
if (room) {
- b = XFRM_BYTES;
- xfrm(buf, pri);
while (b) {
b--;
@@ -605,7 +618,6 @@ _collate_sxfrm(const wchar_t *src, char *xf, size_t room)
room--;
}
}
- want += XFRM_BYTES;
need = want;
}
}
diff --git a/usr/src/lib/libc/port/locale/collate.h b/usr/src/lib/libc/port/locale/collate.h
index 02bf4be6f3..a175ce45bf 100644
--- a/usr/src/lib/libc/port/locale/collate.h
+++ b/usr/src/lib/libc/port/locale/collate.h
@@ -33,7 +33,7 @@
#include <limits.h>
#define COLLATE_STR_LEN 24 /* should be 64-bit multiple */
-#define COLLATE_VERSION "I1.0\n"
+#define COLLATE_VERSION "IllumosCollate2\n"
#define COLLATE_MAX_PRIORITY (0x7fffffff) /* max signed value */
#define COLLATE_SUBST_PRIORITY (0x40000000) /* bit indicates subst table */
@@ -65,26 +65,27 @@
typedef struct collate_info {
uint8_t directive_count;
uint8_t directive[COLL_WEIGHTS_MAX];
+ int32_t pri_count[COLL_WEIGHTS_MAX];
int32_t flags;
int32_t chain_count;
- int32_t large_pri_count;
+ int32_t large_count;
int32_t subst_count[COLL_WEIGHTS_MAX];
int32_t undef_pri[COLL_WEIGHTS_MAX];
} collate_info_t;
-typedef struct collate_char_pri {
+typedef struct collate_char {
int32_t pri[COLL_WEIGHTS_MAX];
-} collate_char_pri_t;
+} collate_char_t;
-typedef struct collate_chain_pri {
+typedef struct collate_chain {
wchar_t str[COLLATE_STR_LEN];
int32_t pri[COLL_WEIGHTS_MAX];
-} collate_chain_pri_t;
+} collate_chain_t;
-typedef struct collate_large_pri {
+typedef struct collate_large {
int32_t val;
- collate_char_pri_t pri;
-} collate_large_pri_t;
+ collate_char_t pri;
+} collate_large_t;
typedef struct collate_subst {
int32_t key;