diff options
| author | Gordon Ross <gwr@nexenta.com> | 2011-05-28 21:30:16 -0400 |
|---|---|---|
| committer | Dan McDonald <danmcd@omniti.com> | 2016-10-24 14:17:22 -0400 |
| commit | 48edc7cf07b5dccc3ad84bf2dafe4150bd666d60 (patch) | |
| tree | 164586150388a2a388179accfea08d51e34ab040 | |
| parent | 3db3a6b813432461e8e60af00e9ad6f0bf0d5eaf (diff) | |
| download | illumos-joyent-48edc7cf07b5dccc3ad84bf2dafe4150bd666d60.tar.gz | |
30 Need iconv
Reviewed by: Robert Mustacchi <rm@joyent.com>
Reviewed by: Patrick Mooney <patrick.mooney@joyent.com>
Reviewed by: Yuri Pankov <yuri.pankov@nexenta.com>
Approved by: Dan McDonald <danmcd@omniti.com>
| -rw-r--r-- | exception_lists/closed-bins | 1 | ||||
| -rw-r--r-- | usr/src/cmd/Makefile | 2 | ||||
| -rw-r--r-- | usr/src/cmd/iconv/Makefile | 66 | ||||
| -rw-r--r-- | usr/src/cmd/iconv/charmap.c | 559 | ||||
| -rw-r--r-- | usr/src/cmd/iconv/charmap.h | 68 | ||||
| -rw-r--r-- | usr/src/cmd/iconv/iconv_list.c | 298 | ||||
| -rw-r--r-- | usr/src/cmd/iconv/iconv_main.c | 310 | ||||
| -rw-r--r-- | usr/src/cmd/iconv/parser.y | 118 | ||||
| -rw-r--r-- | usr/src/cmd/iconv/scanner.c | 682 | ||||
| -rw-r--r-- | usr/src/pkg/manifests/system-test-utiltest.mf | 2 | ||||
| -rw-r--r-- | usr/src/test/util-tests/tests/Makefile | 3 | ||||
| -rw-r--r-- | usr/src/test/util-tests/tests/iconv/Makefile | 49 | ||||
| -rw-r--r-- | usr/src/test/util-tests/tests/iconv/iconv_test.sh | 111 |
13 files changed, 2267 insertions, 2 deletions
diff --git a/exception_lists/closed-bins b/exception_lists/closed-bins index 3b54696fa2..8a7ecbc066 100644 --- a/exception_lists/closed-bins +++ b/exception_lists/closed-bins @@ -81,6 +81,7 @@ ./usr/lib/nfs/lockd ./usr/lib/snmp ./usr/lib/snmp/mibiisa +./usr/bin/iconv ./usr/bin/kbdcomp ./usr/bin/localedef ./usr/bin/od diff --git a/usr/src/cmd/Makefile b/usr/src/cmd/Makefile index 0c8f8fcdac..779f8de7fc 100644 --- a/usr/src/cmd/Makefile +++ b/usr/src/cmd/Makefile @@ -197,6 +197,7 @@ COMMON_SUBDIRS= \ hotplugd \ hwdata \ ibd_upgrade \ + iconv \ id \ idmap \ infocmp \ @@ -595,6 +596,7 @@ MSGSUBDIRS= \ head \ hostname \ hotplug \ + iconv \ id \ idmap \ isaexec \ diff --git a/usr/src/cmd/iconv/Makefile b/usr/src/cmd/iconv/Makefile new file mode 100644 index 0000000000..9e4a83cc18 --- /dev/null +++ b/usr/src/cmd/iconv/Makefile @@ -0,0 +1,66 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+# + +PROG=iconv + +include ../Makefile.cmd +include ../Makefile.ctf + +OBJS = iconv_main.o iconv_list.o charmap.o parser.tab.o scanner.o + +SRCS = $(OBJS:%.o=%.c) + +C99MODE= $(C99_ENABLE) +LDLIBS += -lcmdutils -lavl +YFLAGS = -d -b parser +CPPFLAGS += -D_FILE_OFFSET_BITS=64 -D_LARGEFILE_SOURCE +$(RELEASE_BUILD) CPPFLAGS += -DNDEBUG + +CERRWARN += -_gcc=-Wno-unused-label + +CLEANFILES = $(OBJS) parser.tab.c parser.tab.h +CLOBBERFILES = $(PROG) $(POFILE) +PIFILES = $(OBJS:%.o=%.i) +POFILE = iconv_cmd.po + +all: $(PROG) + +install: all $(ROOTPROG) + +$(PROG): $(OBJS) + $(LINK.c) $(OBJS) -o $@ $(LDLIBS) + $(POST_PROCESS) + +$(OBJS): parser.tab.h + +parser.tab.c parser.tab.h: parser.y + $(YACC) $(YFLAGS) parser.y + +lint: $(SRCS) + $(LINT.c) $(CPPFLAGS) $(SRCS) + +clean: + $(RM) $(CLEANFILES) + +$(POFILE): $(PIFILES) + $(RM) $@ + $(RM) messages.po + $(XGETTEXT) -s $(PIFILES) + $(SED) -e '/domain/d' messages.po > $@ + $(RM) $(PIFILES) messages.po + +.KEEP_STATE: + +include ../Makefile.targ diff --git a/usr/src/cmd/iconv/charmap.c b/usr/src/cmd/iconv/charmap.c new file mode 100644 index 0000000000..5d510326c6 --- /dev/null +++ b/usr/src/cmd/iconv/charmap.c @@ -0,0 +1,559 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * CHARMAP file handling for iconv. 
+ */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <limits.h> +#include <unistd.h> +#include <alloca.h> +#include <sys/avl.h> +#include <stddef.h> +#include <unistd.h> +#include "charmap.h" +#include "parser.tab.h" +#include <assert.h> + +enum cmap_pass cmap_pass; +static avl_tree_t cmap_sym; +static avl_tree_t cmap_mbs; + +typedef struct charmap { + const char *cm_name; + struct charmap *cm_alias_of; + avl_node_t cm_avl_sym; + avl_node_t cm_avl_mbs; + int cm_warned; + int cm_frmbs_len; + int cm_tombs_len; + char cm_frmbs[MB_LEN_MAX + 1]; /* input */ + char cm_tombs[MB_LEN_MAX + 1]; /* output */ +} charmap_t; + +static void add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups); +static void add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups); + +/* + * Array of POSIX specific portable characters. + */ +static const struct { + char *name; + int ch; +} portable_chars[] = { + { "NUL", '\0' }, + { "alert", '\a' }, + { "backspace", '\b' }, + { "tab", '\t' }, + { "carriage-return", '\r' }, + { "newline", '\n' }, + { "vertical-tab", '\v' }, + { "form-feed", '\f' }, + { "space", ' ' }, + { "exclamation-mark", '!' }, + { "quotation-mark", '"' }, + { "number-sign", '#' }, + { "dollar-sign", '$' }, + { "percent-sign", '%' }, + { "ampersand", '&' }, + { "apostrophe", '\'' }, + { "left-parenthesis", '(' }, + { "right-parenthesis", '(' }, + { "asterisk", '*' }, + { "plus-sign", '+' }, + { "comma", ','}, + { "hyphen-minus", '-' }, + { "hyphen", '-' }, + { "full-stop", '.' }, + { "period", '.' }, + { "slash", '/' }, + { "solidus", '/' }, + { "zero", '0' }, + { "one", '1' }, + { "two", '2' }, + { "three", '3' }, + { "four", '4' }, + { "five", '5' }, + { "six", '6' }, + { "seven", '7' }, + { "eight", '8' }, + { "nine", '9' }, + { "colon", ':' }, + { "semicolon", ';' }, + { "less-than-sign", '<' }, + { "equals-sign", '=' }, + { "greater-than-sign", '>' }, + { "question-mark", '?' 
}, + { "commercial-at", '@' }, + { "left-square-bracket", '[' }, + { "backslash", '\\' }, + { "reverse-solidus", '\\' }, + { "right-square-bracket", ']' }, + { "circumflex", '^' }, + { "circumflex-accent", '^' }, + { "low-line", '_' }, + { "underscore", '_' }, + { "grave-accent", '`' }, + { "left-brace", '{' }, + { "left-curly-bracket", '{' }, + { "vertical-line", '|' }, + { "right-brace", '}' }, + { "right-curly-bracket", '}' }, + { "tilde", '~' }, + { "A", 'A' }, + { "B", 'B' }, + { "C", 'C' }, + { "D", 'D' }, + { "E", 'E' }, + { "F", 'F' }, + { "G", 'G' }, + { "H", 'H' }, + { "I", 'I' }, + { "J", 'J' }, + { "K", 'K' }, + { "L", 'L' }, + { "M", 'M' }, + { "N", 'N' }, + { "O", 'O' }, + { "P", 'P' }, + { "Q", 'Q' }, + { "R", 'R' }, + { "S", 'S' }, + { "T", 'T' }, + { "U", 'U' }, + { "V", 'V' }, + { "W", 'W' }, + { "X", 'X' }, + { "Y", 'Y' }, + { "Z", 'Z' }, + { "a", 'a' }, + { "b", 'b' }, + { "c", 'c' }, + { "d", 'd' }, + { "e", 'e' }, + { "f", 'f' }, + { "g", 'g' }, + { "h", 'h' }, + { "i", 'i' }, + { "j", 'j' }, + { "k", 'k' }, + { "l", 'l' }, + { "m", 'm' }, + { "n", 'n' }, + { "o", 'o' }, + { "p", 'p' }, + { "q", 'q' }, + { "r", 'r' }, + { "s", 's' }, + { "t", 't' }, + { "u", 'u' }, + { "v", 'v' }, + { "w", 'w' }, + { "x", 'x' }, + { "y", 'y' }, + { "z", 'z' }, + { NULL, 0 } +}; + +static int +cmap_compare_sym(const void *n1, const void *n2) +{ + const charmap_t *c1 = n1; + const charmap_t *c2 = n2; + int rv; + + rv = strcmp(c1->cm_name, c2->cm_name); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +/* + * In order for partial match searches to work, + * we need these sorted by mbs contents. 
+ */ +static int +cmap_compare_mbs(const void *n1, const void *n2) +{ + const charmap_t *c1 = n1; + const charmap_t *c2 = n2; + int len, rv; + + len = c1->cm_frmbs_len; + if (len < c2->cm_frmbs_len) + len = c2->cm_frmbs_len; + rv = memcmp(c1->cm_frmbs, c2->cm_frmbs, len); + if (rv < 0) + return (-1); + if (rv > 0) + return (1); + /* they match through length */ + if (c1->cm_frmbs_len < c2->cm_frmbs_len) + return (-1); + if (c2->cm_frmbs_len < c1->cm_frmbs_len) + return (1); + return (0); +} + +void +charmap_init(char *to_map, char *from_map) +{ + avl_create(&cmap_sym, cmap_compare_sym, sizeof (charmap_t), + offsetof(charmap_t, cm_avl_sym)); + + avl_create(&cmap_mbs, cmap_compare_mbs, sizeof (charmap_t), + offsetof(charmap_t, cm_avl_mbs)); + + cmap_pass = CMAP_PASS_FROM; + reset_scanner(from_map); + (void) yyparse(); + add_charmap_posix(); + + cmap_pass = CMAP_PASS_TO; + reset_scanner(to_map); + (void) yyparse(); +} + +void +charmap_dump() +{ + charmap_t *cm; + int i; + + cm = avl_first(&cmap_mbs); + while (cm != NULL) { + (void) printf("name=\"%s\"\n", cm->cm_name); + + (void) printf("\timbs=\""); + for (i = 0; i < cm->cm_frmbs_len; i++) + (void) printf("\\x%02x", cm->cm_frmbs[i] & 0xFF); + (void) printf("\"\n"); + + (void) printf("\tombs=\""); + for (i = 0; i < cm->cm_tombs_len; i++) + (void) printf("\\x%02x", cm->cm_tombs[i] & 0xFF); + (void) printf("\"\n"); + + cm = AVL_NEXT(&cmap_mbs, cm); + } +} + +/* + * We parse two charmap files: First the "from" map, where we build + * cmap_mbs and cmap_sym which we'll later use to translate the input + * stream (mbs encodings) to symbols. Second, we parse the "to" map, + * where we fill in the tombs members of entries in cmap_sym, (which + * must alread exist) used later to write the output encoding. + */ +static void +add_charmap_impl(char *sym, char *mbs, int mbs_len, int nodups) +{ + + /* + * While parsing both the "from" and "to" cmaps, + * require both the symbol and encoding. 
+ */ + if (sym == NULL || mbs == NULL) { + errf(_("invalid charmap entry")); + return; + } + + switch (cmap_pass) { + case CMAP_PASS_FROM: + add_charmap_impl_fr(sym, mbs, mbs_len, nodups); + break; + case CMAP_PASS_TO: + add_charmap_impl_to(sym, mbs, mbs_len, nodups); + break; + default: + abort(); + break; + } +} + +static void +add_charmap_impl_fr(char *sym, char *mbs, int mbs_len, int nodups) +{ + charmap_t *m, *n, *s; + avl_index_t where_sym, where_mbs; + + if ((n = calloc(1, sizeof (*n))) == NULL) { + errf(_("out of memory")); + return; + } + n->cm_name = sym; + + assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); + (void) memcpy(n->cm_frmbs, mbs, mbs_len); + n->cm_frmbs_len = mbs_len; + + m = avl_find(&cmap_mbs, n, &where_mbs); + s = avl_find(&cmap_sym, n, &where_sym); + + /* + * If we found the symbol, this is a dup. + */ + if (s != NULL) { + if (nodups) { + warn(_("%s: duplicate character symbol"), sym); + } + free(n); + return; + } + + /* + * If we found the mbs, the new one is an alias, + * which we'll add _only_ to the symbol AVL. + */ + if (m != NULL) { + /* The new one is an alias of the original. 
*/ + n->cm_alias_of = m; + avl_insert(&cmap_sym, n, where_sym); + return; + } + + avl_insert(&cmap_sym, n, where_sym); + avl_insert(&cmap_mbs, n, where_mbs); +} + +static void +add_charmap_impl_to(char *sym, char *mbs, int mbs_len, int nodups) +{ + charmap_t srch = {0}; + charmap_t *m; + + assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); + + srch.cm_name = sym; + + m = avl_find(&cmap_sym, &srch, NULL); + if (m == NULL) { + if (sflag == 0) + warn(_("%s: symbol not found"), sym); + return; + } + if (m->cm_alias_of != NULL) { + m = m->cm_alias_of; + + /* don't warn for dups with aliases */ + if (m->cm_tombs_len != 0) + return; + } + + if (m->cm_tombs_len != 0) { + if (nodups) { + warn(_("%s: duplicate encoding for"), sym); + } + return; + } + + (void) memcpy(m->cm_tombs, mbs, mbs_len); + m->cm_tombs_len = mbs_len; +} + +void +add_charmap(char *sym, char *mbs) +{ + /* mbs[0] is the length */ + int mbs_len = *mbs++; + assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); + add_charmap_impl(sym, mbs, mbs_len, 1); +} + + +/* + * This is called by the parser with start/end symbol strings (ssym, esym), + * which are allocated in the scanner (T_SYMBOL) and free'd here. 
+ */ +void +add_charmap_range(char *ssym, char *esym, char *mbs) +{ + int ls, le; + int si; + int sn, en; + int i; + int mbs_len; + char tmbs[MB_LEN_MAX+1]; + char *mb_last; + + static const char *digits = "0123456789"; + + /* mbs[0] is the length */ + mbs_len = *mbs++; + assert(0 < mbs_len && mbs_len <= MB_LEN_MAX); + (void) memcpy(tmbs, mbs, mbs_len); + mb_last = tmbs + mbs_len - 1; + + ls = strlen(ssym); + le = strlen(esym); + + if (((si = strcspn(ssym, digits)) == 0) || (si == ls) || + (strncmp(ssym, esym, si) != 0) || + (strspn(ssym + si, digits) != (ls - si)) || + (strspn(esym + si, digits) != (le - si)) || + ((sn = atoi(ssym + si)) > ((en = atoi(esym + si))))) { + errf(_("malformed charmap range")); + return; + } + + ssym[si] = 0; + for (i = sn; i <= en; i++) { + char *nn; + (void) asprintf(&nn, "%s%0*u", ssym, ls - si, i); + if (nn == NULL) { + errf(_("out of memory")); + return; + } + + add_charmap_impl(nn, tmbs, mbs_len, 1); + (*mb_last)++; + } + free(ssym); + free(esym); +} + +void +add_charmap_char(char *name, int c) +{ + char mbs[MB_LEN_MAX+1]; + + mbs[0] = c; + mbs[1] = '\0'; + add_charmap_impl(name, mbs, 1, 0); +} + +/* + * POSIX insists that certain entries be present, even when not in the + * orginal charmap file. + */ +void +add_charmap_posix(void) +{ + int i; + + for (i = 0; portable_chars[i].name; i++) { + add_charmap_char(portable_chars[i].name, portable_chars[i].ch); + } +} + +/* + * This is called with a buffer of (typically) MB_LEN_MAX bytes, + * which is potentially a multi-byte symbol, but often contains + * extra bytes. Find and return the longest match in the charmap. 
+ */ +static charmap_t * +find_mbs(const char *mbs, int len) +{ + charmap_t srch = {0}; + charmap_t *cm = NULL; + + while (len > 0) { + (void) memcpy(srch.cm_frmbs, mbs, len); + srch.cm_frmbs_len = len; + cm = avl_find(&cmap_mbs, &srch, NULL); + if (cm != NULL) + break; + len--; + } + + return (cm); +} + +/* + * Return true if this sequence matches the initial part + * of any sequence known in this charmap. + */ +static boolean_t +find_mbs_partial(const char *mbs, int len) +{ + charmap_t srch = {0}; + charmap_t *cm; + avl_index_t where; + + (void) memcpy(srch.cm_frmbs, mbs, len); + srch.cm_frmbs_len = len; + cm = avl_find(&cmap_mbs, &srch, &where); + if (cm != NULL) { + /* full match - not expected, but OK */ + return (B_TRUE); + } + cm = avl_nearest(&cmap_mbs, where, AVL_AFTER); + if (cm != NULL && 0 == memcmp(cm->cm_frmbs, mbs, len)) + return (B_TRUE); + + return (B_FALSE); +} + +/* + * Do like iconv(3), but with charmaps. + */ +size_t +cm_iconv(const char **iptr, size_t *ileft, char **optr, size_t *oleft) +{ + charmap_t *cm; + int mbs_len; + + /* Ignore state reset requests. */ + if (iptr == NULL || *iptr == NULL) + return (0); + + if (*oleft < MB_LEN_MAX) { + errno = E2BIG; + return ((size_t)-1); + } + + while (*ileft > 0 && *oleft >= MB_LEN_MAX) { + mbs_len = MB_LEN_MAX; + if (mbs_len > *ileft) + mbs_len = *ileft; + cm = find_mbs(*iptr, mbs_len); + if (cm == NULL) { + if (mbs_len < MB_LEN_MAX && + find_mbs_partial(*iptr, mbs_len)) { + /* incomplete sequence */ + errno = EINVAL; + } else { + errno = EILSEQ; + } + return ((size_t)-1); + } + assert(cm->cm_frmbs_len > 0); + if (cm->cm_tombs_len == 0) { + if (sflag == 0 && cm->cm_warned == 0) { + cm->cm_warned = 1; + warn(_("To-map does not encode <%s>\n"), + cm->cm_name); + } + if (cflag == 0) { + errno = EILSEQ; + return ((size_t)-1); + } + /* just skip this input seq. 
*/ + *iptr += cm->cm_frmbs_len; + *ileft -= cm->cm_frmbs_len; + continue; + } + + *iptr += cm->cm_frmbs_len; + *ileft -= cm->cm_frmbs_len; + (void) memcpy(*optr, cm->cm_tombs, cm->cm_tombs_len); + *optr += cm->cm_tombs_len; + *oleft -= cm->cm_tombs_len; + } + + return (0); +} diff --git a/usr/src/cmd/iconv/charmap.h b/usr/src/cmd/iconv/charmap.h new file mode 100644 index 0000000000..e2c36ea9e7 --- /dev/null +++ b/usr/src/cmd/iconv/charmap.h @@ -0,0 +1,68 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy is of the CDDL is also available via the Internet + * at http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ + +#ifndef _CHARMAP_H +#define _CHARMAP_H + +/* + * CHARMAP file handling for iconv. + */ + +/* Common header files. 
*/ +#include <stdio.h> +#include <stdlib.h> +#include <stdarg.h> +#include <sys/types.h> +#include <libintl.h> + +enum cmap_pass { + CMAP_PASS_FROM, + CMAP_PASS_TO +}; + +extern int com_char; +extern int esc_char; +extern int mb_cur_max; +extern int mb_cur_min; +extern int last_kw; +extern int verbose; +extern int yydebug; +extern int lineno; +extern int debug; +extern int warnings; +extern int cflag; +extern int sflag; + +int yyparse(void); +void yyerror(const char *); +void errf(const char *, ...); +void warn(const char *, ...); + +void reset_scanner(const char *); +void scan_to_eol(void); + +/* charmap.c - CHARMAP handling */ +void init_charmap(void); +void add_charmap(char *, char *); +void add_charmap_posix(void); +void add_charmap_range(char *, char *, char *); + +void charmap_init(char *to, char *fr); +size_t cm_iconv(const char **iptr, size_t *ileft, char **optr, size_t *oleft); +void charmap_dump(void); + +#define _(x) gettext(x) + +#endif /* _CHARMAP_H */ diff --git a/usr/src/cmd/iconv/iconv_list.c b/usr/src/cmd/iconv/iconv_list.c new file mode 100644 index 0000000000..4fac3506d8 --- /dev/null +++ b/usr/src/cmd/iconv/iconv_list.c @@ -0,0 +1,298 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2016 Nexenta Systems, Inc. All rights reserved. 
+ */ + +/* + * implement "iconv -l" + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <errno.h> +#include <limits.h> +#include <unistd.h> +#include <alloca.h> +#include <sys/avl.h> +#include <sys/list.h> +#include <sys/param.h> +#include <stddef.h> +#include <dirent.h> +#include <unistd.h> + +#define PATH_LIBICONV "/usr/lib/iconv" +#define PATH_BTABLES "/usr/lib/iconv/geniconvtbl/binarytables" +#define PATH_ALIASES "/usr/lib/iconv/alias" + +typedef struct codeset { + avl_node_t cs_node; + char *cs_name; + list_t cs_aliases; +} codeset_t; + +typedef struct csalias { + list_node_t a_node; + char *a_name; +} csalias_t; + +static avl_tree_t cs_avl; + +static void alias_destroy(csalias_t *); + +/* + * codesets + */ + +static int +cs_compare(const void *n1, const void *n2) +{ + const codeset_t *c1 = n1; + const codeset_t *c2 = n2; + int rv; + + rv = strcmp(c1->cs_name, c2->cs_name); + return ((rv < 0) ? -1 : (rv > 0) ? 1 : 0); +} + +static void +cs_insert(char *key) +{ + codeset_t tmp, *cs; + avl_index_t where; + + (void) memset(&tmp, 0, sizeof (tmp)); + tmp.cs_name = key; + + cs = avl_find(&cs_avl, &tmp, &where); + if (cs != NULL) + return; /* already there */ + + cs = calloc(1, sizeof (*cs)); + if (cs == NULL) { + perror("cs_insert:calloc"); + exit(1); + } + cs->cs_name = strdup(key); + if (cs->cs_name == NULL) { + perror("cs_insert:strdup"); + exit(1); + } + list_create(&cs->cs_aliases, sizeof (csalias_t), + offsetof(csalias_t, a_node)); + + avl_insert(&cs_avl, cs, where); +} + +const char topmatter[] = + "The following are all supported code set names. All combinations\n" + "of those names are not necessarily available for the pair of the\n" + "fromcode-tocode. 
Some of those code set names have aliases, which\n" + "are case-insensitive and described in parentheses following the\n" + "canonical name:\n"; + + +static void +cs_dump(void) +{ + codeset_t *cs; + csalias_t *a; + + (void) puts(topmatter); + + for (cs = avl_first(&cs_avl); cs != NULL; + cs = AVL_NEXT(&cs_avl, cs)) { + + (void) printf(" %s", cs->cs_name); + if (!list_is_empty(&cs->cs_aliases)) { + a = list_head(&cs->cs_aliases); + (void) printf(" (%s", a->a_name); + while ((a = list_next(&cs->cs_aliases, a)) != NULL) { + (void) printf(", %s", a->a_name); + } + (void) printf(")"); + } + (void) printf(",\n"); + } +} + +static void +cs_destroy(void) +{ + void *cookie = NULL; + codeset_t *cs; + csalias_t *a; + + while ((cs = avl_destroy_nodes(&cs_avl, &cookie)) != NULL) { + while ((a = list_remove_head(&cs->cs_aliases)) != NULL) { + alias_destroy(a); + } + free(cs->cs_name); + free(cs); + } + avl_destroy(&cs_avl); +} + +/* + * aliases + */ + +static void +alias_insert(char *codeset, char *alias) +{ + codeset_t tcs, *cs; + csalias_t *a; + + /* + * Find the codeset. If non-existent, + * ignore aliases of this codeset. + */ + (void) memset(&tcs, 0, sizeof (tcs)); + tcs.cs_name = codeset; + cs = avl_find(&cs_avl, &tcs, NULL); + if (cs == NULL) + return; + + /* + * Add this alias + */ + a = calloc(1, sizeof (*a)); + if (a == NULL) { + perror("alias_insert:calloc"); + exit(1); + } + a->a_name = strdup(alias); + if (a->a_name == NULL) { + perror("alias_insert:strdup"); + exit(1); + } + + list_insert_tail(&cs->cs_aliases, a); +} + +static void +alias_destroy(csalias_t *a) +{ + free(a->a_name); + free(a); +} + + +static void +scan_dir(DIR *dh, char sep, char *suffix) +{ + char namebuf[MAXNAMELEN]; + struct dirent *de; + + while ((de = readdir(dh)) != NULL) { + char *p2, *p1; + + /* + * We'll modify, so let's copy. If the dirent name is + * longer than MAXNAMELEN, then it can't possibly be a + * valid pair of codeset names, so just skip it. 
+ */ + if (strlcpy(namebuf, de->d_name, sizeof (namebuf)) >= + sizeof (namebuf)) + continue; + + /* Find suffix (.so | .t) */ + p2 = strrchr(namebuf, *suffix); + if (p2 == NULL) + continue; + if (strcmp(p2, suffix) != 0) + continue; + *p2 = '\0'; + + p1 = strchr(namebuf, sep); + if (p1 == NULL) + continue; + *p1++ = '\0'; + + /* More than one sep? */ + if (strchr(p1, sep) != NULL) + continue; + + /* Empty strings? */ + if (*namebuf == '\0' || *p1 == '\0') + continue; + + /* OK, add both to the map. */ + cs_insert(namebuf); + cs_insert(p1); + } +} + +static void +scan_aliases(FILE *fh) +{ + char linebuf[256]; + char *p1, *p2; + + while (fgets(linebuf, sizeof (linebuf), fh) != NULL) { + if (linebuf[0] == '#') + continue; + p1 = strchr(linebuf, ' '); + if (p1 == NULL) + continue; + *p1++ = '\0'; + p2 = strchr(p1, '\n'); + if (p2 == NULL) + continue; + *p2 = '\0'; + alias_insert(p1, linebuf); + } +} + +int +list_codesets(void) +{ + DIR *dh; + FILE *fh; + + avl_create(&cs_avl, cs_compare, sizeof (codeset_t), + offsetof(codeset_t, cs_node)); + + dh = opendir(PATH_LIBICONV); + if (dh == NULL) { + perror(PATH_LIBICONV); + return (1); + } + scan_dir(dh, '%', ".so"); + rewinddir(dh); + scan_dir(dh, '.', ".t"); + (void) closedir(dh); + + dh = opendir(PATH_BTABLES); + if (dh == NULL) { + perror(PATH_BTABLES); + return (1); + } + scan_dir(dh, '%', ".bt"); + (void) closedir(dh); + + fh = fopen(PATH_ALIASES, "r"); + if (fh == NULL) { + perror(PATH_ALIASES); + /* let's continue */ + } else { + scan_aliases(fh); + (void) fclose(fh); + } + + cs_dump(); + + cs_destroy(); + + return (0); +} diff --git a/usr/src/cmd/iconv/iconv_main.c b/usr/src/cmd/iconv/iconv_main.c new file mode 100644 index 0000000000..260d6ba9bc --- /dev/null +++ b/usr/src/cmd/iconv/iconv_main.c @@ -0,0 +1,310 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. 
+ * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * iconv(1) command. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <string.h> +#include <unistd.h> +#include <errno.h> +#include <limits.h> +#include <iconv.h> +#include <libintl.h> +#include <langinfo.h> +#include <locale.h> +#include "charmap.h" + +#include <assert.h> + +const char *progname; + +char *from_cs; +char *to_cs; +int debug; +int cflag; /* skip invalid characters */ +int sflag; /* silent */ +int lflag; /* list conversions */ + +void iconv_file(FILE *, const char *); +extern int list_codesets(void); + +iconv_t ich; /* iconv(3c) lib handle */ +size_t (*pconv)(const char **iptr, size_t *ileft, + char **optr, size_t *oleft); + +size_t +lib_iconv(const char **iptr, size_t *ileft, char **optr, size_t *oleft) +{ + return (iconv(ich, iptr, ileft, optr, oleft)); +} + +void +usage(void) +{ + (void) fprintf(stderr, gettext( + "usage: %s [-cs] [-f from-codeset] [-t to-codeset] " + "[file ...]\n"), progname); + (void) fprintf(stderr, gettext("\t%s -l\n"), progname); + exit(1); +} + +int +main(int argc, char **argv) +{ + FILE *fp; + char *fslash, *tslash; + int c; + + yydebug = 0; + progname = getprogname(); + + (void) setlocale(LC_ALL, ""); + +#if !defined(TEXT_DOMAIN) +#define TEXT_DOMAIN "SYS_TEST" +#endif + (void) textdomain(TEXT_DOMAIN); + + while ((c = getopt(argc, argv, "cdlsf:t:")) != EOF) { + switch (c) { + case 'c': + cflag++; + break; + case 'd': + debug++; + break; + case 'l': + lflag++; + break; + case 's': + sflag++; + break; + case 'f': + from_cs = optarg; + break; + case 't': + to_cs = optarg; + break; + case '?': + usage(); + } + } + + if (lflag) { + if (from_cs != NULL || to_cs != 
NULL || optind != argc) + usage(); + exit(list_codesets()); + } + + if (from_cs == NULL) + from_cs = nl_langinfo(CODESET); + if (to_cs == NULL) + to_cs = nl_langinfo(CODESET); + + /* + * If either "from" or "to" contains a slash, + * then we're using charmaps. + */ + fslash = strchr(from_cs, '/'); + tslash = strchr(to_cs, '/'); + if (fslash != NULL || tslash != NULL) { + charmap_init(to_cs, from_cs); + pconv = cm_iconv; + if (debug) + charmap_dump(); + } else { + ich = iconv_open(to_cs, from_cs); + if (ich == ((iconv_t)-1)) { + switch (errno) { + case EINVAL: + (void) fprintf(stderr, + _("Not supported %s to %s\n"), + from_cs, to_cs); + break; + default: + (void) fprintf(stderr, + _("iconv_open failed: %s\n"), + strerror(errno)); + break; + } + exit(1); + } + pconv = lib_iconv; + } + + if (optind == argc || + (optind == argc - 1 && 0 == strcmp(argv[optind], "-"))) { + iconv_file(stdin, "stdin"); + exit(warnings ? 1 : 0); + } + + for (; optind < argc; optind++) { + fp = fopen(argv[optind], "r"); + if (fp == NULL) { + perror(argv[optind]); + exit(1); + } + iconv_file(fp, argv[optind]); + (void) fclose(fp); + } + exit(warnings ? 1 : 0); +} + +/* + * Conversion buffer sizes: + * + * The input buffer has room to prepend one mbs character if needed for + * handling a left-over at the end of a previous conversion buffer. + * + * Conversions may grow or shrink data, so using a larger output buffer + * to reduce the likelihood of leftover input buffer data in each pass. 
+ */ +#define IBUFSIZ (MB_LEN_MAX + BUFSIZ) +#define OBUFSIZ (2 * BUFSIZ) + +void +iconv_file(FILE *fp, const char *fname) +{ + static char ibuf[IBUFSIZ]; + static char obuf[OBUFSIZ]; + const char *iptr; + char *optr; + off64_t offset; + size_t ileft, oleft, ocnt; + int iconv_errno; + int nr, nw, rc; + + offset = 0; + ileft = 0; + iptr = ibuf + MB_LEN_MAX; + + while ((nr = fread(ibuf+MB_LEN_MAX, 1, BUFSIZ, fp)) > 0) { + + assert(iptr <= ibuf+MB_LEN_MAX); + assert(ileft <= MB_LEN_MAX); + ileft += nr; + offset += nr; + + optr = obuf; + oleft = OBUFSIZ; + + /* + * Note: the *pconv function is either iconv(3c) or our + * private equivalent when using charmaps. Both update + * ileft, oleft etc. even when conversion stops due to + * an illegal sequence or whatever, so we need to copy + * the partially converted buffer even on error. + */ + iconv_again: + rc = (*pconv)(&iptr, &ileft, &optr, &oleft); + iconv_errno = errno; + + ocnt = OBUFSIZ - oleft; + if (ocnt > 0) { + nw = fwrite(obuf, 1, ocnt, stdout); + if (nw != ocnt) { + perror("fwrite"); + exit(1); + } + } + optr = obuf; + oleft = OBUFSIZ; + + if (rc == (size_t)-1) { + switch (iconv_errno) { + + case E2BIG: /* no room in output buffer */ + goto iconv_again; + + case EINVAL: /* incomplete sequence on input */ + if (debug) { + (void) fprintf(stderr, + _("Incomplete sequence in %s at offset %lld\n"), + fname, offset - ileft); + } + /* + * Copy the remainder to the space reserved + * at the start of the input buffer. + */ + assert(ileft > 0); + if (ileft <= MB_LEN_MAX) { + char *p = ibuf+MB_LEN_MAX-ileft; + (void) memmove(p, iptr, ileft); + iptr = p; + continue; /* read again */ + } + /* + * Should not see ileft > MB_LEN_MAX, + * but if we do, handle as EILSEQ. 
+ */ + /* FALLTHROUGH */ + + case EILSEQ: /* invalid sequence on input */ + if (!sflag) { + (void) fprintf(stderr, + _("Illegal sequence in %s at offset %lld\n"), + fname, offset - ileft); + (void) fprintf(stderr, + _("bad seq: \\x%02x\\x%02x\\x%02x\n"), + iptr[0] & 0xff, + iptr[1] & 0xff, + iptr[2] & 0xff); + } + assert(ileft > 0); + /* skip one */ + iptr++; + ileft--; + assert(oleft > 0); + if (!cflag) { + *optr++ = '?'; + oleft--; + } + goto iconv_again; + + default: + (void) fprintf(stderr, + _("iconv error (%s) in file $s at offset %lld\n"), + strerror(iconv_errno), fname, + offset - ileft); + break; + } + } + + /* normal iconv return */ + ileft = 0; + iptr = ibuf + MB_LEN_MAX; + } + + /* + * End of file + * Flush any shift encodings. + */ + iptr = NULL; + ileft = 0; + optr = obuf; + oleft = OBUFSIZ; + (*pconv)(&iptr, &ileft, &optr, &oleft); + ocnt = OBUFSIZ - oleft; + if (ocnt > 0) { + nw = fwrite(obuf, 1, ocnt, stdout); + if (nw != ocnt) { + perror("fwrite"); + exit(1); + } + } +} diff --git a/usr/src/cmd/iconv/parser.y b/usr/src/cmd/iconv/parser.y new file mode 100644 index 0000000000..5abd7e2024 --- /dev/null +++ b/usr/src/cmd/iconv/parser.y @@ -0,0 +1,118 @@ +%{ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. + */ + +/* + * POSIX iconv charmap grammar. + */ + +#include <wchar.h> +#include <stdio.h> +#include <limits.h> +#include "charmap.h" + +%} +%union { + char *token; + int num; + char mbs[MB_LEN_MAX + 2]; /* NB: [0] is length! 
*/ +} + +%token T_CODE_SET +%token T_MB_CUR_MAX +%token T_MB_CUR_MIN +%token T_COM_CHAR +%token T_ESC_CHAR +%token T_LT +%token T_GT +%token T_NL +%token T_SEMI +%token T_COMMA +%token T_ELLIPSIS +%token T_RPAREN +%token T_LPAREN +%token T_QUOTE +%token T_NULL +%token T_END +%token T_CHARMAP +%token T_WIDTH +%token T_WIDTH_DEFAULT +%token <mbs> T_CHAR +%token <token> T_NAME +%token <num> T_NUMBER +%token <token> T_SYMBOL + +%% + +goal : setting_list charmap + | charmap + ; + +string : T_QUOTE charlist T_QUOTE + | T_QUOTE T_QUOTE + ; + +charlist : charlist T_CHAR + | T_CHAR + ; + +setting_list : setting_list setting + | setting + ; + +setting : T_COM_CHAR T_CHAR T_NL + { + com_char = $2[1]; + } + | T_ESC_CHAR T_CHAR T_NL + { + esc_char = $2[1]; + } + | T_MB_CUR_MAX T_NUMBER T_NL + { + mb_cur_max = $2; + } + | T_MB_CUR_MIN T_NUMBER T_NL + { + mb_cur_min = $2; + } + | T_CODE_SET T_NAME T_NL + { + /* ignore */ + } + | T_CODE_SET string T_NL + { + /* ignore */ + } + ; + +charmap : T_CHARMAP T_NL charmap_list T_END T_CHARMAP T_NL + +charmap_list : charmap_list charmap_entry + | charmap_entry + ; + +charmap_entry : T_SYMBOL T_CHAR + { + add_charmap($1, $2); + scan_to_eol(); + } + | T_SYMBOL T_ELLIPSIS T_SYMBOL T_CHAR + { + add_charmap_range($1, $3, $4); + scan_to_eol(); + } + | T_NL + ; diff --git a/usr/src/cmd/iconv/scanner.c b/usr/src/cmd/iconv/scanner.c new file mode 100644 index 0000000000..5c53695282 --- /dev/null +++ b/usr/src/cmd/iconv/scanner.c @@ -0,0 +1,682 @@ +/* + * This file and its contents are supplied under the terms of the + * Common Development and Distribution License ("CDDL"), version 1.0. + * You may only use this file in accordance with the terms of version + * 1.0 of the CDDL. + * + * A full copy of the text of the CDDL should have accompanied this + * source. A copy of the CDDL is also available via the Internet at + * http://www.illumos.org/license/CDDL. + */ + +/* + * Copyright 2011 Nexenta Systems, Inc. All rights reserved. 
+ */ + +/* + * This file contains the "scanner", which tokenizes charmap files + * for iconv for processing by the higher level grammar processor. + */ + +#include <stdio.h> +#include <stdlib.h> +#include <ctype.h> +#include <limits.h> +#include <string.h> +#include <widec.h> +#include <sys/types.h> +#include <assert.h> +#include "charmap.h" +#include "parser.tab.h" + +int com_char = '#'; /* starts a comment that runs to end of line */ +int esc_char = '\\'; /* introduces an escape sequence */ +int mb_cur_min = 1; +int mb_cur_max = MB_LEN_MAX; /* most bytes get_wide() accepts per character */ +int lineno = 1; /* line number printed in diagnostics */ +int warnings = 0; /* incremented by warn() */ +static int nextline; /* line of the next character to be read */ +static FILE *input = stdin; +static const char *filename = "<stdin>"; /* name printed in diagnostics */ +static int instring = 0; /* nonzero while inside a quoted string */ +static int escaped = 0; /* nonzero when the previous character was esc_char */ + +/* + * Token space ... grows on demand. + */ +static char *token = NULL; +static int tokidx; +static int toksz = 0; +static int hadtok = 0; /* nonzero once the current line has produced a token */ + +/* + * The last keyword seen. This is useful to trigger the special lexer rules + * for "copy" and also collating symbols and elements. + */ +int last_kw = 0; +static int category = T_END; + +static struct token { + int id; + const char *name; +} keywords[] = { + { T_COM_CHAR, "comment_char" }, + { T_ESC_CHAR, "escape_char" }, + { T_END, "END" }, + + /* + * These are keywords used in the charmap file. Note that + * Solaris originally used angle brackets to wrap some of them, + * but we removed that to simplify our parser. The first of these + * items are "global items." + */ + { T_CHARMAP, "CHARMAP" }, + { T_WIDTH, "WIDTH" }, + { T_WIDTH_DEFAULT, "WIDTH_DEFAULT" }, + + { -1, NULL }, +}; + +/* + * These special words are only used in a charmap file, enclosed in <>. 
+ */ +static struct token symwords[] = { + { T_COM_CHAR, "comment_char" }, + { T_ESC_CHAR, "escape_char" }, + { T_CODE_SET, "code_set_name" }, + { T_MB_CUR_MAX, "mb_cur_max" }, + { T_MB_CUR_MIN, "mb_cur_min" }, + { -1, NULL }, +}; + +/* keywords that open a top-level category; the list ends with 0 */ +static int categories[] = { + T_CHARMAP, + 0 +}; + +/* + * Point the scanner at the charmap file fname, or at stdin when fname is + * NULL, and return the lexer state to its initial values. Exits if the + * file cannot be opened. + */ +void +reset_scanner(const char *fname) +{ + /* close the previous charmap file whichever input comes next */ + if (input != stdin) + (void) fclose(input); + if (fname == NULL) { + filename = "<stdin>"; + input = stdin; + } else { + if ((input = fopen(fname, "r")) == NULL) { + perror(fname); + exit(1); + } + filename = fname; + } + com_char = '#'; + esc_char = '\\'; + instring = 0; + escaped = 0; + lineno = 1; + nextline = 1; + tokidx = 0; + last_kw = 0; + category = T_END; +} + +/* value of a hex digit character / test for an octal digit character */ +#define hex(x) \ + (isdigit(x) ? ((x) - '0') : \ + ((islower(x) ? ((x) - 'a') : ((x) - 'A')) + 10)) +#define isodigit(x) (((x) >= '0') && ((x) <= '7')) + +/* + * Read the next input character. lineno becomes the line that character + * is on; nextline moves on once a newline has been read. + */ +static int +scanc(void) +{ + int c; + + c = getc(input); + lineno = nextline; + if (c == '\n') { + nextline++; + } + return (c); +} + +/* + * Push c back onto the input, undoing the line accounting of scanc(). + * EOF is ignored: ungetc() cannot push it back, and the stream reports + * end of file again on the next read. + */ +static void +unscanc(int c) +{ + if (c == EOF) + return; + if (c == '\n') { + nextline--; + } + if (ungetc(c, input) < 0) { + yyerror(_("ungetc failed")); + } +} + +/* read exactly two hex digits and return the byte they encode */ +static int +scan_hex_byte(void) +{ + int c1, c2; + int v; + + c1 = scanc(); + if (!isxdigit(c1)) { + yyerror(_("malformed hex digit")); + return (0); + } + c2 = scanc(); + if (!isxdigit(c2)) { + yyerror(_("malformed hex digit")); + return (0); + } + v = ((hex(c1) << 4) | hex(c2)); + return (v); +} + +/* read two or three decimal digits and return their value */ +static int +scan_dec_byte(void) +{ + int c1, c2, c3; + int b; + + c1 = scanc(); + if (!isdigit(c1)) { + yyerror(_("malformed decimal digit")); + return (0); + } + b = c1 - '0'; + c2 = scanc(); + if (!isdigit(c2)) { + yyerror(_("malformed decimal digit")); + return (0); + } + b *= 10; + b += (c2 - '0'); + c3 = scanc(); + if (!isdigit(c3)) { + unscanc(c3); + } else { + b *= 10; + b += (c3 - '0'); + } + return (b); +} + +/* read two or three octal digits and return their value */ +static int +scan_oct_byte(void) +{ + int c1, c2, c3; + int b; + + b = 0; + + c1 = scanc(); + if (!isodigit(c1)) { + yyerror(_("malformed octal digit")); + 
return (0); + } + b = c1 - '0'; + c2 = scanc(); + if (!isodigit(c2)) { + yyerror(_("malformed octal digit")); + return (0); + } + b *= 8; + b += (c2 - '0'); + c3 = scanc(); + if (!isodigit(c3)) { + unscanc(c3); + } else { + b *= 8; + b += (c3 - '0'); + } + return (b); +} + +/* + * Append c to the token being accumulated, keeping it NUL terminated. + * The buffer grows 64 bytes at a time. + */ +void +add_tok(int c) +{ + if ((tokidx + 1) >= toksz) { + toksz += 64; + if ((token = realloc(token, toksz)) == NULL) { + yyerror(_("out of memory")); + tokidx = 0; + toksz = 0; + return; + } + } + + token[tokidx++] = (char)c; + token[tokidx] = 0; +} + +/* + * Read one numeric escape (escape character followed by d/D and decimal + * digits, x/X and hex digits, or octal digits) and return its value. + * Returns EOF when the input does not start with such an escape; the + * characters that were read are pushed back. + */ +static int +get_byte(void) +{ + int c; + + if ((c = scanc()) != esc_char) { + unscanc(c); + return (EOF); + } + c = scanc(); + + switch (c) { + case 'd': + case 'D': + return (scan_dec_byte()); + case 'x': + case 'X': + return (scan_hex_byte()); + case '0': + case '1': + case '2': + case '3': + case '4': + case '5': + case '6': + case '7': + /* put the character back so we can get it */ + unscanc(c); + return (scan_oct_byte()); + default: + unscanc(c); + unscanc(esc_char); + return (EOF); + } +} + +/* + * Translate the character following an escape character: the letters + * n, r, t, f, v, b and a give the matching control character, anything + * else stands for itself. + */ +int +get_escaped(int c) +{ + switch (c) { + case 'n': + return ('\n'); + case 'r': + return ('\r'); + case 't': + return ('\t'); + case 'f': + return ('\f'); + case 'v': + return ('\v'); + case 'b': + return ('\b'); + case 'a': + return ('\a'); + default: + return (c); + } +} + +/* + * Collect a run of numeric escapes into yylval.mbs: byte [0] receives the + * count and the bytes follow, NUL terminated. Returns T_CHAR. + */ +int +get_wide(void) +{ + /* NB: yylval.mbs[0] is the length */ + char *mbs = &yylval.mbs[1]; + int mbi = 0; + int c; + + mbs[mbi] = 0; + if (mb_cur_max > MB_LEN_MAX) { + yyerror(_("max multibyte character size too big")); + return (T_NULL); + } + for (;;) { + if ((c = get_byte()) == EOF) + break; + /* + * mb_cur_max comes from the charmap and may be zero or + * negative; ">=" keeps every such value from letting the + * store below run past the end of yylval.mbs. + */ + if (mbi >= mb_cur_max) { + yyerror(_("length > mb_cur_max")); + return (T_NULL); + } + mbs[mbi++] = c; + mbs[mbi] = 0; + } + + /* result in yylval.mbs */ + mbs[-1] = mbi; + return (T_CHAR); +} + +/* + * Read a symbolic name up to its closing '>'; the caller has already + * consumed the opening '<'. + */ +int +get_symbol(void) +{ + int c; + + while ((c = scanc()) != EOF) { + if (escaped) { + escaped = 0; + if (c == '\n') + continue; + add_tok(get_escaped(c)); + continue; + } + if 
(c == esc_char) { + escaped = 1; + continue; + } + if (c == '\n') { /* well that's strange! */ + yyerror(_("unterminated symbolic name")); + continue; + } + if (c == '>') { /* end of symbol */ + + /* + * This restarts the token from the beginning + * the next time we scan a character. (This + * token is complete.) + */ + + if (token == NULL) { + yyerror(_("missing symbolic name")); + return (T_NULL); + } + tokidx = 0; + + /* + * A few symbols are handled as keywords outside + * of the normal categories. + */ + if (category == T_END) { + int i; + for (i = 0; symwords[i].name != 0; i++) { + if (strcmp(token, symwords[i].name) == + 0) { + last_kw = symwords[i].id; + return (last_kw); + } + } + } + /* its an undefined symbol */ + yylval.token = strdup(token); + if (yylval.token == NULL) { + perror("malloc"); + exit(1); + } + token = NULL; + toksz = 0; + tokidx = 0; + return (T_SYMBOL); + } + add_tok(c); + } + + yyerror(_("unterminated symbolic name")); + return (EOF); +} + + +static int +consume_token(void) +{ + int len = tokidx; + int i; + + tokidx = 0; + if (token == NULL) + return (T_NULL); + + /* + * this one is special, because we don't want it to alter the + * last_kw field. + */ + if (strcmp(token, "...") == 0) { + return (T_ELLIPSIS); + } + + /* search for reserved words first */ + for (i = 0; keywords[i].name; i++) { + int j; + if (strcmp(keywords[i].name, token) != 0) { + continue; + } + + last_kw = keywords[i].id; + + /* clear the top level category if we're done with it */ + if (last_kw == T_END) { + category = T_END; + } + + /* set the top level category if we're changing */ + for (j = 0; categories[j]; j++) { + if (categories[j] != last_kw) + continue; + category = last_kw; + } + + return (keywords[i].id); + } + + /* maybe its a numeric constant? 
*/ + /* <ctype.h> needs unsigned char values; plain char may be negative */ + if (isdigit((unsigned char)*token) || + (*token == '-' && isdigit((unsigned char)token[1]))) { + char *eptr; + yylval.num = strtol(token, &eptr, 10); + if (*eptr != 0) + yyerror(_("malformed number")); + return (T_NUMBER); + } + + /* + * A single lone character is treated as a character literal. + * To avoid duplication of effort, we stick in the charmap. + */ + if (len == 1) { + yylval.mbs[0] = 1; /* length */ + yylval.mbs[1] = token[0]; + yylval.mbs[2] = '\0'; + return (T_CHAR); + } + + /* anything else is treated as a symbolic name */ + yylval.token = strdup(token); + if (yylval.token == NULL) { + perror("malloc"); + exit(1); + } + /* the parser owns the copy; release the scratch buffer */ + free(token); + token = NULL; + toksz = 0; + tokidx = 0; + return (T_NAME); +} + +/* + * Discard the rest of the current input line. Reaching end of file + * before a newline is reported through errf(). + */ +void +scan_to_eol(void) +{ + int c; + while ((c = scanc()) != '\n') { + if (c == EOF) { + /* end of file without newline! */ + errf(_("missing newline")); + return; + } + } + assert(c == '\n'); +} + +/* + * Return the next token for the grammar, or EOF at end of input. Token + * values are passed to the parser in yylval. + */ +int +yylex(void) +{ + int c; + + while ((c = scanc()) != EOF) { + + /* special handling for quoted string */ + if (instring) { + if (escaped) { + escaped = 0; + + /* if newline, just eat and forget it */ + if (c == '\n') + continue; + + if (strchr("xXd01234567", c)) { + unscanc(c); + unscanc(esc_char); + return (get_wide()); + } + yylval.mbs[0] = 1; /* length */ + yylval.mbs[1] = get_escaped(c); + yylval.mbs[2] = '\0'; + return (T_CHAR); + } + if (c == esc_char) { + escaped = 1; + continue; + } + switch (c) { + case '<': + return (get_symbol()); + case '>': + /* oops! 
should generate syntax error */ + return (T_GT); + case '"': + instring = 0; + return (T_QUOTE); + default: + yylval.mbs[0] = 1; /* length */ + yylval.mbs[1] = c; + yylval.mbs[2] = '\0'; + return (T_CHAR); + } + } + + /* escaped characters first */ + if (escaped) { + escaped = 0; + if (c == '\n') { + /* eat the newline */ + continue; + } + hadtok = 1; + if (tokidx) { + /* an escape mid-token is nonsense */ + return (T_NULL); + } + + /* numeric escapes are treated as wide characters */ + if (strchr("xXd01234567", c)) { + unscanc(c); + unscanc(esc_char); + return (get_wide()); + } + + add_tok(get_escaped(c)); + continue; + } + + /* if it is the escape character itself, note it */ + if (c == esc_char) { + escaped = 1; + continue; + } + + /* + * remove from the comment char to end of line + * NOTE(review): this test runs before the delimiter test + * below, so a token still being accumulated is not consumed + * when a comment starts right after it - confirm intended. + */ + if (c == com_char) { + while (c != '\n') { + if ((c = scanc()) == EOF) { + /* end of file without newline! */ + return (EOF); + } + } + assert(c == '\n'); + if (!hadtok) { + /* + * If there were no tokens on this line, + * then just pretend it didn't exist at all. + */ + continue; + } + hadtok = 0; + return (T_NL); + } + + if (strchr(" \t\n;()<>,\"", c) && (tokidx != 0)) { + /* + * These are all token delimiters. If there + * is a token already in progress, we need to + * process it. + */ + unscanc(c); + return (consume_token()); + } + + switch (c) { + case '\n': + if (!hadtok) { + /* + * If the line was completely devoid of tokens, + * then just ignore it. + */ + continue; + } + /* we're starting a new line, reset the token state */ + hadtok = 0; + return (T_NL); + case ',': + hadtok = 1; + return (T_COMMA); + case ';': + hadtok = 1; + return (T_SEMI); + case '(': + hadtok = 1; + return (T_LPAREN); + case ')': + hadtok = 1; + return (T_RPAREN); + case '>': + hadtok = 1; + return (T_GT); + case '<': + /* symbol start! 
*/ + hadtok = 1; + return (get_symbol()); + case ' ': + case '\t': + /* whitespace, just ignore it */ + continue; + case '"': + hadtok = 1; + instring = 1; + return (T_QUOTE); + default: + hadtok = 1; + add_tok(c); + continue; + } + } + return (EOF); +} + +void +yyerror(const char *msg) +{ + (void) fprintf(stderr, _("%s: %d: error: %s\n"), + filename, lineno, msg); + exit(1); +} + +void +errf(const char *fmt, ...) +{ + char *msg; + + va_list va; + va_start(va, fmt); + (void) vasprintf(&msg, fmt, va); + va_end(va); + + (void) fprintf(stderr, _("%s: %d: error: %s\n"), + filename, lineno, msg); + free(msg); + exit(1); +} + +void +warn(const char *fmt, ...) +{ + char *msg; + + va_list va; + va_start(va, fmt); + (void) vasprintf(&msg, fmt, va); + va_end(va); + + (void) fprintf(stderr, _("%s: %d: warning: %s\n"), + filename, lineno, msg); + free(msg); + warnings++; +} diff --git a/usr/src/pkg/manifests/system-test-utiltest.mf b/usr/src/pkg/manifests/system-test-utiltest.mf index 6d0ec62ca3..9a4ddd4efa 100644 --- a/usr/src/pkg/manifests/system-test-utiltest.mf +++ b/usr/src/pkg/manifests/system-test-utiltest.mf @@ -30,6 +30,7 @@ file path=opt/util-tests/bin/print_json mode=0555 file path=opt/util-tests/bin/utiltest mode=0555 file path=opt/util-tests/runfiles/default.run mode=0444 file path=opt/util-tests/tests/allowed-ips mode=0555 +file path=opt/util-tests/tests/iconv_test mode=0555 file path=opt/util-tests/tests/libnvpair_json/json_00_blank mode=0555 file path=opt/util-tests/tests/libnvpair_json/json_01_boolean mode=0555 file path=opt/util-tests/tests/libnvpair_json/json_02_numbers mode=0555 @@ -42,4 +43,5 @@ file path=opt/util-tests/tests/libnvpair_json/json_common mode=0555 file path=opt/util-tests/tests/printf_test mode=0555 file path=opt/util-tests/tests/xargs_test mode=0555 license lic_CDDL license=lic_CDDL +depend fmri=system/library/iconv/utf-8 type=require depend fmri=system/test/testrunner type=require diff --git a/usr/src/test/util-tests/tests/Makefile 
b/usr/src/test/util-tests/tests/Makefile index 4709c7adcd..e12ab73c58 100644 --- a/usr/src/test/util-tests/tests/Makefile +++ b/usr/src/test/util-tests/tests/Makefile @@ -14,7 +14,6 @@ # Copyright 2014 Garrett D'Amore <garrett@damore.org> # -SUBDIRS = dladm printf xargs -SUBDIRS = dladm libnvpair_json printf xargs +SUBDIRS = dladm iconv libnvpair_json printf xargs include $(SRC)/test/Makefile.com diff --git a/usr/src/test/util-tests/tests/iconv/Makefile b/usr/src/test/util-tests/tests/iconv/Makefile new file mode 100644 index 0000000000..c0fb8a9940 --- /dev/null +++ b/usr/src/test/util-tests/tests/iconv/Makefile @@ -0,0 +1,49 @@ +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright (c) 2012 by Delphix. All rights reserved. +# Copyright 2012 Nexenta Systems, Inc. All rights reserved. 
+# + +include $(SRC)/cmd/Makefile.cmd +include $(SRC)/test/Makefile.com + +SHFILES = iconv_test + +ROOTBIN = $(ROOT)/opt/util-tests +TESTDIR = $(ROOTBIN)/tests + +CMDS = $(SHFILES:%=$(TESTDIR)/%) +$(CMDS) := FILEMODE = 0555 + +all: $(PROG) + +$(SHFILES): $(SHFILES).sh + -$(RM) $(SHFILES) + $(CP) $(SHFILES).sh $(SHFILES) + +install: all $(CMDS) + +lint: + +clobber: clean + -$(RM) $(SHFILES) + +clean: + +$(CMDS): $(TESTDIR) + +$(TESTDIR): + $(INS.dir) + +$(TESTDIR)/%: % + $(INS.file) diff --git a/usr/src/test/util-tests/tests/iconv/iconv_test.sh b/usr/src/test/util-tests/tests/iconv/iconv_test.sh new file mode 100644 index 0000000000..e22bce7099 --- /dev/null +++ b/usr/src/test/util-tests/tests/iconv/iconv_test.sh @@ -0,0 +1,111 @@ +#!/bin/sh + +# +# This file and its contents are supplied under the terms of the +# Common Development and Distribution License ("CDDL"), version 1.0. +# You may only use this file in accordance with the terms of version +# 1.0 of the CDDL. +# +# A full copy of the text of the CDDL should have accompanied this +# source. A copy of the CDDL is also available via the Internet at +# http://www.illumos.org/license/CDDL. +# + +# +# Copyright 2012 Nexenta Systems, Inc. All rights reserved. +# + +ICONV=${ICONV:-/usr/bin/iconv} +#ICONV=${ROOT}/usr/bin/iconv + +# check name file_a file_b: compare the two files with cmp; print +# "TEST PASS: name", or print "TEST FAIL: name" and exit 1 if they differ +check() { + if ! cmp -s "$2" "$3" ; then + echo "TEST FAIL: $1" + exit 1 + fi + echo "TEST PASS: $1" +} + + +# test_conv fromcs tocs in out: write "in" and "out" to scratch files, +# convert "in" from fromcs to tocs and check the result against "out" +test_conv() { + echo "$3" > in + echo "$4" > o1 + $ICONV -f "$1" -t "$2" < in > o2 + check "${1}:${2}" o1 o2 + rm in o1 o2 +} +# print charmap "one": <NULL> as \x00 and <c1-XX> as the single byte \xXX +# for XX from 80 to ff +mkmap_one() { + echo '<code_set_name> one' + echo 'CHARMAP' + echo '<NULL>\t\x00' + for i in 8 9 a b c d e f + do + for j in 0 1 2 3 4 5 6 7 8 9 a b c d e f + do + echo "<c1-$i$j>\t\x$i$j" + done + done + echo 'END CHARMAP' +} +# print charmap "two": the same symbols as charmap "one", each <c1-XX> +# encoded as the two bytes \x20 \xXX +mkmap_two() { + echo '<code_set_name> two' + echo 'CHARMAP' + echo '<NULL>\t\x00' + for i in 8 9 a b c d e f + do + for j in 0 1 2 3 4 5 6 7 8 9 a b c d e f + do + echo "<c1-$i$j>\t\x20\x$i$j" + done + done + echo 'END CHARMAP' +} + +# write 1023 bytes of space +wr1023() { + n=1023 + while [[ $n -gt 0 ]]; do + echo ' \c' + ((n-=1)) + done +} + +# two-byte utf-8 crossing 1024 byte boundary +mkbuf_utf8() { + wr1023 + echo '\0303\0240' +} + +# one-byte 8859-1 at 1024 byte boundary +mkbuf_8859() { + wr1023 + echo '\0340' +} + +# Test some simple, built-in conversions + +test_conv ascii utf-8 abcdef abcdef +test_conv utf-8 ascii abcdef abcdef +test_conv ascii ucs-2le abc 'a\0b\0c\0\n\0\c' +test_conv ucs-2le ascii 'a\0b\0c\0\n\0\c' abc + +# Test user-provided charmap + +mkmap_one > one.cm +mkmap_two > two.cm +test_conv ./one.cm ./two.cm '\0200\0201\0202\c' ' \0200 \0201 \0202\c' +rm one.cm two.cm + +# test crossing 1024 byte buffer boundary + +mkbuf_utf8 > in +mkbuf_8859 > o1 +$ICONV -f UTF-8 -t 8859-1 < in > o2 +check "boundary" o1 o2 +rm in o1 o2 + +exit 0 |
