diff options
author | Igor Pashev <pashev.igor@gmail.com> | 2012-06-24 22:28:35 +0000 |
---|---|---|
committer | Igor Pashev <pashev.igor@gmail.com> | 2012-06-24 22:28:35 +0000 |
commit | 3950ffe2a485479f6561c27364d3d7df5a21d124 (patch) | |
tree | 468c6e14449d1b1e279222ec32f676b0311917d2 /src/lib/libast/comp/iconv.c | |
download | ksh-upstream.tar.gz |
Imported Upstream version 93u+upstream
Diffstat (limited to 'src/lib/libast/comp/iconv.c')
-rw-r--r-- | src/lib/libast/comp/iconv.c | 1599 |
1 files changed, 1599 insertions, 0 deletions
diff --git a/src/lib/libast/comp/iconv.c b/src/lib/libast/comp/iconv.c new file mode 100644 index 0000000..ba24988 --- /dev/null +++ b/src/lib/libast/comp/iconv.c @@ -0,0 +1,1599 @@ +/*********************************************************************** +* * +* This software is part of the ast package * +* Copyright (c) 1985-2012 AT&T Intellectual Property * +* and is licensed under the * +* Eclipse Public License, Version 1.0 * +* by AT&T Intellectual Property * +* * +* A copy of the License is available at * +* http://www.eclipse.org/org/documents/epl-v10.html * +* (with md5 checksum b35adb5213ca9657e911e9befb180842) * +* * +* Information and Software Systems Research * +* AT&T Research * +* Florham Park NJ * +* * +* Glenn Fowler <gsf@research.att.com> * +* David Korn <dgk@research.att.com> * +* Phong Vo <kpv@research.att.com> * +* * +***********************************************************************/ +#pragma prototyped + +/* + * Glenn Fowler + * AT&T Research + * + * iconv intercept + * minimally provides { utf*<=>bin ascii<=>ebcdic* } + */ + +#include <ast.h> +#include <dirent.h> + +#define DEBUG_TRACE 0 +#define _ICONV_LIST_PRIVATE_ + +#include <ccode.h> +#include <ctype.h> +#include <iconv.h> + +#include "lclib.h" + +#if !_lib_iconv_open + +#define _ast_iconv_t iconv_t +#define _ast_iconv_f iconv_f +#define _ast_iconv_list_t iconv_list_t +#define _ast_iconv_open iconv_open +#define _ast_iconv iconv +#define _ast_iconv_close iconv_close +#define _ast_iconv_list iconv_list +#define _ast_iconv_move iconv_move +#define _ast_iconv_name iconv_name +#define _ast_iconv_write iconv_write + +#endif + +#ifndef E2BIG +#define E2BIG ENOMEM +#endif +#ifndef EILSEQ +#define EILSEQ EIO +#endif + +#define RETURN(e,n,fn) \ + if (*fn && !e) e = E2BIG; \ + if (e) { errno = e; return (size_t)(-1); } \ + return n; + +typedef struct Map_s +{ + char* name; + const unsigned char* map; + _ast_iconv_f fun; + int index; +} Map_t; + +typedef struct Conv_s +{ + iconv_t cvt; + char* buf; + size_t size; + Map_t from; + Map_t to; +} Conv_t; + +static Conv_t* freelist[4]; +static int freeindex; + +static const char name_local[] = "local"; +static const char name_native[] = "native"; + +static const _ast_iconv_list_t codes[] = +{ + { + "utf", + "un|unicode|utf", + "multibyte 8-bit unicode", + "UTF-%s", + "8", + CC_UTF, + }, + + { + "ume", + "um|ume|utf?(-)7", + "multibyte 7-bit unicode", + "UTF-7", + 0, + CC_UME, + }, + + { + "euc", + "(big|euc)*", + "euc family", + 0, + 0, + CC_ICONV, + }, + + { + "dos", + "dos?(-)?(855)", + "dos code page", + "DOS855", + 0, + CC_ICONV, + }, + + { + "ucs", + "ucs?(-)?(2)?(be)|utf-16?(be)", + "unicode runes", + "UCS-%s", + "2", + CC_UCS, + }, + + { + "ucs-le", + "ucs?(-)?(2)le|utf-16le", + "little endian unicode runes", + "UCS-%sLE", + "2", + CC_SCU, + }, + + { 0 }, +}; + +#if _UWIN + +#include <ast_windows.h> + +#ifndef CP_UCS2 +#define CP_UCS2 0x0000 +#endif + +static char _win_maps[] = "/reg/local_machine/SOFTWARE/Classes/MIME/Database/Charset"; + +/* + * return the codeset index given its name or alias + * the map is in the what? oh, the registry + */ + +static int +_win_codeset(const char* name) +{ + register char* s; + char* e; + int n; + Sfio_t* sp; + char aka[128]; + char tmp[128]; + +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d _win_codeset name=%s", __LINE__, name); +#endif + if (name == name_native) + return CP_ACP; + if (!strcasecmp(name, "utf") || !strcasecmp(name, "utf8") || !strcasecmp(name, "utf-8")) + return CP_UTF8; + if (!strcasecmp(name, "ucs") || !strcasecmp(name, "ucs2") || !strcasecmp(name, "ucs-2")) + return CP_UCS2; + if (name[0] == '0' && name[1] == 'x' && (n = strtol(name, &e, 0)) > 0 && !*e) + return n; + for (;;) + { + sfsprintf(tmp, sizeof(tmp), "%s/%s", _win_maps, name); + if (!(sp = sfopen(0, tmp, "r"))) + { + s = (char*)name; + if ((s[0] == 'c' || s[0] == 'C') && (s[1] == 'p' || s[1] == 'P')) + s += 2; + if (!isdigit(s[0])) + break; + sfsprintf(tmp, sizeof(tmp), "%s/windows-%s", _win_maps, s); + if (!(sp = sfopen(0, tmp, "r"))) + break; + } + for (;;) + { + if (!(s = sfgetr(sp, '\n', 0))) + { + sfclose(sp); + return -1; + } + if (!strncasecmp(s, "AliasForCharSet=", 16)) + { + n = sfvalue(sp) - 17; + s += 16; + if (n >= sizeof(aka)) + n = sizeof(aka) - 1; + memcpy(aka, s, n); + aka[n] = 0; + sfclose(sp); + name = (const char*)aka; + break; + } + if (!strncasecmp(s, "CodePage=", 9)) + { + s += 9; + n = strtol(s, 0, 0); + sfclose(sp); + return n; + } + } + } + return -1; +} + +/* + * get and check the codeset indices + */ + +static _ast_iconv_t +_win_iconv_open(register Conv_t* cc, const char* t, const char* f) +{ +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=%s t=%s\n", __LINE__, f, t); +#endif + if ((cc->from.index = _win_codeset(f)) < 0) + return (_ast_iconv_t)(-1); + if ((cc->to.index = _win_codeset(t)) < 0) + return (_ast_iconv_t)(-1); +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d _win_iconv_open f=0x%04x t=0x%04x\n", __LINE__, cc->from.index, cc->to.index); +#endif + return (_ast_iconv_t)cc; +} + +/* + * even though the indices already check out + * they could still be rejected + */ + +static size_t +_win_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + Conv_t* cc = (Conv_t*)cd; + size_t un; + size_t tz; + size_t fz; + size_t bz; + size_t pz; + size_t oz; + LPWSTR ub; + +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d _win_iconv from=0x%04x to=0x%04x\n", __LINE__, cc->from.index, cc->to.index); +#endif + if (cc->from.index == cc->to.index || cc->from.index != CP_UCS2 && cc->to.index == 0) + { + /* + * easy + */ + + fz = tz = (*fn < *tn) ? *fn : *tn; + memcpy(*tb, *fb, fz); + } + else + { + ub = 0; + un = *fn; + + /* + * from => ucs-2 + */ + + if (cc->to.index == CP_UCS2) + { + if ((tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, *tn)) && tz <= *tn) + { + fz = *fn; + tz *= sizeof(WCHAR); + } + else + { + /* + * target too small + * binary search on input size to make it fit + */ + + oz = 0; + pz = *fn / 2; + fz = *fn - pz; + for (;;) + { + while (!(tz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)fz, (LPWSTR)*tb, 0))) + if (++fz >= *fn) + goto nope; + tz *= sizeof(WCHAR); + if (tz == *tn) + break; + if (!(pz /= 2)) + { + if (!(fz = oz)) + goto nope; + break; + } + if (tz > *tn) + fz -= pz; + else + { + oz = fz; + fz += pz; + } + } + } + } + else + { + if (cc->from.index == CP_UCS2) + { + un = *fn / sizeof(WCHAR); + ub = (LPWSTR)*fb; + } + else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)*tb, 0))) + goto nope; + else if (!(ub = (LPWSTR)malloc(un * sizeof(WCHAR)))) + goto nope; + else if (!(un = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)*fn, (LPWSTR)ub, un))) + goto nope; + + /* + * ucs-2 => to + */ + + if (tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, un, *tb, *tn, 0, 0)) + fz = *fn; + else + { + /* + * target too small + * binary search on input size to make it fit + */ + + oz = 0; + pz = *fn / 2; + bz = *fn - pz; + for (;;) + { + while (!(fz = MultiByteToWideChar(cc->from.index, 0, (LPCSTR)*fb, (int)bz, (LPWSTR)ub, un))) + if (++bz > *fn) + goto nope; + if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, 0, 0, 0))) + goto nope; + if (tz == *tn) + break; + if (!(pz /= 2)) + { + if (!(fz = oz)) + goto nope; + break; + } + if (tz > *tn) + bz -= pz; + else + { + oz = bz; + bz += pz; + } + } + if (!(tz = WideCharToMultiByte(cc->to.index, 0, (LPCWSTR)ub, fz, *tb, tz, 0, 0))) + goto nope; +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d _win_iconv *fn=%u fz=%u[%u] *tn=%u tz=%u\n", __LINE__, *fn, fz, fz * sizeof(WCHAR), *tn, tz); +#endif +#if 0 + fz *= sizeof(WCHAR); +#endif + } + if (ub != (LPWSTR)*fb) + free(ub); + } + } + *fb += fz; + *fn -= fz; + *tb += tz; + *tn -= tz; + return fz; + nope: + if (ub && ub != (LPWSTR)*fb) + free(ub); + errno = EINVAL; + return (size_t)(-1); +} + +#endif + +/* + * return canonical character code set name for m + * if b!=0 then canonical name placed in b of size n + * <ccode.h> index returned + */ + +int +_ast_iconv_name(register const char* m, register char* b, size_t n) +{ + register const _ast_iconv_list_t* cp; + const _ast_iconv_list_t* bp; + register int c; + register char* e; + int sub[2]; + char buf[16]; +#if DEBUG_TRACE + char* o; +#endif + + if (!b) + { + b = buf; + n = sizeof(buf); + } +#if DEBUG_TRACE + o = b; +#endif + e = b + n - 1; + bp = 0; + n = 0; + cp = ccmaplist(NiL); +#if DEBUG_TRACE +if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name m=\"%s\"\n", error_info.id, error_info.trace, __LINE__, m); +#endif + for (;;) + { +#if DEBUG_TRACE +if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name n=%d bp=%p cp=%p ccode=%d name=\"%s\"\n", error_info.id, error_info.trace, __LINE__, n, bp, cp, cp->ccode, cp->name); +#endif + if (strgrpmatch(m, cp->match, sub, elementsof(sub) / 2, STR_MAXIMAL|STR_LEFT|STR_ICASE)) + { + if (!(c = m[sub[1]])) + { + bp = cp; + break; + } + if (sub[1] > n && !isalpha(c)) + { + bp = cp; + n = sub[1]; + } + } + if (cp->ccode < 0) + { + if (!(++cp)->name) + break; + } + else if (!(cp = (const _ast_iconv_list_t*)ccmaplist((_ast_iconv_list_t*)cp))) + cp = codes; + } + if (cp = bp) + { + if (cp->canon) + { + if (cp->index) + { + for (m += sub[1]; *m && !isalnum(*m); m++); + if (!isdigit(*m)) + m = cp->index; + } + else + m = "1"; + b += sfsprintf(b, e - b, cp->canon, m); + } + else if (cp->ccode == CC_NATIVE) + { + if ((locales[AST_LC_CTYPE]->flags & LC_default) || !locales[AST_LC_CTYPE]->charset || !(m = locales[AST_LC_CTYPE]->charset->code) || streq(m, "iso8859-1")) + switch (CC_NATIVE) + { + case CC_EBCDIC: + m = (const char*)"EBCDIC"; + break; + case CC_EBCDIC_I: + m = (const char*)"EBCDIC-I"; + break; + case CC_EBCDIC_O: + m = (const char*)"EBCDIC-O"; + break; + default: + m = (const char*)"ISO-8859-1"; + break; + } + b += sfsprintf(b, e - b, "%s", m); + } + *b = 0; +#if DEBUG_TRACE +if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, cp->ccode, o); +#endif + return cp->ccode; + } + while (b < e && (c = *m++)) + { + if (islower(c)) + c = toupper(c); + *b++ = c; + } + *b = 0; +#if DEBUG_TRACE +if (error_info.trace < DEBUG_TRACE) sfprintf(sfstderr, "%s: debug-%d: AHA%d _ast_iconv_name ccode=%d canon=\"%s\"\n", error_info.id, error_info.trace, __LINE__, CC_ICONV, o); +#endif + return CC_ICONV; +} + +/* + * convert utf-8 to bin + */ + +static size_t +utf2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register unsigned char* p; + register int c; + register int w; + size_t n; + int e; + + e = 0; + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + while (t < te && f < fe) + { + p = f; + c = *f++; + if (c & 0x80) + { + if (!(c & 0x40)) + { + f = p; + e = EILSEQ; + break; + } + if (c & 0x20) + { + w = (c & 0x0F) << 12; + if (f >= fe) + { + f = p; + e = EINVAL; + break; + } + c = *f++; + if (c & 0x40) + { + f = p; + e = EILSEQ; + break; + } + w |= (c & 0x3F) << 6; + } + else + w = (c & 0x1F) << 6; + if (f >= fe) + { + f = p; + e = EINVAL; + break; + } + c = *f++; + w |= (c & 0x3F); + } + else + w = c; + *t++ = w; + } + *fn -= (char*)f - (*fb); + *fb = (char*)f; + *tn -= (n = (char*)t - (*tb)); + *tb = (char*)t; + RETURN(e, n, fn); +} + +/* + * convert bin to utf-8 + */ + +static size_t +bin2utf(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register int c; + wchar_t w; + size_t n; + int e; + + e = 0; + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + while (f < fe && t < te) + { + if (!mbwide()) + { + c = 1; + w = *f; + } + else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) + { + e = EINVAL; + break; + } + else if (!c) + c = 1; + if (!(w & ~0x7F)) + *t++ = w; + else + { + if (!(w & ~0x7FF)) + { + if (t >= (te - 2)) + { + e = E2BIG; + break; + } + *t++ = 0xC0 + (w >> 6); + } + else if (!(w & ~0xffff)) + { + if (t >= (te - 3)) + { + e = E2BIG; + break; + } + *t++ = 0xE0 + (w >> 12); + *t++ = 0x80 + ((w >> 6 ) & 0x3F); + } + else + { + e = EILSEQ; + break; + } + *t++ = 0x80 + (w & 0x3F); + } + f += c; + } + *fn -= (n = (char*)f - (*fb)); + *fb = (char*)f; + *tn -= (char*)t - (*tb); + *tb = (char*)t; + RETURN(e, n, fn); +} + +static const unsigned char ume_D[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789'(),-./:?!\"#$%&*;<=>@[]^_`{|} \t\n"; + +static const unsigned char ume_M[] = +"ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/"; + +static unsigned char ume_d[UCHAR_MAX+1]; + +static unsigned char ume_m[UCHAR_MAX+1]; + +#define NOE 0xFF +#define UMEINIT() (ume_d[ume_D[0]]?0:umeinit()) + +/* + * initialize the ume tables + */ + +static int +umeinit(void) +{ + register const unsigned char* s; + register int i; + register int c; + + if (!ume_d[ume_D[0]]) + { + s = ume_D; + while (c = *s++) + ume_d[c] = 1; + memset(ume_m, NOE, sizeof(ume_m)); + for (i = 0; c = ume_M[i]; i++) + ume_m[c] = i; + } + return 0; +} + +/* + * convert utf-7 to bin + */ + +static size_t +ume2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register unsigned char* p; + register int s; + register int c; + register int w; + size_t n; + int e; + + e = 0; + UMEINIT(); + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + s = 0; + while (f < fe && t < te) + { + p = f; + c = *f++; + if (s) + { + if (c == '-' && s > 1) + s = 0; + else if ((w = ume_m[c]) == NOE) + { + s = 0; + *t++ = c; + } + else if (f >= (fe - 2)) + { + f = p; + e = EINVAL; + break; + } + else + { + s = 2; + w = (w << 6) | ume_m[*f++]; + w = (w << 6) | ume_m[*f++]; + if (!(w & ~0xFF)) + *t++ = w; + else if (t >= (te - 1)) + { + f = p; + e = E2BIG; + break; + } + else + { + *t++ = (w >> 8) & 0xFF; + *t++ = w & 0xFF; + } + } + } + else if (c == '+') + s = 1; + else + *t++ = c; + } + *fn -= (char*)f - (*fb); + *fb = (char*)f; + *tn -= (n = (char*)t - (*tb)); + *tb = (char*)t; + RETURN(e, n, fn); +} + +/* + * convert bin to utf-7 + */ + +static size_t +bin2ume(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register int c; + register int s; + wchar_t w; + size_t n; + int e; + + e = 0; + UMEINIT(); + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + s = 0; + while (f < fe && t < (te - s)) + { + if (!mbwide()) + { + c = 1; + w = *f; + } + else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) + { + e = EINVAL; + break; + } + else if (!c) + c = 1; + if (!(w & ~0x7F) && ume_d[w]) + { + if (s) + { + s = 0; + *t++ = '-'; + } + *t++ = w; + } + else if (t >= (te - (4 + s))) + { + e = E2BIG; + break; + } + else + { + if (!s) + { + s = 1; + *t++ = '+'; + } + *t++ = ume_M[(w >> 12) & 0x3F]; + *t++ = ume_M[(w >> 6) & 0x3F]; + *t++ = ume_M[w & 0x3F]; + } + f += c; + } + if (s) + *t++ = '-'; + *fn -= (n = (char*)f - (*fb)); + *fb = (char*)f; + *tn -= (char*)t - (*tb); + *tb = (char*)t; + RETURN(e, n, fn); +} + +/* + * convert ucs-2 to bin with no byte swap + */ + +static size_t +ucs2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register int w; + size_t n; + int e; + + e = 0; + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + while (f < (fe - 1) && t < te) + { + w = *f++; + w = (w << 8) | *f++; + if (!(w & ~0xFF)) + *t++ = w; + else if (t >= (te - 1)) + { + f -= 2; + e = E2BIG; + break; + } + else + { + *t++ = (w >> 8) & 0xFF; + *t++ = w & 0xFF; + } + } + *fn -= (char*)f - (*fb); + *fb = (char*)f; + *tn -= (n = (char*)t - (*tb)); + *tb = (char*)t; + RETURN(e, n, fn); +} + +/* + * convert bin to ucs-2 with no byte swap + */ + +static size_t +bin2ucs(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register int c; + wchar_t w; + size_t n; + int e; + + e = 0; + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + while (f < fe && t < (te - 1)) + { + if (!mbwide()) + { + c = 1; + w = *f; + } + if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) + { + e = EINVAL; + break; + } + else if (!c) + c = 1; + *t++ = (w >> 8) & 0xFF; + *t++ = w & 0xFF; + f += c; + } + *fn -= (n = (char*)f - (*fb)); + *fb = (char*)f; + *tn -= (char*)t - (*tb); + *tb = (char*)t; + RETURN(e, n, fn); +} + +/* + * convert ucs-2 to bin with byte swap + */ + +static size_t +scu2bin(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register int w; + size_t n; + int e; + + e = 0; + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + while (f < (fe - 1) && t < te) + { + w = *f++; + w = w | (*f++ << 8); + if (!(w & ~0xFF)) + *t++ = w; + else if (t >= (te - 1)) + { + f -= 2; + e = E2BIG; + break; + } + else + { + *t++ = (w >> 8) & 0xFF; + *t++ = w & 0xFF; + } + } + *fn -= (char*)f - (*fb); + *fb = (char*)f; + *tn -= (n = (char*)t - (*tb)); + *tb = (char*)t; + RETURN(e, n, fn); +} + +/* + * convert bin to ucs-2 with byte swap + */ + +static size_t +bin2scu(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + register unsigned char* f; + register unsigned char* fe; + register unsigned char* t; + register unsigned char* te; + register int c; + wchar_t w; + size_t n; + int e; + + e = 0; + f = (unsigned char*)(*fb); + fe = f + (*fn); + t = (unsigned char*)(*tb); + te = t + (*tn); + while (f < fe && t < (te - 1)) + { + if (!mbwide()) + { + c = 1; + w = *f; + } + else if ((c = (*_ast_info.mb_towc)(&w, (char*)f, fe - f)) < 0) + { + e = EINVAL; + break; + } + else if (!c) + c = 1; + *t++ = w & 0xFF; + *t++ = (w >> 8) & 0xFF; + f += c; + } + *fn -= (n = (char*)f - (*fb)); + *fb = (char*)f; + *tn -= (char*)t - (*tb); + *tb = (char*)t; + RETURN(e, n, fn); +} + +/* + * open a character code conversion map from f to t + */ + +_ast_iconv_t +_ast_iconv_open(const char* t, const char* f) +{ + register Conv_t* cc; + int fc; + int tc; + int i; + + char fr[64]; + char to[64]; + +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s t=%s\n", __LINE__, f, t); +#endif + if (!t || !*t || *t == '-' && !*(t + 1) || !strcasecmp(t, name_local) || !strcasecmp(t, name_native)) + t = name_native; + if (!f || !*f || *f == '-' && !*(f + 1) || !strcasecmp(t, name_local) || !strcasecmp(f, name_native)) + f = name_native; + + /* + * the ast identify is always (iconv_t)(0) + */ + + if (t == f) + return (iconv_t)(0); + fc = _ast_iconv_name(f, fr, sizeof(fr)); + tc = _ast_iconv_name(t, to, sizeof(to)); +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d _ast_iconv_open f=%s:%s:%d t=%s:%s:%d\n", __LINE__, f, fr, fc, t, to, tc); +#endif + if (fc != CC_ICONV && fc == tc || streq(fr, to)) + return (iconv_t)(0); + + /* + * first check the free list + */ + + for (i = 0; i < elementsof(freelist); i++) + if ((cc = freelist[i]) && streq(to, cc->to.name) && streq(fr, cc->from.name)) + { + freelist[i] = 0; +#if _lib_iconv_open + /* + * reset the shift state if any + */ + + if (cc->cvt != (iconv_t)(-1)) + iconv(cc->cvt, NiL, NiL, NiL, NiL); +#endif + return cc; + } + + /* + * allocate a new one + */ + + if (!(cc = newof(0, Conv_t, 1, strlen(to) + strlen(fr) + 2))) + return (iconv_t)(-1); + cc->to.name = (char*)(cc + 1); + cc->from.name = strcopy(cc->to.name, to) + 1; + strcpy(cc->from.name, fr); + cc->cvt = (iconv_t)(-1); + + /* + * 8 bit maps are the easiest + */ + + if (fc >= 0 && tc >= 0) + cc->from.map = ccmap(fc, tc); +#if _lib_iconv_open + else if ((cc->cvt = iconv_open(t, f)) != (iconv_t)(-1) || (cc->cvt = iconv_open(to, fr)) != (iconv_t)(-1)) + cc->from.fun = (_ast_iconv_f)iconv; +#endif +#if _UWIN + else if ((cc->cvt = _win_iconv_open(cc, t, f)) != (_ast_iconv_t)(-1) || (cc->cvt = _win_iconv_open(cc, to, fr)) != (_ast_iconv_t)(-1)) + cc->from.fun = (_ast_iconv_f)_win_iconv; +#endif + else + { + switch (fc) + { + case CC_UTF: + cc->from.fun = utf2bin; + break; + case CC_UME: + cc->from.fun = ume2bin; + break; + case CC_UCS: + cc->from.fun = ucs2bin; + break; + case CC_SCU: + cc->from.fun = scu2bin; + break; + case CC_ASCII: + break; + default: + if (fc < 0) + goto nope; + cc->from.map = ccmap(fc, CC_ASCII); + break; + } + switch (tc) + { + case CC_UTF: + cc->to.fun = bin2utf; + break; + case CC_UME: + cc->to.fun = bin2ume; + break; + case CC_UCS: + cc->to.fun = bin2ucs; + break; + case CC_SCU: + cc->to.fun = bin2scu; + break; + case CC_ASCII: + break; + default: + if (tc < 0) + goto nope; + cc->to.map = ccmap(CC_ASCII, tc); + break; + } + } + return (iconv_t)cc; + nope: + return (iconv_t)(-1); +} + +/* + * close a character code conversion map + */ + +int +_ast_iconv_close(_ast_iconv_t cd) +{ + Conv_t* cc; + Conv_t* oc; + int i; + int r = 0; + + if (cd == (_ast_iconv_t)(-1)) + return -1; + if (!(cc = (Conv_t*)cd)) + return 0; + + /* + * add to the free list + */ + + i = freeindex; + for (;;) + { + if (++ i >= elementsof(freelist)) + i = 0; + if (!freelist[i]) + break; + if (i == freeindex) + { + if (++ i >= elementsof(freelist)) + i = 0; + + /* + * close the oldest + */ + + if (oc = freelist[i]) + { +#if _lib_iconv_open + if (oc->cvt != (iconv_t)(-1)) + r = iconv_close(oc->cvt); +#endif + if (oc->buf) + free(oc->buf); + free(oc); + } + break; + } + } + freelist[freeindex = i] = cc; + return r; +} + +/* + * copy *fb size *fn to *tb size *tn + * fb,fn tb,tn updated on return + */ + +size_t +_ast_iconv(_ast_iconv_t cd, char** fb, size_t* fn, char** tb, size_t* tn) +{ + Conv_t* cc = (Conv_t*)cd; + register unsigned char* f; + register unsigned char* t; + register unsigned char* e; + register const unsigned char* m; + register size_t n; + char* b; + char* tfb; + size_t tfn; + size_t i; + + if (!fb || !*fb) + { + /* TODO: reset to the initial state */ + if (!tb || !*tb) + return 0; + /* TODO: write the initial state shift sequence */ + return 0; + } + n = *tn; + if (cc) + { + if (cc->from.fun) + { + if (cc->to.fun) + { + if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0))) + { + errno = ENOMEM; + return -1; + } + b = cc->buf; + i = cc->size; + tfb = *fb; + tfn = *fn; + if ((*cc->from.fun)(cc->cvt, &tfb, &tfn, &b, &i) == (size_t)(-1)) + return -1; + tfn = b - cc->buf; + tfb = cc->buf; + n = (*cc->to.fun)(cc->cvt, &tfb, &tfn, tb, tn); + i = tfb - cc->buf; + *fb += i; + *fn -= i; + return n; + } + if ((*cc->from.fun)(cc->cvt, fb, fn, tb, tn) == (size_t)(-1)) + return -1; + n -= *tn; + if (m = cc->to.map) + { + e = (unsigned char*)(*tb); + for (t = e - n; t < e; t++) + *t = m[*t]; + } + return n; + } + else if (cc->to.fun) + { + if (!(m = cc->from.map)) + return (*cc->to.fun)(cc->cvt, fb, fn, tb, tn); + if (!cc->buf && !(cc->buf = oldof(0, char, cc->size = SF_BUFSIZE, 0))) + { + errno = ENOMEM; + return -1; + } + if ((n = *fn) > cc->size) + n = cc->size; + f = (unsigned char*)(*fb); + e = f + n; + t = (unsigned char*)(b = cc->buf); + while (f < e) + *t++ = m[*f++]; + n = (*cc->to.fun)(cc->cvt, &b, fn, tb, tn); + *fb += b - cc->buf; + return n; + } + } + if (n > *fn) + n = *fn; + if (cc && (m = cc->from.map)) + { + f = (unsigned char*)(*fb); + e = f + n; + t = (unsigned char*)(*tb); + while (f < e) + *t++ = m[*f++]; + } + else + memcpy(*tb, *fb, n); + *fb += n; + *fn -= n; + *tb += n; + *tn -= n; + return n; +} + +#define OK ((size_t)-1) + +/* + * write *fb size *fn to op + * fb,fn updated on return + * total bytes written to op returned + */ + +ssize_t +_ast_iconv_write(_ast_iconv_t cd, Sfio_t* op, char** fb, size_t* fn, Iconv_disc_t* disc) +{ + char* fo = *fb; + char* tb; + char* ts; + size_t* e; + size_t tn; + size_t r; + int ok; + Iconv_disc_t compat; + + /* + * the old api had optional size_t* instead of Iconv_disc_t* + */ + + if (!disc || disc->version < 20110101L || disc->version >= 30000101L) + { + e = (size_t*)disc; + disc = &compat; + iconv_init(disc, 0); + } + else + e = 0; + r = 0; + tn = 0; + ok = 1; + while (ok && *fn > 0) + { + if (!(tb = (char*)sfreserve(op, -(tn + 1), SF_WRITE|SF_LOCKR)) || !(tn = sfvalue(op))) + { + if (!r) + r = -1; + break; + } + ts = tb; +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d iconv_write ts=%p tn=%d", __LINE__, ts, tn); + for (;;) +#else + while (*fn > 0 && _ast_iconv(cd, fb, fn, &ts, &tn) == (size_t)(-1)) +#endif + { +#if DEBUG_TRACE + ssize_t _r; +error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d `%-.*s'", __LINE__, *fn, tn, *fn, *fb); + _r = _ast_iconv(cd, fb, fn, &ts, &tn); +error(DEBUG_TRACE, "AHA#%d iconv_write %d => %d [%d]", __LINE__, *fn, tn, _r); + if (_r != (size_t)(-1) || !fn) + break; +#endif + switch (errno) + { + case E2BIG: + break; + case EINVAL: + if (disc->errorf) + (*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "incomplete multibyte sequence at offset %I*u", sizeof(fo), *fb - fo); + goto bad; + default: + if (disc->errorf) + (*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "invalid multibyte sequence at offset %I*u", sizeof(fo), *fb - fo); + bad: + disc->errors++; + if (!(disc->flags & ICONV_FATAL)) + { + if (!(disc->flags & ICONV_OMIT) && tn > 0) + { + *ts++ = (disc->fill >= 0) ? disc->fill : **fb; + tn--; + } + (*fb)++; + (*fn)--; + continue; + } + ok = 0; + break; + } + break; + } +#if DEBUG_TRACE +error(DEBUG_TRACE, "AHA#%d iconv_write %d", __LINE__, ts - tb); +#endif + sfwrite(op, tb, ts - tb); + r += ts - tb; + } + if (e) + *e = disc->errors; + return r; +} + +/* + * move n bytes from ip to op + */ + +ssize_t +_ast_iconv_move(_ast_iconv_t cd, Sfio_t* ip, Sfio_t* op, size_t n, Iconv_disc_t* disc) +{ + char* fb; + char* fs; + char* tb; + char* ts; + size_t* e; + size_t fe; + size_t fn; + size_t fo; + size_t ft; + size_t tn; + size_t i; + ssize_t r = 0; + int ok = 1; + int locked; + Iconv_disc_t compat; + + /* + * the old api had optional size_t* instead of Iconv_disc_t* + */ + + if (!disc || disc->version < 20110101L || disc->version >= 30000101L) + { + e = (size_t*)disc; + disc = &compat; + iconv_init(disc, 0); + } + else + e = 0; + tb = 0; + fe = OK; + ft = 0; + fn = n; + do + { + if (n != SF_UNBOUND) + n = -((ssize_t)(n & (((size_t)(~0))>>1))); + if ((!(fb = (char*)sfreserve(ip, n, locked = SF_LOCKR)) || !(fo = sfvalue(ip))) && + (!(fb = (char*)sfreserve(ip, n, locked = 0)) || !(fo = sfvalue(ip)))) + break; + fs = fb; + fn = fo; + if (!(tb = (char*)sfreserve(op, SF_UNBOUND, SF_WRITE|SF_LOCKR))) + { + if (!r) + r = -1; + break; + } + ts = tb; + tn = sfvalue(op); + while (fn > 0 && _ast_iconv(cd, &fs, &fn, &ts, &tn) == (size_t)(-1)) + { + switch (errno) + { + case E2BIG: + break; + case EINVAL: + if (fe == ft + (fo - fn)) + { + fe = OK; + if (disc->errorf) + (*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "incomplete multibyte sequence at offset %I*u", sizeof(ft), ft + (fo - fn)); + goto bad; + } + fe = ft; + break; + default: + if (disc->errorf) + (*disc->errorf)(NiL, disc, ERROR_SYSTEM|2, "invalid multibyte sequence at offset %I*u", sizeof(ft), ft + (fo - fn)); + bad: + disc->errors++; + if (!(disc->flags & ICONV_FATAL)) + { + if (!(disc->flags & ICONV_OMIT) && tn > 0) + { + *ts++ = (disc->fill >= 0) ? disc->fill : *fs; + tn--; + } + fs++; + fn--; + continue; + } + ok = 0; + break; + } + break; + } + sfwrite(op, tb, ts - tb); + r += ts - tb; + ts = tb; + if (locked) + sfread(ip, fb, fs - fb); + else + for (i = fn; --i >= (fs - fb);) + sfungetc(ip, fb[i]); + if (n != SF_UNBOUND) + { + if (n <= (fs - fb)) + break; + n -= fs - fb; + } + ft += (fs - fb); + if (fn == fo) + fn++; + } while (ok); + if (fb && locked) + sfread(ip, fb, 0); + if (tb) + { + sfwrite(op, tb, 0); + if (ts > tb) + { + sfwrite(op, tb, ts - tb); + r += ts - tb; + } + } + if (e) + *e = disc->errors; + return r; +} + +/* + * iconv_list_t iterator + * call with arg 0 to start + * prev return value is current arg + */ + +_ast_iconv_list_t* +_ast_iconv_list(_ast_iconv_list_t* cp) +{ +#if _UWIN + struct dirent* ent; + + if (!cp) + { + if (!(cp = newof(0, _ast_iconv_list_t, 1, 0))) + return ccmaplist(NiL); + if (!(cp->data = opendir(_win_maps))) + { + free(cp); + return ccmaplist(NiL); + } + } + if (cp->data) + { + if (ent = readdir((DIR*)cp->data)) + { + cp->name = cp->match = cp->desc = (const char*)ent->d_name; + return cp; + } + closedir((DIR*)cp->data); + free(cp); + return ccmaplist(NiL); + } +#else + if (!cp) + return ccmaplist(NiL); +#endif + if (cp->ccode >= 0) + return (cp = ccmaplist(cp)) ? cp : (_ast_iconv_list_t*)codes; + return (++cp)->name ? cp : (_ast_iconv_list_t*)0; +} |