diff options
Diffstat (limited to 'os400/iconv/bldcsndfa/bldcsndfa.c')
-rw-r--r-- | os400/iconv/bldcsndfa/bldcsndfa.c | 1953 |
1 files changed, 0 insertions, 1953 deletions
diff --git a/os400/iconv/bldcsndfa/bldcsndfa.c b/os400/iconv/bldcsndfa/bldcsndfa.c deleted file mode 100644 index 48afd54..0000000 --- a/os400/iconv/bldcsndfa/bldcsndfa.c +++ /dev/null @@ -1,1953 +0,0 @@ -/** -*** Build a deterministic finite automaton to associate CCSIDs with -*** character set names. -*** -*** Compile on OS/400 with options SYSIFCOPT(*IFSIO). -*** -*** See Copyright for the status of this software. -*** -*** Author: Patrick Monnerat <pm@datasphere.ch>, DATASPHERE S.A. -**/ - -#include <stdio.h> -#include <errno.h> -#include <stdlib.h> -#include <string.h> -#include <fcntl.h> -#include <ctype.h> - -#include <iconv.h> - - -#ifdef OLDXML -#include "xml.h" -#else -#include <libxml/hash.h> -#include <libxml/parser.h> -#include <libxml/xpath.h> -#include <libxml/xpathInternals.h> -#endif - - -#ifdef __OS400__ -#define iconv_open_error(cd) ((cd).return_value == -1) -#define set_iconv_open_error(cd) ((cd).return_value = -1) -#else -#define iconv_open_error(cd) ((cd) == (iconv_t) -1) -#define set_iconv_open_error(cd) ((cd) = (iconv_t) -1) -#endif - - -#define C_SOURCE_CCSID 500 -#define C_UTF8_CCSID 1208 - - -#define UTF8_SPACE 0x20 -#define UTF8_HT 0x09 -#define UTF8_0 0x30 -#define UTF8_9 0x39 -#define UTF8_A 0x41 -#define UTF8_Z 0x5A -#define UTF8_a 0x61 -#define UTF8_z 0x7A - - -#define GRANULE 128 /* Memory allocation granule. */ - -#define EPSILON 0x100 /* Token for empty transition. */ - - -#ifndef OFFSETOF -#define OFFSETOF(t, f) ((unsigned int) ((char *) &((t *) 0)->f - (char *) 0)) -#endif - -#ifndef OFFSETBY -#define OFFSETBY(t, p, o) ((t *) ((char *) (p) + (unsigned int) (o))) -#endif - - -typedef struct t_transition t_transition; /* NFA/DFA transition. */ -typedef struct t_state t_state; /* NFA/DFA state node. */ -typedef struct t_symlist t_symlist; /* Symbol (i.e.: name) list. */ -typedef struct t_chset t_chset; /* Character set. */ -typedef struct t_stategroup t_stategroup; /* Optimization group. 
*/ -typedef unsigned char utf8char; /* UTF-8 character byte. */ -typedef unsigned char byte; /* Untyped data byte. */ - - -typedef struct { /* Set of pointers. */ - unsigned int p_size; /* Current allocated size. */ - unsigned int p_card; /* Current element count. */ - void * p_set[1]; /* Element array. */ -} t_powerset; - - -struct t_transition { - t_transition * t_forwprev; /* Head of forward transition list. */ - t_transition * t_forwnext; /* Tail of forward transition list. */ - t_transition * t_backprev; /* Head of backward transition list. */ - t_transition * t_backnext; /* Tail of backward transition list. */ - t_state * t_from; /* Incoming state. */ - t_state * t_to; /* Destination state. */ - unsigned short t_token; /* Transition token. */ - unsigned int t_index; /* Transition array index. */ -}; - - -struct t_state { - t_state * s_next; /* Next state (for DFA construction). */ - t_state * s_stack; /* Unprocessed DFA states stack. */ - t_transition * s_forward; /* Forward transitions. */ - t_transition * s_backward; /* Backward transitions. */ - t_chset * s_final; /* Recognized character set. */ - t_powerset * s_nfastates; /* Corresponding NFA states. */ - unsigned int s_index; /* State index. */ -}; - - -struct t_symlist { - t_symlist * l_next; /* Next name in list. */ - utf8char l_symbol[1]; /* Name bytes. */ -}; - - -struct t_chset { - t_chset * c_next; /* Next character set. */ - t_symlist * c_names; /* Character set name list. */ - iconv_t c_fromUTF8; /* Conversion from UTF-8. */ - unsigned int c_ccsid; /* IBM character set code. */ - unsigned int c_mibenum; /* IANA character code. */ -}; - - -struct t_stategroup { - t_stategroup * g_next; /* Next group. */ - t_state * g_member; /* Group member (s_stack) list. */ - unsigned int g_id; /* Group ident. */ -}; - - - -t_chset * chset_list; /* Character set list. */ -t_state * initial_state; /* Initial NFA state. */ -iconv_t job2utf8; /* Job CCSID to UTF-8 conversion. 
*/ -iconv_t utf82job; /* UTF-8 to job CCSID conversion. */ -t_state * dfa_states; /* List of DFA states. */ -unsigned int groupid; /* Group ident counter. */ - - -/** -*** UTF-8 strings. -**/ - -#pragma convert(819) - -static const utf8char utf8_MIBenum[] = "MIBenum"; -static const utf8char utf8_mibenum[] = "mibenum"; -static const utf8char utf8_ibm_[] = "ibm-"; -static const utf8char utf8_IBMCCSID[] = "IBMCCSID"; -static const utf8char utf8_iana_[] = "iana-"; -static const utf8char utf8_Name[] = "Name"; -static const utf8char utf8_Pref_MIME_Name[] = "Preferred MIME Name"; -static const utf8char utf8_Aliases[] = "Aliases"; -static const utf8char utf8_html[] = "html"; -static const utf8char utf8_htmluri[] = "http://www.w3.org/1999/xhtml"; -static const utf8char utf8_A[] = "A"; -static const utf8char utf8_C[] = "C"; -static const utf8char utf8_M[] = "M"; -static const utf8char utf8_N[] = "N"; -static const utf8char utf8_P[] = "P"; -static const utf8char utf8_T[] = "T"; -static const utf8char utf8_ccsid[] = "ccsid"; -static const utf8char utf8_EBCDIC[] = "EBCDIC"; -static const utf8char utf8_ASCII[] = "ASCII"; -static const utf8char utf8_assocnodes[] = "/ccsid_mibenum/assoc[@ccsid]"; -static const utf8char utf8_aliastext[] = - "/ccsid_mibenum/assoc[@ccsid=$C]/alias/text()"; -#ifdef OLDXML -static const utf8char utf8_tablerows[] = - "//table[@id='table-character-sets-1']/*/tr"; -static const utf8char utf8_headerpos[] = - "count(th[text()=$T]/preceding-sibling::th)+1"; -static const utf8char utf8_getmibenum[] = "number(td[$M])"; -static const utf8char utf8_getprefname[] = "string(td[$P])"; -static const utf8char utf8_getname[] = "string(td[$N])"; -static const utf8char utf8_getaliases[] = "td[$A]/text()"; -#else -static const utf8char utf8_tablerows[] = - "//html:table[@id='table-character-sets-1']/*/html:tr"; -static const utf8char utf8_headerpos[] = - "count(html:th[text()=$T]/preceding-sibling::html:th)+1"; -static const utf8char utf8_getmibenum[] = 
"number(html:td[$M])"; -static const utf8char utf8_getprefname[] = "string(html:td[$P])"; -static const utf8char utf8_getname[] = "string(html:td[$N])"; -static const utf8char utf8_getaliases[] = "html:td[$A]/text()"; -#endif - -#pragma convert(0) - - -/** -*** UTF-8 character length table. -*** -*** Index is first character byte, value is the character byte count. -**/ - -static signed char utf8_chlen[] = { -/* 00-07 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 08-0F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 10-17 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 18-1F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 20-27 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 28-2F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 30-37 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 38-3F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 40-47 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 48-4F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 50-57 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 58-5F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 60-67 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 68-6F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 70-77 */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 78-7F */ 1, 1, 1, 1, 1, 1, 1, 1, -/* 80-87 */ -1, -1, -1, -1, -1, -1, -1, -1, -/* 88-8F */ -1, -1, -1, -1, -1, -1, -1, -1, -/* 90-97 */ -1, -1, -1, -1, -1, -1, -1, -1, -/* 98-9F */ -1, -1, -1, -1, -1, -1, -1, -1, -/* A0-A7 */ -1, -1, -1, -1, -1, -1, -1, -1, -/* A8-AF */ -1, -1, -1, -1, -1, -1, -1, -1, -/* B0-B7 */ -1, -1, -1, -1, -1, -1, -1, -1, -/* B8-BF */ -1, -1, -1, -1, -1, -1, -1, -1, -/* C0-C7 */ 2, 2, 2, 2, 2, 2, 2, 2, -/* C8-CF */ 2, 2, 2, 2, 2, 2, 2, 2, -/* D0-D7 */ 2, 2, 2, 2, 2, 2, 2, 2, -/* D8-DF */ 2, 2, 2, 2, 2, 2, 2, 2, -/* E0-E7 */ 3, 3, 3, 3, 3, 3, 3, 3, -/* E8-EF */ 3, 3, 3, 3, 3, 3, 3, 3, -/* F0-F7 */ 4, 4, 4, 4, 4, 4, 4, 4, -/* F8-FF */ 5, 5, 5, 5, 6, 6, -1, -1 -}; - - - -void -chknull(void * p) - -{ - if (p) - return; - - fprintf(stderr, "Not enough memory\n"); - exit(1); -} - - -void -makecode(char * buf, unsigned int ccsid) - -{ - ccsid &= 0xFFFF; - memset(buf, 0, 32); - sprintf(buf, "IBMCCSID%05u0000000", ccsid); -} - - -iconv_t -iconv_open_ccsid(unsigned int ccsidout, - unsigned int 
/**
***	Parse an unsigned decimal number located at `*cpp'.
***
***	The longest run of decimal digits starting at `*cpp' is consumed.
***	On return, `*cpp' addresses the first character that is not a digit.
***	Returns the accumulated value, or 0 if no digit is present.
***	Arithmetic is unsigned: a too large number silently wraps.
**/

unsigned int
getnum(char * * cpp)

{
	unsigned int n = 0;
	char * cp = *cpp;

	/**
	***	<ctype.h> classifiers are only defined for EOF and for
	***		values representable as unsigned char, while a plain
	***		`char' may be negative: cast before classifying.
	**/

	while (isdigit((unsigned char) *cp))
		n = 10 * n + (unsigned int) (*cp++ - '0');

	*cpp = cp;
	return n;
}
/**
***	Case-insensitive prefix match.
***
***	Checks whether the text at `*cpp' starts with the null-terminated
***	string `s', ignoring letter case.
***	If so, spaces and tabs following the matched prefix are skipped,
***	`*cpp' is updated to the resulting position and 1 is returned.
***	Else 0 is returned and `*cpp' is left unchanged.
**/

int
match(char * * cpp, char * s)

{
	char * cp = *cpp;
	int c1;
	int c2;

	/**
	***	Characters are fetched as unsigned char values: passing a
	***		negative plain `char' to tolower() is undefined.
	**/

	while ((c2 = (unsigned char) *s++) != 0) {
		c1 = (unsigned char) *cp++;

		if (c1 != c2 && tolower(c1) != tolower(c2))
			return 0;
	}

	/**
	***	Matched: skip the blanks that follow.
	**/

	while (*cp == ' ' || *cp == '\t')
		cp++;

	*cpp = cp;
	return 1;
}
- unlink_transition(t); - - if ((t->t_from = from)) { - if ((t->t_forwnext = from->s_forward)) - t->t_forwnext->t_forwprev = t; - - from->s_forward = t; - } - - if ((t->t_to = to)) { - if ((t->t_backnext = to->s_backward)) - t->t_backnext->t_backprev = t; - - to->s_backward = t; - } -} - - -t_transition * -newtransition(unsigned int token, t_state * from, t_state * to) - -{ - t_transition * t; - - t = (t_transition *) malloc(sizeof *t); - chknull(t); - memset((char *) t, 0, sizeof *t); - t->t_token = token; - link_transition(t, from, to); - return t; -} - - -t_transition * -uniquetransition(unsigned int token, t_state * from, t_state * to) - -{ - t_transition * t; - - for (t = from->s_forward; t; t = t->t_forwnext) - if (t->t_token == token && (t->t_to == to || !to)) - return t; - - return to? newtransition(token, from, to): (t_transition *) NULL; -} - - -int -set_position(t_powerset * s, void * e) - -{ - unsigned int l; - unsigned int h; - unsigned int m; - int i; - - l = 0; - h = s->p_card; - - while (l < h) { - m = (l + h) >> 1; - - /** - *** If both pointers belong to different allocation arenas, - *** native comparison may find them neither - *** equal, nor greater, nor smaller. - *** We thus compare using memcmp() to get an orthogonal - *** result. 
- **/ - - i = memcmp(&e, s->p_set + m, sizeof e); - - if (i < 0) - h = m; - else if (!i) - return m; - else - l = m + 1; - } - - return l; -} - - -t_powerset * -set_include(t_powerset * s, void * e) - -{ - unsigned int pos; - unsigned int n; - - if (!s) { - s = (t_powerset *) malloc(sizeof *s + - GRANULE * sizeof s->p_set); - chknull(s); - s->p_size = GRANULE; - s->p_set[GRANULE] = (t_state *) NULL; - s->p_set[0] = e; - s->p_card = 1; - return s; - } - - pos = set_position(s, e); - - if (pos < s->p_card && s->p_set[pos] == e) - return s; - - if (s->p_card >= s->p_size) { - s->p_size += GRANULE; - s = (t_powerset *) realloc(s, - sizeof *s + s->p_size * sizeof s->p_set); - chknull(s); - s->p_set[s->p_size] = (t_state *) NULL; - } - - n = s->p_card - pos; - - if (n) - memmove((char *) (s->p_set + pos + 1), - (char *) (s->p_set + pos), n * sizeof s->p_set[0]); - - s->p_set[pos] = e; - s->p_card++; - return s; -} - - -t_state * -nfatransition(t_state * to, byte token) - -{ - t_state * from; - - from = newstate(); - newtransition(token, from, to); - return from; -} - - -static t_state * nfadevelop(t_state * from, t_state * final, iconv_t icc, - const utf8char * name, unsigned int len); - - -void -nfaslice(t_state * * from, t_state * * to, iconv_t icc, - const utf8char * chr, unsigned int chlen, - const utf8char * name, unsigned int len, t_state * final) - -{ - char * srcp; - char * dstp; - size_t srcc; - size_t dstc; - unsigned int cnt; - t_state * f; - t_state * t; - t_transition * tp; - byte bytebuf[8]; - - srcp = (char *) chr; - srcc = chlen; - dstp = (char *) bytebuf; - dstc = sizeof bytebuf; - iconv(icc, &srcp, &srcc, &dstp, &dstc); - dstp = (char *) bytebuf; - cnt = sizeof bytebuf - dstc; - t = *to; - f = *from; - - /** - *** Check for end of string. 
- **/ - - if (!len) - if (t && t != final) - uniquetransition(EPSILON, t, final); - else - t = final; - - if (f) - while (cnt) { - tp = uniquetransition(*dstp, f, (t_state *) NULL); - - if (!tp) - break; - - f = tp->t_to; - dstp++; - cnt--; - } - - if (!cnt) { - if (!t) - t = nfadevelop(f, final, icc, name, len); - - *to = t; - return; - } - - if (!t) { - t = nfadevelop((t_state *) NULL, final, icc, name, len); - *to = t; - } - - if (!f) - *from = f = newstate(); - - while (cnt > 1) - t = nfatransition(t, dstp[--cnt]); - - newtransition(*dstp, f, t); -} - - -t_state * -nfadevelop(t_state * from, t_state * final, iconv_t icc, - const utf8char * name, unsigned int len) - -{ - int chlen; - int i; - t_state * to; - int uccnt; - int lccnt; - utf8char chr; - - chlen = utf8_chlen[*name]; - - for (i = 1; i < chlen; i++) - if ((name[i] & 0xC0) != 0x80) - break; - - if (i != chlen) { - fprintf(stderr, - "Invalid UTF8 character in character set name\n"); - return (t_state *) NULL; - } - - to = (t_state *) NULL; - nfaslice(&from, &to, - icc, name, chlen, name + chlen, len - chlen, final); - - if (*name >= UTF8_a && *name <= UTF8_z) - chr = *name - UTF8_a + UTF8_A; - else if (*name >= UTF8_A && *name <= UTF8_Z) - chr = *name - UTF8_A + UTF8_a; - else - return from; - - nfaslice(&from, &to, icc, &chr, 1, name + chlen, len - chlen, final); - return from; -} - - - -void -nfaenter(const utf8char * name, int len, t_chset * charset) - -{ - t_chset * s; - t_state * final; - t_state * sp; - t_symlist * lp; - - /** - *** Enter case-insensitive `name' in NFA in all known - *** character codes. - *** Redundant shift state changes as well as shift state - *** differences between uppercase and lowercase are - *** not handled. - **/ - - if (len < 0) - len = strlen(name) + 1; - - for (lp = charset->c_names; lp; lp = lp->l_next) - if (!memcmp(name, lp->l_symbol, len)) - return; /* Already entered. 
*/ - - lp = (t_symlist *) malloc(sizeof *lp + len); - chknull(lp); - memcpy(lp->l_symbol, name, len); - lp->l_symbol[len] = '\0'; - lp->l_next = charset->c_names; - charset->c_names = lp; - final = newstate(); - final->s_final = charset; - - for (s = chset_list; s; s = s->c_next) - if (!iconv_open_error(s->c_fromUTF8)) - sp = nfadevelop(initial_state, final, - s->c_fromUTF8, name, len); -} - - -unsigned int -utf8_utostr(utf8char * s, unsigned int v) - -{ - unsigned int d; - unsigned int i; - - d = v / 10; - v -= d * 10; - i = d? utf8_utostr(s, d): 0; - s[i++] = v + UTF8_0; - s[i] = '\0'; - return i; -} - - -unsigned int -utf8_utostrpad(utf8char * s, unsigned int v, int digits) - -{ - unsigned int i = utf8_utostr(s, v); - utf8char pad = UTF8_SPACE; - - if (digits < 0) { - pad = UTF8_0; - digits = -digits; - } - - if (i >= digits) - return i; - - memmove(s + digits - i, s, i + 1); - memset(s, pad, digits - i); - return digits; -} - - -unsigned int -utf8_strtou(const utf8char * s) - -{ - unsigned int v; - - while (*s == UTF8_SPACE || *s == UTF8_HT) - s++; - - for (v = 0; *s >= UTF8_0 && *s <= UTF8_9;) - v = 10 * v + *s++ - UTF8_0; - - return v; -} - - -unsigned int -getNumAttr(xmlNodePtr node, const xmlChar * name) - -{ - const xmlChar * s; - unsigned int val; - - s = xmlGetProp(node, name); - - if (!s) - return 0; - - val = utf8_strtou(s); - xmlFree((xmlChar *) s); - return val; -} - - -void -read_assocs(const char * filename) - -{ - xmlDocPtr doc; - xmlXPathContextPtr ctxt; - xmlXPathObjectPtr obj; - xmlNodePtr node; - t_chset * sp; - int i; - unsigned int ccsid; - unsigned int mibenum; - utf8char symbuf[32]; - - doc = loadXMLFile(filename); - - if (!doc) { - fprintf(stderr, "Cannot load file %s\n", filename); - exit(1); - } - - ctxt = xmlXPathNewContext(doc); - obj = xmlXPathEval(utf8_assocnodes, ctxt); - - if (!obj || obj->type != XPATH_NODESET || !obj->nodesetval || - !obj->nodesetval->nodeTab || !obj->nodesetval->nodeNr) { - fprintf(stderr, "No association found 
in %s\n", filename); - exit(1); - } - - for (i = 0; i < obj->nodesetval->nodeNr; i++) { - node = obj->nodesetval->nodeTab[i]; - ccsid = getNumAttr(node, utf8_ccsid); - mibenum = getNumAttr(node, utf8_mibenum); - - /** - *** Check for duplicate. - **/ - - for (sp = chset_list; sp; sp = sp->c_next) - if (ccsid && ccsid == sp->c_ccsid || - mibenum && mibenum == sp->c_mibenum) { - fprintf(stderr, "Duplicate character set: "); - fprintf(stderr, "CCSID = %u/%u, ", - ccsid, sp->c_ccsid); - fprintf(stderr, "MIBenum = %u/%u\n", - mibenum, sp->c_mibenum); - break; - } - - if (sp) - continue; - - /** - *** Allocate the new character set. - **/ - - sp = (t_chset *) malloc(sizeof *sp); - chknull(sp); - memset(sp, 0, sizeof *sp); - - if (!ccsid) /* Do not attempt with current job CCSID. */ - set_iconv_open_error(sp->c_fromUTF8); - else { - sp->c_fromUTF8 = - iconv_open_ccsid(ccsid, C_UTF8_CCSID, 0); - - if (iconv_open_error(sp->c_fromUTF8) == -1) - fprintf(stderr, - "Cannot convert into CCSID %u: ignored\n", - ccsid); - } - - sp->c_ccsid = ccsid; - sp->c_mibenum = mibenum; - sp->c_next = chset_list; - chset_list = sp; - } - - xmlXPathFreeObject(obj); - - /** - *** Enter aliases. - **/ - - for (sp = chset_list; sp; sp = sp->c_next) { - strcpy(symbuf, utf8_ibm_); - utf8_utostr(symbuf + 4, sp->c_ccsid); - nfaenter(symbuf, -1, sp); - strcpy(symbuf, utf8_IBMCCSID); - utf8_utostrpad(symbuf + 8, sp->c_ccsid, -5); - nfaenter(symbuf, 13, sp); /* Not null-terminated. 
*/ - - if (sp->c_mibenum) { - strcpy(symbuf, utf8_iana_); - utf8_utostr(symbuf + 5, sp->c_mibenum); - nfaenter(symbuf, -1, sp); - } - - xmlXPathRegisterVariable(ctxt, utf8_C, - xmlXPathNewFloat((double) sp->c_ccsid)); - obj = xmlXPathEval(utf8_aliastext, ctxt); - - if (!obj || obj->type != XPATH_NODESET) { - fprintf(stderr, "getAlias failed in %s\n", filename); - exit(1); - } - - if (obj->nodesetval && - obj->nodesetval->nodeTab && obj->nodesetval->nodeNr) { - for (i = 0; i < obj->nodesetval->nodeNr; i++) { - node = obj->nodesetval->nodeTab[i]; - nfaenter(node->content, -1, sp); - } - } - - xmlXPathFreeObject(obj); - } - - xmlXPathFreeContext(ctxt); - xmlFreeDoc(doc); -} - - -unsigned int -columnPosition(xmlXPathContextPtr ctxt, const xmlChar * header) - -{ - xmlXPathObjectPtr obj; - unsigned int res = 0; - - xmlXPathRegisterVariable(ctxt, utf8_T, xmlXPathNewString(header)); - obj = xmlXPathEval(utf8_headerpos, ctxt); - - if (obj) { - if (obj->type == XPATH_NUMBER) - res = (unsigned int) obj->floatval; - - xmlXPathFreeObject(obj); - } - - return res; -} - - -void -read_iana(const char * filename) - -{ - xmlDocPtr doc; - xmlXPathContextPtr ctxt; - xmlXPathObjectPtr obj1; - xmlXPathObjectPtr obj2; - xmlNodePtr node; - int prefnamecol; - int namecol; - int mibenumcol; - int aliascol; - int mibenum; - t_chset * sp; - int n; - int i; - - doc = loadXMLFile(filename); - - if (!doc) { - fprintf(stderr, "Cannot load file %s\n", filename); - exit(1); - } - - ctxt = xmlXPathNewContext(doc); - -#ifndef OLDXML - xmlXPathRegisterNs(ctxt, utf8_html, utf8_htmluri); -#endif - - obj1 = xmlXPathEval(utf8_tablerows, ctxt); - - if (!obj1 || obj1->type != XPATH_NODESET || !obj1->nodesetval || - !obj1->nodesetval->nodeTab || obj1->nodesetval->nodeNr <= 1) { - fprintf(stderr, "No data in %s\n", filename); - exit(1); - } - - /** - *** Identify columns. 
- **/ - - xmlXPathSetContextNode(obj1->nodesetval->nodeTab[0], ctxt); - prefnamecol = columnPosition(ctxt, utf8_Pref_MIME_Name); - namecol = columnPosition(ctxt, utf8_Name); - mibenumcol = columnPosition(ctxt, utf8_MIBenum); - aliascol = columnPosition(ctxt, utf8_Aliases); - - if (!prefnamecol || !namecol || !mibenumcol || !aliascol) { - fprintf(stderr, "Key column(s) missing in %s\n", filename); - exit(1); - } - - xmlXPathRegisterVariable(ctxt, utf8_P, - xmlXPathNewFloat((double) prefnamecol)); - xmlXPathRegisterVariable(ctxt, utf8_N, - xmlXPathNewFloat((double) namecol)); - xmlXPathRegisterVariable(ctxt, utf8_M, - xmlXPathNewFloat((double) mibenumcol)); - xmlXPathRegisterVariable(ctxt, utf8_A, - xmlXPathNewFloat((double) aliascol)); - - /** - *** Process each row. - **/ - - for (n = 1; n < obj1->nodesetval->nodeNr; n++) { - xmlXPathSetContextNode(obj1->nodesetval->nodeTab[n], ctxt); - - /** - *** Get the MIBenum from current row. - */ - - obj2 = xmlXPathEval(utf8_getmibenum, ctxt); - - if (!obj2 || obj2->type != XPATH_NUMBER) { - fprintf(stderr, "get MIBenum failed at row %u\n", n); - exit(1); - } - - if (xmlXPathIsNaN(obj2->floatval) || - obj2->floatval < 1.0 || obj2->floatval > 65535.0 || - ((unsigned int) obj2->floatval) != obj2->floatval) { - fprintf(stderr, "invalid MIBenum at row %u\n", n); - xmlXPathFreeObject(obj2); - continue; - } - - mibenum = obj2->floatval; - xmlXPathFreeObject(obj2); - - /** - *** Search the associations for a corresponding CCSID. - **/ - - for (sp = chset_list; sp; sp = sp->c_next) - if (sp->c_mibenum == mibenum) - break; - - if (!sp) - continue; /* No CCSID for this MIBenum. */ - - /** - *** Process preferred MIME name. 
- **/ - - obj2 = xmlXPathEval(utf8_getprefname, ctxt); - - if (!obj2 || obj2->type != XPATH_STRING) { - fprintf(stderr, - "get Preferred_MIME_Name failed at row %u\n", n); - exit(1); - } - - if (obj2->stringval && obj2->stringval[0]) - nfaenter(obj2->stringval, -1, sp); - - xmlXPathFreeObject(obj2); - - /** - *** Process name. - **/ - - obj2 = xmlXPathEval(utf8_getname, ctxt); - - if (!obj2 || obj2->type != XPATH_STRING) { - fprintf(stderr, "get name failed at row %u\n", n); - exit(1); - } - - if (obj2->stringval && obj2->stringval[0]) - nfaenter(obj2->stringval, -1, sp); - - xmlXPathFreeObject(obj2); - - /** - *** Process aliases. - **/ - - obj2 = xmlXPathEval(utf8_getaliases, ctxt); - - if (!obj2 || obj2->type != XPATH_NODESET) { - fprintf(stderr, "get aliases failed at row %u\n", n); - exit(1); - } - - if (obj2->nodesetval && obj2->nodesetval->nodeTab) - for (i = 0; i < obj2->nodesetval->nodeNr; i++) { - node = obj2->nodesetval->nodeTab[i]; - - if (node && node->content && node->content[0]) - nfaenter(node->content, -1, sp); - } - - xmlXPathFreeObject(obj2); - } - - xmlXPathFreeObject(obj1); - xmlXPathFreeContext(ctxt); - xmlFreeDoc(doc); -} - - -t_powerset * closureset(t_powerset * dst, t_powerset * src); - - -t_powerset * -closure(t_powerset * dst, t_state * src) - -{ - t_transition * t; - unsigned int oldcard; - - if (src->s_nfastates) { - /** - *** Is a DFA state: return closure of set of equivalent - *** NFA states. - **/ - - return closureset(dst, src->s_nfastates); - } - - /** - *** Compute closure of NFA state. 
- **/ - - dst = set_include(dst, src); - - for (t = src->s_forward; t; t = t->t_forwnext) - if (t->t_token == EPSILON) { - oldcard = dst->p_card; - dst = set_include(dst, t->t_to); - - if (oldcard != dst->p_card) - dst = closure(dst, t->t_to); - } - - return dst; -} - - -t_powerset * -closureset(t_powerset * dst, t_powerset * src) - -{ - unsigned int i; - - for (i = 0; i < src->p_card; i++) - dst = closure(dst, (t_state *) src->p_set[i]); - - return dst; -} - - -t_state * -get_dfa_state(t_state * * stack, - t_powerset * nfastates, xmlHashTablePtr sethash) - -{ - t_state * s; - - if (s = hash_get(sethash, nfastates->p_set, - nfastates->p_card * sizeof nfastates->p_set[0])) { - /** - *** DFA state already present. - *** Release the NFA state set and return - *** the address of the old DFA state. - **/ - - free((char *) nfastates); - return s; - } - - /** - *** Build the new state. - **/ - - s = newstate(); - s->s_nfastates = nfastates; - s->s_next = dfa_states; - dfa_states = s; - s->s_stack = *stack; - *stack = s; - - /** - *** Enter it in hash. - **/ - - if (hash_add(sethash, nfastates->p_set, - nfastates->p_card * sizeof nfastates->p_set[0], s)) - chknull(NULL); /* Memory allocation error. */ - - return s; -} - - -int -transcmp(const void * p1, const void * p2) - -{ - t_transition * t1; - t_transition * t2; - - t1 = *(t_transition * *) p1; - t2 = *(t_transition * *) p2; - return ((int) t1->t_token) - ((int) t2->t_token); -} - - -void -builddfa(void) - -{ - t_powerset * transset; - t_powerset * stateset; - t_state * s; - t_state * s2; - unsigned int n; - unsigned int i; - unsigned int token; - t_transition * t; - t_state * stack; - xmlHashTablePtr sethash; - unsigned int nst; - - transset = set_include(NULL, NULL); - chknull(transset); - stateset = set_include(NULL, NULL); - chknull(stateset); - sethash = xmlHashCreate(1); - chknull(sethash); - dfa_states = (t_state *) NULL; - stack = (t_state *) NULL; - nst = 0; - - /** - *** Build the DFA initial state. 
- **/ - - get_dfa_state(&stack, closure(NULL, initial_state), sethash); - - /** - *** Build the other DFA states by looking at each - *** possible transition from stacked DFA states. - **/ - - do { - if (!(++nst % 100)) - fprintf(stderr, "%u DFA states\n", nst); - - s = stack; - stack = s->s_stack; - s->s_stack = (t_state *) NULL; - - /** - *** Build a set of all non-epsilon transitions from this - *** state. - **/ - - transset->p_card = 0; - - for (n = 0; n < s->s_nfastates->p_card; n++) { - s2 = s->s_nfastates->p_set[n]; - - for (t = s2->s_forward; t; t = t->t_forwnext) - if (t->t_token != EPSILON) { - transset = set_include(transset, t); - chknull(transset); - } - } - - /** - *** Sort transitions by token. - **/ - - qsort(transset->p_set, transset->p_card, - sizeof transset->p_set[0], transcmp); - - /** - *** Process all transitions, grouping them by token. - **/ - - stateset->p_card = 0; - token = EPSILON; - - for (i = 0; i < transset->p_card; i++) { - t = transset->p_set[i]; - - if (token != t->t_token) { - if (stateset->p_card) { - /** - *** Get the equivalent DFA state - *** and create transition. - **/ - - newtransition(token, s, - get_dfa_state(&stack, - closureset(NULL, stateset), - sethash)); - stateset->p_card = 0; - } - - token = t->t_token; - } - - stateset = set_include(stateset, t->t_to); - } - - if (stateset->p_card) - newtransition(token, s, get_dfa_state(&stack, - closureset(NULL, stateset), sethash)); - } while (stack); - - free((char *) transset); - free((char *) stateset); - xmlHashFree(sethash, NULL); - - /** - *** Reverse the state list to get the initial state first, - *** check for ambiguous prefixes, determine final states, - *** destroy NFA state sets. 
- **/ - - while (s = dfa_states) { - dfa_states = s->s_next; - s->s_next = stack; - stack = s; - stateset = s->s_nfastates; - s->s_nfastates = (t_powerset *) NULL; - - for (n = 0; n < stateset->p_card; n++) { - s2 = (t_state *) stateset->p_set[n]; - - if (s2->s_final) { - if (s->s_final && s->s_final != s2->s_final) - fprintf(stderr, - "Ambiguous name for CCSIDs %u/%u\n", - s->s_final->c_ccsid, - s2->s_final->c_ccsid); - - s->s_final = s2->s_final; - } - } - - free((char *) stateset); - } - - dfa_states = stack; -} - - -void -deletenfa(void) - -{ - t_transition * t; - t_state * s; - t_state * u; - t_state * stack; - - stack = initial_state; - stack->s_stack = (t_state *) NULL; - - while ((s = stack)) { - stack = s->s_stack; - - while ((t = s->s_forward)) { - u = t->t_to; - unlink_transition(t); - free((char *) t); - - if (!u->s_backward) { - u->s_stack = stack; - stack = u; - } - } - - free((char *) s); - } -} - - -t_stategroup * -newgroup(void) - -{ - t_stategroup * g; - - g = (t_stategroup *) malloc(sizeof *g); - chknull(g); - memset((char *) g, 0, sizeof *g); - g->g_id = groupid++; - return g; -} - - -void -optimizedfa(void) - -{ - unsigned int i; - xmlHashTablePtr h; - t_state * s1; - t_state * s2; - t_state * finstates; - t_state * * sp; - t_stategroup * g1; - t_stategroup * g2; - t_stategroup * ghead; - t_transition * t1; - t_transition * t2; - unsigned int done; - unsigned int startgroup; - unsigned int gtrans[1 << (8 * sizeof(unsigned char))]; - - /** - *** Reduce DFA state count. - **/ - - groupid = 0; - ghead = (t_stategroup *) NULL; - - /** - *** First split: non-final and each distinct final states. - **/ - - h = xmlHashCreate(4); - chknull(h); - - for (s1 = dfa_states; s1; s1 = s1->s_next) { - if (!(g1 = hash_get(h, &s1->s_final, sizeof s1->s_final))) { - g1 = newgroup(); - g1->g_next = ghead; - ghead = g1; - - if (hash_add(h, &s1->s_final, sizeof s1->s_final, g1)) - chknull(NULL); /* Memory allocation error. 
*/ - } - - s1->s_index = g1->g_id; - s1->s_stack = g1->g_member; - g1->g_member = s1; - } - - xmlHashFree(h, NULL); - - /** - *** Subsequent splits: states that have the same forward - *** transition tokens to states in the same group. - **/ - - do { - done = 1; - - for (g2 = ghead; g2; g2 = g2->g_next) { - s1 = g2->g_member; - - if (!s1->s_stack) - continue; - - h = xmlHashCreate(1); - chknull(h); - - /** - *** Build the group transition map. - **/ - - memset((char *) gtrans, ~0, sizeof gtrans); - - for (t1 = s1->s_forward; t1; t1 = t1->t_forwnext) - gtrans[t1->t_token] = t1->t_to->s_index; - - if (hash_add(h, gtrans, sizeof gtrans, g2)) - chknull(NULL); - - /** - *** Process other states in group. - **/ - - sp = &s1->s_stack; - s1 = *sp; - - do { - *sp = s1->s_stack; - - /** - *** Build the transition map. - **/ - - memset((char *) gtrans, ~0, sizeof gtrans); - - for (t1 = s1->s_forward; - t1; t1 = t1->t_forwnext) - gtrans[t1->t_token] = t1->t_to->s_index; - - g1 = hash_get(h, gtrans, sizeof gtrans); - - if (g1 == g2) { - *sp = s1; - sp = &s1->s_stack; - } - else { - if (!g1) { - g1 = newgroup(); - g1->g_next = ghead; - ghead = g1; - - if (hash_add(h, gtrans, - sizeof gtrans, g1)) - chknull(NULL); - } - - s1->s_index = g1->g_id; - s1->s_stack = g1->g_member; - g1->g_member = s1; - done = 0; - } - } while (s1 = *sp); - - xmlHashFree(h, NULL); - } - } while (!done); - - /** - *** Establish group leaders and remap transitions. - **/ - - startgroup = dfa_states->s_index; - - for (g1 = ghead; g1; g1 = g1->g_next) - for (s1 = g1->g_member->s_stack; s1; s1 = s1->s_stack) - for (t1 = s1->s_backward; t1; t1 = t2) { - t2 = t1->t_backnext; - link_transition(t1, NULL, g1->g_member); - } - - /** - *** Remove redundant states and transitions. 
- **/ - - for (g1 = ghead; g1; g1 = g1->g_next) { - g1->g_member->s_next = (t_state *) NULL; - - while ((s1 = g1->g_member->s_stack)) { - g1->g_member->s_stack = s1->s_stack; - - for (t1 = s1->s_forward; t1; t1 = t2) { - t2 = t1->t_forwnext; - unlink_transition(t1); - free((char *) t1); - } - - free((char *) s1); - } - } - - /** - *** Remove group support and relink DFA states. - **/ - - dfa_states = (t_state *) NULL; - s2 = (t_state *) NULL; - finstates = (t_state *) NULL; - - while (g1 = ghead) { - ghead = g1->g_next; - s1 = g1->g_member; - - if (g1->g_id == startgroup) - dfa_states = s1; /* Keep start state first. */ - else if (s1->s_final) { /* Then final states. */ - s1->s_next = finstates; - finstates = s1; - } - else { /* Finish with non-final states. */ - s1->s_next = s2; - s2 = s1; - } - - free((char *) g1); - } - - for (dfa_states->s_next = finstates; finstates->s_next;) - finstates = finstates->s_next; - - finstates->s_next = s2; -} - - -const char * -inttype(unsigned long max) - -{ - int i; - - for (i = 0; max; i++) - max >>= 1; - - if (i > 8 * sizeof(unsigned int)) - return "unsigned long"; - - if (i > 8 * sizeof(unsigned short)) - return "unsigned int"; - - if (i > 8 * sizeof(unsigned char)) - return "unsigned short"; - - return "unsigned char"; -} - - -listids(FILE * fp) - -{ - unsigned int pos; - t_chset * cp; - t_symlist * lp; - char * srcp; - char * dstp; - size_t srcc; - size_t dstc; - char buf[80]; - - fprintf(fp, "/**\n*** CCSID For arg Recognized name.\n"); - pos = 0; - - for (cp = chset_list; cp; cp = cp->c_next) { - if (pos) { - fprintf(fp, "\n"); - pos = 0; - } - - if (!cp->c_names) - continue; - - pos = fprintf(fp, "*** %5u %c ", cp->c_ccsid, - iconv_open_error(cp->c_fromUTF8)? 
' ': 'X'); - - for (lp = cp->c_names; lp; lp = lp->l_next) { - srcp = (char *) lp->l_symbol; - srcc = strlen(srcp); - dstp = buf; - dstc = sizeof buf; - iconv(utf82job, &srcp, &srcc, &dstp, &dstc); - srcc = dstp - buf; - - if (pos + srcc > 79) { - fprintf(fp, "\n***%22c", ' '); - pos = 25; - } - - pos += fprintf(fp, " %.*s", srcc, buf); - } - } - - if (pos) - fprintf(fp, "\n"); - - fprintf(fp, "**/\n\n"); -} - - -void -generate(FILE * fp) - -{ - unsigned int nstates; - unsigned int ntrans; - unsigned int maxfinal; - t_state * s; - t_transition * t; - unsigned int i; - unsigned int pos; - char * ns; - - /** - *** Assign indexes to states and transitions. - **/ - - nstates = 0; - ntrans = 0; - maxfinal = 0; - - for (s = dfa_states; s; s = s->s_next) { - s->s_index = nstates++; - - if (s->s_final) - maxfinal = nstates; - - for (t = s->s_forward; t; t = t->t_forwnext) - t->t_index = ntrans++; - } - - fprintf(fp, - "/**\n*** %u states, %u finals, %u transitions.\n**/\n\n", - nstates, maxfinal, ntrans); - fprintf(stderr, "%u states, %u finals, %u transitions.\n", - nstates, maxfinal, ntrans); - - /** - *** Generate types. - **/ - - fprintf(fp, "typedef unsigned short t_ccsid;\n"); - fprintf(fp, "typedef %-23s t_staterange;\n", inttype(nstates)); - fprintf(fp, "typedef %-23s t_transrange;\n\n", inttype(ntrans)); - - /** - *** Generate first transition index for each state. - **/ - - fprintf(fp, "static t_transrange trans_array[] = {\n"); - pos = 0; - ntrans = 0; - - for (s = dfa_states; s; s = s->s_next) { - pos += fprintf(fp, " %u,", ntrans); - - if (pos > 72) { - fprintf(fp, "\n"); - pos = 0; - } - - for (t = s->s_forward; t; t = t->t_forwnext) - ntrans++; - } - - fprintf(fp, " %u\n};\n\n", ntrans); - - /** - *** Generate final state info. 
- **/ - - fprintf(fp, "static t_ccsid final_array[] = {\n"); - pos = 0; - ns =""; - i = 0; - - for (s = dfa_states; s && i++ < maxfinal; s = s->s_next) { - pos += fprintf(fp, "%s", ns); - ns = ","; - - if (pos > 72) { - fprintf(fp, "\n"); - pos = 0; - } - - pos += fprintf(fp, " %u", - s->s_final? s->s_final->c_ccsid + 1: 0); - } - - fprintf(fp, "\n};\n\n"); - - /** - *** Generate goto table. - **/ - - fprintf(fp, "static t_staterange goto_array[] = {\n"); - pos = 0; - - for (s = dfa_states; s; s = s->s_next) - for (t = s->s_forward; t; t = t->t_forwnext) { - pos += fprintf(fp, " %u,", t->t_to->s_index); - - if (pos > 72) { - fprintf(fp, "\n"); - pos = 0; - } - } - - fprintf(fp, " %u\n};\n\n", nstates); - - /** - *** Generate transition label table. - **/ - - fprintf(fp, "static unsigned char label_array[] = {\n"); - pos = 0; - ns =""; - - for (s = dfa_states; s; s = s->s_next) - for (t = s->s_forward; t; t = t->t_forwnext) { - pos += fprintf(fp, "%s", ns); - ns = ","; - - if (pos > 72) { - fprintf(fp, "\n"); - pos = 0; - } - - pos += fprintf(fp, " 0x%02X", t->t_token); - } - - fprintf(fp, "\n};\n", nstates); -} - - -main(argc, argv) -int argc; -char * * argv; - -{ - FILE * fp; - t_chset * csp; - char symbuf[20]; - - chset_list = (t_chset *) NULL; - initial_state = newstate(); - job2utf8 = iconv_open_ccsid(C_UTF8_CCSID, C_SOURCE_CCSID, 0); - utf82job = iconv_open_ccsid(C_SOURCE_CCSID, C_UTF8_CCSID, 0); - - if (argc != 4) { - fprintf(stderr, "Usage: %s <ccsid-mibenum file> ", *argv); - fprintf(stderr, "<iana-character-set file> <output file>\n"); - exit(1); - } - - /** - *** Read CCSID/MIBenum associations. Define special names. - **/ - - read_assocs(argv[1]); - - /** - *** Read character set names and establish the case-independent - *** name DFA in all possible CCSIDs. - **/ - - read_iana(argv[2]); - - /** - *** Build DFA from NFA. - **/ - - builddfa(); - - /** - *** Delete NFA. - **/ - - deletenfa(); - - /** - *** Minimize the DFA state count. 
- **/ - - optimizedfa(); - - /** - *** Generate the table. - **/ - - fp = fopen(argv[3], "w+"); - - if (!fp) { - perror(argv[3]); - exit(1); - } - - fprintf(fp, "/**\n"); - fprintf(fp, "*** Character set names table.\n"); - fprintf(fp, "*** Generated by program BLDCSNDFA from"); - fprintf(fp, " IANA character set assignment file\n"); - fprintf(fp, "*** and CCSID/MIBenum equivalence file.\n"); - fprintf(fp, "*** *** Do not edit by hand ***\n"); - fprintf(fp, "**/\n\n"); - listids(fp); - generate(fp); - - if (ferror(fp)) { - perror(argv[3]); - fclose(fp); - exit(1); - } - - fclose(fp); - iconv_close(job2utf8); - iconv_close(utf82job); - exit(0); -} |