summaryrefslogtreecommitdiff
path: root/usr/src/lib/iconv_modules/zh/common/zh_TW-big5%zh_TW-iso2022-CN-EXT.c
blob: 0fa5fb41c0c6af022211cfc54169e5fc88393e9a (plain)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
/*
 * CDDL HEADER START
 *
 * The contents of this file are subject to the terms of the
 * Common Development and Distribution License (the "License").
 * You may not use this file except in compliance with the License.
 *
 * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
 * or http://www.opensolaris.org/os/licensing.
 * See the License for the specific language governing permissions
 * and limitations under the License.
 *
 * When distributing Covered Code, include this CDDL HEADER in each
 * file and include the License file at src/OPENSOLARIS.LICENSE.
 * If applicable, add the following below this CDDL HEADER, with the
 * fields enclosed by brackets "[]" replaced with your own identifying
 * information: Portions Copyright [yyyy] [name of copyright owner]
 *
 * CDDL HEADER END
 */
/*
 * Copyright (c) 1997, by Sun Microsystems, Inc.
 * All rights reserved.
 */


/*
   Converts From:	Taiwanese BIG5 encoding
   Converts To:		ISO2022-CN-EXT encoding.

   NOTE: This file was created using vi editor with tabstop set to 4.
		 To view this file correctly set tabstop appropriately.
		 e.g. for vi use command	ESC:se ts=4
 */

#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "big5_cns11643.h"	/* Big5 to CNS 11643 mapping table */

#define MSB			0x80	/* Most significant bit; set in the first byte of a Big5 character */
#define ONEBYTE		0xff	/* Mask that keeps only the low-order byte */

#define SI		0x0f	/* Shift In: output returns to ASCII */
#define SO		0x0e	/* Shift Out: output uses the SO designation */
#define SS2		0x4e	/* Second byte of the SS2 single shift; written right after ESC */
#define SS3		0x4f	/* Second byte of the SS3 single shift; written right after ESC */
#define ESC		0x1b	/* The Escape character */
#define NON_ID_CHAR	'_' /* Substituted for every input byte that cannot be converted */

/* GET_PLANEC() - Returns the final character of the designation escape
                  sequence for CNS 11643 plane i. Index 0 is a placeholder
                  ('0'); plane 1 maps to 'G', plane 2 to 'H', ... plane 16
                  to 'V'. The index is not range checked. */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
#define GET_PLANEC(i)	(plane_char[(i)])

/*
 * Conversion state kept between calls to _icv_iconv(). It is allocated by
 * _icv_open() and released by _icv_close().
 */
typedef struct _icv_state {
	char	keepc[2];	/* The two bytes of the Big5 character being
				   assembled: keepc[0] is stored in state C0,
				   keepc[1] in state C1 */
	short	cstate;		/* Current state the state machine is in.
				   These states are C0 or C1 */
	char	ishiftfunc;	/* The currently active shift function, SI or
				   SO, in the output ISO buffer */
	int	iSOplane;	/* The current CNS11643 plane which is
				   assigned to the SOdesignation in the output
				   ISO buffer. Only CNS11643 plane 1 can be
				   assigned to SOdesignation. -1 when no
				   designation is in effect */
	int	iSS2plane;	/* The current CNS11643 plane which is
				   assigned to the SS2designation in the output
				   ISO buffer. Only CNS11643 plane 2 can be
				   assigned to SS2designation. -1 when no
				   designation is in effect */
	int	iSS3plane; 	/* The current CNS11643 plane which is
				   assigned to the SS3designation in the output
				   ISO buffer. All CNS11643 planes >= 3 are
				   assigned to SS3designation. -1 when no
				   designation is in effect */
	size_t	nonidcount; /* Count of input bytes that could not be converted;
				   it is added to the value _icv_iconv() returns */
	int	_errno;		/* Internal error number; set together with
				   errno by the conversion routines */
} _iconv_st;

/*
 * States of the Big5 input state machine:
 *   C0 - waiting for an ASCII byte or the first byte of a Big5 character
 *   C1 - first Big5 byte saved, waiting for the second byte
 */
enum _CSTATE	{ C0, C1 };

/* Forward declarations of the file-local helpers defined below */
static int isbig5(unsigned char*);	/* 0 if the two bytes lie in the Big5 range, else -1 */
static int hascns(char*);		/* index into big5_cns_tab, or -1 */
static int ascii_to_iso(char, _iconv_st*, char**, size_t*);	/* emit one ASCII byte */
static int big5_to_iso(int, _iconv_st*, char**, size_t*);	/* emit one converted character */
static int getcnsbytes(int, char*, int*);	/* split a table value into plane + 2 bytes */
static int binsearch(unsigned long, table_t[], int);	/* lookup by key in a sorted table */


/*
 * _icv_open: Entry point used by iconv_open(). Creates the conversion state
 *            block and puts it in its initial condition: state machine in
 *            C0, output in the SI shift state, no designation in effect
 *            and all counters cleared.
 *
 * Returns the state block as (void *). If the allocation fails, errno is
 * set to ENOMEM and (void *)-1 is returned.
 */
void *
_icv_open()
{
	_iconv_st  *state;

#ifdef DEBUG
	fprintf(stderr, "_icv_open(): Come into!\n");
#endif

	state = (_iconv_st *) malloc(sizeof(_iconv_st));
	if (state == NULL) {
		errno = ENOMEM;
#ifdef DEBUG
		fprintf(stderr, "Error\n");
#endif
		return ((void *) -1);
	}

	/* Input side: no Big5 lead byte is pending */
	state->cstate = C0;

	/* Output side: ASCII is active and nothing has been designated yet */
	state->ishiftfunc = SI;
	state->iSOplane = state->iSS2plane = state->iSS3plane = -1;

	/* Bookkeeping */
	state->nonidcount = 0;
	state->_errno = 0;

#ifdef DEBUG
	fprintf(stderr, "====== _icv_open(): Big5 --> ISO2022-CN-EXT =====\n");
#endif

	return ((void *) state);
}



/*
 * _icv_close: Entry point used by iconv_close(). Releases the conversion
 *	       state block passed as the argument. A NULL argument is reported
 *	       by setting errno to EBADF.
 */

void
_icv_close(_iconv_st *st)
{
	if (st != NULL) {
		free(st);
		return;
	}

	errno = EBADF;
}

/*
 * _icv_iconv: Called from iconv(). Does the conversion from BIG5 to
 *	       ISO2022-CN-EXT.
 *
 * Arguments follow the iconv() convention: *inbuf / *inbytesleft describe
 * the remaining input, *outbuf / *outbytesleft the remaining output space.
 * All four are advanced as bytes are consumed and produced.
 *
 * Behaviour:
 *   - st == NULL: errno = EBADF, returns (size_t)-1.
 *   - inbuf == NULL, *inbuf == NULL, inbytesleft == NULL or
 *     *inbytesleft == 0 is a reset request: SI is written if the output is
 *     currently shifted out (errno = E2BIG and (size_t)-1 if there is no
 *     room for it), the state is re-initialized and 0 is returned.
 *   - outbuf / outbytesleft missing or no output space: errno = E2BIG,
 *     returns (size_t)-1.
 *   - Output space runs out during conversion: errno = E2BIG, returns
 *     (size_t)-1. The input byte that could not be converted is not
 *     consumed, so the call can be repeated with a larger buffer.
 *   - Otherwise returns *inbytesleft + st->nonidcount.
 *
 * The first byte of a Big5 character is consumed and kept in st->keepc, so
 * a character may be split across two calls.
 *
 * NOTE(review): when the two bytes do not form a Big5 character, both bytes
 * are consumed, errno is set to EILSEQ and the loop stops, but the return
 * value is still the count above rather than (size_t)-1. This is kept as in
 * the original; confirm that callers rely on it before changing it.
 */
/*=======================================================
 *
 *   State Machine for interpreting Big-5 code
 *
 *=======================================================
 *
 *                     1st C
 *    +--------> C0 ----------> C1
 *    |    ascii |        2nd C |
 *    ^          v              v
 *    +----<-----+-----<--------+
 *
 *=======================================================*/
size_t
_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
				char **outbuf, size_t *outbytesleft)
{

	int n, idx;
	size_t need;

#ifdef DEBUG
    fprintf(stderr, "=== _icv_iconv(): Big5 --> ISO2022-CN-EXT =====\n");
#endif

	if (st == NULL) {
		errno = EBADF;
		return ((size_t) -1);
	}

	if (inbuf == NULL || *inbuf == NULL ||
	    inbytesleft == NULL || *inbytesleft == 0) { /* Reset request */
		if (st->ishiftfunc == SO) {
			/* The output must end in ASCII, so close the SO shift */
			if (outbytesleft && *outbytesleft >= 1 && outbuf && *outbuf) {
				**outbuf = SI;
				(*outbuf)++;
				(*outbytesleft)--;
			} else {
				errno = E2BIG;
				return ((size_t) -1);
			}
		}
		st->cstate = C0;
		st->ishiftfunc = SI;
		st->iSOplane = -1;
		st->iSS2plane = -1;
		st->iSS3plane = -1;
		st->nonidcount = 0;
		st->_errno = 0;
		return ((size_t) 0);
	}

	st->_errno = 0;
	errno = 0;

	/*
	 * From here on the conversion sub-routines dereference outbuf and
	 * outbytesleft without further NULL checks, so validate them once.
	 * A missing or exhausted output buffer is reported as E2BIG.
	 */
	if (outbytesleft == NULL || *outbytesleft == 0 ||
	    outbuf == NULL || *outbuf == NULL) {
		errno = E2BIG;
		return ((size_t) -1);
	}

	/* A state machine for interpreting Big-5 code */
	while (*inbytesleft > 0 && *outbytesleft > 0) {
		switch (st->cstate) {
		case C0:
			if (**inbuf & MSB) {
				/*
				 * Possibly the first byte of a Big5 code. Save
				 * it and wait for the second byte. Until the
				 * pair is verified the byte is counted as
				 * unidentified.
				 */
				st->keepc[0] = **inbuf;
				st->cstate = C1;
				st->nonidcount += 1;
			} else if (**inbuf == ESC || **inbuf == SI || **inbuf == SO) {
				/*
				 * These control codes have special meaning in
				 * the output ISO encoding, so they are not
				 * copied through. NON_ID_CHAR is written
				 * instead and processing continues.
				 */
				n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
				if (n < 0) /* errno etc. are set in ascii_to_iso */
					return ((size_t) -1);
				st->nonidcount += 1;
			} else { /* Got ASCII code */
				n = ascii_to_iso(**inbuf, st, outbuf, outbytesleft);
				if (n < 0) /* Insufficient space in the outbuf */
					return ((size_t) -1);
			}
			break;

		case C1:
			st->keepc[1] = (**inbuf);
			if (isbig5((unsigned char *) st->keepc) == 0) {
				if ((idx = hascns(st->keepc)) >= 0) {
					n = big5_to_iso(idx, st, outbuf, outbytesleft);
					if (n < 0) /* Insufficient space in the outbuf */
						return ((size_t) -1);
					/*
					 * The first byte saved in state C0 turned
					 * out to be a valid, converted Big5 high
					 * byte, so take it off the count again.
					 */
					st->nonidcount -= 1;
				} else { /* Valid BIG5 but has no CNS encoding */
					/*
					 * Two NON_ID_CHARs replace the character,
					 * preceded by SI if the output is shifted
					 * out. Reserve the space for all of it
					 * before writing anything: otherwise the
					 * second write could run past the end of
					 * the buffer, or a retry after E2BIG
					 * would emit the first substitute twice.
					 */
					need = (st->ishiftfunc != SI) ? 3 : 2;
					if (*outbytesleft < need) {
						st->_errno = errno = E2BIG;
						return ((size_t) -1);
					}
					n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
					if (n < 0)
						return ((size_t) -1);
					n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
					if (n < 0)
						return ((size_t) -1);
					/* Count the 2nd byte as unidentified too */
					st->nonidcount += 1;
				}
			} else { /* Input character is not BIG5 encoding */
				st->nonidcount += 1;
				/* Setting st->_errno ends the while loop below */
				st->_errno = errno = EILSEQ;
			}
			st->cstate = C0; /* Go to the initial state */
			break;

		default:		/* Should never come here */
			fprintf(stderr,
	 "_icv_iconv():Big5-->ISO2022-CN-EXT: Should not have come here\n");
			st->_errno = errno = EILSEQ;
			st->cstate = C0;
			break;

		} /* end switch */

		(*inbuf)++;
		(*inbytesleft)--;

		if (st->_errno)
			break; /* Break out of while loop */

		/*
		 * st->_errno is always set together with errno by this module.
		 * An errno set by anything else is handled here.
		 */
		if (errno)
			return ((size_t) -1);

	} /* end while */

	/*
	 * The loop also ends when the last converted character used up the
	 * output buffer. If input is still left in that case, report it so
	 * the caller can supply more output space.
	 */
	if (*inbytesleft > 0 && *outbytesleft == 0) {
		errno = E2BIG;
		return ((size_t) -1);
	}

	return (*inbytesleft + st->nonidcount);

}


/*
 * isbig5() : checks whether two bytes lie inside the Big-5 encoding range.
 *			  Returns 0 when they do and -1 otherwise.
 *
 * Big-5 encoding range:
 *	High byte: 0xA1 - 0xFE				(94 encoding space)
 *	Low byte:  0x40 - 0x7E, 0xA1 - 0xFE	(157 encoding space)
 *	Plane #1:  0xA140 - 0xC8FE			(6280 encoding space)
 *	Plane #2:  0xC940 - 0xFEFE			(8478 encoding space)
 *	Total:	   94 * 157 = 14,758		(14758 encoding space)
 */
static int isbig5(unsigned char *twobytes)
{
	unsigned char trail;

	/* Reject the pair as soon as the high byte is out of range */
	if (twobytes[0] < 0xa1 || twobytes[0] > 0xfe)
		return (-1);

	/* The low byte may fall in either of its two ranges */
	trail = twobytes[1];
	if (trail >= 0x40 && trail <= 0x7e)
		return (0);
	if (trail >= 0xa1 && trail <= 0xfe)
		return (0);

	return (-1);
}


/*
 * hascns() : looks up a two-byte big5 character in the big5 to CNS 11643
 *			  table. Returns the index of the matching table entry, or
 *			  -1 when the character has no CNS 11643 code.
 */
static int hascns(char* big5mbchar)
{
	/* Mask each byte so that a signed char cannot sign-extend into the key */
	unsigned long lead = (unsigned long) (big5mbchar[0] & ONEBYTE);
	unsigned long trail = (unsigned long) (big5mbchar[1] & ONEBYTE);

	/* The table key is the 16-bit big5 code: high byte first */
	return (binsearch((lead << 8) | trail, big5_cns_tab, MAX_BIG5_NUM));
}


/* ascii_to_iso() : Outputs the character c, preceded by the SI shift
 *					function when the output is not already in the SI
 *					state. The space needed for both is checked before
 *					anything is written. If the output buffer is too
 *					small, st->_errno and errno are set to E2BIG, nothing
 *					is written and -1 is returned. On success it
 *					returns 0.
 */
static int ascii_to_iso(char c, _iconv_st *st, char **outbuf,
							size_t *outbytesleft)
{
	/* One byte for the character, plus one for SI if a shift is required */
	size_t need = (st->ishiftfunc != SI) ? 2 : 1;

	if (*outbytesleft < need) {
		st->_errno = errno = E2BIG;
		return (-1);
	}

	if (st->ishiftfunc != SI) {
		**outbuf = SI;
		(*outbuf)++;
		(*outbytesleft)--;
		st->ishiftfunc = SI;
	}

	**outbuf = c;
	(*outbuf)++;
	(*outbytesleft)--;

	/* Each line in ISO is expected to have the character set information
	   for the Chinese characters in that line. This facilitates text
	   scrolling. Hence, on encountering newline reset designations to
	   unknown */
	if (c == '\n') {
		st->iSOplane = -1;
		st->iSS2plane = -1;
		st->iSS3plane = -1;
	}

	return (0);

}



/* big5_to_iso() : Writes the ISO2022-CN-EXT form of the Big5 character
 *				   whose entry in the big5 to cns table is idx.
 *
 *				   Plane 1 goes through the SO designation and the
 *				   locking SO shift, plane 2 through the SS2
 *				   designation and an ESC SS2 single shift, and planes
 *				   3-7, 12, 14, 15 and 16 through the SS3 designation
 *				   and an ESC SS3 single shift. A designation escape
 *				   sequence is written only when the plane recorded in
 *				   the state differs from the one required.
 *
 *				   Returns 0 on success. When the outbuf is too small
 *				   st->_errno and errno are set to E2BIG and -1 is
 *				   returned; a designation or SO written before the
 *				   shortage was detected stays in the output and is
 *				   recorded in the state. A table value or plane that
 *				   cannot be handled is reported on stderr, sets
 *				   EILSEQ, and returns 0.
 */
static int big5_to_iso(int idx, _iconv_st *st, char **outbuf,
							size_t *outbytesleft)
{

	char cnsbytes[2];
	int cnsplane;
	int *designated;	/* state field tracking the designation in use */
	char intermediate;	/* third byte of the designation sequence */
	int single_shift;	/* SS2 or SS3; 0 selects the locking SO shift */

	if (getcnsbytes(idx, cnsbytes, &cnsplane) < 0) {
		/* hascns() should have filtered this out, so this is not
		   expected to happen. Report it rather than convert */
		fprintf(stderr,
		  "big5_to_iso():Big5->ISO2022-CN-EXT:gencnsbyte() rejected cnscode\n");
		st->_errno = errno = EILSEQ;
		return (0);
	}

	/* Pick the designation and shift that belong to this plane */
	switch (cnsplane) {
	case 1:
		designated = &st->iSOplane;
		intermediate = ')';
		single_shift = 0;
		break;

	case 2:
		designated = &st->iSS2plane;
		intermediate = '*';
		single_shift = SS2;
		break;

	case 3:
	case 4:
	case 5:
	case 6:
	case 7:
	case 12:
	case 14:
	case 15:
	case 16:
		designated = &st->iSS3plane;
		intermediate = '+';
		single_shift = SS3;
		break;

	default:
		/* A plane this converter has no designation for */
		fprintf(stderr, "big5_to_iso():Big5->ISO2022-CN-EXT:Rejecting cnscode\n");
		st->_errno = errno = EILSEQ;
		return (0);
	}

	/* Emit the 4 byte designation unless this plane is already designated */
	if (*designated != cnsplane) {
		if (*outbytesleft < 4) {
			st->_errno = errno = E2BIG;
			return (-1);
		}

		(*outbuf)[0] = ESC;
		(*outbuf)[1] = '$';
		(*outbuf)[2] = intermediate;
		(*outbuf)[3] = GET_PLANEC(cnsplane);
		*outbuf += 4;
		*outbytesleft -= 4;
		*designated = cnsplane;
	}

	if (single_shift == 0) {
		/* Locking shift: SO is needed only if it is not active yet */
		if (st->ishiftfunc != SO) {
			if (*outbytesleft < 1) {
				st->_errno = errno = E2BIG;
				return (-1);
			}

			*(*outbuf)++ = SO;
			(*outbytesleft)--;
			st->ishiftfunc = SO;
		}
	} else {
		/* Single shift: written only when the 2 cns code bytes that
		   must follow it fit as well */
		if (*outbytesleft < 4) {
			st->_errno = errno = E2BIG;
			return (-1);
		}

		(*outbuf)[0] = ESC;
		(*outbuf)[1] = single_shift;
		*outbuf += 2;
		*outbytesleft -= 2;
	}

	/* Output the cns code */
	if (*outbytesleft < 2) {
		st->_errno = errno = E2BIG;
		return (-1);
	}

	(*outbuf)[0] = cnsbytes[0];
	(*outbuf)[1] = cnsbytes[1];
	*outbuf += 2;
	*outbytesleft -= 2;

	return (0);

}


/*
 * getcnsbytes() : splits the value stored at big5_cns_tab[idx] into a plane
 *				   number and the two code bytes.
 *
 *				   The plane is encoded above bit 15 of the value as
 *				   0x21 - 0x30. It is stored in *cnsplane as 1 - 16 so
 *				   that it can be used with GET_PLANEC(). The low 16
 *				   bits are stored in cnsbytes[], high byte first.
 *
 *				   Returns 0 on success. Returns -1, leaving cnsbytes
 *				   and *cnsplane untouched, when the plane field is
 *				   outside 0x21 - 0x30.
 */
static int getcnsbytes(int idx, char *cnsbytes, int *cnsplane)
{
	unsigned long cnscode = big5_cns_tab[idx].value;
	int plane = (int) (cnscode >> 16);

	if (plane < 0x21 || plane > 0x30)
		return (-1); /* Should not have happened */

	/* 0x2e (0x8EAE - T) is reported as plane 3 rather than 14.
	   CNS 11643-1992. Why is this returning 3? */
	*cnsplane = (plane == 0x2e) ? 3 : plane - 0x20;

	cnsbytes[0] = (cnscode >> 8) & 0xff;
	cnsbytes[1] = cnscode & 0xff;

	return (0);

}


/*
 * binsearch: find x in v[0] <= v[1] <= ... <= v[n-1]
 *
 * v must be sorted by ascending key and hold n entries. Returns the index
 * of an entry whose key equals x, or -1 when there is none (including
 * when n <= 0).
 */
static int binsearch(unsigned long x, table_t v[], int n)
{
	int low, high, mid;

	low = 0;
	high = n - 1;
	while (low <= high) {
		/* low + (high - low) / 2 cannot overflow, unlike (low + high) / 2 */
		mid = low + (high - low) / 2;
		if (x < v[mid].key)
			high = mid - 1;
		else if (x > v[mid].key)
			low = mid + 1;
		else	/* found match */
			return (mid);
	}
	return (-1);	/* no match */
}