1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
|
/*
* CDDL HEADER START
*
* The contents of this file are subject to the terms of the
* Common Development and Distribution License (the "License").
* You may not use this file except in compliance with the License.
*
* You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
* or http://www.opensolaris.org/os/licensing.
* See the License for the specific language governing permissions
* and limitations under the License.
*
* When distributing Covered Code, include this CDDL HEADER in each
* file and include the License file at src/OPENSOLARIS.LICENSE.
* If applicable, add the following below this CDDL HEADER, with the
* fields enclosed by brackets "[]" replaced with your own identifying
* information: Portions Copyright [yyyy] [name of copyright owner]
*
* CDDL HEADER END
*/
/*
* Copyright (c) 1997, by Sun Microsystems, Inc.
* All rights reserved.
*/
/*
Converts From: Taiwanese BIG5 encoding
Converts To: ISO2022-CN-EXT encoding.
NOTE: This file was created using vi editor with tabstop set to 4.
To view this file correctly set tabstop appropriately.
e.g. for vi use command ESC:se ts=4
*/
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>
#include "big5_cns11643.h" /* Big5 to CNS 11643 mapping table */
/* Byte values and masks used while reading Big5 input and writing
   ISO2022-CN-EXT output */
#define MSB 0x80 /* The most significant bit of a byte */
#define ONEBYTE 0xff /* Mask that keeps only the rightmost (low) byte */
#define SI 0x0f /* Shift In control character */
#define SO 0x0e /* Shift Out control character */
#define SS2 0x4e /* SS2 low byte. It is written right after ESC */
#define SS3 0x4f /* SS3 low byte. It is written right after ESC */
#define ESC 0x1b /* The Escape character */
#define NON_ID_CHAR '_' /* Substituted for every unidentified character */
/* GET_PLANEC() - Gets the corresponding ISO assigned plane character for
   the CNS11643 plane. plane_char[] is indexed directly by the plane number:
   plane 1 yields 'G', plane 2 yields 'H', ... plane 16 yields 'V'. The macro
   does no range checking, so i must lie within 0..16. */
static const char plane_char[] = "0GHIJKLMNOPQRSTUV";
#define GET_PLANEC(i) (plane_char[(i)])
/*
 * Conversion state for one Big5 --> ISO2022-CN-EXT conversion descriptor.
 * It is allocated and initialized in _icv_open(), freed in _icv_close() and
 * read/updated by _icv_iconv() and its helper routines.
 * The three plane fields hold -1 while no designation is known to be in
 * effect (initial state, after a reset request and after a newline has been
 * written to the output).
 */
typedef struct _icv_state {
char keepc[2]; /* Save the received bytes here: keepc[0] is stored in state
                  C0 and keepc[1] in state C1 */
short cstate; /* Current state the state machine is in.
                 These states are C0 or C1 */
char ishiftfunc; /* The currently active shift function SI or SO
                    in the output ISO buffer */
int iSOplane; /* The current CNS11643 plane which is
                 assigned to the SOdesignation in the output
                 ISO buffer. Only CNS11643 plane 1 can be
                 assigned to SOdesignation */
int iSS2plane; /* The current CNS11643 plane which is
                  assigned to the SS2designation in the output
                  ISO buffer. Only CNS11643 plane 2 can be
                  assigned to SS2designation */
int iSS3plane; /* The current CNS11643 plane which is
                  assigned to the SS3designation in the output
                  ISO buffer. The planes >= 3 that big5_to_iso()
                  accepts are assigned to SS3designation */
size_t nonidcount; /* Keeps track of input bytes counted as unidentified;
                      it is part of the value returned by _icv_iconv() */
int _errno; /* Internal error number. A non-zero value makes _icv_iconv()
               leave its conversion loop */
} _iconv_st;
/* States of the input state machine kept in _iconv_st.cstate:
   C0 - waiting for an ASCII byte or the first byte of a Big5 character,
   C1 - the first Big5 byte is saved, waiting for the second byte */
enum _CSTATE { C0, C1 };
/* Prototypes of the file-local helper routines defined further below */
static int isbig5(unsigned char*);
static int hascns(char*);
static int ascii_to_iso(char, _iconv_st*, char**, size_t*);
static int big5_to_iso(int, _iconv_st*, char**, size_t*);
static int getcnsbytes(int, char*, int*);
static int binsearch(unsigned long, table_t[], int);
/*
 * _icv_open: Called from iconv_open(). Creates the conversion state for a
 * new Big5 --> ISO2022-CN-EXT conversion descriptor.
 *
 * Returns the freshly allocated _iconv_st as (void *). If the allocation
 * fails, errno is set to ENOMEM and (void *) -1 is returned.
 *
 * The initial state is: state machine in C0, SI as the active shift
 * function, no plane designated (-1) for SO, SS2 and SS3, and both the
 * unidentified byte counter and the internal error number cleared.
 */
void *
_icv_open()
{
    _iconv_st *state;

#ifdef DEBUG
    fprintf(stderr, "_icv_open(): Come into!\n");
#endif

    state = malloc(sizeof (*state));
    if (state == NULL) {
        errno = ENOMEM;
#ifdef DEBUG
        fprintf(stderr, "Error\n");
#endif
        return ((void *) -1);
    }

    /* Input side: start outside of any Big5 character */
    state->cstate = C0;

    /* Output side: ASCII is active and nothing has been designated yet */
    state->ishiftfunc = SI;
    state->iSOplane = -1;
    state->iSS2plane = -1;
    state->iSS3plane = -1;

    /* Bookkeeping */
    state->nonidcount = 0;
    state->_errno = 0;

#ifdef DEBUG
    fprintf(stderr, "====== _icv_open(): Big5 --> ISO2022-CN-EXT =====\n");
#endif

    return ((void *) state);
}
/*
 * _icv_close: Called from iconv_close(). Releases the _iconv_st structure
 * that _icv_open() handed out. A NULL pointer is not freed; errno is set to
 * EBADF instead.
 */
void
_icv_close(_iconv_st *st)
{
    if (st != NULL) {
        free(st);
        return;
    }

    errno = EBADF;
}
/*
 * _icv_iconv: Called from iconv(). Does the conversion from BIG5 to
 * ISO2022-CN-EXT.
 *
 * Arguments:
 *   st            conversion state obtained from _icv_open()
 *   inbuf         address of the pointer to the next input byte
 *   inbytesleft   address of the number of input bytes still available
 *   outbuf        address of the pointer to the next free output byte
 *   outbytesleft  address of the number of output bytes still free
 *
 * A NULL inbuf, *inbuf or inbytesleft, or a zero *inbytesleft, is treated
 * as a reset request: SI is written if SO is currently active, the state is
 * re-initialized and 0 is returned.
 *
 * Return value:
 *   (size_t)-1 with errno == EBADF   st is NULL
 *   (size_t)-1 with errno == E2BIG   the output buffer is missing or full.
 *                                    The input byte that could not be
 *                                    converted has not been consumed, so the
 *                                    call can be repeated with more space.
 *   otherwise                        *inbytesleft + st->nonidcount
 *
 * NOTE(review): when an invalid Big5 sequence is met, errno is set to EILSEQ
 * but the function still returns *inbytesleft + st->nonidcount instead of
 * (size_t)-1, and the offending bytes have already been consumed. Input that
 * ends after the first byte of a Big5 character is not reported either; the
 * byte is kept in st->keepc[0] for the next call. Both differ from the
 * POSIX iconv() contract and are left unchanged here - confirm what the
 * callers expect before changing them.
 */
/*=======================================================
 *
 * State Machine for interpreting Big-5 code
 *
 *=======================================================
 *
 *                      1st C
 *    +--------> C0 ----------> C1
 *    |    ascii |        2nd C |
 *    ^          v              v
 *    +----<-----+-----<--------+
 *
 *=======================================================*/
size_t
_icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
    char **outbuf, size_t *outbytesleft)
{
    int n, idx;
    size_t needed;

#ifdef DEBUG
    fprintf(stderr, "=== _icv_iconv(): Big5 --> ISO2022-CN-EXT =====\n");
#endif
    if (st == NULL) {
        errno = EBADF;
        return ((size_t) -1);
    }

    if (inbuf == NULL || *inbuf == NULL ||
        inbytesleft == NULL || *inbytesleft == 0) { /* Reset request */
        if (st->ishiftfunc == SO) {
            /* Bring the output back to the SI state before resetting */
            if (outbytesleft && *outbytesleft >= 1 && outbuf && *outbuf) {
                **outbuf = SI;
                (*outbuf)++;
                (*outbytesleft)--;
            } else {
                errno = E2BIG;
                return ((size_t) -1);
            }
        }
        st->cstate = C0;
        st->ishiftfunc = SI;
        st->iSOplane = -1;
        st->iSS2plane = -1;
        st->iSS3plane = -1;
        st->nonidcount = 0;
        st->_errno = 0;
        return ((size_t) 0);
    }

    /* The loop below uses st->_errno and errno to detect failures, so both
       start out cleared */
    st->_errno = 0;
    errno = 0;

    /* Before *outbytesleft is used, confirm that outbytesleft is non-NULL.
       A zero *outbytesleft means there is no space in the output buffer.
       outbuf and *outbuf are verified here as well, so the conversion
       sub-routines do not have to worry about them being NULL. */
    if (outbytesleft == NULL || *outbytesleft == 0 ||
        outbuf == NULL || *outbuf == NULL) {
        errno = E2BIG;
        return ((size_t)-1);
    }

    /* A state machine for interpreting Big-5 code */
    while (*inbytesleft > 0 && *outbytesleft > 0) {
        switch (st->cstate) {
        case C0:
            if (**inbuf & MSB) { /* May have got the first byte of a BIG5 code */
                st->keepc[0] = **inbuf; /* Save byte */
                st->cstate = C1;    /* Go to the next state where
                                       the next BIG5 byte is received */
                st->nonidcount += 1;/* Until we have verified that this and
                                       the next byte make a valid BIG5 code
                                       we shall consider this as an
                                       unidentified byte */
            } else if (**inbuf == ESC || **inbuf == SI || **inbuf == SO) {
                /* We should not process these ASCII control codes as these
                   have special significance in the output ISO encoding.
                   Instead we will output NON_ID_CHAR and continue processing */
                n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
                if (n < 0) /* Insufficient space in the outbuf */
                    return ((size_t)-1); /* The errno etc. are set in ascii_to_iso */
                st->nonidcount += 1;
            } else { /* Got ASCII code */
                n = ascii_to_iso(**inbuf, st, outbuf, outbytesleft);
                if (n < 0) /* Insufficient space in the outbuf */
                    return ((size_t)-1);
            }
            break;
        case C1:
            st->keepc[1] = (**inbuf);
            if (isbig5((unsigned char*) st->keepc) == 0) {
                if ((idx = hascns(st->keepc)) >= 0) {
                    n = big5_to_iso(idx, st, outbuf, outbytesleft);
                    if (n < 0) /* Insufficient space in the outbuf */
                        return ((size_t)-1);
                    st->nonidcount -= 1; /* The first byte of this big5 saved in
                                            state C0 is confirmed valid BIG5 High
                                            byte and is processed correctly */
                } else { /* Valid BIG5 but has no CNS encoding */
                    /* We will output the NON_ID_CHAR character twice, once
                       per input byte, preceded by SI when the output is
                       currently shifted out. Make sure that everything fits
                       before anything is written: the loop condition only
                       guarantees one free byte, and a partial write followed
                       by an E2BIG retry would repeat the first NON_ID_CHAR. */
                    needed = (st->ishiftfunc != SI) ? 3 : 2;
                    if (*outbytesleft < needed) {
                        st->_errno = errno = E2BIG;
                        return ((size_t)-1);
                    }
                    n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
                    if (n < 0) /* Insufficient space in the outbuf */
                        return ((size_t)-1);
                    n = ascii_to_iso(NON_ID_CHAR, st, outbuf, outbytesleft);
                    if (n < 0) /* Insufficient space in the outbuf */
                        return ((size_t)-1);
                    st->nonidcount += 1; /* Include the 2nd byte also as
                                            unidentified byte */
                }
            } else { /* Input character is not BIG5 encoding */
                st->nonidcount += 1;
                st->_errno = errno = EILSEQ; /* This will cause the code to
                                                break out of while loop below
                                                to return to the caller */
            }
            st->cstate = C0; /* Go to the initial state */
            break;
        default: /* Should never come here */
            fprintf(stderr,
                "_icv_iconv():Big5-->ISO2022-CN-EXT: Should not have come here\n");
            st->_errno = errno = EILSEQ;
            st->cstate = C0;
            break;
        } /* end switch */

        /* The current input byte has been dealt with */
        (*inbuf)++;
        (*inbytesleft)--;

        if (st->_errno)
            break; /* Break out of while loop */
        if (errno) /* We set st->_errno before we set errno. If errno is set
                      somewhere else we handle that here */
            return ((size_t)-1);
    } /* end while */

    /* We now have to handle the case where we have successfully processed the
       previous input character which exhausted the output buffer. This is
       handled by the while loop. However, since there are more input
       characters that haven't been processed yet, we need to set the errno
       appropriately and return -1. */
    if (*inbytesleft > 0 && *outbytesleft == 0) {
        errno = E2BIG;
        return ((size_t)-1);
    }
    return (*inbytesleft + st->nonidcount);
}
/*
 * isbig5() : checks whether twobytes[0] and twobytes[1] form a code inside
 *            the Big-5 encoding range. Returns 0 if they do, -1 otherwise.
 *
 * Big-5 encoding range:
 *   High byte: 0xA1 - 0xFE                 (94 encoding space)
 *   Low byte:  0x40 - 0x7E, 0xA1 - 0xFE    (157 encoding space)
 *   Plane #1:  0xA140 - 0xC8FE             (6280 encoding space)
 *   Plane #2:  0xC940 - 0xFEFE             (8478 encoding space)
 *   Total:     94 * 157 = 14,758           (14758 encoding space)
 */
static int isbig5(unsigned char *twobytes)
{
    unsigned char low;

    /* The high byte decides first; the low byte is only looked at when
       the high byte is acceptable */
    if (twobytes[0] < 0xa1 || twobytes[0] > 0xfe) {
        return (-1);
    }

    low = twobytes[1];
    if (low >= 0x40 && low <= 0x7e) {
        return (0);
    }
    if (low >= 0xa1 && low <= 0xfe) {
        return (0);
    }
    return (-1);
}
/*
 * hascns() : looks up the two byte big5 character big5mbchar in the big5 to
 *            CNS 11643 table. The two bytes are combined into one code with
 *            big5mbchar[0] as the high byte and big5mbchar[1] as the low
 *            byte. Returns the index of that code in big5_cns_tab, or -1
 *            when the table has no entry for it.
 */
static int hascns(char* big5mbchar)
{
    /* Mask each byte so that a negative char does not sign-extend into
       the combined code */
    unsigned long high = (unsigned long) (big5mbchar[0] & ONEBYTE);
    unsigned long low = (unsigned long) (big5mbchar[1] & ONEBYTE);

    /* binsearch returns -1 if not found, else index */
    return (binsearch((high << 8) + low, big5_cns_tab, MAX_BIG5_NUM));
}
/* ascii_to_iso() : If required, outputs the SI shift function. Outputs the
 *                  character c. Every byte is written only after checking
 *                  that the output buffer has room for it, so the routine
 *                  is safe to call even when *outbytesleft is 0.
 *
 *                  If there is insufficient space in the output buffer,
 *                  st->_errno and errno are set to E2BIG and -1 is returned.
 *                  The SI byte may already have been written in that case;
 *                  st->ishiftfunc then reflects it, so a repeated call does
 *                  not write it again. On success it returns 0.
 *
 *                  Writing a newline resets the SO, SS2 and SS3 designations
 *                  recorded in st to unknown (-1).
 */
static int ascii_to_iso(char c, _iconv_st *st, char **outbuf,
    size_t *outbytesleft)
{
    if (st->ishiftfunc != SI) {
        if (*outbytesleft < 1) { /* No space even for the shift function */
            st->_errno = errno = E2BIG;
            return (-1);
        }
        **outbuf = SI;
        (*outbuf)++;
        (*outbytesleft)--;
        st->ishiftfunc = SI;
    }

    if (*outbytesleft < 1) { /* Do we have space for the ASCII character? */
        st->_errno = errno = E2BIG;
        return (-1);
    }
    **outbuf = c;
    (*outbuf)++;
    (*outbytesleft)--;

    /* Each line in ISO is expected to have the character set information
       for the Chinese characters in that line. This facilitates text
       scrolling. Hence, on encountering newline reset designations to
       unknown */
    if (c == '\n') {
        st->iSOplane = -1;
        st->iSS2plane = -1;
        st->iSS3plane = -1;
    }
    return (0);
}
/* big5_to_iso() : Converts the Big5 code, for which the index idx in
 *                 the big5 to cns table is provided as an argument, to
 *                 its corresponding ISO2022-CN-EXT code.
 *
 *                 Output, in this order and each part only when needed:
 *                   1. the 4 byte designation ESC '$' x <plane character>,
 *                      where x is ')' for plane 1 (SO), '*' for plane 2
 *                      (SS2) and '+' for planes 3-7, 12 and 14-16 (SS3).
 *                      It is written when st does not record that plane as
 *                      currently designated.
 *                   2. plane 1: SO, unless SO is already the active shift
 *                      function. Other planes: ESC SS2 or ESC SS3, written
 *                      for every character.
 *                   3. the two CNS code bytes.
 *
 *                 In case we do not have sufficient space in the outbuf to
 *                 do the conversion, st->_errno and errno are set to E2BIG
 *                 and -1 is returned. Parts written before the space ran
 *                 out stay in the buffer and are recorded in st.
 *
 *                 If the table entry is rejected by getcnsbytes() or names
 *                 a plane that is not handled here, a message is printed on
 *                 stderr, st->_errno and errno are set to EILSEQ and 0 is
 *                 returned. Otherwise 0 is returned on success.
 */
static int big5_to_iso(int idx, _iconv_st *st, char **outbuf,
    size_t *outbytesleft)
{
    char code[2];       /* the two CNS code bytes */
    int plane;          /* CNS plane the character belongs to */
    int *designated;    /* field of st recording the designation used */
    char inter;         /* third byte of the designation sequence */
    char sshift;        /* SS2 or SS3; not used for plane 1 */
    char *out;

    if (getcnsbytes(idx, code, &plane) < 0) {
        /* This means that the cnscode is invalid. Should have been taken
           care of in function hascns() and thus this code should never come
           here. We catch this by the error message below */
        fprintf(stderr,
            "big5_to_iso():Big5->ISO2022-CN-EXT:gencnsbyte() rejected cnscode\n");
        st->_errno = errno = EILSEQ;
        return (0);
    }

    /* Pick the designation that carries this plane */
    if (plane == 1) {
        designated = &st->iSOplane;
        inter = ')';
        sshift = 0;
    } else if (plane == 2) {
        designated = &st->iSS2plane;
        inter = '*';
        sshift = SS2;
    } else if ((plane >= 3 && plane <= 7) || plane == 12 ||
        (plane >= 14 && plane <= 16)) {
        designated = &st->iSS3plane;
        inter = '+';
        sshift = SS3;
    } else {
        /* This means that the cnscode is invalid. Should have been taken
           care of in function hascns() and thus this code should never
           come here. We catch this by the error message below */
        fprintf(stderr, "big5_to_iso():Big5->ISO2022-CN-EXT:Rejecting cnscode\n");
        st->_errno = errno = EILSEQ;
        return (0);
    }

    /* Designate the plane unless the output already has it designated */
    if (*designated != plane) {
        if (*outbytesleft < 4)
            goto nospace;
        out = *outbuf;
        out[0] = ESC;
        out[1] = '$';
        out[2] = inter;
        out[3] = GET_PLANEC(plane);
        *outbuf = out + 4;
        *outbytesleft -= 4;
        *designated = plane;
    }

    if (plane == 1) {
        /* SO is a locking shift: only needed when it is not active yet */
        if (st->ishiftfunc != SO) {
            if (*outbytesleft < 1)
                goto nospace;
            *(*outbuf)++ = SO;
            (*outbytesleft)--;
            st->ishiftfunc = SO;
        }
    } else {
        /* Output the SS2/SS3 shift function only when we have sufficient
           space for the 2 cns code bytes also */
        if (*outbytesleft < 4)
            goto nospace;
        out = *outbuf;
        out[0] = ESC;
        out[1] = sshift;
        *outbuf = out + 2;
        *outbytesleft -= 2;
    }

    /* Output the cns code */
    if (*outbytesleft < 2)
        goto nospace;
    out = *outbuf;
    out[0] = code[0];
    out[1] = code[1];
    *outbuf = out + 2;
    *outbytesleft -= 2;
    return (0);

nospace:
    st->_errno = errno = E2BIG;
    return (-1);
}
/* getcnsbytes() : Splits the value stored at index idx of big5_cns_tab into
 *                 a plane number and two code bytes.
 *
 *                 Bits 16 and up of the value select the plane: 0x21 - 0x30
 *                 stand for planes 1 - 16, so that the result can be used
 *                 with GET_PLANEC(). The one exception is 0x2e, which is
 *                 reported as plane 3 instead of 14.
 *                 NOTE(review): the original author's remark on that case
 *                 mentions CNS 11643-1992 and questions the value 3; the
 *                 reason is not visible in this file - confirm before
 *                 changing it.
 *
 *                 The low 16 bits are returned as cnsbytes[0] (high byte)
 *                 and cnsbytes[1] (low byte).
 *
 *                 Returns 0 on success. Returns -1, leaving cnsbytes and
 *                 *cnsplane untouched, when the plane value is outside
 *                 0x21 - 0x30.
 */
static int getcnsbytes(int idx, char *cnsbytes, int *cnsplane)
{
    const unsigned long entry = big5_cns_tab[idx].value;
    const int tabplane = (int) (entry >> 16);

    if (tabplane == 0x2e) {
        *cnsplane = 3;
    } else if (tabplane >= 0x21 && tabplane <= 0x30) {
        *cnsplane = tabplane - 0x20;
    } else {
        return (-1); /* Should not have happened */
    }

    cnsbytes[0] = (entry >> 8) & 0xff;
    cnsbytes[1] = entry & 0xff;
    return (0);
}
/* binsearch: find x in v[0] <= v[1] <= ... <= v[n-1]
 *
 *            v must be sorted in ascending order of its key member and n is
 *            the number of entries to search. Returns the index of an entry
 *            whose key equals x, or -1 when there is none (this includes
 *            n <= 0).
 */
static int binsearch(unsigned long x, table_t v[], int n)
{
    int low = 0;
    int high = n - 1;

    while (low <= high) {
        /* low + (high - low) / 2 rather than (low + high) / 2: the sum of
           two valid indices can overflow int for very large tables */
        int mid = low + (high - low) / 2;

        if (x < v[mid].key)
            high = mid - 1;
        else if (x > v[mid].key)
            low = mid + 1;
        else /* found match */
            return (mid);
    }
    return (-1); /* no match */
}
|