Home | History | Annotate | Download | only in zh
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright(c) 1998 Sun Microsystems, Inc.
     23  * All right reserved.
     24  */
     25 #pragma ident "@(#)UTF-8_to_zh_CN.iso2022-CN.c	1.3 00/11/04"
     26 
#include <stdio.h>
#include <errno.h>
#include <stdlib.h>
#include <sys/types.h>
#include <public_struc.h>
#include <unicode_gb2312.h>
#include <unicode_cns11643_CN.h>
#ifdef DEBUG
#include <fcntl.h>
#include <sys/stat.h>
#include <unistd.h>
#endif
#include "common_defs.h"
     39 
/*
 * Control bytes written to the ISO-2022-CN output stream.
 *
 * SI is written before an ASCII byte when the converter is in the OUT
 * state; SO is written before a two-byte character when it is in the IN
 * state or right after a designation sequence.  ESC starts the four-byte
 * designation sequence ESC '$' ')' <final>.
 * SS2, SS3 and MSB_OFF are not referenced anywhere in the visible part of
 * this file.
 */
#define	SI	0x0f
#define	SO	0x0e
#define SS2 0x4e
#define SS3 0x4f
#define	ESC	0x1b
/* High bit of an input byte; clear means the byte is plain ASCII. */
#define	MSB	0x80
#define MSB_OFF 0x7f

/*
 * Two-byte replacement emitted (with the 'A' designation) for a character
 * that has no usable entry in either mapping table.
 */
#define	NON_ID_CHAR1	0x21
#define NON_ID_CHAR2	0x75

/*
 * Per-descriptor conversion state, allocated by _icv_open() and released
 * by _icv_close().
 */
typedef struct _icv_state {
	/* Position inside the UTF-8 sequence being read (enum _USTATE). */
	short	_ustate;
	/* Shift state of the output: IN or OUT (enum _ISTATE). */
	short	_istate;
	/*
	 * Character set most recently designated in the output
	 * (enum _GSTATE); _icv_open() stores -1 here, which matches none
	 * of them.
	 */
	short	_gstate;
	/*
	 * Bytes of the UTF-8 sequence collected so far.  The visible code
	 * only stores into elements 0..2.
	 */
	char	_keepc[6];
	/* Set to EILSEQ by the conversion loop to stop on bad input. */
	int		_errno;
} _iconv_st;

/*
 * UTF-8 decoder states:
 *   U0      between characters
 *   U1      lead byte of a 2-byte sequence has been stored
 *   U2, U3  lead byte / first two bytes of a 3-byte sequence stored
 *   U4      a complete 2- or 3-byte character is waiting to be written
 *   U5..U7  progress through a 4-byte sequence, which is validated and
 *           then dropped without producing output
 */
enum	_USTATE	{ U0, U1, U2, U3, U4, U5, U6, U7 };
/* IN: output is in the ASCII shift state; OUT: SO has been written. */
enum	_ISTATE	{ IN, OUT };
/*
 * Designated set: G0 is announced with final byte 'A' (entries found in
 * unicode_gb_tab), G1 and G2 with 'G' and 'H' (entries found in
 * utf_cns_tab, selected by the plane byte of the table value).
 */
enum	_GSTATE	{ G0, G1, G2 };
     62 
     63 
     64 /*
     65  *	Open; called from iconv_open()
     66  */
     67 void * _icv_open() {
     68 	_iconv_st * st;
     69 	if ((st = (_iconv_st *)malloc(sizeof(_iconv_st))) == NULL) {
     70 		errno = ENOMEM;
     71 		return (void *)-1;
     72 	}
     73 
     74 	st->_ustate = U0;
     75 	st->_istate = IN;
     76 	st->_gstate = -1;
     77 	st->_errno = 0;
     78 
     79 	return (void *)st;
     80 }
     81 
     82 /*
     83  *	Close; called from iconv_close()
     84  */
     85 
     86 void _icv_close(_iconv_st *st) {
     87 	if (st == NULL)
     88 		errno = EBADF;
     89 	else
     90 		free(st);
     91 }
     92 
     93 /*
     94  *	Actual conversion; called from iconv()
     95  */
     96 
     97 size_t _icv_iconv(_iconv_st *st, char **inbuf, size_t *inbytesleft,
     98 					char **outbuf, size_t *outbytesleft) {
     99 	char c1, c2;
    100 	int n;
    101 	unsigned long key;
    102 	unsigned long gbk;
    103 	int index;
    104 	short new_state;
    105 
    106 #ifdef DEBUG
    107 	fprintf(stderr, "in length is %d\toutlength is %d\n",
    108 			*inbytesleft, *outbytesleft);
    109 #endif
    110 	if (st == NULL) {
    111 		errno = EBADF;
    112 		return ((size_t)-1);
    113 	}
    114 
    115 	if (inbuf == NULL || *inbuf == NULL) {	/* Reset request. */
    116 		st->_ustate = U0;
    117 		st->_istate = IN;
    118 		st->_gstate = G0;
    119 		st->_errno = 0;
    120 		return ((size_t)0);
    121 	}
    122 
    123 	errno = 0;
    124 	while (*inbytesleft > 0 && *outbytesleft > 0) {
    125 
    126 	        uchar_t  first_byte;
    127 
    128 		switch (st->_ustate) {
    129 			case U0:
    130 				if ((**inbuf & MSB) == 0) {	/* ASCII */
    131 					if (st->_istate == OUT) {
    132 						if (*outbytesleft < 2) {
    133 #ifdef DEBUG
    134 							fprintf(stderr, "11111 outbytesleft is %d\n", *outbytesleft);
    135 #endif
    136 							errno = E2BIG;
    137 							return (size_t) -1;
    138 						}
    139 						st->_istate = IN;
    140 						**outbuf = SI;
    141 						(*outbuf)++;
    142 						(*outbytesleft)--;
    143 					}
    144 					if (*outbytesleft < 1) {
    145 #ifdef DEBUG
    146 						fprintf(stderr, "22222 outbytesleft is %d\n", *outbytesleft);
    147 #endif
    148 						errno = E2BIG;
    149 						return (size_t) -1;
    150 					}
    151 					**outbuf = **inbuf;
    152 					(*outbuf)++;
    153 					(*outbytesleft)--;
    154 				} else {	/* Chinese charactor */
    155 					if ((**inbuf & 0xe0) == 0xc0) {	/* 2-byte unicode 0xc2..0xdf */
    156 
    157 					   /* invalid sequence if the first char is either 0xc0 or 0xc1 */
    158 					   if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
    159 					        st->_errno = errno = EILSEQ;
    160 					   else {
    161 						st->_ustate = U1;
    162 						st->_keepc[0] = **inbuf;
    163 					   }
    164 					} else if ((**inbuf & 0xf0) == 0xe0) {	/* 3-bytes unicode */
    165 						st->_ustate = U2;
    166 						st->_keepc[0] = **inbuf;
    167 					} else {
    168 
    169 					   /* four bytes of UTF-8 sequences */
    170 					   if ( number_of_bytes_in_utf8_char[((uchar_t)**inbuf)] == ICV_TYPE_ILLEGAL_CHAR )
    171 						st->_errno = errno = EILSEQ;
    172 					   else
    173 					     {
    174 						st->_ustate = U5;
    175 						st->_keepc[0] = **inbuf;
    176 					     }
    177 #ifdef DEBUG
    178 						fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]);
    179 #endif
    180 					}
    181 				}
    182 				break;
    183 
    184 			case U1:	/* 2-byte unicode */
    185 				if ((**inbuf & 0xc0) == 0x80) {	/* 2nd byte is 1xxxxxxx */
    186 					st->_ustate = U4;
    187 					st->_keepc[1] = **inbuf;
    188 					c1 = (st->_keepc[0] & 0x1c)>>2;
    189 					c2 = ((st->_keepc[0] & 0x03) << 6) | \
    190 							(st->_keepc[1] & 0x3f);
    191 					continue;
    192 				} else {
    193 					st->_errno = errno = EILSEQ;
    194 #ifdef DEBUG
    195 					fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]);
    196 #endif
    197 				}
    198 				break;
    199 
    200 			case U2:	/* 3-byte unicode - 2nd byte */
    201 		                first_byte = st->_keepc[0];
    202 
    203 		                /* if the first byte is 0xed, it is illegal sequence if the second
    204 				 * one is between 0xa0 and 0xbf because surrogate section is ill-formed
    205 				 */
    206 		                if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
    207 				    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
    208 		                        st->_errno = errno = EILSEQ;
    209 		                else {
    210 					st->_ustate = U3;
    211 					st->_keepc[1] = **inbuf;
    212 				}
    213 				break;
    214 
    215 			case U3:	/* 3-byte unicode - 3th byte */
    216 				if ((**inbuf & 0xc0) == 0x80) {
    217 					st->_ustate = U4;
    218 					st->_keepc[2] = **inbuf;
    219 					c1 = ((st->_keepc[0] & 0x0f) << 4) | \
    220 							((st->_keepc[1] & 0x3c) >> 2);
    221 					c2 = ((st->_keepc[1] & 0x03) << 6) | \
    222 							(st->_keepc[2] & 0x3f);
    223 					continue;
    224 				} else {
    225 					st->_errno = errno = EILSEQ;
    226 #ifdef DEBUG
    227 					fprintf(stderr, "state = %d, keepc is %x\n", st->_ustate, st->_keepc[0]);
    228 #endif
    229 				}
    230 				break;
    231 
    232 			case U4:	/* Generate iso2022 sequence */
    233 				key = ((c1 & 0xff) << 8) | (c2 & 0xff);
    234 
    235 		                /* 0xFFFE and 0xFFFF should not be allowed */
    236 		                if ( key == 0xFFFE || key == 0xFFFF ) {
    237 				        st->_errno = errno = EILSEQ;
    238 				        break;
    239 				}
    240 
    241 				if ((index = binary_search(key, unicode_gb_tab, UNICODEMAX)) != -1) {	/* GB code set */
    242 					gbk = unicode_gb_tab[index].value;
    243 					if (st->_gstate != G0) {
    244 						if (*outbytesleft < 7) {
    245 #ifdef DEBUG
    246 							fprintf(stderr, "33333 outbytesleft is %d\n", *outbytesleft);
    247 #endif
    248 							errno = E2BIG;
    249 							return ((size_t)-1);
    250 						}
    251 						st->_istate = OUT;
    252 						st->_gstate = G0;
    253 						**outbuf = ESC;
    254 						*(*outbuf + 1) = '$';
    255 						*(*outbuf + 2) = ')';
    256 						*(*outbuf + 3) = 'A';
    257 						*(*outbuf + 4) = SO;
    258 						*(*outbuf + 5) = (gbk & 0xff00) >> 8;
    259 						*(*outbuf + 6) = gbk & 0xff;
    260 						n = 7;
    261 					} else if (st->_istate == IN) {
    262 						if (*outbytesleft < 3) {
    263 #ifdef DEBUG
    264 							fprintf(stderr, "44444outbytesleft is %d\n", *outbytesleft);
    265 #endif
    266 							errno = E2BIG;
    267 							return ((size_t) -1);
    268 						}
    269 						st->_istate = OUT;
    270 						**(outbuf) = SO;
    271 						*(*outbuf + 1) = (gbk & 0xff00) >> 8;
    272 						*(*outbuf + 2) = gbk & 0xff;
    273 						n = 3;
    274 					} else {
    275 					        if ( *outbytesleft < 2 ) {
    276 						   errno = E2BIG;
    277 						   return ((size_t)-1);
    278 					        }
    279 
    280 						**outbuf = (gbk & 0xff00) >> 8;
    281 						*(*outbuf + 1) = gbk & 0xff;
    282 						n = 2;
    283 					}
    284 				} else if ((index = binary_search(key, utf_cns_tab, MAX_UTF_NUM)) != -1) {
    285 					gbk = utf_cns_tab[index].value;
    286 					new_state = ((gbk >> 16 ) & 0xff) - 0x20;
    287 					if (new_state == G2 || new_state == G1) {
    288 						if (st->_gstate != new_state) {
    289 							if (*outbytesleft < 7) {
    290 #ifdef DEBUG
    291 								fprintf(stderr, "55555 outbytesleft is %d\n", *outbytesleft);
    292 #endif
    293 								errno = E2BIG;
    294 								return (size_t) -1;
    295 							}
    296 							**outbuf = ESC;
    297 							*(*outbuf + 1) = '$';
    298 							*(*outbuf + 2) = ')';
    299 							*(*outbuf + 3) = 'G' + new_state - 1;
    300 							st->_istate = OUT;
    301 							st->_gstate = new_state;
    302 							*(*outbuf + 4) = SO;
    303 							*(*outbuf + 5) = (gbk & 0xff00) >> 8;
    304 							*(*outbuf + 6) = gbk & 0xff;
    305 							n = 7;
    306 						} else if (st->_istate == IN) {
    307 							if (*outbytesleft < 3) {
    308 #ifdef DEBUG
    309 								fprintf(stderr, "66666 outbytesleft is %d\n", *outbytesleft);
    310 #endif
    311 								errno = E2BIG;
    312 								return (size_t) -1;
    313 							}
    314 							st->_istate = OUT;
    315 							**outbuf = SO;
    316 							*(*outbuf + 1) = (gbk & 0xff00) >> 8;
    317 							*(*outbuf + 2) = gbk & 0xff;
    318 							n = 3;
    319 						} else {
    320 							if (*outbytesleft < 2) {
    321 #ifdef DEBUG
    322 								fprintf(stderr, "77777 outbytesleft is %d\n", *outbytesleft);
    323 #endif
    324 								errno = E2BIG;
    325 								return (size_t) -1;
    326 							}
    327 							**outbuf = (gbk & 0xff00) >> 8;
    328 							*(*outbuf + 1) = gbk & 0xff;
    329 							n = 2;
    330 						}
    331 					} else if (new_state > G2) {
    332 						if (st->_gstate != G0) {
    333 							if (*outbytesleft < 7) {
    334 #ifdef DEBUG
    335 								fprintf(stderr, " 888888 outbytesleft is %d\n", *outbytesleft);
    336 #endif
    337 								errno = E2BIG;
    338 								return (size_t) -1;
    339 							}
    340 							st->_gstate = G0;
    341 							st->_istate = OUT;
    342 							**outbuf = ESC;
    343 							*(*outbuf + 1) = '$';
    344 							*(*outbuf + 2) = ')';
    345 							*(*outbuf + 3) = 'A';
    346 							*(*outbuf + 4) = SO;
    347 							*(*outbuf + 5) = NON_ID_CHAR1;
    348 							*(*outbuf + 6) = NON_ID_CHAR2;
    349 							n = 7;
    350 						} else if (st->_istate == IN) {
    351 							if (*outbytesleft < 3) {
    352 #ifdef DEBUG
    353 								fprintf(stderr, "99999 outbytesleft is %d\n", *outbytesleft);
    354 #endif
    355 								errno = E2BIG;
    356 								return (size_t) -1;
    357 							}
    358 							st->_gstate = G0;
    359 							st->_istate = OUT;
    360 							**outbuf = SO;
    361 							*(*outbuf + 1) = NON_ID_CHAR1;
    362 							*(*outbuf + 2) = NON_ID_CHAR2;
    363 							n = 3;
    364 						} else {
    365 							if (*outbytesleft < 2) {
    366 #ifdef DEBUG
    367 								fprintf(stderr, "aaaaaaoutbytesleft is %d\n", *outbytesleft);
    368 #endif
    369 								errno = E2BIG;
    370 								return (size_t) -1;
    371 							}
    372 							**outbuf = NON_ID_CHAR1;
    373 							*(*outbuf + 1) = NON_ID_CHAR2;
    374 							n = 2;
    375 						}
    376 					}
    377 				} else {	/* Non-GB & Non-Big5 */
    378 					if (st->_gstate != G0) {
    379 						if (*outbytesleft < 7) {
    380 							errno = E2BIG;
    381 							return (size_t) -1;
    382 						}
    383 						st->_gstate = G0;
    384 						st->_istate = OUT;
    385 						**outbuf = ESC;
    386 						*(*outbuf + 1) = '$';
    387 						*(*outbuf + 2) = ')';
    388 						*(*outbuf + 3) = 'A';
    389 						*(*outbuf + 4) = SO;
    390 						*(*outbuf + 5) = NON_ID_CHAR1;
    391 						*(*outbuf + 6) = NON_ID_CHAR2;
    392 						n = 7;
    393 					} else if (st->_istate == IN) {
    394 						if(*outbytesleft < 3) {
    395 							errno = E2BIG;
    396 							return (size_t) -1;
    397 						}
    398 						st->_istate = OUT;
    399 						st->_gstate = G0;
    400 						**outbuf = SO;
    401 						*(*outbuf + 1) = NON_ID_CHAR1;
    402 						*(*outbuf + 2) = NON_ID_CHAR2;
    403 						n = 3;
    404 					} else {
    405 					        /* add sanity check to avoid segment error */
    406 						if (*outbytesleft < 2) {
    407 							errno = E2BIG;
    408 							return (size_t) -1;
    409 						}
    410 						**outbuf = NON_ID_CHAR1;
    411 						*(*outbuf + 1) = NON_ID_CHAR2;
    412 						n = 2;
    413 					}
    414 				}
    415 /*
    416 					n = gen_undef(st, *outbuf, *outbytesleft);
    417 					fprintf(stderr, "gen_undef return %d\n", n );
    418 				}
    419  */
    420 				if (n > 0) {
    421 					(*outbuf) += n;
    422 					(*outbytesleft) -= n;
    423 				} else {
    424 #ifdef DEBUG
    425 					fprintf(stderr, "bbbbb outbytesleft is %d\n", *outbytesleft);
    426 #endif
    427 					errno = E2BIG;
    428 					return ((size_t)-1);
    429 				}
    430 				st->_ustate = U0;
    431 				break;
    432 
    433 		        case U5:
    434 		                first_byte = st->_keepc[0];
    435 
    436 		                /* if the first byte is 0xf0, it is illegal sequence if
    437 				 * the second one is between 0x80 and 0x8f
    438 				 * for Four-Byte UTF: U+10000..U+10FFFF
    439 				 */
    440 		                if (((uchar_t)**inbuf) < valid_min_2nd_byte[first_byte] ||
    441 				    ((uchar_t)**inbuf) > valid_max_2nd_byte[first_byte] )
    442 		                    st->_errno = errno = EILSEQ;
    443 		                else {
    444 				   st->_ustate = U6;
    445 				   st->_keepc[1] = **inbuf;
    446 				}
    447 		                break;
    448 		        case U6:
    449 		                if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */
    450 		                  {
    451 				     st->_ustate = U7;
    452 				     st->_keepc[2] = **inbuf;
    453 				  }
    454 		                else
    455 		                     st->_errno = errno = EILSEQ;
    456 		                break;
    457 		        case U7:
    458 		                if ((**inbuf & 0xc0) == 0x80) /* 0x80..0xbf */
    459 		                  {  /* skip it to simplify */
    460 				     st->_ustate = U0;
    461 				  }
    462 		                else
    463 		                     st->_errno = errno = EILSEQ;
    464 		                break;
    465 			default:
    466 				st->_errno = errno = EILSEQ;
    467 #ifdef DEBUG
    468 				fprintf(stderr, "WHY HERE\n");
    469 #endif
    470 				st->_ustate = U0;	/* reset state */
    471 				break;
    472 		}	/* end of switc */
    473 		if (st->_errno)
    474 			break;
    475 		(*inbuf)++;
    476 		(*inbytesleft)--;
    477 	}
    478 
    479         if (errno)
    480 		return ((size_t)-1);
    481 
    482         if (*inbytesleft == 0 && st->_ustate != U0)
    483          {
    484 	    errno = EINVAL;
    485 	    return ((size_t) -1);
    486          }
    487 
    488 	if (*inbytesleft > 0 && *outbytesleft == 0) {
    489 #ifdef DEBUG
    490 		fprintf(stderr, "cccccc outbytesleft is %d\n", *outbytesleft);
    491 #endif
    492 		errno = E2BIG;
    493 		return ((size_t)-1);
    494 	}
    495 	return ((size_t)(*inbytesleft));
    496 }
    497 
    498 /*
    499  *	gen_undef(); Called when a char non-gb and non-big5 found.
    500  */
    501 int gen_undef(_iconv_st * st, char * outbuf, int bytes) {
    502 	if (st->_gstate != G0) {
    503 		if (bytes < 7) {
    504 #ifdef DEBUG
    505 			fprintf(stderr, "in gen outbytesleft is %d\n", bytes);
    506 #endif
    507 			errno = st->_errno = E2BIG;
    508 			return -1;
    509 		}
    510 		st->_gstate = G0;
    511 		st->_istate = OUT;
    512 		*outbuf = ESC;
    513 		*(outbuf + 1) = '$';
    514 		*(outbuf + 2) = ')';
    515 		*(outbuf + 3) = 'A';
    516 		*(outbuf + 4) = SO;
    517 		*(outbuf + 5) = NON_ID_CHAR1;
    518 		*(outbuf + 6) = NON_ID_CHAR2;
    519 		return 7;
    520 	}
    521 	if (st->_istate == IN) {
    522 		if (bytes < 3) {
    523 #ifdef DEBUG
    524 			fprintf(stderr, "in gen outbytesleft is %d\n", bytes);
    525 #endif
    526 			errno = st->_errno = E2BIG;
    527 			return -1;
    528 		}
    529 		st->_istate = OUT;
    530 		*outbuf = SO;
    531 		*(outbuf + 1) = NON_ID_CHAR1;
    532 		*(outbuf + 2) = NON_ID_CHAR2;
    533 		return 3;
    534 	}
    535 	if (bytes < 2) {
    536 #ifdef DEBUG
    537 		fprintf(stderr, "in gen outbytesleft is %d\n", bytes);
    538 #endif
    539 		errno = st->_errno = E2BIG;
    540 		return -1;
    541 	}
    542 	*outbuf = NON_ID_CHAR1;
    543 	*(outbuf + 1) = NON_ID_CHAR2;
    544 	return 2;
    545 }
    546 
    547 /*
    548  *	binary_search();
    549  */
    550 int binary_search(unsigned long key, table_t *table, int tab_len) {
    551 	int i, low, high;
    552 
    553 	for (low = 0, high = tab_len-1; low < high; ) {
    554 		if (table[low].key == key)
    555 			return low;
    556 		if (table[high].key == key)
    557 			return high;
    558 		i = (low + high) >> 1;
    559 		if (table[i].key == key)
    560 			return i;
    561 		if (table[i].key < key)
    562 			low = i + 1;
    563 		else
    564 			high = i - 1;
    565 	}
    566 	return -1;
    567 }
    568 
    569 #ifdef DEBUG
    570 main(int argc, char ** argv) {
    571 	_iconv_st	* st;
    572 	int fd;
    573 	char * in_str;
    574 	char * out_str;
    575 	char * tmp_in;
    576 	char * tmp_out;
    577 	unsigned int in_len;
    578 	unsigned int out_len;
    579 
    580 	struct stat s;
    581 
    582 	if (argc < 2) {
    583 		fprintf(stderr, "Usage: %s input\n", argv[0]);
    584 		exit(-1);
    585 	}
    586 
    587 	if (stat(argv[1], &s) == -1) {
    588 		perror("stat");
    589 		exit(-1);
    590 	}
    591 
    592 	if ((fd = open(argv[1], O_RDONLY)) == -1) {
    593 		perror("open");
    594 		exit(-1);
    595 	}
    596 
    597 	tmp_in = in_str = (char *) malloc(1024);
    598 	tmp_out = out_str = (char *) malloc(1024);
    599 	if (!in_str || !out_str) {
    600 		perror("malloc");
    601 		exit(-3);
    602 		free(in_str);
    603 		free(out_str);
    604 	}
    605 	in_len = s.st_size;
    606 	out_len = s.st_size << 2;
    607 	st = _icv_open();
    608 	if (st == (_iconv_st *) -1) {
    609 		perror("_icv_open");
    610 		free(in_str);
    611 		free(out_str);
    612 		exit(-3);
    613 	}
    614 
    615 	while (1) {
    616 	in_len = 1024;
    617 	out_len = 1024;
    618 	in_str = tmp_in;
    619 	out_str = tmp_out;
    620 
    621 	if (!read(fd, in_str, in_len))
    622 		exit(0);
    623 
    624 	if (_icv_iconv(st, &in_str, &in_len, &out_str, &out_len) == -1) {
    625 		perror("icv_iconv");
    626 		fprintf(stderr, "\ninbytesleft = %d\n", in_len);
    627 		exit(-2);
    628 	}
    629 	fprintf(stderr, "Result is in len %d, out len %d\n", in_len,
    630 	out_len);
    631 	if (write(1, tmp_out, 4096 - out_len) == -1) {
    632 		perror("write");
    633 	}
    634 	}	/* end of while */
    635 
    636 	free(tmp_in);
    637 	free(tmp_out);
    638 	close(fd);
    639 	_icv_close(st);
    640 }
    641 #endif
    642