Home | History | Annotate | Download | only in common
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  *
     21  * Copyright 2008 Sun Microsystems, Inc.  All rights reserved.
     22  * Use is subject to license terms.
     23  */
     24 
     25 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     26 
     27 #include <errno.h>
     28 #include <locale.h>
     29 #include <langinfo.h>
     30 #include <iconv.h>
     31 #include <ctype.h>
     32 #include <strings.h>
     33 #include <string.h>
     34 #include <stdio.h>
     35 #include <stdlib.h>
     36 #include "includes.h"
     37 #include "xmalloc.h"
     38 #include "xlist.h"
     39 
     40 #ifdef MIN
     41 #undef MIN
     42 #endif /* MIN */
     43 
     44 #define	MIN(x, y)	((x) < (y) ? (x) : (y))
     45 
     46 #define	LOCALE_PATH	"/usr/bin/locale"
     47 
/* two-char language code, '-' and two-char country code */
     49 #define	LANGTAG_MAX	5
     50 
     51 static uchar_t *do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf,
     52     uint_t len, uint_t *outlen, int *err, uchar_t **err_str);
     53 
     54 static int locale_cmp(const void *d1, const void *d2);
     55 static char *g11n_locale2langtag(char *locale);
     56 
     57 uint_t g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str);
     58 uint_t g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str);
     59 
     60 /*
     61  * Convert locale string name into a language tag. The caller is responsible for
     62  * freeing the memory allocated for the result.
     63  */
     64 static char *
     65 g11n_locale2langtag(char *locale)
     66 {
     67 	char *langtag;
     68 
     69 	/* base cases */
     70 	if (!locale || !*locale)
     71 		return (NULL);
     72 
     73 	if (strcmp(locale, "POSIX") == 0 || strcmp(locale, "C") == 0)
     74 		return (xstrdup("i-default"));
     75 
     76 	/* punt for language codes which are not exactly 2 letters */
     77 	if (strlen(locale) < 2 ||
     78 	    !isalpha(locale[0]) ||
     79 	    !isalpha(locale[1]) ||
     80 	    (locale[2] != '\0' &&
     81 	    locale[2] != '_' &&
     82 	    locale[2] != '.' &&
     83 	    locale[2] != '@'))
     84 		return (NULL);
     85 
     86 
     87 	/* we have a primary language sub-tag */
     88 	langtag = (char *)xmalloc(LANGTAG_MAX + 1);
     89 
     90 	strncpy(langtag, locale, 2);
     91 	langtag[2] = '\0';
     92 
     93 	/* do we have country sub-tag? For example: cs_CZ */
     94 	if (locale[2] == '_') {
     95 		if (strlen(locale) < 5 ||
     96 		    !isalpha(locale[3]) ||
     97 		    !isalpha(locale[4]) ||
     98 		    (locale[5] != '\0' && (locale[5] != '.' &&
     99 		    locale[5] != '@'))) {
    100 			return (langtag);
    101 		}
    102 
    103 		/* example: create cs-CZ from cs_CZ */
    104 		if (snprintf(langtag, 6, "%.*s-%.*s", 2, locale, 2,
    105 		    locale + 3) == 5)
    106 			return (langtag);
    107 	}
    108 
    109 	/* in all other cases we just use the primary language sub-tag */
    110 	return (langtag);
    111 }
    112 
    113 uint_t
    114 g11n_langtag_is_default(char *langtag)
    115 {
    116 	return (strcmp(langtag, "i-default") == 0);
    117 }
    118 
    119 /*
    120  * This lang tag / locale matching function works only for two-character
    121  * language primary sub-tags and two-character country sub-tags.
    122  */
    123 uint_t
    124 g11n_langtag_matches_locale(char *langtag, char *locale)
    125 {
    126 	/* match "i-default" to the process' current locale if possible */
    127 	if (g11n_langtag_is_default(langtag)) {
    128 		if (strcasecmp(locale, "POSIX") == 0 ||
    129 		    strcasecmp(locale, "C") == 0)
    130 			return (1);
    131 		else
    132 			return (0);
    133 	}
    134 
    135 	/*
    136 	 * locale must be at least 2 chars long and the lang part must be
    137 	 * exactly two characters
    138 	 */
    139 	if (strlen(locale) < 2 ||
    140 	    (!isalpha(locale[0]) || !isalpha(locale[1]) ||
    141 	    (locale[2] != '\0' && locale[2] != '_' &&
    142 	    locale[2] != '.' && locale[2] != '@')))
    143 		return (0);
    144 
    145 	/* same thing with the langtag */
    146 	if (strlen(langtag) < 2 ||
    147 	    (!isalpha(langtag[0]) || !isalpha(langtag[1]) ||
    148 	    (langtag[2] != '\0' && langtag[2] != '-')))
    149 		return (0);
    150 
    151 	/* primary language sub-tag and the locale's language part must match */
    152 	if (strncasecmp(langtag, locale, 2) != 0)
    153 		return (0);
    154 
    155 	/*
    156 	 * primary language sub-tag and the locale's language match, now
    157 	 * fuzzy check country part
    158 	 */
    159 
    160 	/* neither langtag nor locale have more than one component */
    161 	if (langtag[2] == '\0' &&
    162 	    (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@'))
    163 		return (2);
    164 
    165 	/* langtag has only one sub-tag... */
    166 	if (langtag[2] == '\0')
    167 		return (1);
    168 
    169 	/* locale has no country code... */
    170 	if (locale[2] == '\0' || locale[2] == '.' || locale[2] == '@')
    171 		return (1);
    172 
    173 	/* langtag has more than one subtag and the locale has a country code */
    174 
    175 	/* ignore second subtag if not two chars */
    176 	if (strlen(langtag) < 5)
    177 		return (1);
    178 
    179 	if (!isalpha(langtag[3]) || !isalpha(langtag[4]) ||
    180 	    (langtag[5] != '\0' && langtag[5] != '-'))
    181 		return (1);
    182 
    183 	/* ignore rest of locale if there is no two-character country part */
    184 	if (strlen(locale) < 5)
    185 		return (1);
    186 
    187 	if (locale[2] != '_' || !isalpha(locale[3]) || !isalpha(locale[4]) ||
    188 	    (locale[5] != '\0' && locale[5] != '.' && locale[5] != '@'))
    189 		return (1);
    190 
    191 	/* if the country part matches, return 2 */
    192 	if (strncasecmp(&langtag[3], &locale[3], 2) == 0)
    193 		return (2);
    194 
    195 	return (1);
    196 }
    197 
    198 char *
    199 g11n_getlocale()
    200 {
    201 	/* we have one text domain - always set it */
    202 	(void) textdomain(TEXT_DOMAIN);
    203 
    204 	/* if the locale is not set, set it from the env vars */
    205 	if (!setlocale(LC_MESSAGES, NULL))
    206 		(void) setlocale(LC_MESSAGES, "");
    207 
    208 	return (setlocale(LC_MESSAGES, NULL));
    209 }
    210 
    211 void
    212 g11n_setlocale(int category, const char *locale)
    213 {
    214 	char *curr;
    215 
    216 	/* we have one text domain - always set it */
    217 	(void) textdomain(TEXT_DOMAIN);
    218 
    219 	if (!locale)
    220 		return;
    221 
    222 	if (*locale && ((curr = setlocale(category, NULL))) &&
    223 	    strcmp(curr, locale) == 0)
    224 		return;
    225 
    226 	/* if <category> is bogus, setlocale() will do nothing */
    227 	(void) setlocale(category, locale);
    228 }
    229 
    230 char **
    231 g11n_getlocales()
    232 {
    233 	FILE *locale_out;
    234 	uint_t n_elems, list_size, long_line = 0;
    235 	char **list;
    236 	char locale[64];	/* 64 bytes is plenty for locale names */
    237 
    238 	if ((locale_out = popen(LOCALE_PATH " -a", "r")) == NULL)
    239 		return (NULL);
    240 
    241 	/*
    242 	 * start with enough room for 65 locales - that's a lot fewer than
    243 	 * all the locales available for installation, but a lot more than
    244 	 * what most users will need and install
    245 	 */
    246 	n_elems = 0;
    247 	list_size = 192;
    248 	list = (char **) xmalloc(sizeof (char *) * (list_size + 1));
    249 	memset(list, 0, sizeof (char *) * (list_size + 1));
    250 
    251 	while (fgets(locale, sizeof (locale), locale_out)) {
    252 		/* skip long locale names (if any) */
    253 		if (!strchr(locale, '\n')) {
    254 			long_line = 1;
    255 			continue;
    256 		} else if (long_line) {
    257 			long_line = 0;
    258 			continue;
    259 		}
    260 
    261 		if (strncmp(locale, "iso_8859", 8) == 0)
    262 			/* ignore locale names like "iso_8859-1" */
    263 			continue;
    264 
    265 		if (n_elems == list_size) {
    266 			list_size *= 2;
    267 			list = (char **)xrealloc((void *) list,
    268 			    (list_size + 1) * sizeof (char *));
    269 			memset(&list[n_elems + 1], 0,
    270 			    sizeof (char *) * (list_size - n_elems + 1));
    271 		}
    272 
    273 		*(strchr(locale, '\n')) = '\0';	/* remove the trailing \n */
    274 		list[n_elems++] = xstrdup(locale);
    275 	}
    276 
    277 	(void) pclose(locale_out);
    278 
    279 	if (n_elems == 0) {
    280 		xfree(list);
    281 		return (NULL);
    282 	}
    283 
    284 	list[n_elems] = NULL;
    285 
    286 	qsort(list, n_elems - 1, sizeof (char *), locale_cmp);
    287 	return (list);
    288 }
    289 
    290 char *
    291 g11n_getlangs()
    292 {
    293 	char *locale;
    294 
    295 	if (getenv("SSH_LANGS"))
    296 		return (xstrdup(getenv("SSH_LANGS")));
    297 
    298 	locale = g11n_getlocale();
    299 
    300 	if (!locale || !*locale)
    301 		return (xstrdup("i-default"));
    302 
    303 	return (g11n_locale2langtag(locale));
    304 }
    305 
    306 char *
    307 g11n_locales2langs(char **locale_set)
    308 {
    309 	char **p, **r, **q;
    310 	char *langtag, *langs;
    311 	int locales, skip;
    312 
    313 	for (locales = 0, p = locale_set; p && *p; p++)
    314 		locales++;
    315 
    316 	r = (char **)xmalloc((locales + 1) * sizeof (char *));
    317 	memset(r, 0, (locales + 1) * sizeof (char *));
    318 
    319 	for (p = locale_set; p && *p && ((p - locale_set) <= locales); p++) {
    320 		skip = 0;
    321 		if ((langtag = g11n_locale2langtag(*p)) == NULL)
    322 			continue;
    323 		for (q = r; (q - r) < locales; q++) {
    324 			if (!*q)
    325 				break;
    326 			if (*q && strcmp(*q, langtag) == 0)
    327 				skip = 1;
    328 		}
    329 		if (!skip)
    330 			*(q++) = langtag;
    331 		else
    332 			xfree(langtag);
    333 		*q = NULL;
    334 	}
    335 
    336 	langs = xjoin(r, ',');
    337 	g11n_freelist(r);
    338 
    339 	return (langs);
    340 }
    341 
    342 static int
    343 sortcmp(const void *d1, const void *d2)
    344 {
    345 	char *s1 = *(char **)d1;
    346 	char *s2 = *(char **)d2;
    347 
    348 	return (strcmp(s1, s2));
    349 }
    350 
    351 int
    352 g11n_langtag_match(char *langtag1, char *langtag2)
    353 {
    354 	int len1, len2;
    355 	char c1, c2;
    356 
    357 	len1 = (strchr(langtag1, '-')) ?
    358 	    (strchr(langtag1, '-') - langtag1)
    359 	    : strlen(langtag1);
    360 
    361 	len2 = (strchr(langtag2, '-')) ?
    362 	    (strchr(langtag2, '-') - langtag2)
    363 	    : strlen(langtag2);
    364 
    365 	/* no match */
    366 	if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0)
    367 		return (0);
    368 
    369 	c1 = *(langtag1 + len1);
    370 	c2 = *(langtag2 + len2);
    371 
    372 	/* no country sub-tags - exact match */
    373 	if (c1 == '\0' && c2 == '\0')
    374 		return (2);
    375 
    376 	/* one langtag has a country sub-tag, the other doesn't */
    377 	if (c1 == '\0' || c2 == '\0')
    378 		return (1);
    379 
    380 	/* can't happen - both langtags have a country sub-tag */
    381 	if (c1 != '-' || c2 != '-')
    382 		return (1);
    383 
    384 	/* compare country subtags */
    385 	langtag1 = langtag1 + len1 + 1;
    386 	langtag2 = langtag2 + len2 + 1;
    387 
    388 	len1 = (strchr(langtag1, '-')) ?
    389 	    (strchr(langtag1, '-') - langtag1) : strlen(langtag1);
    390 
    391 	len2 = (strchr(langtag2, '-')) ?
    392 	    (strchr(langtag2, '-') - langtag2) : strlen(langtag2);
    393 
    394 	if (len1 != len2 || strncmp(langtag1, langtag2, len1) != 0)
    395 		return (1);
    396 
    397 	/* country tags matched - exact match */
    398 	return (2);
    399 }
    400 
    401 char *
    402 g11n_langtag_set_intersect(char *set1, char *set2)
    403 {
    404 	char **list1, **list2, **list3, **p, **q, **r;
    405 	char *set3, *lang_subtag;
    406 	uint_t n1, n2, n3;
    407 	uint_t do_append;
    408 
    409 	list1 = xsplit(set1, ',');
    410 	list2 = xsplit(set2, ',');
    411 
    412 	for (n1 = 0, p = list1; p && *p; p++, n1++)
    413 		;
    414 	for (n2 = 0, p = list2; p && *p; p++, n2++)
    415 		;
    416 
    417 	list3 = (char **) xmalloc(sizeof (char *) * (n1 + n2 + 1));
    418 	*list3 = NULL;
    419 
    420 	/*
    421 	 * we must not sort the user langtags - sorting or not the server's
    422 	 * should not affect the outcome
    423 	 */
    424 	qsort(list2, n2, sizeof (char *), sortcmp);
    425 
    426 	for (n3 = 0, p = list1; p && *p; p++) {
    427 		do_append = 0;
    428 		for (q = list2; q && *q; q++) {
    429 			if (g11n_langtag_match(*p, *q) != 2) continue;
    430 			/* append element */
    431 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
    432 				do_append = 1;
    433 				if (!*r)
    434 					break;
    435 				if (strcmp(*p, *r) == 0) {
    436 					do_append = 0;
    437 					break;
    438 				}
    439 			}
    440 			if (do_append && n3 <= (n1 + n2)) {
    441 				list3[n3++] = xstrdup(*p);
    442 				list3[n3] = NULL;
    443 			}
    444 		}
    445 	}
    446 
    447 	for (p = list1; p && *p; p++) {
    448 		do_append = 0;
    449 		for (q = list2; q && *q; q++) {
    450 			if (g11n_langtag_match(*p, *q) != 1)
    451 				continue;
    452 
    453 			/* append element */
    454 			lang_subtag = xstrdup(*p);
    455 			if (strchr(lang_subtag, '-'))
    456 				*(strchr(lang_subtag, '-')) = '\0';
    457 			for (r = list3; (r - list3) <= (n1 + n2); r++) {
    458 				do_append = 1;
    459 				if (!*r)
    460 					break;
    461 				if (strcmp(lang_subtag, *r) == 0) {
    462 					do_append = 0;
    463 					break;
    464 				}
    465 			}
    466 			if (do_append && n3 <= (n1 + n2)) {
    467 				list3[n3++] = lang_subtag;
    468 				list3[n3] = NULL;
    469 			} else
    470 				xfree(lang_subtag);
    471 		}
    472 	}
    473 
    474 	set3 = xjoin(list3, ',');
    475 	xfree_split_list(list1);
    476 	xfree_split_list(list2);
    477 	xfree_split_list(list3);
    478 
    479 	return (set3);
    480 }
    481 
    482 char *
    483 g11n_clnt_langtag_negotiate(char *clnt_langtags, char *srvr_langtags)
    484 {
    485 	char *list, *result;
    486 	char **xlist;
    487 
    488 	/* g11n_langtag_set_intersect uses xmalloc - should not return NULL */
    489 	list = g11n_langtag_set_intersect(clnt_langtags, srvr_langtags);
    490 
    491 	if (!list)
    492 		return (NULL);
    493 
    494 	xlist = xsplit(list, ',');
    495 
    496 	xfree(list);
    497 
    498 	if (!xlist || !*xlist)
    499 		return (NULL);
    500 
    501 	result = xstrdup(*xlist);
    502 	xfree_split_list(xlist);
    503 
    504 	return (result);
    505 }
    506 
    507 /*
    508  * Compare locales, preferring UTF-8 codesets to others, otherwise doing
    509  * a stright strcmp()
    510  */
    511 static int
    512 locale_cmp(const void *d1, const void *d2)
    513 {
    514 	char *dot_ptr;
    515 	char *s1 = *(char **)d1;
    516 	char *s2 = *(char **)d2;
    517 	int s1_is_utf8 = 0;
    518 	int s2_is_utf8 = 0;
    519 
    520 	/* check if s1 is a UTF-8 locale */
    521 	if (((dot_ptr = strchr((char *)s1, '.')) != NULL) &&
    522 	    (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) &&
    523 	    (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) {
    524 		s1_is_utf8++;
    525 	}
    526 
    527 	/* check if s2 is a UTF-8 locale */
    528 	if (((dot_ptr = strchr((char *)s2, '.')) != NULL) &&
    529 	    (*dot_ptr != '\0') && (strncmp(dot_ptr + 1, "UTF-8", 5) == 0) &&
    530 	    (*(dot_ptr + 6) == '\0' || *(dot_ptr + 6) == '@')) {
    531 		s2_is_utf8++;
    532 	}
    533 
    534 	/* prefer UTF-8 locales */
    535 	if (s1_is_utf8 && !s2_is_utf8)
    536 		return (-1);
    537 
    538 	if (s2_is_utf8 && !s1_is_utf8)
    539 		return (1);
    540 
    541 	/* prefer any locale over the default locales */
    542 	if (strcmp(s1, "C") == 0 || strcmp(s1, "POSIX") == 0 ||
    543 	    strcmp(s1, "common") == 0) {
    544 		if (strcmp(s2, "C") != 0 && strcmp(s2, "POSIX") != 0 &&
    545 		    strcmp(s2, "common") != 0)
    546 			return (1);
    547 	}
    548 
    549 	if (strcmp(s2, "C") == 0 || strcmp(s2, "POSIX") == 0 ||
    550 	    strcmp(s2, "common") == 0) {
    551 		if (strcmp(s1, "C") != 0 &&
    552 		    strcmp(s1, "POSIX") != 0 &&
    553 		    strcmp(s1, "common") != 0)
    554 			return (-1);
    555 	}
    556 
    557 	return (strcmp(s1, s2));
    558 }
    559 
    560 
    561 char **
    562 g11n_langtag_set_locale_set_intersect(char *langtag_set, char **locale_set)
    563 {
    564 	char **langtag_list, **result, **p, **q, **r;
    565 	char *s;
    566 	uint_t do_append, n_langtags, n_locales, n_results, max_results;
    567 
    568 	/* count lang tags and locales */
    569 	for (n_locales = 0, p = locale_set; p && *p; p++)
    570 		n_locales++;
    571 
    572 	n_langtags = ((s = langtag_set) != NULL && *s && *s != ',') ? 1 : 0;
    573 	/* count the number of langtags */
    574 	for (; s = strchr(s, ','); s++, n_langtags++)
    575 		;
    576 
    577 	qsort(locale_set, n_locales, sizeof (char *), locale_cmp);
    578 
    579 	langtag_list = xsplit(langtag_set, ',');
    580 	for (n_langtags = 0, p = langtag_list; p && *p; p++, n_langtags++)
    581 		;
    582 
    583 	max_results = MIN(n_locales, n_langtags) * 2;
    584 	result = (char **) xmalloc(sizeof (char *) * (max_results + 1));
    585 	*result = NULL;
    586 	n_results = 0;
    587 
    588 	/* more specific matches first */
    589 	for (p = langtag_list; p && *p; p++) {
    590 		do_append = 0;
    591 		for (q = locale_set; q && *q; q++) {
    592 			if (g11n_langtag_matches_locale(*p, *q) == 2) {
    593 				do_append = 1;
    594 				for (r = result; (r - result) <=
    595 				    MIN(n_locales, n_langtags); r++) {
    596 					if (!*r)
    597 						break;
    598 					if (strcmp(*q, *r) == 0) {
    599 						do_append = 0;
    600 						break;
    601 					}
    602 				}
    603 				if (do_append && n_results < max_results) {
    604 					result[n_results++] = xstrdup(*q);
    605 					result[n_results] = NULL;
    606 				}
    607 				break;
    608 			}
    609 		}
    610 	}
    611 
    612 	for (p = langtag_list; p && *p; p++) {
    613 		do_append = 0;
    614 		for (q = locale_set; q && *q; q++) {
    615 			if (g11n_langtag_matches_locale(*p, *q) == 1) {
    616 				do_append = 1;
    617 				for (r = result; (r - result) <=
    618 				    MIN(n_locales, n_langtags); r++) {
    619 					if (!*r)
    620 						break;
    621 					if (strcmp(*q, *r) == 0) {
    622 						do_append = 0;
    623 						break;
    624 					}
    625 				}
    626 				if (do_append && n_results < max_results) {
    627 					result[n_results++] = xstrdup(*q);
    628 					result[n_results] = NULL;
    629 				}
    630 				break;
    631 			}
    632 		}
    633 	}
    634 
    635 	xfree_split_list(langtag_list);
    636 
    637 	return (result);
    638 }
    639 
    640 char *
    641 g11n_srvr_locale_negotiate(char *clnt_langtags, char **srvr_locales)
    642 {
    643 	char **results, **locales, *result = NULL;
    644 
    645 	if (srvr_locales == NULL)
    646 		locales = g11n_getlocales();
    647 	else
    648 		locales = srvr_locales;
    649 
    650 	if ((results = g11n_langtag_set_locale_set_intersect(clnt_langtags,
    651 	    locales)) == NULL)
    652 		goto err;
    653 
    654 	if (*results != NULL)
    655 		result = xstrdup(*results);
    656 
    657 	xfree_split_list(results);
    658 
    659 err:
    660 	if (locales != srvr_locales)
    661 		g11n_freelist(locales);
    662 	return (result);
    663 }
    664 
    665 
    666 /*
    667  * Functions for validating ASCII and UTF-8 strings
    668  *
    669  * The error_str parameter is an optional pointer to a char variable
    670  * where to store a string suitable for use with error() or fatal() or
    671  * friends.
    672  *
    673  * The return value is 0 if success, EILSEQ or EINVAL.
    674  *
    675  */
    676 uint_t
    677 g11n_validate_ascii(const char *str, uint_t len, uchar_t **error_str)
    678 {
    679 	uchar_t *p;
    680 
    681 	for (p = (uchar_t *)str; p && *p && (!(*p & 0x80)); p++)
    682 		;
    683 
    684 	if (len && ((p - (uchar_t *)str) != len))
    685 		return (EILSEQ);
    686 
    687 	return (0);
    688 }
    689 
    690 uint_t
    691 g11n_validate_utf8(const uchar_t *str, uint_t len, uchar_t **error_str)
    692 {
    693 	uchar_t *p;
    694 	uint_t c, l;
    695 
    696 	if (len == 0)
    697 		len = strlen((const char *)str);
    698 
    699 	for (p = (uchar_t *)str; p && (p - str < len) && *p; ) {
    700 		/* 8-bit chars begin a UTF-8 sequence */
    701 		if (*p & 0x80) {
    702 			/* get sequence length and sanity check first byte */
    703 			if (*p < 0xc0)
    704 				return (EILSEQ);
    705 			else if (*p < 0xe0)
    706 				l = 2;
    707 			else if (*p < 0xf0)
    708 				l = 3;
    709 			else if (*p < 0xf8)
    710 				l = 4;
    711 			else if (*p < 0xfc)
    712 				l = 5;
    713 			else if (*p < 0xfe)
    714 				l = 6;
    715 			else
    716 				return (EILSEQ);
    717 
    718 			if ((p + l - str) >= len)
    719 				return (EILSEQ);
    720 
    721 			/* overlong detection - build codepoint */
    722 			c = *p & 0x3f;
    723 			/* shift c bits from first byte */
    724 			c = c << (6 * (l - 1));
    725 
    726 			if (l > 1) {
    727 				if (*(p + 1) && ((*(p + 1) & 0xc0) == 0x80))
    728 					c = c | ((*(p + 1) & 0x3f) <<
    729 					    (6 * (l - 2)));
    730 				else
    731 					return (EILSEQ);
    732 
    733 				if (c < 0x80)
    734 					return (EILSEQ);
    735 			}
    736 
    737 			if (l > 2) {
    738 				if (*(p + 2) && ((*(p + 2) & 0xc0) == 0x80))
    739 					c = c | ((*(p + 2) & 0x3f) <<
    740 					    (6 * (l - 3)));
    741 				else
    742 					return (EILSEQ);
    743 
    744 				if (c < 0x800)
    745 					return (EILSEQ);
    746 			}
    747 
    748 			if (l > 3) {
    749 				if (*(p + 3) && ((*(p + 3) & 0xc0) == 0x80))
    750 					c = c | ((*(p + 3) & 0x3f) <<
    751 					    (6 * (l - 4)));
    752 				else
    753 					return (EILSEQ);
    754 
    755 				if (c < 0x10000)
    756 					return (EILSEQ);
    757 			}
    758 
    759 			if (l > 4) {
    760 				if (*(p + 4) && ((*(p + 4) & 0xc0) == 0x80))
    761 					c = c | ((*(p + 4) & 0x3f) <<
    762 					    (6 * (l - 5)));
    763 				else
    764 					return (EILSEQ);
    765 
    766 				if (c < 0x200000)
    767 					return (EILSEQ);
    768 			}
    769 
    770 			if (l > 5) {
    771 				if (*(p + 5) && ((*(p + 5) & 0xc0) == 0x80))
    772 					c = c | (*(p + 5) & 0x3f);
    773 				else
    774 					return (EILSEQ);
    775 
    776 				if (c < 0x4000000)
    777 					return (EILSEQ);
    778 			}
    779 
    780 			/*
    781 			 * check for UTF-16 surrogates ifs other illegal
    782 			 * UTF-8 * points
    783 			 */
    784 			if (((c <= 0xdfff) && (c >= 0xd800)) ||
    785 			    (c == 0xfffe) || (c == 0xffff))
    786 				return (EILSEQ);
    787 			p += l;
    788 		}
    789 		/* 7-bit chars are fine */
    790 		else
    791 			p++;
    792 	}
    793 	return (0);
    794 }
    795 
    796 /*
    797  * Functions for converting to ASCII or UTF-8 from the local codeset
    798  * Functions for converting from ASCII or UTF-8 to the local codeset
    799  *
    800  * The error_str parameter is an optional pointer to a char variable
    801  * where to store a string suitable for use with error() or fatal() or
    802  * friends.
    803  *
    804  * The err parameter is an optional pointer to an integer where 0
    805  * (success) or EILSEQ or EINVAL will be stored (failure).
    806  *
    807  * These functions return NULL if the conversion fails.
    808  *
    809  */
    810 uchar_t *
    811 g11n_convert_from_ascii(const char *str, int *err_ptr, uchar_t **error_str)
    812 {
    813 	static uint_t initialized = 0;
    814 	static uint_t do_convert = 0;
    815 	iconv_t cd;
    816 	int err;
    817 
    818 	if (!initialized) {
    819 		/*
    820 		 * iconv_open() fails if the to/from codesets are the
    821 		 * same, and there are aliases of codesets to boot...
    822 		 */
    823 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
    824 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
    825 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
    826 			initialized = 1;
    827 			do_convert = 0;
    828 		} else {
    829 			cd = iconv_open(nl_langinfo(CODESET), "646");
    830 			if (cd == (iconv_t)-1) {
    831 				if (err_ptr)
    832 					*err_ptr = errno;
    833 				if (error_str)
    834 					*error_str = (uchar_t *)"Cannot "
    835 					    "convert ASCII strings to the local"
    836 					    " codeset";
    837 			}
    838 			initialized = 1;
    839 			do_convert = 1;
    840 		}
    841 	}
    842 
    843 	if (!do_convert) {
    844 		if ((err = g11n_validate_ascii(str, 0, error_str))) {
    845 			if (err_ptr)
    846 				*err_ptr = err;
    847 			return (NULL);
    848 		} else
    849 			return ((uchar_t *)xstrdup(str));
    850 	}
    851 
    852 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
    853 }
    854 
    855 uchar_t *
    856 g11n_convert_from_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
    857 {
    858 	static uint_t initialized = 0;
    859 	static uint_t do_convert = 0;
    860 	iconv_t cd;
    861 	int err;
    862 
    863 	if (!initialized) {
    864 		/*
    865 		 * iconv_open() fails if the to/from codesets are the
    866 		 * same, and there are aliases of codesets to boot...
    867 		 */
    868 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
    869 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
    870 			initialized = 1;
    871 			do_convert = 0;
    872 		} else {
    873 			cd = iconv_open(nl_langinfo(CODESET), "UTF-8");
    874 			if (cd == (iconv_t)-1) {
    875 				if (err_ptr)
    876 					*err_ptr = errno;
    877 				if (error_str)
    878 					*error_str = (uchar_t *)"Cannot "
    879 					    "convert UTF-8 strings to the "
    880 					    "local codeset";
    881 			}
    882 			initialized = 1;
    883 			do_convert = 1;
    884 		}
    885 	}
    886 
    887 	if (!do_convert) {
    888 		if ((err = g11n_validate_utf8(str, 0, error_str))) {
    889 			if (err_ptr)
    890 				*err_ptr = err;
    891 			return (NULL);
    892 		} else
    893 			return ((uchar_t *)xstrdup((char *)str));
    894 	}
    895 
    896 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
    897 }
    898 
    899 char *
    900 g11n_convert_to_ascii(const uchar_t *str, int *err_ptr, uchar_t **error_str)
    901 {
    902 	static uint_t initialized = 0;
    903 	static uint_t do_convert = 0;
    904 	iconv_t cd;
    905 
    906 	if (!initialized) {
    907 		/*
    908 		 * iconv_open() fails if the to/from codesets are the
    909 		 * same, and there are aliases of codesets to boot...
    910 		 */
    911 		if (strcmp("646", nl_langinfo(CODESET)) == 0 ||
    912 		    strcmp("ASCII",  nl_langinfo(CODESET)) == 0 ||
    913 		    strcmp("US-ASCII",  nl_langinfo(CODESET)) == 0) {
    914 			initialized = 1;
    915 			do_convert = 0;
    916 		} else {
    917 			cd = iconv_open("646", nl_langinfo(CODESET));
    918 			if (cd == (iconv_t)-1) {
    919 				if (err_ptr)
    920 					*err_ptr = errno;
    921 				if (error_str)
    922 					*error_str = (uchar_t *)"Cannot "
    923 					    "convert UTF-8 strings to the "
    924 					    "local codeset";
    925 			}
    926 			initialized = 1;
    927 			do_convert = 1;
    928 		}
    929 	}
    930 
    931 	if (!do_convert)
    932 		return (xstrdup((char *)str));
    933 
    934 	return ((char *)do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
    935 }
    936 
    937 uchar_t *
    938 g11n_convert_to_utf8(const uchar_t *str, int *err_ptr, uchar_t **error_str)
    939 {
    940 	static uint_t initialized = 0;
    941 	static uint_t do_convert = 0;
    942 	iconv_t cd;
    943 
    944 	if (!initialized) {
    945 		/*
    946 		 * iconv_open() fails if the to/from codesets are the
    947 		 * same, and there are aliases of codesets to boot...
    948 		 */
    949 		if (strcmp("UTF-8", nl_langinfo(CODESET)) == 0 ||
    950 		    strcmp("UTF8",  nl_langinfo(CODESET)) == 0) {
    951 			initialized = 1;
    952 			do_convert = 0;
    953 		} else {
    954 			cd = iconv_open("UTF-8", nl_langinfo(CODESET));
    955 			if (cd == (iconv_t)-1) {
    956 				if (err_ptr)
    957 					*err_ptr = errno;
    958 				if (error_str)
    959 					*error_str = (uchar_t *)"Cannot "
    960 					    "convert UTF-8 strings to the "
    961 					    "local codeset";
    962 			}
    963 			initialized = 1;
    964 			do_convert = 1;
    965 		}
    966 	}
    967 
    968 	if (!do_convert)
    969 		return ((uchar_t *)xstrdup((char *)str));
    970 
    971 	return (do_iconv(cd, NULL, str, 0, NULL, err_ptr, error_str));
    972 }
    973 
    974 
    975 /*
    976  * Wrapper around iconv()
    977  *
    978  * The caller is responsible for freeing the result and for handling
    979  * (errno && errno != E2BIG) (i.e., EILSEQ, EINVAL, EBADF).
    980  */
    981 static uchar_t *
    982 do_iconv(iconv_t cd, uint_t *mul_ptr, const void *buf, uint_t len,
    983     uint_t *outlen, int *err, uchar_t **err_str)
    984 {
    985 	size_t inbytesleft, outbytesleft, converted_size;
    986 	char *outbuf;
    987 	uchar_t *converted;
    988 	const char *inbuf;
    989 	uint_t mul = 0;
    990 
    991 	if (!buf || !(*(char *)buf))
    992 		return (NULL);
    993 
    994 	if (len == 0)
    995 		len = strlen(buf);
    996 
    997 	/* reset conversion descriptor */
    998 	/* XXX Do we need initial shift sequences for UTF-8??? */
    999 	(void) iconv(cd, NULL, &inbytesleft, &outbuf, &outbytesleft);
   1000 	inbuf = (const char *) buf;
   1001 
   1002 	if (mul_ptr)
   1003 		mul = *mul_ptr;
   1004 
   1005 	converted_size = (len << mul);
   1006 	outbuf = (char *)xmalloc(converted_size + 1); /* for null */
   1007 	converted = (uchar_t *)outbuf;
   1008 	outbytesleft = len;
   1009 
   1010 	do {
   1011 		if (iconv(cd, &inbuf, &inbytesleft, &outbuf, &outbytesleft) ==
   1012 		    (size_t)-1) {
   1013 			if (errno == E2BIG) {
   1014 				/* UTF-8 codepoints are at most 8 bytes long */
   1015 				if (mul > 2) {
   1016 					if (err_str)
   1017 						*err_str = (uchar_t *)
   1018 						    "Conversion to UTF-8 failed"
   1019 						    " due to preposterous space"
   1020 						    " requirements";
   1021 					if (err)
   1022 						*err = EILSEQ;
   1023 					return (NULL);
   1024 				}
   1025 
   1026 				/*
   1027 				 * re-alloc output and ensure that the outbuf
   1028 				 * and outbytesleft values are adjusted
   1029 				 */
   1030 				converted = xrealloc(converted,
   1031 				    converted_size << 1 + 1);
   1032 				outbuf = (char *)converted + converted_size -
   1033 				    outbytesleft;
   1034 				converted_size = (len << ++(mul));
   1035 				outbytesleft = converted_size - outbytesleft;
   1036 			} else {
   1037 				/*
   1038 				 * let the caller deal with iconv() errors,
   1039 				 * probably by calling fatal(); xfree() does
   1040 				 * not set errno
   1041 				 */
   1042 				if (err)
   1043 					*err = errno;
   1044 				xfree(converted);
   1045 				return (NULL);
   1046 			}
   1047 		}
   1048 	} while (inbytesleft);
   1049 
   1050 	*outbuf = '\0'; /* ensure null-termination */
   1051 	if (outlen)
   1052 		*outlen = converted_size - outbytesleft;
   1053 	if (mul_ptr)
   1054 		*mul_ptr = mul;
   1055 
   1056 	return (converted);
   1057 }
   1058 
   1059 /*
   1060  * Free all strings in the list and then free the list itself. We know that the
   1061  * list ends with a NULL pointer.
   1062  */
   1063 void
   1064 g11n_freelist(char **list)
   1065 {
   1066 	int i = 0;
   1067 
   1068 	while (list[i] != NULL) {
   1069 		xfree(list[i]);
   1070 		i++;
   1071 	}
   1072 
   1073 	xfree(list);
   1074 }
   1075