Home | History | Annotate | Download | only in support
      1 /*
      2  * Copyright 2009 Sun Microsystems, Inc.  All rights reserved.
      3  * Use is subject to license terms.
      4  */
      5 
      6 /*
      7  * util/support/utf8_conv.c
      8  *
      9  * Copyright 2008 by the Massachusetts Institute of Technology.
     10  * All Rights Reserved.
     11  *
     12  * Export of this software from the United States of America may
     13  *   require a specific license from the United States Government.
     14  *   It is the responsibility of any person or organization contemplating
     15  *   export to obtain such a license before exporting.
     16  *
     17  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
     18  * distribute this software and its documentation for any purpose and
     19  * without fee is hereby granted, provided that the above copyright
     20  * notice appear in all copies and that both that copyright notice and
     21  * this permission notice appear in supporting documentation, and that
     22  * the name of M.I.T. not be used in advertising or publicity pertaining
     23  * to distribution of the software without specific, written prior
     24  * permission.  Furthermore if you modify this software you must label
     25  * your software as modified software and not distribute it in such a
     26  * fashion that it might be confused with the original M.I.T. software.
     27  * M.I.T. makes no representations about the suitability of
     28  * this software for any purpose.  It is provided "as is" without express
     29  * or implied warranty.
     30  */
     31 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
     32  *
     33  * Copyright 1998-2008 The OpenLDAP Foundation.
     34  * All rights reserved.
     35  *
     36  * Redistribution and use in source and binary forms, with or without
     37  * modification, are permitted only as authorized by the OpenLDAP
     38  * Public License.
     39  *
     40  * A copy of this license is available in the file LICENSE in the
     41  * top-level directory of the distribution or, alternatively, at
     42  * <http://www.OpenLDAP.org/license.html>.
     43  */
     44 /* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
     45  *
     46  * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
     47  * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
     48  * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
     49  * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
     50  * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
     51  * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
     52  * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
     53  * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
     54  */
     55 
     56 /*
     57  * UTF-8 Conversion Routines
     58  *
     59  * These routines convert between Wide Character and UTF-8,
     60  * or between MultiByte and UTF-8 encodings.
     61  *
     62  * Both single character and string versions of the functions are provided.
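 * The internal k5_* helpers return -1 if the character or string cannot be
 * converted; the exported krb5int_* functions return 0 on success or an
 * errno value (EINVAL, ENOMEM or ERANGE) on failure.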
     64  */
     65 
     66 #include "k5-platform.h"
     67 #include "k5-utf8.h"
     68 #include "supp-int.h"
     69 #include "errno.h"  /* SUNW17PACresync */
     70 
     71 static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
     72 
     73 static ssize_t
     74 k5_utf8s_to_ucs2s(krb5_ucs2 *ucs2str,
     75 		  const char *utf8str,
     76 		  size_t count,
     77 		  int little_endian)
     78 {
     79     size_t ucs2len = 0;
     80     size_t utflen, i;
     81     krb5_ucs2 ch;
     82 
     83     /* If input ptr is NULL or empty... */
     84     if (utf8str == NULL || *utf8str == '\0') {
     85 	*ucs2str = 0;
     86 
     87 	return 0;
     88     }
     89 
     90     /* Examine next UTF-8 character.  */
     91     while (*utf8str && ucs2len < count) {
     92 	/* Get UTF-8 sequence length from 1st byte */
     93 	utflen = KRB5_UTF8_CHARLEN2(utf8str, utflen);
     94 
     95 	if (utflen == 0 || utflen > KRB5_MAX_UTF8_LEN)
     96 	    return -1;
     97 
     98 	/* First byte minus length tag */
     99 	ch = (krb5_ucs2)(utf8str[0] & mask[utflen]);
    100 
    101 	for (i = 1; i < utflen; i++) {
    102 	    /* Subsequent bytes must start with 10 */
    103 	    if ((utf8str[i] & 0xc0) != 0x80)
    104 		return -1;
    105 
    106 	    ch <<= 6;			/* 6 bits of data in each subsequent byte */
    107 	    ch |= (krb5_ucs2)(utf8str[i] & 0x3f);
    108 	}
    109 
    110 	if (ucs2str != NULL) {
    111 #ifdef K5_BE
    112 #ifndef SWAP16
    113 #define SWAP16(X)	((((X) << 8) | ((X) >> 8)) & 0xFFFF)
    114 #endif
    115 	    if (little_endian)
    116 		ucs2str[ucs2len] = SWAP16(ch);
    117 	    else
    118 #endif
    119 		ucs2str[ucs2len] = ch;
    120 	}
    121 
    122 	utf8str += utflen;	/* Move to next UTF-8 character */
    123 	ucs2len++;		/* Count number of wide chars stored/required */
    124     }
    125 
    126     assert(ucs2len < count);
    127 
    128     if (ucs2str != NULL) {
    129 	/* Add null terminator if there's room in the buffer. */
    130 	ucs2str[ucs2len] = 0;
    131     }
    132 
    133     return ucs2len;
    134 }
    135 
    136 int
    137 krb5int_utf8s_to_ucs2s(const char *utf8s,
    138 		       krb5_ucs2 **ucs2s,
    139 		       size_t *ucs2chars)
    140 {
    141     ssize_t len;
    142     size_t chars;
    143 
    144     chars = krb5int_utf8_chars(utf8s);
    145     *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
    146     if (*ucs2s == NULL) {
    147 	return ENOMEM;
    148     }
    149 
    150     len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
    151     if (len < 0) {
    152 	free(*ucs2s);
    153 	*ucs2s = NULL;
    154 	return EINVAL;
    155     }
    156 
    157     if (ucs2chars != NULL) {
    158 	*ucs2chars = chars;
    159     }
    160 
    161     return 0;
    162 }
    163 
    164 int
    165 krb5int_utf8cs_to_ucs2s(const char *utf8s,
    166 			size_t utf8slen,
    167 			krb5_ucs2 **ucs2s,
    168 			size_t *ucs2chars)
    169 {
    170     ssize_t len;
    171     size_t chars;
    172 
    173     chars = krb5int_utf8c_chars(utf8s, utf8slen);
    174     *ucs2s = (krb5_ucs2 *)malloc((chars + 1) * sizeof(krb5_ucs2));
    175     if (*ucs2s == NULL) {
    176 	return ENOMEM;
    177     }
    178 
    179     len = k5_utf8s_to_ucs2s(*ucs2s, utf8s, chars + 1, 0);
    180     if (len < 0) {
    181 	free(*ucs2s);
    182 	*ucs2s = NULL;
    183 	return EINVAL;
    184     }
    185 
    186     if (ucs2chars != NULL) {
    187 	*ucs2chars = chars;
    188     }
    189 
    190     return 0;
    191 }
    192 
    193 int
    194 krb5int_utf8s_to_ucs2les(const char *utf8s,
    195                          unsigned char **ucs2les,
    196 			 size_t *ucs2leslen)
    197 {
    198     ssize_t len;
    199     size_t chars;
    200 
    201     chars = krb5int_utf8_chars(utf8s);
    202 
    203     *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
    204     if (*ucs2les == NULL) {
    205 	return ENOMEM;
    206     }
    207 
    208     len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
    209     if (len < 0) {
    210 	free(*ucs2les);
    211 	*ucs2les = NULL;
    212 	return EINVAL;
    213     }
    214 
    215     if (ucs2leslen != NULL) {
    216 	*ucs2leslen = chars * sizeof(krb5_ucs2);
    217     }
    218 
    219     return 0;
    220 }
    221 
    222 int
    223 krb5int_utf8cs_to_ucs2les(const char *utf8s,
    224 			  size_t utf8slen,
    225 			  unsigned char **ucs2les,
    226 			  size_t *ucs2leslen)
    227 {
    228     ssize_t len;
    229     size_t chars;
    230 
    231     chars = krb5int_utf8c_chars(utf8s, utf8slen);
    232 
    233     *ucs2les = (unsigned char *)malloc((chars + 1) * sizeof(krb5_ucs2));
    234     if (*ucs2les == NULL) {
    235 	return ENOMEM;
    236     }
    237 
    238     len = k5_utf8s_to_ucs2s((krb5_ucs2 *)*ucs2les, utf8s, chars + 1, 1);
    239     if (len < 0) {
    240 	free(*ucs2les);
    241 	*ucs2les = NULL;
    242 	return EINVAL;
    243     }
    244 
    245     if (ucs2leslen != NULL) {
    246 	*ucs2leslen = chars * sizeof(krb5_ucs2);
    247     }
    248 
    249     return 0;
    250 }
    251 
    252 /*-----------------------------------------------------------------------------
    253    Convert a wide char string to a UTF-8 string.
    254    No more than 'count' bytes will be written to the output buffer.
    255    Return the # of bytes written to the output buffer, excl null terminator.
    256 
    257    ucs2len is -1 if the UCS-2 string is NUL terminated, otherwise it is the
    258    length of the UCS-2 string in characters
    259 */
    260 static ssize_t
    261 k5_ucs2s_to_utf8s(char *utf8str, const krb5_ucs2 *ucs2str,
    262 		  size_t count, ssize_t ucs2len, int little_endian)
    263 {
    264     int len = 0;
    265     int n;
    266     char *p = utf8str;
    267     krb5_ucs2 empty = 0, ch;
    268 
    269     if (ucs2str == NULL)	/* Treat input ptr NULL as an empty string */
    270 	ucs2str = &empty;
    271 
    272     if (utf8str == NULL)	/* Just compute size of output, excl null */
    273     {
    274 	while (ucs2len == -1 ? *ucs2str : --ucs2len >= 0) {
    275 	    /* Get UTF-8 size of next wide char */
    276 	  ch = *ucs2str++;
    277 #ifdef K5_BE
    278 	    if (little_endian)
    279 		ch = SWAP16(ch);
    280 #endif
    281 
    282 	    n = krb5int_ucs2_to_utf8(ch, NULL);
    283 	    if (n < 1)
    284 		return -1;
    285 	    if (len + n < len)
    286 		return -1; /* overflow */
    287 	    len += n;
    288 	}
    289 
    290 	return len;
    291     }
    292 
    293     /* Do the actual conversion. */
    294 
    295     n = 1;					/* In case of empty ucs2str */
    296     while (ucs2len == -1 ? *ucs2str != 0 : --ucs2len >= 0) {
    297       ch = *ucs2str++;
    298 #ifdef K5_BE
    299 	if (little_endian)
    300 	    ch = SWAP16(ch);
    301 #endif
    302 
    303 	n = krb5int_ucs2_to_utf8(ch, p);
    304 
    305 	if (n < 1)
    306 	    break;
    307 
    308 	p += n;
    309 	count -= n;			/* Space left in output buffer */
    310     }
    311 
    312     /* If not enough room for last character, pad remainder with null
    313        so that return value = original count, indicating buffer full. */
    314     if (n == 0) {
    315 	while (count--)
    316 	    *p++ = 0;
    317     }
    318     /* Add a null terminator if there's room. */
    319     else if (count)
    320 	*p = 0;
    321 
    322     if (n == -1)			/* Conversion encountered invalid wide char. */
    323 	return -1;
    324 
    325     /* Return the number of bytes written to output buffer, excl null. */
    326     return (p - utf8str);
    327 }
    328 
    329 int
    330 krb5int_ucs2s_to_utf8s(const krb5_ucs2 *ucs2s,
    331 		       char **utf8s,
    332 		       size_t *utf8slen)
    333 {
    334     ssize_t len;
    335 
    336     len = k5_ucs2s_to_utf8s(NULL, ucs2s, 0, -1, 0);
    337     if (len < 0) {
    338 	return EINVAL;
    339     }
    340 
    341     *utf8s = (char *)malloc((size_t)len + 1);
    342     if (*utf8s == NULL) {
    343 	return ENOMEM;
    344     }
    345 
    346     len = k5_ucs2s_to_utf8s(*utf8s, ucs2s, (size_t)len + 1, -1, 0);
    347     if (len < 0) {
    348 	free(*utf8s);
    349 	*utf8s = NULL;
    350 	return EINVAL;
    351     }
    352 
    353     if (utf8slen != NULL) {
    354 	*utf8slen = len;
    355     }
    356 
    357     return 0;
    358 }
    359 
    360 int
    361 krb5int_ucs2les_to_utf8s(const unsigned char *ucs2les,
    362 			 char **utf8s,
    363 			 size_t *utf8slen)
    364 {
    365     ssize_t len;
    366 
    367     len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0, -1, 1);
    368     if (len < 0)
    369 	return EINVAL;
    370 
    371     *utf8s = (char *)malloc((size_t)len + 1);
    372     if (*utf8s == NULL) {
    373 	return ENOMEM;
    374     }
    375 
    376     len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les, (size_t)len + 1, -1, 1);
    377     if (len < 0) {
    378 	free(*utf8s);
    379 	*utf8s = NULL;
    380 	return EINVAL;
    381     }
    382 
    383     if (utf8slen != NULL) {
    384 	*utf8slen = len;
    385     }
    386 
    387     return 0;
    388 }
    389 
    390 int
    391 krb5int_ucs2cs_to_utf8s(const krb5_ucs2 *ucs2s,
    392                         size_t ucs2slen,
    393                         char **utf8s,
    394                         size_t *utf8slen)
    395 {
    396     ssize_t len;
    397 
    398     if (ucs2slen > SSIZE_MAX)
    399 	return ERANGE;
    400 
    401     len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2s, 0,
    402 			    (ssize_t)ucs2slen, 0);
    403     if (len < 0)
    404 	return EINVAL;
    405 
    406     *utf8s = (char *)malloc((size_t)len + 1);
    407     if (*utf8s == NULL) {
    408 	return ENOMEM;
    409     }
    410 
    411     len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2s,
    412 			    (size_t)len + 1, (ssize_t)ucs2slen, 0);
    413     if (len < 0) {
    414 	free(*utf8s);
    415 	*utf8s = NULL;
    416 	return EINVAL;
    417     }
    418 
    419     if (utf8slen != NULL) {
    420 	*utf8slen = len;
    421     }
    422 
    423     return 0;
    424 }
    425 
    426 int
    427 krb5int_ucs2lecs_to_utf8s(const unsigned char *ucs2les,
    428                           size_t ucs2leslen,
    429                           char **utf8s,
    430                           size_t *utf8slen)
    431 {
    432     ssize_t len;
    433 
    434     if (ucs2leslen > SSIZE_MAX)
    435 	return ERANGE;
    436 
    437     len = k5_ucs2s_to_utf8s(NULL, (krb5_ucs2 *)ucs2les, 0,
    438 			    (ssize_t)ucs2leslen, 1);
    439     if (len < 0)
    440 	return EINVAL;
    441 
    442     *utf8s = (char *)malloc((size_t)len + 1);
    443     if (*utf8s == NULL) {
    444 	return ENOMEM;
    445     }
    446 
    447     len = k5_ucs2s_to_utf8s(*utf8s, (krb5_ucs2 *)ucs2les,
    448 			    (size_t)len + 1, (ssize_t)ucs2leslen, 1);
    449     if (len < 0) {
    450 	free(*utf8s);
    451 	*utf8s = NULL;
    452 	return EINVAL;
    453     }
    454 
    455     if (utf8slen != NULL) {
    456 	*utf8slen = len;
    457     }
    458 
    459     return 0;
    460 }
    461 
    462