Home | History | Annotate | Download | only in smbsrv
      1 /*
      2  * CDDL HEADER START
      3  *
      4  * The contents of this file are subject to the terms of the
      5  * Common Development and Distribution License (the "License").
      6  * You may not use this file except in compliance with the License.
      7  *
      8  * You can obtain a copy of the license at usr/src/OPENSOLARIS.LICENSE
      9  * or http://www.opensolaris.org/os/licensing.
     10  * See the License for the specific language governing permissions
     11  * and limitations under the License.
     12  *
     13  * When distributing Covered Code, include this CDDL HEADER in each
     14  * file and include the License file at usr/src/OPENSOLARIS.LICENSE.
     15  * If applicable, add the following below this CDDL HEADER, with the
     16  * fields enclosed by brackets "[]" replaced with your own identifying
     17  * information: Portions Copyright [yyyy] [name of copyright owner]
     18  *
     19  * CDDL HEADER END
     20  */
     21 /*
     22  * Copyright 2007 Sun Microsystems, Inc.  All rights reserved.
     23  * Use is subject to license terms.
     24  */
     25 
     26 /*
     27  * Multibyte/wide-char conversion routines. Wide-char encoding provides
     28  * a fixed size character encoding that maps to the Unicode 16-bit
     29  * (UCS-2) character set standard. Multibyte or UCS transformation
     30  * format (UTF) encoding is a variable length character encoding scheme
     31  * that is compatible with existing ASCII characters and guarantees that
     32  * the resultant strings do not contain embedded null characters. Both
     33  * types of encoding provide a null terminator: single byte for UTF-8
     34  * and a wide-char null for Unicode. See RFC 2044.
     35  *
     36  * The table below illustrates the UTF-8 encoding scheme. The letter x
     37  * indicates bits available for encoding the character value.
     38  *
     39  *	UCS-2			UTF-8 octet sequence (binary)
     40  *	0x0000-0x007F	0xxxxxxx
     41  *	0x0080-0x07FF	110xxxxx 10xxxxxx
     42  *	0x0800-0xFFFF	1110xxxx 10xxxxxx 10xxxxxx
     43  *
     44  * RFC 2044
     45  * UTF-8, a transformation format of Unicode and ISO 10646
     46  * F. Yergeau
     47  * Alis Technologies
     48  * October 1996
     49  */
     50 
     51 #pragma ident	"%Z%%M%	%I%	%E% SMI"
     52 
     53 #ifdef _KERNEL
     54 #include <sys/types.h>
     55 #include <sys/sunddi.h>
     56 #else
     57 #include <stdio.h>
     58 #include <stdlib.h>
     59 #include <assert.h>
     60 #include <strings.h>
     61 #endif
     62 #include <smbsrv/smb_i18n.h>
     63 #include <smbsrv/string.h>
     64 
     65 
     66 /*
     67  * mbstowcs
     68  *
     69  * The mbstowcs() function converts a multibyte character string
     70  * mbstring into a wide character string wcstring. No more than
     71  * nwchars wide characters are stored. A terminating null wide
     72  * character is appended if there is room.
     73  *
     74  * Returns the number of wide characters converted, not counting
     75  * any terminating null wide character. Returns -1 if an invalid
     76  * multibyte character is encountered.
     77  */
     78 size_t
     79 mts_mbstowcs(mts_wchar_t *wcstring, const char *mbstring, size_t nwchars)
     80 {
     81 	int len;
     82 	mts_wchar_t	*start = wcstring;
     83 
     84 	while (nwchars--) {
     85 		len = mts_mbtowc(wcstring, mbstring, MTS_MB_CHAR_MAX);
     86 		if (len < 0) {
     87 			*wcstring = 0;
     88 			return ((size_t)-1);
     89 		}
     90 
     91 		if (*mbstring == 0)
     92 			break;
     93 
     94 		++wcstring;
     95 		mbstring += len;
     96 	}
     97 
     98 	return (wcstring - start);
     99 }
    100 
    101 
    102 /*
    103  * mbtowc
    104  *
    105  * The mbtowc() function converts a multibyte character mbchar into
    106  * a wide character and stores the result in the object pointed to
    107  * by wcharp. Up to nbytes bytes are examined.
    108  *
    109  * If mbchar is NULL, mbtowc() returns zero to indicate that shift
    110  * states are not supported.  Shift states are used to switch between
    111  * representation modes using reserved bytes to signal shifting
    112  * without them being interpreted as characters.  If mbchar is null
    113  * mbtowc should return non-zero if the current locale requires shift
    114  * states.  Otherwise it should be return 0.
    115  *
    116  * If mbchar is non-null, returns the number of bytes processed in
    117  * mbchar.  If mbchar is invalid, returns -1.
    118  */
    119 int /*ARGSUSED*/
    120 mts_mbtowc(mts_wchar_t *wcharp, const char *mbchar, size_t nbytes)
    121 {
    122 	unsigned char mbyte;
    123 	mts_wchar_t wide_char;
    124 	int count;
    125 	int bytes_left;
    126 
    127 	if (mbchar == NULL)
    128 		return (0); /* no shift states */
    129 
    130 	/* 0xxxxxxx -> 1 byte ASCII encoding */
    131 	if (((mbyte = *mbchar++) & 0x80) == 0) {
    132 		if (wcharp)
    133 			*wcharp = (mts_wchar_t)mbyte;
    134 
    135 		return (mbyte ? 1 : 0);
    136 	}
    137 
    138 	/* 10xxxxxx -> invalid first byte */
    139 	if ((mbyte & 0x40) == 0)
    140 		return (-1);
    141 
    142 	wide_char = mbyte;
    143 	if ((mbyte & 0x20) == 0) {
    144 		wide_char &= 0x1f;
    145 		bytes_left = 1;
    146 	} else if ((mbyte & 0x10) == 0) {
    147 		wide_char &= 0x0f;
    148 		bytes_left = 2;
    149 	} else {
    150 		return (-1);
    151 	}
    152 
    153 	count = 1;
    154 	while (bytes_left--) {
    155 		if (((mbyte = *mbchar++) & 0xc0) != 0x80)
    156 			return (-1);
    157 
    158 		count++;
    159 		wide_char = (wide_char << 6) | (mbyte & 0x3f);
    160 	}
    161 
    162 	if (wcharp)
    163 		*wcharp = wide_char;
    164 
    165 	return (count);
    166 }
    167 
    168 
    169 /*
    170  * wctomb
    171  *
    172  * The wctomb() function converts a wide character wchar into a multibyte
    173  * character and stores the result in mbchar. The object pointed to by
    174  * mbchar must be large enough to accommodate the multibyte character.
    175  *
    176  * Returns the numberof bytes written to mbchar.
    177  */
    178 int
    179 mts_wctomb(char *mbchar, mts_wchar_t wchar)
    180 {
    181 	if ((wchar & ~0x7f) == 0) {
    182 		*mbchar = (char)wchar;
    183 		return (1);
    184 	}
    185 
    186 	if ((wchar & ~0x7ff) == 0) {
    187 		*mbchar++ = (wchar >> 6) | 0xc0;
    188 		*mbchar = (wchar & 0x3f) | 0x80;
    189 		return (2);
    190 	}
    191 
    192 	*mbchar++ = (wchar >> 12) | 0xe0;
    193 	*mbchar++ = ((wchar >> 6) & 0x3f) | 0x80;
    194 	*mbchar = (wchar & 0x3f) | 0x80;
    195 	return (3);
    196 }
    197 
    198 
    199 /*
    200  * wcstombs
    201  *
    202  * The wcstombs() function converts a wide character string wcstring
    203  * into a multibyte character string mbstring. Up to nbytes bytes are
    204  * stored in mbstring. Partial multibyte characters at the end of the
    205  * string are not stored. The multibyte character string is null
    206  * terminated if there is room.
    207  *
    208  * Returns the number of bytes converted, not counting the terminating
    209  * null byte.
    210  */
    211 size_t
    212 mts_wcstombs(char *mbstring, const mts_wchar_t *wcstring, size_t nbytes)
    213 {
    214 	char *start = mbstring;
    215 	const mts_wchar_t *wcp = wcstring;
    216 	mts_wchar_t wide_char;
    217 	char buf[4];
    218 	size_t len;
    219 
    220 	if ((mbstring == NULL) || (wcstring == NULL))
    221 		return (0);
    222 
    223 	while (nbytes > MTS_MB_CHAR_MAX) {
    224 		wide_char = *wcp++;
    225 		len = mts_wctomb(mbstring, wide_char);
    226 
    227 		if (wide_char == 0)
    228 			/*LINTED E_PTRDIFF_OVERFLOW*/
    229 			return (mbstring - start);
    230 
    231 		mbstring += len;
    232 		nbytes -= len;
    233 	}
    234 
    235 	while (wide_char && nbytes) {
    236 		wide_char = *wcp++;
    237 		if ((len = mts_wctomb(buf, wide_char)) > nbytes) {
    238 			*mbstring = 0;
    239 			break;
    240 		}
    241 
    242 		bcopy(buf, mbstring, len);
    243 		mbstring += len;
    244 		nbytes -= len;
    245 	}
    246 
    247 	/*LINTED E_PTRDIFF_OVERFLOW*/
    248 	return (mbstring - start);
    249 }
    250 
    251 
    252 /*
    253  * Returns the number of bytes that would be written if the multi-
    254  * byte string mbs was converted to a wide character string, not
    255  * counting the terminating null wide character.
    256  */
    257 size_t
    258 mts_wcequiv_strlen(const char *mbs)
    259 {
    260 	mts_wchar_t	wide_char;
    261 	size_t bytes;
    262 	size_t len = 0;
    263 
    264 	while (*mbs) {
    265 		bytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
    266 		if (bytes == ((size_t)-1))
    267 			return ((size_t)-1);
    268 
    269 		len += sizeof (mts_wchar_t);
    270 		mbs += bytes;
    271 	}
    272 
    273 	return (len);
    274 }
    275 
    276 
    277 /*
    278  * Returns the number of bytes that would be written if the multi-
    279  * byte string mbs was converted to a single byte character string,
    280  * not counting the terminating null character.
    281  */
    282 size_t
    283 mts_sbequiv_strlen(const char *mbs)
    284 {
    285 	mts_wchar_t	wide_char;
    286 	size_t nbytes;
    287 	size_t len = 0;
    288 
    289 	while (*mbs) {
    290 		nbytes = mts_mbtowc(&wide_char, mbs, MTS_MB_CHAR_MAX);
    291 		if (nbytes == ((size_t)-1))
    292 			return ((size_t)-1);
    293 
    294 		if (wide_char & 0xFF00)
    295 			len += sizeof (mts_wchar_t);
    296 		else
    297 			++len;
    298 
    299 		mbs += nbytes;
    300 	}
    301 
    302 	return (len);
    303 }
    304 
    305 
    306 /*
    307  * stombs
    308  *
    309  * Convert a regular null terminated string 'string' to a UTF-8 encoded
    310  * null terminated multi-byte string 'mbstring'. Only full converted
    311  * UTF-8 characters will be written 'mbstring'. If a character will not
    312  * fit within the remaining buffer space or 'mbstring' will overflow
    313  * max_mblen, the conversion process will be terminated and 'mbstring'
    314  * will be null terminated.
    315  *
    316  * Returns the number of bytes written to 'mbstring', excluding the
    317  * terminating null character.
    318  *
    319  * If either mbstring or string is a null pointer, -1 is returned.
    320  */
    321 int
    322 mts_stombs(char *mbstring, char *string, int max_mblen)
    323 {
    324 	char *start = mbstring;
    325 	unsigned char *p = (unsigned char *)string;
    326 	int space_left = max_mblen;
    327 	int	len;
    328 	mts_wchar_t	wide_char;
    329 	char buf[4];
    330 
    331 	if (!mbstring || !string)
    332 		return (-1);
    333 
    334 	while (*p && space_left > 2) {
    335 		wide_char = *p++;
    336 		len = mts_wctomb(mbstring, wide_char);
    337 		mbstring += len;
    338 		space_left -= len;
    339 	}
    340 
    341 	if (*p) {
    342 		wide_char = *p;
    343 		if ((len = mts_wctomb(buf, wide_char)) < 2) {
    344 			*mbstring = *buf;
    345 			mbstring += len;
    346 			space_left -= len;
    347 		}
    348 	}
    349 
    350 	*mbstring = '\0';
    351 
    352 	/*LINTED E_PTRDIFF_OVERFLOW*/
    353 	return (mbstring - start);
    354 }
    355 
    356 
    357 /*
    358  * mbstos
    359  *
    360  * Convert a null terminated multi-byte string 'mbstring' to a regular
    361  * null terminated string 'string'.  A 1-byte character in 'mbstring'
    362  * maps to a 1-byte character in 'string'. A 2-byte character in
    363  * 'mbstring' will be mapped to 2-bytes, if the upper byte is non-null.
    364  * Otherwise the upper byte null will be discarded to ensure that the
    365  * output stream does not contain embedded null characters.
    366  *
    367  * If the input stream contains invalid multi-byte characters, a value
    368  * of -1 will be returned. Otherwise the length of 'string', excluding
    369  * the terminating null character, is returned.
    370  *
    371  * If either mbstring or string is a null pointer, -1 is returned.
    372  */
    373 int
    374 mts_mbstos(char *string, const char *mbstring)
    375 {
    376 	mts_wchar_t wc;
    377 	unsigned char *start = (unsigned char *)string;
    378 	int len;
    379 
    380 	if (string == NULL || mbstring == NULL)
    381 		return (-1);
    382 
    383 	while (*mbstring) {
    384 		if ((len = mts_mbtowc(&wc, mbstring, MTS_MB_CHAR_MAX)) < 0) {
    385 			*string = 0;
    386 			return (-1);
    387 		}
    388 
    389 		if (wc & 0xFF00) {
    390 			/*LINTED E_BAD_PTR_CAST_ALIGN*/
    391 			*((mts_wchar_t *)string) = wc;
    392 			string += sizeof (mts_wchar_t);
    393 		}
    394 		else
    395 		{
    396 			*string = (unsigned char)wc;
    397 			string++;
    398 		}
    399 
    400 		mbstring += len;
    401 	}
    402 
    403 	*string = 0;
    404 
    405 	/*LINTED E_PTRDIFF_OVERFLOW*/
    406 	return ((unsigned char *)string - start);
    407 }
    408