Home | History | Annotate | Download | only in support
      1 /*
      2  * util/support/utf8.c
      3  *
      4  * Copyright 2008 by the Massachusetts Institute of Technology.
      5  * All Rights Reserved.
      6  *
      7  * Export of this software from the United States of America may
      8  *   require a specific license from the United States Government.
      9  *   It is the responsibility of any person or organization contemplating
     10  *   export to obtain such a license before exporting.
     11  *
     12  * WITHIN THAT CONSTRAINT, permission to use, copy, modify, and
     13  * distribute this software and its documentation for any purpose and
     14  * without fee is hereby granted, provided that the above copyright
     15  * notice appear in all copies and that both that copyright notice and
     16  * this permission notice appear in supporting documentation, and that
     17  * the name of M.I.T. not be used in advertising or publicity pertaining
     18  * to distribution of the software without specific, written prior
     19  * permission.  Furthermore if you modify this software you must label
     20  * your software as modified software and not distribute it in such a
     21  * fashion that it might be confused with the original M.I.T. software.
     22  * M.I.T. makes no representations about the suitability of
     23  * this software for any purpose.  It is provided "as is" without express
     24  * or implied warranty.
     25  */
     26 /* This work is part of OpenLDAP Software <http://www.openldap.org/>.
     27  *
     28  * Copyright 1998-2008 The OpenLDAP Foundation.
     29  * All rights reserved.
     30  *
     31  * Redistribution and use in source and binary forms, with or without
     32  * modification, are permitted only as authorized by the OpenLDAP
     33  * Public License.
     34  *
     35  * A copy of this license is available in the file LICENSE in the
     36  * top-level directory of the distribution or, alternatively, at
     37  * <http://www.OpenLDAP.org/license.html>.
     38  */
     39 /* Basic UTF-8 routines
     40  *
     41  * These routines are "dumb".  Though they understand UTF-8,
     42  * they don't grok Unicode.  That is, they can push bits,
     43  * but don't have a clue what the bits represent.  That's
     44  * good enough for use with the KRB5 Client SDK.
     45  *
     46  * These routines are not optimized.
     47  */
     48 
     49 #include "k5-platform.h"
     50 #include "k5-utf8.h"
     51 #include "supp-int.h"
     52 
     53 /*
     54  * return the number of bytes required to hold the
     55  * NULL-terminated UTF-8 string NOT INCLUDING the
     56  * termination.
     57  */
     58 size_t krb5int_utf8_bytes(const char *p)
     59 {
     60     size_t bytes;
     61 
     62     for (bytes = 0; p[bytes]; bytes++)
     63 	;
     64 
     65     return bytes;
     66 }
     67 
     68 size_t krb5int_utf8_chars(const char *p)
     69 {
     70     /* could be optimized and could check for invalid sequences */
     71     size_t chars = 0;
     72 
     73     for ( ; *p ; KRB5_UTF8_INCR(p))
     74 	chars++;
     75 
     76     return chars;
     77 }
     78 
     79 size_t krb5int_utf8c_chars(const char *p, size_t length)
     80 {
     81     /* could be optimized and could check for invalid sequences */
     82     size_t chars = 0;
     83     const char *end = p + length;
     84 
     85     for ( ; p < end; KRB5_UTF8_INCR(p))
     86 	chars++;
     87 
     88     return chars;
     89 }
     90 
     91 /* return offset to next character */
     92 int krb5int_utf8_offset(const char *p)
     93 {
     94     return KRB5_UTF8_NEXT(p) - p;
     95 }
     96 
     97 /*
     98  * Returns length indicated by first byte.
     99  */
    100 const char krb5int_utf8_lentab[] = {
    101     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    102     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    103     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    104     0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    105     0, 0, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    106     2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    107     3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3,
    108     4, 4, 4, 4, 4, 4, 4, 4, 5, 5, 5, 5, 6, 6, 0, 0 };
    109 
    110 int krb5int_utf8_charlen(const char *p)
    111 {
    112     if (!(*p & 0x80))
    113 	return 1;
    114 
    115     return krb5int_utf8_lentab[*(const unsigned char *)p ^ 0x80];
    116 }
    117 
    118 /*
    119  * Make sure the UTF-8 char used the shortest possible encoding
    120  * returns charlen if valid, 0 if not.
    121  *
    122  * Here are the valid UTF-8 encodings, taken from RFC 2279 page 4.
    123  * The table is slightly modified from that of the RFC.
    124  *
    125  * UCS-4 range (hex)      UTF-8 sequence (binary)
    126  * 0000 0000-0000 007F   0.......
    127  * 0000 0080-0000 07FF   110++++. 10......
    128  * 0000 0800-0000 FFFF   1110++++ 10+..... 10......
    129  * 0001 0000-001F FFFF   11110+++ 10++.... 10...... 10......
    130  * 0020 0000-03FF FFFF   111110++ 10+++... 10...... 10...... 10......
    131  * 0400 0000-7FFF FFFF   1111110+ 10++++.. 10...... 10...... 10...... 10......
    132  *
    133  * The '.' bits are "don't cares". When validating a UTF-8 sequence,
    134  * at least one of the '+' bits must be set, otherwise the character
    135  * should have been encoded in fewer octets. Note that in the two-octet
    136  * case, only the first octet needs to be validated, and this is done
    137  * in the krb5int_utf8_lentab[] above.
    138  */
    139 
    140 /* mask of required bits in second octet */
    141 #undef c
    142 #define c const char
    143 c krb5int_utf8_mintab[] = {
    144     (c)0x20, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    145     (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    146     (c)0x30, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80, (c)0x80,
    147     (c)0x38, (c)0x80, (c)0x80, (c)0x80, (c)0x3c, (c)0x80, (c)0x00, (c)0x00 };
    148 #undef c
    149 
    150 int krb5int_utf8_charlen2(const char *p)
    151 {
    152     int i = KRB5_UTF8_CHARLEN(p);
    153 
    154     if (i > 2) {
    155 	if (!(krb5int_utf8_mintab[*p & 0x1f] & p[1]))
    156 	    i = 0;
    157     }
    158 
    159     return i;
    160 }
    161 
    162 /*
    163  * Convert a UTF8 character to a UCS4 character.  Return 0 on success,
    164  * -1 on failure.
    165  */
    166 int krb5int_utf8_to_ucs4(const char *p, krb5_ucs4 *out)
    167 {
    168     const unsigned char *c = (const unsigned char *) p;
    169     krb5_ucs4 ch;
    170     int len, i;
    171     static unsigned char mask[] = {
    172 	0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };
    173 
    174     *out = 0;
    175     len = KRB5_UTF8_CHARLEN2(p, len);
    176 
    177     if (len == 0)
    178 	return -1;
    179 
    180     ch = c[0] & mask[len];
    181 
    182     for (i = 1; i < len; i++) {
    183 	if ((c[i] & 0xc0) != 0x80)
    184 	    return -1;
    185 
    186 	ch <<= 6;
    187 	ch |= c[i] & 0x3f;
    188     }
    189 
    190     *out = ch;
    191     return 0;
    192 }
    193 
    194 int krb5int_utf8_to_ucs2(const char *p, krb5_ucs2 *out)
    195 {
    196     krb5_ucs4 ch;
    197 
    198     *out = 0;
    199     if (krb5int_utf8_to_ucs4(p, &ch) == -1 || ch > 0xFFFF)
    200 	return -1;
    201     *out = (krb5_ucs2) ch;
    202     return 0;
    203 }
    204 
    205 /* conv UCS-2 to UTF-8, not used */
    206 size_t krb5int_ucs4_to_utf8(krb5_ucs4 c, char *buf)
    207 {
    208     size_t len = 0;
    209     unsigned char *p = (unsigned char *) buf;
    210 
    211     /* not a valid Unicode character */
    212     if (c < 0)
    213 	return 0;
    214 
    215     /* Just return length, don't convert */
    216     if (buf == NULL) {
    217 	if (c < 0x80) return 1;
    218 	else if (c < 0x800) return 2;
    219 	else if (c < 0x10000) return 3;
    220 	else if (c < 0x200000) return 4;
    221 	else if (c < 0x4000000) return 5;
    222 	else return 6;
    223     }
    224 
    225     if (c < 0x80) {
    226 	p[len++] = c;
    227     } else if (c < 0x800) {
    228 	p[len++] = 0xc0 | ( c >> 6 );
    229 	p[len++] = 0x80 | ( c & 0x3f );
    230     } else if (c < 0x10000) {
    231 	p[len++] = 0xe0 | ( c >> 12 );
    232 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    233 	p[len++] = 0x80 | ( c & 0x3f );
    234     } else if (c < 0x200000) {
    235 	p[len++] = 0xf0 | ( c >> 18 );
    236 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
    237 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    238 	p[len++] = 0x80 | ( c & 0x3f );
    239     } else if (c < 0x4000000) {
    240 	p[len++] = 0xf8 | ( c >> 24 );
    241 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
    242 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
    243 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    244 	p[len++] = 0x80 | ( c & 0x3f );
    245     } else /* if( c < 0x80000000 ) */ {
    246 	p[len++] = 0xfc | ( c >> 30 );
    247 	p[len++] = 0x80 | ( (c >> 24) & 0x3f );
    248 	p[len++] = 0x80 | ( (c >> 18) & 0x3f );
    249 	p[len++] = 0x80 | ( (c >> 12) & 0x3f );
    250 	p[len++] = 0x80 | ( (c >> 6) & 0x3f );
    251 	p[len++] = 0x80 | ( c & 0x3f );
    252     }
    253 
    254     return len;
    255 }
    256 
    257 size_t krb5int_ucs2_to_utf8(krb5_ucs2 c, char *buf)
    258 {
    259     return krb5int_ucs4_to_utf8((krb5_ucs4)c, buf);
    260 }
    261 
    262 #define KRB5_UCS_UTF8LEN(c)	\
    263     c < 0 ? 0 : (c < 0x80 ? 1 : (c < 0x800 ? 2 : (c < 0x10000 ? 3 : \
    264     (c < 0x200000 ? 4 : (c < 0x4000000 ? 5 : 6)))))
    265 
    266 /*
    267  * Advance to the next UTF-8 character
    268  *
    269  * Ignores length of multibyte character, instead rely on
    270  * continuation markers to find start of next character.
    271  * This allows for "resyncing" of when invalid characters
    272  * are provided provided the start of the next character
    273  * is appears within the 6 bytes examined.
    274  */
    275 char *krb5int_utf8_next(const char *p)
    276 {
    277     int i;
    278     const unsigned char *u = (const unsigned char *) p;
    279 
    280     if (KRB5_UTF8_ISASCII(u)) {
    281 	return (char *) &p[1];
    282     }
    283 
    284     for (i = 1; i < 6; i++) {
    285 	if ((u[i] & 0xc0) != 0x80) {
    286 	    return (char *) &p[i];
    287 	}
    288     }
    289 
    290     return (char *) &p[i];
    291 }
    292 
    293 /*
    294  * Advance to the previous UTF-8 character
    295  *
    296  * Ignores length of multibyte character, instead rely on
    297  * continuation markers to find start of next character.
    298  * This allows for "resyncing" of when invalid characters
    299  * are provided provided the start of the next character
    300  * is appears within the 6 bytes examined.
    301  */
    302 char *krb5int_utf8_prev(const char *p)
    303 {
    304     int i;
    305     const unsigned char *u = (const unsigned char *) p;
    306 
    307     for (i = -1; i>-6 ; i--) {
    308 	if ((u[i] & 0xc0 ) != 0x80) {
    309 	    return (char *) &p[i];
    310 	}
    311     }
    312 
    313     return (char *) &p[i];
    314 }
    315 
    316 /*
    317  * Copy one UTF-8 character from src to dst returning
    318  * number of bytes copied.
    319  *
    320  * Ignores length of multibyte character, instead rely on
    321  * continuation markers to find start of next character.
    322  * This allows for "resyncing" of when invalid characters
    323  * are provided provided the start of the next character
    324  * is appears within the 6 bytes examined.
    325  */
    326 int krb5int_utf8_copy(char* dst, const char *src)
    327 {
    328     int i;
    329     const unsigned char *u = (const unsigned char *) src;
    330 
    331     dst[0] = src[0];
    332 
    333     if (KRB5_UTF8_ISASCII(u)) {
    334 	return 1;
    335     }
    336 
    337     for (i=1; i<6; i++) {
    338 	if ((u[i] & 0xc0) != 0x80) {
    339 	    return i;
    340 	}
    341 	dst[i] = src[i];
    342     }
    343 
    344     return i;
    345 }
    346 
    347 #ifndef UTF8_ALPHA_CTYPE
    348 /*
    349  * UTF-8 ctype routines
    350  * Only deals with characters < 0x80 (ie: US-ASCII)
    351  */
    352 
    353 int krb5int_utf8_isascii(const char * p)
    354 {
    355     unsigned c = * (const unsigned char *) p;
    356 
    357     return KRB5_ASCII(c);
    358 }
    359 
    360 int krb5int_utf8_isdigit(const char * p)
    361 {
    362     unsigned c = * (const unsigned char *) p;
    363 
    364     if (!KRB5_ASCII(c))
    365 	return 0;
    366 
    367     return KRB5_DIGIT( c );
    368 }
    369 
    370 int krb5int_utf8_isxdigit(const char * p)
    371 {
    372     unsigned c = * (const unsigned char *) p;
    373 
    374     if (!KRB5_ASCII(c))
    375 	return 0;
    376 
    377     return KRB5_HEX(c);
    378 }
    379 
    380 int krb5int_utf8_isspace(const char * p)
    381 {
    382     unsigned c = * (const unsigned char *) p;
    383 
    384     if (!KRB5_ASCII(c))
    385 	return 0;
    386 
    387     switch(c) {
    388     case ' ':
    389     case '\t':
    390     case '\n':
    391     case '\r':
    392     case '\v':
    393     case '\f':
    394 	return 1;
    395     }
    396 
    397     return 0;
    398 }
    399 
    400 /*
    401  * These are not needed by the C SDK and are
    402  * not "good enough" for general use.
    403  */
    404 int krb5int_utf8_isalpha(const char * p)
    405 {
    406     unsigned c = * (const unsigned char *) p;
    407 
    408     if (!KRB5_ASCII(c))
    409 	return 0;
    410 
    411     return KRB5_ALPHA(c);
    412 }
    413 
    414 int krb5int_utf8_isalnum(const char * p)
    415 {
    416     unsigned c = * (const unsigned char *) p;
    417 
    418     if (!KRB5_ASCII(c))
    419 	return 0;
    420 
    421     return KRB5_ALNUM(c);
    422 }
    423 
    424 #if 0
    425 int krb5int_utf8_islower(const char * p)
    426 {
    427     unsigned c = * (const unsigned char *) p;
    428 
    429     if (!KRB5_ASCII(c))
    430 	return 0;
    431 
    432     return KRB5_LOWER(c);
    433 }
    434 
    435 int krb5int_utf8_isupper(const char * p)
    436 {
    437     unsigned c = * (const unsigned char *) p;
    438 
    439     if (!KRB5_ASCII(c))
    440 	return 0;
    441 
    442     return KRB5_UPPER(c);
    443 }
    444 #endif
    445 #endif
    446 
    447 
    448 /*
    449  * UTF-8 string routines
    450  */
    451 
    452 /* like strchr() */
    453 char *krb5int_utf8_strchr(const char *str, const char *chr)
    454 {
    455     krb5_ucs4 chs, ch;
    456 
    457     if (krb5int_utf8_to_ucs4(chr, &ch) == -1)
    458 	return NULL;
    459     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
    460 	if (krb5int_utf8_to_ucs4(str, &chs) == 0 && chs == ch)
    461 	    return (char *)str;
    462     }
    463 
    464     return NULL;
    465 }
    466 
    467 /* like strcspn() but returns number of bytes, not characters */
    468 size_t krb5int_utf8_strcspn(const char *str, const char *set)
    469 {
    470     const char *cstr, *cset;
    471     krb5_ucs4 chstr, chset;
    472 
    473     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
    474 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
    475 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
    476 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
    477 		return cstr - str;
    478 	}
    479     }
    480 
    481     return cstr - str;
    482 }
    483 
    484 /* like strspn() but returns number of bytes, not characters */
    485 size_t krb5int_utf8_strspn(const char *str, const char *set)
    486 {
    487     const char *cstr, *cset;
    488     krb5_ucs4 chstr, chset;
    489 
    490     for (cstr = str; *cstr != '\0'; KRB5_UTF8_INCR(cstr)) {
    491 	for (cset = set; ; KRB5_UTF8_INCR(cset)) {
    492 	    if (*cset == '\0')
    493 		return cstr - str;
    494 	    if (krb5int_utf8_to_ucs4(cstr, &chstr) == 0
    495 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
    496 		break;
    497 	}
    498     }
    499 
    500     return cstr - str;
    501 }
    502 
    503 /* like strpbrk(), replaces strchr() as well */
    504 char *krb5int_utf8_strpbrk(const char *str, const char *set)
    505 {
    506     const char *cset;
    507     krb5_ucs4 chstr, chset;
    508 
    509     for ( ; *str != '\0'; KRB5_UTF8_INCR(str)) {
    510 	for (cset = set; *cset != '\0'; KRB5_UTF8_INCR(cset)) {
    511 	    if (krb5int_utf8_to_ucs4(str, &chstr) == 0
    512 		&& krb5int_utf8_to_ucs4(cset, &chset) == 0 && chstr == chset)
    513 		return (char *)str;
    514 	}
    515     }
    516 
    517     return NULL;
    518 }
    519 
    520 /* like strtok_r(), not strtok() */
    521 char *krb5int_utf8_strtok(char *str, const char *sep, char **last)
    522 {
    523     char *begin;
    524     char *end;
    525 
    526     if (last == NULL)
    527 	return NULL;
    528 
    529     begin = str ? str : *last;
    530 
    531     begin += krb5int_utf8_strspn(begin, sep);
    532 
    533     if (*begin == '\0') {
    534 	*last = NULL;
    535 	return NULL;
    536     }
    537 
    538     end = &begin[krb5int_utf8_strcspn(begin, sep)];
    539 
    540     if (*end != '\0') {
    541 	char *next = KRB5_UTF8_NEXT(end);
    542 	*end = '\0';
    543 	end = next;
    544     }
    545 
    546     *last = end;
    547 
    548     return begin;
    549 }
    550