utf-8-conv.c - OpenGrok cross reference for /netbsd/src/external/bsd/openldap/dist/libraries/libldap/utf-8-conv.c

/*        $NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $        */

/* $OpenLDAP$ */
/* This work is part of OpenLDAP Software <http://www.openldap.org/>.
 *
 * Copyright 1998-2021 The OpenLDAP Foundation.
 * All rights reserved.
 *
 * Redistribution and use in source and binary forms, with or without
 * modification, are permitted only as authorized by the OpenLDAP
 * Public License.
 *
 * A copy of this license is available in the file LICENSE in the
 * top-level directory of the distribution or, alternatively, at
 * <http://www.OpenLDAP.org/license.html>.
 */
/* Portions Copyright (C) 1999, 2000 Novell, Inc. All Rights Reserved.
 *
 * THIS WORK IS SUBJECT TO U.S. AND INTERNATIONAL COPYRIGHT LAWS AND
 * TREATIES. USE, MODIFICATION, AND REDISTRIBUTION OF THIS WORK IS SUBJECT
 * TO VERSION 2.0.1 OF THE OPENLDAP PUBLIC LICENSE, A COPY OF WHICH IS
 * AVAILABLE AT HTTP://WWW.OPENLDAP.ORG/LICENSE.HTML OR IN THE FILE "LICENSE"
 * IN THE TOP-LEVEL DIRECTORY OF THE DISTRIBUTION. ANY USE OR EXPLOITATION
 * OF THIS WORK OTHER THAN AS AUTHORIZED IN VERSION 2.0.1 OF THE OPENLDAP
 * PUBLIC LICENSE, OR OTHER PRIOR WRITTEN CONSENT FROM NOVELL, COULD SUBJECT
 * THE PERPETRATOR TO CRIMINAL AND CIVIL LIABILITY.
 *---
 * Note: A verbatim copy of version 2.0.1 of the OpenLDAP Public License
 * can be found in the file "build/LICENSE-2.0.1" in this distribution
 * of OpenLDAP Software.
 */

/*
 * UTF-8 Conversion Routines
 *
 * These routines convert between Wide Character and UTF-8,
 * or between MultiByte and UTF-8 encodings.
 *
 * Both single character and string versions of the functions are provided.
 * All functions return -1 if the character or string cannot be converted.
 */

#include <sys/cdefs.h>
__RCSID("$NetBSD: utf-8-conv.c,v 1.3 2021/08/14 16:14:56 christos Exp $");

#include "portable.h"

#if SIZEOF_WCHAR_T >= 4
/* These routines assume ( sizeof(wchar_t) >= 4 ) */

#include <stdio.h>
#include <ac/stdlib.h>                  /* For wctomb, wcstombs, mbtowc, mbstowcs */
#include <ac/string.h>
#include <ac/time.h>                    /* for time_t */

#include "ldap-int.h"

#include <ldap_utf8.h>

static unsigned char mask[] = { 0, 0x7f, 0x1f, 0x0f, 0x07, 0x03, 0x01 };


/*-----------------------------------------------------------------------------
                                                  UTF-8 Format Summary

ASCII chars                                                           7 bits
    0xxxxxxx

2-character UTF-8 sequence:        11 bits
    110xxxxx  10xxxxxx

3-character UTF-8                  16 bits
    1110xxxx  10xxxxxx  10xxxxxx

4-char UTF-8                       21 bits
    11110xxx  10xxxxxx  10xxxxxx  10xxxxxx

5-char UTF-8                       26 bits
    111110xx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

6-char UTF-8                       31 bits
    1111110x  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx  10xxxxxx

Unicode address space   (0 - 0x10FFFF)    21 bits
ISO-10646 address space (0 - 0x7FFFFFFF)  31 bits

Note: This code does not prevent UTF-8 sequences which are longer than
      necessary from being decoded.
*/

/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a wide char.
   Return the length of the UTF-8 input character in bytes.
*/
int
ldap_x_utf8_to_wc ( wchar_t *wchar, const char *utf8char )
{
          int utflen, i;
          wchar_t ch;

          if (utf8char == NULL) return -1;

          /* Get UTF-8 sequence length from 1st byte */
          utflen = LDAP_UTF8_CHARLEN2(utf8char, utflen);

          if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;

          /* First byte minus length tag */
          ch = (wchar_t)(utf8char[0] & mask[utflen]);

          for(i=1; i < utflen; i++) {
                    /* Subsequent bytes must start with 10 */
                    if ((utf8char[i] & 0xc0) != 0x80) return -1;

                    ch <<= 6;                     /* 6 bits of data in each subsequent byte */
                    ch |= (wchar_t)(utf8char[i] & 0x3f);
          }

          if (wchar) *wchar = ch;

          return utflen;
}

/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a wide char string.
   No more than 'count' wide chars will be written to the output buffer.
   Return the size of the converted string in wide chars, excl null terminator.
*/
int
ldap_x_utf8s_to_wcs ( wchar_t *wcstr, const char *utf8str, size_t count )
{
          size_t wclen = 0;
          int utflen, i;
          wchar_t ch;


          /* If input ptr is NULL or empty... */
          if (utf8str == NULL || !*utf8str) {
                    if ( wcstr )
                              *wcstr = 0;
                    return 0;
          }

          /* Examine next UTF-8 character.  If output buffer is NULL, ignore count */
          while ( *utf8str && (wcstr==NULL || wclen<count) ) {
                    /* Get UTF-8 sequence length from 1st byte */
                    utflen = LDAP_UTF8_CHARLEN2(utf8str, utflen);

                    if( utflen==0 || utflen > (int)LDAP_MAX_UTF8_LEN ) return -1;

                    /* First byte minus length tag */
                    ch = (wchar_t)(utf8str[0] & mask[utflen]);

                    for(i=1; i < utflen; i++) {
                              /* Subsequent bytes must start with 10 */
                              if ((utf8str[i] & 0xc0) != 0x80) return -1;

                              ch <<= 6;                     /* 6 bits of data in each subsequent byte */
                              ch |= (wchar_t)(utf8str[i] & 0x3f);
                    }

                    if (wcstr) wcstr[wclen] = ch;

                    utf8str += utflen;  /* Move to next UTF-8 character */
                    wclen++;                      /* Count number of wide chars stored/required */
          }

          /* Add null terminator if there's room in the buffer. */
          if (wcstr && wclen < count) wcstr[wclen] = 0;

          return wclen;
}


/*-----------------------------------------------------------------------------
   Convert one wide char to a UTF-8 character.
   Return the length of the converted UTF-8 character in bytes.
   No more than 'count' bytes will be written to the output buffer.
*/
int
ldap_x_wc_to_utf8 ( char *utf8char, wchar_t wchar, size_t count )
{
          int len=0;

          if (utf8char == NULL)   /* Just determine the required UTF-8 char length. */
          {                                                           /* Ignore count */
                    if( wchar < 0 )
                              return -1;
                    if( wchar < 0x80 )
                              return 1;
                    if( wchar < 0x800 )
                              return 2;
                    if( wchar < 0x10000 )
                              return 3;
                    if( wchar < 0x200000 )
                              return 4;
                    if( wchar < 0x4000000 )
                              return 5;
#if SIZEOF_WCHAR_T > 4
                    /* UL is not strictly needed by ANSI C */
                    if( wchar < (wchar_t)0x80000000UL )
#endif /* SIZEOF_WCHAR_T > 4 */
                              return 6;
                    return -1;
          }


          if ( wchar < 0 ) {                                /* Invalid wide character */
                    len = -1;

          } else if( wchar < 0x80 ) {
                    if (count >= 1) {
                              utf8char[len++] = (char)wchar;
                    }

          } else if( wchar < 0x800 ) {
                    if (count >=2) {
                              utf8char[len++] = 0xc0 | ( wchar >> 6 );
                              utf8char[len++] = 0x80 | ( wchar & 0x3f );
                    }

          } else if( wchar < 0x10000 ) {
                    if (count >= 3) {
                              utf8char[len++] = 0xe0 | ( wchar >> 12 );
                              utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
                              utf8char[len++] = 0x80 | ( wchar & 0x3f );
                    }

          } else if( wchar < 0x200000 ) {
                    if (count >= 4) {
                              utf8char[len++] = 0xf0 | ( wchar >> 18 );
                              utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
                              utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
                              utf8char[len++] = 0x80 | ( wchar & 0x3f );
                    }

          } else if( wchar < 0x4000000 ) {
                    if (count >= 5) {
                              utf8char[len++] = 0xf8 | ( wchar >> 24 );
                              utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
                              utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
                              utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
                              utf8char[len++] = 0x80 | ( wchar & 0x3f );
                    }

          } else
#if SIZEOF_WCHAR_T > 4
                    /* UL is not strictly needed by ANSI C */
                    if( wchar < (wchar_t)0x80000000UL )
#endif /* SIZEOF_WCHAR_T > 4 */
          {
                    if (count >= 6) {
                              utf8char[len++] = 0xfc | ( wchar >> 30 );
                              utf8char[len++] = 0x80 | ( (wchar >> 24) & 0x3f );
                              utf8char[len++] = 0x80 | ( (wchar >> 18) & 0x3f );
                              utf8char[len++] = 0x80 | ( (wchar >> 12) & 0x3f );
                              utf8char[len++] = 0x80 | ( (wchar >> 6) & 0x3f );
                              utf8char[len++] = 0x80 | ( wchar & 0x3f );
                    }

#if SIZEOF_WCHAR_T > 4
          } else {
                    len = -1;
#endif /* SIZEOF_WCHAR_T > 4 */
          }

          return len;

}


/*-----------------------------------------------------------------------------
   Convert a wide char string to a UTF-8 string.
   No more than 'count' bytes will be written to the output buffer.
   Return the # of bytes written to the output buffer, excl null terminator.
*/
int
ldap_x_wcs_to_utf8s ( char *utf8str, const wchar_t *wcstr, size_t count )
{
          int len = 0;
          int n;
          char *p = utf8str;
          wchar_t empty = 0;            /* To avoid use of L"" construct */

          if (wcstr == NULL)            /* Treat input ptr NULL as an empty string */
                    wcstr = &empty;

          if (utf8str == NULL)          /* Just compute size of output, excl null */
          {
                    while (*wcstr)
                    {
                              /* Get UTF-8 size of next wide char */
                              n = ldap_x_wc_to_utf8( NULL, *wcstr++, LDAP_MAX_UTF8_LEN);
                              if (n == -1)
                                        return -1;
                              len += n;
                    }

                    return len;
          }


          /* Do the actual conversion. */

          n = 1;                                            /* In case of empty wcstr */
          while (*wcstr)
          {
                    n = ldap_x_wc_to_utf8( p, *wcstr++, count);

                    if (n <= 0)                   /* If encoding error (-1) or won't fit (0), quit */
                              break;

                    p += n;
                    count -= n;                             /* Space left in output buffer */
          }

          /* If not enough room for last character, pad remainder with null
             so that return value = original count, indicating buffer full. */
          if (n == 0)
          {
                    while (count--)
                              *p++ = 0;
          }

          /* Add a null terminator if there's room. */
          else if (count)
                    *p = 0;

          if (n == -1)                            /* Conversion encountered invalid wide char. */
                    return -1;

          /* Return the number of bytes written to output buffer, excl null. */
          return (p - utf8str);
}

#ifdef ANDROID
int wctomb(char *s, wchar_t wc) { return wcrtomb(s,wc,NULL); }
int mbtowc(wchar_t *pwc, const char *s, size_t n) { return mbrtowc(pwc, s, n, NULL); }
#endif

/*-----------------------------------------------------------------------------
   Convert a UTF-8 character to a MultiByte character.
   Return the size of the converted character in bytes.
*/
int
ldap_x_utf8_to_mb ( char *mbchar, const char *utf8char,
                    int (*f_wctomb)(char *mbchar, wchar_t wchar) )
{
          wchar_t wchar;
          int n;
          char tmp[6];                                      /* Large enough for biggest multibyte char */

          if (f_wctomb == NULL)                   /* If no conversion function was given... */
                    f_wctomb = wctomb;            /*    use the local ANSI C function */

          /* First convert UTF-8 char to a wide char */
          n = ldap_x_utf8_to_wc( &wchar, utf8char);

          if (n == -1)
                    return -1;                    /* Invalid UTF-8 character */

          if (mbchar == NULL)
                    n = f_wctomb( tmp, wchar );
          else
                    n = f_wctomb( mbchar, wchar);

          return n;
}

/*-----------------------------------------------------------------------------
   Convert a UTF-8 string to a MultiByte string.
   No more than 'count' bytes will be written to the output buffer.
   Return the size of the converted string in bytes, excl null terminator.
*/
int
ldap_x_utf8s_to_mbs ( char *mbstr, const char *utf8str, size_t count,
                    size_t (*f_wcstombs)(char *mbstr, const wchar_t *wcstr, size_t count) )
{
          wchar_t *wcs;
          size_t wcsize;
    int n;

          if (f_wcstombs == NULL)                 /* If no conversion function was given... */
                    f_wcstombs = wcstombs;        /*    use the local ANSI C function */

          if (utf8str == NULL || *utf8str == 0)   /* NULL or empty input string */
          {
                    if (mbstr)
                              *mbstr = 0;
                    return 0;
          }

/* Allocate memory for the maximum size wchar string that we could get. */
          wcsize = strlen(utf8str) + 1;
          wcs = (wchar_t *)LDAP_MALLOC(wcsize * sizeof(wchar_t));
          if (wcs == NULL)
                    return -1;                                        /* Memory allocation failure. */

          /* First convert the UTF-8 string to a wide char string */
          n = ldap_x_utf8s_to_wcs( wcs, utf8str, wcsize);

          /* Then convert wide char string to multi-byte string */
          if (n != -1)
          {
                    n = f_wcstombs(mbstr, wcs, count);
          }

          LDAP_FREE(wcs);

          return n;
}

/*-----------------------------------------------------------------------------
   Convert a MultiByte character to a UTF-8 character.
   'mbsize' indicates the number of bytes of 'mbchar' to check.
   Returns the number of bytes written to the output character.
*/
int
ldap_x_mb_to_utf8 ( char *utf8char, const char *mbchar, size_t mbsize,
                    int (*f_mbtowc)(wchar_t *wchar, const char *mbchar, size_t count) )
{
    wchar_t wchar;
    int n;

          if (f_mbtowc == NULL)                   /* If no conversion function was given... */
                    f_mbtowc = mbtowc;            /*    use the local ANSI C function */

    if (mbsize == 0)                                        /* 0 is not valid. */
        return -1;

    if (mbchar == NULL || *mbchar == 0)
    {
        if (utf8char)
            *utf8char = 0;
        return 1;
    }

          /* First convert the MB char to a Wide Char */
          n = f_mbtowc( &wchar, mbchar, mbsize);

          if (n == -1)
                    return -1;

          /* Convert the Wide Char to a UTF-8 character. */
          n = ldap_x_wc_to_utf8( utf8char, wchar, LDAP_MAX_UTF8_LEN);

          return n;
}


/*-----------------------------------------------------------------------------
   Convert a MultiByte string to a UTF-8 string.
   No more than 'count' bytes will be written to the output buffer.
   Return the size of the converted string in bytes, excl null terminator.
*/
int
ldap_x_mbs_to_utf8s ( char *utf8str, const char *mbstr, size_t count,
                    size_t (*f_mbstowcs)(wchar_t *wcstr, const char *mbstr, size_t count) )
{
          wchar_t *wcs;
          int n;
          size_t wcsize;

          if (mbstr == NULL)               /* Treat NULL input string as an empty string */
                    mbstr = "";

          if (f_mbstowcs == NULL)                 /* If no conversion function was given... */
                    f_mbstowcs = mbstowcs;        /*    use the local ANSI C function */

          /* Allocate memory for the maximum size wchar string that we could get. */
          wcsize = strlen(mbstr) + 1;
          wcs = (wchar_t *)LDAP_MALLOC( wcsize * sizeof(wchar_t) );
          if (wcs == NULL)
                    return -1;

          /* First convert multi-byte string to a wide char string */
          n = f_mbstowcs(wcs, mbstr, wcsize);

          /* Convert wide char string to UTF-8 string */
          if (n != -1)
          {
                    n = ldap_x_wcs_to_utf8s( utf8str, wcs, count);
          }

          LDAP_FREE(wcs);

          return n;
}

#endif /* SIZEOF_WCHAR_T >= 4 */