1 /* $NetBSD: mbrtoc16.c,v 1.7 2024/08/18 20:06:05 rillig Exp $ */
2
3 /*-
4 * Copyright (c) 2024 The NetBSD Foundation, Inc.
5 * All rights reserved.
6 *
7 * Redistribution and use in source and binary forms, with or without
8 * modification, are permitted provided that the following conditions
9 * are met:
10 * 1. Redistributions of source code must retain the above copyright
11 * notice, this list of conditions and the following disclaimer.
12 * 2. Redistributions in binary form must reproduce the above copyright
13 * notice, this list of conditions and the following disclaimer in the
14 * documentation and/or other materials provided with the distribution.
15 *
16 * THIS SOFTWARE IS PROVIDED BY THE NETBSD FOUNDATION, INC. AND CONTRIBUTORS
17 * ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED
18 * TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR
19 * PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE FOUNDATION OR CONTRIBUTORS
20 * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
21 * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
22 * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
23 * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
24 * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
25 * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
26 * POSSIBILITY OF SUCH DAMAGE.
27 */
28
29 /*
30 * mbrtoc16(&c16, s, n, ps)
31 *
32 * Decode a Unicode scalar value from up to n bytes out of the
33 * multibyte string s, using multibyte encoding state ps, and
34 * store the next code unit in the UTF-16 representation of that
35 * scalar value at c16.
36 *
37 * If the next scalar value in s is outside the Basic Multilingual
38 * Plane, mbrtoc16 will yield the high surrogate code point in one
39 * call that consumes input, and will yield the low surrogate code
40 * point in the next call without consuming any input and
41 * returning (size_t)-3 instead.
42 *
43 * Return the number of bytes consumed on success, or:
44 *
45 * - 0 if the code unit is NUL, or
46 * - (size_t)-3 if the trailing low surrogate of a surrogate pair
47 * was returned without consuming any additional input, or
48 * - (size_t)-2 if the input is incomplete, or
49 * - (size_t)-1 on error with errno set to EILSEQ.
50 *
51 * In the case of incomplete input, the decoding state so far
52 * after processing s[0], s[1], ..., s[n - 1] is saved in ps, so
53 * subsequent calls to mbrtoc16 will pick up n bytes later into
54 * the input stream.
55 *
56 * References:
57 *
58 * The Unicode Standard, Version 15.0 -- Core Specification, The
59 * Unicode Consortium, Sec. 3.8 `Surrogates', p. 118.
60 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
61 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=144
62 *
63 * The Unicode Standard, Version 15.0 -- Core Specification, The
64 * Unicode Consortium, Sec. 3.9 `Unicode Encoding Forms': UTF-16,
65 * p. 124.
66 * https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
67 * https://web.archive.org/web/20240718101254/https://www.unicode.org/versions/Unicode15.0.0/UnicodeStandard-15.0.pdf#page=150
68 *
69 * P. Hoffman and F. Yergeau, `UTF-16, an encoding of ISO 10646',
70 * RFC 2781, Internet Engineering Task Force, February 2000,
71 * Sec. 2.1: `Encoding UTF-16'.
72 * https://datatracker.ietf.org/doc/html/rfc2781#section-2.1
73 */
74
75 #include <sys/cdefs.h>
76 __RCSID("$NetBSD: mbrtoc16.c,v 1.7 2024/08/18 20:06:05 rillig Exp $");
77
78 #include "namespace.h"
79
80 #include <assert.h>
81 #include <errno.h>
82 #include <locale.h>
83 #include <stdalign.h>
84 #include <stddef.h>
85 #include <uchar.h>
86
87 #include "mbrtoc32.h"
88 #include "setlocale_local.h"
89
90 struct mbrtoc16state {
91 char16_t surrogate;
92 mbstate_t mbs;
93 };
94 __CTASSERT(offsetof(struct mbrtoc16state, mbs) <= sizeof(mbstate_t));
95 __CTASSERT(sizeof(struct mbrtoc32state) <= sizeof(mbstate_t) -
96 offsetof(struct mbrtoc16state, mbs));
97 __CTASSERT(alignof(struct mbrtoc16state) <= alignof(mbstate_t));
98
/*
 * Where the toolchain headers provide __weak_alias, use it to tie the
 * names mbrtoc16_l and _mbrtoc16_l together.
 * NOTE(review): presumably "namespace.h" renames mbrtoc16_l to
 * _mbrtoc16_l inside libc, and this exports the public name as a weak
 * alias for it -- confirm against namespace.h and <sys/cdefs.h>.
 */
#if defined(__weak_alias)
__weak_alias(mbrtoc16_l,_mbrtoc16_l)
#endif
102
103 size_t
104 mbrtoc16(char16_t *restrict pc16, const char *restrict s, size_t n,
105 mbstate_t *restrict ps)
106 {
107
108 return mbrtoc16_l(pc16, s, n, ps, _current_locale());
109 }
110
111 size_t
mbrtoc16_l(char16_t * restrict pc16,const char * restrict s,size_t n,mbstate_t * restrict ps,locale_t restrict loc)112 mbrtoc16_l(char16_t *restrict pc16, const char *restrict s, size_t n,
113 mbstate_t *restrict ps, locale_t restrict loc)
114 {
115 static mbstate_t psbuf;
116 struct mbrtoc16state *S;
117 char32_t c32;
118 size_t len;
119
120 /*
121 * `If ps is a null pointer, each function uses its own
122 * internal mbstate_t object instead, which is initialized at
123 * program startup to the initial conversion state; the
124 * functions are not required to avoid data races with other
125 * calls to the same function in this case. The
126 * implementation behaves as if no library function calls
127 * these functions with a null pointer for ps.'
128 */
129 if (ps == NULL)
130 ps = &psbuf;
131
132 /*
133 * `If s is a null pointer, the mbrtoc16 function is equivalent
134 * to the call:
135 *
136 * mbrtoc16(NULL, "", 1, ps)
137 *
138 * In this case, the values of the parameters pc16 and n are
139 * ignored.'
140 */
141 if (s == NULL) {
142 pc16 = NULL;
143 s = "";
144 n = 1;
145 }
146
147 /*
148 * Get the private conversion state.
149 */
150 S = (struct mbrtoc16state *)(void *)ps;
151
152 /*
153 * If there is a pending surrogate, yield it and consume no
154 * bytes of the input, returning (size_t)-3 to indicate that no
155 * bytes of input were consumed.
156 */
157 if (S->surrogate != 0) {
158 _DIAGASSERT(S->surrogate >= 0xdc00);
159 _DIAGASSERT(S->surrogate <= 0xdfff);
160 if (pc16)
161 *pc16 = S->surrogate;
162 S->surrogate = 0;
163 return (size_t)-3;
164 }
165
166 /*
167 * Consume the next scalar value. If no full scalar value can
168 * be obtained, stop here.
169 */
170 len = mbrtoc32_l(&c32, s, n, &S->mbs, loc);
171 switch (len) {
172 case 0: /* NUL */
173 if (pc16)
174 *pc16 = 0;
175 return 0;
176 case (size_t)-2: /* still incomplete after n bytes */
177 case (size_t)-1: /* error */
178 return len;
179 default: /* consumed len bytes of input */
180 break;
181 }
182
183 /*
184 * We consumed a scalar value from the input.
185 *
186 * If it's inside the Basic Multilingual Plane (16-bit scalar
187 * values), return it.
188 *
189 * If it's outside the Basic Multilingual Plane, split it into
190 * high and low surrogate code points, return the high, and
191 * save the low.
192 */
193 if (c32 <= 0xffff) {
194 if (pc16)
195 *pc16 = c32;
196 _DIAGASSERT(S->surrogate == 0);
197 } else {
198 c32 -= 0x10000;
199 const char16_t w1 = 0xd800 | __SHIFTOUT(c32, __BITS(19,10));
200 const char16_t w2 = 0xdc00 | __SHIFTOUT(c32, __BITS(9,0));
201 if (pc16)
202 *pc16 = w1;
203 S->surrogate = w2;
204 _DIAGASSERT(S->surrogate != 0);
205 _DIAGASSERT(S->surrogate >= 0xdc00);
206 _DIAGASSERT(S->surrogate <= 0xdfff);
207 }
208
209 /*
210 * Return the number of bytes consumed from the input.
211 */
212 return len;
213 }
214