1 /*-
2 * SPDX-License-Identifier: BSD-2-Clause-FreeBSD
3 *
4 * Copyright 2013 Garrett D'Amore <[email protected]>
5 * Copyright 2011 Nexenta Systems, Inc. All rights reserved.
6 * Copyright (c) 2002-2004 Tim J. Robbins
7 * All rights reserved.
8 *
9 * Copyright (c) 2011 The FreeBSD Foundation
10 * All rights reserved.
11 * Portions of this software were developed by David Chisnall
12 * under sponsorship from the FreeBSD Foundation.
13 *
14 * Redistribution and use in source and binary forms, with or without
15 * modification, are permitted provided that the following conditions
16 * are met:
17 * 1. Redistributions of source code must retain the above copyright
18 * notice, this list of conditions and the following disclaimer.
19 * 2. Redistributions in binary form must reproduce the above copyright
20 * notice, this list of conditions and the following disclaimer in the
21 * documentation and/or other materials provided with the distribution.
22 *
23 * THIS SOFTWARE IS PROVIDED BY THE AUTHOR AND CONTRIBUTORS ``AS IS'' AND
24 * ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
25 * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
26 * ARE DISCLAIMED. IN NO EVENT SHALL THE AUTHOR OR CONTRIBUTORS BE LIABLE
27 * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
28 * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
29 * OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
30 * HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
31 * LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
32 * OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
33 * SUCH DAMAGE.
34 */
35
36 #include <sys/param.h>
37 __FBSDID("$FreeBSD$");
38
39 #include <errno.h>
40 #include <limits.h>
41 #include <runetype.h>
42 #include <stdlib.h>
43 #include <string.h>
44 #include <wchar.h>
45 #include "mblocal.h"
46
/* Declaration only; the definition is not part of this file. */
extern int __mb_sb_limit;

/*
 * Forward declarations of the UTF-8 conversion methods.  They are
 * defined further down and installed into the locale by _UTF8_init().
 */
static int	_UTF8_mbsinit(const mbstate_t *ps);
static size_t	_UTF8_mbrtowc(wchar_t * __restrict pwc,
		    const char * __restrict s, size_t n,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcrtomb(char * __restrict s, wchar_t wc,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_mbsnrtowcs(wchar_t * __restrict dst,
		    const char ** __restrict src, size_t nms, size_t len,
		    mbstate_t * __restrict ps);
static size_t	_UTF8_wcsnrtombs(char * __restrict dst,
		    const wchar_t ** __restrict src, size_t nwc, size_t len,
		    mbstate_t * __restrict ps);
59
/*
 * Decoder state for a partially read UTF-8 sequence.  The conversion
 * routines in this file access it by casting an mbstate_t pointer to
 * this type.
 */
typedef struct {
	/* Code point bits accumulated from the bytes consumed so far. */
	wchar_t ch;
	/* Bytes still required to finish the sequence; 0 is the initial state. */
	int want;
	/* Smallest code point allowed for the sequence (rejects overlong forms). */
	wchar_t lbound;
} _UTF8State;
65
66 int
_UTF8_init(struct xlocale_ctype * l,_RuneLocale * rl)67 _UTF8_init(struct xlocale_ctype *l, _RuneLocale *rl)
68 {
69
70 l->__mbrtowc = _UTF8_mbrtowc;
71 l->__wcrtomb = _UTF8_wcrtomb;
72 l->__mbsinit = _UTF8_mbsinit;
73 l->__mbsnrtowcs = _UTF8_mbsnrtowcs;
74 l->__wcsnrtombs = _UTF8_wcsnrtombs;
75 l->runes = rl;
76 l->__mb_cur_max = 4;
77 /*
78 * UCS-4 encoding used as the internal representation, so
79 * slots 0x0080-0x00FF are occuped and must be excluded
80 * from the single byte ctype by setting the limit.
81 */
82 l->__mb_sb_limit = 128;
83
84 return (0);
85 }
86
87 static int
_UTF8_mbsinit(const mbstate_t * ps)88 _UTF8_mbsinit(const mbstate_t *ps)
89 {
90
91 return (ps == NULL || ((const _UTF8State *)ps)->want == 0);
92 }
93
94 static size_t
_UTF8_mbrtowc(wchar_t * __restrict pwc,const char * __restrict s,size_t n,mbstate_t * __restrict ps)95 _UTF8_mbrtowc(wchar_t * __restrict pwc, const char * __restrict s, size_t n,
96 mbstate_t * __restrict ps)
97 {
98 _UTF8State *us;
99 int ch, i, mask, want;
100 wchar_t lbound, wch;
101
102 us = (_UTF8State *)ps;
103
104 if (us->want < 0 || us->want > 6) {
105 errno = EINVAL;
106 return ((size_t)-1);
107 }
108
109 if (s == NULL) {
110 s = "";
111 n = 1;
112 pwc = NULL;
113 }
114
115 if (n == 0)
116 /* Incomplete multibyte sequence */
117 return ((size_t)-2);
118
119 if (us->want == 0) {
120 /*
121 * Determine the number of octets that make up this character
122 * from the first octet, and a mask that extracts the
123 * interesting bits of the first octet. We already know
124 * the character is at least two bytes long.
125 *
126 * We also specify a lower bound for the character code to
127 * detect redundant, non-"shortest form" encodings. For
128 * example, the sequence C0 80 is _not_ a legal representation
129 * of the null character. This enforces a 1-to-1 mapping
130 * between character codes and their multibyte representations.
131 */
132 ch = (unsigned char)*s;
133 if ((ch & 0x80) == 0) {
134 /* Fast path for plain ASCII characters. */
135 if (pwc != NULL)
136 *pwc = ch;
137 return (ch != '\0' ? 1 : 0);
138 }
139 if ((ch & 0xe0) == 0xc0) {
140 mask = 0x1f;
141 want = 2;
142 lbound = 0x80;
143 } else if ((ch & 0xf0) == 0xe0) {
144 mask = 0x0f;
145 want = 3;
146 lbound = 0x800;
147 } else if ((ch & 0xf8) == 0xf0) {
148 mask = 0x07;
149 want = 4;
150 lbound = 0x10000;
151 } else {
152 /*
153 * Malformed input; input is not UTF-8.
154 */
155 errno = EILSEQ;
156 return ((size_t)-1);
157 }
158 } else {
159 want = us->want;
160 lbound = us->lbound;
161 }
162
163 /*
164 * Decode the octet sequence representing the character in chunks
165 * of 6 bits, most significant first.
166 */
167 if (us->want == 0)
168 wch = (unsigned char)*s++ & mask;
169 else
170 wch = us->ch;
171
172 for (i = (us->want == 0) ? 1 : 0; i < MIN(want, n); i++) {
173 if ((*s & 0xc0) != 0x80) {
174 /*
175 * Malformed input; bad characters in the middle
176 * of a character.
177 */
178 errno = EILSEQ;
179 return ((size_t)-1);
180 }
181 wch <<= 6;
182 wch |= *s++ & 0x3f;
183 }
184 if (i < want) {
185 /* Incomplete multibyte sequence. */
186 us->want = want - i;
187 us->lbound = lbound;
188 us->ch = wch;
189 return ((size_t)-2);
190 }
191 if (wch < lbound) {
192 /*
193 * Malformed input; redundant encoding.
194 */
195 errno = EILSEQ;
196 return ((size_t)-1);
197 }
198 if ((wch >= 0xd800 && wch <= 0xdfff) || wch > 0x10ffff) {
199 /*
200 * Malformed input; invalid code points.
201 */
202 errno = EILSEQ;
203 return ((size_t)-1);
204 }
205 if (pwc != NULL)
206 *pwc = wch;
207 us->want = 0;
208 return (wch == L'\0' ? 0 : want);
209 }
210
211 static size_t
_UTF8_mbsnrtowcs(wchar_t * __restrict dst,const char ** __restrict src,size_t nms,size_t len,mbstate_t * __restrict ps)212 _UTF8_mbsnrtowcs(wchar_t * __restrict dst, const char ** __restrict src,
213 size_t nms, size_t len, mbstate_t * __restrict ps)
214 {
215 _UTF8State *us;
216 const char *s;
217 size_t nchr;
218 wchar_t wc;
219 size_t nb;
220
221 us = (_UTF8State *)ps;
222
223 s = *src;
224 nchr = 0;
225
226 if (dst == NULL) {
227 /*
228 * The fast path in the loop below is not safe if an ASCII
229 * character appears as anything but the first byte of a
230 * multibyte sequence. Check now to avoid doing it in the loop.
231 */
232 if (nms > 0 && us->want > 0 && (signed char)*s > 0) {
233 errno = EILSEQ;
234 return ((size_t)-1);
235 }
236 for (;;) {
237 if (nms > 0 && (signed char)*s > 0)
238 /*
239 * Fast path for plain ASCII characters
240 * excluding NUL.
241 */
242 nb = 1;
243 else if ((nb = _UTF8_mbrtowc(&wc, s, nms, ps)) ==
244 (size_t)-1)
245 /* Invalid sequence - mbrtowc() sets errno. */
246 return ((size_t)-1);
247 else if (nb == 0 || nb == (size_t)-2)
248 return (nchr);
249 s += nb;
250 nms -= nb;
251 nchr++;
252 }
253 /*NOTREACHED*/
254 }
255
256 /*
257 * The fast path in the loop below is not safe if an ASCII
258 * character appears as anything but the first byte of a
259 * multibyte sequence. Check now to avoid doing it in the loop.
260 */
261 if (nms > 0 && len > 0 && us->want > 0 && (signed char)*s > 0) {
262 errno = EILSEQ;
263 return ((size_t)-1);
264 }
265 while (len-- > 0) {
266 if (nms > 0 && (signed char)*s > 0) {
267 /*
268 * Fast path for plain ASCII characters
269 * excluding NUL.
270 */
271 *dst = (wchar_t)*s;
272 nb = 1;
273 } else if ((nb = _UTF8_mbrtowc(dst, s, nms, ps)) ==
274 (size_t)-1) {
275 *src = s;
276 return ((size_t)-1);
277 } else if (nb == (size_t)-2) {
278 *src = s + nms;
279 return (nchr);
280 } else if (nb == 0) {
281 *src = NULL;
282 return (nchr);
283 }
284 s += nb;
285 nms -= nb;
286 nchr++;
287 dst++;
288 }
289 *src = s;
290 return (nchr);
291 }
292
293 static size_t
_UTF8_wcrtomb(char * __restrict s,wchar_t wc,mbstate_t * __restrict ps)294 _UTF8_wcrtomb(char * __restrict s, wchar_t wc, mbstate_t * __restrict ps)
295 {
296 _UTF8State *us;
297 unsigned char lead;
298 int i, len;
299
300 us = (_UTF8State *)ps;
301
302 if (us->want != 0) {
303 errno = EINVAL;
304 return ((size_t)-1);
305 }
306
307 if (s == NULL)
308 /* Reset to initial shift state (no-op) */
309 return (1);
310
311 /*
312 * Determine the number of octets needed to represent this character.
313 * We always output the shortest sequence possible. Also specify the
314 * first few bits of the first octet, which contains the information
315 * about the sequence length.
316 */
317 if ((wc & ~0x7f) == 0) {
318 /* Fast path for plain ASCII characters. */
319 *s = (char)wc;
320 return (1);
321 } else if ((wc & ~0x7ff) == 0) {
322 lead = 0xc0;
323 len = 2;
324 } else if ((wc & ~0xffff) == 0) {
325 if (wc >= 0xd800 && wc <= 0xdfff) {
326 errno = EILSEQ;
327 return ((size_t)-1);
328 }
329 lead = 0xe0;
330 len = 3;
331 } else if (wc >= 0 && wc <= 0x10ffff) {
332 lead = 0xf0;
333 len = 4;
334 } else {
335 errno = EILSEQ;
336 return ((size_t)-1);
337 }
338
339 /*
340 * Output the octets representing the character in chunks
341 * of 6 bits, least significant last. The first octet is
342 * a special case because it contains the sequence length
343 * information.
344 */
345 for (i = len - 1; i > 0; i--) {
346 s[i] = (wc & 0x3f) | 0x80;
347 wc >>= 6;
348 }
349 *s = (wc & 0xff) | lead;
350
351 return (len);
352 }
353
354 static size_t
_UTF8_wcsnrtombs(char * __restrict dst,const wchar_t ** __restrict src,size_t nwc,size_t len,mbstate_t * __restrict ps)355 _UTF8_wcsnrtombs(char * __restrict dst, const wchar_t ** __restrict src,
356 size_t nwc, size_t len, mbstate_t * __restrict ps)
357 {
358 _UTF8State *us;
359 char buf[MB_LEN_MAX];
360 const wchar_t *s;
361 size_t nbytes;
362 size_t nb;
363
364 us = (_UTF8State *)ps;
365
366 if (us->want != 0) {
367 errno = EINVAL;
368 return ((size_t)-1);
369 }
370
371 s = *src;
372 nbytes = 0;
373
374 if (dst == NULL) {
375 while (nwc-- > 0) {
376 if (0 <= *s && *s < 0x80)
377 /* Fast path for plain ASCII characters. */
378 nb = 1;
379 else if ((nb = _UTF8_wcrtomb(buf, *s, ps)) ==
380 (size_t)-1)
381 /* Invalid character - wcrtomb() sets errno. */
382 return ((size_t)-1);
383 if (*s == L'\0')
384 return (nbytes + nb - 1);
385 s++;
386 nbytes += nb;
387 }
388 return (nbytes);
389 }
390
391 while (len > 0 && nwc-- > 0) {
392 if (0 <= *s && *s < 0x80) {
393 /* Fast path for plain ASCII characters. */
394 nb = 1;
395 *dst = *s;
396 } else if (len > (size_t)MB_CUR_MAX) {
397 /* Enough space to translate in-place. */
398 if ((nb = _UTF8_wcrtomb(dst, *s, ps)) == (size_t)-1) {
399 *src = s;
400 return ((size_t)-1);
401 }
402 } else {
403 /*
404 * May not be enough space; use temp. buffer.
405 */
406 if ((nb = _UTF8_wcrtomb(buf, *s, ps)) == (size_t)-1) {
407 *src = s;
408 return ((size_t)-1);
409 }
410 if (nb > (int)len)
411 /* MB sequence for character won't fit. */
412 break;
413 memcpy(dst, buf, nb);
414 }
415 if (*s == L'\0') {
416 *src = NULL;
417 return (nbytes + nb - 1);
418 }
419 s++;
420 dst += nb;
421 len -= nb;
422 nbytes += nb;
423 }
424 *src = s;
425 return (nbytes);
426 }
427