xref: /xnu-11215/bsd/vfs/vfs_unicode.c (revision 94d3b452)
1 /*
2  * Copyright (C) 2016-2023 Apple, Inc. All rights reserved.
3  * Some portions covered by other copyrights, listed below.
4  *---
5  * Copyright (C) 2016 and later: Unicode, Inc. and others.
6  * License & terms of use: http://www.unicode.org/copyright.html
7  *---
8  * Copyright (C) 1999-2015, International Business Machines
9  * Corporation and others.  All Rights Reserved.
10  *
11  * add APPLE_OSREFERENCE_LICENSE_HEADER stuff...
12  */
13 
14 #include <libkern/libkern.h>
15 #include <sys/errno.h>
16 #include <sys/unicode.h>
17 #include "vfs_unicode_data.h"
18 #define STATIC_UNLESS_TEST static
19 
20 enum {
21 	/* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */
22 	kMaxUTF8BytesPerChar = 4
23 };
24 
25 /* local prototypes used by exported functions (and themselves exported for testing) */
26 STATIC_UNLESS_TEST
27 int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit);
28 STATIC_UNLESS_TEST
29 int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
30     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
31     uint8_t combClass[kNFCSingleCharDecompMax]);
32 /* local prototypes used by exported functions (not exported for separate testing) */
33 static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes,
34     int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
35     int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
36     bool* needReorderP, bool* startP);
37 void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen);
38 int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]);
39 
40 /*
41  * utf8_normalizeOptCaseFoldGetUVersion
42  *
43  * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
44  * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
45  * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
46  * version[3] = Code revision level; for any given Unicode version, this value starts
47  *              at 0 and is incremented for each significant revision to the
48  *              normalizeOptCaseFold functions.
49  */
50 void
utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4])51 utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4])
52 {
53 	version[0] = 15;
54 	version[1] = 1;
55 	version[2] = 0;
56 	version[3] = 0;
57 	return;
58 }
59 
60 /*
61  * utf8_normalizeOptCaseFoldAndHash
62  *
63  * str:       The input UTF-8 string (need not be 0 terminated)
64  * str_len:   The byte length of the input string (excluding any 0 terminator)
65  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
66  *            True for case-sensitive behavior; generates standard NFD.
67  * hash_func: A pointer to a hashing function to compute the hash of the
68  *            normalized/case-folded result. buf contains buf_len bytes
69  *            of data to be added to the hash using the caller-supplied
70  *            context (ctx).
71  * hash_ctx:  The context for the hash function.
72  *
73  * Returns: 0 on success, or
74  *          EILSEQ: The input string contains illegal ASCII-range characters
75  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
76  *                  contains codepoints that are non-characters or unassigned in
77  *                  the version of Unicode currently supported (Unicode 9.0).
78  */
79 
80 int
utf8_normalizeOptCaseFoldAndHash(const char * str,size_t str_len,bool case_sens,void (* hash_func)(void * buf,size_t buf_len,void * ctx),void * hash_ctx)81 utf8_normalizeOptCaseFoldAndHash(const char *str,
82     size_t      str_len,
83     bool        case_sens,
84     void      (*hash_func)(void *buf, size_t buf_len, void *ctx),
85     void       *hash_ctx)
86 {
87 	const char *strLimit = str + str_len;
88 
89 	/* Data for the next pending single-char norm from input;
90 	 *  This will always begin with a base char (combining class 0)
91 	 *  or the first character in the string, which may no be a base */
92 	int32_t unorm[kNFCSingleCharDecompMax];
93 	uint8_t unormcc[kNFCSingleCharDecompMax];
94 	int32_t unormlen = 0;
95 	int32_t unormstart = 0;
96 
97 	bool start = true;
98 
99 	/* main loop:
100 	 * Each input character may be normalized to a sequence of one or more characters,
101 	 * some of which may have non-zero combining class. Any sequence of characters
102 	 * with non-zero combining class resulting from one or more input characters needs
103 	 * to be accumulated in the main buffer so we can reorder as necessary before
104 	 * calling the hash function.
105 	 *
106 	 * At the beginning of the main loop: The normalization buffer and main buffer are
107 	 * both empty.
108 	 *
109 	 * Each time through the main loop we do the following:
110 	 * 1. If there are characters available in the normalization result buffer (from the
111 	 *    result of normalizing a previous input character), copy the first character and
112 	 *    any following characters that have non-zero combining class to the main buffer.
113 	 * 2. If there is nothing left in the normalization buffer, then loop processing
114 	 *    input characters as follows:
115 	 *   a) Get the next input character from UTF8, get its normalized and case-folded
116 	 *      result in the normalization buffer.
117 	 *   b) If the first character in the normalization buffer has combining class 0,
118 	 *      break; we will handle this normalization buffer next time through the main
119 	 *      loop.
120 	 *   c) Else copy the current normalization buffer (which has only combining marks)
121 	 *      to the main buffer, and continue with the loop processing input characters.
122 	 * 3. At this point the first character in the main buffer may or may not have
123 	 *    combining class 0, but any subsequent characters (up to the the limit for
124 	 *    stream safe text) will be combining characters with nonzero combining class.
125 	 *    Reorder the combining marks if necessary into canonical order.
126 	 * 4. Call the hash function for each character in the main buffer.
127 	 *
128 	 */
129 	do {
130 		/* Data for the buffers being built up from input */
131 		int32_t buf[kNCFStreamSafeBufMax];
132 		uint8_t bufcc[kNCFStreamSafeBufMax];
133 		int32_t buflen = 0;
134 		bool needReorder = false;
135 		int err;
136 
137 		err = nextBaseAndAnyMarks(&str, strLimit, case_sens, false /* allow_slashes */,
138 		    unorm, unormcc, &unormlen, &unormstart, buf, bufcc, &buflen, &needReorder, &start);
139 		if (err != 0) {
140 			return err;
141 		}
142 
143 		if (buflen > 0) {
144 			/* Now buffer should have all of the combining marks up to the next base char.
145 			 * Normally it will also start with the last base char encountered (unless the
146 			 * UTF8 string began with a combining mark). */
147 			/* Now reorder combining marks if necessary. */
148 			if (needReorder) {
149 				doReorder(buf, bufcc, buflen);
150 			}
151 			/* Now write to hash func */
152 			hash_func(buf, buflen * sizeof(buf[0]), hash_ctx);
153 		}
154 		/* OK so far, top of loop clears buffers to start refilling again */
155 	} while (str < strLimit || unormlen > 0);
156 	return 0;
157 }
158 
159 /*
160  * utf8_normalizeOptCaseFoldAndCompare
161  *
162  * strA:      A UTF-8 string to be compared (need not be 0 terminated)
163  * strA_len:  The byte length of strA (excluding any 0 terminator)
164  * strB:      The second UTF-8 string to be compared (need not be 0 terminated)
165  * strB_len:  The byte length of strB (excluding any 0 terminator)
166  * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
167  *            True for case-sensitive behavior; compares standard NFD forms.
168  * are_equal: On success, set to true if the strings are equal, or set to false
169  *            if they are not.
170  *
171  * Returns: 0 on success, or
172  *          EILSEQ: One or both of the input strings contains illegal ASCII-range
173  *                  characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
174  *                  or contains codepoints that are non-characters or unassigned in
175  *                  the version of Unicode currently supported (Unicode 9.0).
176  *                  Note: The comparison may terminate early when a difference is
177  *                        detected, and may return 0 and set *are_equal=false even
178  *                        if one or both strings are invalid.
179  */
180 enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + 4 }; /* room for 03B9 pushback(s) */
181 
182 int
utf8_normalizeOptCaseFoldAndCompare(const char * strA,size_t strA_len,const char * strB,size_t strB_len,bool case_sens,bool * are_equal)183 utf8_normalizeOptCaseFoldAndCompare(const char *strA,
184     size_t      strA_len,
185     const char *strB,
186     size_t      strB_len,
187     bool        case_sens,
188     bool       *are_equal)
189 {
190 	const char *strALimit = strA + strA_len;
191 	const char *strBLimit = strB + strB_len;
192 
193 	/* Data for the next pending single-char norms from each input;
194 	 *  These will always begin with a base char (combining class 0)
195 	 *  or the first character in the string, which may not be a base */
196 	int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback];
197 	uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback];
198 	int32_t unormAlen = 0, unormBlen = 0;
199 	int32_t unormAstart = 0, unormBstart = 0;
200 
201 	bool startA = true, startB = true;
202 
203 	/* main loop:
204 	 * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash,
205 	 * described above. The differences are:
206 	 * - We keep a normalization buffer and main buffer for each string.
207 	 * - In the main loop, we do steps 1-3 for each string.
208 	 * - In step 4, instead of calling the hash function, we compare the two main
209 	 *   buffers; if they are unequal, we return a non-equal result.
210 	 * - After the end of the main loop, if we still have data for one string but
211 	 *   not the other, return a non-equal result, else return an equal result.
212 	 */
213 	do {
214 		/* Data for the buffers being built up from each input */
215 		int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax];
216 		uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax];
217 		int32_t bufAlen = 0, bufBlen = 0;
218 		bool needReorderA = false, needReorderB = false;
219 		int err;
220 
221 		err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, false /* allow_slashes */,
222 		    unormA, unormAcc, &unormAlen, &unormAstart, bufA, bufAcc, &bufAlen, &needReorderA, &startA);
223 		if (err != 0) {
224 			return err;
225 		}
226 		err = nextBaseAndAnyMarks(&strB, strBLimit, case_sens, false /* allow_slashes */,
227 		    unormB, unormBcc, &unormBlen, &unormBstart, bufB, bufBcc, &bufBlen, &needReorderB, &startB);
228 		if (err != 0) {
229 			return err;
230 		}
231 
232 		if (bufAlen > 0 || bufBlen > 0) {
233 			/* Now each buffer should have all of the combining marks up to the next base char.
234 			 * Normally it will also start with the last base char encountered (unless the
235 			 * UTF8 string began with a combining mark). */
236 			/* Now reorder combining marks if necessary. */
237 			if (needReorderA) {
238 				doReorder(bufA, bufAcc, bufAlen);
239 			}
240 			if (needReorderB) {
241 				doReorder(bufB, bufBcc, bufBlen);
242 			}
243 			/* handle 03B9 pushback */
244 			int32_t idx;
245 			if (!case_sens) {
246 				if (bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9 && unormAstart == 0) {
247 					int32_t tailCount = 0;
248 					while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9) {
249 						tailCount++;
250 						bufAlen--;
251 					}
252 					for (idx = unormAlen; idx > 0; idx--) {
253 						unormA[idx - 1 + tailCount] = unormA[idx - 1];
254 						unormAcc[idx - 1 + tailCount] = unormAcc[idx - 1];
255 					}
256 					for (idx = 0; idx < tailCount; idx++) {
257 						unormA[idx] = 0x03B9;
258 						unormAcc[idx] = 0;
259 					}
260 					unormAlen += tailCount;
261 				}
262 				if (bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9 && unormBstart == 0) {
263 					int32_t tailCount = 0;
264 					while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9) {
265 						tailCount++;
266 						bufBlen--;
267 					}
268 					for (idx = unormBlen; idx > 0; idx--) {
269 						unormB[idx - 1 + tailCount] = unormB[idx - 1];
270 						unormBcc[idx - 1 + tailCount] = unormBcc[idx - 1];
271 					}
272 					for (idx = 0; idx < tailCount; idx++) {
273 						unormB[idx] = 0x03B9;
274 						unormBcc[idx] = 0;
275 					}
276 					unormBlen += tailCount;
277 				}
278 			}
279 			/* Now compare the buffers. */
280 			if (bufAlen != bufBlen || memcmp(bufA, bufB, bufAlen * sizeof(bufA[0])) != 0) {
281 				*are_equal = false;
282 				return 0;
283 			}
284 		}
285 		/* OK so far, top of loop clears buffers to start refilling again */
286 	} while ((strA < strALimit || unormAlen > 0) && (strB < strBLimit || unormBlen > 0));
287 
288 	*are_equal = (strA == strALimit && unormAlen == 0 && strB == strBLimit && unormBlen == 0);
289 	return 0;
290 }
291 
292 /*
293  * utf8_normalizeOptCaseFold
294  *
295  * str:       The input UTF-8 string (need not be 0 terminated)
296  * str_len:   The byte length of the input string (excluding any 0 terminator)
297  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
298  *            True for case-sensitive behavior; generates standard NFD.
299  * ustr:      A pointer to a buffer for the resulting UTF-32 string.
300  * ustr_size: The capacity of ustr, in UTF-32 units.
301  * ustr_len:  Pointer to a value that will be filled in with the actual length
302  *            in UTF-32 units of the string copied to ustr.
303  *
304  * Returns: 0 on success, or
305  *          EILSEQ: The input string contains illegal ASCII-range characters
306  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
307  *                  contains codepoints that are non-characters or unassigned in
308  *                  the version of Unicode currently supported.
309  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
310  *                  case the value returned in *ustr_len is invalid.
311  */
312 int
utf8_normalizeOptCaseFold(const char * str,size_t str_len,bool case_sens,int32_t * ustr,int32_t ustr_size,int32_t * ustr_len)313 utf8_normalizeOptCaseFold(const char *str,
314     size_t      str_len,
315     bool        case_sens,
316     int32_t    *ustr,
317     int32_t     ustr_size,
318     int32_t    *ustr_len)
319 {
320 	const char *strLimit = str + str_len;
321 	int32_t *ustrCur = ustr;
322 	const int32_t *ustrLimit = ustr + ustr_size;
323 
324 	/* Data for the next pending single-char norm from input;
325 	 *  This will always begin with a base char (combining class 0) */
326 	int32_t unorm[kNFCSingleCharDecompMax];
327 	uint8_t unormcc[kNFCSingleCharDecompMax];
328 	int32_t unormlen = 0;
329 	int32_t unormstart = 0;
330 
331 	bool start = true;
332 
333 	*ustr_len = 0;
334 	do {
335 		/* Data for the buffers being built up from input */
336 		int32_t buf[kNCFStreamSafeBufMax];
337 		uint8_t bufcc[kNCFStreamSafeBufMax];
338 		int32_t buflen = 0;
339 		bool needReorder = false;
340 		int err;
341 
342 		err = nextBaseAndAnyMarks(&str, strLimit, case_sens, false /* allow_slashes */,
343 		    unorm, unormcc, &unormlen, &unormstart, buf, bufcc, &buflen, &needReorder, &start);
344 		if (err != 0) {
345 			return err;
346 		}
347 
348 		if (buflen > 0) {
349 			if (needReorder) {
350 				doReorder(buf, bufcc, buflen);
351 			}
352 			/* Now copy to output buffer */
353 			int32_t idx;
354 			if (ustrCur + buflen > ustrLimit) {
355 				return ENOMEM;
356 			}
357 			for (idx = 0; idx < buflen; idx++) {
358 				*ustrCur++ = buf[idx];
359 			}
360 		}
361 		/* OK so far, top of loop clears buffers to start refilling again */
362 	} while (str < strLimit || unormlen > 0);
363 	*ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop
364 	return 0;
365 }
366 
367 static int
utf8_normalizeOptCaseFoldToUTF8_internal(const char * str,size_t str_len,bool case_sens,bool allow_slashes,char * ustr,size_t ustr_size,size_t * ustr_len)368 utf8_normalizeOptCaseFoldToUTF8_internal(const char *str,
369     size_t      str_len,
370     bool        case_sens,
371     bool        allow_slashes,
372     char       *ustr,
373     size_t      ustr_size,
374     size_t     *ustr_len)
375 {
376 	const char *strLimit = str + str_len;
377 	char *ustrCur = ustr;
378 	const char *ustrLimit = ustr + ustr_size;
379 
380 	/* Data for the next pending single-char norm from input;
381 	 *  This will always begin with a base char (combining class 0) */
382 	int32_t unorm[kNFCSingleCharDecompMax];
383 	uint8_t unormcc[kNFCSingleCharDecompMax];
384 	int32_t unormlen = 0;
385 	int32_t unormstart = 0;
386 
387 	bool start = true;
388 
389 	*ustr_len = 0;
390 	do {
391 		/* Data for the buffers being built up from input */
392 		int32_t buf[kNCFStreamSafeBufMax];
393 		uint8_t bufcc[kNCFStreamSafeBufMax];
394 		int32_t buflen = 0;
395 		bool needReorder = false;
396 		int err;
397 
398 		err = nextBaseAndAnyMarks(&str, strLimit, case_sens, allow_slashes,
399 		    unorm, unormcc, &unormlen, &unormstart, buf, bufcc, &buflen, &needReorder, &start);
400 		if (err != 0) {
401 			return err;
402 		}
403 
404 		if (buflen > 0) {
405 			uint8_t utf8Bytes[kMaxUTF8BytesPerChar];
406 			int32_t *bufPtr = buf;
407 			if (needReorder) {
408 				doReorder(buf, bufcc, buflen);
409 			}
410 			/* Now copy to output buffer */
411 			while (buflen-- > 0) {
412 				int32_t idx, utf8Len = u32CharToUTF8Bytes((uint32_t)*bufPtr++, utf8Bytes);
413 				if (ustrCur + utf8Len > ustrLimit) {
414 					return ENOMEM;
415 				}
416 				for (idx = 0; idx < utf8Len; idx++) {
417 					*ustrCur++ = (char)utf8Bytes[idx];
418 				}
419 			}
420 		}
421 		/* OK so far, top of loop clears buffers to start refilling again */
422 	} while (str < strLimit || unormlen > 0);
423 	*ustr_len = ustrCur - ustr;
424 	return 0;
425 }
426 
427 /*
428  * utf8_normalizeOptCaseFoldToUTF8
429  * (This is similar to normalizeOptCaseFold except that this has a different output
430  * buffer type, and adds conversion to UTF8 while copying to output buffer)
431  *
432  * str:       The input UTF-8 string (need not be 0 terminated)
433  * str_len:   The byte length of the input string (excluding any 0 terminator)
434  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
435  *            True for case-sensitive behavior; generates standard NFD.
436  * ustr:      A pointer to a buffer for the resulting UTF-8 string.
437  * ustr_size: The capacity of ustr, in bytes.
438  * ustr_len:  Pointer to a value that will be filled in with the actual length
439  *            in bytes of the string copied to ustr.
440  *
441  * Returns: 0 on success, or
442  *          EILSEQ: The input string contains illegal ASCII-range characters
443  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
444  *                  contains codepoints that are non-characters or unassigned in
445  *                  the version of Unicode currently supported.
446  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
447  *                  case the value returned in *ustr_len is invalid.
448  */
449 int
utf8_normalizeOptCaseFoldToUTF8(const char * str,size_t str_len,bool case_sens,char * ustr,size_t ustr_size,size_t * ustr_len)450 utf8_normalizeOptCaseFoldToUTF8(const char *str,
451     size_t      str_len,
452     bool        case_sens,
453     char       *ustr,
454     size_t      ustr_size,
455     size_t     *ustr_len)
456 {
457 	return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, false /* allow_slashes */,
458 	           ustr, ustr_size, ustr_len);
459 }
460 
461 /*
462  * utf8_normalizeOptCaseFoldToUTF8ForPath
463  * (This is similar to normalizeOptCaseFoldToUTF8 except that this allows '/' character.)
464  *
465  * str:       The input UTF-8 path string
466  * str_len:   The byte length of the input path string (excluding any 0 terminator)
467  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
468  *            True for case-sensitive behavior; generates standard NFD.
469  * ustr:      A pointer to a buffer for the resulting UTF-8 string.
470  * ustr_size: The capacity of ustr, in bytes.
471  * ustr_len:  Pointer to a value that will be filled in with the actual length
472  *            in bytes of the string copied to ustr.
473  *
474  * Returns: 0 on success, or
475  *          EILSEQ: The input string contains illegal ASCII-range characters
476  *                  (0x00), or is not well-formed stream-safe UTF-8, or
477  *                  contains codepoints that are non-characters or unassigned in
478  *                  the version of Unicode currently supported.
479  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
480  *                  case the value returned in *ustr_len is invalid.
481  */
482 int
utf8_normalizeOptCaseFoldToUTF8ForPath(const char * str,size_t str_len,bool case_sens,char * ustr,size_t ustr_size,size_t * ustr_len)483 utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
484     size_t      str_len,
485     bool        case_sens,
486     char       *ustr,
487     size_t      ustr_size,
488     size_t     *ustr_len)
489 {
490 	return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, true /* allow_slashes */,
491 	           ustr, ustr_size, ustr_len);
492 }
493 
494 /*
495  * utf8_normalizeOptCaseFoldAndMatchSubstring
496  *
497  * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
498  *            substring specified by ustrB.
499  * strA_len:  The byte length of strA (excluding any 0 terminator)
500  * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched
501  *            for in the UTF-32 string resulting from converting strA to the normalized
502  *            UTF-32 form specified by the case_sens parameter; ustrB must already be
503  *            in that form.
504  * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
505  * case_sens: False for case-insensitive matching; compares canonical caseless forms.
506  *            True for case-sensitive matching; compares standard NFD forms.
507  * buf:       Pointer to caller-supplied working memory for storing the portion of
508  *            strA which has been converted to normalized UTF-32.
509  * buf_size:  The size of buf.
510  * has_match: On success, set to true if strA (when converter to UTF-32 and normalized
511  *            per case_sens) contains ustrB, set to false otherwise.
512  *
513  * Returns: 0 on success, or
514  *          EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
515  *                  not well-formed stream-safe UTF-8, or contains codepoints that are
516  *                  non-characters or unassigned in the version of Unicode currently
517  *                  supported.
518  *                  Note: The search may terminate early when a match is detected, and
519  *                        may return 0 and set *has_match=true even if strA is invalid.
520  *          ENOMEM: buf_size is insufficient.
521  */
522 int
utf8_normalizeOptCaseFoldAndMatchSubstring(const char * strA,size_t strA_len,const int32_t * ustrB,int32_t ustrB_len,bool case_sens,void * buf,size_t buf_size,bool * has_match)523 utf8_normalizeOptCaseFoldAndMatchSubstring(const char    *strA,
524     size_t         strA_len,
525     const int32_t *ustrB,
526     int32_t        ustrB_len,
527     bool           case_sens,
528     void          *buf,
529     size_t         buf_size,
530     bool          *has_match)
531 {
532 	/*
533 	 * ustrA represents the current position in the UTF-32 normalized version of strA
534 	 * at which we want to test for a match; ustrANormEnd is the position beyond that
535 	 * which is just after the end of what has already been converted from strA to
536 	 * UTF-32 normalized form.
537 	 * Each time through the main loop:
538 	 * - The first task is to make sure we have enough of strA converted to UTF32
539 	 *   normalized form to test for match with ustrB at the current match position.
540 	 *   If we don't, then convert more of strA to UTF-32 normalized form until we
541 	 *   have enough to compare with ustrB. To do this, run a loop which is like the
542 	 *   main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of
543 	 *   calling the hash function, we copy the normalized buffer to ustrANormEnd,
544 	 *   advancing the latter. We keep doing this until we have enough additional
545 	 *   converted to match with ustrB.
546 	 * - Then we test for match of ustrB at the current ustrA position. If there is
547 	 *   a match we return; otherwise, if there is more strA to convert we advance
548 	 *   ustrA  and repeat the main loop, otherwise we return without a match.
549 	 */
550 	if (ustrB_len == 0) { /* always matches */
551 		*has_match = true;
552 		return 0;
553 	}
554 	*has_match = false; /* initialize return value */
555 	if (ustrB_len > 2 * strA_len) {
556 		/* If ustrB is clearly too long to find in strA, don't bother normalizing strA.
557 		 * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit.
558 		 * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units.
559 		 * The maximum expansion from unnormalized UTF-8 byte length to normalized
560 		 *  UTF-32 unit length is thus 2. */
561 		return 0;
562 	}
563 
564 	const char *strALimit = strA + strA_len;
565 	int32_t *ustrA = (int32_t *)buf;
566 	const int32_t *ustrALimit = ustrA + (buf_size / sizeof(int32_t));
567 	int32_t *ustrANormEnd = ustrA; /* how far we have already normalized in ustrA */
568 
569 	/* Data for the next pending single-char norms from each input;
570 	 *  These will always begin with a base char (combining class 0)
571 	 *  or the first character in the string, which may not be a base */
572 	int32_t unormA[kNFCSingleCharDecompMax];
573 	uint8_t unormAcc[kNFCSingleCharDecompMax];
574 	int32_t unormAlen = 0;
575 	int32_t unormAstart = 0;
576 
577 	bool startA = true;
578 
579 	while (true) {
580 		/* convert enough more of strA to normalized UTF-32 in ustrA to check for match */
581 		if (ustrANormEnd - ustrA < ustrB_len) {
582 			do {
583 				/* Data for the buffers being built up from each input */
584 				int32_t bufA[kNCFStreamSafeBufMax];
585 				uint8_t bufAcc[kNCFStreamSafeBufMax];
586 				int32_t bufAlen = 0;
587 				bool needReorderA = false;
588 				int err;
589 
590 				err = nextBaseAndAnyMarks(&strA, strALimit, case_sens, false /* allow_slashes */,
591 				    unormA, unormAcc, &unormAlen, &unormAstart, bufA, bufAcc, &bufAlen, &needReorderA, &startA);
592 				if (err != 0) {
593 					return err;
594 				}
595 
596 				if (bufAlen > 0) {
597 					/* Now each buffer should have all of the combining marks up to the next base char.
598 					 * Normally it will also start with the last base char encountered (unless the
599 					 * UTF8 string began with a combining mark). */
600 					/* Now reorder combining marks if necessary. Should be rare, and sequences should
601 					 * usually be short when does occur => simple bubblesort should be sufficient. */
602 					if (needReorderA) {
603 						doReorder(bufA, bufAcc, bufAlen);
604 					}
605 					/* Now copy to working buffer */
606 					int32_t idx;
607 					if (ustrANormEnd + bufAlen > ustrALimit) {
608 						return ENOMEM;
609 					}
610 					for (idx = 0; idx < bufAlen; idx++) {
611 						*ustrANormEnd++ = bufA[idx];
612 					}
613 				}
614 				/* OK so far, top of loop clears buffers to start refilling again */
615 			} while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit || unormAlen > 0));
616 		}
617 
618 		if (ustrANormEnd - ustrA < ustrB_len) {
619 			return 0; /* not enough of strA left for match */
620 		}
621 		/* check for match, return if so */
622 		if (memcmp(ustrA, ustrB, ustrB_len * sizeof(ustrB[0])) == 0) {
623 			*has_match = true;
624 			return 0;
625 		}
626 		ustrA++; /* advance match position */
627 	}
628 }
629 
630 /* nextBaseAndAnyMarks:
631  * Guts of code to get next bufferful of base character (or first char in string)
632  * and all trailing combining marks.
633  * This is called each time through the main loop of functions above, and does the
634  * following:
635  * 1. If there are characters available in the normalization result buffer (from the
636  *    result of normalizing a previous input character), copy the first character and
637  *    any following characters that have non-zero combining class to the main buffer.
638  * 2. If there is nothing left in the normalization buffer, then loop processing
639  *    input characters as follows:
640  *   a) Get the next input character from UTF8, get its normalized and case-folded
641  *      result in the normalization buffer.
642  *   b) If the first character in the normalization buffer has combining class 0,
643  *      break; we will handle this normalization buffer next time through the main
644  *      loop.
645  *   c) Else copy the current normalization buffer (which has only combining marks)
646  *      to the main buffer, and continue with the loop processing input characters.
647  */
648 
649 static int
nextBaseAndAnyMarks(const char ** strP,const char * strLimit,bool case_sens,bool allow_slashes,int32_t * unorm,uint8_t * unormcc,int32_t * unormlenP,int32_t * unormstartP,int32_t * buf,uint8_t * bufcc,int32_t * buflenP,bool * needReorderP,bool * startP)650 nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes,
651     int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
652     int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
653     bool* needReorderP, bool* startP)
654 {
655 	/* update buffers for str */
656 	if (*unormlenP > 0 && *unormstartP < *unormlenP) {
657 		/* unorm begins with a base char; buflen should be 0 */
658 		*needReorderP = false;
659 		for (*buflenP = 0; true;) {
660 			if (*buflenP > 0 && unormcc[*unormstartP] > 0 && unormcc[*unormstartP] < bufcc[(*buflenP) - 1]) {
661 				*needReorderP = true;
662 			}
663 			buf[*buflenP] = unorm[*unormstartP];
664 			bufcc[(*buflenP)++] = unormcc[(*unormstartP)++];
665 			if (*unormstartP >= *unormlenP || unormcc[*unormstartP] == 0) {
666 				break;
667 			}
668 		}
669 	}
670 	if (*unormstartP >= *unormlenP) {
671 		*unormstartP = *unormlenP = 0;
672 		while (*strP < strLimit) {
673 			int32_t idx;
674 			uint32_t bytevalue = (uint8_t)*(*strP)++;
675 			/* '/' is not produced by NFD decomposition from another character so we can
676 			 * check for it before normalization */
677 			if (bytevalue == 0 || (bytevalue == 0x2F /*'/'*/ && !allow_slashes)) {
678 				return EILSEQ;
679 			}
680 			if (bytevalue < 0x80) {
681 				unorm[0] = (!case_sens && bytevalue >= 'A' && bytevalue <= 'Z')? bytevalue += 0x20: bytevalue;
682 				*unormlenP = 1;
683 				unormcc[0] = 0;
684 				*startP = false;
685 				break;
686 			} else {
687 				int32_t u32char = utf8ToU32Code(bytevalue, strP, strLimit);
688 				if (u32char <= 0) {
689 					return EILSEQ;
690 				}
691 				*unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, unorm, unormcc);
692 				if (*unormlenP <= 0) {
693 					return EILSEQ;
694 				}
695 				if (unormcc[0] == 0 || *startP) {
696 					*startP = false;
697 					break;
698 				}
699 			}
700 			/* the latest char decomposes to just combining sequence, add to buffer being built */
701 			if (*buflenP + *unormlenP > kNCFStreamSafeBufMax) {
702 				return EILSEQ;
703 			}
704 			for (idx = 0; idx < *unormlenP; idx++, (*buflenP)++) {
705 				if (*buflenP > 0 && unormcc[idx] > 0 && unormcc[idx] < bufcc[(*buflenP) - 1]) {
706 					*needReorderP = true;
707 				}
708 				buf[*buflenP] = unorm[idx];
709 				bufcc[*buflenP] = unormcc[idx];
710 			}
711 			*unormlenP = 0;
712 		}
713 	}
714 	return 0;
715 }
716 
717 /*  local prototypes used only by internal functions */
718 static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]);
719 static int32_t adjustCase(bool case_sens, int32_t uSeqLen,
720     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
721 static uint8_t getCombClassU32Char(int32_t u32char);
722 static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
723 
724 /* Reorder combining marks if necessary. Should be rare, and sequences should
725  * usually be short when does occur => simple bubblesort should be sufficient. */
726 void
doReorder(int32_t * buf,uint8_t * bufcc,int32_t buflen)727 doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen)
728 {
729 	int32_t idx, jdx;
730 	for (idx = 0; idx < buflen - 1; idx++) {
731 		for (jdx = buflen - 1; jdx > idx; jdx--) {
732 			if (bufcc[jdx] < bufcc[jdx - 1]) {
733 				swapBufCharCCWithPrevious(jdx, buf, bufcc);
734 			}
735 		}
736 	}
737 }
738 /*  swap function for bubblesort */
739 static void
swapBufCharCCWithPrevious(int32_t jdx,int32_t buf[],uint8_t bufcc[])740 swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[])
741 {
742 	int32_t bufchar = buf[jdx];
743 	uint8_t bufccval = bufcc[jdx];
744 	buf[jdx] = buf[jdx - 1];
745 	bufcc[jdx] = bufcc[jdx - 1];
746 	buf[jdx - 1] = bufchar;
747 	bufcc[jdx - 1] = bufccval;
748 }
749 
750 /*
751  * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes,
752  * and returns the number of UTF8 bytes.
753  *
754  * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h).
755  */
756 int32_t
u32CharToUTF8Bytes(uint32_t u32char,uint8_t utf8Bytes[kMaxUTF8BytesPerChar])757 u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar])
758 {
759 	int32_t idx = 0;
760 	if (u32char <= 0x7F) {
761 		utf8Bytes[idx++] = (uint8_t)u32char;
762 	} else {
763 		if (u32char <= 0x7FF) {
764 			utf8Bytes[idx++] = (uint8_t)((u32char >> 6) | 0xC0);
765 		} else {
766 			if (u32char <= 0xFFFF) {
767 				utf8Bytes[idx++] = (uint8_t)((u32char >> 12) | 0xE0);
768 			} else {
769 				utf8Bytes[idx++] = (uint8_t)((u32char >> 18) | 0xF0);
770 				utf8Bytes[idx++] = (uint8_t)(((u32char >> 12) & 0x3F) | 0x80);
771 			}
772 			utf8Bytes[idx++] = (uint8_t)(((u32char >> 6) & 0x3F) | 0x80);
773 		}
774 		utf8Bytes[idx++] = (uint8_t)((u32char & 0x3F) | 0x80);
775 	}
776 	return idx;
777 }
778 
779 /* two macros adapted from ICU's utf8.h */
780 #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \
781 ((uint8_t)(leadByte)<0XF0 ? \
782 ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \
783 (uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0)
784 
785 #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
786 
787 /* array adapted from ICU's utf_impl.c */
788 static const int32_t utf8_minLegal[4] = { 0, 0X80, 0x800, 0x10000 };
789 
790 /*
791  * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point
792  *
793  * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c);
794  * verified to produce the same results (adusted for the difference in API signature).
795  *
796  * assumes at entry that:
797  * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character
798  *    has been read, and its value is in u32char
799  * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly
800  *    points to the trail bytes for that UTF8 char.
801  * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer)
802  *
803  * For a valid and complete UTF8 character, the function returns its value and advances
804  * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1
805  * (and the value in *srcPtr is undefined).
806  * Note that while it does not map to surrogate values (generates an error for malformed
807  * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values
808  * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error.
809  *
810  * equivalences used in adapted ICU code:
811  * UChar = uint16_t
812  * UChar32 = int32_t
813  *
814  * This has been validated against ICU behavior.
815  */
816 STATIC_UNLESS_TEST
817 int32_t
utf8ToU32Code(int32_t u32char,const char ** srcPtr,const char * srcLimit)818 utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit)
819 {
820 	const char* src = *srcPtr;
821 	uint8_t pt1, pt2;
822 	if (0xE0 < u32char && u32char <= 0xEC && src + 1 < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F && (pt2 = (uint8_t)(src[1] - 0x80)) <= 0x3F) {
823 		/* handle U+1000..U+CFFF */
824 		/* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */
825 		u32char = (uint16_t)((u32char << 12) | (pt1 << 6) | pt2);
826 		src += 2;
827 	} else if (u32char < 0xE0 && u32char >= 0xC2 && src < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F) {
828 		/* handle U+0080..U+07FF */
829 		u32char = ((u32char & 0x1F) << 6) | pt1;
830 		src++;
831 	} else {
832 		/* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */
833 		uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char);
834 		if (src + count <= srcLimit) {
835 			uint8_t trail;
836 
837 			U8_MASK_LEAD_BYTE_LOC(u32char, count);
838 			switch (count) {
839 			/* branches 3, 2 fall through to the next one */
840 			case 0:         /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */
841 			case 5:
842 			case 4:          /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
843 				break;
844 			case 3:
845 				trail = *src++ - (char)0X80;
846 				u32char = (u32char << 6) | trail;
847 				/* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */
848 				if (u32char >= 0x110 || trail > 0X3F) {
849 					break;
850 				}
851 			case 2:
852 				trail = *src++ - (char)0X80;
853 				u32char = (u32char << 6) | trail;
854 				/*
855 				 * test for a surrogate D800..DFFF:
856 				 * before the last (u32char<<6), a surrogate is u32char=360..37F
857 				 */
858 				if (((u32char & 0xFFE0) == 0x360) || trail > 0X3F) {
859 					break;
860 				}
861 			case 1:
862 				trail = *src++ - (char)0X80;
863 				u32char = (u32char << 6) | trail;
864 				if (trail > 0X3F) {
865 					break;
866 				}
867 				/* correct sequence - all trail bytes have (b7..b6)==(10) */
868 				if (u32char >= utf8_minLegal[count]) {
869 					*srcPtr = src;
870 					return u32char;
871 				}
872 				/* no default branch to optimize switch()  - all values are covered */
873 			}
874 		}
875 		u32char = -1;
876 	}
877 	*srcPtr = src;
878 	return u32char;
879 }
880 
881 /*
882  * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result
883  * and the combining classes for each resulting char, or indicate it is invalid.
884  *
885  * The normalized and case-folded result might be up to 4 UTF32 characters (current
886  * max, could change in the future).
887  *
888  * u32char - input UTF32 code point
889  * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
890  * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
891  *          to receive the normalize result.
892  * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
893  *          to receive the combining classes for the characters in u32NormFoldBuf. If
894  *          the first entry has non-zero combining class, the remaining entries do too.
895  *
896  * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
897  * is insufficient (though it is assumed to be at least 3), else the length of the
898  * normalized and case-folded result (currently in the range 1..4).
899  *
900  * This has been validated against ICU behavior.
901  *
902  * This function is highly dependent on the structure of the data trie; for details on
903  * that structure, see comments in normalizeCaseFoldData.h
904  */
905 STATIC_UNLESS_TEST
906 int32_t
normalizeOptCaseFoldU32Char(int32_t u32char,bool case_sens,int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],uint8_t combClass[kNFCSingleCharDecompMax])907 normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
908     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
909     uint8_t combClass[kNFCSingleCharDecompMax])
910 {
911 	combClass[0] = 0;
912 	/*  return hi-range PUA as self, except non-characters */
913 	if (u32char >= kU32HiPUAStart) {
914 		if ((u32char & 0xFFFE) == 0xFFFE) {
915 			return -1;
916 		}
917 		u32NormFoldBuf[0] = u32char;
918 		return 1;
919 	}
920 	/*  for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
921 	/*  0 - 0x323FF; everything in between in currently invalid. */
922 	int32_t u32charLookup = u32char;
923 	if (u32charLookup >= kU32LowRangeLimit) {
924 		u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
925 		if (u32charLookup < kU32LowRangeLimit || u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) {
926 			return -1; /* in the large range of currently-unassigned code points */
927 		}
928 	}
929 	/* Now we have u32charLookup either in 0..0x323FF representing u32char itself,
930 	 *  or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in
931 	 *  the trie that identifies unassigneds in this range, or maps others to
932 	 *  decomps or combining class or just self. */
933 	uint16_t trieValue;
934 	/*  TrieHi */
935 	trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
936 	if (trieValue == kInvalidCodeFlag) {
937 		return -1;
938 	}
939 	if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /*  return self; */
940 		u32NormFoldBuf[0] = u32char;
941 		combClass[0] = trieValue & kFlagValueMask;
942 		return 1;
943 	}
944 	if (trieValue == kHangulMask) {
945 		combClass[1] = combClass[2] = 0;
946 		return decomposeHangul(u32char, u32NormFoldBuf);
947 	}
948 	/*  TrieMid */
949 	trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
950 	if (trieValue == kInvalidCodeFlag) {
951 		return -1;
952 	}
953 	if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
954 		u32NormFoldBuf[0] = u32char;
955 		combClass[0] = trieValue & kFlagValueMask;
956 		return adjustCase(case_sens, 1, u32NormFoldBuf);
957 	}
958 	if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
959 		uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask];
960 		uint16_t testBit = (uint16_t)(1 << (u32charLookup & kNFTrieLoMask));
961 		if (testBit & invalidMask) {
962 			/* invalid */
963 			return -1;
964 		} else {
965 			/* treat like trieValue == 0 above */
966 			u32NormFoldBuf[0] = u32char;
967 			return adjustCase(case_sens, 1, u32NormFoldBuf);
968 		}
969 	}
970 	if (trieValue == kHangulMask) {
971 		combClass[1] = combClass[2] = 0;
972 		return decomposeHangul(u32char, u32NormFoldBuf);
973 	}
974 	/*  TrieLo */
975 	trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask];
976 	if (trieValue == kInvalidCodeFlag) {
977 		return -1;
978 	}
979 	if (trieValue == kHangulMask) {
980 		combClass[1] = combClass[2] = 0;
981 		return decomposeHangul(u32char, u32NormFoldBuf);
982 	}
983 	if (trieValue < kToU16Seq2Mask || trieValue > kSpecialsEnd) {
984 		if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
985 			u32NormFoldBuf[0] = u32char;
986 			combClass[0] = trieValue & kFlagValueMask;
987 		} else {
988 			u32NormFoldBuf[0] = trieValue;
989 		}
990 		return adjustCase(case_sens, 1, u32NormFoldBuf);
991 	}
992 	const uint16_t* u16SeqPtr = NULL;
993 	const int32_t*  u32SeqPtr = NULL;
994 	int32_t         uSeqLen = 0;
995 	switch (trieValue & kSpecialsMask) {
996 	case kToU16Seq2Mask:
997 		if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
998 			/* don't use the mapping, it is only for case folding */
999 			u32NormFoldBuf[0] = u32char;
1000 			/* already have combClass[0] = 0 */
1001 			return 1;
1002 		}
1003 		u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask];
1004 		uSeqLen = 2;
1005 		break;
1006 	case kToU16Seq3Mask:
1007 		if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
1008 			/* don't use the mapping, it is only for case folding */
1009 			u32NormFoldBuf[0] = u32char;
1010 			/* already have combClass[0] = 0 */
1011 			return 1;
1012 		}
1013 		u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask];
1014 		uSeqLen = 3;
1015 		break;
1016 	case kToU16SeqMiscMask:
1017 		u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask];
1018 		uSeqLen = *u16SeqPtr & kToSeqMiscLenMask;
1019 		combClass[0] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift);
1020 		break;
1021 	case kToU32CharMask:
1022 		if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
1023 			/* don't use the mapping, it is only for case folding */
1024 			u32NormFoldBuf[0] = u32char;
1025 			/* already have combClass[0] = 0 */
1026 			return 1;
1027 		}
1028 		u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask];
1029 		uSeqLen = 1;
1030 		break;
1031 	case kToU32SeqMiscMask:
1032 		u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask];
1033 		uSeqLen = *u32SeqPtr & kToSeqMiscLenMask;
1034 		combClass[0] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift);
1035 		break;
1036 	default:
1037 		return -1;
1038 	}
1039 	if (kNFCSingleCharDecompMax < uSeqLen) {
1040 		return 0;
1041 	}
1042 	int32_t idx;
1043 	for (idx = 0; idx < uSeqLen; idx++) {
1044 		u32NormFoldBuf[idx] = (u16SeqPtr)? *u16SeqPtr++: *u32SeqPtr++;
1045 		if (idx > 0) {
1046 			combClass[idx] = getCombClassU32Char(u32NormFoldBuf[idx]);
1047 		}
1048 	}
1049 	return adjustCase(case_sens, uSeqLen, u32NormFoldBuf);
1050 }
1051 
1052 /*
1053  * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding
1054  *
1055  * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
1056  * uSeqLen - length of the sequence specified in the u32NormFoldBuf
1057  * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1058  *          with normalized result.
1059  *
1060  * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
1061  * is insufficient (though it is assumed to be at least 3), else the length of the
1062  * normalized and case-folded result (currently in the range 1..4).
1063  *
1064  * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1065  */
1066 
1067 static int32_t
adjustCase(bool case_sens,int32_t uSeqLen,int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])1068 adjustCase(bool case_sens, int32_t uSeqLen,
1069     int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1070 {
1071 	if (!case_sens && uSeqLen > 0) {
1072 		if (u32NormFoldBuf[0] < kSimpleCaseFoldLimit) {
1073 			u32NormFoldBuf[0] = nfBasicCF[u32NormFoldBuf[0]];
1074 			/* There is one case in which this maps to a character with different combining
1075 			 * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the
1076 			 * first or only character in the sequence, we want to keep the original
1077 			 * combining class, so nothing special to do here.
1078 			 */
1079 		}
1080 		/* The following is the only case where we have a casefolding after the first
1081 		 * character in the sequence. Don't worry about combining class here. that gets
1082 		 * set later for characters after the first.
1083 		 */
1084 		if (uSeqLen > 1 && u32NormFoldBuf[uSeqLen - 1] == 0x0345) {
1085 			u32NormFoldBuf[uSeqLen - 1] = 0x03B9;
1086 		}
1087 	}
1088 	return uSeqLen;
1089 }
1090 
1091 /*
1092  * getCombClassU32Char, map a single character (in UTF32 form) to its combining class.
1093  *
1094  * u32char - input UTF32 code point. This is assumed to be a valid character that does
1095  * not have a decomposition.
1096  *
1097  * returns combining class of the character.
1098  *
1099  * This is only called for characters after the first is a decomposition expansion. In
1100  * this situation, if we encounter U+03B9 (combining class 0), it is only there as the
1101  * case-folding of U+0345 (combining class 240). In this case it is the combining class
1102  * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all.
1103  *
1104  * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1105  */
1106 static uint8_t
getCombClassU32Char(int32_t u32char)1107 getCombClassU32Char(int32_t u32char)
1108 {
1109 	if (u32char >= kU32HiPUAStart) {
1110 		return 0;
1111 	}
1112 	if (u32char == 0x03B9) {
1113 		return 240;
1114 	}
1115 	/*  for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
1116 	/*  0 - 0x323FF; everything in between in currently invalid. */
1117 	int32_t u32charLookup = u32char;
1118 	if (u32charLookup >= kU32LowRangeLimit) {
1119 		u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
1120 	}
1121 	/* Now we have u32charLookup either in 0..0x323FF representing u32char itself,
1122 	 *  or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in
1123 	 *  the trie that identifies unassigneds in this range, or maps others to
1124 	 *  decomps or combining class or just self. */
1125 	uint16_t trieValue;
1126 	/*  TrieHi */
1127 	trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
1128 	if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
1129 		return trieValue & kFlagValueMask;
1130 	}
1131 	/*  TrieMid */
1132 	trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
1133 	if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /*  return self; */
1134 		return trieValue & kFlagValueMask;
1135 	}
1136 	if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
1137 		return 0;
1138 	}
1139 	/*  TrieLo */
1140 	trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask];
1141 	return ((trieValue & kFlagTestMask) == kCombClassFlag)? (trieValue & kFlagValueMask): 0;
1142 }
1143 
1144 /*
1145  * decomposeHangul, map a single UTF32 code point for a composed Hangul
1146  * in the range AC00-D7A3, using algorithmic decomp
1147  *
1148  * The normalized result will be 2 or 3 UTF32 characters.
1149  *
1150  * u32char - input UTF32 code point
1151  * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1152  *          to receive the normalize result.
1153  *
1154  * returns the length of the normalized result (2..3).
1155  *
1156  * Adapted from ICU Hangul:decompose in normalizer2impl.h
1157  *
1158  */
1159 
1160 enum {
1161 	HANGUL_BASE=0xAC00,
1162 	JAMO_L_BASE=0x1100,     /* "lead" jamo */
1163 	JAMO_V_BASE=0x1161,     /* "vowel" jamo */
1164 	JAMO_T_BASE=0x11A7,     /* "trail" jamo */
1165 	JAMO_L_COUNT=19,
1166 	JAMO_V_COUNT=21,
1167 	JAMO_T_COUNT=28,
1168 };
1169 
1170 static int32_t
decomposeHangul(int32_t u32char,int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])1171 decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1172 {
1173 	u32char -= HANGUL_BASE;
1174 	int32_t tIndex = u32char % JAMO_T_COUNT;
1175 	u32char /= JAMO_T_COUNT;
1176 	u32NormFoldBuf[0] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT);
1177 	u32NormFoldBuf[1] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT);
1178 	if (tIndex == 0) {
1179 		return 2;
1180 	}
1181 	u32NormFoldBuf[2] = (uint16_t)(JAMO_T_BASE + tIndex);
1182 	return 3;
1183 }
1184