xref: /xnu-11215/bsd/sys/unicode.h (revision aca3beaa)
1 /*
2  * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3  *
4  * @APPLE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. Please obtain a copy of the License at
10  * http://www.opensource.apple.com/apsl/ and read it before using this
11  * file.
12  *
13  * The Original Code and all software distributed under the License are
14  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18  * Please see the License for the specific language governing rights and
19  * limitations under the License.
20  *
21  * @APPLE_LICENSE_HEADER_END@
22  */
23 
24 #ifndef unicode_h
25 #define unicode_h
26 
27 #ifdef KERNEL_PRIVATE
28 
29 #include <sys/cdefs.h>
30 #include <stdbool.h>
31 
32 /*
33  * WARNING - callers that use the following Unicode normalization interface for on-disk
34  * structures should be aware that the implementation will be periodically updated for
35  * the latest Unicode standard version.
36  */
37 
38 enum {
39 	/* Maximum size of the UTF-32 reordering buffer for stream-safe format (presumably in UTF-32 units; TODO confirm) */
40 	kNCFStreamSafeBufMax = 32
41 };
42 
43 /*
44  * utf8_normalizeOptCaseFoldAndHash
45  *
46  * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
47  * as specified by the case_sens parameter, and feed the result incrementally to
48  * the provided hash function callback:
49  * - "canonical caseless form" (case-folded NFD, as described by definition D145
50  *    in chapter 3 of The Unicode Standard); for case-insensitive behavior (if case_sens = false).
51  * - standard NFD; for case-sensitive behavior (if case_sens = true).
52  *
53  * The input string should be valid UTF-8 that meets the criteria for stream safe
54  * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
55  * It should not contain ASCII 0x00 or '/'.
56  *
57  * str:       The input UTF-8 string (need not be 0 terminated)
58  * str_len:   The byte length of the input string (excluding any 0 terminator)
59  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
60  *            True for case-sensitive behavior; generates standard NFD.
61  * hash_func: A pointer to a hashing function used to compute the hash of the
62  *            normalized/case-folded result. Because the result is fed
63  *            incrementally, it may be called more than once; on each call buf
64  *            contains buf_len bytes of data to be added to the hash using ctx.
65  * hash_ctx:  The caller-supplied context for the hash function (its ctx argument).
66  *
67  * Returns: 0 on success, or
68  *          EILSEQ: The input string contains illegal ASCII-range characters
69  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
70  *                  contains codepoints that are non-characters or unassigned in
71  *                  the version of Unicode currently supported.
72  */
73 int utf8_normalizeOptCaseFoldAndHash(const char *str,
74     size_t      str_len,
75     bool        case_sens,
76     void      (*hash_func)(void *buf, size_t buf_len, void *ctx),
77     void       *hash_ctx);
78 
79 /*
80  * utf8_normalizeOptCaseFoldAndCompare
81  *
82  * Determine whether two UTF-8 strings are equal after converting each to one of the
83  * following normalized forms, as specified by the case_sens parameter:
84  * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison (if case_sens = false).
85  * - standard NFD; for case-sensitive comparison (if case_sens = true).
86  * On success, sets *are_equal to true if the strings are equal, or false if they are not.
87  *
88  * The input strings should be valid UTF-8 that meet the criteria for stream safe
89  * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
90  * They should not contain ASCII 0x00 or '/'.
91  *
92  * strA:      The first UTF-8 string to be compared (need not be 0 terminated)
93  * strA_len:  The byte length of strA (excluding any 0 terminator)
94  * strB:      The second UTF-8 string to be compared (need not be 0 terminated)
95  * strB_len:  The byte length of strB (excluding any 0 terminator)
96  * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
97  *            True for case-sensitive behavior; compares standard NFD forms.
98  * are_equal: On success, set to true if the strings are equal, or set to false
99  *            if they are not.
100  *
101  * Returns: 0 on success, or
102  *          EILSEQ: One or both of the input strings contains illegal ASCII-range
103  *                  characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
104  *                  or contains codepoints that are non-characters or unassigned in
105  *                  the version of Unicode currently supported.
106  *                  Note: The comparison may terminate early when a difference is
107  *                        detected; in that case it may return 0 and set
108  *                        *are_equal=false even if one or both strings are invalid.
109  */
110 int utf8_normalizeOptCaseFoldAndCompare(const char *strA,
111     size_t      strA_len,
112     const char *strB,
113     size_t      strB_len,
114     bool        case_sens,
115     bool       *are_equal);
116 
117 /*
118  * utf8_normalizeOptCaseFold
119  *
120  * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
121  * as specified by the case_sens parameter, and copy the result to the ustr
122  * buffer:
123  * - "canonical caseless form" (case-folded NFD, as described by definition D145
124  *    in chapter 3 of The Unicode Standard); for case-insensitive behavior (if case_sens = false).
125  * - standard NFD; for case-sensitive behavior (if case_sens = true).
126  *
127  * The input string should be valid UTF-8 that meets the criteria for stream safe
128  * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
129  * It should not contain ASCII 0x00 or '/'.
130  *
131  * str:       The input UTF-8 string (need not be 0 terminated)
132  * str_len:   The byte length of the input string (excluding any 0 terminator)
133  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
134  *            True for case-sensitive behavior; generates standard NFD.
135  * ustr:      A pointer to a buffer that receives the resulting UTF-32 string.
136  * ustr_size: The capacity of ustr, in UTF-32 units.
137  * ustr_len:  Pointer to a value that will be filled in with the actual length,
138  *            in UTF-32 units, of the string copied to ustr.
139  *
140  * Returns: 0 on success, or
141  *          EILSEQ: The input string contains illegal ASCII-range characters
142  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
143  *                  contains codepoints that are non-characters or unassigned in
144  *                  the version of Unicode currently supported.
145  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
146  *                  case the value returned in *ustr_len is invalid.
147  */
148 int utf8_normalizeOptCaseFold(const char *str,
149     size_t      str_len,
150     bool        case_sens,
151     int32_t    *ustr,
152     int32_t     ustr_size,
153     int32_t    *ustr_len);
154 
155 /*
156  * utf8_normalizeOptCaseFoldToUTF8
157  *
158  * Convert a given UTF-8 string to UTF-8 in one of the following normalized forms,
159  * as specified by the case_sens parameter, and copy the result to the ustr
160  * buffer:
161  * - "canonical caseless form" (case-folded NFD, as described by definition D145
162  *    in chapter 3 of The Unicode Standard); for case-insensitive behavior (if case_sens = false).
163  * - standard NFD; for case-sensitive behavior (if case_sens = true).
164  *
165  * The input string should be valid UTF-8 that meets the criteria for stream safe
166  * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
167  * It should not contain ASCII 0x00 or '/'.
168  *
169  * str:       The input UTF-8 string (need not be 0 terminated)
170  * str_len:   The byte length of the input string (excluding any 0 terminator)
171  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
172  *            True for case-sensitive behavior; generates standard NFD.
173  * ustr:      A pointer to a buffer that receives the resulting UTF-8 string.
174  * ustr_size: The capacity of ustr, in bytes.
175  * ustr_len:  Pointer to a value that will be filled in with the actual length,
176  *            in bytes, of the string copied to ustr.
177  *
178  * Returns: 0 on success, or
179  *          EILSEQ: The input string contains illegal ASCII-range characters
180  *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
181  *                  contains codepoints that are non-characters or unassigned in
182  *                  the version of Unicode currently supported.
183  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
184  *                  case the value returned in *ustr_len is invalid.
185  */
186 int utf8_normalizeOptCaseFoldToUTF8(const char *str,
187     size_t      str_len,
188     bool        case_sens,
189     char       *ustr,
190     size_t      ustr_size,
191     size_t     *ustr_len);
192 
193 /*
194  * utf8_normalizeOptCaseFoldToUTF8ForPath
195  *
196  * Convert a given UTF-8 path string to UTF-8 in one of the following normalized forms,
197  * as specified by the case_sens parameter, and copy the result to the ustr buffer:
198  * - "canonical caseless form" (case-folded NFD, as described by definition D145
199  *    in chapter 3 of The Unicode Standard); for case-insensitive behavior (if case_sens = false).
200  * - standard NFD; for case-sensitive behavior (if case_sens = true).
201  *
202  * The input string should be valid UTF-8 that meets the criteria for stream safe
203  * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
204  * It should not contain ASCII 0x00; unlike utf8_normalizeOptCaseFoldToUTF8, '/' is
205  * not listed as an illegal character for this function.
206  * str:       The input UTF-8 path string
207  * str_len:   The byte length of the input path string (excluding any 0 terminator)
208  * case_sens: False for case-insensitive behavior; generates canonical caseless form.
209  *            True for case-sensitive behavior; generates standard NFD.
210  * ustr:      A pointer to a buffer that receives the resulting UTF-8 string.
211  * ustr_size: The capacity of ustr, in bytes.
212  * ustr_len:  Pointer to a value that will be filled in with the actual length,
213  *            in bytes, of the string copied to ustr.
214  *
215  * Returns: 0 on success, or
216  *          EILSEQ: The input string contains illegal ASCII-range characters
217  *                  (0x00), or is not well-formed stream-safe UTF-8, or
218  *                  contains codepoints that are non-characters or unassigned in
219  *                  the version of Unicode currently supported.
220  *          ENOMEM: ustr_size is insufficient for the resulting string. In this
221  *                  case the value returned in *ustr_len is invalid.
222  */
223 int utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
224     size_t      str_len,
225     bool        case_sens,
226     char       *ustr,
227     size_t      ustr_size,
228     size_t     *ustr_len);
229 
230 /*
231  * utf8_normalizeOptCaseFoldAndMatchSubstring
232  *
233  * Determine whether the normalized UTF32 string derived from a specified UTF-8 string
234  * strA contains another UTF32 string ustrB which has already been normalized, typically
235  * with utf8_normalizeOptCaseFold. The normalization for both strings is one of the
236  * following, as specified by the case_sens parameter:
237  * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison (if case_sens = false).
238  * - standard NFD; for case-sensitive comparison (if case_sens = true).
239  * On success, sets *has_match to true if strA contains ustrB, or false otherwise.
240  *
241  * The input string strA should be valid UTF-8 that meets the criteria for stream safe
242  * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
243  * It should not contain ASCII 0x00 or '/'.
244  *
245  * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
246  *            substring specified by ustrB.
247  * strA_len:  The byte length of strA (excluding any 0 terminator)
248  * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched
249  *            for in the UTF-32 string resulting from converting strA to the normalized
250  *            UTF-32 form specified by the case_sens parameter; ustrB must already be
251  *            in that form. Normally this will be produced using utf8_normalizeOptCaseFold.
252  * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
253  * case_sens: False for case-insensitive matching; compares canonical caseless forms.
254  *            True for case-sensitive matching; compares standard NFD forms.
255  * buf:       Pointer to caller-supplied working memory for storing the portion of
256  *            strA which has been converted to normalized UTF-32.
257  * buf_size:  The size of buf (units not stated here; presumably bytes - TODO confirm).
258  * has_match: On success, set to true if strA (when converted to UTF-32 and normalized
259  *            per case_sens) contains ustrB, set to false otherwise.
260  *
261  * Returns: 0 on success, or
262  *          EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
263  *                  not well-formed stream-safe UTF-8, or contains codepoints that are
264  *                  non-characters or unassigned in the version of Unicode currently
265  *                  supported.
266  *                  Note: The search may terminate early when a match is detected, and
267  *                        may return 0 and set *has_match=true even if strA is invalid.
268  *          ENOMEM: buf_size is insufficient.
269  */
270 int utf8_normalizeOptCaseFoldAndMatchSubstring(const char    *strA,
271     size_t         strA_len,
272     const int32_t *ustrB,
273     int32_t        ustrB_len,
274     bool           case_sens,
275     void          *buf,
276     size_t         buf_size,
277     bool          *has_match);
278 
279 /*
280  * utf8_normalizeOptCaseFoldGetUVersion
281  *
282  * Get the Unicode and code version currently associated with the
283  * utf8_normalizeOptCaseFold* functions. The caller allocates the 4-element version
284  * array and passes it to the function, which will fill out the array as follows:
285  * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
286  * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
287  * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
288  * version[3] = Code revision level; for any given Unicode version, this value starts
289  *              at 0 and is incremented for each significant revision to the
290  *              utf8_normalizeOptCaseFold* functions.
291  */
292 void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);
293 
294 #endif /* KERNEL_PRIVATE */
295 
296 #endif  /* unicode_h */
297