/*
 * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
 *
 * @APPLE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this
 * file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_LICENSE_HEADER_END@
 */

#ifndef unicode_h
#define unicode_h

#ifdef KERNEL_PRIVATE

#include <sys/cdefs.h>
#include <stdbool.h>
#include <stddef.h>     /* size_t, used by the prototypes below */
#include <stdint.h>     /* int32_t, used by the prototypes below */

/*
 * WARNING - callers that use the following Unicode normalization interface for on-disk
 * structures should be aware that the implementation will be periodically updated for
 * the latest Unicode standard version.
 */

enum {
	/* Maximum size of UTF32 reordering buffer for stream-safe format */
	kNCFStreamSafeBufMax = 32
};

/*
 * Give the functions below C linkage when this header is included from C++.
 * __BEGIN_DECLS / __END_DECLS come from <sys/cdefs.h> and expand to nothing in C.
 */
__BEGIN_DECLS

/*
 * utf8_normalizeOptCaseFoldAndHash
 *
 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
 * as specified by the case_sens parameter, and feed the result incrementally to
 * the provided hash function callback:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * hash_func: A pointer to a hashing function to compute the hash of the
 *            normalized/case-folded result. buf contains buf_len bytes
 *            of data to be added to the hash using the caller-supplied
 *            context (ctx).
 * hash_ctx:  The context for the hash function.
 *
 * Returns: 0 on success, or
 *          EILSEQ: The input string contains illegal ASCII-range characters
 *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                  contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 */
int utf8_normalizeOptCaseFoldAndHash(const char *str,
    size_t str_len,
    bool case_sens,
    void (*hash_func)(void *buf, size_t buf_len, void *ctx),
    void *hash_ctx);

/*
 * utf8_normalizeOptCaseFoldAndCompare
 *
 * Determine whether two UTF-8 strings are equal after converting each to one of the
 * following normalized forms, as specified by the case_sens parameter:
 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
 * - standard NFD; for case-sensitive comparison (if case_sens = true).
 * On success, sets are_equal to true if the strings are equal, or false if they are not.
 *
 * The input strings should be valid UTF-8 that meet the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * They should not contain ASCII 0x00 or '/'.
 *
 * strA:      A UTF-8 string to be compared (need not be 0 terminated)
 * strA_len:  The byte length of strA (excluding any 0 terminator)
 * strB:      The second UTF-8 string to be compared (need not be 0 terminated)
 * strB_len:  The byte length of strB (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
 *            True for case-sensitive behavior; compares standard NFD forms.
 * are_equal: On success, set to true if the strings are equal, or set to false
 *            if they are not.
 *
 * Returns: 0 on success, or
 *          EILSEQ: One or both of the input strings contains illegal ASCII-range
 *                  characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
 *                  or contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 *                  Note: The comparison may terminate early when a difference is
 *                        detected, and may return 0 and set *are_equal=false even
 *                        if one or both strings are invalid.
 */
int utf8_normalizeOptCaseFoldAndCompare(const char *strA,
    size_t strA_len,
    const char *strB,
    size_t strB_len,
    bool case_sens,
    bool *are_equal);

/*
 * utf8_normalizeOptCaseFold
 *
 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-32 string.
 * ustr_size: The capacity of ustr, in UTF-32 units.
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in UTF-32 units of the string copied to ustr.
 *
 * Returns: 0 on success, or
 *          EILSEQ: The input string contains illegal ASCII-range characters
 *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                  contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 *          ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                  case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFold(const char *str,
    size_t str_len,
    bool case_sens,
    int32_t *ustr,
    int32_t ustr_size,
    int32_t *ustr_len);

/*
 * utf8_normalizeOptCaseFoldToUTF8
 *
 * Convert a given UTF-8 string to UTF-8 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * str:       The input UTF-8 string (need not be 0 terminated)
 * str_len:   The byte length of the input string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-8 string.
 * ustr_size: The capacity of ustr, in bytes.
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in bytes of the string copied to ustr.
 *
 * Returns: 0 on success, or
 *          EILSEQ: The input string contains illegal ASCII-range characters
 *                  (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                  contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 *          ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                  case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFoldToUTF8(const char *str,
    size_t str_len,
    bool case_sens,
    char *ustr,
    size_t ustr_size,
    size_t *ustr_len);

/*
 * utf8_normalizeOptCaseFoldToUTF8ForPath
 *
 * Convert a given UTF-8 path string to UTF-8 in one of the following normalized forms,
 * as specified by the case_sens parameter, and copy the result to the ustr
 * buffer:
 * - "canonical caseless form" (case-folded NFD, as described by definition D145
 *   in chapter 3 of The Unicode Standard); for case-insensitive behavior.
 * - standard NFD; for case-sensitive behavior (if case_sens = true).
 *
 * The input string should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 *
 * str:       The input UTF-8 path string
 * str_len:   The byte length of the input path string (excluding any 0 terminator)
 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *            True for case-sensitive behavior; generates standard NFD.
 * ustr:      A pointer to a buffer for the resulting UTF-8 string.
 * ustr_size: The capacity of ustr, in bytes.
 * ustr_len:  Pointer to a value that will be filled in with the actual length
 *            in bytes of the string copied to ustr.
 *
 * Returns: 0 on success, or
 *          EILSEQ: The input string contains illegal ASCII-range characters
 *                  (0x00), or is not well-formed stream-safe UTF-8, or
 *                  contains codepoints that are non-characters or unassigned in
 *                  the version of Unicode currently supported.
 *          ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                  case the value returned in *ustr_len is invalid.
 */
int utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
    size_t str_len,
    bool case_sens,
    char *ustr,
    size_t ustr_size,
    size_t *ustr_len);

/*
 * utf8_normalizeOptCaseFoldAndMatchSubstring
 *
 * Determine whether the normalized UTF32 string derived from a specified UTF-8 string
 * strA contains another UTF32 string ustrB which has already been normalized, typically
 * with utf8_normalizeOptCaseFold. The normalization for both strings is one of the
 * following, as specified by the case_sens parameter:
 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
 * - standard NFD; for case-sensitive comparison (if case_sens = true).
 * On success, sets has_match to true if strA contains ustrB, or false otherwise.
 *
 * The input string strA should be valid UTF-8 that meets the criteria for stream safe
 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
 * It should not contain ASCII 0x00 or '/'.
 *
 * strA:      A UTF-8 string (need not be 0 terminated) in which to search for the
 *            substring specified by ustrB.
 * strA_len:  The byte length of strA (excluding any 0 terminator)
 * ustrB:     A normalized UTF-32 substring (need not be 0 terminated) to be searched
 *            for in the UTF-32 string resulting from converting strA to the normalized
 *            UTF-32 form specified by the case_sens parameter; ustrB must already be
 *            in that form. Normally this will be produced using
 *            utf8_normalizeOptCaseFold.
 * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
 * case_sens: False for case-insensitive matching; compares canonical caseless forms.
 *            True for case-sensitive matching; compares standard NFD forms.
 * buf:       Pointer to caller-supplied working memory for storing the portion of
 *            strA which has been converted to normalized UTF-32.
 * buf_size:  The size of buf.
 *            NOTE(review): the units are not stated in this header; presumably
 *            bytes, since buf is a void * - confirm against the implementation.
 * has_match: On success, set to true if strA (when converted to UTF-32 and normalized
 *            per case_sens) contains ustrB, set to false otherwise.
 *
 * Returns: 0 on success, or
 *          EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
 *                  not well-formed stream-safe UTF-8, or contains codepoints that are
 *                  non-characters or unassigned in the version of Unicode currently
 *                  supported.
 *                  Note: The search may terminate early when a match is detected, and
 *                        may return 0 and set *has_match=true even if strA is invalid.
 *          ENOMEM: buf_size is insufficient.
 */
int utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA,
    size_t strA_len,
    const int32_t *ustrB,
    int32_t ustrB_len,
    bool case_sens,
    void *buf,
    size_t buf_size,
    bool *has_match);

/*
 * utf8_normalizeOptCaseFoldGetUVersion
 *
 * Get the Unicode and code version currently associated with the
 * utf8_normalizeOptCaseFold* functions. The caller allocates the version array and
 * passes it to the function, which will fill out the array as follows:
 * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
 * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
 * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
 * version[3] = Code revision level; for any given Unicode version, this value starts
 *              at 0 and is incremented for each significant revision to the
 *              utf8_normalizeOptCaseFold* functions.
 */
void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);

__END_DECLS

#endif /* KERNEL_PRIVATE */

#endif /* unicode_h */