xref: /xnu-11215/bsd/sys/utfconv.h (revision a5e72196)
1 /*
2  * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3  *
4  * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5  *
6  * This file contains Original Code and/or Modifications of Original Code
7  * as defined in and that are subject to the Apple Public Source License
8  * Version 2.0 (the 'License'). You may not use this file except in
9  * compliance with the License. The rights granted to you under the License
10  * may not be used to create, or enable the creation or redistribution of,
11  * unlawful or unlicensed copies of an Apple operating system, or to
12  * circumvent, violate, or enable the circumvention or violation of, any
13  * terms of an Apple operating system software license agreement.
14  *
15  * Please obtain a copy of the License at
16  * http://www.opensource.apple.com/apsl/ and read it before using this file.
17  *
18  * The Original Code and all software distributed under the License are
19  * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20  * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21  * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22  * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23  * Please see the License for the specific language governing rights and
24  * limitations under the License.
25  *
26  * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27  */
28 
29 #ifndef _SYS_UTFCONV_H_
30 #define _SYS_UTFCONV_H_
31 
32 #include <sys/appleapiopts.h>
33 #include <sys/cdefs.h>
34 
35 #ifdef KERNEL
36 #ifdef __APPLE_API_UNSTABLE
37 
38 /*
39  * UTF-8 encode/decode flags (one bit each; passed in the 'flags' argument)
40  */
41 #define UTF_REVERSE_ENDIAN   0x0001   /* reverse UCS-2 byte order */
42 #define UTF_NO_NULL_TERM     0x0002   /* do not add null termination */
43 #define UTF_DECOMPOSED       0x0004   /* generate fully decomposed UCS-2 */
44 #define UTF_PRECOMPOSED      0x0008   /* generate precomposed UCS-2 */
45 #define UTF_ESCAPE_ILLEGAL   0x0010   /* escape illegal UTF-8 */
46 #define UTF_SFM_CONVERSIONS  0x0020   /* Use SFM mappings for illegal NTFS chars */
47 /* Each expands to 0 if BYTE_ORDER already matches, else to UTF_REVERSE_ENDIAN. */
48 #define UTF_BIG_ENDIAN       \
49 	((BYTE_ORDER == BIG_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
50 
51 #define UTF_LITTLE_ENDIAN    \
52 	((BYTE_ORDER == LITTLE_ENDIAN) ? 0 : UTF_REVERSE_ENDIAN)
53 
54 __BEGIN_DECLS
55 
56 
57 /*
58  * unicode_combinable - Test for a combining Unicode character.
59  *
60  * This function is similar to __CFUniCharIsNonBaseCharacter except
61  * that it also includes Hangul Jamo characters.
62  */
63 
64 int unicode_combinable(u_int16_t character);
65 
66 /*
67  * unicode_decomposeable - Test for a precomposed character.
68  *
69  * Similar to __CFUniCharIsDecomposableCharacter.
70  */
71 
72 int unicode_decomposeable(u_int16_t character);
73 
74 
75 /*
76  * utf8_encodelen - Calculate the UTF-8 encoding length
77  *
78  * This function takes a Unicode input string, ucsp, of ucslen bytes
79  * and calculates the size of the UTF-8 output in bytes (not including
80  * a NULL termination byte). The string must reside in kernel memory.
81  *
82  * FLAGS
83  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
84  *
85  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
86  *
87  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
88  *
89  *    UTF_DECOMPOSED:  assume fully decomposed output
90  *
91  * ERRORS
92  *    None
93  */
94 size_t
95 utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash,
96     int flags);
97 
98 
99 /*
100  * utf8_encodestr - Encodes a Unicode string into UTF-8
101  *
102  * This function takes a Unicode input string, ucsp, of ucslen bytes
103  * and produces the UTF-8 output into a buffer of buflen bytes pointed
104  * to by utf8p. The size of the output in bytes (not including a NULL
105  * termination byte) is returned in utf8len. The output is NULL terminated
106  * unless UTF_NO_NULL_TERM is set. Both buffers must reside in kernel memory.
107  *
108  * If '/' chars are possible in the Unicode input then an alternate
109  * (replacement) char must be provided in altslash.
110  *
111  * FLAGS
112  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
113  *
114  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
115  *
116  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
117  *
118  *    UTF_NO_NULL_TERM:  do not add null termination to output string
119  *
120  *    UTF_DECOMPOSED:  generate fully decomposed output
121  *
122  * ERRORS
123  *    ENAMETOOLONG:  output did not fit; only utf8len bytes were encoded
124  *
125  *    EINVAL:  illegal Unicode char encountered
126  */
127 int
128 utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
129     size_t * utf8len, size_t buflen, u_int16_t altslash, int flags);
130 
131 
132 /*
133  * utf8_decodestr - Decodes a UTF-8 string into Unicode
134  *
135  * This function takes a UTF-8 input string, utf8p, of utf8len bytes
136  * and produces the Unicode output into a buffer of buflen bytes pointed
137  * to by ucsp. The size of the output in bytes (not including a NULL
138  * termination byte) is returned in ucslen. Both buffers must reside
139  * in kernel memory.
140  *
141  * If '/' chars are allowed in the Unicode output then an alternate
142  * (replacement) char must be provided in altslash.
143  *
144  * FLAGS
145  *    UTF_REVERSE_ENDIAN:  Unicode byte order is opposite current runtime
146  *
147  *    UTF_BIG_ENDIAN:  Unicode byte order is always big endian
148  *
149  *    UTF_LITTLE_ENDIAN:  Unicode byte order is always little endian
150  *
151  *    UTF_DECOMPOSED:  generate fully decomposed output (NFD)
152  *
153  *    UTF_PRECOMPOSED:  generate precomposed output (NFC)
154  *
155  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
156  *
157  * ERRORS
158  *    ENAMETOOLONG:  output did not fit; only ucslen bytes were decoded.
159  *
160  *    EINVAL:  illegal UTF-8 sequence encountered.
161  */
162 int
163 utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
164     size_t *ucslen, size_t buflen, u_int16_t altslash, int flags);
165 
166 
167 /*
168  * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
169  *
170  * This function takes a UTF-8 input string, instr, of inlen bytes
171  * and produces normalized UTF-8 output into a buffer of buflen bytes
172  * pointed to by outstr. The size of the output in bytes (not including
173  * a NULL termination byte) is returned in outlen. In-place conversions
174  * are not supported (i.e. instr != outstr).  Both buffers must reside
175  * in kernel memory.
176  *
177  * FLAGS
178  *    UTF_DECOMPOSED:  output string will be fully decomposed (NFD)
179  *
180  *    UTF_PRECOMPOSED:  output string will be precomposed (NFC)
181  *
182  *    UTF_NO_NULL_TERM:  do not add null termination to output string
183  *
184  *    UTF_ESCAPE_ILLEGAL:  percent escape any illegal UTF-8 input
185  *
186  * ERRORS
187  *    ENAMETOOLONG:  output did not fit or input exceeded MAXPATHLEN bytes
188  *
189  *    EINVAL:  illegal UTF-8 sequence encountered or invalid flags
190  */
191 int
192 utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
193     size_t *outlen, size_t buflen, int flags);
194 
195 
196 /*
197  * utf8_validatestr - validates a UTF-8 string
198  *
199  * This function takes a UTF-8 input string, utf8p, of utf8len bytes
200  * and determines if it is valid UTF-8.  The string must reside in kernel
201  * memory.
202  *
203  * ERRORS
204  *    EINVAL:  illegal UTF-8 sequence encountered.
205  */
206 int
207 utf8_validatestr(const u_int8_t* utf8p, size_t utf8len);
208 
209 
210 __END_DECLS
211 
212 #endif /* __APPLE_API_UNSTABLE */
213 #endif /* KERNEL */
214 
215 #endif /* !_SYS_UTFCONV_H_ */
216