1/*
2 * Copyright (c) 2016-2023 Apple Inc. All rights reserved.
3 *
4 * @APPLE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. Please obtain a copy of the License at
10 * http://www.opensource.apple.com/apsl/ and read it before using this
11 * file.
12 *
13 * The Original Code and all software distributed under the License are
14 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
15 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
16 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
17 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
18 * Please see the License for the specific language governing rights and
19 * limitations under the License.
20 *
21 * @APPLE_LICENSE_HEADER_END@
22 */
23
24#ifndef unicode_h
25#define unicode_h
26
27#ifdef KERNEL_PRIVATE
28
29#include <sys/cdefs.h>
30#include <stdbool.h>
31
32/*
33 * WARNING - callers that use the following Unicode normalization interface for on-disk
34 * structures should be aware that the implementation will be periodically updated for
35 * the latest Unicode standard version.
36 */
37
38enum {
39 /* Maximum size of the UTF-32 reordering buffer for stream-safe format (presumably in UTF-32 units; confirm) */
40 kNCFStreamSafeBufMax = 32
41};
42
43/*
44 * utf8_normalizeOptCaseFoldAndHash
45 *
46 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
47 * as specified by the case_sens parameter, and feed the result incrementally to
48 * the provided hash function callback:
49 * - "canonical caseless form" (case-folded NFD, as described by definition D145
50 * in chapter 3 of The Unicode Standard); for case-insensitive behavior.
51 * - standard NFD; for case-sensitive behavior (if case_sens = true).
52 *
53 * The input string should be valid UTF-8 that meets the criteria for stream safe
54 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
55 * It should not contain ASCII 0x00 or '/'.
56 *
57 * str: The input UTF-8 string (need not be 0 terminated)
58 * str_len: The byte length of the input string (excluding any 0 terminator)
59 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
60 * True for case-sensitive behavior; generates standard NFD.
61 * hash_func: A pointer to a hashing function to compute the hash of the
62 * normalized/case-folded result. On each call, buf contains buf_len
63 * bytes of data to be added to the hash using the caller-supplied
64 * context (ctx).
65 * hash_ctx: The context for the hash function; presumably passed to hash_func as ctx (confirm).
66 *
67 * Returns: 0 on success, or
68 * EILSEQ: The input string contains illegal ASCII-range characters
69 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
70 * contains codepoints that are non-characters or unassigned in
71 * the version of Unicode currently supported.
72 */
73int utf8_normalizeOptCaseFoldAndHash(const char *str,
74 size_t str_len,
75 bool case_sens,
76 void (*hash_func)(void *buf, size_t buf_len, void *ctx),
77 void *hash_ctx);
78
79/*
80 * utf8_normalizeOptCaseFoldAndCompare
81 *
82 * Determine whether two UTF-8 strings are equal after converting each to one of the
83 * following normalized forms, as specified by the case_sens parameter:
84 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
85 * - standard NFD; for case-sensitive comparison (if case_sens = true).
86 * On success, sets *are_equal to true if the strings are equal, or false if they are not.
87 *
88 * The input strings should be valid UTF-8 that meet the criteria for stream safe
89 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
90 * They should not contain ASCII 0x00 or '/'.
91 *
92 * strA: The first UTF-8 string to be compared (need not be 0 terminated)
93 * strA_len: The byte length of strA (excluding any 0 terminator)
94 * strB: The second UTF-8 string to be compared (need not be 0 terminated)
95 * strB_len: The byte length of strB (excluding any 0 terminator)
96 * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
97 * True for case-sensitive behavior; compares standard NFD forms.
98 * are_equal: Output. On success, *are_equal is set to true if the strings are equal,
99 * or to false if they are not. Only meaningful when 0 is returned.
100 *
101 * Returns: 0 on success, or
102 * EILSEQ: One or both of the input strings contains illegal ASCII-range
103 * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
104 * or contains codepoints that are non-characters or unassigned in
105 * the version of Unicode currently supported.
106 * Note: The comparison may terminate early when a difference is
107 * detected, and may return 0 and set *are_equal=false even
108 * if one or both strings are invalid.
109 */
110int utf8_normalizeOptCaseFoldAndCompare(const char *strA,
111 size_t strA_len,
112 const char *strB,
113 size_t strB_len,
114 bool case_sens,
115 bool *are_equal);
116
117/*
118 * utf8_normalizeOptCaseFold
119 *
120 * Convert a given UTF-8 string to UTF-32 in one of the following normalized forms,
121 * as specified by the case_sens parameter, and copy the result to the ustr
122 * buffer:
123 * - "canonical caseless form" (case-folded NFD, as described by definition D145
124 * in chapter 3 of The Unicode Standard); for case-insensitive behavior.
125 * - standard NFD; for case-sensitive behavior (if case_sens = true).
126 *
127 * The input string should be valid UTF-8 that meets the criteria for stream safe
128 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
129 * It should not contain ASCII 0x00 or '/'.
130 *
131 * str: The input UTF-8 string (need not be 0 terminated)
132 * str_len: The byte length of the input string (excluding any 0 terminator)
133 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
134 * True for case-sensitive behavior; generates standard NFD.
135 * ustr: A pointer to a caller-supplied buffer for the resulting UTF-32 string.
136 * ustr_size: The capacity of ustr, in UTF-32 units (not bytes).
137 * ustr_len: Output. Pointer to a value that will be filled in with the actual length
138 * in UTF-32 units of the string copied to ustr.
139 *
140 * Returns: 0 on success, or
141 * EILSEQ: The input string contains illegal ASCII-range characters
142 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
143 * contains codepoints that are non-characters or unassigned in
144 * the version of Unicode currently supported.
145 * ENOMEM: ustr_size is insufficient for the resulting string. In this
146 * case the value returned in *ustr_len is invalid and must not be used.
147 */
148int utf8_normalizeOptCaseFold(const char *str,
149 size_t str_len,
150 bool case_sens,
151 int32_t *ustr,
152 int32_t ustr_size,
153 int32_t *ustr_len);
154
155/*
156 * utf8_normalizeOptCaseFoldToUTF8
157 *
158 * Convert a given UTF-8 string to UTF-8 in one of the following normalized forms,
159 * as specified by the case_sens parameter, and copy the result to the ustr
160 * buffer:
161 * - "canonical caseless form" (case-folded NFD, as described by definition D145
162 * in chapter 3 of The Unicode Standard); for case-insensitive behavior.
163 * - standard NFD; for case-sensitive behavior (if case_sens = true).
164 *
165 * The input string should be valid UTF-8 that meets the criteria for stream safe
166 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
167 * It should not contain ASCII 0x00 or '/'.
168 *
169 * str: The input UTF-8 string (need not be 0 terminated)
170 * str_len: The byte length of the input string (excluding any 0 terminator)
171 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
172 * True for case-sensitive behavior; generates standard NFD.
173 * ustr: A pointer to a caller-supplied buffer for the resulting UTF-8 string.
174 * ustr_size: The capacity of ustr, in bytes.
175 * ustr_len: Output. Pointer to a value that will be filled in with the actual length
176 * in bytes of the string copied to ustr (NOTE(review): 0 termination of ustr is not specified here; confirm).
177 *
178 * Returns: 0 on success, or
179 * EILSEQ: The input string contains illegal ASCII-range characters
180 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
181 * contains codepoints that are non-characters or unassigned in
182 * the version of Unicode currently supported.
183 * ENOMEM: ustr_size is insufficient for the resulting string. In this
184 * case the value returned in *ustr_len is invalid and must not be used.
185 */
186int utf8_normalizeOptCaseFoldToUTF8(const char *str,
187 size_t str_len,
188 bool case_sens,
189 char *ustr,
190 size_t ustr_size,
191 size_t *ustr_len);
192
193/*
194 * utf8_normalizeOptCaseFoldToUTF8ForPath
195 *
196 * Convert a given UTF-8 path string to UTF-8 in one of the following normalized forms,
197 * as specified by the case_sens parameter, and copy the result to the ustr
198 * buffer:
199 * - "canonical caseless form" (case-folded NFD, as described by definition D145
200 * in chapter 3 of The Unicode Standard); for case-insensitive behavior.
201 * - standard NFD; for case-sensitive behavior (if case_sens = true).
202 *
203 * The input string should be valid UTF-8 that meets the criteria for stream safe
204 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
205 *
206 * str: The input UTF-8 path string
207 * str_len: The byte length of the input path string (excluding any 0 terminator)
208 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
209 * True for case-sensitive behavior; generates standard NFD.
210 * ustr: A pointer to a caller-supplied buffer for the resulting UTF-8 string.
211 * ustr_size: The capacity of ustr, in bytes.
212 * ustr_len: Output. Pointer to a value that will be filled in with the actual length
213 * in bytes of the string copied to ustr.
214 *
215 * Returns: 0 on success, or
216 * EILSEQ: The input string contains illegal ASCII-range characters
217 * (0x00; '/' is not listed as illegal for path input), or is not well-formed stream-safe UTF-8, or
218 * contains codepoints that are non-characters or unassigned in
219 * the version of Unicode currently supported.
220 * ENOMEM: ustr_size is insufficient for the resulting string. In this
221 * case the value returned in *ustr_len is invalid and must not be used.
222 */
223int utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
224 size_t str_len,
225 bool case_sens,
226 char *ustr,
227 size_t ustr_size,
228 size_t *ustr_len);
229
230/*
231 * utf8_normalizeOptCaseFoldAndMatchSubstring
232 *
233 * Determine whether the normalized UTF-32 string derived from a specified UTF-8 string
234 * strA contains another UTF-32 string ustrB which has already been normalized, typically
235 * with utf8_normalizeOptCaseFold. The normalization for both strings is one of the following,
236 * as specified by the case_sens parameter:
237 * - "canonical caseless form" (case-folded NFD); for case-insensitive comparison.
238 * - standard NFD; for case-sensitive comparison (if case_sens = true).
239 * On success, sets *has_match to true if strA contains ustrB, or false otherwise.
240 *
241 * The input string strA should be valid UTF-8 that meets the criteria for stream safe
242 * text as described in http://unicode.org/reports/tr15/#Stream_Safe_Text_Format.
243 * It should not contain ASCII 0x00 or '/'.
244 *
245 * strA: A UTF-8 string (need not be 0 terminated) in which to search for the
246 * substring specified by ustrB.
247 * strA_len: The byte length of strA (excluding any 0 terminator)
248 * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched
249 * for in the UTF-32 string resulting from converting strA to the normalized
250 * UTF-32 form specified by the case_sens parameter; ustrB must already be
251 * in that form. Normally this will be produced using utf8_normalizeOptCaseFold.
252 * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
253 * case_sens: False for case-insensitive matching; compares canonical caseless forms.
254 * True for case-sensitive matching; compares standard NFD forms.
255 * buf: Pointer to caller-supplied working memory for storing the portion of
256 * strA which has been converted to normalized UTF-32.
257 * buf_size: The size of buf (presumably in bytes; confirm against the implementation).
258 * has_match: On success, set to true if strA (when converted to UTF-32 and normalized
259 * per case_sens) contains ustrB, set to false otherwise.
260 *
261 * Returns: 0 on success, or
262 * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
263 * not well-formed stream-safe UTF-8, or contains codepoints that are
264 * non-characters or unassigned in the version of Unicode currently
265 * supported.
266 * Note: The search may terminate early when a match is detected, and
267 * may return 0 and set *has_match=true even if strA is invalid.
268 * ENOMEM: buf_size is insufficient.
269 */
270int utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA,
271 size_t strA_len,
272 const int32_t *ustrB,
273 int32_t ustrB_len,
274 bool case_sens,
275 void *buf,
276 size_t buf_size,
277 bool *has_match);
278
279/*
280 * utf8_normalizeOptCaseFoldGetUVersion
281 *
282 * Get the Unicode and code version currently associated with the utf8_normalizeOptCaseFold*
283 * functions. The caller allocates the 4-element version array and passes it to the function,
284 * which will fill out the array as follows:
285 * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
286 * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
287 * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
288 * version[3] = Code revision level; for any given Unicode version, this value starts
289 * at 0 and is incremented for each significant revision to the
290 * utf8_normalizeOptCaseFold* functions.
291 */
292void utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]);
293
294#endif /* KERNEL_PRIVATE */
295
296#endif /* unicode_h */
297