vfs_unicode.c source code [xnu/bsd/vfs/vfs_unicode.c]

1	/*
2	* Copyright (C) 2016-2023 Apple, Inc. All rights reserved.
3	* Some portions covered by other copyrights, listed below.
4	*---
5	* Copyright (C) 2016 and later: Unicode, Inc. and others.
6	* License & terms of use: http://www.unicode.org/copyright.html
7	*---
8	* Copyright (C) 1999-2015, International Business Machines
9	* Corporation and others. All Rights Reserved.
10	*
11	* add APPLE_OSREFERENCE_LICENSE_HEADER stuff...
12	*/
13
14	#include <libkern/libkern.h>
15	#include <sys/errno.h>
16	#include <sys/unicode.h>
17	#include "vfs_unicode_data.h"
18	#define STATIC_UNLESS_TEST static
19
20	enum {
21	/ Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) /
22	kMaxUTF8BytesPerChar = `4`
23	};
24
25	/ local prototypes used by exported functions (and themselves exported for testing) /
26	STATIC_UNLESS_TEST
27	int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit);
28	STATIC_UNLESS_TEST
29	int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
30	int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
31	uint8_t combClass[kNFCSingleCharDecompMax]);
32	/ local prototypes used by exported functions (not exported for separate testing) /
33	static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes,
34	int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
35	int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
36	bool* needReorderP, bool* startP);
37	void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen);
38	int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]);
39
40	/*
41	* utf8_normalizeOptCaseFoldGetUVersion
42	*
43	* version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
44	* version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
45	* version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
46	* version[3] = Code revision level; for any given Unicode version, this value starts
47	* at 0 and is incremented for each significant revision to the
48	* normalizeOptCaseFold functions.
49	*/
50	void
51	utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[`4`])
52	{
53	version[`0`] = `15`;
54	version[`1`] = `1`;
55	version[`2`] = `0`;
56	version[`3`] = `0`;
57	return;
58	}
59
60	/*
61	* utf8_normalizeOptCaseFoldAndHash
62	*
63	* str: The input UTF-8 string (need not be 0 terminated)
64	* str_len: The byte length of the input string (excluding any 0 terminator)
65	* case_sens: False for case-insensitive behavior; generates canonical caseless form.
66	* True for case-sensitive behavior; generates standard NFD.
67	* hash_func: A pointer to a hashing function to compute the hash of the
68	* normalized/case-folded result. buf contains buf_len bytes
69	* of data to be added to the hash using the caller-supplied
70	* context (ctx).
71	* hash_ctx: The context for the hash function.
72	*
73	* Returns: 0 on success, or
74	* EILSEQ: The input string contains illegal ASCII-range characters
75	* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
76	* contains codepoints that are non-characters or unassigned in
77	* the version of Unicode currently supported (Unicode 9.0).
78	*/
79
80	int
81	utf8_normalizeOptCaseFoldAndHash(const char *str,
82	size_t str_len,
83	bool case_sens,
84	void (hash_func)(void* buf, size_t buf_len, void* *ctx),
85	void *hash_ctx)
86	{
87	const char *strLimit = str + str_len;
88
89	/ Data for the next pending single-char norm from input;*
90	* This will always begin with a base char (combining class 0)
91	* or the first character in the string, which may no be a base */
92	int32_t unorm[kNFCSingleCharDecompMax];
93	uint8_t unormcc[kNFCSingleCharDecompMax];
94	int32_t unormlen = `0`;
95	int32_t unormstart = `0`;
96
97	bool start = true;
98
99	/ main loop:*
100	* Each input character may be normalized to a sequence of one or more characters,
101	* some of which may have non-zero combining class. Any sequence of characters
102	* with non-zero combining class resulting from one or more input characters needs
103	* to be accumulated in the main buffer so we can reorder as necessary before
104	* calling the hash function.
105	*
106	* At the beginning of the main loop: The normalization buffer and main buffer are
107	* both empty.
108	*
109	* Each time through the main loop we do the following:
110	* 1. If there are characters available in the normalization result buffer (from the
111	* result of normalizing a previous input character), copy the first character and
112	* any following characters that have non-zero combining class to the main buffer.
113	* 2. If there is nothing left in the normalization buffer, then loop processing
114	* input characters as follows:
115	* a) Get the next input character from UTF8, get its normalized and case-folded
116	* result in the normalization buffer.
117	* b) If the first character in the normalization buffer has combining class 0,
118	* break; we will handle this normalization buffer next time through the main
119	* loop.
120	* c) Else copy the current normalization buffer (which has only combining marks)
121	* to the main buffer, and continue with the loop processing input characters.
122	* 3. At this point the first character in the main buffer may or may not have
123	* combining class 0, but any subsequent characters (up to the the limit for
124	* stream safe text) will be combining characters with nonzero combining class.
125	* Reorder the combining marks if necessary into canonical order.
126	* 4. Call the hash function for each character in the main buffer.
127	*
128	*/
129	do {
130	/ Data for the buffers being built up from input /
131	int32_t buf[kNCFStreamSafeBufMax];
132	uint8_t bufcc[kNCFStreamSafeBufMax];
133	int32_t buflen = `0`;
134	bool needReorder = false;
135	int err;
136
137	err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false / allow_slashes /,
138	unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start);
139	if (err != `0`) {
140	return err;
141	}
142
143	if (buflen > `0`) {
144	/ Now buffer should have all of the combining marks up to the next base char.*
145	* Normally it will also start with the last base char encountered (unless the
146	* UTF8 string began with a combining mark). */
147	/ Now reorder combining marks if necessary. /
148	if (needReorder) {
149	doReorder(buf, bufcc, buflen);
150	}
151	/ Now write to hash func /
152	hash_func(buf, buflen * sizeof(buf[`0`]), hash_ctx);
153	}
154	/ OK so far, top of loop clears buffers to start refilling again /
155	} while (str < strLimit \|\| unormlen > `0`);
156	return `0`;
157	}
158
159	/*
160	* utf8_normalizeOptCaseFoldAndCompare
161	*
162	* strA: A UTF-8 string to be compared (need not be 0 terminated)
163	* strA_len: The byte length of strA (excluding any 0 terminator)
164	* strB: The second UTF-8 string to be compared (need not be 0 terminated)
165	* strB_len: The byte length of strB (excluding any 0 terminator)
166	* case_sens: False for case-insensitive behavior; compares canonical caseless forms.
167	* True for case-sensitive behavior; compares standard NFD forms.
168	* are_equal: On success, set to true if the strings are equal, or set to false
169	* if they are not.
170	*
171	* Returns: 0 on success, or
172	* EILSEQ: One or both of the input strings contains illegal ASCII-range
173	* characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
174	* or contains codepoints that are non-characters or unassigned in
175	* the version of Unicode currently supported (Unicode 9.0).
176	* Note: The comparison may terminate early when a difference is
177	* detected, and may return 0 and set *are_equal=false even
178	* if one or both strings are invalid.
179	*/
180	enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + `4` }; / room for 03B9 pushback(s) /
181
182	int
183	utf8_normalizeOptCaseFoldAndCompare(const char *strA,
184	size_t strA_len,
185	const char *strB,
186	size_t strB_len,
187	bool case_sens,
188	bool *are_equal)
189	{
190	const char *strALimit = strA + strA_len;
191	const char *strBLimit = strB + strB_len;
192
193	/ Data for the next pending single-char norms from each input;*
194	* These will always begin with a base char (combining class 0)
195	* or the first character in the string, which may not be a base */
196	int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback];
197	uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback];
198	int32_t unormAlen = `0`, unormBlen = `0`;
199	int32_t unormAstart = `0`, unormBstart = `0`;
200
201	bool startA = true, startB = true;
202
203	/ main loop:*
204	* The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash,
205	* described above. The differences are:
206	* - We keep a normalization buffer and main buffer for each string.
207	* - In the main loop, we do steps 1-3 for each string.
208	* - In step 4, instead of calling the hash function, we compare the two main
209	* buffers; if they are unequal, we return a non-equal result.
210	* - After the end of the main loop, if we still have data for one string but
211	* not the other, return a non-equal result, else return an equal result.
212	*/
213	do {
214	/ Data for the buffers being built up from each input /
215	int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax];
216	uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax];
217	int32_t bufAlen = `0`, bufBlen = `0`;
218	bool needReorderA = false, needReorderB = false;
219	int err;
220
221	err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false / allow_slashes /,
222	unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA);
223	if (err != `0`) {
224	return err;
225	}
226	err = nextBaseAndAnyMarks(strP: &strB, strLimit: strBLimit, case_sens, false / allow_slashes /,
227	unorm: unormB, unormcc: unormBcc, unormlenP: &unormBlen, unormstartP: &unormBstart, buf: bufB, bufcc: bufBcc, buflenP: &bufBlen, needReorderP: &needReorderB, startP: &startB);
228	if (err != `0`) {
229	return err;
230	}
231
232	if (bufAlen > `0` \|\| bufBlen > `0`) {
233	/ Now each buffer should have all of the combining marks up to the next base char.*
234	* Normally it will also start with the last base char encountered (unless the
235	* UTF8 string began with a combining mark). */
236	/ Now reorder combining marks if necessary. /
237	if (needReorderA) {
238	doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen);
239	}
240	if (needReorderB) {
241	doReorder(buf: bufB, bufcc: bufBcc, buflen: bufBlen);
242	}
243	/ handle 03B9 pushback /
244	int32_t idx;
245	if (!case_sens) {
246	if (bufAlen > `1` && bufA[bufAlen - `1`] == `0x03B9` && unormAstart == `0`) {
247	int32_t tailCount = `0`;
248	while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > `1` && bufA[bufAlen - `1`] == `0x03B9`) {
249	tailCount++;
250	bufAlen--;
251	}
252	for (idx = unormAlen; idx > `0`; idx--) {
253	unormA[idx - `1` + tailCount] = unormA[idx - `1`];
254	unormAcc[idx - `1` + tailCount] = unormAcc[idx - `1`];
255	}
256	for (idx = `0`; idx < tailCount; idx++) {
257	unormA[idx] = `0x03B9`;
258	unormAcc[idx] = `0`;
259	}
260	unormAlen += tailCount;
261	}
262	if (bufBlen > `1` && bufB[bufBlen - `1`] == `0x03B9` && unormBstart == `0`) {
263	int32_t tailCount = `0`;
264	while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > `1` && bufB[bufBlen - `1`] == `0x03B9`) {
265	tailCount++;
266	bufBlen--;
267	}
268	for (idx = unormBlen; idx > `0`; idx--) {
269	unormB[idx - `1` + tailCount] = unormB[idx - `1`];
270	unormBcc[idx - `1` + tailCount] = unormBcc[idx - `1`];
271	}
272	for (idx = `0`; idx < tailCount; idx++) {
273	unormB[idx] = `0x03B9`;
274	unormBcc[idx] = `0`;
275	}
276	unormBlen += tailCount;
277	}
278	}
279	/ Now compare the buffers. /
280	if (bufAlen != bufBlen \|\| memcmp(s1: bufA, s2: bufB, n: bufAlen * sizeof(bufA[`0`])) != `0`) {
281	*are_equal = false;
282	return `0`;
283	}
284	}
285	/ OK so far, top of loop clears buffers to start refilling again /
286	} while ((strA < strALimit \|\| unormAlen > `0`) && (strB < strBLimit \|\| unormBlen > `0`));
287
288	*are_equal = (strA == strALimit && unormAlen == `0` && strB == strBLimit && unormBlen == `0`);
289	return `0`;
290	}
291
292	/*
293	* utf8_normalizeOptCaseFold
294	*
295	* str: The input UTF-8 string (need not be 0 terminated)
296	* str_len: The byte length of the input string (excluding any 0 terminator)
297	* case_sens: False for case-insensitive behavior; generates canonical caseless form.
298	* True for case-sensitive behavior; generates standard NFD.
299	* ustr: A pointer to a buffer for the resulting UTF-32 string.
300	* ustr_size: The capacity of ustr, in UTF-32 units.
301	* ustr_len: Pointer to a value that will be filled in with the actual length
302	* in UTF-32 units of the string copied to ustr.
303	*
304	* Returns: 0 on success, or
305	* EILSEQ: The input string contains illegal ASCII-range characters
306	* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
307	* contains codepoints that are non-characters or unassigned in
308	* the version of Unicode currently supported.
309	* ENOMEM: ustr_size is insufficient for the resulting string. In this
310	* case the value returned in *ustr_len is invalid.
311	*/
312	int
313	utf8_normalizeOptCaseFold(const char *str,
314	size_t str_len,
315	bool case_sens,
316	int32_t *ustr,
317	int32_t ustr_size,
318	int32_t *ustr_len)
319	{
320	const char *strLimit = str + str_len;
321	int32_t *ustrCur = ustr;
322	const int32_t *ustrLimit = ustr + ustr_size;
323
324	/ Data for the next pending single-char norm from input;*
325	* This will always begin with a base char (combining class 0) */
326	int32_t unorm[kNFCSingleCharDecompMax];
327	uint8_t unormcc[kNFCSingleCharDecompMax];
328	int32_t unormlen = `0`;
329	int32_t unormstart = `0`;
330
331	bool start = true;
332
333	*ustr_len = `0`;
334	do {
335	/ Data for the buffers being built up from input /
336	int32_t buf[kNCFStreamSafeBufMax];
337	uint8_t bufcc[kNCFStreamSafeBufMax];
338	int32_t buflen = `0`;
339	bool needReorder = false;
340	int err;
341
342	err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false / allow_slashes /,
343	unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start);
344	if (err != `0`) {
345	return err;
346	}
347
348	if (buflen > `0`) {
349	if (needReorder) {
350	doReorder(buf, bufcc, buflen);
351	}
352	/ Now copy to output buffer /
353	int32_t idx;
354	if (ustrCur + buflen > ustrLimit) {
355	return ENOMEM;
356	}
357	for (idx = `0`; idx < buflen; idx++) {
358	*ustrCur++ = buf[idx];
359	}
360	}
361	/ OK so far, top of loop clears buffers to start refilling again /
362	} while (str < strLimit \|\| unormlen > `0`);
363	ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop*
364	return `0`;
365	}
366
367	static int
368	utf8_normalizeOptCaseFoldToUTF8_internal(const char *str,
369	size_t str_len,
370	bool case_sens,
371	bool allow_slashes,
372	char *ustr,
373	size_t ustr_size,
374	size_t *ustr_len)
375	{
376	const char *strLimit = str + str_len;
377	char *ustrCur = ustr;
378	const char *ustrLimit = ustr + ustr_size;
379
380	/ Data for the next pending single-char norm from input;*
381	* This will always begin with a base char (combining class 0) */
382	int32_t unorm[kNFCSingleCharDecompMax];
383	uint8_t unormcc[kNFCSingleCharDecompMax];
384	int32_t unormlen = `0`;
385	int32_t unormstart = `0`;
386
387	bool start = true;
388
389	*ustr_len = `0`;
390	do {
391	/ Data for the buffers being built up from input /
392	int32_t buf[kNCFStreamSafeBufMax];
393	uint8_t bufcc[kNCFStreamSafeBufMax];
394	int32_t buflen = `0`;
395	bool needReorder = false;
396	int err;
397
398	err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, allow_slashes,
399	unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start);
400	if (err != `0`) {
401	return err;
402	}
403
404	if (buflen > `0`) {
405	uint8_t utf8Bytes[kMaxUTF8BytesPerChar];
406	int32_t *bufPtr = buf;
407	if (needReorder) {
408	doReorder(buf, bufcc, buflen);
409	}
410	/ Now copy to output buffer /
411	while (buflen-- > `0`) {
412	int32_t idx, utf8Len = u32CharToUTF8Bytes(u32char: (uint32_t)*bufPtr++, utf8Bytes);
413	if (ustrCur + utf8Len > ustrLimit) {
414	return ENOMEM;
415	}
416	for (idx = `0`; idx < utf8Len; idx++) {
417	ustrCur++ = (char*)utf8Bytes[idx];
418	}
419	}
420	}
421	/ OK so far, top of loop clears buffers to start refilling again /
422	} while (str < strLimit \|\| unormlen > `0`);
423	*ustr_len = ustrCur - ustr;
424	return `0`;
425	}
426
427	/*
428	* utf8_normalizeOptCaseFoldToUTF8
429	* (This is similar to normalizeOptCaseFold except that this has a different output
430	* buffer type, and adds conversion to UTF8 while copying to output buffer)
431	*
432	* str: The input UTF-8 string (need not be 0 terminated)
433	* str_len: The byte length of the input string (excluding any 0 terminator)
434	* case_sens: False for case-insensitive behavior; generates canonical caseless form.
435	* True for case-sensitive behavior; generates standard NFD.
436	* ustr: A pointer to a buffer for the resulting UTF-8 string.
437	* ustr_size: The capacity of ustr, in bytes.
438	* ustr_len: Pointer to a value that will be filled in with the actual length
439	* in bytes of the string copied to ustr.
440	*
441	* Returns: 0 on success, or
442	* EILSEQ: The input string contains illegal ASCII-range characters
443	* (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
444	* contains codepoints that are non-characters or unassigned in
445	* the version of Unicode currently supported.
446	* ENOMEM: ustr_size is insufficient for the resulting string. In this
447	* case the value returned in *ustr_len is invalid.
448	*/
449	int
450	utf8_normalizeOptCaseFoldToUTF8(const char *str,
451	size_t str_len,
452	bool case_sens,
453	char *ustr,
454	size_t ustr_size,
455	size_t *ustr_len)
456	{
457	return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, false / allow_slashes /,
458	ustr, ustr_size, ustr_len);
459	}
460
461	/*
462	* utf8_normalizeOptCaseFoldToUTF8ForPath
463	* (This is similar to normalizeOptCaseFoldToUTF8 except that this allows '/' character.)
464	*
465	* str: The input UTF-8 path string
466	* str_len: The byte length of the input path string (excluding any 0 terminator)
467	* case_sens: False for case-insensitive behavior; generates canonical caseless form.
468	* True for case-sensitive behavior; generates standard NFD.
469	* ustr: A pointer to a buffer for the resulting UTF-8 string.
470	* ustr_size: The capacity of ustr, in bytes.
471	* ustr_len: Pointer to a value that will be filled in with the actual length
472	* in bytes of the string copied to ustr.
473	*
474	* Returns: 0 on success, or
475	* EILSEQ: The input string contains illegal ASCII-range characters
476	* (0x00), or is not well-formed stream-safe UTF-8, or
477	* contains codepoints that are non-characters or unassigned in
478	* the version of Unicode currently supported.
479	* ENOMEM: ustr_size is insufficient for the resulting string. In this
480	* case the value returned in *ustr_len is invalid.
481	*/
482	int
483	utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
484	size_t str_len,
485	bool case_sens,
486	char *ustr,
487	size_t ustr_size,
488	size_t *ustr_len)
489	{
490	return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, true / allow_slashes /,
491	ustr, ustr_size, ustr_len);
492	}
493
494	/*
495	* utf8_normalizeOptCaseFoldAndMatchSubstring
496	*
497	* strA: A UTF-8 string (need not be 0 terminated) in which to search for the
498	* substring specified by ustrB.
499	* strA_len: The byte length of strA (excluding any 0 terminator)
500	* ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched
501	* for in the UTF-32 string resulting from converting strA to the normalized
502	* UTF-32 form specified by the case_sens parameter; ustrB must already be
503	* in that form.
504	* ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
505	* case_sens: False for case-insensitive matching; compares canonical caseless forms.
506	* True for case-sensitive matching; compares standard NFD forms.
507	* buf: Pointer to caller-supplied working memory for storing the portion of
508	* strA which has been converted to normalized UTF-32.
509	* buf_size: The size of buf.
510	* has_match: On success, set to true if strA (when converter to UTF-32 and normalized
511	* per case_sens) contains ustrB, set to false otherwise.
512	*
513	* Returns: 0 on success, or
514	* EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
515	* not well-formed stream-safe UTF-8, or contains codepoints that are
516	* non-characters or unassigned in the version of Unicode currently
517	* supported.
518	* Note: The search may terminate early when a match is detected, and
519	* may return 0 and set *has_match=true even if strA is invalid.
520	* ENOMEM: buf_size is insufficient.
521	*/
522	int
523	utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA,
524	size_t strA_len,
525	const int32_t *ustrB,
526	int32_t ustrB_len,
527	bool case_sens,
528	void *buf,
529	size_t buf_size,
530	bool *has_match)
531	{
532	/*
533	* ustrA represents the current position in the UTF-32 normalized version of strA
534	* at which we want to test for a match; ustrANormEnd is the position beyond that
535	* which is just after the end of what has already been converted from strA to
536	* UTF-32 normalized form.
537	* Each time through the main loop:
538	* - The first task is to make sure we have enough of strA converted to UTF32
539	* normalized form to test for match with ustrB at the current match position.
540	* If we don't, then convert more of strA to UTF-32 normalized form until we
541	* have enough to compare with ustrB. To do this, run a loop which is like the
542	* main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of
543	* calling the hash function, we copy the normalized buffer to ustrANormEnd,
544	* advancing the latter. We keep doing this until we have enough additional
545	* converted to match with ustrB.
546	* - Then we test for match of ustrB at the current ustrA position. If there is
547	* a match we return; otherwise, if there is more strA to convert we advance
548	* ustrA and repeat the main loop, otherwise we return without a match.
549	*/
550	if (ustrB_len == `0`) { / always matches /
551	*has_match = true;
552	return `0`;
553	}
554	has_match = false; /* initialize return value /
555	if (ustrB_len > `2` * strA_len) {
556	/ If ustrB is clearly too long to find in strA, don't bother normalizing strA.*
557	* A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit.
558	* A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units.
559	* The maximum expansion from unnormalized UTF-8 byte length to normalized
560	* UTF-32 unit length is thus 2. */
561	return `0`;
562	}
563
564	const char *strALimit = strA + strA_len;
565	int32_t ustrA = (int32_t )buf;
566	const int32_t ustrALimit = ustrA + (buf_size / sizeof*(int32_t));
567	int32_t ustrANormEnd = ustrA; /* how far we have already normalized in ustrA /
568
569	/ Data for the next pending single-char norms from each input;*
570	* These will always begin with a base char (combining class 0)
571	* or the first character in the string, which may not be a base */
572	int32_t unormA[kNFCSingleCharDecompMax];
573	uint8_t unormAcc[kNFCSingleCharDecompMax];
574	int32_t unormAlen = `0`;
575	int32_t unormAstart = `0`;
576
577	bool startA = true;
578
579	while (true) {
580	/ convert enough more of strA to normalized UTF-32 in ustrA to check for match /
581	if (ustrANormEnd - ustrA < ustrB_len) {
582	do {
583	/ Data for the buffers being built up from each input /
584	int32_t bufA[kNCFStreamSafeBufMax];
585	uint8_t bufAcc[kNCFStreamSafeBufMax];
586	int32_t bufAlen = `0`;
587	bool needReorderA = false;
588	int err;
589
590	err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false / allow_slashes /,
591	unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA);
592	if (err != `0`) {
593	return err;
594	}
595
596	if (bufAlen > `0`) {
597	/ Now each buffer should have all of the combining marks up to the next base char.*
598	* Normally it will also start with the last base char encountered (unless the
599	* UTF8 string began with a combining mark). */
600	/ Now reorder combining marks if necessary. Should be rare, and sequences should*
601	* usually be short when does occur => simple bubblesort should be sufficient. */
602	if (needReorderA) {
603	doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen);
604	}
605	/ Now copy to working buffer /
606	int32_t idx;
607	if (ustrANormEnd + bufAlen > ustrALimit) {
608	return ENOMEM;
609	}
610	for (idx = `0`; idx < bufAlen; idx++) {
611	*ustrANormEnd++ = bufA[idx];
612	}
613	}
614	/ OK so far, top of loop clears buffers to start refilling again /
615	} while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit \|\| unormAlen > `0`));
616	}
617
618	if (ustrANormEnd - ustrA < ustrB_len) {
619	return `0`; / not enough of strA left for match /
620	}
621	/ check for match, return if so /
622	if (memcmp(s1: ustrA, s2: ustrB, n: ustrB_len * sizeof(ustrB[`0`])) == `0`) {
623	*has_match = true;
624	return `0`;
625	}
626	ustrA++; / advance match position /
627	}
628	}
629
630	/ nextBaseAndAnyMarks:*
631	* Guts of code to get next bufferful of base character (or first char in string)
632	* and all trailing combining marks.
633	* This is called each time through the main loop of functions above, and does the
634	* following:
635	* 1. If there are characters available in the normalization result buffer (from the
636	* result of normalizing a previous input character), copy the first character and
637	* any following characters that have non-zero combining class to the main buffer.
638	* 2. If there is nothing left in the normalization buffer, then loop processing
639	* input characters as follows:
640	* a) Get the next input character from UTF8, get its normalized and case-folded
641	* result in the normalization buffer.
642	* b) If the first character in the normalization buffer has combining class 0,
643	* break; we will handle this normalization buffer next time through the main
644	* loop.
645	* c) Else copy the current normalization buffer (which has only combining marks)
646	* to the main buffer, and continue with the loop processing input characters.
647	*/
648
649	static int
650	nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes,
651	int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
652	int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
653	bool* needReorderP, bool* startP)
654	{
655	/ update buffers for str /
656	if (unormlenP > `0` && unormstartP < *unormlenP) {
657	/ unorm begins with a base char; buflen should be 0 /
658	*needReorderP = false;
659	for (*buflenP = `0`; true;) {
660	if (buflenP > `0` && unormcc[unormstartP] > `0` && unormcc[unormstartP] < bufcc[(buflenP) - `1`]) {
661	*needReorderP = true;
662	}
663	buf[buflenP] = unorm[unormstartP];
664	bufcc[(buflenP)++] = unormcc[(unormstartP)++];
665	if (unormstartP >= unormlenP \|\| unormcc[*unormstartP] == `0`) {
666	break;
667	}
668	}
669	}
670	if (unormstartP >= unormlenP) {
671	unormstartP = unormlenP = `0`;
672	while (*strP < strLimit) {
673	int32_t idx;
674	uint32_t bytevalue = (uint8_t)(strP)++;
675	/ '/' is not produced by NFD decomposition from another character so we can*
676	* check for it before normalization */
677	if (bytevalue == `0` \|\| (bytevalue == `0x2F` /'/'/ && !allow_slashes)) {
678	return EILSEQ;
679	}
680	if (bytevalue < `0x80`) {
681	unorm[`0`] = (!case_sens && bytevalue >= `'A'` && bytevalue <= `'Z'`)? bytevalue += `0x20`: bytevalue;
682	*unormlenP = `1`;
683	unormcc[`0`] = `0`;
684	*startP = false;
685	break;
686	} else {
687	int32_t u32char = utf8ToU32Code(u32char: bytevalue, srcPtr: strP, srcLimit: strLimit);
688	if (u32char <= `0`) {
689	return EILSEQ;
690	}
691	*unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, u32NormFoldBuf: unorm, combClass: unormcc);
692	if (*unormlenP <= `0`) {
693	return EILSEQ;
694	}
695	if (unormcc[`0`] == `0` \|\| *startP) {
696	*startP = false;
697	break;
698	}
699	}
700	/ the latest char decomposes to just combining sequence, add to buffer being built /
701	if (buflenP + unormlenP > kNCFStreamSafeBufMax) {
702	return EILSEQ;
703	}
704	for (idx = `0`; idx < unormlenP; idx++, (buflenP)++) {
705	if (buflenP > `0` && unormcc[idx] > `0` && unormcc[idx] < bufcc[(buflenP) - `1`]) {
706	*needReorderP = true;
707	}
708	buf[*buflenP] = unorm[idx];
709	bufcc[*buflenP] = unormcc[idx];
710	}
711	*unormlenP = `0`;
712	}
713	}
714	return `0`;
715	}
716
717	/ local prototypes used only by internal functions /
718	static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]);
719	static int32_t adjustCase(bool case_sens, int32_t uSeqLen,
720	int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
721	static uint8_t getCombClassU32Char(int32_t u32char);
722	static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
723
724	/ Reorder combining marks if necessary. Should be rare, and sequences should*
725	* usually be short when does occur => simple bubblesort should be sufficient. */
726	void
727	doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen)
728	{
729	int32_t idx, jdx;
730	for (idx = `0`; idx < buflen - `1`; idx++) {
731	for (jdx = buflen - `1`; jdx > idx; jdx--) {
732	if (bufcc[jdx] < bufcc[jdx - `1`]) {
733	swapBufCharCCWithPrevious(jdx, buf, bufcc);
734	}
735	}
736	}
737	}
738	/ swap function for bubblesort /
739	static void
740	swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[])
741	{
742	int32_t bufchar = buf[jdx];
743	uint8_t bufccval = bufcc[jdx];
744	buf[jdx] = buf[jdx - `1`];
745	bufcc[jdx] = bufcc[jdx - `1`];
746	buf[jdx - `1`] = bufchar;
747	bufcc[jdx - `1`] = bufccval;
748	}
749
750	/*
751	* u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes,
752	* and returns the number of UTF8 bytes.
753	*
754	* adapted from ICU macro U8_APPEND_UNSAFE (utf8.h).
755	*/
756	int32_t
757	u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar])
758	{
759	int32_t idx = `0`;
760	if (u32char <= `0x7F`) {
761	utf8Bytes[idx++] = (uint8_t)u32char;
762	} else {
763	if (u32char <= `0x7FF`) {
764	utf8Bytes[idx++] = (uint8_t)((u32char >> `6`) \| `0xC0`);
765	} else {
766	if (u32char <= `0xFFFF`) {
767	utf8Bytes[idx++] = (uint8_t)((u32char >> `12`) \| `0xE0`);
768	} else {
769	utf8Bytes[idx++] = (uint8_t)((u32char >> `18`) \| `0xF0`);
770	utf8Bytes[idx++] = (uint8_t)(((u32char >> `12`) & `0x3F`) \| `0x80`);
771	}
772	utf8Bytes[idx++] = (uint8_t)(((u32char >> `6`) & `0x3F`) \| `0x80`);
773	}
774	utf8Bytes[idx++] = (uint8_t)((u32char & `0x3F`) \| `0x80`);
775	}
776	return idx;
777	}
778
779	/ two macros adapted from ICU's utf8.h /
780	#define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \
781	((uint8_t)(leadByte)<0XF0 ? \
782	((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \
783	(uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0)
784
785	#define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)
786
787	/ array adapted from ICU's utf_impl.c /
788	static const int32_t utf8_minLegal[`4`] = { `0`, `0X80`, `0x800`, `0x10000` };
789
790	/*
791	* utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point
792	*
793	* adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c);
794	* verified to produce the same results (adusted for the difference in API signature).
795	*
796	* assumes at entry that:
797	* 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character
798	* has been read, and its value is in u32char
799	* 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly
800	* points to the trail bytes for that UTF8 char.
801	* 3. srcLimit points to end of the input buffer (just after the last byte in the buffer)
802	*
803	* For a valid and complete UTF8 character, the function returns its value and advances
804	* *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1
805	* (and the value in *srcPtr is undefined).
806	* Note that while it does not map to surrogate values (generates an error for malformed
807	* UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values
808	* whose low 16 bits are 0xFFFE or 0xFFFF without generating an error.
809	*
810	* equivalences used in adapted ICU code:
811	* UChar = uint16_t
812	* UChar32 = int32_t
813	*
814	* This has been validated against ICU behavior.
815	*/
816	STATIC_UNLESS_TEST
817	int32_t
818	utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit)
819	{
820	const char* src = *srcPtr;
821	uint8_t pt1, pt2;
822	if (`0xE0` < u32char && u32char <= `0xEC` && src + `1` < srcLimit && (pt1 = (uint8_t)(src[`0`] - `0x80`)) <= `0x3F` && (pt2 = (uint8_t)(src[`1`] - `0x80`)) <= `0x3F`) {
823	/ handle U+1000..U+CFFF /
824	/ no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) /
825	u32char = (uint16_t)((u32char << `12`) \| (pt1 << `6`) \| pt2);
826	src += `2`;
827	} else if (u32char < `0xE0` && u32char >= `0xC2` && src < srcLimit && (pt1 = (uint8_t)(src[`0`] - `0x80`)) <= `0x3F`) {
828	/ handle U+0080..U+07FF /
829	u32char = ((u32char & `0x1F`) << `6`) \| pt1;
830	src++;
831	} else {
832	/ "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() /
833	uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char);
834	if (src + count <= srcLimit) {
835	uint8_t trail;
836
837	U8_MASK_LEAD_BYTE_LOC(u32char, count);
838	switch (count) {
839	/ branches 3, 2 fall through to the next one /
840	case `0`: / count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF /
841	case `5`:
842	case `4`: / count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 /
843	break;
844	case `3`:
845	trail = src++ - (char*)`0X80`;
846	u32char = (u32char << `6`) \| trail;
847	/ u32char>=0x110 would result in code point>0x10FFFF, outside Unicode /
848	if (u32char >= `0x110` \|\| trail > `0X3F`) {
849	break;
850	}
851	case `2`:
852	trail = src++ - (char*)`0X80`;
853	u32char = (u32char << `6`) \| trail;
854	/*
855	* test for a surrogate D800..DFFF:
856	* before the last (u32char<<6), a surrogate is u32char=360..37F
857	*/
858	if (((u32char & `0xFFE0`) == `0x360`) \|\| trail > `0X3F`) {
859	break;
860	}
861	case `1`:
862	trail = src++ - (char*)`0X80`;
863	u32char = (u32char << `6`) \| trail;
864	if (trail > `0X3F`) {
865	break;
866	}
867	/ correct sequence - all trail bytes have (b7..b6)==(10) /
868	if (u32char >= utf8_minLegal[count]) {
869	*srcPtr = src;
870	return u32char;
871	}
872	/ no default branch to optimize switch() - all values are covered /
873	}
874	}
875	u32char = -`1`;
876	}
877	*srcPtr = src;
878	return u32char;
879	}
880
881	/*
882	* normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result
883	* and the combining classes for each resulting char, or indicate it is invalid.
884	*
885	* The normalized and case-folded result might be up to 4 UTF32 characters (current
886	* max, could change in the future).
887	*
888	* u32char - input UTF32 code point
889	* case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
890	* u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
891	* to receive the normalize result.
892	* combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
893	* to receive the combining classes for the characters in u32NormFoldBuf. If
894	* the first entry has non-zero combining class, the remaining entries do too.
895	*
896	* returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
897	* is insufficient (though it is assumed to be at least 3), else the length of the
898	* normalized and case-folded result (currently in the range 1..4).
899	*
900	* This has been validated against ICU behavior.
901	*
902	* This function is highly dependent on the structure of the data trie; for details on
903	* that structure, see comments in normalizeCaseFoldData.h
904	*/
905	STATIC_UNLESS_TEST
906	int32_t
907	normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
908	int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
909	uint8_t combClass[kNFCSingleCharDecompMax])
910	{
911	combClass[`0`] = `0`;
912	/ return hi-range PUA as self, except non-characters /
913	if (u32char >= kU32HiPUAStart) {
914	if ((u32char & `0xFFFE`) == `0xFFFE`) {
915	return -`1`;
916	}
917	u32NormFoldBuf[`0`] = u32char;
918	return `1`;
919	}
920	/ for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range /
921	/ 0 - 0x323FF; everything in between in currently invalid. /
922	int32_t u32charLookup = u32char;
923	if (u32charLookup >= kU32LowRangeLimit) {
924	u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
925	if (u32charLookup < kU32LowRangeLimit \|\| u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) {
926	return -`1`; / in the large range of currently-unassigned code points /
927	}
928	}
929	/ Now we have u32charLookup either in 0..0x323FF representing u32char itself,*
930	* or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in
931	* the trie that identifies unassigneds in this range, or maps others to
932	* decomps or combining class or just self. */
933	uint16_t trieValue;
934	/ TrieHi /
935	trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
936	if (trieValue == kInvalidCodeFlag) {
937	return -`1`;
938	}
939	if (trieValue == `0` \|\| (trieValue & kFlagTestMask) == kCombClassFlag) { / return self; /
940	u32NormFoldBuf[`0`] = u32char;
941	combClass[`0`] = trieValue & kFlagValueMask;
942	return `1`;
943	}
944	if (trieValue == kHangulMask) {
945	combClass[`1`] = combClass[`2`] = `0`;
946	return decomposeHangul(u32char, u32NormFoldBuf);
947	}
948	/ TrieMid /
949	trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
950	if (trieValue == kInvalidCodeFlag) {
951	return -`1`;
952	}
953	if (trieValue == `0` \|\| (trieValue & kFlagTestMask) == kCombClassFlag) {
954	u32NormFoldBuf[`0`] = u32char;
955	combClass[`0`] = trieValue & kFlagValueMask;
956	return adjustCase(case_sens, uSeqLen: `1`, u32NormFoldBuf);
957	}
958	if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
959	uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask];
960	uint16_t testBit = (uint16_t)(`1` << (u32charLookup & kNFTrieLoMask));
961	if (testBit & invalidMask) {
962	/ invalid /
963	return -`1`;
964	} else {
965	/ treat like trieValue == 0 above /
966	u32NormFoldBuf[`0`] = u32char;
967	return adjustCase(case_sens, uSeqLen: `1`, u32NormFoldBuf);
968	}
969	}
970	if (trieValue == kHangulMask) {
971	combClass[`1`] = combClass[`2`] = `0`;
972	return decomposeHangul(u32char, u32NormFoldBuf);
973	}
974	/ TrieLo /
975	trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask];
976	if (trieValue == kInvalidCodeFlag) {
977	return -`1`;
978	}
979	if (trieValue == kHangulMask) {
980	combClass[`1`] = combClass[`2`] = `0`;
981	return decomposeHangul(u32char, u32NormFoldBuf);
982	}
983	if (trieValue < kToU16Seq2Mask \|\| trieValue > kSpecialsEnd) {
984	if (trieValue == `0` \|\| (trieValue & kFlagTestMask) == kCombClassFlag) {
985	u32NormFoldBuf[`0`] = u32char;
986	combClass[`0`] = trieValue & kFlagValueMask;
987	} else {
988	u32NormFoldBuf[`0`] = trieValue;
989	}
990	return adjustCase(case_sens, uSeqLen: `1`, u32NormFoldBuf);
991	}
992	const uint16_t* u16SeqPtr = NULL;
993	const int32_t* u32SeqPtr = NULL;
994	int32_t uSeqLen = `0`;
995	switch (trieValue & kSpecialsMask) {
996	case kToU16Seq2Mask:
997	if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
998	/ don't use the mapping, it is only for case folding /
999	u32NormFoldBuf[`0`] = u32char;
1000	/ already have combClass[0] = 0 /
1001	return `1`;
1002	}
1003	u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask];
1004	uSeqLen = `2`;
1005	break;
1006	case kToU16Seq3Mask:
1007	if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
1008	/ don't use the mapping, it is only for case folding /
1009	u32NormFoldBuf[`0`] = u32char;
1010	/ already have combClass[0] = 0 /
1011	return `1`;
1012	}
1013	u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask];
1014	uSeqLen = `3`;
1015	break;
1016	case kToU16SeqMiscMask:
1017	u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask];
1018	uSeqLen = *u16SeqPtr & kToSeqMiscLenMask;
1019	combClass[`0`] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift);
1020	break;
1021	case kToU32CharMask:
1022	if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
1023	/ don't use the mapping, it is only for case folding /
1024	u32NormFoldBuf[`0`] = u32char;
1025	/ already have combClass[0] = 0 /
1026	return `1`;
1027	}
1028	u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask];
1029	uSeqLen = `1`;
1030	break;
1031	case kToU32SeqMiscMask:
1032	u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask];
1033	uSeqLen = *u32SeqPtr & kToSeqMiscLenMask;
1034	combClass[`0`] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift);
1035	break;
1036	default:
1037	return -`1`;
1038	}
1039	if (kNFCSingleCharDecompMax < uSeqLen) {
1040	return `0`;
1041	}
1042	int32_t idx;
1043	for (idx = `0`; idx < uSeqLen; idx++) {
1044	u32NormFoldBuf[idx] = (u16SeqPtr)? u16SeqPtr++: u32SeqPtr++;
1045	if (idx > `0`) {
1046	combClass[idx] = getCombClassU32Char(u32char: u32NormFoldBuf[idx]);
1047	}
1048	}
1049	return adjustCase(case_sens, uSeqLen, u32NormFoldBuf);
1050	}
1051
1052	/*
1053	* adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding
1054	*
1055	* case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
1056	* uSeqLen - length of the sequence specified in the u32NormFoldBuf
1057	* u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1058	* with normalized result.
1059	*
1060	* returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
1061	* is insufficient (though it is assumed to be at least 3), else the length of the
1062	* normalized and case-folded result (currently in the range 1..4).
1063	*
1064	* This function is a reduced version of normalizeOptCaseFoldU32Char above.
1065	*/
1066
1067	static int32_t
1068	adjustCase(bool case_sens, int32_t uSeqLen,
1069	int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1070	{
1071	if (!case_sens && uSeqLen > `0`) {
1072	if (u32NormFoldBuf[`0`] < kSimpleCaseFoldLimit) {
1073	u32NormFoldBuf[`0`] = nfBasicCF[u32NormFoldBuf[`0`]];
1074	/ There is one case in which this maps to a character with different combining*
1075	* class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the
1076	* first or only character in the sequence, we want to keep the original
1077	* combining class, so nothing special to do here.
1078	*/
1079	}
1080	/ The following is the only case where we have a casefolding after the first*
1081	* character in the sequence. Don't worry about combining class here. that gets
1082	* set later for characters after the first.
1083	*/
1084	if (uSeqLen > `1` && u32NormFoldBuf[uSeqLen - `1`] == `0x0345`) {
1085	u32NormFoldBuf[uSeqLen - `1`] = `0x03B9`;
1086	}
1087	}
1088	return uSeqLen;
1089	}
1090
1091	/*
1092	* getCombClassU32Char, map a single character (in UTF32 form) to its combining class.
1093	*
1094	* u32char - input UTF32 code point. This is assumed to be a valid character that does
1095	* not have a decomposition.
1096	*
1097	* returns combining class of the character.
1098	*
1099	* This is only called for characters after the first is a decomposition expansion. In
1100	* this situation, if we encounter U+03B9 (combining class 0), it is only there as the
1101	* case-folding of U+0345 (combining class 240). In this case it is the combining class
1102	* for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all.
1103	*
1104	* This function is a reduced version of normalizeOptCaseFoldU32Char above.
1105	*/
1106	static uint8_t
1107	getCombClassU32Char(int32_t u32char)
1108	{
1109	if (u32char >= kU32HiPUAStart) {
1110	return `0`;
1111	}
1112	if (u32char == `0x03B9`) {
1113	return `240`;
1114	}
1115	/ for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range /
1116	/ 0 - 0x323FF; everything in between in currently invalid. /
1117	int32_t u32charLookup = u32char;
1118	if (u32charLookup >= kU32LowRangeLimit) {
1119	u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
1120	}
1121	/ Now we have u32charLookup either in 0..0x323FF representing u32char itself,*
1122	* or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in
1123	* the trie that identifies unassigneds in this range, or maps others to
1124	* decomps or combining class or just self. */
1125	uint16_t trieValue;
1126	/ TrieHi /
1127	trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
1128	if (trieValue == `0` \|\| (trieValue & kFlagTestMask) == kCombClassFlag) {
1129	return trieValue & kFlagValueMask;
1130	}
1131	/ TrieMid /
1132	trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
1133	if (trieValue == `0` \|\| (trieValue & kFlagTestMask) == kCombClassFlag) { / return self; /
1134	return trieValue & kFlagValueMask;
1135	}
1136	if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
1137	return `0`;
1138	}
1139	/ TrieLo /
1140	trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask];
1141	return ((trieValue & kFlagTestMask) == kCombClassFlag)? (trieValue & kFlagValueMask): `0`;
1142	}
1143
1144	/*
1145	* decomposeHangul, map a single UTF32 code point for a composed Hangul
1146	* in the range AC00-D7A3, using algorithmic decomp
1147	*
1148	* The normalized result will be 2 or 3 UTF32 characters.
1149	*
1150	* u32char - input UTF32 code point
1151	* u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1152	* to receive the normalize result.
1153	*
1154	* returns the length of the normalized result (2..3).
1155	*
1156	* Adapted from ICU Hangul:decompose in normalizer2impl.h
1157	*
1158	*/
1159
1160	enum {
1161	HANGUL_BASE=`0xAC00`,
1162	JAMO_L_BASE=`0x1100`, / "lead" jamo /
1163	JAMO_V_BASE=`0x1161`, / "vowel" jamo /
1164	JAMO_T_BASE=`0x11A7`, / "trail" jamo /
1165	JAMO_L_COUNT=`19`,
1166	JAMO_V_COUNT=`21`,
1167	JAMO_T_COUNT=`28`,
1168	};
1169
1170	static int32_t
1171	decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1172	{
1173	u32char -= HANGUL_BASE;
1174	int32_t tIndex = u32char % JAMO_T_COUNT;
1175	u32char /= JAMO_T_COUNT;
1176	u32NormFoldBuf[`0`] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT);
1177	u32NormFoldBuf[`1`] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT);
1178	if (tIndex == `0`) {
1179	return `2`;
1180	}
1181	u32NormFoldBuf[`2`] = (uint16_t)(JAMO_T_BASE + tIndex);
1182	return `3`;
1183	}
1184

Browse the source code of xnu/bsd/vfs/vfs_unicode.c