/*
 * Copyright (C) 2016-2023 Apple, Inc. All rights reserved.
 * Some portions covered by other copyrights, listed below.
 *---
 * Copyright (C) 2016 and later: Unicode, Inc. and others.
 * License & terms of use: http://www.unicode.org/copyright.html
 *---
 * Copyright (C) 1999-2015, International Business Machines
 * Corporation and others. All Rights Reserved.
 *
 */
14#include <libkern/libkern.h>
15#include <sys/errno.h>
16#include <sys/unicode.h>
17#include "vfs_unicode_data.h"
#define STATIC_UNLESS_TEST static

enum {
	/* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */
	kMaxUTF8BytesPerChar = 4
};
25/* local prototypes used by exported functions (and themselves exported for testing) */
27int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit);
29int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
30 int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
31 uint8_t combClass[kNFCSingleCharDecompMax]);
32/* local prototypes used by exported functions (not exported for separate testing) */
33static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes,
34 int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
35 int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
36 bool* needReorderP, bool* startP);
37void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen);
38int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]);
/*
 * utf8_normalizeOptCaseFoldGetUVersion
 *
 * Fills in the Unicode version and code revision implemented by the
 * normalizeOptCaseFold functions in this file.
 *
 *  version[0] = Unicode major version; for Unicode 6.3.0 this would be 6
 *  version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3
 *  version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0
 *  version[3] = Code revision level; for any given Unicode version, this value starts
 *               at 0 and is incremented for each significant revision to the
 *               normalizeOptCaseFold functions.
 */
void
utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4])
{
	/* Currently Unicode 15.1.0, code revision 0 */
	version[0] = 15;
	version[1] = 1;
	version[2] = 0;
	version[3] = 0;
}
61 * utf8_normalizeOptCaseFoldAndHash
62 *
63 * str: The input UTF-8 string (need not be 0 terminated)
64 * str_len: The byte length of the input string (excluding any 0 terminator)
65 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
66 * True for case-sensitive behavior; generates standard NFD.
67 * hash_func: A pointer to a hashing function to compute the hash of the
68 * normalized/case-folded result. buf contains buf_len bytes
69 * of data to be added to the hash using the caller-supplied
70 * context (ctx).
71 * hash_ctx: The context for the hash function.
72 *
73 * Returns: 0 on success, or
74 * EILSEQ: The input string contains illegal ASCII-range characters
75 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
76 * contains codepoints that are non-characters or unassigned in
77 * the version of Unicode currently supported (Unicode 9.0).
78 */
81utf8_normalizeOptCaseFoldAndHash(const char *str,
82 size_t str_len,
83 bool case_sens,
84 void (*hash_func)(void *buf, size_t buf_len, void *ctx),
85 void *hash_ctx)
87 const char *strLimit = str + str_len;
89 /* Data for the next pending single-char norm from input;
90 * This will always begin with a base char (combining class 0)
91 * or the first character in the string, which may no be a base */
92 int32_t unorm[kNFCSingleCharDecompMax];
93 uint8_t unormcc[kNFCSingleCharDecompMax];
94 int32_t unormlen = 0;
95 int32_t unormstart = 0;
97 bool start = true;
99 /* main loop:
100 * Each input character may be normalized to a sequence of one or more characters,
101 * some of which may have non-zero combining class. Any sequence of characters
102 * with non-zero combining class resulting from one or more input characters needs
103 * to be accumulated in the main buffer so we can reorder as necessary before
104 * calling the hash function.
105 *
106 * At the beginning of the main loop: The normalization buffer and main buffer are
107 * both empty.
108 *
109 * Each time through the main loop we do the following:
110 * 1. If there are characters available in the normalization result buffer (from the
111 * result of normalizing a previous input character), copy the first character and
112 * any following characters that have non-zero combining class to the main buffer.
113 * 2. If there is nothing left in the normalization buffer, then loop processing
114 * input characters as follows:
115 * a) Get the next input character from UTF8, get its normalized and case-folded
116 * result in the normalization buffer.
117 * b) If the first character in the normalization buffer has combining class 0,
118 * break; we will handle this normalization buffer next time through the main
119 * loop.
120 * c) Else copy the current normalization buffer (which has only combining marks)
121 * to the main buffer, and continue with the loop processing input characters.
122 * 3. At this point the first character in the main buffer may or may not have
123 * combining class 0, but any subsequent characters (up to the the limit for
124 * stream safe text) will be combining characters with nonzero combining class.
125 * Reorder the combining marks if necessary into canonical order.
126 * 4. Call the hash function for each character in the main buffer.
127 *
128 */
129 do {
130 /* Data for the buffers being built up from input */
131 int32_t buf[kNCFStreamSafeBufMax];
132 uint8_t bufcc[kNCFStreamSafeBufMax];
133 int32_t buflen = 0;
134 bool needReorder = false;
135 int err;
137 err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false /* allow_slashes */,
138 unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start);
139 if (err != 0) {
140 return err;
141 }
143 if (buflen > 0) {
144 /* Now buffer should have all of the combining marks up to the next base char.
145 * Normally it will also start with the last base char encountered (unless the
146 * UTF8 string began with a combining mark). */
147 /* Now reorder combining marks if necessary. */
148 if (needReorder) {
149 doReorder(buf, bufcc, buflen);
150 }
151 /* Now write to hash func */
152 hash_func(buf, buflen * sizeof(buf[0]), hash_ctx);
153 }
154 /* OK so far, top of loop clears buffers to start refilling again */
155 } while (str < strLimit || unormlen > 0);
156 return 0;
160 * utf8_normalizeOptCaseFoldAndCompare
161 *
162 * strA: A UTF-8 string to be compared (need not be 0 terminated)
163 * strA_len: The byte length of strA (excluding any 0 terminator)
164 * strB: The second UTF-8 string to be compared (need not be 0 terminated)
165 * strB_len: The byte length of strB (excluding any 0 terminator)
166 * case_sens: False for case-insensitive behavior; compares canonical caseless forms.
167 * True for case-sensitive behavior; compares standard NFD forms.
168 * are_equal: On success, set to true if the strings are equal, or set to false
169 * if they are not.
170 *
171 * Returns: 0 on success, or
172 * EILSEQ: One or both of the input strings contains illegal ASCII-range
173 * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8,
174 * or contains codepoints that are non-characters or unassigned in
175 * the version of Unicode currently supported (Unicode 9.0).
176 * Note: The comparison may terminate early when a difference is
177 * detected, and may return 0 and set *are_equal=false even
178 * if one or both strings are invalid.
179 */
180enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + 4 }; /* room for 03B9 pushback(s) */
183utf8_normalizeOptCaseFoldAndCompare(const char *strA,
184 size_t strA_len,
185 const char *strB,
186 size_t strB_len,
187 bool case_sens,
188 bool *are_equal)
190 const char *strALimit = strA + strA_len;
191 const char *strBLimit = strB + strB_len;
193 /* Data for the next pending single-char norms from each input;
194 * These will always begin with a base char (combining class 0)
195 * or the first character in the string, which may not be a base */
196 int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback];
197 uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback];
198 int32_t unormAlen = 0, unormBlen = 0;
199 int32_t unormAstart = 0, unormBstart = 0;
201 bool startA = true, startB = true;
203 /* main loop:
204 * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash,
205 * described above. The differences are:
206 * - We keep a normalization buffer and main buffer for each string.
207 * - In the main loop, we do steps 1-3 for each string.
208 * - In step 4, instead of calling the hash function, we compare the two main
209 * buffers; if they are unequal, we return a non-equal result.
210 * - After the end of the main loop, if we still have data for one string but
211 * not the other, return a non-equal result, else return an equal result.
212 */
213 do {
214 /* Data for the buffers being built up from each input */
215 int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax];
216 uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax];
217 int32_t bufAlen = 0, bufBlen = 0;
218 bool needReorderA = false, needReorderB = false;
219 int err;
221 err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false /* allow_slashes */,
222 unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA);
223 if (err != 0) {
224 return err;
225 }
226 err = nextBaseAndAnyMarks(strP: &strB, strLimit: strBLimit, case_sens, false /* allow_slashes */,
227 unorm: unormB, unormcc: unormBcc, unormlenP: &unormBlen, unormstartP: &unormBstart, buf: bufB, bufcc: bufBcc, buflenP: &bufBlen, needReorderP: &needReorderB, startP: &startB);
228 if (err != 0) {
229 return err;
230 }
232 if (bufAlen > 0 || bufBlen > 0) {
233 /* Now each buffer should have all of the combining marks up to the next base char.
234 * Normally it will also start with the last base char encountered (unless the
235 * UTF8 string began with a combining mark). */
236 /* Now reorder combining marks if necessary. */
237 if (needReorderA) {
238 doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen);
239 }
240 if (needReorderB) {
241 doReorder(buf: bufB, bufcc: bufBcc, buflen: bufBlen);
242 }
243 /* handle 03B9 pushback */
244 int32_t idx;
245 if (!case_sens) {
246 if (bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9 && unormAstart == 0) {
247 int32_t tailCount = 0;
248 while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9) {
249 tailCount++;
250 bufAlen--;
251 }
252 for (idx = unormAlen; idx > 0; idx--) {
253 unormA[idx - 1 + tailCount] = unormA[idx - 1];
254 unormAcc[idx - 1 + tailCount] = unormAcc[idx - 1];
255 }
256 for (idx = 0; idx < tailCount; idx++) {
257 unormA[idx] = 0x03B9;
258 unormAcc[idx] = 0;
259 }
260 unormAlen += tailCount;
261 }
262 if (bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9 && unormBstart == 0) {
263 int32_t tailCount = 0;
264 while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9) {
265 tailCount++;
266 bufBlen--;
267 }
268 for (idx = unormBlen; idx > 0; idx--) {
269 unormB[idx - 1 + tailCount] = unormB[idx - 1];
270 unormBcc[idx - 1 + tailCount] = unormBcc[idx - 1];
271 }
272 for (idx = 0; idx < tailCount; idx++) {
273 unormB[idx] = 0x03B9;
274 unormBcc[idx] = 0;
275 }
276 unormBlen += tailCount;
277 }
278 }
279 /* Now compare the buffers. */
280 if (bufAlen != bufBlen || memcmp(s1: bufA, s2: bufB, n: bufAlen * sizeof(bufA[0])) != 0) {
281 *are_equal = false;
282 return 0;
283 }
284 }
285 /* OK so far, top of loop clears buffers to start refilling again */
286 } while ((strA < strALimit || unormAlen > 0) && (strB < strBLimit || unormBlen > 0));
288 *are_equal = (strA == strALimit && unormAlen == 0 && strB == strBLimit && unormBlen == 0);
289 return 0;
293 * utf8_normalizeOptCaseFold
294 *
295 * str: The input UTF-8 string (need not be 0 terminated)
296 * str_len: The byte length of the input string (excluding any 0 terminator)
297 * case_sens: False for case-insensitive behavior; generates canonical caseless form.
298 * True for case-sensitive behavior; generates standard NFD.
299 * ustr: A pointer to a buffer for the resulting UTF-32 string.
300 * ustr_size: The capacity of ustr, in UTF-32 units.
301 * ustr_len: Pointer to a value that will be filled in with the actual length
302 * in UTF-32 units of the string copied to ustr.
303 *
304 * Returns: 0 on success, or
305 * EILSEQ: The input string contains illegal ASCII-range characters
306 * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
307 * contains codepoints that are non-characters or unassigned in
308 * the version of Unicode currently supported.
309 * ENOMEM: ustr_size is insufficient for the resulting string. In this
310 * case the value returned in *ustr_len is invalid.
311 */
313utf8_normalizeOptCaseFold(const char *str,
314 size_t str_len,
315 bool case_sens,
316 int32_t *ustr,
317 int32_t ustr_size,
318 int32_t *ustr_len)
320 const char *strLimit = str + str_len;
321 int32_t *ustrCur = ustr;
322 const int32_t *ustrLimit = ustr + ustr_size;
324 /* Data for the next pending single-char norm from input;
325 * This will always begin with a base char (combining class 0) */
326 int32_t unorm[kNFCSingleCharDecompMax];
327 uint8_t unormcc[kNFCSingleCharDecompMax];
328 int32_t unormlen = 0;
329 int32_t unormstart = 0;
331 bool start = true;
333 *ustr_len = 0;
334 do {
335 /* Data for the buffers being built up from input */
336 int32_t buf[kNCFStreamSafeBufMax];
337 uint8_t bufcc[kNCFStreamSafeBufMax];
338 int32_t buflen = 0;
339 bool needReorder = false;
340 int err;
342 err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false /* allow_slashes */,
343 unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start);
344 if (err != 0) {
345 return err;
346 }
348 if (buflen > 0) {
349 if (needReorder) {
350 doReorder(buf, bufcc, buflen);
351 }
352 /* Now copy to output buffer */
353 int32_t idx;
354 if (ustrCur + buflen > ustrLimit) {
355 return ENOMEM;
356 }
357 for (idx = 0; idx < buflen; idx++) {
358 *ustrCur++ = buf[idx];
359 }
360 }
361 /* OK so far, top of loop clears buffers to start refilling again */
362 } while (str < strLimit || unormlen > 0);
363 *ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop
364 return 0;
367static int
368utf8_normalizeOptCaseFoldToUTF8_internal(const char *str,
369 size_t str_len,
370 bool case_sens,
371 bool allow_slashes,
372 char *ustr,
373 size_t ustr_size,
374 size_t *ustr_len)
376 const char *strLimit = str + str_len;
377 char *ustrCur = ustr;
378 const char *ustrLimit = ustr + ustr_size;
380 /* Data for the next pending single-char norm from input;
381 * This will always begin with a base char (combining class 0) */
382 int32_t unorm[kNFCSingleCharDecompMax];
383 uint8_t unormcc[kNFCSingleCharDecompMax];
384 int32_t unormlen = 0;
385 int32_t unormstart = 0;
387 bool start = true;
389 *ustr_len = 0;
390 do {
391 /* Data for the buffers being built up from input */
392 int32_t buf[kNCFStreamSafeBufMax];
393 uint8_t bufcc[kNCFStreamSafeBufMax];
394 int32_t buflen = 0;
395 bool needReorder = false;
396 int err;
398 err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, allow_slashes,
399 unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start);
400 if (err != 0) {
401 return err;
402 }
404 if (buflen > 0) {
405 uint8_t utf8Bytes[kMaxUTF8BytesPerChar];
406 int32_t *bufPtr = buf;
407 if (needReorder) {
408 doReorder(buf, bufcc, buflen);
409 }
410 /* Now copy to output buffer */
411 while (buflen-- > 0) {
412 int32_t idx, utf8Len = u32CharToUTF8Bytes(u32char: (uint32_t)*bufPtr++, utf8Bytes);
413 if (ustrCur + utf8Len > ustrLimit) {
414 return ENOMEM;
415 }
416 for (idx = 0; idx < utf8Len; idx++) {
417 *ustrCur++ = (char)utf8Bytes[idx];
418 }
419 }
420 }
421 /* OK so far, top of loop clears buffers to start refilling again */
422 } while (str < strLimit || unormlen > 0);
423 *ustr_len = ustrCur - ustr;
424 return 0;
/*
 * utf8_normalizeOptCaseFoldToUTF8
 * (This is similar to normalizeOptCaseFold except that this has a different output
 * buffer type, and adds conversion to UTF8 while copying to output buffer)
 *
 *  str:       The input UTF-8 string (need not be 0 terminated)
 *  str_len:   The byte length of the input string (excluding any 0 terminator)
 *  case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *             True for case-sensitive behavior; generates standard NFD.
 *  ustr:      A pointer to a buffer for the resulting UTF-8 string.
 *  ustr_size: The capacity of ustr, in bytes.
 *  ustr_len:  Pointer to a value that will be filled in with the actual length
 *             in bytes of the string copied to ustr.
 *
 *  Returns: 0 on success, or
 *           EILSEQ: The input string contains illegal ASCII-range characters
 *                   (0x00 or '/'), or is not well-formed stream-safe UTF-8, or
 *                   contains codepoints that are non-characters or unassigned in
 *                   the version of Unicode currently supported.
 *           ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                   case the value returned in *ustr_len is invalid.
 */
int
utf8_normalizeOptCaseFoldToUTF8(const char *str,
    size_t      str_len,
    bool        case_sens,
    char       *ustr,
    size_t      ustr_size,
    size_t     *ustr_len)
{
	/* '/' is rejected here; use the ForPath variant to allow it */
	return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, false /* allow_slashes */,
	           ustr, ustr_size, ustr_len);
}
/*
 * utf8_normalizeOptCaseFoldToUTF8ForPath
 * (This is similar to normalizeOptCaseFoldToUTF8 except that this allows '/' character.)
 *
 *  str:       The input UTF-8 path string
 *  str_len:   The byte length of the input path string (excluding any 0 terminator)
 *  case_sens: False for case-insensitive behavior; generates canonical caseless form.
 *             True for case-sensitive behavior; generates standard NFD.
 *  ustr:      A pointer to a buffer for the resulting UTF-8 string.
 *  ustr_size: The capacity of ustr, in bytes.
 *  ustr_len:  Pointer to a value that will be filled in with the actual length
 *             in bytes of the string copied to ustr.
 *
 *  Returns: 0 on success, or
 *           EILSEQ: The input string contains illegal ASCII-range characters
 *                   (0x00), or is not well-formed stream-safe UTF-8, or
 *                   contains codepoints that are non-characters or unassigned in
 *                   the version of Unicode currently supported.
 *           ENOMEM: ustr_size is insufficient for the resulting string. In this
 *                   case the value returned in *ustr_len is invalid.
 */
int
utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str,
    size_t      str_len,
    bool        case_sens,
    char       *ustr,
    size_t      ustr_size,
    size_t     *ustr_len)
{
	/* same as utf8_normalizeOptCaseFoldToUTF8 but '/' is passed through */
	return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, true /* allow_slashes */,
	           ustr, ustr_size, ustr_len);
}
495 * utf8_normalizeOptCaseFoldAndMatchSubstring
496 *
497 * strA: A UTF-8 string (need not be 0 terminated) in which to search for the
498 * substring specified by ustrB.
499 * strA_len: The byte length of strA (excluding any 0 terminator)
500 * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched
501 * for in the UTF-32 string resulting from converting strA to the normalized
502 * UTF-32 form specified by the case_sens parameter; ustrB must already be
503 * in that form.
504 * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator).
505 * case_sens: False for case-insensitive matching; compares canonical caseless forms.
506 * True for case-sensitive matching; compares standard NFD forms.
507 * buf: Pointer to caller-supplied working memory for storing the portion of
508 * strA which has been converted to normalized UTF-32.
509 * buf_size: The size of buf.
510 * has_match: On success, set to true if strA (when converter to UTF-32 and normalized
511 * per case_sens) contains ustrB, set to false otherwise.
512 *
513 * Returns: 0 on success, or
514 * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is
515 * not well-formed stream-safe UTF-8, or contains codepoints that are
516 * non-characters or unassigned in the version of Unicode currently
517 * supported.
518 * Note: The search may terminate early when a match is detected, and
519 * may return 0 and set *has_match=true even if strA is invalid.
520 * ENOMEM: buf_size is insufficient.
521 */
523utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA,
524 size_t strA_len,
525 const int32_t *ustrB,
526 int32_t ustrB_len,
527 bool case_sens,
528 void *buf,
529 size_t buf_size,
530 bool *has_match)
532 /*
533 * ustrA represents the current position in the UTF-32 normalized version of strA
534 * at which we want to test for a match; ustrANormEnd is the position beyond that
535 * which is just after the end of what has already been converted from strA to
536 * UTF-32 normalized form.
537 * Each time through the main loop:
538 * - The first task is to make sure we have enough of strA converted to UTF32
539 * normalized form to test for match with ustrB at the current match position.
540 * If we don't, then convert more of strA to UTF-32 normalized form until we
541 * have enough to compare with ustrB. To do this, run a loop which is like the
542 * main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of
543 * calling the hash function, we copy the normalized buffer to ustrANormEnd,
544 * advancing the latter. We keep doing this until we have enough additional
545 * converted to match with ustrB.
546 * - Then we test for match of ustrB at the current ustrA position. If there is
547 * a match we return; otherwise, if there is more strA to convert we advance
548 * ustrA and repeat the main loop, otherwise we return without a match.
549 */
550 if (ustrB_len == 0) { /* always matches */
551 *has_match = true;
552 return 0;
553 }
554 *has_match = false; /* initialize return value */
555 if (ustrB_len > 2 * strA_len) {
556 /* If ustrB is clearly too long to find in strA, don't bother normalizing strA.
557 * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit.
558 * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units.
559 * The maximum expansion from unnormalized UTF-8 byte length to normalized
560 * UTF-32 unit length is thus 2. */
561 return 0;
562 }
564 const char *strALimit = strA + strA_len;
565 int32_t *ustrA = (int32_t *)buf;
566 const int32_t *ustrALimit = ustrA + (buf_size / sizeof(int32_t));
567 int32_t *ustrANormEnd = ustrA; /* how far we have already normalized in ustrA */
569 /* Data for the next pending single-char norms from each input;
570 * These will always begin with a base char (combining class 0)
571 * or the first character in the string, which may not be a base */
572 int32_t unormA[kNFCSingleCharDecompMax];
573 uint8_t unormAcc[kNFCSingleCharDecompMax];
574 int32_t unormAlen = 0;
575 int32_t unormAstart = 0;
577 bool startA = true;
579 while (true) {
580 /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */
581 if (ustrANormEnd - ustrA < ustrB_len) {
582 do {
583 /* Data for the buffers being built up from each input */
584 int32_t bufA[kNCFStreamSafeBufMax];
585 uint8_t bufAcc[kNCFStreamSafeBufMax];
586 int32_t bufAlen = 0;
587 bool needReorderA = false;
588 int err;
590 err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false /* allow_slashes */,
591 unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA);
592 if (err != 0) {
593 return err;
594 }
596 if (bufAlen > 0) {
597 /* Now each buffer should have all of the combining marks up to the next base char.
598 * Normally it will also start with the last base char encountered (unless the
599 * UTF8 string began with a combining mark). */
600 /* Now reorder combining marks if necessary. Should be rare, and sequences should
601 * usually be short when does occur => simple bubblesort should be sufficient. */
602 if (needReorderA) {
603 doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen);
604 }
605 /* Now copy to working buffer */
606 int32_t idx;
607 if (ustrANormEnd + bufAlen > ustrALimit) {
608 return ENOMEM;
609 }
610 for (idx = 0; idx < bufAlen; idx++) {
611 *ustrANormEnd++ = bufA[idx];
612 }
613 }
614 /* OK so far, top of loop clears buffers to start refilling again */
615 } while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit || unormAlen > 0));
616 }
618 if (ustrANormEnd - ustrA < ustrB_len) {
619 return 0; /* not enough of strA left for match */
620 }
621 /* check for match, return if so */
622 if (memcmp(s1: ustrA, s2: ustrB, n: ustrB_len * sizeof(ustrB[0])) == 0) {
623 *has_match = true;
624 return 0;
625 }
626 ustrA++; /* advance match position */
627 }
630/* nextBaseAndAnyMarks:
631 * Guts of code to get next bufferful of base character (or first char in string)
632 * and all trailing combining marks.
633 * This is called each time through the main loop of functions above, and does the
634 * following:
635 * 1. If there are characters available in the normalization result buffer (from the
636 * result of normalizing a previous input character), copy the first character and
637 * any following characters that have non-zero combining class to the main buffer.
638 * 2. If there is nothing left in the normalization buffer, then loop processing
639 * input characters as follows:
640 * a) Get the next input character from UTF8, get its normalized and case-folded
641 * result in the normalization buffer.
642 * b) If the first character in the normalization buffer has combining class 0,
643 * break; we will handle this normalization buffer next time through the main
644 * loop.
645 * c) Else copy the current normalization buffer (which has only combining marks)
646 * to the main buffer, and continue with the loop processing input characters.
647 */
649static int
650nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes,
651 int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP,
652 int32_t* buf, uint8_t* bufcc, int32_t* buflenP,
653 bool* needReorderP, bool* startP)
655 /* update buffers for str */
656 if (*unormlenP > 0 && *unormstartP < *unormlenP) {
657 /* unorm begins with a base char; buflen should be 0 */
658 *needReorderP = false;
659 for (*buflenP = 0; true;) {
660 if (*buflenP > 0 && unormcc[*unormstartP] > 0 && unormcc[*unormstartP] < bufcc[(*buflenP) - 1]) {
661 *needReorderP = true;
662 }
663 buf[*buflenP] = unorm[*unormstartP];
664 bufcc[(*buflenP)++] = unormcc[(*unormstartP)++];
665 if (*unormstartP >= *unormlenP || unormcc[*unormstartP] == 0) {
666 break;
667 }
668 }
669 }
670 if (*unormstartP >= *unormlenP) {
671 *unormstartP = *unormlenP = 0;
672 while (*strP < strLimit) {
673 int32_t idx;
674 uint32_t bytevalue = (uint8_t)*(*strP)++;
675 /* '/' is not produced by NFD decomposition from another character so we can
676 * check for it before normalization */
677 if (bytevalue == 0 || (bytevalue == 0x2F /*'/'*/ && !allow_slashes)) {
678 return EILSEQ;
679 }
680 if (bytevalue < 0x80) {
681 unorm[0] = (!case_sens && bytevalue >= 'A' && bytevalue <= 'Z')? bytevalue += 0x20: bytevalue;
682 *unormlenP = 1;
683 unormcc[0] = 0;
684 *startP = false;
685 break;
686 } else {
687 int32_t u32char = utf8ToU32Code(u32char: bytevalue, srcPtr: strP, srcLimit: strLimit);
688 if (u32char <= 0) {
689 return EILSEQ;
690 }
691 *unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, u32NormFoldBuf: unorm, combClass: unormcc);
692 if (*unormlenP <= 0) {
693 return EILSEQ;
694 }
695 if (unormcc[0] == 0 || *startP) {
696 *startP = false;
697 break;
698 }
699 }
700 /* the latest char decomposes to just combining sequence, add to buffer being built */
701 if (*buflenP + *unormlenP > kNCFStreamSafeBufMax) {
702 return EILSEQ;
703 }
704 for (idx = 0; idx < *unormlenP; idx++, (*buflenP)++) {
705 if (*buflenP > 0 && unormcc[idx] > 0 && unormcc[idx] < bufcc[(*buflenP) - 1]) {
706 *needReorderP = true;
707 }
708 buf[*buflenP] = unorm[idx];
709 bufcc[*buflenP] = unormcc[idx];
710 }
711 *unormlenP = 0;
712 }
713 }
714 return 0;
717/* local prototypes used only by internal functions */
718static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]);
719static int32_t adjustCase(bool case_sens, int32_t uSeqLen,
720 int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
721static uint8_t getCombClassU32Char(int32_t u32char);
722static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]);
/* swap function for bubblesort: exchanges entry jdx with entry jdx - 1 in both
 * the character buffer and the parallel combining-class buffer (jdx must be >= 1) */
static void
swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[])
{
	int32_t bufchar = buf[jdx];
	uint8_t bufccval = bufcc[jdx];
	buf[jdx] = buf[jdx - 1];
	bufcc[jdx] = bufcc[jdx - 1];
	buf[jdx - 1] = bufchar;
	bufcc[jdx - 1] = bufccval;
}

/* Reorder combining marks if necessary. Should be rare, and sequences should
 * usually be short when does occur => simple bubblesort should be sufficient.
 *
 * Sorts the buflen entries of buf by ascending combining class (bufcc), keeping
 * the two arrays in step. Entries are only swapped when strictly out of order,
 * so entries with equal combining class keep their relative order. */
void
doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen)
{
	int32_t idx, jdx;
	for (idx = 0; idx < buflen - 1; idx++) {
		bool swapped = false;
		for (jdx = buflen - 1; jdx > idx; jdx--) {
			if (bufcc[jdx] < bufcc[jdx - 1]) {
				swapBufCharCCWithPrevious(jdx, buf, bufcc);
				swapped = true;
			}
		}
		/* a pass without swaps means the rest is already in order */
		if (!swapped) {
			break;
		}
	}
}
751 * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes,
752 * and returns the number of UTF8 bytes.
753 *
754 * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h).
755 */
757u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar])
759 int32_t idx = 0;
760 if (u32char <= 0x7F) {
761 utf8Bytes[idx++] = (uint8_t)u32char;
762 } else {
763 if (u32char <= 0x7FF) {
764 utf8Bytes[idx++] = (uint8_t)((u32char >> 6) | 0xC0);
765 } else {
766 if (u32char <= 0xFFFF) {
767 utf8Bytes[idx++] = (uint8_t)((u32char >> 12) | 0xE0);
768 } else {
769 utf8Bytes[idx++] = (uint8_t)((u32char >> 18) | 0xF0);
770 utf8Bytes[idx++] = (uint8_t)(((u32char >> 12) & 0x3F) | 0x80);
771 }
772 utf8Bytes[idx++] = (uint8_t)(((u32char >> 6) & 0x3F) | 0x80);
773 }
774 utf8Bytes[idx++] = (uint8_t)((u32char & 0x3F) | 0x80);
775 }
776 return idx;
/* two macros adapted from ICU's utf8.h */
/* Number of trail bytes implied by a lead byte: 0 for bytes below 0xC0 and for
 * 0xFE/0xFF, otherwise 1..5 (values 4 and 5 are not legal in Unicode's UTF-8). */
#define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \
	((uint8_t)(leadByte)<0XF0 ? \
	((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \
	(uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0)

/* Modifies leadByte in place, keeping only its low (6 - countTrailBytes) bits. */
#define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1)

/* array adapted from ICU's utf_impl.c */
static const int32_t utf8_minLegal[4] = { 0, 0X80, 0x800, 0x10000 };
/*
 * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point
 *
 * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c);
 * verified to produce the same results (adjusted for the difference in API signature).
 *
 * assumes at entry that:
 * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character
 *    has been read, and its value is in u32char
 * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly
 *    points to the trail bytes for that UTF8 char.
 * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer)
 *
 * For a valid and complete UTF8 character, the function returns its value and advances
 * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1
 * (and the value in *srcPtr is undefined).
 * Note that while it does not map to surrogate values (generates an error for malformed
 * UTF-8 that would map to values in 0xD800..0xDFFF), it does output noncharacter values
 * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error.
 *
 * equivalences used in adapted ICU code:
 * UChar = uint16_t
 * UChar32 = int32_t
 *
 * This has been validated against ICU behavior.
 */
818utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit)
820 const char* src = *srcPtr;
821 uint8_t pt1, pt2;
822 if (0xE0 < u32char && u32char <= 0xEC && src + 1 < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F && (pt2 = (uint8_t)(src[1] - 0x80)) <= 0x3F) {
823 /* handle U+1000..U+CFFF */
824 /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */
825 u32char = (uint16_t)((u32char << 12) | (pt1 << 6) | pt2);
826 src += 2;
827 } else if (u32char < 0xE0 && u32char >= 0xC2 && src < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F) {
828 /* handle U+0080..U+07FF */
829 u32char = ((u32char & 0x1F) << 6) | pt1;
830 src++;
831 } else {
832 /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */
833 uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char);
834 if (src + count <= srcLimit) {
835 uint8_t trail;
837 U8_MASK_LEAD_BYTE_LOC(u32char, count);
838 switch (count) {
839 /* branches 3, 2 fall through to the next one */
840 case 0: /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */
841 case 5:
842 case 4: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */
843 break;
844 case 3:
845 trail = *src++ - (char)0X80;
846 u32char = (u32char << 6) | trail;
847 /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */
848 if (u32char >= 0x110 || trail > 0X3F) {
849 break;
850 }
851 case 2:
852 trail = *src++ - (char)0X80;
853 u32char = (u32char << 6) | trail;
854 /*
855 * test for a surrogate D800..DFFF:
856 * before the last (u32char<<6), a surrogate is u32char=360..37F
857 */
858 if (((u32char & 0xFFE0) == 0x360) || trail > 0X3F) {
859 break;
860 }
861 case 1:
862 trail = *src++ - (char)0X80;
863 u32char = (u32char << 6) | trail;
864 if (trail > 0X3F) {
865 break;
866 }
867 /* correct sequence - all trail bytes have (b7..b6)==(10) */
868 if (u32char >= utf8_minLegal[count]) {
869 *srcPtr = src;
870 return u32char;
871 }
872 /* no default branch to optimize switch() - all values are covered */
873 }
874 }
875 u32char = -1;
876 }
877 *srcPtr = src;
878 return u32char;
882 * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result
883 * and the combining classes for each resulting char, or indicate it is invalid.
884 *
885 * The normalized and case-folded result might be up to 4 UTF32 characters (current
886 * max, could change in the future).
887 *
888 * u32char - input UTF32 code point
889 * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
890 * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
891 * to receive the normalize result.
892 * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
893 * to receive the combining classes for the characters in u32NormFoldBuf. If
894 * the first entry has non-zero combining class, the remaining entries do too.
895 *
896 * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
897 * is insufficient (though it is assumed to be at least 3), else the length of the
898 * normalized and case-folded result (currently in the range 1..4).
899 *
900 * This has been validated against ICU behavior.
901 *
902 * This function is highly dependent on the structure of the data trie; for details on
903 * that structure, see comments in normalizeCaseFoldData.h
904 */
907normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens,
908 int32_t u32NormFoldBuf[kNFCSingleCharDecompMax],
909 uint8_t combClass[kNFCSingleCharDecompMax])
911 combClass[0] = 0;
912 /* return hi-range PUA as self, except non-characters */
913 if (u32char >= kU32HiPUAStart) {
914 if ((u32char & 0xFFFE) == 0xFFFE) {
915 return -1;
916 }
917 u32NormFoldBuf[0] = u32char;
918 return 1;
919 }
920 /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
921 /* 0 - 0x323FF; everything in between in currently invalid. */
922 int32_t u32charLookup = u32char;
923 if (u32charLookup >= kU32LowRangeLimit) {
924 u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
925 if (u32charLookup < kU32LowRangeLimit || u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) {
926 return -1; /* in the large range of currently-unassigned code points */
927 }
928 }
929 /* Now we have u32charLookup either in 0..0x323FF representing u32char itself,
930 * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in
931 * the trie that identifies unassigneds in this range, or maps others to
932 * decomps or combining class or just self. */
933 uint16_t trieValue;
934 /* TrieHi */
935 trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
936 if (trieValue == kInvalidCodeFlag) {
937 return -1;
938 }
939 if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */
940 u32NormFoldBuf[0] = u32char;
941 combClass[0] = trieValue & kFlagValueMask;
942 return 1;
943 }
944 if (trieValue == kHangulMask) {
945 combClass[1] = combClass[2] = 0;
946 return decomposeHangul(u32char, u32NormFoldBuf);
947 }
948 /* TrieMid */
949 trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
950 if (trieValue == kInvalidCodeFlag) {
951 return -1;
952 }
953 if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
954 u32NormFoldBuf[0] = u32char;
955 combClass[0] = trieValue & kFlagValueMask;
956 return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf);
957 }
958 if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
959 uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask];
960 uint16_t testBit = (uint16_t)(1 << (u32charLookup & kNFTrieLoMask));
961 if (testBit & invalidMask) {
962 /* invalid */
963 return -1;
964 } else {
965 /* treat like trieValue == 0 above */
966 u32NormFoldBuf[0] = u32char;
967 return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf);
968 }
969 }
970 if (trieValue == kHangulMask) {
971 combClass[1] = combClass[2] = 0;
972 return decomposeHangul(u32char, u32NormFoldBuf);
973 }
974 /* TrieLo */
975 trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask];
976 if (trieValue == kInvalidCodeFlag) {
977 return -1;
978 }
979 if (trieValue == kHangulMask) {
980 combClass[1] = combClass[2] = 0;
981 return decomposeHangul(u32char, u32NormFoldBuf);
982 }
983 if (trieValue < kToU16Seq2Mask || trieValue > kSpecialsEnd) {
984 if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
985 u32NormFoldBuf[0] = u32char;
986 combClass[0] = trieValue & kFlagValueMask;
987 } else {
988 u32NormFoldBuf[0] = trieValue;
989 }
990 return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf);
991 }
992 const uint16_t* u16SeqPtr = NULL;
993 const int32_t* u32SeqPtr = NULL;
994 int32_t uSeqLen = 0;
995 switch (trieValue & kSpecialsMask) {
996 case kToU16Seq2Mask:
997 if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
998 /* don't use the mapping, it is only for case folding */
999 u32NormFoldBuf[0] = u32char;
1000 /* already have combClass[0] = 0 */
1001 return 1;
1002 }
1003 u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask];
1004 uSeqLen = 2;
1005 break;
1006 case kToU16Seq3Mask:
1007 if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
1008 /* don't use the mapping, it is only for case folding */
1009 u32NormFoldBuf[0] = u32char;
1010 /* already have combClass[0] = 0 */
1011 return 1;
1012 }
1013 u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask];
1014 uSeqLen = 3;
1015 break;
1016 case kToU16SeqMiscMask:
1017 u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask];
1018 uSeqLen = *u16SeqPtr & kToSeqMiscLenMask;
1019 combClass[0] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift);
1020 break;
1021 case kToU32CharMask:
1022 if (case_sens && (trieValue & kToSeqCaseFoldMask)) {
1023 /* don't use the mapping, it is only for case folding */
1024 u32NormFoldBuf[0] = u32char;
1025 /* already have combClass[0] = 0 */
1026 return 1;
1027 }
1028 u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask];
1029 uSeqLen = 1;
1030 break;
1031 case kToU32SeqMiscMask:
1032 u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask];
1033 uSeqLen = *u32SeqPtr & kToSeqMiscLenMask;
1034 combClass[0] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift);
1035 break;
1036 default:
1037 return -1;
1038 }
1039 if (kNFCSingleCharDecompMax < uSeqLen) {
1040 return 0;
1041 }
1042 int32_t idx;
1043 for (idx = 0; idx < uSeqLen; idx++) {
1044 u32NormFoldBuf[idx] = (u16SeqPtr)? *u16SeqPtr++: *u32SeqPtr++;
1045 if (idx > 0) {
1046 combClass[idx] = getCombClassU32Char(u32char: u32NormFoldBuf[idx]);
1047 }
1048 }
1049 return adjustCase(case_sens, uSeqLen, u32NormFoldBuf);
1053 * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding
1054 *
1055 * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only
1056 * uSeqLen - length of the sequence specified in the u32NormFoldBuf
1057 * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1058 * with normalized result.
1059 *
1060 * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax
1061 * is insufficient (though it is assumed to be at least 3), else the length of the
1062 * normalized and case-folded result (currently in the range 1..4).
1063 *
1064 * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1065 */
1067static int32_t
1068adjustCase(bool case_sens, int32_t uSeqLen,
1069 int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1071 if (!case_sens && uSeqLen > 0) {
1072 if (u32NormFoldBuf[0] < kSimpleCaseFoldLimit) {
1073 u32NormFoldBuf[0] = nfBasicCF[u32NormFoldBuf[0]];
1074 /* There is one case in which this maps to a character with different combining
1075 * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the
1076 * first or only character in the sequence, we want to keep the original
1077 * combining class, so nothing special to do here.
1078 */
1079 }
1080 /* The following is the only case where we have a casefolding after the first
1081 * character in the sequence. Don't worry about combining class here. that gets
1082 * set later for characters after the first.
1083 */
1084 if (uSeqLen > 1 && u32NormFoldBuf[uSeqLen - 1] == 0x0345) {
1085 u32NormFoldBuf[uSeqLen - 1] = 0x03B9;
1086 }
1087 }
1088 return uSeqLen;
1092 * getCombClassU32Char, map a single character (in UTF32 form) to its combining class.
1093 *
1094 * u32char - input UTF32 code point. This is assumed to be a valid character that does
1095 * not have a decomposition.
1096 *
1097 * returns combining class of the character.
1098 *
1099 * This is only called for characters after the first is a decomposition expansion. In
1100 * this situation, if we encounter U+03B9 (combining class 0), it is only there as the
1101 * case-folding of U+0345 (combining class 240). In this case it is the combining class
1102 * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all.
1103 *
1104 * This function is a reduced version of normalizeOptCaseFoldU32Char above.
1105 */
1106static uint8_t
1107getCombClassU32Char(int32_t u32char)
1109 if (u32char >= kU32HiPUAStart) {
1110 return 0;
1111 }
1112 if (u32char == 0x03B9) {
1113 return 240;
1114 }
1115 /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */
1116 /* 0 - 0x323FF; everything in between in currently invalid. */
1117 int32_t u32charLookup = u32char;
1118 if (u32charLookup >= kU32LowRangeLimit) {
1119 u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit);
1120 }
1121 /* Now we have u32charLookup either in 0..0x323FF representing u32char itself,
1122 * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in
1123 * the trie that identifies unassigneds in this range, or maps others to
1124 * decomps or combining class or just self. */
1125 uint16_t trieValue;
1126 /* TrieHi */
1127 trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift];
1128 if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) {
1129 return trieValue & kFlagValueMask;
1130 }
1131 /* TrieMid */
1132 trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask];
1133 if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */
1134 return trieValue & kFlagValueMask;
1135 }
1136 if ((trieValue & kFlagTestMask) == kInvMaskFlag) {
1137 return 0;
1138 }
1139 /* TrieLo */
1140 trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask];
1141 return ((trieValue & kFlagTestMask) == kCombClassFlag)? (trieValue & kFlagValueMask): 0;
1145 * decomposeHangul, map a single UTF32 code point for a composed Hangul
1146 * in the range AC00-D7A3, using algorithmic decomp
1147 *
1148 * The normalized result will be 2 or 3 UTF32 characters.
1149 *
1150 * u32char - input UTF32 code point
1151 * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3)
1152 * to receive the normalize result.
1153 *
1154 * returns the length of the normalized result (2..3).
1155 *
1156 * Adapted from ICU Hangul:decompose in normalizer2impl.h
1157 *
1158 */
1160enum {
1161 HANGUL_BASE=0xAC00,
1162 JAMO_L_BASE=0x1100, /* "lead" jamo */
1163 JAMO_V_BASE=0x1161, /* "vowel" jamo */
1164 JAMO_T_BASE=0x11A7, /* "trail" jamo */
1165 JAMO_L_COUNT=19,
1166 JAMO_V_COUNT=21,
1167 JAMO_T_COUNT=28,
1170static int32_t
1171decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax])
1173 u32char -= HANGUL_BASE;
1174 int32_t tIndex = u32char % JAMO_T_COUNT;
1175 u32char /= JAMO_T_COUNT;
1176 u32NormFoldBuf[0] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT);
1177 u32NormFoldBuf[1] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT);
1178 if (tIndex == 0) {
1179 return 2;
1180 }
1181 u32NormFoldBuf[2] = (uint16_t)(JAMO_T_BASE + tIndex);
1182 return 3;