1 | /* |
2 | * Copyright (C) 2016-2023 Apple, Inc. All rights reserved. |
3 | * Some portions covered by other copyrights, listed below. |
4 | *--- |
5 | * Copyright (C) 2016 and later: Unicode, Inc. and others. |
6 | * License & terms of use: http://www.unicode.org/copyright.html |
7 | *--- |
8 | * Copyright (C) 1999-2015, International Business Machines |
9 | * Corporation and others. All Rights Reserved. |
10 | * |
11 | * add APPLE_OSREFERENCE_LICENSE_HEADER stuff... |
12 | */ |
13 | |
14 | #include <libkern/libkern.h> |
15 | #include <sys/errno.h> |
16 | #include <sys/unicode.h> |
17 | #include "vfs_unicode_data.h" |
18 | #define STATIC_UNLESS_TEST static |
19 | |
20 | enum { |
21 | /* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */ |
22 | kMaxUTF8BytesPerChar = 4 |
23 | }; |
24 | |
25 | /* local prototypes used by exported functions (and themselves exported for testing) */ |
26 | STATIC_UNLESS_TEST |
27 | int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit); |
28 | STATIC_UNLESS_TEST |
29 | int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, |
30 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], |
31 | uint8_t combClass[kNFCSingleCharDecompMax]); |
32 | /* local prototypes used by exported functions (not exported for separate testing) */ |
33 | static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes, |
34 | int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP, |
35 | int32_t* buf, uint8_t* bufcc, int32_t* buflenP, |
36 | bool* needReorderP, bool* startP); |
37 | void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen); |
38 | int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]); |
39 | |
40 | /* |
41 | * utf8_normalizeOptCaseFoldGetUVersion |
42 | * |
43 | * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6 |
44 | * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3 |
45 | * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0 |
46 | * version[3] = Code revision level; for any given Unicode version, this value starts |
47 | * at 0 and is incremented for each significant revision to the |
48 | * normalizeOptCaseFold functions. |
49 | */ |
50 | void |
51 | utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]) |
52 | { |
53 | version[0] = 15; |
54 | version[1] = 1; |
55 | version[2] = 0; |
56 | version[3] = 0; |
57 | return; |
58 | } |
59 | |
60 | /* |
61 | * utf8_normalizeOptCaseFoldAndHash |
62 | * |
63 | * str: The input UTF-8 string (need not be 0 terminated) |
64 | * str_len: The byte length of the input string (excluding any 0 terminator) |
65 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
66 | * True for case-sensitive behavior; generates standard NFD. |
67 | * hash_func: A pointer to a hashing function to compute the hash of the |
68 | * normalized/case-folded result. buf contains buf_len bytes |
69 | * of data to be added to the hash using the caller-supplied |
70 | * context (ctx). |
71 | * hash_ctx: The context for the hash function. |
72 | * |
73 | * Returns: 0 on success, or |
74 | * EILSEQ: The input string contains illegal ASCII-range characters |
75 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or |
76 | * contains codepoints that are non-characters or unassigned in |
77 | * the version of Unicode currently supported (Unicode 9.0). |
78 | */ |
79 | |
80 | int |
81 | utf8_normalizeOptCaseFoldAndHash(const char *str, |
82 | size_t str_len, |
83 | bool case_sens, |
84 | void (*hash_func)(void *buf, size_t buf_len, void *ctx), |
85 | void *hash_ctx) |
86 | { |
87 | const char *strLimit = str + str_len; |
88 | |
89 | /* Data for the next pending single-char norm from input; |
90 | * This will always begin with a base char (combining class 0) |
91 | * or the first character in the string, which may no be a base */ |
92 | int32_t unorm[kNFCSingleCharDecompMax]; |
93 | uint8_t unormcc[kNFCSingleCharDecompMax]; |
94 | int32_t unormlen = 0; |
95 | int32_t unormstart = 0; |
96 | |
97 | bool start = true; |
98 | |
99 | /* main loop: |
100 | * Each input character may be normalized to a sequence of one or more characters, |
101 | * some of which may have non-zero combining class. Any sequence of characters |
102 | * with non-zero combining class resulting from one or more input characters needs |
103 | * to be accumulated in the main buffer so we can reorder as necessary before |
104 | * calling the hash function. |
105 | * |
106 | * At the beginning of the main loop: The normalization buffer and main buffer are |
107 | * both empty. |
108 | * |
109 | * Each time through the main loop we do the following: |
110 | * 1. If there are characters available in the normalization result buffer (from the |
111 | * result of normalizing a previous input character), copy the first character and |
112 | * any following characters that have non-zero combining class to the main buffer. |
113 | * 2. If there is nothing left in the normalization buffer, then loop processing |
114 | * input characters as follows: |
115 | * a) Get the next input character from UTF8, get its normalized and case-folded |
116 | * result in the normalization buffer. |
117 | * b) If the first character in the normalization buffer has combining class 0, |
118 | * break; we will handle this normalization buffer next time through the main |
119 | * loop. |
120 | * c) Else copy the current normalization buffer (which has only combining marks) |
121 | * to the main buffer, and continue with the loop processing input characters. |
122 | * 3. At this point the first character in the main buffer may or may not have |
123 | * combining class 0, but any subsequent characters (up to the the limit for |
124 | * stream safe text) will be combining characters with nonzero combining class. |
125 | * Reorder the combining marks if necessary into canonical order. |
126 | * 4. Call the hash function for each character in the main buffer. |
127 | * |
128 | */ |
129 | do { |
130 | /* Data for the buffers being built up from input */ |
131 | int32_t buf[kNCFStreamSafeBufMax]; |
132 | uint8_t bufcc[kNCFStreamSafeBufMax]; |
133 | int32_t buflen = 0; |
134 | bool needReorder = false; |
135 | int err; |
136 | |
137 | err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false /* allow_slashes */, |
138 | unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start); |
139 | if (err != 0) { |
140 | return err; |
141 | } |
142 | |
143 | if (buflen > 0) { |
144 | /* Now buffer should have all of the combining marks up to the next base char. |
145 | * Normally it will also start with the last base char encountered (unless the |
146 | * UTF8 string began with a combining mark). */ |
147 | /* Now reorder combining marks if necessary. */ |
148 | if (needReorder) { |
149 | doReorder(buf, bufcc, buflen); |
150 | } |
151 | /* Now write to hash func */ |
152 | hash_func(buf, buflen * sizeof(buf[0]), hash_ctx); |
153 | } |
154 | /* OK so far, top of loop clears buffers to start refilling again */ |
155 | } while (str < strLimit || unormlen > 0); |
156 | return 0; |
157 | } |
158 | |
159 | /* |
160 | * utf8_normalizeOptCaseFoldAndCompare |
161 | * |
162 | * strA: A UTF-8 string to be compared (need not be 0 terminated) |
163 | * strA_len: The byte length of strA (excluding any 0 terminator) |
164 | * strB: The second UTF-8 string to be compared (need not be 0 terminated) |
165 | * strB_len: The byte length of strB (excluding any 0 terminator) |
166 | * case_sens: False for case-insensitive behavior; compares canonical caseless forms. |
167 | * True for case-sensitive behavior; compares standard NFD forms. |
168 | * are_equal: On success, set to true if the strings are equal, or set to false |
169 | * if they are not. |
170 | * |
171 | * Returns: 0 on success, or |
172 | * EILSEQ: One or both of the input strings contains illegal ASCII-range |
173 | * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8, |
174 | * or contains codepoints that are non-characters or unassigned in |
175 | * the version of Unicode currently supported (Unicode 9.0). |
176 | * Note: The comparison may terminate early when a difference is |
177 | * detected, and may return 0 and set *are_equal=false even |
178 | * if one or both strings are invalid. |
179 | */ |
180 | enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + 4 }; /* room for 03B9 pushback(s) */ |
181 | |
182 | int |
183 | utf8_normalizeOptCaseFoldAndCompare(const char *strA, |
184 | size_t strA_len, |
185 | const char *strB, |
186 | size_t strB_len, |
187 | bool case_sens, |
188 | bool *are_equal) |
189 | { |
190 | const char *strALimit = strA + strA_len; |
191 | const char *strBLimit = strB + strB_len; |
192 | |
193 | /* Data for the next pending single-char norms from each input; |
194 | * These will always begin with a base char (combining class 0) |
195 | * or the first character in the string, which may not be a base */ |
196 | int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback]; |
197 | uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback]; |
198 | int32_t unormAlen = 0, unormBlen = 0; |
199 | int32_t unormAstart = 0, unormBstart = 0; |
200 | |
201 | bool startA = true, startB = true; |
202 | |
203 | /* main loop: |
204 | * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash, |
205 | * described above. The differences are: |
206 | * - We keep a normalization buffer and main buffer for each string. |
207 | * - In the main loop, we do steps 1-3 for each string. |
208 | * - In step 4, instead of calling the hash function, we compare the two main |
209 | * buffers; if they are unequal, we return a non-equal result. |
210 | * - After the end of the main loop, if we still have data for one string but |
211 | * not the other, return a non-equal result, else return an equal result. |
212 | */ |
213 | do { |
214 | /* Data for the buffers being built up from each input */ |
215 | int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax]; |
216 | uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax]; |
217 | int32_t bufAlen = 0, bufBlen = 0; |
218 | bool needReorderA = false, needReorderB = false; |
219 | int err; |
220 | |
221 | err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false /* allow_slashes */, |
222 | unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA); |
223 | if (err != 0) { |
224 | return err; |
225 | } |
226 | err = nextBaseAndAnyMarks(strP: &strB, strLimit: strBLimit, case_sens, false /* allow_slashes */, |
227 | unorm: unormB, unormcc: unormBcc, unormlenP: &unormBlen, unormstartP: &unormBstart, buf: bufB, bufcc: bufBcc, buflenP: &bufBlen, needReorderP: &needReorderB, startP: &startB); |
228 | if (err != 0) { |
229 | return err; |
230 | } |
231 | |
232 | if (bufAlen > 0 || bufBlen > 0) { |
233 | /* Now each buffer should have all of the combining marks up to the next base char. |
234 | * Normally it will also start with the last base char encountered (unless the |
235 | * UTF8 string began with a combining mark). */ |
236 | /* Now reorder combining marks if necessary. */ |
237 | if (needReorderA) { |
238 | doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen); |
239 | } |
240 | if (needReorderB) { |
241 | doReorder(buf: bufB, bufcc: bufBcc, buflen: bufBlen); |
242 | } |
243 | /* handle 03B9 pushback */ |
244 | int32_t idx; |
245 | if (!case_sens) { |
246 | if (bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9 && unormAstart == 0) { |
247 | int32_t tailCount = 0; |
248 | while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9) { |
249 | tailCount++; |
250 | bufAlen--; |
251 | } |
252 | for (idx = unormAlen; idx > 0; idx--) { |
253 | unormA[idx - 1 + tailCount] = unormA[idx - 1]; |
254 | unormAcc[idx - 1 + tailCount] = unormAcc[idx - 1]; |
255 | } |
256 | for (idx = 0; idx < tailCount; idx++) { |
257 | unormA[idx] = 0x03B9; |
258 | unormAcc[idx] = 0; |
259 | } |
260 | unormAlen += tailCount; |
261 | } |
262 | if (bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9 && unormBstart == 0) { |
263 | int32_t tailCount = 0; |
264 | while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9) { |
265 | tailCount++; |
266 | bufBlen--; |
267 | } |
268 | for (idx = unormBlen; idx > 0; idx--) { |
269 | unormB[idx - 1 + tailCount] = unormB[idx - 1]; |
270 | unormBcc[idx - 1 + tailCount] = unormBcc[idx - 1]; |
271 | } |
272 | for (idx = 0; idx < tailCount; idx++) { |
273 | unormB[idx] = 0x03B9; |
274 | unormBcc[idx] = 0; |
275 | } |
276 | unormBlen += tailCount; |
277 | } |
278 | } |
279 | /* Now compare the buffers. */ |
280 | if (bufAlen != bufBlen || memcmp(s1: bufA, s2: bufB, n: bufAlen * sizeof(bufA[0])) != 0) { |
281 | *are_equal = false; |
282 | return 0; |
283 | } |
284 | } |
285 | /* OK so far, top of loop clears buffers to start refilling again */ |
286 | } while ((strA < strALimit || unormAlen > 0) && (strB < strBLimit || unormBlen > 0)); |
287 | |
288 | *are_equal = (strA == strALimit && unormAlen == 0 && strB == strBLimit && unormBlen == 0); |
289 | return 0; |
290 | } |
291 | |
292 | /* |
293 | * utf8_normalizeOptCaseFold |
294 | * |
295 | * str: The input UTF-8 string (need not be 0 terminated) |
296 | * str_len: The byte length of the input string (excluding any 0 terminator) |
297 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
298 | * True for case-sensitive behavior; generates standard NFD. |
299 | * ustr: A pointer to a buffer for the resulting UTF-32 string. |
300 | * ustr_size: The capacity of ustr, in UTF-32 units. |
301 | * ustr_len: Pointer to a value that will be filled in with the actual length |
302 | * in UTF-32 units of the string copied to ustr. |
303 | * |
304 | * Returns: 0 on success, or |
305 | * EILSEQ: The input string contains illegal ASCII-range characters |
306 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or |
307 | * contains codepoints that are non-characters or unassigned in |
308 | * the version of Unicode currently supported. |
309 | * ENOMEM: ustr_size is insufficient for the resulting string. In this |
310 | * case the value returned in *ustr_len is invalid. |
311 | */ |
312 | int |
313 | utf8_normalizeOptCaseFold(const char *str, |
314 | size_t str_len, |
315 | bool case_sens, |
316 | int32_t *ustr, |
317 | int32_t ustr_size, |
318 | int32_t *ustr_len) |
319 | { |
320 | const char *strLimit = str + str_len; |
321 | int32_t *ustrCur = ustr; |
322 | const int32_t *ustrLimit = ustr + ustr_size; |
323 | |
324 | /* Data for the next pending single-char norm from input; |
325 | * This will always begin with a base char (combining class 0) */ |
326 | int32_t unorm[kNFCSingleCharDecompMax]; |
327 | uint8_t unormcc[kNFCSingleCharDecompMax]; |
328 | int32_t unormlen = 0; |
329 | int32_t unormstart = 0; |
330 | |
331 | bool start = true; |
332 | |
333 | *ustr_len = 0; |
334 | do { |
335 | /* Data for the buffers being built up from input */ |
336 | int32_t buf[kNCFStreamSafeBufMax]; |
337 | uint8_t bufcc[kNCFStreamSafeBufMax]; |
338 | int32_t buflen = 0; |
339 | bool needReorder = false; |
340 | int err; |
341 | |
342 | err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false /* allow_slashes */, |
343 | unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start); |
344 | if (err != 0) { |
345 | return err; |
346 | } |
347 | |
348 | if (buflen > 0) { |
349 | if (needReorder) { |
350 | doReorder(buf, bufcc, buflen); |
351 | } |
352 | /* Now copy to output buffer */ |
353 | int32_t idx; |
354 | if (ustrCur + buflen > ustrLimit) { |
355 | return ENOMEM; |
356 | } |
357 | for (idx = 0; idx < buflen; idx++) { |
358 | *ustrCur++ = buf[idx]; |
359 | } |
360 | } |
361 | /* OK so far, top of loop clears buffers to start refilling again */ |
362 | } while (str < strLimit || unormlen > 0); |
363 | *ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop |
364 | return 0; |
365 | } |
366 | |
367 | static int |
368 | utf8_normalizeOptCaseFoldToUTF8_internal(const char *str, |
369 | size_t str_len, |
370 | bool case_sens, |
371 | bool allow_slashes, |
372 | char *ustr, |
373 | size_t ustr_size, |
374 | size_t *ustr_len) |
375 | { |
376 | const char *strLimit = str + str_len; |
377 | char *ustrCur = ustr; |
378 | const char *ustrLimit = ustr + ustr_size; |
379 | |
380 | /* Data for the next pending single-char norm from input; |
381 | * This will always begin with a base char (combining class 0) */ |
382 | int32_t unorm[kNFCSingleCharDecompMax]; |
383 | uint8_t unormcc[kNFCSingleCharDecompMax]; |
384 | int32_t unormlen = 0; |
385 | int32_t unormstart = 0; |
386 | |
387 | bool start = true; |
388 | |
389 | *ustr_len = 0; |
390 | do { |
391 | /* Data for the buffers being built up from input */ |
392 | int32_t buf[kNCFStreamSafeBufMax]; |
393 | uint8_t bufcc[kNCFStreamSafeBufMax]; |
394 | int32_t buflen = 0; |
395 | bool needReorder = false; |
396 | int err; |
397 | |
398 | err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, allow_slashes, |
399 | unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start); |
400 | if (err != 0) { |
401 | return err; |
402 | } |
403 | |
404 | if (buflen > 0) { |
405 | uint8_t utf8Bytes[kMaxUTF8BytesPerChar]; |
406 | int32_t *bufPtr = buf; |
407 | if (needReorder) { |
408 | doReorder(buf, bufcc, buflen); |
409 | } |
410 | /* Now copy to output buffer */ |
411 | while (buflen-- > 0) { |
412 | int32_t idx, utf8Len = u32CharToUTF8Bytes(u32char: (uint32_t)*bufPtr++, utf8Bytes); |
413 | if (ustrCur + utf8Len > ustrLimit) { |
414 | return ENOMEM; |
415 | } |
416 | for (idx = 0; idx < utf8Len; idx++) { |
417 | *ustrCur++ = (char)utf8Bytes[idx]; |
418 | } |
419 | } |
420 | } |
421 | /* OK so far, top of loop clears buffers to start refilling again */ |
422 | } while (str < strLimit || unormlen > 0); |
423 | *ustr_len = ustrCur - ustr; |
424 | return 0; |
425 | } |
426 | |
427 | /* |
428 | * utf8_normalizeOptCaseFoldToUTF8 |
429 | * (This is similar to normalizeOptCaseFold except that this has a different output |
430 | * buffer type, and adds conversion to UTF8 while copying to output buffer) |
431 | * |
432 | * str: The input UTF-8 string (need not be 0 terminated) |
433 | * str_len: The byte length of the input string (excluding any 0 terminator) |
434 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
435 | * True for case-sensitive behavior; generates standard NFD. |
436 | * ustr: A pointer to a buffer for the resulting UTF-8 string. |
437 | * ustr_size: The capacity of ustr, in bytes. |
438 | * ustr_len: Pointer to a value that will be filled in with the actual length |
439 | * in bytes of the string copied to ustr. |
440 | * |
441 | * Returns: 0 on success, or |
442 | * EILSEQ: The input string contains illegal ASCII-range characters |
443 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or |
444 | * contains codepoints that are non-characters or unassigned in |
445 | * the version of Unicode currently supported. |
446 | * ENOMEM: ustr_size is insufficient for the resulting string. In this |
447 | * case the value returned in *ustr_len is invalid. |
448 | */ |
449 | int |
450 | utf8_normalizeOptCaseFoldToUTF8(const char *str, |
451 | size_t str_len, |
452 | bool case_sens, |
453 | char *ustr, |
454 | size_t ustr_size, |
455 | size_t *ustr_len) |
456 | { |
457 | return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, false /* allow_slashes */, |
458 | ustr, ustr_size, ustr_len); |
459 | } |
460 | |
461 | /* |
462 | * utf8_normalizeOptCaseFoldToUTF8ForPath |
463 | * (This is similar to normalizeOptCaseFoldToUTF8 except that this allows '/' character.) |
464 | * |
465 | * str: The input UTF-8 path string |
466 | * str_len: The byte length of the input path string (excluding any 0 terminator) |
467 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
468 | * True for case-sensitive behavior; generates standard NFD. |
469 | * ustr: A pointer to a buffer for the resulting UTF-8 string. |
470 | * ustr_size: The capacity of ustr, in bytes. |
471 | * ustr_len: Pointer to a value that will be filled in with the actual length |
472 | * in bytes of the string copied to ustr. |
473 | * |
474 | * Returns: 0 on success, or |
475 | * EILSEQ: The input string contains illegal ASCII-range characters |
476 | * (0x00), or is not well-formed stream-safe UTF-8, or |
477 | * contains codepoints that are non-characters or unassigned in |
478 | * the version of Unicode currently supported. |
479 | * ENOMEM: ustr_size is insufficient for the resulting string. In this |
480 | * case the value returned in *ustr_len is invalid. |
481 | */ |
482 | int |
483 | utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str, |
484 | size_t str_len, |
485 | bool case_sens, |
486 | char *ustr, |
487 | size_t ustr_size, |
488 | size_t *ustr_len) |
489 | { |
490 | return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, true /* allow_slashes */, |
491 | ustr, ustr_size, ustr_len); |
492 | } |
493 | |
494 | /* |
495 | * utf8_normalizeOptCaseFoldAndMatchSubstring |
496 | * |
497 | * strA: A UTF-8 string (need not be 0 terminated) in which to search for the |
498 | * substring specified by ustrB. |
499 | * strA_len: The byte length of strA (excluding any 0 terminator) |
500 | * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched |
501 | * for in the UTF-32 string resulting from converting strA to the normalized |
502 | * UTF-32 form specified by the case_sens parameter; ustrB must already be |
503 | * in that form. |
504 | * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator). |
505 | * case_sens: False for case-insensitive matching; compares canonical caseless forms. |
506 | * True for case-sensitive matching; compares standard NFD forms. |
507 | * buf: Pointer to caller-supplied working memory for storing the portion of |
508 | * strA which has been converted to normalized UTF-32. |
509 | * buf_size: The size of buf. |
510 | * has_match: On success, set to true if strA (when converter to UTF-32 and normalized |
511 | * per case_sens) contains ustrB, set to false otherwise. |
512 | * |
513 | * Returns: 0 on success, or |
514 | * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is |
515 | * not well-formed stream-safe UTF-8, or contains codepoints that are |
516 | * non-characters or unassigned in the version of Unicode currently |
517 | * supported. |
518 | * Note: The search may terminate early when a match is detected, and |
519 | * may return 0 and set *has_match=true even if strA is invalid. |
520 | * ENOMEM: buf_size is insufficient. |
521 | */ |
522 | int |
523 | utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA, |
524 | size_t strA_len, |
525 | const int32_t *ustrB, |
526 | int32_t ustrB_len, |
527 | bool case_sens, |
528 | void *buf, |
529 | size_t buf_size, |
530 | bool *has_match) |
531 | { |
532 | /* |
533 | * ustrA represents the current position in the UTF-32 normalized version of strA |
534 | * at which we want to test for a match; ustrANormEnd is the position beyond that |
535 | * which is just after the end of what has already been converted from strA to |
536 | * UTF-32 normalized form. |
537 | * Each time through the main loop: |
538 | * - The first task is to make sure we have enough of strA converted to UTF32 |
539 | * normalized form to test for match with ustrB at the current match position. |
540 | * If we don't, then convert more of strA to UTF-32 normalized form until we |
541 | * have enough to compare with ustrB. To do this, run a loop which is like the |
542 | * main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of |
543 | * calling the hash function, we copy the normalized buffer to ustrANormEnd, |
544 | * advancing the latter. We keep doing this until we have enough additional |
545 | * converted to match with ustrB. |
546 | * - Then we test for match of ustrB at the current ustrA position. If there is |
547 | * a match we return; otherwise, if there is more strA to convert we advance |
548 | * ustrA and repeat the main loop, otherwise we return without a match. |
549 | */ |
550 | if (ustrB_len == 0) { /* always matches */ |
551 | *has_match = true; |
552 | return 0; |
553 | } |
554 | *has_match = false; /* initialize return value */ |
555 | if (ustrB_len > 2 * strA_len) { |
556 | /* If ustrB is clearly too long to find in strA, don't bother normalizing strA. |
557 | * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit. |
558 | * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units. |
559 | * The maximum expansion from unnormalized UTF-8 byte length to normalized |
560 | * UTF-32 unit length is thus 2. */ |
561 | return 0; |
562 | } |
563 | |
564 | const char *strALimit = strA + strA_len; |
565 | int32_t *ustrA = (int32_t *)buf; |
566 | const int32_t *ustrALimit = ustrA + (buf_size / sizeof(int32_t)); |
567 | int32_t *ustrANormEnd = ustrA; /* how far we have already normalized in ustrA */ |
568 | |
569 | /* Data for the next pending single-char norms from each input; |
570 | * These will always begin with a base char (combining class 0) |
571 | * or the first character in the string, which may not be a base */ |
572 | int32_t unormA[kNFCSingleCharDecompMax]; |
573 | uint8_t unormAcc[kNFCSingleCharDecompMax]; |
574 | int32_t unormAlen = 0; |
575 | int32_t unormAstart = 0; |
576 | |
577 | bool startA = true; |
578 | |
579 | while (true) { |
580 | /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */ |
581 | if (ustrANormEnd - ustrA < ustrB_len) { |
582 | do { |
583 | /* Data for the buffers being built up from each input */ |
584 | int32_t bufA[kNCFStreamSafeBufMax]; |
585 | uint8_t bufAcc[kNCFStreamSafeBufMax]; |
586 | int32_t bufAlen = 0; |
587 | bool needReorderA = false; |
588 | int err; |
589 | |
590 | err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false /* allow_slashes */, |
591 | unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA); |
592 | if (err != 0) { |
593 | return err; |
594 | } |
595 | |
596 | if (bufAlen > 0) { |
597 | /* Now each buffer should have all of the combining marks up to the next base char. |
598 | * Normally it will also start with the last base char encountered (unless the |
599 | * UTF8 string began with a combining mark). */ |
600 | /* Now reorder combining marks if necessary. Should be rare, and sequences should |
601 | * usually be short when does occur => simple bubblesort should be sufficient. */ |
602 | if (needReorderA) { |
603 | doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen); |
604 | } |
605 | /* Now copy to working buffer */ |
606 | int32_t idx; |
607 | if (ustrANormEnd + bufAlen > ustrALimit) { |
608 | return ENOMEM; |
609 | } |
610 | for (idx = 0; idx < bufAlen; idx++) { |
611 | *ustrANormEnd++ = bufA[idx]; |
612 | } |
613 | } |
614 | /* OK so far, top of loop clears buffers to start refilling again */ |
615 | } while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit || unormAlen > 0)); |
616 | } |
617 | |
618 | if (ustrANormEnd - ustrA < ustrB_len) { |
619 | return 0; /* not enough of strA left for match */ |
620 | } |
621 | /* check for match, return if so */ |
622 | if (memcmp(s1: ustrA, s2: ustrB, n: ustrB_len * sizeof(ustrB[0])) == 0) { |
623 | *has_match = true; |
624 | return 0; |
625 | } |
626 | ustrA++; /* advance match position */ |
627 | } |
628 | } |
629 | |
630 | /* nextBaseAndAnyMarks: |
631 | * Guts of code to get next bufferful of base character (or first char in string) |
632 | * and all trailing combining marks. |
633 | * This is called each time through the main loop of functions above, and does the |
634 | * following: |
635 | * 1. If there are characters available in the normalization result buffer (from the |
636 | * result of normalizing a previous input character), copy the first character and |
637 | * any following characters that have non-zero combining class to the main buffer. |
638 | * 2. If there is nothing left in the normalization buffer, then loop processing |
639 | * input characters as follows: |
640 | * a) Get the next input character from UTF8, get its normalized and case-folded |
641 | * result in the normalization buffer. |
642 | * b) If the first character in the normalization buffer has combining class 0, |
643 | * break; we will handle this normalization buffer next time through the main |
644 | * loop. |
645 | * c) Else copy the current normalization buffer (which has only combining marks) |
646 | * to the main buffer, and continue with the loop processing input characters. |
647 | */ |
648 | |
649 | static int |
650 | nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes, |
651 | int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP, |
652 | int32_t* buf, uint8_t* bufcc, int32_t* buflenP, |
653 | bool* needReorderP, bool* startP) |
654 | { |
655 | /* update buffers for str */ |
656 | if (*unormlenP > 0 && *unormstartP < *unormlenP) { |
657 | /* unorm begins with a base char; buflen should be 0 */ |
658 | *needReorderP = false; |
659 | for (*buflenP = 0; true;) { |
660 | if (*buflenP > 0 && unormcc[*unormstartP] > 0 && unormcc[*unormstartP] < bufcc[(*buflenP) - 1]) { |
661 | *needReorderP = true; |
662 | } |
663 | buf[*buflenP] = unorm[*unormstartP]; |
664 | bufcc[(*buflenP)++] = unormcc[(*unormstartP)++]; |
665 | if (*unormstartP >= *unormlenP || unormcc[*unormstartP] == 0) { |
666 | break; |
667 | } |
668 | } |
669 | } |
670 | if (*unormstartP >= *unormlenP) { |
671 | *unormstartP = *unormlenP = 0; |
672 | while (*strP < strLimit) { |
673 | int32_t idx; |
674 | uint32_t bytevalue = (uint8_t)*(*strP)++; |
675 | /* '/' is not produced by NFD decomposition from another character so we can |
676 | * check for it before normalization */ |
677 | if (bytevalue == 0 || (bytevalue == 0x2F /*'/'*/ && !allow_slashes)) { |
678 | return EILSEQ; |
679 | } |
680 | if (bytevalue < 0x80) { |
681 | unorm[0] = (!case_sens && bytevalue >= 'A' && bytevalue <= 'Z')? bytevalue += 0x20: bytevalue; |
682 | *unormlenP = 1; |
683 | unormcc[0] = 0; |
684 | *startP = false; |
685 | break; |
686 | } else { |
687 | int32_t u32char = utf8ToU32Code(u32char: bytevalue, srcPtr: strP, srcLimit: strLimit); |
688 | if (u32char <= 0) { |
689 | return EILSEQ; |
690 | } |
691 | *unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, u32NormFoldBuf: unorm, combClass: unormcc); |
692 | if (*unormlenP <= 0) { |
693 | return EILSEQ; |
694 | } |
695 | if (unormcc[0] == 0 || *startP) { |
696 | *startP = false; |
697 | break; |
698 | } |
699 | } |
700 | /* the latest char decomposes to just combining sequence, add to buffer being built */ |
701 | if (*buflenP + *unormlenP > kNCFStreamSafeBufMax) { |
702 | return EILSEQ; |
703 | } |
704 | for (idx = 0; idx < *unormlenP; idx++, (*buflenP)++) { |
705 | if (*buflenP > 0 && unormcc[idx] > 0 && unormcc[idx] < bufcc[(*buflenP) - 1]) { |
706 | *needReorderP = true; |
707 | } |
708 | buf[*buflenP] = unorm[idx]; |
709 | bufcc[*buflenP] = unormcc[idx]; |
710 | } |
711 | *unormlenP = 0; |
712 | } |
713 | } |
714 | return 0; |
715 | } |
716 | |
717 | /* local prototypes used only by internal functions */ |
718 | static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]); |
719 | static int32_t adjustCase(bool case_sens, int32_t uSeqLen, |
720 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]); |
721 | static uint8_t getCombClassU32Char(int32_t u32char); |
722 | static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]); |
723 | |
724 | /* Reorder combining marks if necessary. Should be rare, and sequences should |
725 | * usually be short when does occur => simple bubblesort should be sufficient. */ |
726 | void |
727 | doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen) |
728 | { |
729 | int32_t idx, jdx; |
730 | for (idx = 0; idx < buflen - 1; idx++) { |
731 | for (jdx = buflen - 1; jdx > idx; jdx--) { |
732 | if (bufcc[jdx] < bufcc[jdx - 1]) { |
733 | swapBufCharCCWithPrevious(jdx, buf, bufcc); |
734 | } |
735 | } |
736 | } |
737 | } |
738 | /* swap function for bubblesort */ |
739 | static void |
740 | swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]) |
741 | { |
742 | int32_t bufchar = buf[jdx]; |
743 | uint8_t bufccval = bufcc[jdx]; |
744 | buf[jdx] = buf[jdx - 1]; |
745 | bufcc[jdx] = bufcc[jdx - 1]; |
746 | buf[jdx - 1] = bufchar; |
747 | bufcc[jdx - 1] = bufccval; |
748 | } |
749 | |
750 | /* |
751 | * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes, |
752 | * and returns the number of UTF8 bytes. |
753 | * |
754 | * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h). |
755 | */ |
756 | int32_t |
757 | u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]) |
758 | { |
759 | int32_t idx = 0; |
760 | if (u32char <= 0x7F) { |
761 | utf8Bytes[idx++] = (uint8_t)u32char; |
762 | } else { |
763 | if (u32char <= 0x7FF) { |
764 | utf8Bytes[idx++] = (uint8_t)((u32char >> 6) | 0xC0); |
765 | } else { |
766 | if (u32char <= 0xFFFF) { |
767 | utf8Bytes[idx++] = (uint8_t)((u32char >> 12) | 0xE0); |
768 | } else { |
769 | utf8Bytes[idx++] = (uint8_t)((u32char >> 18) | 0xF0); |
770 | utf8Bytes[idx++] = (uint8_t)(((u32char >> 12) & 0x3F) | 0x80); |
771 | } |
772 | utf8Bytes[idx++] = (uint8_t)(((u32char >> 6) & 0x3F) | 0x80); |
773 | } |
774 | utf8Bytes[idx++] = (uint8_t)((u32char & 0x3F) | 0x80); |
775 | } |
776 | return idx; |
777 | } |
778 | |
779 | /* two macros adapted from ICU's utf8.h */ |
780 | #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \ |
781 | ((uint8_t)(leadByte)<0XF0 ? \ |
782 | ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \ |
783 | (uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0) |
784 | |
785 | #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
786 | |
787 | /* array adapted from ICU's utf_impl.c */ |
788 | static const int32_t utf8_minLegal[4] = { 0, 0X80, 0x800, 0x10000 }; |
789 | |
790 | /* |
791 | * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point |
792 | * |
793 | * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c); |
794 | * verified to produce the same results (adusted for the difference in API signature). |
795 | * |
796 | * assumes at entry that: |
797 | * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character |
798 | * has been read, and its value is in u32char |
799 | * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly |
800 | * points to the trail bytes for that UTF8 char. |
801 | * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer) |
802 | * |
803 | * For a valid and complete UTF8 character, the function returns its value and advances |
804 | * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1 |
805 | * (and the value in *srcPtr is undefined). |
806 | * Note that while it does not map to surrogate values (generates an error for malformed |
807 | * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values |
808 | * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error. |
809 | * |
810 | * equivalences used in adapted ICU code: |
811 | * UChar = uint16_t |
812 | * UChar32 = int32_t |
813 | * |
814 | * This has been validated against ICU behavior. |
815 | */ |
816 | STATIC_UNLESS_TEST |
817 | int32_t |
818 | utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit) |
819 | { |
820 | const char* src = *srcPtr; |
821 | uint8_t pt1, pt2; |
822 | if (0xE0 < u32char && u32char <= 0xEC && src + 1 < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F && (pt2 = (uint8_t)(src[1] - 0x80)) <= 0x3F) { |
823 | /* handle U+1000..U+CFFF */ |
824 | /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */ |
825 | u32char = (uint16_t)((u32char << 12) | (pt1 << 6) | pt2); |
826 | src += 2; |
827 | } else if (u32char < 0xE0 && u32char >= 0xC2 && src < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F) { |
828 | /* handle U+0080..U+07FF */ |
829 | u32char = ((u32char & 0x1F) << 6) | pt1; |
830 | src++; |
831 | } else { |
832 | /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */ |
833 | uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char); |
834 | if (src + count <= srcLimit) { |
835 | uint8_t trail; |
836 | |
837 | U8_MASK_LEAD_BYTE_LOC(u32char, count); |
838 | switch (count) { |
839 | /* branches 3, 2 fall through to the next one */ |
840 | case 0: /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */ |
841 | case 5: |
842 | case 4: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ |
843 | break; |
844 | case 3: |
845 | trail = *src++ - (char)0X80; |
846 | u32char = (u32char << 6) | trail; |
847 | /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */ |
848 | if (u32char >= 0x110 || trail > 0X3F) { |
849 | break; |
850 | } |
851 | case 2: |
852 | trail = *src++ - (char)0X80; |
853 | u32char = (u32char << 6) | trail; |
854 | /* |
855 | * test for a surrogate D800..DFFF: |
856 | * before the last (u32char<<6), a surrogate is u32char=360..37F |
857 | */ |
858 | if (((u32char & 0xFFE0) == 0x360) || trail > 0X3F) { |
859 | break; |
860 | } |
861 | case 1: |
862 | trail = *src++ - (char)0X80; |
863 | u32char = (u32char << 6) | trail; |
864 | if (trail > 0X3F) { |
865 | break; |
866 | } |
867 | /* correct sequence - all trail bytes have (b7..b6)==(10) */ |
868 | if (u32char >= utf8_minLegal[count]) { |
869 | *srcPtr = src; |
870 | return u32char; |
871 | } |
872 | /* no default branch to optimize switch() - all values are covered */ |
873 | } |
874 | } |
875 | u32char = -1; |
876 | } |
877 | *srcPtr = src; |
878 | return u32char; |
879 | } |
880 | |
881 | /* |
882 | * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result |
883 | * and the combining classes for each resulting char, or indicate it is invalid. |
884 | * |
885 | * The normalized and case-folded result might be up to 4 UTF32 characters (current |
886 | * max, could change in the future). |
887 | * |
888 | * u32char - input UTF32 code point |
889 | * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only |
890 | * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
891 | * to receive the normalize result. |
892 | * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
893 | * to receive the combining classes for the characters in u32NormFoldBuf. If |
894 | * the first entry has non-zero combining class, the remaining entries do too. |
895 | * |
896 | * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax |
897 | * is insufficient (though it is assumed to be at least 3), else the length of the |
898 | * normalized and case-folded result (currently in the range 1..4). |
899 | * |
900 | * This has been validated against ICU behavior. |
901 | * |
902 | * This function is highly dependent on the structure of the data trie; for details on |
903 | * that structure, see comments in normalizeCaseFoldData.h |
904 | */ |
905 | STATIC_UNLESS_TEST |
906 | int32_t |
907 | normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, |
908 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], |
909 | uint8_t combClass[kNFCSingleCharDecompMax]) |
910 | { |
911 | combClass[0] = 0; |
912 | /* return hi-range PUA as self, except non-characters */ |
913 | if (u32char >= kU32HiPUAStart) { |
914 | if ((u32char & 0xFFFE) == 0xFFFE) { |
915 | return -1; |
916 | } |
917 | u32NormFoldBuf[0] = u32char; |
918 | return 1; |
919 | } |
920 | /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ |
921 | /* 0 - 0x323FF; everything in between in currently invalid. */ |
922 | int32_t u32charLookup = u32char; |
923 | if (u32charLookup >= kU32LowRangeLimit) { |
924 | u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); |
925 | if (u32charLookup < kU32LowRangeLimit || u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) { |
926 | return -1; /* in the large range of currently-unassigned code points */ |
927 | } |
928 | } |
929 | /* Now we have u32charLookup either in 0..0x323FF representing u32char itself, |
930 | * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in |
931 | * the trie that identifies unassigneds in this range, or maps others to |
932 | * decomps or combining class or just self. */ |
933 | uint16_t trieValue; |
934 | /* TrieHi */ |
935 | trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; |
936 | if (trieValue == kInvalidCodeFlag) { |
937 | return -1; |
938 | } |
939 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ |
940 | u32NormFoldBuf[0] = u32char; |
941 | combClass[0] = trieValue & kFlagValueMask; |
942 | return 1; |
943 | } |
944 | if (trieValue == kHangulMask) { |
945 | combClass[1] = combClass[2] = 0; |
946 | return decomposeHangul(u32char, u32NormFoldBuf); |
947 | } |
948 | /* TrieMid */ |
949 | trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; |
950 | if (trieValue == kInvalidCodeFlag) { |
951 | return -1; |
952 | } |
953 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { |
954 | u32NormFoldBuf[0] = u32char; |
955 | combClass[0] = trieValue & kFlagValueMask; |
956 | return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf); |
957 | } |
958 | if ((trieValue & kFlagTestMask) == kInvMaskFlag) { |
959 | uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask]; |
960 | uint16_t testBit = (uint16_t)(1 << (u32charLookup & kNFTrieLoMask)); |
961 | if (testBit & invalidMask) { |
962 | /* invalid */ |
963 | return -1; |
964 | } else { |
965 | /* treat like trieValue == 0 above */ |
966 | u32NormFoldBuf[0] = u32char; |
967 | return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf); |
968 | } |
969 | } |
970 | if (trieValue == kHangulMask) { |
971 | combClass[1] = combClass[2] = 0; |
972 | return decomposeHangul(u32char, u32NormFoldBuf); |
973 | } |
974 | /* TrieLo */ |
975 | trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask]; |
976 | if (trieValue == kInvalidCodeFlag) { |
977 | return -1; |
978 | } |
979 | if (trieValue == kHangulMask) { |
980 | combClass[1] = combClass[2] = 0; |
981 | return decomposeHangul(u32char, u32NormFoldBuf); |
982 | } |
983 | if (trieValue < kToU16Seq2Mask || trieValue > kSpecialsEnd) { |
984 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { |
985 | u32NormFoldBuf[0] = u32char; |
986 | combClass[0] = trieValue & kFlagValueMask; |
987 | } else { |
988 | u32NormFoldBuf[0] = trieValue; |
989 | } |
990 | return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf); |
991 | } |
992 | const uint16_t* u16SeqPtr = NULL; |
993 | const int32_t* u32SeqPtr = NULL; |
994 | int32_t uSeqLen = 0; |
995 | switch (trieValue & kSpecialsMask) { |
996 | case kToU16Seq2Mask: |
997 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { |
998 | /* don't use the mapping, it is only for case folding */ |
999 | u32NormFoldBuf[0] = u32char; |
1000 | /* already have combClass[0] = 0 */ |
1001 | return 1; |
1002 | } |
1003 | u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask]; |
1004 | uSeqLen = 2; |
1005 | break; |
1006 | case kToU16Seq3Mask: |
1007 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { |
1008 | /* don't use the mapping, it is only for case folding */ |
1009 | u32NormFoldBuf[0] = u32char; |
1010 | /* already have combClass[0] = 0 */ |
1011 | return 1; |
1012 | } |
1013 | u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask]; |
1014 | uSeqLen = 3; |
1015 | break; |
1016 | case kToU16SeqMiscMask: |
1017 | u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask]; |
1018 | uSeqLen = *u16SeqPtr & kToSeqMiscLenMask; |
1019 | combClass[0] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift); |
1020 | break; |
1021 | case kToU32CharMask: |
1022 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { |
1023 | /* don't use the mapping, it is only for case folding */ |
1024 | u32NormFoldBuf[0] = u32char; |
1025 | /* already have combClass[0] = 0 */ |
1026 | return 1; |
1027 | } |
1028 | u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask]; |
1029 | uSeqLen = 1; |
1030 | break; |
1031 | case kToU32SeqMiscMask: |
1032 | u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask]; |
1033 | uSeqLen = *u32SeqPtr & kToSeqMiscLenMask; |
1034 | combClass[0] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift); |
1035 | break; |
1036 | default: |
1037 | return -1; |
1038 | } |
1039 | if (kNFCSingleCharDecompMax < uSeqLen) { |
1040 | return 0; |
1041 | } |
1042 | int32_t idx; |
1043 | for (idx = 0; idx < uSeqLen; idx++) { |
1044 | u32NormFoldBuf[idx] = (u16SeqPtr)? *u16SeqPtr++: *u32SeqPtr++; |
1045 | if (idx > 0) { |
1046 | combClass[idx] = getCombClassU32Char(u32char: u32NormFoldBuf[idx]); |
1047 | } |
1048 | } |
1049 | return adjustCase(case_sens, uSeqLen, u32NormFoldBuf); |
1050 | } |
1051 | |
1052 | /* |
1053 | * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding |
1054 | * |
1055 | * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only |
1056 | * uSeqLen - length of the sequence specified in the u32NormFoldBuf |
1057 | * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
1058 | * with normalized result. |
1059 | * |
1060 | * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax |
1061 | * is insufficient (though it is assumed to be at least 3), else the length of the |
1062 | * normalized and case-folded result (currently in the range 1..4). |
1063 | * |
1064 | * This function is a reduced version of normalizeOptCaseFoldU32Char above. |
1065 | */ |
1066 | |
1067 | static int32_t |
1068 | adjustCase(bool case_sens, int32_t uSeqLen, |
1069 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) |
1070 | { |
1071 | if (!case_sens && uSeqLen > 0) { |
1072 | if (u32NormFoldBuf[0] < kSimpleCaseFoldLimit) { |
1073 | u32NormFoldBuf[0] = nfBasicCF[u32NormFoldBuf[0]]; |
1074 | /* There is one case in which this maps to a character with different combining |
1075 | * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the |
1076 | * first or only character in the sequence, we want to keep the original |
1077 | * combining class, so nothing special to do here. |
1078 | */ |
1079 | } |
1080 | /* The following is the only case where we have a casefolding after the first |
1081 | * character in the sequence. Don't worry about combining class here. that gets |
1082 | * set later for characters after the first. |
1083 | */ |
1084 | if (uSeqLen > 1 && u32NormFoldBuf[uSeqLen - 1] == 0x0345) { |
1085 | u32NormFoldBuf[uSeqLen - 1] = 0x03B9; |
1086 | } |
1087 | } |
1088 | return uSeqLen; |
1089 | } |
1090 | |
1091 | /* |
1092 | * getCombClassU32Char, map a single character (in UTF32 form) to its combining class. |
1093 | * |
1094 | * u32char - input UTF32 code point. This is assumed to be a valid character that does |
1095 | * not have a decomposition. |
1096 | * |
1097 | * returns combining class of the character. |
1098 | * |
1099 | * This is only called for characters after the first is a decomposition expansion. In |
1100 | * this situation, if we encounter U+03B9 (combining class 0), it is only there as the |
1101 | * case-folding of U+0345 (combining class 240). In this case it is the combining class |
1102 | * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all. |
1103 | * |
1104 | * This function is a reduced version of normalizeOptCaseFoldU32Char above. |
1105 | */ |
1106 | static uint8_t |
1107 | getCombClassU32Char(int32_t u32char) |
1108 | { |
1109 | if (u32char >= kU32HiPUAStart) { |
1110 | return 0; |
1111 | } |
1112 | if (u32char == 0x03B9) { |
1113 | return 240; |
1114 | } |
1115 | /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ |
1116 | /* 0 - 0x323FF; everything in between in currently invalid. */ |
1117 | int32_t u32charLookup = u32char; |
1118 | if (u32charLookup >= kU32LowRangeLimit) { |
1119 | u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); |
1120 | } |
1121 | /* Now we have u32charLookup either in 0..0x323FF representing u32char itself, |
1122 | * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in |
1123 | * the trie that identifies unassigneds in this range, or maps others to |
1124 | * decomps or combining class or just self. */ |
1125 | uint16_t trieValue; |
1126 | /* TrieHi */ |
1127 | trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; |
1128 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { |
1129 | return trieValue & kFlagValueMask; |
1130 | } |
1131 | /* TrieMid */ |
1132 | trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; |
1133 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ |
1134 | return trieValue & kFlagValueMask; |
1135 | } |
1136 | if ((trieValue & kFlagTestMask) == kInvMaskFlag) { |
1137 | return 0; |
1138 | } |
1139 | /* TrieLo */ |
1140 | trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask]; |
1141 | return ((trieValue & kFlagTestMask) == kCombClassFlag)? (trieValue & kFlagValueMask): 0; |
1142 | } |
1143 | |
1144 | /* |
1145 | * decomposeHangul, map a single UTF32 code point for a composed Hangul |
1146 | * in the range AC00-D7A3, using algorithmic decomp |
1147 | * |
1148 | * The normalized result will be 2 or 3 UTF32 characters. |
1149 | * |
1150 | * u32char - input UTF32 code point |
1151 | * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
1152 | * to receive the normalize result. |
1153 | * |
1154 | * returns the length of the normalized result (2..3). |
1155 | * |
1156 | * Adapted from ICU Hangul:decompose in normalizer2impl.h |
1157 | * |
1158 | */ |
1159 | |
1160 | enum { |
1161 | HANGUL_BASE=0xAC00, |
1162 | JAMO_L_BASE=0x1100, /* "lead" jamo */ |
1163 | JAMO_V_BASE=0x1161, /* "vowel" jamo */ |
1164 | JAMO_T_BASE=0x11A7, /* "trail" jamo */ |
1165 | JAMO_L_COUNT=19, |
1166 | JAMO_V_COUNT=21, |
1167 | JAMO_T_COUNT=28, |
1168 | }; |
1169 | |
1170 | static int32_t |
1171 | decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) |
1172 | { |
1173 | u32char -= HANGUL_BASE; |
1174 | int32_t tIndex = u32char % JAMO_T_COUNT; |
1175 | u32char /= JAMO_T_COUNT; |
1176 | u32NormFoldBuf[0] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT); |
1177 | u32NormFoldBuf[1] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT); |
1178 | if (tIndex == 0) { |
1179 | return 2; |
1180 | } |
1181 | u32NormFoldBuf[2] = (uint16_t)(JAMO_T_BASE + tIndex); |
1182 | return 3; |
1183 | } |
1184 | |