| 1 | /* |
| 2 | * Copyright (C) 2016-2023 Apple, Inc. All rights reserved. |
| 3 | * Some portions covered by other copyrights, listed below. |
| 4 | *--- |
| 5 | * Copyright (C) 2016 and later: Unicode, Inc. and others. |
| 6 | * License & terms of use: http://www.unicode.org/copyright.html |
| 7 | *--- |
| 8 | * Copyright (C) 1999-2015, International Business Machines |
| 9 | * Corporation and others. All Rights Reserved. |
| 10 | * |
| 11 | * add APPLE_OSREFERENCE_LICENSE_HEADER stuff... |
| 12 | */ |
| 13 | |
| 14 | #include <libkern/libkern.h> |
| 15 | #include <sys/errno.h> |
| 16 | #include <sys/unicode.h> |
| 17 | #include "vfs_unicode_data.h" |
| 18 | #define STATIC_UNLESS_TEST static |
| 19 | |
| 20 | enum { |
| 21 | /* Maximum number of UTF8 bytes from one Unicode code point (one UTF32 code unit) */ |
| 22 | kMaxUTF8BytesPerChar = 4 |
| 23 | }; |
| 24 | |
| 25 | /* local prototypes used by exported functions (and themselves exported for testing) */ |
| 26 | STATIC_UNLESS_TEST |
| 27 | int32_t utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit); |
| 28 | STATIC_UNLESS_TEST |
| 29 | int32_t normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, |
| 30 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], |
| 31 | uint8_t combClass[kNFCSingleCharDecompMax]); |
| 32 | /* local prototypes used by exported functions (not exported for separate testing) */ |
| 33 | static int nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes, |
| 34 | int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP, |
| 35 | int32_t* buf, uint8_t* bufcc, int32_t* buflenP, |
| 36 | bool* needReorderP, bool* startP); |
| 37 | void doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen); |
| 38 | int32_t u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]); |
| 39 | |
| 40 | /* |
| 41 | * utf8_normalizeOptCaseFoldGetUVersion |
| 42 | * |
| 43 | * version[0] = Unicode major version; for Unicode 6.3.0 this would be 6 |
| 44 | * version[1] = Unicode minor version; for Unicode 6.3.0 this would be 3 |
| 45 | * version[2] = Unicode patch version; for Unicode 6.3.0 this would be 0 |
| 46 | * version[3] = Code revision level; for any given Unicode version, this value starts |
| 47 | * at 0 and is incremented for each significant revision to the |
| 48 | * normalizeOptCaseFold functions. |
| 49 | */ |
| 50 | void |
| 51 | utf8_normalizeOptCaseFoldGetUVersion(unsigned char version[4]) |
| 52 | { |
| 53 | version[0] = 15; |
| 54 | version[1] = 1; |
| 55 | version[2] = 0; |
| 56 | version[3] = 0; |
| 57 | return; |
| 58 | } |
| 59 | |
| 60 | /* |
| 61 | * utf8_normalizeOptCaseFoldAndHash |
| 62 | * |
| 63 | * str: The input UTF-8 string (need not be 0 terminated) |
| 64 | * str_len: The byte length of the input string (excluding any 0 terminator) |
| 65 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
| 66 | * True for case-sensitive behavior; generates standard NFD. |
| 67 | * hash_func: A pointer to a hashing function to compute the hash of the |
| 68 | * normalized/case-folded result. buf contains buf_len bytes |
| 69 | * of data to be added to the hash using the caller-supplied |
| 70 | * context (ctx). |
| 71 | * hash_ctx: The context for the hash function. |
| 72 | * |
| 73 | * Returns: 0 on success, or |
| 74 | * EILSEQ: The input string contains illegal ASCII-range characters |
| 75 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or |
| 76 | * contains codepoints that are non-characters or unassigned in |
| 77 | * the version of Unicode currently supported (Unicode 9.0). |
| 78 | */ |
| 79 | |
| 80 | int |
| 81 | utf8_normalizeOptCaseFoldAndHash(const char *str, |
| 82 | size_t str_len, |
| 83 | bool case_sens, |
| 84 | void (*hash_func)(void *buf, size_t buf_len, void *ctx), |
| 85 | void *hash_ctx) |
| 86 | { |
| 87 | const char *strLimit = str + str_len; |
| 88 | |
| 89 | /* Data for the next pending single-char norm from input; |
| 90 | * This will always begin with a base char (combining class 0) |
| 91 | * or the first character in the string, which may no be a base */ |
| 92 | int32_t unorm[kNFCSingleCharDecompMax]; |
| 93 | uint8_t unormcc[kNFCSingleCharDecompMax]; |
| 94 | int32_t unormlen = 0; |
| 95 | int32_t unormstart = 0; |
| 96 | |
| 97 | bool start = true; |
| 98 | |
| 99 | /* main loop: |
| 100 | * Each input character may be normalized to a sequence of one or more characters, |
| 101 | * some of which may have non-zero combining class. Any sequence of characters |
| 102 | * with non-zero combining class resulting from one or more input characters needs |
| 103 | * to be accumulated in the main buffer so we can reorder as necessary before |
| 104 | * calling the hash function. |
| 105 | * |
| 106 | * At the beginning of the main loop: The normalization buffer and main buffer are |
| 107 | * both empty. |
| 108 | * |
| 109 | * Each time through the main loop we do the following: |
| 110 | * 1. If there are characters available in the normalization result buffer (from the |
| 111 | * result of normalizing a previous input character), copy the first character and |
| 112 | * any following characters that have non-zero combining class to the main buffer. |
| 113 | * 2. If there is nothing left in the normalization buffer, then loop processing |
| 114 | * input characters as follows: |
| 115 | * a) Get the next input character from UTF8, get its normalized and case-folded |
| 116 | * result in the normalization buffer. |
| 117 | * b) If the first character in the normalization buffer has combining class 0, |
| 118 | * break; we will handle this normalization buffer next time through the main |
| 119 | * loop. |
| 120 | * c) Else copy the current normalization buffer (which has only combining marks) |
| 121 | * to the main buffer, and continue with the loop processing input characters. |
| 122 | * 3. At this point the first character in the main buffer may or may not have |
| 123 | * combining class 0, but any subsequent characters (up to the the limit for |
| 124 | * stream safe text) will be combining characters with nonzero combining class. |
| 125 | * Reorder the combining marks if necessary into canonical order. |
| 126 | * 4. Call the hash function for each character in the main buffer. |
| 127 | * |
| 128 | */ |
| 129 | do { |
| 130 | /* Data for the buffers being built up from input */ |
| 131 | int32_t buf[kNCFStreamSafeBufMax]; |
| 132 | uint8_t bufcc[kNCFStreamSafeBufMax]; |
| 133 | int32_t buflen = 0; |
| 134 | bool needReorder = false; |
| 135 | int err; |
| 136 | |
| 137 | err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false /* allow_slashes */, |
| 138 | unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start); |
| 139 | if (err != 0) { |
| 140 | return err; |
| 141 | } |
| 142 | |
| 143 | if (buflen > 0) { |
| 144 | /* Now buffer should have all of the combining marks up to the next base char. |
| 145 | * Normally it will also start with the last base char encountered (unless the |
| 146 | * UTF8 string began with a combining mark). */ |
| 147 | /* Now reorder combining marks if necessary. */ |
| 148 | if (needReorder) { |
| 149 | doReorder(buf, bufcc, buflen); |
| 150 | } |
| 151 | /* Now write to hash func */ |
| 152 | hash_func(buf, buflen * sizeof(buf[0]), hash_ctx); |
| 153 | } |
| 154 | /* OK so far, top of loop clears buffers to start refilling again */ |
| 155 | } while (str < strLimit || unormlen > 0); |
| 156 | return 0; |
| 157 | } |
| 158 | |
| 159 | /* |
| 160 | * utf8_normalizeOptCaseFoldAndCompare |
| 161 | * |
| 162 | * strA: A UTF-8 string to be compared (need not be 0 terminated) |
| 163 | * strA_len: The byte length of strA (excluding any 0 terminator) |
| 164 | * strB: The second UTF-8 string to be compared (need not be 0 terminated) |
| 165 | * strB_len: The byte length of strB (excluding any 0 terminator) |
| 166 | * case_sens: False for case-insensitive behavior; compares canonical caseless forms. |
| 167 | * True for case-sensitive behavior; compares standard NFD forms. |
| 168 | * are_equal: On success, set to true if the strings are equal, or set to false |
| 169 | * if they are not. |
| 170 | * |
| 171 | * Returns: 0 on success, or |
| 172 | * EILSEQ: One or both of the input strings contains illegal ASCII-range |
| 173 | * characters (0x00 or '/'), or is not well-formed stream-safe UTF-8, |
| 174 | * or contains codepoints that are non-characters or unassigned in |
| 175 | * the version of Unicode currently supported (Unicode 9.0). |
| 176 | * Note: The comparison may terminate early when a difference is |
| 177 | * detected, and may return 0 and set *are_equal=false even |
| 178 | * if one or both strings are invalid. |
| 179 | */ |
| 180 | enum { kNFCSingleCharDecompMaxPlusPushback = kNFCSingleCharDecompMax + 4 }; /* room for 03B9 pushback(s) */ |
| 181 | |
| 182 | int |
| 183 | utf8_normalizeOptCaseFoldAndCompare(const char *strA, |
| 184 | size_t strA_len, |
| 185 | const char *strB, |
| 186 | size_t strB_len, |
| 187 | bool case_sens, |
| 188 | bool *are_equal) |
| 189 | { |
| 190 | const char *strALimit = strA + strA_len; |
| 191 | const char *strBLimit = strB + strB_len; |
| 192 | |
| 193 | /* Data for the next pending single-char norms from each input; |
| 194 | * These will always begin with a base char (combining class 0) |
| 195 | * or the first character in the string, which may not be a base */ |
| 196 | int32_t unormA[kNFCSingleCharDecompMaxPlusPushback], unormB[kNFCSingleCharDecompMaxPlusPushback]; |
| 197 | uint8_t unormAcc[kNFCSingleCharDecompMaxPlusPushback], unormBcc[kNFCSingleCharDecompMaxPlusPushback]; |
| 198 | int32_t unormAlen = 0, unormBlen = 0; |
| 199 | int32_t unormAstart = 0, unormBstart = 0; |
| 200 | |
| 201 | bool startA = true, startB = true; |
| 202 | |
| 203 | /* main loop: |
| 204 | * The main loop here is similar to the main loop in utf8_normalizeOptCaseFoldAndHash, |
| 205 | * described above. The differences are: |
| 206 | * - We keep a normalization buffer and main buffer for each string. |
| 207 | * - In the main loop, we do steps 1-3 for each string. |
| 208 | * - In step 4, instead of calling the hash function, we compare the two main |
| 209 | * buffers; if they are unequal, we return a non-equal result. |
| 210 | * - After the end of the main loop, if we still have data for one string but |
| 211 | * not the other, return a non-equal result, else return an equal result. |
| 212 | */ |
| 213 | do { |
| 214 | /* Data for the buffers being built up from each input */ |
| 215 | int32_t bufA[kNCFStreamSafeBufMax], bufB[kNCFStreamSafeBufMax]; |
| 216 | uint8_t bufAcc[kNCFStreamSafeBufMax], bufBcc[kNCFStreamSafeBufMax]; |
| 217 | int32_t bufAlen = 0, bufBlen = 0; |
| 218 | bool needReorderA = false, needReorderB = false; |
| 219 | int err; |
| 220 | |
| 221 | err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false /* allow_slashes */, |
| 222 | unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA); |
| 223 | if (err != 0) { |
| 224 | return err; |
| 225 | } |
| 226 | err = nextBaseAndAnyMarks(strP: &strB, strLimit: strBLimit, case_sens, false /* allow_slashes */, |
| 227 | unorm: unormB, unormcc: unormBcc, unormlenP: &unormBlen, unormstartP: &unormBstart, buf: bufB, bufcc: bufBcc, buflenP: &bufBlen, needReorderP: &needReorderB, startP: &startB); |
| 228 | if (err != 0) { |
| 229 | return err; |
| 230 | } |
| 231 | |
| 232 | if (bufAlen > 0 || bufBlen > 0) { |
| 233 | /* Now each buffer should have all of the combining marks up to the next base char. |
| 234 | * Normally it will also start with the last base char encountered (unless the |
| 235 | * UTF8 string began with a combining mark). */ |
| 236 | /* Now reorder combining marks if necessary. */ |
| 237 | if (needReorderA) { |
| 238 | doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen); |
| 239 | } |
| 240 | if (needReorderB) { |
| 241 | doReorder(buf: bufB, bufcc: bufBcc, buflen: bufBlen); |
| 242 | } |
| 243 | /* handle 03B9 pushback */ |
| 244 | int32_t idx; |
| 245 | if (!case_sens) { |
| 246 | if (bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9 && unormAstart == 0) { |
| 247 | int32_t tailCount = 0; |
| 248 | while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormAlen && bufAlen > 1 && bufA[bufAlen - 1] == 0x03B9) { |
| 249 | tailCount++; |
| 250 | bufAlen--; |
| 251 | } |
| 252 | for (idx = unormAlen; idx > 0; idx--) { |
| 253 | unormA[idx - 1 + tailCount] = unormA[idx - 1]; |
| 254 | unormAcc[idx - 1 + tailCount] = unormAcc[idx - 1]; |
| 255 | } |
| 256 | for (idx = 0; idx < tailCount; idx++) { |
| 257 | unormA[idx] = 0x03B9; |
| 258 | unormAcc[idx] = 0; |
| 259 | } |
| 260 | unormAlen += tailCount; |
| 261 | } |
| 262 | if (bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9 && unormBstart == 0) { |
| 263 | int32_t tailCount = 0; |
| 264 | while (tailCount < kNFCSingleCharDecompMaxPlusPushback - unormBlen && bufBlen > 1 && bufB[bufBlen - 1] == 0x03B9) { |
| 265 | tailCount++; |
| 266 | bufBlen--; |
| 267 | } |
| 268 | for (idx = unormBlen; idx > 0; idx--) { |
| 269 | unormB[idx - 1 + tailCount] = unormB[idx - 1]; |
| 270 | unormBcc[idx - 1 + tailCount] = unormBcc[idx - 1]; |
| 271 | } |
| 272 | for (idx = 0; idx < tailCount; idx++) { |
| 273 | unormB[idx] = 0x03B9; |
| 274 | unormBcc[idx] = 0; |
| 275 | } |
| 276 | unormBlen += tailCount; |
| 277 | } |
| 278 | } |
| 279 | /* Now compare the buffers. */ |
| 280 | if (bufAlen != bufBlen || memcmp(s1: bufA, s2: bufB, n: bufAlen * sizeof(bufA[0])) != 0) { |
| 281 | *are_equal = false; |
| 282 | return 0; |
| 283 | } |
| 284 | } |
| 285 | /* OK so far, top of loop clears buffers to start refilling again */ |
| 286 | } while ((strA < strALimit || unormAlen > 0) && (strB < strBLimit || unormBlen > 0)); |
| 287 | |
| 288 | *are_equal = (strA == strALimit && unormAlen == 0 && strB == strBLimit && unormBlen == 0); |
| 289 | return 0; |
| 290 | } |
| 291 | |
| 292 | /* |
| 293 | * utf8_normalizeOptCaseFold |
| 294 | * |
| 295 | * str: The input UTF-8 string (need not be 0 terminated) |
| 296 | * str_len: The byte length of the input string (excluding any 0 terminator) |
| 297 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
| 298 | * True for case-sensitive behavior; generates standard NFD. |
| 299 | * ustr: A pointer to a buffer for the resulting UTF-32 string. |
| 300 | * ustr_size: The capacity of ustr, in UTF-32 units. |
| 301 | * ustr_len: Pointer to a value that will be filled in with the actual length |
| 302 | * in UTF-32 units of the string copied to ustr. |
| 303 | * |
| 304 | * Returns: 0 on success, or |
| 305 | * EILSEQ: The input string contains illegal ASCII-range characters |
| 306 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or |
| 307 | * contains codepoints that are non-characters or unassigned in |
| 308 | * the version of Unicode currently supported. |
| 309 | * ENOMEM: ustr_size is insufficient for the resulting string. In this |
| 310 | * case the value returned in *ustr_len is invalid. |
| 311 | */ |
| 312 | int |
| 313 | utf8_normalizeOptCaseFold(const char *str, |
| 314 | size_t str_len, |
| 315 | bool case_sens, |
| 316 | int32_t *ustr, |
| 317 | int32_t ustr_size, |
| 318 | int32_t *ustr_len) |
| 319 | { |
| 320 | const char *strLimit = str + str_len; |
| 321 | int32_t *ustrCur = ustr; |
| 322 | const int32_t *ustrLimit = ustr + ustr_size; |
| 323 | |
| 324 | /* Data for the next pending single-char norm from input; |
| 325 | * This will always begin with a base char (combining class 0) */ |
| 326 | int32_t unorm[kNFCSingleCharDecompMax]; |
| 327 | uint8_t unormcc[kNFCSingleCharDecompMax]; |
| 328 | int32_t unormlen = 0; |
| 329 | int32_t unormstart = 0; |
| 330 | |
| 331 | bool start = true; |
| 332 | |
| 333 | *ustr_len = 0; |
| 334 | do { |
| 335 | /* Data for the buffers being built up from input */ |
| 336 | int32_t buf[kNCFStreamSafeBufMax]; |
| 337 | uint8_t bufcc[kNCFStreamSafeBufMax]; |
| 338 | int32_t buflen = 0; |
| 339 | bool needReorder = false; |
| 340 | int err; |
| 341 | |
| 342 | err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, false /* allow_slashes */, |
| 343 | unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start); |
| 344 | if (err != 0) { |
| 345 | return err; |
| 346 | } |
| 347 | |
| 348 | if (buflen > 0) { |
| 349 | if (needReorder) { |
| 350 | doReorder(buf, bufcc, buflen); |
| 351 | } |
| 352 | /* Now copy to output buffer */ |
| 353 | int32_t idx; |
| 354 | if (ustrCur + buflen > ustrLimit) { |
| 355 | return ENOMEM; |
| 356 | } |
| 357 | for (idx = 0; idx < buflen; idx++) { |
| 358 | *ustrCur++ = buf[idx]; |
| 359 | } |
| 360 | } |
| 361 | /* OK so far, top of loop clears buffers to start refilling again */ |
| 362 | } while (str < strLimit || unormlen > 0); |
| 363 | *ustr_len = (uint32_t)(ustrCur - ustr); // XXXpjr: the explicit (uint32_t) cast wasn't present in the original code drop |
| 364 | return 0; |
| 365 | } |
| 366 | |
| 367 | static int |
| 368 | utf8_normalizeOptCaseFoldToUTF8_internal(const char *str, |
| 369 | size_t str_len, |
| 370 | bool case_sens, |
| 371 | bool allow_slashes, |
| 372 | char *ustr, |
| 373 | size_t ustr_size, |
| 374 | size_t *ustr_len) |
| 375 | { |
| 376 | const char *strLimit = str + str_len; |
| 377 | char *ustrCur = ustr; |
| 378 | const char *ustrLimit = ustr + ustr_size; |
| 379 | |
| 380 | /* Data for the next pending single-char norm from input; |
| 381 | * This will always begin with a base char (combining class 0) */ |
| 382 | int32_t unorm[kNFCSingleCharDecompMax]; |
| 383 | uint8_t unormcc[kNFCSingleCharDecompMax]; |
| 384 | int32_t unormlen = 0; |
| 385 | int32_t unormstart = 0; |
| 386 | |
| 387 | bool start = true; |
| 388 | |
| 389 | *ustr_len = 0; |
| 390 | do { |
| 391 | /* Data for the buffers being built up from input */ |
| 392 | int32_t buf[kNCFStreamSafeBufMax]; |
| 393 | uint8_t bufcc[kNCFStreamSafeBufMax]; |
| 394 | int32_t buflen = 0; |
| 395 | bool needReorder = false; |
| 396 | int err; |
| 397 | |
| 398 | err = nextBaseAndAnyMarks(strP: &str, strLimit, case_sens, allow_slashes, |
| 399 | unorm, unormcc, unormlenP: &unormlen, unormstartP: &unormstart, buf, bufcc, buflenP: &buflen, needReorderP: &needReorder, startP: &start); |
| 400 | if (err != 0) { |
| 401 | return err; |
| 402 | } |
| 403 | |
| 404 | if (buflen > 0) { |
| 405 | uint8_t utf8Bytes[kMaxUTF8BytesPerChar]; |
| 406 | int32_t *bufPtr = buf; |
| 407 | if (needReorder) { |
| 408 | doReorder(buf, bufcc, buflen); |
| 409 | } |
| 410 | /* Now copy to output buffer */ |
| 411 | while (buflen-- > 0) { |
| 412 | int32_t idx, utf8Len = u32CharToUTF8Bytes(u32char: (uint32_t)*bufPtr++, utf8Bytes); |
| 413 | if (ustrCur + utf8Len > ustrLimit) { |
| 414 | return ENOMEM; |
| 415 | } |
| 416 | for (idx = 0; idx < utf8Len; idx++) { |
| 417 | *ustrCur++ = (char)utf8Bytes[idx]; |
| 418 | } |
| 419 | } |
| 420 | } |
| 421 | /* OK so far, top of loop clears buffers to start refilling again */ |
| 422 | } while (str < strLimit || unormlen > 0); |
| 423 | *ustr_len = ustrCur - ustr; |
| 424 | return 0; |
| 425 | } |
| 426 | |
| 427 | /* |
| 428 | * utf8_normalizeOptCaseFoldToUTF8 |
| 429 | * (This is similar to normalizeOptCaseFold except that this has a different output |
| 430 | * buffer type, and adds conversion to UTF8 while copying to output buffer) |
| 431 | * |
| 432 | * str: The input UTF-8 string (need not be 0 terminated) |
| 433 | * str_len: The byte length of the input string (excluding any 0 terminator) |
| 434 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
| 435 | * True for case-sensitive behavior; generates standard NFD. |
| 436 | * ustr: A pointer to a buffer for the resulting UTF-8 string. |
| 437 | * ustr_size: The capacity of ustr, in bytes. |
| 438 | * ustr_len: Pointer to a value that will be filled in with the actual length |
| 439 | * in bytes of the string copied to ustr. |
| 440 | * |
| 441 | * Returns: 0 on success, or |
| 442 | * EILSEQ: The input string contains illegal ASCII-range characters |
| 443 | * (0x00 or '/'), or is not well-formed stream-safe UTF-8, or |
| 444 | * contains codepoints that are non-characters or unassigned in |
| 445 | * the version of Unicode currently supported. |
| 446 | * ENOMEM: ustr_size is insufficient for the resulting string. In this |
| 447 | * case the value returned in *ustr_len is invalid. |
| 448 | */ |
| 449 | int |
| 450 | utf8_normalizeOptCaseFoldToUTF8(const char *str, |
| 451 | size_t str_len, |
| 452 | bool case_sens, |
| 453 | char *ustr, |
| 454 | size_t ustr_size, |
| 455 | size_t *ustr_len) |
| 456 | { |
| 457 | return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, false /* allow_slashes */, |
| 458 | ustr, ustr_size, ustr_len); |
| 459 | } |
| 460 | |
| 461 | /* |
| 462 | * utf8_normalizeOptCaseFoldToUTF8ForPath |
| 463 | * (This is similar to normalizeOptCaseFoldToUTF8 except that this allows '/' character.) |
| 464 | * |
| 465 | * str: The input UTF-8 path string |
| 466 | * str_len: The byte length of the input path string (excluding any 0 terminator) |
| 467 | * case_sens: False for case-insensitive behavior; generates canonical caseless form. |
| 468 | * True for case-sensitive behavior; generates standard NFD. |
| 469 | * ustr: A pointer to a buffer for the resulting UTF-8 string. |
| 470 | * ustr_size: The capacity of ustr, in bytes. |
| 471 | * ustr_len: Pointer to a value that will be filled in with the actual length |
| 472 | * in bytes of the string copied to ustr. |
| 473 | * |
| 474 | * Returns: 0 on success, or |
| 475 | * EILSEQ: The input string contains illegal ASCII-range characters |
| 476 | * (0x00), or is not well-formed stream-safe UTF-8, or |
| 477 | * contains codepoints that are non-characters or unassigned in |
| 478 | * the version of Unicode currently supported. |
| 479 | * ENOMEM: ustr_size is insufficient for the resulting string. In this |
| 480 | * case the value returned in *ustr_len is invalid. |
| 481 | */ |
| 482 | int |
| 483 | utf8_normalizeOptCaseFoldToUTF8ForPath(const char *str, |
| 484 | size_t str_len, |
| 485 | bool case_sens, |
| 486 | char *ustr, |
| 487 | size_t ustr_size, |
| 488 | size_t *ustr_len) |
| 489 | { |
| 490 | return utf8_normalizeOptCaseFoldToUTF8_internal(str, str_len, case_sens, true /* allow_slashes */, |
| 491 | ustr, ustr_size, ustr_len); |
| 492 | } |
| 493 | |
| 494 | /* |
| 495 | * utf8_normalizeOptCaseFoldAndMatchSubstring |
| 496 | * |
| 497 | * strA: A UTF-8 string (need not be 0 terminated) in which to search for the |
| 498 | * substring specified by ustrB. |
| 499 | * strA_len: The byte length of strA (excluding any 0 terminator) |
| 500 | * ustrB: A normalized UTF-32 substring (need not be 0 terminated) to be searched |
| 501 | * for in the UTF-32 string resulting from converting strA to the normalized |
| 502 | * UTF-32 form specified by the case_sens parameter; ustrB must already be |
| 503 | * in that form. |
| 504 | * ustrB_len: The length of ustrB in UTF-32 units (excluding any 0 terminator). |
| 505 | * case_sens: False for case-insensitive matching; compares canonical caseless forms. |
| 506 | * True for case-sensitive matching; compares standard NFD forms. |
| 507 | * buf: Pointer to caller-supplied working memory for storing the portion of |
| 508 | * strA which has been converted to normalized UTF-32. |
| 509 | * buf_size: The size of buf. |
| 510 | * has_match: On success, set to true if strA (when converter to UTF-32 and normalized |
| 511 | * per case_sens) contains ustrB, set to false otherwise. |
| 512 | * |
| 513 | * Returns: 0 on success, or |
| 514 | * EILSEQ: strA contains illegal ASCII-range characters (0x00 or '/'), or is |
| 515 | * not well-formed stream-safe UTF-8, or contains codepoints that are |
| 516 | * non-characters or unassigned in the version of Unicode currently |
| 517 | * supported. |
| 518 | * Note: The search may terminate early when a match is detected, and |
| 519 | * may return 0 and set *has_match=true even if strA is invalid. |
| 520 | * ENOMEM: buf_size is insufficient. |
| 521 | */ |
| 522 | int |
| 523 | utf8_normalizeOptCaseFoldAndMatchSubstring(const char *strA, |
| 524 | size_t strA_len, |
| 525 | const int32_t *ustrB, |
| 526 | int32_t ustrB_len, |
| 527 | bool case_sens, |
| 528 | void *buf, |
| 529 | size_t buf_size, |
| 530 | bool *has_match) |
| 531 | { |
| 532 | /* |
| 533 | * ustrA represents the current position in the UTF-32 normalized version of strA |
| 534 | * at which we want to test for a match; ustrANormEnd is the position beyond that |
| 535 | * which is just after the end of what has already been converted from strA to |
| 536 | * UTF-32 normalized form. |
| 537 | * Each time through the main loop: |
| 538 | * - The first task is to make sure we have enough of strA converted to UTF32 |
| 539 | * normalized form to test for match with ustrB at the current match position. |
| 540 | * If we don't, then convert more of strA to UTF-32 normalized form until we |
| 541 | * have enough to compare with ustrB. To do this, run a loop which is like the |
| 542 | * main loop in utf8_normalizeOptCaseFoldAndHash except that in step 4, instead of |
| 543 | * calling the hash function, we copy the normalized buffer to ustrANormEnd, |
| 544 | * advancing the latter. We keep doing this until we have enough additional |
| 545 | * converted to match with ustrB. |
| 546 | * - Then we test for match of ustrB at the current ustrA position. If there is |
| 547 | * a match we return; otherwise, if there is more strA to convert we advance |
| 548 | * ustrA and repeat the main loop, otherwise we return without a match. |
| 549 | */ |
| 550 | if (ustrB_len == 0) { /* always matches */ |
| 551 | *has_match = true; |
| 552 | return 0; |
| 553 | } |
| 554 | *has_match = false; /* initialize return value */ |
| 555 | if (ustrB_len > 2 * strA_len) { |
| 556 | /* If ustrB is clearly too long to find in strA, don't bother normalizing strA. |
| 557 | * A UTF-8 character of 1 byte (ASCII) will normalize to 1 UTF-32 unit. |
| 558 | * A UTF-8 character of 2-4 bytes will normalize to a maximum of 4 UTF-32 units. |
| 559 | * The maximum expansion from unnormalized UTF-8 byte length to normalized |
| 560 | * UTF-32 unit length is thus 2. */ |
| 561 | return 0; |
| 562 | } |
| 563 | |
| 564 | const char *strALimit = strA + strA_len; |
| 565 | int32_t *ustrA = (int32_t *)buf; |
| 566 | const int32_t *ustrALimit = ustrA + (buf_size / sizeof(int32_t)); |
| 567 | int32_t *ustrANormEnd = ustrA; /* how far we have already normalized in ustrA */ |
| 568 | |
| 569 | /* Data for the next pending single-char norms from each input; |
| 570 | * These will always begin with a base char (combining class 0) |
| 571 | * or the first character in the string, which may not be a base */ |
| 572 | int32_t unormA[kNFCSingleCharDecompMax]; |
| 573 | uint8_t unormAcc[kNFCSingleCharDecompMax]; |
| 574 | int32_t unormAlen = 0; |
| 575 | int32_t unormAstart = 0; |
| 576 | |
| 577 | bool startA = true; |
| 578 | |
| 579 | while (true) { |
| 580 | /* convert enough more of strA to normalized UTF-32 in ustrA to check for match */ |
| 581 | if (ustrANormEnd - ustrA < ustrB_len) { |
| 582 | do { |
| 583 | /* Data for the buffers being built up from each input */ |
| 584 | int32_t bufA[kNCFStreamSafeBufMax]; |
| 585 | uint8_t bufAcc[kNCFStreamSafeBufMax]; |
| 586 | int32_t bufAlen = 0; |
| 587 | bool needReorderA = false; |
| 588 | int err; |
| 589 | |
| 590 | err = nextBaseAndAnyMarks(strP: &strA, strLimit: strALimit, case_sens, false /* allow_slashes */, |
| 591 | unorm: unormA, unormcc: unormAcc, unormlenP: &unormAlen, unormstartP: &unormAstart, buf: bufA, bufcc: bufAcc, buflenP: &bufAlen, needReorderP: &needReorderA, startP: &startA); |
| 592 | if (err != 0) { |
| 593 | return err; |
| 594 | } |
| 595 | |
| 596 | if (bufAlen > 0) { |
| 597 | /* Now each buffer should have all of the combining marks up to the next base char. |
| 598 | * Normally it will also start with the last base char encountered (unless the |
| 599 | * UTF8 string began with a combining mark). */ |
| 600 | /* Now reorder combining marks if necessary. Should be rare, and sequences should |
| 601 | * usually be short when does occur => simple bubblesort should be sufficient. */ |
| 602 | if (needReorderA) { |
| 603 | doReorder(buf: bufA, bufcc: bufAcc, buflen: bufAlen); |
| 604 | } |
| 605 | /* Now copy to working buffer */ |
| 606 | int32_t idx; |
| 607 | if (ustrANormEnd + bufAlen > ustrALimit) { |
| 608 | return ENOMEM; |
| 609 | } |
| 610 | for (idx = 0; idx < bufAlen; idx++) { |
| 611 | *ustrANormEnd++ = bufA[idx]; |
| 612 | } |
| 613 | } |
| 614 | /* OK so far, top of loop clears buffers to start refilling again */ |
| 615 | } while ((ustrANormEnd - ustrA < ustrB_len) && (strA < strALimit || unormAlen > 0)); |
| 616 | } |
| 617 | |
| 618 | if (ustrANormEnd - ustrA < ustrB_len) { |
| 619 | return 0; /* not enough of strA left for match */ |
| 620 | } |
| 621 | /* check for match, return if so */ |
| 622 | if (memcmp(s1: ustrA, s2: ustrB, n: ustrB_len * sizeof(ustrB[0])) == 0) { |
| 623 | *has_match = true; |
| 624 | return 0; |
| 625 | } |
| 626 | ustrA++; /* advance match position */ |
| 627 | } |
| 628 | } |
| 629 | |
| 630 | /* nextBaseAndAnyMarks: |
| 631 | * Guts of code to get next bufferful of base character (or first char in string) |
| 632 | * and all trailing combining marks. |
| 633 | * This is called each time through the main loop of functions above, and does the |
| 634 | * following: |
| 635 | * 1. If there are characters available in the normalization result buffer (from the |
| 636 | * result of normalizing a previous input character), copy the first character and |
| 637 | * any following characters that have non-zero combining class to the main buffer. |
| 638 | * 2. If there is nothing left in the normalization buffer, then loop processing |
| 639 | * input characters as follows: |
| 640 | * a) Get the next input character from UTF8, get its normalized and case-folded |
| 641 | * result in the normalization buffer. |
| 642 | * b) If the first character in the normalization buffer has combining class 0, |
| 643 | * break; we will handle this normalization buffer next time through the main |
| 644 | * loop. |
| 645 | * c) Else copy the current normalization buffer (which has only combining marks) |
| 646 | * to the main buffer, and continue with the loop processing input characters. |
| 647 | */ |
| 648 | |
| 649 | static int |
| 650 | nextBaseAndAnyMarks(const char** strP, const char *strLimit, bool case_sens, bool allow_slashes, |
| 651 | int32_t* unorm, uint8_t* unormcc, int32_t* unormlenP, int32_t* unormstartP, |
| 652 | int32_t* buf, uint8_t* bufcc, int32_t* buflenP, |
| 653 | bool* needReorderP, bool* startP) |
| 654 | { |
| 655 | /* update buffers for str */ |
| 656 | if (*unormlenP > 0 && *unormstartP < *unormlenP) { |
| 657 | /* unorm begins with a base char; buflen should be 0 */ |
| 658 | *needReorderP = false; |
| 659 | for (*buflenP = 0; true;) { |
| 660 | if (*buflenP > 0 && unormcc[*unormstartP] > 0 && unormcc[*unormstartP] < bufcc[(*buflenP) - 1]) { |
| 661 | *needReorderP = true; |
| 662 | } |
| 663 | buf[*buflenP] = unorm[*unormstartP]; |
| 664 | bufcc[(*buflenP)++] = unormcc[(*unormstartP)++]; |
| 665 | if (*unormstartP >= *unormlenP || unormcc[*unormstartP] == 0) { |
| 666 | break; |
| 667 | } |
| 668 | } |
| 669 | } |
| 670 | if (*unormstartP >= *unormlenP) { |
| 671 | *unormstartP = *unormlenP = 0; |
| 672 | while (*strP < strLimit) { |
| 673 | int32_t idx; |
| 674 | uint32_t bytevalue = (uint8_t)*(*strP)++; |
| 675 | /* '/' is not produced by NFD decomposition from another character so we can |
| 676 | * check for it before normalization */ |
| 677 | if (bytevalue == 0 || (bytevalue == 0x2F /*'/'*/ && !allow_slashes)) { |
| 678 | return EILSEQ; |
| 679 | } |
| 680 | if (bytevalue < 0x80) { |
| 681 | unorm[0] = (!case_sens && bytevalue >= 'A' && bytevalue <= 'Z')? bytevalue += 0x20: bytevalue; |
| 682 | *unormlenP = 1; |
| 683 | unormcc[0] = 0; |
| 684 | *startP = false; |
| 685 | break; |
| 686 | } else { |
| 687 | int32_t u32char = utf8ToU32Code(u32char: bytevalue, srcPtr: strP, srcLimit: strLimit); |
| 688 | if (u32char <= 0) { |
| 689 | return EILSEQ; |
| 690 | } |
| 691 | *unormlenP = normalizeOptCaseFoldU32Char(u32char, case_sens, u32NormFoldBuf: unorm, combClass: unormcc); |
| 692 | if (*unormlenP <= 0) { |
| 693 | return EILSEQ; |
| 694 | } |
| 695 | if (unormcc[0] == 0 || *startP) { |
| 696 | *startP = false; |
| 697 | break; |
| 698 | } |
| 699 | } |
| 700 | /* the latest char decomposes to just combining sequence, add to buffer being built */ |
| 701 | if (*buflenP + *unormlenP > kNCFStreamSafeBufMax) { |
| 702 | return EILSEQ; |
| 703 | } |
| 704 | for (idx = 0; idx < *unormlenP; idx++, (*buflenP)++) { |
| 705 | if (*buflenP > 0 && unormcc[idx] > 0 && unormcc[idx] < bufcc[(*buflenP) - 1]) { |
| 706 | *needReorderP = true; |
| 707 | } |
| 708 | buf[*buflenP] = unorm[idx]; |
| 709 | bufcc[*buflenP] = unormcc[idx]; |
| 710 | } |
| 711 | *unormlenP = 0; |
| 712 | } |
| 713 | } |
| 714 | return 0; |
| 715 | } |
| 716 | |
| 717 | /* local prototypes used only by internal functions */ |
| 718 | static void swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]); |
| 719 | static int32_t adjustCase(bool case_sens, int32_t uSeqLen, |
| 720 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]); |
| 721 | static uint8_t getCombClassU32Char(int32_t u32char); |
| 722 | static int32_t decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]); |
| 723 | |
| 724 | /* Reorder combining marks if necessary. Should be rare, and sequences should |
| 725 | * usually be short when does occur => simple bubblesort should be sufficient. */ |
| 726 | void |
| 727 | doReorder(int32_t* buf, uint8_t* bufcc, int32_t buflen) |
| 728 | { |
| 729 | int32_t idx, jdx; |
| 730 | for (idx = 0; idx < buflen - 1; idx++) { |
| 731 | for (jdx = buflen - 1; jdx > idx; jdx--) { |
| 732 | if (bufcc[jdx] < bufcc[jdx - 1]) { |
| 733 | swapBufCharCCWithPrevious(jdx, buf, bufcc); |
| 734 | } |
| 735 | } |
| 736 | } |
| 737 | } |
| 738 | /* swap function for bubblesort */ |
| 739 | static void |
| 740 | swapBufCharCCWithPrevious(int32_t jdx, int32_t buf[], uint8_t bufcc[]) |
| 741 | { |
| 742 | int32_t bufchar = buf[jdx]; |
| 743 | uint8_t bufccval = bufcc[jdx]; |
| 744 | buf[jdx] = buf[jdx - 1]; |
| 745 | bufcc[jdx] = bufcc[jdx - 1]; |
| 746 | buf[jdx - 1] = bufchar; |
| 747 | bufcc[jdx - 1] = bufccval; |
| 748 | } |
| 749 | |
| 750 | /* |
| 751 | * u32CharToUTF8Bytes, map a valid Unicode character (UTF32 code point) to 1..4 UTF8 bytes, |
| 752 | * and returns the number of UTF8 bytes. |
| 753 | * |
| 754 | * adapted from ICU macro U8_APPEND_UNSAFE (utf8.h). |
| 755 | */ |
| 756 | int32_t |
| 757 | u32CharToUTF8Bytes(uint32_t u32char, uint8_t utf8Bytes[kMaxUTF8BytesPerChar]) |
| 758 | { |
| 759 | int32_t idx = 0; |
| 760 | if (u32char <= 0x7F) { |
| 761 | utf8Bytes[idx++] = (uint8_t)u32char; |
| 762 | } else { |
| 763 | if (u32char <= 0x7FF) { |
| 764 | utf8Bytes[idx++] = (uint8_t)((u32char >> 6) | 0xC0); |
| 765 | } else { |
| 766 | if (u32char <= 0xFFFF) { |
| 767 | utf8Bytes[idx++] = (uint8_t)((u32char >> 12) | 0xE0); |
| 768 | } else { |
| 769 | utf8Bytes[idx++] = (uint8_t)((u32char >> 18) | 0xF0); |
| 770 | utf8Bytes[idx++] = (uint8_t)(((u32char >> 12) & 0x3F) | 0x80); |
| 771 | } |
| 772 | utf8Bytes[idx++] = (uint8_t)(((u32char >> 6) & 0x3F) | 0x80); |
| 773 | } |
| 774 | utf8Bytes[idx++] = (uint8_t)((u32char & 0x3F) | 0x80); |
| 775 | } |
| 776 | return idx; |
| 777 | } |
| 778 | |
| 779 | /* two macros adapted from ICU's utf8.h */ |
| 780 | #define U8_COUNT_TRAIL_BYTES_LOC(leadByte) \ |
| 781 | ((uint8_t)(leadByte)<0XF0 ? \ |
| 782 | ((uint8_t)(leadByte)>=0XC0)+((uint8_t)(leadByte)>=0XE0) : \ |
| 783 | (uint8_t)(leadByte)<0XFE ? 3+((uint8_t)(leadByte)>=0XF8)+((uint8_t)(leadByte)>=0XFC) : 0) |
| 784 | |
| 785 | #define U8_MASK_LEAD_BYTE_LOC(leadByte, countTrailBytes) ((leadByte)&=(1<<(6-(countTrailBytes)))-1) |
| 786 | |
| 787 | /* array adapted from ICU's utf_impl.c */ |
| 788 | static const int32_t utf8_minLegal[4] = { 0, 0X80, 0x800, 0x10000 }; |
| 789 | |
| 790 | /* |
| 791 | * utf8ToU32Code, map a non-ASCII byte value plus a buffer of trail bytes to a UTF32 code point |
| 792 | * |
| 793 | * adapted from ICU macro U8_NEXT (utf8.h) and function utf8_nextCharSafeBody (utf_impl.c); |
| 794 | * verified to produce the same results (adusted for the difference in API signature). |
| 795 | * |
| 796 | * assumes at entry that: |
| 797 | * 1. a non-ASCII byte value (>= 0x80) that purports to be the beginning of a UTF8 character |
| 798 | * has been read, and its value is in u32char |
| 799 | * 2. *srcPtr points to the input buffer just after that non-ASCII byte, i.e. it purportedly |
| 800 | * points to the trail bytes for that UTF8 char. |
| 801 | * 3. srcLimit points to end of the input buffer (just after the last byte in the buffer) |
| 802 | * |
| 803 | * For a valid and complete UTF8 character, the function returns its value and advances |
| 804 | * *srcPtr to the first byte after the UTF8 char. Otherwise, the function returns -1 |
| 805 | * (and the value in *srcPtr is undefined). |
| 806 | * Note that while it does not map to surrogate values (generates an error for malformed |
| 807 | * UTF-8 that would map to values in 0xD800..0xD8FF), it does output noncharacter values |
| 808 | * whose low 16 bits are 0xFFFE or 0xFFFF without generating an error. |
| 809 | * |
| 810 | * equivalences used in adapted ICU code: |
| 811 | * UChar = uint16_t |
| 812 | * UChar32 = int32_t |
| 813 | * |
| 814 | * This has been validated against ICU behavior. |
| 815 | */ |
| 816 | STATIC_UNLESS_TEST |
| 817 | int32_t |
| 818 | utf8ToU32Code(int32_t u32char, const char** srcPtr, const char* srcLimit) |
| 819 | { |
| 820 | const char* src = *srcPtr; |
| 821 | uint8_t pt1, pt2; |
| 822 | if (0xE0 < u32char && u32char <= 0xEC && src + 1 < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F && (pt2 = (uint8_t)(src[1] - 0x80)) <= 0x3F) { |
| 823 | /* handle U+1000..U+CFFF */ |
| 824 | /* no need for (u32char&0xF) because the upper bits are truncated after <<12 in the cast to (uint16_t) */ |
| 825 | u32char = (uint16_t)((u32char << 12) | (pt1 << 6) | pt2); |
| 826 | src += 2; |
| 827 | } else if (u32char < 0xE0 && u32char >= 0xC2 && src < srcLimit && (pt1 = (uint8_t)(src[0] - 0x80)) <= 0x3F) { |
| 828 | /* handle U+0080..U+07FF */ |
| 829 | u32char = ((u32char & 0x1F) << 6) | pt1; |
| 830 | src++; |
| 831 | } else { |
| 832 | /* "complicated" and error cases, adapted from ICU's utf8_nextCharSafeBody() */ |
| 833 | uint8_t count = U8_COUNT_TRAIL_BYTES_LOC(u32char); |
| 834 | if (src + count <= srcLimit) { |
| 835 | uint8_t trail; |
| 836 | |
| 837 | U8_MASK_LEAD_BYTE_LOC(u32char, count); |
| 838 | switch (count) { |
| 839 | /* branches 3, 2 fall through to the next one */ |
| 840 | case 0: /* count==0 for illegally leading trail bytes and the illegal bytes 0XFE and 0XFF */ |
| 841 | case 5: |
| 842 | case 4: /* count>=4 is always illegal: no more than 3 trail bytes in Unicode's UTF-8 */ |
| 843 | break; |
| 844 | case 3: |
| 845 | trail = *src++ - (char)0X80; |
| 846 | u32char = (u32char << 6) | trail; |
| 847 | /* u32char>=0x110 would result in code point>0x10FFFF, outside Unicode */ |
| 848 | if (u32char >= 0x110 || trail > 0X3F) { |
| 849 | break; |
| 850 | } |
| 851 | case 2: |
| 852 | trail = *src++ - (char)0X80; |
| 853 | u32char = (u32char << 6) | trail; |
| 854 | /* |
| 855 | * test for a surrogate D800..DFFF: |
| 856 | * before the last (u32char<<6), a surrogate is u32char=360..37F |
| 857 | */ |
| 858 | if (((u32char & 0xFFE0) == 0x360) || trail > 0X3F) { |
| 859 | break; |
| 860 | } |
| 861 | case 1: |
| 862 | trail = *src++ - (char)0X80; |
| 863 | u32char = (u32char << 6) | trail; |
| 864 | if (trail > 0X3F) { |
| 865 | break; |
| 866 | } |
| 867 | /* correct sequence - all trail bytes have (b7..b6)==(10) */ |
| 868 | if (u32char >= utf8_minLegal[count]) { |
| 869 | *srcPtr = src; |
| 870 | return u32char; |
| 871 | } |
| 872 | /* no default branch to optimize switch() - all values are covered */ |
| 873 | } |
| 874 | } |
| 875 | u32char = -1; |
| 876 | } |
| 877 | *srcPtr = src; |
| 878 | return u32char; |
| 879 | } |
| 880 | |
| 881 | /* |
| 882 | * normalizeCaseFoldU32Code, map a single UTF32 code point to its normalized result |
| 883 | * and the combining classes for each resulting char, or indicate it is invalid. |
| 884 | * |
| 885 | * The normalized and case-folded result might be up to 4 UTF32 characters (current |
| 886 | * max, could change in the future). |
| 887 | * |
| 888 | * u32char - input UTF32 code point |
| 889 | * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only |
| 890 | * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
| 891 | * to receive the normalize result. |
| 892 | * combClass - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
| 893 | * to receive the combining classes for the characters in u32NormFoldBuf. If |
| 894 | * the first entry has non-zero combining class, the remaining entries do too. |
| 895 | * |
| 896 | * returns -1 if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax |
| 897 | * is insufficient (though it is assumed to be at least 3), else the length of the |
| 898 | * normalized and case-folded result (currently in the range 1..4). |
| 899 | * |
| 900 | * This has been validated against ICU behavior. |
| 901 | * |
| 902 | * This function is highly dependent on the structure of the data trie; for details on |
| 903 | * that structure, see comments in normalizeCaseFoldData.h |
| 904 | */ |
| 905 | STATIC_UNLESS_TEST |
| 906 | int32_t |
| 907 | normalizeOptCaseFoldU32Char(int32_t u32char, bool case_sens, |
| 908 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax], |
| 909 | uint8_t combClass[kNFCSingleCharDecompMax]) |
| 910 | { |
| 911 | combClass[0] = 0; |
| 912 | /* return hi-range PUA as self, except non-characters */ |
| 913 | if (u32char >= kU32HiPUAStart) { |
| 914 | if ((u32char & 0xFFFE) == 0xFFFE) { |
| 915 | return -1; |
| 916 | } |
| 917 | u32NormFoldBuf[0] = u32char; |
| 918 | return 1; |
| 919 | } |
| 920 | /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ |
| 921 | /* 0 - 0x323FF; everything in between in currently invalid. */ |
| 922 | int32_t u32charLookup = u32char; |
| 923 | if (u32charLookup >= kU32LowRangeLimit) { |
| 924 | u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); |
| 925 | if (u32charLookup < kU32LowRangeLimit || u32charLookup >= (kU32LowRangeLimit + kU32HiRangeLen)) { |
| 926 | return -1; /* in the large range of currently-unassigned code points */ |
| 927 | } |
| 928 | } |
| 929 | /* Now we have u32charLookup either in 0..0x323FF representing u32char itself, |
| 930 | * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in |
| 931 | * the trie that identifies unassigneds in this range, or maps others to |
| 932 | * decomps or combining class or just self. */ |
| 933 | uint16_t trieValue; |
| 934 | /* TrieHi */ |
| 935 | trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; |
| 936 | if (trieValue == kInvalidCodeFlag) { |
| 937 | return -1; |
| 938 | } |
| 939 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ |
| 940 | u32NormFoldBuf[0] = u32char; |
| 941 | combClass[0] = trieValue & kFlagValueMask; |
| 942 | return 1; |
| 943 | } |
| 944 | if (trieValue == kHangulMask) { |
| 945 | combClass[1] = combClass[2] = 0; |
| 946 | return decomposeHangul(u32char, u32NormFoldBuf); |
| 947 | } |
| 948 | /* TrieMid */ |
| 949 | trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; |
| 950 | if (trieValue == kInvalidCodeFlag) { |
| 951 | return -1; |
| 952 | } |
| 953 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { |
| 954 | u32NormFoldBuf[0] = u32char; |
| 955 | combClass[0] = trieValue & kFlagValueMask; |
| 956 | return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf); |
| 957 | } |
| 958 | if ((trieValue & kFlagTestMask) == kInvMaskFlag) { |
| 959 | uint16_t invalidMask = nfU16InvMasks[trieValue & kFlagValueMask]; |
| 960 | uint16_t testBit = (uint16_t)(1 << (u32charLookup & kNFTrieLoMask)); |
| 961 | if (testBit & invalidMask) { |
| 962 | /* invalid */ |
| 963 | return -1; |
| 964 | } else { |
| 965 | /* treat like trieValue == 0 above */ |
| 966 | u32NormFoldBuf[0] = u32char; |
| 967 | return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf); |
| 968 | } |
| 969 | } |
| 970 | if (trieValue == kHangulMask) { |
| 971 | combClass[1] = combClass[2] = 0; |
| 972 | return decomposeHangul(u32char, u32NormFoldBuf); |
| 973 | } |
| 974 | /* TrieLo */ |
| 975 | trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieLoMask]; |
| 976 | if (trieValue == kInvalidCodeFlag) { |
| 977 | return -1; |
| 978 | } |
| 979 | if (trieValue == kHangulMask) { |
| 980 | combClass[1] = combClass[2] = 0; |
| 981 | return decomposeHangul(u32char, u32NormFoldBuf); |
| 982 | } |
| 983 | if (trieValue < kToU16Seq2Mask || trieValue > kSpecialsEnd) { |
| 984 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { |
| 985 | u32NormFoldBuf[0] = u32char; |
| 986 | combClass[0] = trieValue & kFlagValueMask; |
| 987 | } else { |
| 988 | u32NormFoldBuf[0] = trieValue; |
| 989 | } |
| 990 | return adjustCase(case_sens, uSeqLen: 1, u32NormFoldBuf); |
| 991 | } |
| 992 | const uint16_t* u16SeqPtr = NULL; |
| 993 | const int32_t* u32SeqPtr = NULL; |
| 994 | int32_t uSeqLen = 0; |
| 995 | switch (trieValue & kSpecialsMask) { |
| 996 | case kToU16Seq2Mask: |
| 997 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { |
| 998 | /* don't use the mapping, it is only for case folding */ |
| 999 | u32NormFoldBuf[0] = u32char; |
| 1000 | /* already have combClass[0] = 0 */ |
| 1001 | return 1; |
| 1002 | } |
| 1003 | u16SeqPtr = nfU16Seq2[trieValue & kToSeqIndexMask]; |
| 1004 | uSeqLen = 2; |
| 1005 | break; |
| 1006 | case kToU16Seq3Mask: |
| 1007 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { |
| 1008 | /* don't use the mapping, it is only for case folding */ |
| 1009 | u32NormFoldBuf[0] = u32char; |
| 1010 | /* already have combClass[0] = 0 */ |
| 1011 | return 1; |
| 1012 | } |
| 1013 | u16SeqPtr = nfU16Seq3[trieValue & kToSeqIndexMask]; |
| 1014 | uSeqLen = 3; |
| 1015 | break; |
| 1016 | case kToU16SeqMiscMask: |
| 1017 | u16SeqPtr = &nfU16SeqMisc[trieValue & kToSeqMiscIndexMask]; |
| 1018 | uSeqLen = *u16SeqPtr & kToSeqMiscLenMask; |
| 1019 | combClass[0] = (uint8_t)(*u16SeqPtr++ >> kToSeqMiscCCShift); |
| 1020 | break; |
| 1021 | case kToU32CharMask: |
| 1022 | if (case_sens && (trieValue & kToSeqCaseFoldMask)) { |
| 1023 | /* don't use the mapping, it is only for case folding */ |
| 1024 | u32NormFoldBuf[0] = u32char; |
| 1025 | /* already have combClass[0] = 0 */ |
| 1026 | return 1; |
| 1027 | } |
| 1028 | u32SeqPtr = &nfU32Char[trieValue & kToSeqIndexMask]; |
| 1029 | uSeqLen = 1; |
| 1030 | break; |
| 1031 | case kToU32SeqMiscMask: |
| 1032 | u32SeqPtr = &nfU32SeqMisc[trieValue & kToSeqMiscIndexMask]; |
| 1033 | uSeqLen = *u32SeqPtr & kToSeqMiscLenMask; |
| 1034 | combClass[0] = (uint8_t)(*u32SeqPtr++ >> kToSeqMiscCCShift); |
| 1035 | break; |
| 1036 | default: |
| 1037 | return -1; |
| 1038 | } |
| 1039 | if (kNFCSingleCharDecompMax < uSeqLen) { |
| 1040 | return 0; |
| 1041 | } |
| 1042 | int32_t idx; |
| 1043 | for (idx = 0; idx < uSeqLen; idx++) { |
| 1044 | u32NormFoldBuf[idx] = (u16SeqPtr)? *u16SeqPtr++: *u32SeqPtr++; |
| 1045 | if (idx > 0) { |
| 1046 | combClass[idx] = getCombClassU32Char(u32char: u32NormFoldBuf[idx]); |
| 1047 | } |
| 1048 | } |
| 1049 | return adjustCase(case_sens, uSeqLen, u32NormFoldBuf); |
| 1050 | } |
| 1051 | |
| 1052 | /* |
| 1053 | * adjustCase, final adjustments to normalizeOptCaseFoldU32Char for case folding |
| 1054 | * |
| 1055 | * case_sens - false for case insensiive => casefold, true for case sensitive => NFD only |
| 1056 | * uSeqLen - length of the sequence specified in the u32NormFoldBuf |
| 1057 | * u32NormFoldBuf - buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
| 1058 | * with normalized result. |
| 1059 | * |
| 1060 | * returns uSeqLen if input code point is invalid, 0 if the buffer length kNFCSingleCharDecompMax |
| 1061 | * is insufficient (though it is assumed to be at least 3), else the length of the |
| 1062 | * normalized and case-folded result (currently in the range 1..4). |
| 1063 | * |
| 1064 | * This function is a reduced version of normalizeOptCaseFoldU32Char above. |
| 1065 | */ |
| 1066 | |
| 1067 | static int32_t |
| 1068 | adjustCase(bool case_sens, int32_t uSeqLen, |
| 1069 | int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) |
| 1070 | { |
| 1071 | if (!case_sens && uSeqLen > 0) { |
| 1072 | if (u32NormFoldBuf[0] < kSimpleCaseFoldLimit) { |
| 1073 | u32NormFoldBuf[0] = nfBasicCF[u32NormFoldBuf[0]]; |
| 1074 | /* There is one case in which this maps to a character with different combining |
| 1075 | * class: U+0345 (cc 240) casefolds to U+03B9 (cc 0). However when this is the |
| 1076 | * first or only character in the sequence, we want to keep the original |
| 1077 | * combining class, so nothing special to do here. |
| 1078 | */ |
| 1079 | } |
| 1080 | /* The following is the only case where we have a casefolding after the first |
| 1081 | * character in the sequence. Don't worry about combining class here. that gets |
| 1082 | * set later for characters after the first. |
| 1083 | */ |
| 1084 | if (uSeqLen > 1 && u32NormFoldBuf[uSeqLen - 1] == 0x0345) { |
| 1085 | u32NormFoldBuf[uSeqLen - 1] = 0x03B9; |
| 1086 | } |
| 1087 | } |
| 1088 | return uSeqLen; |
| 1089 | } |
| 1090 | |
| 1091 | /* |
| 1092 | * getCombClassU32Char, map a single character (in UTF32 form) to its combining class. |
| 1093 | * |
| 1094 | * u32char - input UTF32 code point. This is assumed to be a valid character that does |
| 1095 | * not have a decomposition. |
| 1096 | * |
| 1097 | * returns combining class of the character. |
| 1098 | * |
| 1099 | * This is only called for characters after the first is a decomposition expansion. In |
| 1100 | * this situation, if we encounter U+03B9 (combining class 0), it is only there as the |
| 1101 | * case-folding of U+0345 (combining class 240). In this case it is the combining class |
| 1102 | * for U+0345 that we want. In the non-casefold case we won't see U+03B9 here at all. |
| 1103 | * |
| 1104 | * This function is a reduced version of normalizeOptCaseFoldU32Char above. |
| 1105 | */ |
| 1106 | static uint8_t |
| 1107 | getCombClassU32Char(int32_t u32char) |
| 1108 | { |
| 1109 | if (u32char >= kU32HiPUAStart) { |
| 1110 | return 0; |
| 1111 | } |
| 1112 | if (u32char == 0x03B9) { |
| 1113 | return 240; |
| 1114 | } |
| 1115 | /* for trie lookup, shift the range 0xE0000-0xE01FF down to be just after the range */ |
| 1116 | /* 0 - 0x323FF; everything in between in currently invalid. */ |
| 1117 | int32_t u32charLookup = u32char; |
| 1118 | if (u32charLookup >= kU32LowRangeLimit) { |
| 1119 | u32charLookup -= (kU32HiRangeStart - kU32LowRangeLimit); |
| 1120 | } |
| 1121 | /* Now we have u32charLookup either in 0..0x323FF representing u32char itself, |
| 1122 | * or in 0x32400..0x325FF representing u32char 0xE0000..0xE01FF; look it up in |
| 1123 | * the trie that identifies unassigneds in this range, or maps others to |
| 1124 | * decomps or combining class or just self. */ |
| 1125 | uint16_t trieValue; |
| 1126 | /* TrieHi */ |
| 1127 | trieValue = nfTrieHi[u32charLookup >> kNFTrieHiShift]; |
| 1128 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { |
| 1129 | return trieValue & kFlagValueMask; |
| 1130 | } |
| 1131 | /* TrieMid */ |
| 1132 | trieValue = nfTrieMid[trieValue & kNextIndexValueMask][(u32charLookup >> kNFTrieMidShift) & kNFTrieMidMask]; |
| 1133 | if (trieValue == 0 || (trieValue & kFlagTestMask) == kCombClassFlag) { /* return self; */ |
| 1134 | return trieValue & kFlagValueMask; |
| 1135 | } |
| 1136 | if ((trieValue & kFlagTestMask) == kInvMaskFlag) { |
| 1137 | return 0; |
| 1138 | } |
| 1139 | /* TrieLo */ |
| 1140 | trieValue = nfTrieLo[trieValue & kNextIndexValueMask][u32charLookup & kNFTrieMidMask]; |
| 1141 | return ((trieValue & kFlagTestMask) == kCombClassFlag)? (trieValue & kFlagValueMask): 0; |
| 1142 | } |
| 1143 | |
| 1144 | /* |
| 1145 | * decomposeHangul, map a single UTF32 code point for a composed Hangul |
| 1146 | * in the range AC00-D7A3, using algorithmic decomp |
| 1147 | * |
| 1148 | * The normalized result will be 2 or 3 UTF32 characters. |
| 1149 | * |
| 1150 | * u32char - input UTF32 code point |
| 1151 | * u32NormFoldBuf - output buffer of length kNFCSingleCharDecompMax (assume to be at least 3) |
| 1152 | * to receive the normalize result. |
| 1153 | * |
| 1154 | * returns the length of the normalized result (2..3). |
| 1155 | * |
| 1156 | * Adapted from ICU Hangul:decompose in normalizer2impl.h |
| 1157 | * |
| 1158 | */ |
| 1159 | |
| 1160 | enum { |
| 1161 | HANGUL_BASE=0xAC00, |
| 1162 | JAMO_L_BASE=0x1100, /* "lead" jamo */ |
| 1163 | JAMO_V_BASE=0x1161, /* "vowel" jamo */ |
| 1164 | JAMO_T_BASE=0x11A7, /* "trail" jamo */ |
| 1165 | JAMO_L_COUNT=19, |
| 1166 | JAMO_V_COUNT=21, |
| 1167 | JAMO_T_COUNT=28, |
| 1168 | }; |
| 1169 | |
| 1170 | static int32_t |
| 1171 | decomposeHangul(int32_t u32char, int32_t u32NormFoldBuf[kNFCSingleCharDecompMax]) |
| 1172 | { |
| 1173 | u32char -= HANGUL_BASE; |
| 1174 | int32_t tIndex = u32char % JAMO_T_COUNT; |
| 1175 | u32char /= JAMO_T_COUNT; |
| 1176 | u32NormFoldBuf[0] = (uint16_t)(JAMO_L_BASE + u32char / JAMO_V_COUNT); |
| 1177 | u32NormFoldBuf[1] = (uint16_t)(JAMO_V_BASE + u32char % JAMO_V_COUNT); |
| 1178 | if (tIndex == 0) { |
| 1179 | return 2; |
| 1180 | } |
| 1181 | u32NormFoldBuf[2] = (uint16_t)(JAMO_T_BASE + tIndex); |
| 1182 | return 3; |
| 1183 | } |
| 1184 | |