1/*
2 * Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29 /*
30 Includes Unicode 3.2 decomposition code derived from Core Foundation
31 */
32
33#include <sys/param.h>
34#include <sys/utfconv.h>
35#include <sys/errno.h>
36#include <sys/malloc.h>
37#include <libkern/OSByteOrder.h>
38
39#if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
40#include <kern/assert.h>
41#else
42#include <assert.h>
43#endif
44
45/*
46 * UTF-8 (Unicode Transformation Format)
47 *
48 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
49 * character as a sequence of one to four bytes. Only the shortest form
50 * required to represent the significant Unicode bits is legal.
51 *
52 * UTF-8 Multibyte Codes
53 *
54 * Bytes Bits Unicode Min Unicode Max UTF-8 Byte Sequence (binary)
55 * -----------------------------------------------------------------------------
56 * 1 7 0x0000 0x007F 0xxxxxxx
57 * 2 11 0x0080 0x07FF 110xxxxx 10xxxxxx
58 * 3 16 0x0800 0xFFFF 1110xxxx 10xxxxxx 10xxxxxx
59 * 4 21 0x10000 0x10FFFF 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
60 * -----------------------------------------------------------------------------
61 */
62
63
/*
 * Number of UTF-8 bytes contributed by one UTF-16 code unit 'c':
 * 1 below 0x0080, 2 below 0x0800, otherwise 3 -- except that any
 * surrogate code unit (0xD800-0xDFFF) is counted as 2, so that a
 * surrogate pair adds up to 4 bytes.
 *
 * The argument is evaluated more than once; pass a side-effect-free
 * expression.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))
66
67#define UCS_ALT_NULL 0x2400
68
69/* Surrogate Pair Constants */
70#define SP_HALF_SHIFT 10
71#define SP_HALF_BASE 0x0010000u
72#define SP_HALF_MASK 0x3FFu
73
74#define SP_HIGH_FIRST 0xD800u
75#define SP_HIGH_LAST 0xDBFFu
76#define SP_LOW_FIRST 0xDC00u
77#define SP_LOW_LAST 0xDFFFu
78
79
80#include "vfs_utfconvdata.h"
81
82
83/*
84 * Test for a combining character.
85 *
86 * Similar to __CFUniCharIsNonBaseCharacter except that
87 * unicode_combinable also includes Hangul Jamo characters.
88 */
89int
90unicode_combinable(u_int16_t character)
91{
92 const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
93 u_int8_t value;
94
95 if (character < 0x0300)
96 return (0);
97
98 value = bitmap[(character >> 8) & 0xFF];
99
100 if (value == 0xFF) {
101 return (1);
102 } else if (value) {
103 bitmap = bitmap + ((value - 1) * 32) + 256;
104 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
105 }
106 return (0);
107}
108
109/*
110 * Test for a precomposed character.
111 *
112 * Similar to __CFUniCharIsDecomposableCharacter.
113 */
114int
115unicode_decomposeable(u_int16_t character) {
116 const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
117 u_int8_t value;
118
119 if (character < 0x00C0)
120 return (0);
121
122 value = bitmap[(character >> 8) & 0xFF];
123
124 if (value == 0xFF) {
125 return (1);
126 } else if (value) {
127 bitmap = bitmap + ((value - 1) * 32) + 256;
128 return (bitmap[(character & 0xFF) / 8] & (1 << (character % 8)) ? 1 : 0);
129 }
130 return (0);
131}
132
133
134/*
135 * Get the combing class.
136 *
137 * Similar to CFUniCharGetCombiningPropertyForCharacter.
138 */
139static inline u_int8_t
140get_combining_class(u_int16_t character) {
141 const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
142
143 u_int8_t value = bitmap[(character >> 8)];
144
145 if (value) {
146 bitmap = bitmap + (value * 256);
147 return bitmap[character % 256];
148 }
149 return (0);
150}
151
152
/*
 * Forward declarations of file-local helpers that are defined further
 * down but used by the conversion routines that follow.
 */

/* Writes the decomposition of 'character' to convertedChars; returns the number of code units written. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

/* Returns the precomposed form of base + combining, or 0 if there is none. */
static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

/* Reorders 'count' code units in place by combining class. */
static void prioritysort(u_int16_t* characters, int count);

/* Maps an ASCII character to its SFM private-use code when one applies. */
static u_int16_t ucs_to_sfm(u_int16_t ucs_ch, int lastchar);

/* Maps an SFM private-use code back to the character it stands for. */
static u_int16_t sfm_to_ucs(u_int16_t ucs_ch);
162
163
/*
 * Number of continuation bytes that follow a UTF-8 lead byte, indexed by
 * the top five bits of that byte (byte >> 3).  Entries are 0 for ASCII,
 * 1/2/3 for two-, three- and four-byte lead bytes, and -1 for bytes that
 * cannot start a sequence (0x80-0xBF and 0xF8-0xFF).
 *
 * NOTE(review): the element type is plain 'char'; the -1 entries rely on
 * 'char' being signed on the target -- confirm for any new platform.
 */
char utf_extrabytes[32] = {
	 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
	-1, -1, -1, -1, -1, -1, -1, -1,  1,  1,  1,  1,  2,  2,  3, -1
};

/* Upper-case hexadecimal digits, used when percent-escaping illegal bytes. */
const char hexdigits[16] = {
	 '0',  '1',  '2',  '3',  '4',  '5',  '6', '7',
	 '8',  '9',  'A',  'B',  'C',  'D',  'E', 'F'
};
173
174/*
175 * utf8_encodelen - Calculate the UTF-8 encoding length
176 *
177 * This function takes a Unicode input string, ucsp, of ucslen bytes
178 * and calculates the size of the UTF-8 output in bytes (not including
179 * a NULL termination byte). The string must reside in kernel memory.
180 *
181 * If '/' chars are possible in the Unicode input then an alternate
182 * (replacement) char should be provided in altslash.
183 *
184 * FLAGS
185 * UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
186 *
187 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
188 *
189 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
190 *
191 * UTF_DECOMPOSED: generate fully decomposed output
192 *
193 * UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
194 *
195 * ERRORS
196 * None
197 */
198size_t
199utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
200{
201 u_int16_t ucs_ch;
202 u_int16_t * chp = NULL;
203 u_int16_t sequence[8];
204 int extra = 0;
205 size_t charcnt;
206 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
207 int decompose = (flags & UTF_DECOMPOSED);
208 size_t len;
209
210 charcnt = ucslen / 2;
211 len = 0;
212
213 while (charcnt-- > 0) {
214 if (extra > 0) {
215 --extra;
216 ucs_ch = *chp++;
217 } else {
218 ucs_ch = *ucsp++;
219 if (swapbytes) {
220 ucs_ch = OSSwapInt16(ucs_ch);
221 }
222 if (ucs_ch == '/') {
223 ucs_ch = altslash ? altslash : '_';
224 } else if (ucs_ch == '\0') {
225 ucs_ch = UCS_ALT_NULL;
226 } else if (decompose && unicode_decomposeable(ucs_ch)) {
227 extra = unicode_decompose(ucs_ch, sequence) - 1;
228 charcnt += extra;
229 ucs_ch = sequence[0];
230 chp = &sequence[1];
231 }
232 }
233 len += UNICODE_TO_UTF8_LEN(ucs_ch);
234 }
235
236 return (len);
237}
238
239
240/*
241 * utf8_encodestr - Encodes a Unicode string to UTF-8
242 *
243 * NOTES:
244 * The resulting UTF-8 string is NULL terminated.
245 *
246 * If '/' chars are allowed on disk then an alternate
247 * (replacement) char must be provided in altslash.
248 *
249 * input flags:
250 * UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
251 *
252 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
253 *
254 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
255 *
256 * UTF_DECOMPOSED: generate fully decomposed output
257 *
258 * UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
259 *
260 * result:
261 * ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
262 *
263 * EINVAL: Illegal char found; char was replaced by an '_'.
264 */
265int
266utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
267 size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
268{
269 u_int8_t * bufstart;
270 u_int8_t * bufend;
271 u_int16_t ucs_ch;
272 u_int16_t * chp = NULL;
273 u_int16_t sequence[8];
274 int extra = 0;
275 size_t charcnt;
276 int swapbytes = (flags & UTF_REVERSE_ENDIAN);
277 int nullterm = ((flags & UTF_NO_NULL_TERM) == 0);
278 int decompose = (flags & UTF_DECOMPOSED);
279 int sfmconv = (flags & UTF_SFM_CONVERSIONS);
280 int result = 0;
281
282 bufstart = utf8p;
283 bufend = bufstart + buflen;
284 if (nullterm)
285 --bufend;
286 charcnt = ucslen / 2;
287
288 while (charcnt-- > 0) {
289 if (extra > 0) {
290 --extra;
291 ucs_ch = *chp++;
292 } else {
293 ucs_ch = swapbytes ? OSSwapInt16(*ucsp++) : *ucsp++;
294
295 if (decompose && unicode_decomposeable(ucs_ch)) {
296 extra = unicode_decompose(ucs_ch, sequence) - 1;
297 charcnt += extra;
298 ucs_ch = sequence[0];
299 chp = &sequence[1];
300 }
301 }
302
303 /* Slash and NULL are not permitted */
304 if (ucs_ch == '/') {
305 if (altslash)
306 ucs_ch = altslash;
307 else {
308 ucs_ch = '_';
309 result = EINVAL;
310 }
311 } else if (ucs_ch == '\0') {
312 ucs_ch = UCS_ALT_NULL;
313 }
314
315 if (ucs_ch < 0x0080) {
316 if (utf8p >= bufend) {
317 result = ENAMETOOLONG;
318 break;
319 }
320 *utf8p++ = ucs_ch;
321
322 } else if (ucs_ch < 0x800) {
323 if ((utf8p + 1) >= bufend) {
324 result = ENAMETOOLONG;
325 break;
326 }
327 *utf8p++ = 0xc0 | (ucs_ch >> 6);
328 *utf8p++ = 0x80 | (0x3f & ucs_ch);
329
330 } else {
331 /* These chars never valid Unicode. */
332 if (ucs_ch == 0xFFFE || ucs_ch == 0xFFFF) {
333 result = EINVAL;
334 break;
335 }
336
337 /* Combine valid surrogate pairs */
338 if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
339 && charcnt > 0) {
340 u_int16_t ch2;
341 u_int32_t pair;
342
343 ch2 = swapbytes ? OSSwapInt16(*ucsp) : *ucsp;
344 if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
345 pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
346 + (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
347 if ((utf8p + 3) >= bufend) {
348 result = ENAMETOOLONG;
349 break;
350 }
351 --charcnt;
352 ++ucsp;
353 *utf8p++ = 0xf0 | (pair >> 18);
354 *utf8p++ = 0x80 | (0x3f & (pair >> 12));
355 *utf8p++ = 0x80 | (0x3f & (pair >> 6));
356 *utf8p++ = 0x80 | (0x3f & pair);
357 continue;
358 }
359 } else if (sfmconv) {
360 ucs_ch = sfm_to_ucs(ucs_ch);
361 if (ucs_ch < 0x0080) {
362 if (utf8p >= bufend) {
363 result = ENAMETOOLONG;
364 break;
365 }
366 *utf8p++ = ucs_ch;
367 continue;
368 }
369 }
370 if ((utf8p + 2) >= bufend) {
371 result = ENAMETOOLONG;
372 break;
373 }
374 *utf8p++ = 0xe0 | (ucs_ch >> 12);
375 *utf8p++ = 0x80 | (0x3f & (ucs_ch >> 6));
376 *utf8p++ = 0x80 | (0x3f & ucs_ch);
377 }
378 }
379
380 *utf8len = utf8p - bufstart;
381 if (nullterm)
382 *utf8p++ = '\0';
383
384 return (result);
385}
386
/*
 * push - append one code unit to the output, tracking combining runs
 *
 * *combcharcnt counts the combining characters written since the last
 * non-combining one.  When a non-combining character arrives and the
 * run that just ended holds more than one character, that run is put
 * into canonical order with prioritysort() before the new character is
 * stored.  *ucsp is advanced past the stored character.
 */
static void
push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
{
	uint16_t *out = *ucsp;
	int run = *combcharcnt;

	if (unicode_combinable(ucs_ch)) {
		/* Extend (or start) the current combining run. */
		run++;
	} else {
		/* Run ended: canonicalize it if it has at least two members. */
		if (run > 1) {
			prioritysort(out - run, run);
		}
		run = 0;
	}

	*combcharcnt = run;
	*out = ucs_ch;
	*ucsp = out + 1;
}
404
405/*
406 * utf8_decodestr - Decodes a UTF-8 string back to Unicode
407 *
408 * NOTES:
409 * The input UTF-8 string does not need to be null terminated
410 * if utf8len is set.
411 *
412 * If '/' chars are allowed on disk then an alternate
413 * (replacement) char must be provided in altslash.
414 *
415 * input flags:
 *    UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
417 *
418 * UTF_BIG_ENDIAN: Unicode byte order is always big endian
419 *
420 * UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
421 *
422 * UTF_DECOMPOSED: generate fully decomposed output (NFD)
423 *
424 * UTF_PRECOMPOSED: generate precomposed output (NFC)
425 *
426 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
427 *
428 * result:
429 * ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
430 *
431 * EINVAL: Illegal UTF-8 sequence found.
432 */
433int
434utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
435 size_t *ucslen, size_t buflen, u_int16_t altslash, int flags)
436{
437 u_int16_t* bufstart;
438 u_int16_t* bufend;
439 unsigned int ucs_ch;
440 unsigned int byte;
441 int combcharcnt = 0;
442 int result = 0;
443 int decompose, precompose, escaping;
444 int sfmconv;
445 int extrabytes;
446
447 decompose = (flags & UTF_DECOMPOSED);
448 precompose = (flags & UTF_PRECOMPOSED);
449 escaping = (flags & UTF_ESCAPE_ILLEGAL);
450 sfmconv = (flags & UTF_SFM_CONVERSIONS);
451
452 bufstart = ucsp;
453 bufend = (u_int16_t *)((u_int8_t *)ucsp + buflen);
454
455 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
456 if (ucsp >= bufend)
457 goto toolong;
458
459 /* check for ascii */
460 if (byte < 0x80) {
461 ucs_ch = sfmconv ? ucs_to_sfm(byte, utf8len == 0) : byte;
462 } else {
463 u_int32_t ch;
464
465 extrabytes = utf_extrabytes[byte >> 3];
466 if ((extrabytes < 0) || ((int)utf8len < extrabytes)) {
467 goto escape;
468 }
469 utf8len -= extrabytes;
470
471 switch (extrabytes) {
472 case 1:
473 ch = byte; ch <<= 6; /* 1st byte */
474 byte = *utf8p++; /* 2nd byte */
475 if ((byte >> 6) != 2)
476 goto escape2;
477 ch += byte;
478 ch -= 0x00003080UL;
479 if (ch < 0x0080)
480 goto escape2;
481 ucs_ch = ch;
482 break;
483 case 2:
484 ch = byte; ch <<= 6; /* 1st byte */
485 byte = *utf8p++; /* 2nd byte */
486 if ((byte >> 6) != 2)
487 goto escape2;
488 ch += byte; ch <<= 6;
489 byte = *utf8p++; /* 3rd byte */
490 if ((byte >> 6) != 2)
491 goto escape3;
492 ch += byte;
493 ch -= 0x000E2080UL;
494 if (ch < 0x0800)
495 goto escape3;
496 if (ch >= 0xD800) {
497 if (ch <= 0xDFFF)
498 goto escape3;
499 if (ch == 0xFFFE || ch == 0xFFFF)
500 goto escape3;
501 }
502 ucs_ch = ch;
503 break;
504 case 3:
505 ch = byte; ch <<= 6; /* 1st byte */
506 byte = *utf8p++; /* 2nd byte */
507 if ((byte >> 6) != 2)
508 goto escape2;
509 ch += byte; ch <<= 6;
510 byte = *utf8p++; /* 3rd byte */
511 if ((byte >> 6) != 2)
512 goto escape3;
513 ch += byte; ch <<= 6;
514 byte = *utf8p++; /* 4th byte */
515 if ((byte >> 6) != 2)
516 goto escape4;
517 ch += byte;
518 ch -= 0x03C82080UL + SP_HALF_BASE;
519 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
520 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
521 goto escape4;
522 push(ucs_ch, &combcharcnt, &ucsp);
523 if (ucsp >= bufend)
524 goto toolong;
525 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
526 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST) {
527 --ucsp;
528 goto escape4;
529 }
530 *ucsp++ = ucs_ch;
531 continue;
532 default:
533 result = EINVAL;
534 goto exit;
535 }
536 if (decompose) {
537 if (unicode_decomposeable(ucs_ch)) {
538 u_int16_t sequence[8];
539 int count, i;
540
541 count = unicode_decompose(ucs_ch, sequence);
542
543 for (i = 0; i < count; ++i) {
544 if (ucsp >= bufend)
545 goto toolong;
546
547 push(sequence[i], &combcharcnt, &ucsp);
548 }
549
550 continue;
551 }
552 } else if (precompose && (ucsp != bufstart)) {
553 u_int16_t composite, base;
554
555 if (unicode_combinable(ucs_ch)) {
556 base = ucsp[-1];
557 composite = unicode_combine(base, ucs_ch);
558 if (composite) {
559 --ucsp;
560 ucs_ch = composite;
561 }
562 }
563 }
564 if (ucs_ch == UCS_ALT_NULL)
565 ucs_ch = '\0';
566 }
567 if (ucs_ch == altslash)
568 ucs_ch = '/';
569
570 push(ucs_ch, &combcharcnt, &ucsp);
571 continue;
572
573 /*
574 * Escape illegal UTF-8 into something legal.
575 */
576escape4:
577 utf8p -= 3;
578 goto escape;
579escape3:
580 utf8p -= 2;
581 goto escape;
582escape2:
583 utf8p -= 1;
584escape:
585 if (!escaping) {
586 result = EINVAL;
587 goto exit;
588 }
589 if (extrabytes > 0)
590 utf8len += extrabytes;
591 byte = *(utf8p - 1);
592
593 if ((ucsp + 2) >= bufend)
594 goto toolong;
595
596 /* Make a previous combining sequence canonical. */
597 if (combcharcnt > 1) {
598 prioritysort(ucsp - combcharcnt, combcharcnt);
599 }
600 combcharcnt = 0;
601
602 ucs_ch = '%';
603 *ucsp++ = ucs_ch;
604 ucs_ch = hexdigits[byte >> 4];
605 *ucsp++ = ucs_ch;
606 ucs_ch = hexdigits[byte & 0x0F];
607 *ucsp++ = ucs_ch;
608 }
609 /*
610 * Make a previous combining sequence canonical
611 */
612 if (combcharcnt > 1) {
613 prioritysort(ucsp - combcharcnt, combcharcnt);
614 }
615
616 if (flags & UTF_REVERSE_ENDIAN) {
617 uint16_t *p = bufstart;
618 while (p < ucsp) {
619 *p = OSSwapInt16(*p);
620 ++p;
621 }
622 }
623
624exit:
625 *ucslen = (u_int8_t*)ucsp - (u_int8_t*)bufstart;
626
627 return (result);
628
629toolong:
630 result = ENAMETOOLONG;
631 goto exit;
632}
633
634
635/*
636 * utf8_validatestr - Check for a valid UTF-8 string.
637 */
638int
639utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
640{
641 unsigned int byte;
642 u_int32_t ch;
643 unsigned int ucs_ch;
644 size_t extrabytes;
645
646 while (utf8len-- > 0 && (byte = *utf8p++) != '\0') {
647 if (byte < 0x80)
648 continue; /* plain ascii */
649
650 extrabytes = utf_extrabytes[byte >> 3];
651
652 if (utf8len < extrabytes)
653 goto invalid;
654 utf8len -= extrabytes;
655
656 switch (extrabytes) {
657 case 1:
658 ch = byte; ch <<= 6; /* 1st byte */
659 byte = *utf8p++; /* 2nd byte */
660 if ((byte >> 6) != 2)
661 goto invalid;
662 ch += byte;
663 ch -= 0x00003080UL;
664 if (ch < 0x0080)
665 goto invalid;
666 break;
667 case 2:
668 ch = byte; ch <<= 6; /* 1st byte */
669 byte = *utf8p++; /* 2nd byte */
670 if ((byte >> 6) != 2)
671 goto invalid;
672 ch += byte; ch <<= 6;
673 byte = *utf8p++; /* 3rd byte */
674 if ((byte >> 6) != 2)
675 goto invalid;
676 ch += byte;
677 ch -= 0x000E2080UL;
678 if (ch < 0x0800)
679 goto invalid;
680 if (ch >= 0xD800) {
681 if (ch <= 0xDFFF)
682 goto invalid;
683 if (ch == 0xFFFE || ch == 0xFFFF)
684 goto invalid;
685 }
686 break;
687 case 3:
688 ch = byte; ch <<= 6; /* 1st byte */
689 byte = *utf8p++; /* 2nd byte */
690 if ((byte >> 6) != 2)
691 goto invalid;
692 ch += byte; ch <<= 6;
693 byte = *utf8p++; /* 3rd byte */
694 if ((byte >> 6) != 2)
695 goto invalid;
696 ch += byte; ch <<= 6;
697 byte = *utf8p++; /* 4th byte */
698 if ((byte >> 6) != 2)
699 goto invalid;
700 ch += byte;
701 ch -= 0x03C82080UL + SP_HALF_BASE;
702 ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
703 if (ucs_ch < SP_HIGH_FIRST || ucs_ch > SP_HIGH_LAST)
704 goto invalid;
705 ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
706 if (ucs_ch < SP_LOW_FIRST || ucs_ch > SP_LOW_LAST)
707 goto invalid;
708 break;
709 default:
710 goto invalid;
711 }
712
713 }
714 return (0);
715invalid:
716 return (EINVAL);
717}
718
719/*
720 * utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
721 *
722 * This function takes an UTF-8 input string, instr, of inlen bytes
723 * and produces normalized UTF-8 output into a buffer of buflen bytes
724 * pointed to by outstr. The size of the output in bytes (not including
725 * a NULL termination byte) is returned in outlen. In-place conversions
 * are not supported (i.e. instr != outstr).
 *
728 * FLAGS
729 * UTF_DECOMPOSED: output string will be fully decomposed (NFD)
730 *
731 * UTF_PRECOMPOSED: output string will be precomposed (NFC)
732 *
733 * UTF_NO_NULL_TERM: do not add null termination to output string
734 *
735 * UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
736 *
737 * ERRORS
738 * ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
739 *
740 * EINVAL: illegal UTF-8 sequence encountered or invalid flags
741 */
742int
743utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
744 size_t *outlen, size_t buflen, int flags)
745{
746 u_int16_t unicodebuf[32];
747 u_int16_t* unistr = NULL;
748 size_t unicode_bytes;
749 size_t uft8_bytes;
750 size_t inbuflen;
751 u_int8_t *outbufstart, *outbufend;
752 const u_int8_t *inbufstart;
753 unsigned int byte;
754 int decompose, precompose;
755 int result = 0;
756
757 if (flags & ~(UTF_DECOMPOSED | UTF_PRECOMPOSED | UTF_NO_NULL_TERM | UTF_ESCAPE_ILLEGAL)) {
758 return (EINVAL);
759 }
760 decompose = (flags & UTF_DECOMPOSED);
761 precompose = (flags & UTF_PRECOMPOSED);
762 if ((decompose && precompose) || (!decompose && !precompose)) {
763 return (EINVAL);
764 }
765 outbufstart = outstr;
766 outbufend = outbufstart + buflen;
767 inbufstart = instr;
768 inbuflen = inlen;
769
770 while (inlen-- > 0 && (byte = *instr++) != '\0') {
771 if (outstr >= outbufend) {
772 result = ENAMETOOLONG;
773 goto exit;
774 }
775 if (byte >= 0x80) {
776 goto nonASCII;
777 }
778 /* ASCII is already normalized. */
779 *outstr++ = byte;
780 }
781exit:
782 *outlen = outstr - outbufstart;
783 if (((flags & UTF_NO_NULL_TERM) == 0)) {
784 if (outstr < outbufend)
785 *outstr++ = '\0';
786 else
787 result = ENAMETOOLONG;
788 }
789 return (result);
790
791
792 /*
793 * Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
794 * functions to perform the normalization. Since this will
795 * presumably be used to normalize filenames in the back-end
796 * (on disk or over-the-wire), it should be fast enough.
797 */
798nonASCII:
799
800 /* Make sure the input size is reasonable. */
801 if (inbuflen > MAXPATHLEN) {
802 result = ENAMETOOLONG;
803 goto exit;
804 }
805 /*
806 * Compute worst case Unicode buffer size.
807 *
808 * For pre-composed output, every UTF-8 input byte will be at
809 * most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
810 * (smallest composite char sequence) may yield 6 Unicode bytes
811 * (1 base char + 2 combining chars).
812 */
813 unicode_bytes = precompose ? (inbuflen * 2) : (inbuflen * 3);
814
815 if (unicode_bytes <= sizeof(unicodebuf))
816 unistr = &unicodebuf[0];
817 else
818 MALLOC(unistr, uint16_t *, unicode_bytes, M_TEMP, M_WAITOK);
819
820 /* Normalize the string. */
821 result = utf8_decodestr(inbufstart, inbuflen, unistr, &unicode_bytes,
822 unicode_bytes, 0, flags & ~UTF_NO_NULL_TERM);
823 if (result == 0) {
824 /* Put results back into UTF-8. */
825 result = utf8_encodestr(unistr, unicode_bytes, outbufstart,
826 &uft8_bytes, buflen, 0, UTF_NO_NULL_TERM);
827 outstr = outbufstart + uft8_bytes;
828 }
829 if (unistr && unistr != &unicodebuf[0]) {
830 FREE(unistr, M_TEMP);
831 }
832 goto exit;
833}
834
835
836 /*
837 * Unicode 3.2 decomposition code (derived from Core Foundation)
838 */
839
/* One entry of a key-sorted lookup table with 32-bit keys and values. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * getmappedvalue32 - binary search a table sorted by ascending _key
 *
 * theTable   table to search
 * numElem    number of entries in theTable
 * character  key to look for
 *
 * Returns the _value stored for 'character', or 0 when the key is not
 * present or the table is empty.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
    u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/*
	 * An empty table has no first/last entry to compare against;
	 * without this check numElem - 1 wraps around and the range test
	 * below reads far outside the table.
	 */
	if (theTable == NULL || numElem == 0) {
		return (0);
	}

	/* Reject keys outside the table's range without searching. */
	if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
		return (0);
	}

	p = theTable;
	q = p + (numElem - 1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);   /* midpoint of [p, q] */
		if (character < divider->_key) {
			q = divider - 1;
		} else if (character > divider->_key) {
			p = divider + 1;
		} else {
			return (divider->_value);
		}
	}
	return (0);
}
864
865#define RECURSIVE_DECOMPOSITION (1 << 15)
866#define EXTRACT_COUNT(value) (((value) >> 12) & 0x0007)
867
/* One entry of a key-sorted lookup table with 16-bit keys and values. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * getmappedvalue16 - binary search a table sorted by ascending _key
 *
 * theTable   table to search
 * numElem    number of entries in theTable
 * character  key to look for
 *
 * Returns the _value stored for 'character', or 0 when the key is not
 * present or the table is empty.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
    u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/*
	 * Guard the empty table: numElem - 1 would otherwise wrap and the
	 * range test below would index far outside the table.
	 */
	if (theTable == NULL || numElem == 0) {
		return (0);
	}

	/* Reject keys outside the table's range without searching. */
	if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
		return (0);
	}

	p = theTable;
	q = p + (numElem - 1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);   /* midpoint of [p, q] */
		if (character < divider->_key) {
			q = divider - 1;
		} else if (character > divider->_key) {
			p = divider + 1;
		} else {
			return (divider->_value);
		}
	}
	return (0);
}
895
896
897static u_int32_t
898unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
899{
900 u_int16_t value;
901 u_int32_t length;
902 u_int16_t firstChar;
903 u_int16_t theChar;
904 const u_int16_t *bmpMappings;
905 u_int32_t usedLength;
906
907 value = getmappedvalue16(
908 (const unicode_mappings16 *)__CFUniCharDecompositionTable,
909 __UniCharDecompositionTableLength, character);
910 length = EXTRACT_COUNT(value);
911 firstChar = value & 0x0FFF;
912 theChar = firstChar;
913 bmpMappings = (length == 1 ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
914 usedLength = 0;
915
916 if (value & RECURSIVE_DECOMPOSITION) {
917 usedLength = unicode_recursive_decompose((u_int16_t)*bmpMappings, convertedChars);
918
919 --length; /* Decrement for the first char */
920 if (!usedLength)
921 return 0;
922 ++bmpMappings;
923 convertedChars += usedLength;
924 }
925
926 usedLength += length;
927
928 while (length--)
929 *(convertedChars++) = *(bmpMappings++);
930
931 return (usedLength);
932}
933
934#define HANGUL_SBASE 0xAC00
935#define HANGUL_LBASE 0x1100
936#define HANGUL_VBASE 0x1161
937#define HANGUL_TBASE 0x11A7
938
939#define HANGUL_SCOUNT 11172
940#define HANGUL_LCOUNT 19
941#define HANGUL_VCOUNT 21
942#define HANGUL_TCOUNT 28
943#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
944
945/*
946 * unicode_decompose - decompose a composed Unicode char
947 *
948 * Composed Unicode characters are forbidden on
949 * HFS Plus volumes. ucs_decompose will convert a
950 * composed character into its correct decomposed
951 * sequence.
952 *
953 * Similar to CFUniCharDecomposeCharacter
954 */
955static int
956unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
957{
958 if ((character >= HANGUL_SBASE) &&
959 (character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
960 u_int32_t length;
961
962 character -= HANGUL_SBASE;
963 length = (character % HANGUL_TCOUNT ? 3 : 2);
964
965 *(convertedChars++) =
966 character / HANGUL_NCOUNT + HANGUL_LBASE;
967 *(convertedChars++) =
968 (character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
969 if (length > 2)
970 *convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
971 return (length);
972 } else {
973 return (unicode_recursive_decompose(character, convertedChars));
974 }
975}
976
977/*
978 * unicode_combine - generate a precomposed Unicode char
979 *
980 * Precomposed Unicode characters are required for some volume
981 * formats and network protocols. unicode_combine will combine
982 * a decomposed character sequence into a single precomposed
983 * (composite) character.
984 *
985 * Similar toCFUniCharPrecomposeCharacter but unicode_combine
986 * also handles Hangul Jamo characters.
987 */
988static u_int16_t
989unicode_combine(u_int16_t base, u_int16_t combining)
990{
991 u_int32_t value;
992
993 /* Check HANGUL */
994 if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
995 /* 2 char Hangul sequences */
996 if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
997 (base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
998 return (HANGUL_SBASE +
999 ((base - HANGUL_LBASE)*(HANGUL_VCOUNT*HANGUL_TCOUNT)) +
1000 ((combining - HANGUL_VBASE)*HANGUL_TCOUNT));
1001 }
1002
1003 /* 3 char Hangul sequences */
1004 if ((combining > HANGUL_TBASE) &&
1005 (base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
1006 if ((base - HANGUL_SBASE) % HANGUL_TCOUNT)
1007 return (0);
1008 else
1009 return (base + (combining - HANGUL_TBASE));
1010 }
1011 }
1012
1013 value = getmappedvalue32(
1014 (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1015 __CFUniCharPrecompositionTableLength, combining);
1016
1017 if (value) {
1018 value = getmappedvalue16(
1019 (const unicode_mappings16 *)
1020 ((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & 0xFFFF)),
1021 (value >> 16), base);
1022 }
1023 return (value);
1024}
1025
1026
/*
 * prioritysort - order combining chars into canonical order
 *
 * Bubble-sorts 'count' code units in place by ascending combining
 * class.  A character whose class is 0 is never moved in front of its
 * predecessor.  Passes repeat until one completes without a swap.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
prioritysort(u_int16_t* characters, int count)
{
	int swapped;
	int i;

	do {
		u_int32_t prev_class;
		u_int32_t cur_class = get_combining_class(characters[0]);

		swapped = 0;
		for (i = 1; i < count; i++) {
			prev_class = cur_class;
			cur_class = get_combining_class(characters[i]);

			if (cur_class != 0 && prev_class > cur_class) {
				u_int16_t held = characters[i - 1];

				characters[i - 1] = characters[i];
				characters[i] = held;
				swapped = 1;

				/*
				 * The character now sitting at index i is the
				 * one that used to be at i - 1; carrying its
				 * class forward saves a lookup while it keeps
				 * bubbling towards the end.
				 */
				cur_class = prev_class;
			}
		}
	} while (swapped);
}
1071
1072
1073/*
 * Invalid NTFS filename characters are encoded using the
1075 * SFM (Services for Macintosh) private use Unicode characters.
1076 *
1077 * These should only be used for SMB, MSDOS or NTFS.
1078 *
1079 * Illegal NTFS Char SFM Unicode Char
1080 * ----------------------------------------
1081 * 0x01-0x1f 0xf001-0xf01f
1082 * '"' 0xf020
1083 * '*' 0xf021
1084 * '/' 0xf022
1085 * '<' 0xf023
1086 * '>' 0xf024
1087 * '?' 0xf025
1088 * '\' 0xf026
1089 * '|' 0xf027
1090 * ' ' 0xf028 (Only if last char of the name)
1091 * '.' 0xf029 (Only if last char of the name)
1092 * ----------------------------------------
1093 *
1094 * Reference: http://support.microsoft.com/kb/q117258/
1095 */
1096
1097#define MAX_SFM2MAC 0x29
1098#define SFMCODE_PREFIX_MASK 0xf000
1099
1100/*
1101 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
1102 * SFM had no conversion for the colon. There is a conversion for the
1103 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
1104 * is a slash and a slash is a colon. So we can just replace the slash with the
1105 * colon in our tables and everything will just work.
1106 */
/*
 * Reverse SFM table: indexed by the low six bits of an SFM private-use
 * code (0xf000 + index), it yields the ASCII character that the code
 * stands for.  Index 0x22 holds ':' (0x3a) rather than '/'.
 */
static u_int8_t
sfm2mac[] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,      /* 00 - 07 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,      /* 08 - 0F */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,      /* 10 - 17 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,      /* 18 - 1F */
	0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,      /* 20 - 27 */
	0x20, 0x2e                                           /* 28 - 29 */
};
/* Number of entries in sfm2mac[]. */
#define SFM2MAC_LEN	((sizeof(sfm2mac))/sizeof(sfm2mac[0]))
1117
/*
 * Forward SFM table for ASCII 0x20-0x7f, indexed by (character - 0x20).
 * An entry equal to its own character means "no conversion"; any other
 * entry is the low byte of the SFM code, which ucs_to_sfm() combines
 * with 0xf000.
 */
static u_int8_t
mac2sfm[] = {
	0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,      /* 20 - 27 */
	0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,      /* 28 - 2f */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,      /* 30 - 37 */
	0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,      /* 38 - 3f */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,      /* 40 - 47 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,      /* 48 - 4f */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,      /* 50 - 57 */
	0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,      /* 58 - 5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,      /* 60 - 67 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,      /* 68 - 6f */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,      /* 70 - 77 */
	0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f       /* 78 - 7f */
};
/* Number of entries in mac2sfm[]. */
#define MAC2SFM_LEN	((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
1134
1135
1136/*
1137 * Encode illegal NTFS filename characters into SFM Private Unicode characters
1138 *
1139 * Assumes non-zero ASCII input.
1140 */
1141static u_int16_t
1142ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
1143{
1144 /* The last character of filename cannot be a space or period. */
1145 if (lastchar) {
1146 if (ucs_ch == 0x20)
1147 return (0xf028);
1148 else if (ucs_ch == 0x2e)
1149 return (0xf029);
1150 }
1151 /* 0x01 - 0x1f is simple transformation. */
1152 if (ucs_ch <= 0x1f) {
1153 return (ucs_ch | 0xf000);
1154 } else /* 0x20 - 0x7f */ {
1155 u_int16_t lsb;
1156
1157 assert((ucs_ch - 0x0020) < MAC2SFM_LEN);
1158 lsb = mac2sfm[ucs_ch - 0x0020];
1159 if (lsb != ucs_ch)
1160 return(0xf000 | lsb);
1161 }
1162 return (ucs_ch);
1163}
1164
1165/*
1166 * Decode any SFM Private Unicode characters
1167 */
1168static u_int16_t
1169sfm_to_ucs(u_int16_t ucs_ch)
1170{
1171 if (((ucs_ch & 0xffC0) == SFMCODE_PREFIX_MASK) &&
1172 ((ucs_ch & 0x003f) <= MAX_SFM2MAC)) {
1173 assert((ucs_ch & 0x003f) < SFM2MAC_LEN);
1174 ucs_ch = sfm2mac[ucs_ch & 0x003f];
1175 }
1176 return (ucs_ch);
1177}
1178
1179
1180