vfs_utfconv.c source code [xnu/bsd/vfs/vfs_utfconv.c]

1	/*
2	* Copyright (c) 2000-2005 Apple Computer, Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	/*
30	* Includes Unicode 3.2 decomposition code derived from Core Foundation
31	*/
32
33	#include <sys/param.h>
34	#include <sys/utfconv.h>
35	#include <sys/errno.h>
36	#include <sys/malloc.h>
37	#include <libkern/OSByteOrder.h>
38
39	#if defined(KERNEL) && !defined(VFS_UTF8_UNIT_TEST)
40	#include <kern/assert.h>
41	#else
42	#include <assert.h>
43	#endif
44
/*
 * UTF-8 (Unicode Transformation Format)
 *
 * UTF-8 is the Unicode Transformation Format that serializes a Unicode
 * character as a sequence of one to four bytes. Only the shortest form
 * required to represent the significant Unicode bits is legal.
 *
 * UTF-8 Multibyte Codes
 *
 * Bytes  Bits  Unicode Min  Unicode Max  UTF-8 Byte Sequence (binary)
 * -----------------------------------------------------------------------------
 *   1      7     0x0000       0x007F     0xxxxxxx
 *   2     11     0x0080       0x07FF     110xxxxx 10xxxxxx
 *   3     16     0x0800       0xFFFF     1110xxxx 10xxxxxx 10xxxxxx
 *   4     21    0x10000     0x10FFFF     11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
 * -----------------------------------------------------------------------------
 */
62
63
/*
 * Number of UTF-8 bytes produced for one UTF-16 code unit: 1 below 0x0080,
 * 2 below 0x0800 and 3 otherwise, except that a code unit in the surrogate
 * range 0xD800-0xDFFF counts as 2 so that the two halves of a pair add up
 * to 4 bytes.
 */
#define UNICODE_TO_UTF8_LEN(c)  \
	((c) < 0x0080 ? 1 : ((c) < 0x0800 ? 2 : (((c) & 0xf800) == 0xd800 ? 2 : 3)))

/* Code unit substituted for an embedded NUL when encoding (and back when decoding) */
#define UCS_ALT_NULL    0x2400

/* Surrogate Pair Constants */
#define SP_HALF_SHIFT   10
#define SP_HALF_BASE    0x0010000u
#define SP_HALF_MASK    0x3FFu

#define SP_HIGH_FIRST   0xD800u
#define SP_HIGH_LAST    0xDBFFu
#define SP_LOW_FIRST    0xDC00u
#define SP_LOW_LAST     0xDFFFu
78
79
80	#include "vfs_utfconvdata.h"
81
82
83	/*
84	* Test for a combining character.
85	*
86	* Similar to __CFUniCharIsNonBaseCharacter except that
87	* unicode_combinable also includes Hangul Jamo characters.
88	*/
89	int
90	unicode_combinable(u_int16_t character)
91	{
92	const u_int8_t *bitmap = __CFUniCharCombiningBitmap;
93	u_int8_t value;
94
95	if (character < `0x0300`) {
96	return `0`;
97	}
98
99	value = bitmap[(character >> `8`) & `0xFF`];
100
101	if (value == `0xFF`) {
102	return `1`;
103	} else if (value) {
104	bitmap = bitmap + ((value - `1`) * `32`) + `256`;
105	return bitmap[(character & `0xFF`) / `8`] & (`1` << (character % `8`)) ? `1` : `0`;
106	}
107	return `0`;
108	}
109
110	/*
111	* Test for a precomposed character.
112	*
113	* Similar to __CFUniCharIsDecomposableCharacter.
114	*/
115	int
116	unicode_decomposeable(u_int16_t character)
117	{
118	const u_int8_t *bitmap = __CFUniCharDecomposableBitmap;
119	u_int8_t value;
120
121	if (character < `0x00C0`) {
122	return `0`;
123	}
124
125	value = bitmap[(character >> `8`) & `0xFF`];
126
127	if (value == `0xFF`) {
128	return `1`;
129	} else if (value) {
130	bitmap = bitmap + ((value - `1`) * `32`) + `256`;
131	return bitmap[(character & `0xFF`) / `8`] & (`1` << (character % `8`)) ? `1` : `0`;
132	}
133	return `0`;
134	}
135
136
137	/*
138	* Get the combing class.
139	*
140	* Similar to CFUniCharGetCombiningPropertyForCharacter.
141	*/
142	static inline u_int8_t
143	get_combining_class(u_int16_t character)
144	{
145	const u_int8_t *bitmap = __CFUniCharCombiningPropertyBitmap;
146
147	u_int8_t value = bitmap[(character >> `8`)];
148
149	if (value) {
150	bitmap = bitmap + (value * `256`);
151	return bitmap[character % `256`];
152	}
153	return `0`;
154	}
155
156
/* Forward declarations of file-local helpers that are defined further down. */
static int unicode_decompose(u_int16_t character, u_int16_t *convertedChars);

static u_int16_t unicode_combine(u_int16_t base, u_int16_t combining);

static void prioritysort(u_int16_t* characters, int count);

static u_int16_t  ucs_to_sfm(u_int16_t ucs_ch, int lastchar);

static u_int16_t  sfm_to_ucs(u_int16_t ucs_ch);
166
167
/*
 * Number of bytes that follow a UTF-8 lead byte, indexed by the top five
 * bits of that byte (byte >> 3). Entries are 0 for 0x00-0x7F, 1 for
 * 0xC0-0xDF, 2 for 0xE0-0xEF and 3 for 0xF0-0xF7; -1 marks bytes that
 * cannot start a sequence (0x80-0xBF and 0xF8-0xFF).
 *
 * NOTE(review): the -1 entries rely on plain char being signed on the
 * target; confirm before building for a platform where char is unsigned.
 */
char utf_extrabytes[32] = {
	0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
	-1, -1, -1, -1, -1, -1, -1, -1, 1, 1, 1, 1, 2, 2, 3, -1
};

/* Upper-case hexadecimal digits, indexed by nibble value. */
const char hexdigits[16] = {
	'0', '1', '2', '3', '4', '5', '6', '7',
	'8', '9', 'A', 'B', 'C', 'D', 'E', 'F'
};
177
178	/*
179	* utf8_encodelen - Calculate the UTF-8 encoding length
180	*
181	* This function takes a Unicode input string, ucsp, of ucslen bytes
182	* and calculates the size of the UTF-8 output in bytes (not including
183	* a NULL termination byte). The string must reside in kernel memory.
184	*
185	* If '/' chars are possible in the Unicode input then an alternate
186	* (replacement) char should be provided in altslash.
187	*
188	* FLAGS
189	* UTF_REVERSE_ENDIAN: Unicode byte order is opposite current runtime
190	*
191	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
192	*
193	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
194	*
195	* UTF_DECOMPOSED: generate fully decomposed output
196	*
197	* UTF_PRECOMPOSED is ignored since utf8_encodestr doesn't support it
198	*
199	* ERRORS
200	* None
201	*/
202	size_t
203	utf8_encodelen(const u_int16_t * ucsp, size_t ucslen, u_int16_t altslash, int flags)
204	{
205	u_int16_t ucs_ch;
206	u_int16_t * chp = NULL;
207	u_int16_t sequence[`8`];
208	int extra = `0`;
209	size_t charcnt;
210	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
211	int decompose = (flags & UTF_DECOMPOSED);
212	size_t len;
213
214	charcnt = ucslen / `2`;
215	len = `0`;
216
217	while (charcnt-- > `0`) {
218	if (extra > `0`) {
219	--extra;
220	ucs_ch = *chp++;
221	} else {
222	ucs_ch = *ucsp++;
223	if (swapbytes) {
224	ucs_ch = OSSwapInt16(ucs_ch);
225	}
226	if (ucs_ch == `'/'`) {
227	ucs_ch = altslash ? altslash : `'_'`;
228	} else if (ucs_ch == `'\0'`) {
229	ucs_ch = UCS_ALT_NULL;
230	} else if (decompose && unicode_decomposeable(character: ucs_ch)) {
231	extra = unicode_decompose(character: ucs_ch, convertedChars: sequence) - `1`;
232	charcnt += extra;
233	ucs_ch = sequence[`0`];
234	chp = &sequence[`1`];
235	}
236	}
237	len += UNICODE_TO_UTF8_LEN(ucs_ch);
238	}
239
240	return len;
241	}
242
243
244	/*
245	* utf8_encodestr - Encodes a Unicode string to UTF-8
246	*
247	* NOTES:
248	* The resulting UTF-8 string is NULL terminated.
249	*
250	* If '/' chars are allowed on disk then an alternate
251	* (replacement) char must be provided in altslash.
252	*
253	* input flags:
254	* UTF_REVERSE_ENDIAN: Unicode byteorder is opposite current runtime
255	*
256	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
257	*
258	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
259	*
260	* UTF_DECOMPOSED: generate fully decomposed output
261	*
262	* UTF_NO_NULL_TERM: don't add NULL termination to UTF-8 output
263	*
264	* result:
265	* ENAMETOOLONG: Name didn't fit; only buflen bytes were encoded
266	*
267	* EINVAL: Illegal char found; char was replaced by an '_'.
268	*/
269	int
270	utf8_encodestr(const u_int16_t * ucsp, size_t ucslen, u_int8_t * utf8p,
271	size_t * utf8len, size_t buflen, u_int16_t altslash, int flags)
272	{
273	u_int8_t * bufstart;
274	u_int8_t * bufend;
275	u_int16_t ucs_ch;
276	u_int16_t * chp = NULL;
277	u_int16_t sequence[`8`];
278	int extra = `0`;
279	size_t charcnt;
280	int swapbytes = (flags & UTF_REVERSE_ENDIAN);
281	int nullterm = ((flags & UTF_NO_NULL_TERM) == `0`);
282	int decompose = (flags & UTF_DECOMPOSED);
283	int sfmconv = (flags & UTF_SFM_CONVERSIONS);
284	int result = `0`;
285
286	bufstart = utf8p;
287	bufend = bufstart + buflen;
288	if (nullterm) {
289	--bufend;
290	}
291	charcnt = ucslen / `2`;
292
293	while (charcnt-- > `0`) {
294	if (extra > `0`) {
295	--extra;
296	ucs_ch = *chp++;
297	} else {
298	ucs_ch = swapbytes ? OSSwapInt16(ucsp++) : ucsp++;
299
300	if (decompose && unicode_decomposeable(character: ucs_ch)) {
301	extra = unicode_decompose(character: ucs_ch, convertedChars: sequence) - `1`;
302	charcnt += extra;
303	ucs_ch = sequence[`0`];
304	chp = &sequence[`1`];
305	}
306	}
307
308	/ Slash and NULL are not permitted /
309	if (ucs_ch == `'/'`) {
310	if (altslash) {
311	ucs_ch = altslash;
312	} else {
313	ucs_ch = `'_'`;
314	result = EINVAL;
315	}
316	} else if (ucs_ch == `'\0'`) {
317	ucs_ch = UCS_ALT_NULL;
318	}
319
320	if (ucs_ch < `0x0080`) {
321	if (utf8p >= bufend) {
322	result = ENAMETOOLONG;
323	break;
324	}
325	*utf8p++ = (u_int8_t)ucs_ch;
326	} else if (ucs_ch < `0x800`) {
327	if ((utf8p + `1`) >= bufend) {
328	result = ENAMETOOLONG;
329	break;
330	}
331	*utf8p++ = `0xc0` \| (u_int8_t)(ucs_ch >> `6`);
332	*utf8p++ = `0x80` \| (`0x3f` & ucs_ch);
333	} else {
334	/ These chars never valid Unicode. /
335	if (ucs_ch == `0xFFFE` \|\| ucs_ch == `0xFFFF`) {
336	result = EINVAL;
337	break;
338	}
339
340	/ Combine valid surrogate pairs /
341	if (ucs_ch >= SP_HIGH_FIRST && ucs_ch <= SP_HIGH_LAST
342	&& charcnt > `0`) {
343	u_int16_t ch2;
344	u_int32_t pair;
345
346	ch2 = swapbytes ? OSSwapInt16(ucsp) : ucsp;
347	if (ch2 >= SP_LOW_FIRST && ch2 <= SP_LOW_LAST) {
348	pair = ((ucs_ch - SP_HIGH_FIRST) << SP_HALF_SHIFT)
349	+ (ch2 - SP_LOW_FIRST) + SP_HALF_BASE;
350	if ((utf8p + `3`) >= bufend) {
351	result = ENAMETOOLONG;
352	break;
353	}
354	--charcnt;
355	++ucsp;
356	*utf8p++ = `0xf0` \| (u_int8_t)(pair >> `18`);
357	*utf8p++ = `0x80` \| (`0x3f` & (pair >> `12`));
358	*utf8p++ = `0x80` \| (`0x3f` & (pair >> `6`));
359	*utf8p++ = `0x80` \| (`0x3f` & pair);
360	continue;
361	}
362	} else if (sfmconv) {
363	ucs_ch = sfm_to_ucs(ucs_ch);
364	if (ucs_ch < `0x0080`) {
365	if (utf8p >= bufend) {
366	result = ENAMETOOLONG;
367	break;
368	}
369	*utf8p++ = (u_int8_t)ucs_ch;
370	continue;
371	}
372	}
373	if ((utf8p + `2`) >= bufend) {
374	result = ENAMETOOLONG;
375	break;
376	}
377	*utf8p++ = `0xe0` \| (ucs_ch >> `12`);
378	*utf8p++ = `0x80` \| (`0x3f` & (ucs_ch >> `6`));
379	*utf8p++ = `0x80` \| (`0x3f` & ucs_ch);
380	}
381	}
382
383	*utf8len = utf8p - bufstart;
384	if (nullterm) {
385	*utf8p++ = `'\0'`;
386	}
387
388	return result;
389	}
390
/*
 * Pushes a character taking account of combining character sequences.
 *
 * ucs_ch is stored at *ucsp and *ucsp is advanced by one. *combcharcnt
 * counts the consecutive combinable characters already stored; when a
 * non-combinable character ends a run of more than one, the run is put
 * into canonical order with prioritysort before the new character is
 * stored.
 */
static void
push(uint16_t ucs_ch, int *combcharcnt, uint16_t **ucsp)
{
	/*
	 * Make multiple combining character sequences canonical
	 */
	if (unicode_combinable(ucs_ch)) {
		++*combcharcnt;         /* start tracking a run */
	} else if (*combcharcnt) {
		if (*combcharcnt > 1) {
			prioritysort(*ucsp - *combcharcnt, *combcharcnt);
		}
		*combcharcnt = 0;       /* start over */
	}

	*(*ucsp)++ = ucs_ch;
}
409
410	/*
411	* utf8_decodestr - Decodes a UTF-8 string back to Unicode
412	*
413	* NOTES:
414	* The input UTF-8 string does not need to be null terminated
415	* if utf8len is set.
416	*
417	* If '/' chars are allowed on disk then an alternate
418	* (replacement) char must be provided in altslash.
419	*
420	* input flags:
421	* UTF_REV_ENDIAN: Unicode byte order is opposite current runtime
422	*
423	* UTF_BIG_ENDIAN: Unicode byte order is always big endian
424	*
425	* UTF_LITTLE_ENDIAN: Unicode byte order is always little endian
426	*
427	* UTF_DECOMPOSED: generate fully decomposed output (NFD)
428	*
429	* UTF_PRECOMPOSED: generate precomposed output (NFC)
430	*
431	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
432	*
433	* result:
434	* ENAMETOOLONG: Name didn't fit; only ucslen chars were decoded.
435	*
436	* EINVAL: Illegal UTF-8 sequence found.
437	*/
438	int
439	utf8_decodestr(const u_int8_t* utf8p, size_t utf8len, u_int16_t* ucsp,
440	size_t ucslen, size_t buflen, u_int16_t altslash, int* flags)
441	{
442	u_int16_t* bufstart;
443	u_int16_t* bufend;
444	unsigned int ucs_ch;
445	unsigned int byte;
446	int combcharcnt = `0`;
447	int result = `0`;
448	int decompose, precompose, escaping;
449	int sfmconv;
450	int extrabytes;
451
452	decompose = (flags & UTF_DECOMPOSED);
453	precompose = (flags & UTF_PRECOMPOSED);
454	escaping = (flags & UTF_ESCAPE_ILLEGAL);
455	sfmconv = (flags & UTF_SFM_CONVERSIONS);
456
457	bufstart = ucsp;
458	bufend = (u_int16_t )((u_int8_t )ucsp + buflen);
459
460	while (utf8len-- > `0` && (byte = *utf8p++) != `'\0'`) {
461	if ((ucsp + `1`) > bufend) {
462	goto toolong;
463	}
464
465	/ check for ascii /
466	if (byte < `0x80`) {
467	ucs_ch = sfmconv ? ucs_to_sfm(ucs_ch: (u_int16_t)byte, lastchar: utf8len == `0`) : byte;
468	} else {
469	u_int32_t ch;
470
471	extrabytes = utf_extrabytes[byte >> `3`];
472	if ((extrabytes < `0`) \|\| ((int)utf8len < extrabytes)) {
473	goto escape;
474	}
475	utf8len -= extrabytes;
476
477	switch (extrabytes) {
478	case `1`:
479	ch = byte; ch <<= `6`; / 1st byte /
480	byte = utf8p++; /* 2nd byte /
481	if ((byte >> `6`) != `2`) {
482	goto escape2;
483	}
484	ch += byte;
485	ch -= `0x00003080UL`;
486	if (ch < `0x0080`) {
487	goto escape2;
488	}
489	ucs_ch = ch;
490	break;
491	case `2`:
492	ch = byte; ch <<= `6`; / 1st byte /
493	byte = utf8p++; /* 2nd byte /
494	if ((byte >> `6`) != `2`) {
495	goto escape2;
496	}
497	ch += byte; ch <<= `6`;
498	byte = utf8p++; /* 3rd byte /
499	if ((byte >> `6`) != `2`) {
500	goto escape3;
501	}
502	ch += byte;
503	ch -= `0x000E2080UL`;
504	if (ch < `0x0800`) {
505	goto escape3;
506	}
507	if (ch >= `0xD800`) {
508	if (ch <= `0xDFFF`) {
509	goto escape3;
510	}
511	if (ch == `0xFFFE` \|\| ch == `0xFFFF`) {
512	goto escape3;
513	}
514	}
515	ucs_ch = ch;
516	break;
517	case `3`:
518	ch = byte; ch <<= `6`; / 1st byte /
519	byte = utf8p++; /* 2nd byte /
520	if ((byte >> `6`) != `2`) {
521	goto escape2;
522	}
523	ch += byte; ch <<= `6`;
524	byte = utf8p++; /* 3rd byte /
525	if ((byte >> `6`) != `2`) {
526	goto escape3;
527	}
528	ch += byte; ch <<= `6`;
529	byte = utf8p++; /* 4th byte /
530	if ((byte >> `6`) != `2`) {
531	goto escape4;
532	}
533	ch += byte;
534	ch -= `0x03C82080UL` + SP_HALF_BASE;
535	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
536	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST) {
537	goto escape4;
538	}
539	push(ucs_ch: (uint16_t)ucs_ch, combcharcnt: &combcharcnt, ucsp: &ucsp);
540	if (ucsp >= bufend) {
541	goto toolong;
542	}
543	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
544	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST) {
545	--ucsp;
546	goto escape4;
547	}
548	*ucsp++ = (u_int16_t)ucs_ch;
549	continue;
550	default:
551	result = EINVAL;
552	goto exit;
553	}
554	if (decompose) {
555	if (unicode_decomposeable(character: (u_int16_t)ucs_ch)) {
556	u_int16_t sequence[`8`] = {`0`};
557	int count, i;
558
559	count = unicode_decompose(character: (u_int16_t)ucs_ch, convertedChars: sequence);
560
561	for (i = `0`; i < count; ++i) {
562	if (ucsp >= bufend) {
563	goto toolong;
564	}
565
566	push(ucs_ch: sequence[i], combcharcnt: &combcharcnt, ucsp: &ucsp);
567	}
568
569	continue;
570	}
571	} else if (precompose && (ucsp != bufstart)) {
572	u_int16_t composite, base;
573
574	if (unicode_combinable(character: (u_int16_t)ucs_ch)) {
575	base = ucsp[-`1`];
576	composite = unicode_combine(base, combining: (u_int16_t)ucs_ch);
577	if (composite) {
578	--ucsp;
579	ucs_ch = composite;
580	}
581	}
582	}
583	if (ucs_ch == UCS_ALT_NULL) {
584	ucs_ch = `'\0'`;
585	}
586	}
587	if (ucs_ch == altslash) {
588	ucs_ch = `'/'`;
589	}
590
591	push(ucs_ch: (u_int16_t)ucs_ch, combcharcnt: &combcharcnt, ucsp: &ucsp);
592	continue;
593
594	/*
595	* Escape illegal UTF-8 into something legal.
596	*/
597	escape4:
598	utf8p -= `3`;
599	goto escape;
600	escape3:
601	utf8p -= `2`;
602	goto escape;
603	escape2:
604	utf8p -= `1`;
605	escape:
606	if (!escaping) {
607	result = EINVAL;
608	goto exit;
609	}
610	if (extrabytes > `0`) {
611	utf8len += extrabytes;
612	}
613	byte = *(utf8p - `1`);
614
615	if ((ucsp + `2`) >= bufend) {
616	goto toolong;
617	}
618
619	/ Make a previous combining sequence canonical. /
620	if (combcharcnt > `1`) {
621	prioritysort(characters: ucsp - combcharcnt, count: combcharcnt);
622	}
623	combcharcnt = `0`;
624
625	ucs_ch = `'%'`;
626	*ucsp++ = (u_int16_t)ucs_ch;
627	ucs_ch = hexdigits[byte >> `4`];
628	*ucsp++ = (u_int16_t)ucs_ch;
629	ucs_ch = hexdigits[byte & `0x0F`];
630	*ucsp++ = (u_int16_t)ucs_ch;
631	}
632	/*
633	* Make a previous combining sequence canonical
634	*/
635	if (combcharcnt > `1`) {
636	prioritysort(characters: ucsp - combcharcnt, count: combcharcnt);
637	}
638
639	if (flags & UTF_REVERSE_ENDIAN) {
640	uint16_t *p = bufstart;
641	while (p < ucsp) {
642	p = OSSwapInt16(p);
643	++p;
644	}
645	}
646
647	exit:
648	ucslen = (u_int8_t)ucsp - (u_int8_t*)bufstart;
649
650	return result;
651
652	toolong:
653	result = ENAMETOOLONG;
654	goto exit;
655	}
656
657
658	/*
659	* utf8_validatestr - Check for a valid UTF-8 string.
660	*/
661	int
662	utf8_validatestr(const u_int8_t* utf8p, size_t utf8len)
663	{
664	unsigned int byte;
665	u_int32_t ch;
666	unsigned int ucs_ch;
667	size_t extrabytes;
668
669	while (utf8len-- > `0` && (byte = *utf8p++) != `'\0'`) {
670	if (byte < `0x80`) {
671	continue; / plain ascii /
672	}
673	extrabytes = utf_extrabytes[byte >> `3`];
674
675	if (utf8len < extrabytes) {
676	goto invalid;
677	}
678	utf8len -= extrabytes;
679
680	switch (extrabytes) {
681	case `1`:
682	ch = byte; ch <<= `6`; / 1st byte /
683	byte = utf8p++; /* 2nd byte /
684	if ((byte >> `6`) != `2`) {
685	goto invalid;
686	}
687	ch += byte;
688	ch -= `0x00003080UL`;
689	if (ch < `0x0080`) {
690	goto invalid;
691	}
692	break;
693	case `2`:
694	ch = byte; ch <<= `6`; / 1st byte /
695	byte = utf8p++; /* 2nd byte /
696	if ((byte >> `6`) != `2`) {
697	goto invalid;
698	}
699	ch += byte; ch <<= `6`;
700	byte = utf8p++; /* 3rd byte /
701	if ((byte >> `6`) != `2`) {
702	goto invalid;
703	}
704	ch += byte;
705	ch -= `0x000E2080UL`;
706	if (ch < `0x0800`) {
707	goto invalid;
708	}
709	if (ch >= `0xD800`) {
710	if (ch <= `0xDFFF`) {
711	goto invalid;
712	}
713	if (ch == `0xFFFE` \|\| ch == `0xFFFF`) {
714	goto invalid;
715	}
716	}
717	break;
718	case `3`:
719	ch = byte; ch <<= `6`; / 1st byte /
720	byte = utf8p++; /* 2nd byte /
721	if ((byte >> `6`) != `2`) {
722	goto invalid;
723	}
724	ch += byte; ch <<= `6`;
725	byte = utf8p++; /* 3rd byte /
726	if ((byte >> `6`) != `2`) {
727	goto invalid;
728	}
729	ch += byte; ch <<= `6`;
730	byte = utf8p++; /* 4th byte /
731	if ((byte >> `6`) != `2`) {
732	goto invalid;
733	}
734	ch += byte;
735	ch -= `0x03C82080UL` + SP_HALF_BASE;
736	ucs_ch = (ch >> SP_HALF_SHIFT) + SP_HIGH_FIRST;
737	if (ucs_ch < SP_HIGH_FIRST \|\| ucs_ch > SP_HIGH_LAST) {
738	goto invalid;
739	}
740	ucs_ch = (ch & SP_HALF_MASK) + SP_LOW_FIRST;
741	if (ucs_ch < SP_LOW_FIRST \|\| ucs_ch > SP_LOW_LAST) {
742	goto invalid;
743	}
744	break;
745	default:
746	goto invalid;
747	}
748	}
749	return `0`;
750	invalid:
751	return EINVAL;
752	}
753
754	/*
755	* utf8_normalizestr - Normalize a UTF-8 string (NFC or NFD)
756	*
757	* This function takes an UTF-8 input string, instr, of inlen bytes
758	* and produces normalized UTF-8 output into a buffer of buflen bytes
759	* pointed to by outstr. The size of the output in bytes (not including
760	* a NULL termination byte) is returned in outlen. In-place conversions
761	* are not supported (i.e. instr != outstr).]
762	*
763	* FLAGS
764	* UTF_DECOMPOSED: output string will be fully decomposed (NFD)
765	*
766	* UTF_PRECOMPOSED: output string will be precomposed (NFC)
767	*
768	* UTF_NO_NULL_TERM: do not add null termination to output string
769	*
770	* UTF_ESCAPE_ILLEGAL: percent escape any illegal UTF-8 input
771	*
772	* ERRORS
773	* ENAMETOOLONG: output did not fit or input exceeded MAXPATHLEN bytes
774	*
775	* EINVAL: illegal UTF-8 sequence encountered or invalid flags
776	*/
777	int
778	utf8_normalizestr(const u_int8_t* instr, size_t inlen, u_int8_t* outstr,
779	size_t outlen, size_t buflen, int* flags)
780	{
781	u_int16_t unicodebuf[`32`];
782	u_int16_t* unistr = NULL;
783	size_t unicode_bytes;
784	size_t uft8_bytes;
785	size_t inbuflen;
786	u_int8_t outbufstart, outbufend;
787	const u_int8_t *inbufstart;
788	unsigned int byte;
789	int decompose, precompose;
790	int result = `0`;
791
792	if (flags & ~(UTF_DECOMPOSED \| UTF_PRECOMPOSED \| UTF_NO_NULL_TERM \| UTF_ESCAPE_ILLEGAL)) {
793	return EINVAL;
794	}
795	decompose = (flags & UTF_DECOMPOSED);
796	precompose = (flags & UTF_PRECOMPOSED);
797	if ((decompose && precompose) \|\| (!decompose && !precompose)) {
798	return EINVAL;
799	}
800	outbufstart = outstr;
801	outbufend = outbufstart + buflen;
802	inbufstart = instr;
803	inbuflen = inlen;
804
805	while (inlen-- > `0` && (byte = *instr++) != `'\0'`) {
806	if (outstr >= outbufend) {
807	result = ENAMETOOLONG;
808	goto exit;
809	}
810	if (byte >= `0x80`) {
811	goto nonASCII;
812	}
813	/ ASCII is already normalized. /
814	*outstr++ = (u_int8_t)byte;
815	}
816	exit:
817	*outlen = outstr - outbufstart;
818	if (((flags & UTF_NO_NULL_TERM) == `0`)) {
819	if (outstr < outbufend) {
820	*outstr++ = `'\0'`;
821	} else {
822	result = ENAMETOOLONG;
823	}
824	}
825	return result;
826
827
828	/*
829	* Non-ASCII uses the existing utf8_encodestr/utf8_decodestr
830	* functions to perform the normalization. Since this will
831	* presumably be used to normalize filenames in the back-end
832	* (on disk or over-the-wire), it should be fast enough.
833	*/
834	nonASCII:
835
836	/ Make sure the input size is reasonable. /
837	if (inbuflen > MAXPATHLEN) {
838	result = ENAMETOOLONG;
839	goto exit;
840	}
841	/*
842	* Compute worst case Unicode buffer size.
843	*
844	* For pre-composed output, every UTF-8 input byte will be at
845	* most 2 Unicode bytes. For decomposed output, 2 UTF-8 bytes
846	* (smallest composite char sequence) may yield 6 Unicode bytes
847	* (1 base char + 2 combining chars).
848	*/
849	unicode_bytes = precompose ? (inbuflen * `2`) : (inbuflen * `3`);
850
851	if (unicode_bytes <= sizeof(unicodebuf)) {
852	unistr = &unicodebuf[`0`];
853	} else {
854	unistr = kalloc_data(unicode_bytes, Z_WAITOK);
855	}
856
857	/ Normalize the string. /
858	result = utf8_decodestr(utf8p: inbufstart, utf8len: inbuflen, ucsp: unistr, ucslen: &unicode_bytes,
859	buflen: unicode_bytes, altslash: `0`, flags: flags & ~UTF_NO_NULL_TERM);
860	if (result == `0`) {
861	/ Put results back into UTF-8. /
862	result = utf8_encodestr(ucsp: unistr, ucslen: unicode_bytes, utf8p: outbufstart,
863	utf8len: &uft8_bytes, buflen, altslash: `0`, UTF_NO_NULL_TERM);
864	outstr = outbufstart + uft8_bytes;
865	}
866	if (unistr && unistr != &unicodebuf[`0`]) {
867	kfree_data(unistr, unicode_bytes);
868	}
869	goto exit;
870	}
871
872
873	/*
874	* Unicode 3.2 decomposition code (derived from Core Foundation)
875	*/
876
/* One entry of a key-sorted lookup table with 32-bit keys and values. */
typedef struct {
	u_int32_t _key;
	u_int32_t _value;
} unicode_mappings32;

/*
 * Binary search theTable (numElem entries, sorted by ascending _key) for
 * character. Returns the matching _value, or 0 when the table is empty or
 * holds no entry for the character.
 */
static inline u_int32_t
getmappedvalue32(const unicode_mappings32 *theTable, u_int32_t numElem,
    u_int16_t character)
{
	const unicode_mappings32 *p, *q, *divider;

	/* An empty table has no first or last entry to range-check against. */
	if (numElem == 0) {
		return 0;
	}

	if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
		return 0;
	}

	p = theTable;
	q = p + (numElem - 1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);   /* divide by 2 */
		if (character < divider->_key) {
			q = divider - 1;
		} else if (character > divider->_key) {
			p = divider + 1;
		} else {
			return divider->_value;
		}
	}
	return 0;
}
906
/*
 * Fields of a 16-bit decomposition table value: bit 15 flags a recursive
 * decomposition and bits 12-14 hold the number of characters. The low
 * 12 bits are read separately by unicode_recursive_decompose.
 */
#define RECURSIVE_DECOMPOSITION (1 << 15)
#define EXTRACT_COUNT(value)    (((value) >> 12) & 0x0007)
909
/* One entry of a key-sorted lookup table with 16-bit keys and values. */
typedef struct {
	u_int16_t _key;
	u_int16_t _value;
} unicode_mappings16;

/*
 * Binary search theTable (numElem entries, sorted by ascending _key) for
 * character. Returns the matching _value, or 0 when the table is empty or
 * holds no entry for the character.
 */
static inline u_int16_t
getmappedvalue16(const unicode_mappings16 *theTable, u_int32_t numElem,
    u_int16_t character)
{
	const unicode_mappings16 *p, *q, *divider;

	/* An empty table has no first or last entry to range-check against. */
	if (numElem == 0) {
		return 0;
	}

	if ((character < theTable[0]._key) || (character > theTable[numElem - 1]._key)) {
		return 0;
	}

	p = theTable;
	q = p + (numElem - 1);
	while (p <= q) {
		divider = p + ((q - p) >> 1);   /* divide by 2 */
		if (character < divider->_key) {
			q = divider - 1;
		} else if (character > divider->_key) {
			p = divider + 1;
		} else {
			return divider->_value;
		}
	}
	return 0;
}
939
940
941	static u_int32_t
942	unicode_recursive_decompose(u_int16_t character, u_int16_t *convertedChars)
943	{
944	u_int16_t value;
945	u_int32_t length;
946	u_int16_t firstChar;
947	u_int16_t theChar;
948	const u_int16_t *bmpMappings;
949	u_int32_t usedLength;
950
951	value = getmappedvalue16(
952	theTable: (const unicode_mappings16 *)__CFUniCharDecompositionTable,
953	numElem: __UniCharDecompositionTableLength, character);
954	length = EXTRACT_COUNT(value);
955	firstChar = value & `0x0FFF`;
956	theChar = firstChar;
957	bmpMappings = (length == `1` ? &theChar : __CFUniCharMultipleDecompositionTable + firstChar);
958	usedLength = `0`;
959
960	if (value & RECURSIVE_DECOMPOSITION) {
961	usedLength = unicode_recursive_decompose(character: (u_int16_t)*bmpMappings, convertedChars);
962
963	--length; / Decrement for the first char /
964	if (!usedLength) {
965	return `0`;
966	}
967	++bmpMappings;
968	convertedChars += usedLength;
969	}
970
971	usedLength += length;
972
973	while (length--) {
974	(convertedChars++) = (bmpMappings++);
975	}
976
977	return usedLength;
978	}
979
/*
 * Hangul constants: first code point of the precomposed syllable block
 * (S) and of the leading consonant (L), vowel (V) and trailing consonant
 * (T) jamo ranges, followed by the size of each range.
 */
#define HANGUL_SBASE 0xAC00
#define HANGUL_LBASE 0x1100
#define HANGUL_VBASE 0x1161
#define HANGUL_TBASE 0x11A7

#define HANGUL_SCOUNT 11172
#define HANGUL_LCOUNT 19
#define HANGUL_VCOUNT 21
#define HANGUL_TCOUNT 28
#define HANGUL_NCOUNT (HANGUL_VCOUNT * HANGUL_TCOUNT)
990
991	/*
992	* unicode_decompose - decompose a composed Unicode char
993	*
994	* Composed Unicode characters are forbidden on
995	* HFS Plus volumes. ucs_decompose will convert a
996	* composed character into its correct decomposed
997	* sequence.
998	*
999	* Similar to CFUniCharDecomposeCharacter
1000	*/
1001	static int
1002	unicode_decompose(u_int16_t character, u_int16_t *convertedChars)
1003	{
1004	if ((character >= HANGUL_SBASE) &&
1005	(character <= (HANGUL_SBASE + HANGUL_SCOUNT))) {
1006	u_int32_t length;
1007
1008	character -= HANGUL_SBASE;
1009	length = (character % HANGUL_TCOUNT ? `3` : `2`);
1010
1011	*(convertedChars++) =
1012	character / HANGUL_NCOUNT + HANGUL_LBASE;
1013	*(convertedChars++) =
1014	(character % HANGUL_NCOUNT) / HANGUL_TCOUNT + HANGUL_VBASE;
1015	if (length > `2`) {
1016	*convertedChars = (character % HANGUL_TCOUNT) + HANGUL_TBASE;
1017	}
1018	return length;
1019	} else {
1020	return unicode_recursive_decompose(character, convertedChars);
1021	}
1022	}
1023
1024	/*
1025	* unicode_combine - generate a precomposed Unicode char
1026	*
1027	* Precomposed Unicode characters are required for some volume
1028	* formats and network protocols. unicode_combine will combine
1029	* a decomposed character sequence into a single precomposed
1030	* (composite) character.
1031	*
1032	* Similar toCFUniCharPrecomposeCharacter but unicode_combine
1033	* also handles Hangul Jamo characters.
1034	*/
1035	static u_int16_t
1036	unicode_combine(u_int16_t base, u_int16_t combining)
1037	{
1038	u_int32_t value;
1039
1040	/ Check HANGUL /
1041	if ((combining >= HANGUL_VBASE) && (combining < (HANGUL_TBASE + HANGUL_TCOUNT))) {
1042	/ 2 char Hangul sequences /
1043	if ((combining < (HANGUL_VBASE + HANGUL_VCOUNT)) &&
1044	(base >= HANGUL_LBASE && base < (HANGUL_LBASE + HANGUL_LCOUNT))) {
1045	return HANGUL_SBASE +
1046	((base - HANGUL_LBASE) * (HANGUL_VCOUNT * HANGUL_TCOUNT)) +
1047	((combining - HANGUL_VBASE) * HANGUL_TCOUNT);
1048	}
1049
1050	/ 3 char Hangul sequences /
1051	if ((combining > HANGUL_TBASE) &&
1052	(base >= HANGUL_SBASE && base < (HANGUL_SBASE + HANGUL_SCOUNT))) {
1053	if ((base - HANGUL_SBASE) % HANGUL_TCOUNT) {
1054	return `0`;
1055	} else {
1056	return base + (combining - HANGUL_TBASE);
1057	}
1058	}
1059	}
1060
1061	value = getmappedvalue32(
1062	theTable: (const unicode_mappings32 *)__CFUniCharPrecompSourceTable,
1063	numElem: __CFUniCharPrecompositionTableLength, character: combining);
1064
1065	if (value) {
1066	value = getmappedvalue16(
1067	theTable: (const unicode_mappings16 *)
1068	((const u_int32_t *)__CFUniCharBMPPrecompDestinationTable + (value & `0xFFFF`)),
1069	numElem: (value >> `16`), character: base);
1070	}
1071	return (u_int16_t)value;
1072	}
1073
1074
/*
 * prioritysort - order combining chars into canonical order
 *
 * Bubble-sorts the count characters at characters in place by ascending
 * combining class. A character whose class is 0 is never moved ahead of
 * its predecessor.
 *
 * Similar to CFUniCharPrioritySort
 */
static void
prioritysort(u_int16_t* characters, int count)
{
	u_int32_t p1, p2;
	u_int16_t *ch1, *ch2;
	u_int16_t *end;
	int changes = 0;

	/* Fewer than two characters are already in order. */
	if (count < 2) {
		return;
	}

	end = characters + count;
	do {
		changes = 0;
		ch1 = characters;
		ch2 = characters + 1;
		p2 = get_combining_class(*ch1);
		while (ch2 < end) {
			p1 = p2;
			p2 = get_combining_class(*ch2);
			if (p1 > p2 && p2 != 0) {
				u_int16_t tmp;

				tmp = *ch1;
				*ch1 = *ch2;
				*ch2 = tmp;
				changes = 1;

				/*
				 * Make sure that p2 contains the combining class for the
				 * character now stored at *ch2.  This isn't required for
				 * correctness, but it will be more efficient if a character
				 * with a large combining class has to "bubble past" several
				 * characters with lower combining classes.
				 */
				p2 = p1;
			}
			++ch1;
			++ch2;
		}
	} while (changes);
}
1119
1120
/*
 * Invalid NTFS filename characters are encoded using the
 * SFM (Services for Macintosh) private use Unicode characters.
 *
 * These should only be used for SMB, MSDOS or NTFS.
 *
 *    Illegal NTFS Char   SFM Unicode Char
 *  ----------------------------------------
 *    0x01-0x1f           0xf001-0xf01f
 *    '"'                 0xf020
 *    '*'                 0xf021
 *    '/'                 0xf022
 *    '<'                 0xf023
 *    '>'                 0xf024
 *    '?'                 0xf025
 *    '\'                 0xf026
 *    '|'                 0xf027
 *    ' '                 0xf028  (Only if last char of the name)
 *    '.'                 0xf029  (Only if last char of the name)
 *  ----------------------------------------
 *
 *  Reference: http://support.microsoft.com/kb/q117258/
 */

/* Largest low-byte value that has an SFM mapping, and the SFM code prefix. */
#define MAX_SFM2MAC           0x29
#define SFMCODE_PREFIX_MASK   0xf000
1147
/*
 * In the Mac OS 9 days the colon was illegal in a file name. For that reason
 * SFM had no conversion for the colon. There is a conversion for the
 * slash. In Mac OS X the slash is illegal in a file name. So for us the colon
 * is a slash and a slash is a colon. So we can just replace the slash with the
 * colon in our tables and everything will just work.
 */

/* Maps the low six bits of an SFM code (0x00-0x29) to the ASCII character. */
static const u_int8_t
    sfm2mac[] = {
	0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07,     /* 00 - 07 */
	0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f,     /* 08 - 0F */
	0x10, 0x11, 0x12, 0x13, 0x14, 0x15, 0x16, 0x17,     /* 10 - 17 */
	0x18, 0x19, 0x1a, 0x1b, 0x1c, 0x1d, 0x1e, 0x1f,     /* 18 - 1F */
	0x22, 0x2a, 0x3a, 0x3c, 0x3e, 0x3f, 0x5c, 0x7c,     /* 20 - 27 */
	0x20, 0x2e                                          /* 28 - 29 */
};
#define SFM2MAC_LEN     ((sizeof(sfm2mac))/sizeof(sfm2mac[0]))

/*
 * Maps an ASCII character 0x20-0x7f (index = character - 0x20) to the low
 * byte of its SFM code. An entry equal to the character itself means the
 * character needs no conversion.
 */
static const u_int8_t
    mac2sfm[] = {
	0x20, 0x21, 0x20, 0x23, 0x24, 0x25, 0x26, 0x27,     /* 20 - 27 */
	0x28, 0x29, 0x21, 0x2b, 0x2c, 0x2d, 0x2e, 0x22,     /* 28 - 2f */
	0x30, 0x31, 0x32, 0x33, 0x34, 0x35, 0x36, 0x37,     /* 30 - 37 */
	0x38, 0x39, 0x22, 0x3b, 0x23, 0x3d, 0x24, 0x25,     /* 38 - 3f */
	0x40, 0x41, 0x42, 0x43, 0x44, 0x45, 0x46, 0x47,     /* 40 - 47 */
	0x48, 0x49, 0x4a, 0x4b, 0x4c, 0x4d, 0x4e, 0x4f,     /* 48 - 4f */
	0x50, 0x51, 0x52, 0x53, 0x54, 0x55, 0x56, 0x57,     /* 50 - 57 */
	0x58, 0x59, 0x5a, 0x5b, 0x26, 0x5d, 0x5e, 0x5f,     /* 58 - 5f */
	0x60, 0x61, 0x62, 0x63, 0x64, 0x65, 0x66, 0x67,     /* 60 - 67 */
	0x68, 0x69, 0x6a, 0x6b, 0x6c, 0x6d, 0x6e, 0x6f,     /* 68 - 6f */
	0x70, 0x71, 0x72, 0x73, 0x74, 0x75, 0x76, 0x77,     /* 70 - 77 */
	0x78, 0x79, 0x7a, 0x7b, 0x27, 0x7d, 0x7e, 0x7f      /* 78 - 7f */
};
#define MAC2SFM_LEN     ((sizeof(mac2sfm))/sizeof(mac2sfm[0]))
1182
1183
1184	/*
1185	* Encode illegal NTFS filename characters into SFM Private Unicode characters
1186	*
1187	* Assumes non-zero ASCII input.
1188	*/
1189	static u_int16_t
1190	ucs_to_sfm(u_int16_t ucs_ch, int lastchar)
1191	{
1192	/ The last character of filename cannot be a space or period. /
1193	if (lastchar) {
1194	if (ucs_ch == `0x20`) {
1195	return `0xf028`;
1196	} else if (ucs_ch == `0x2e`) {
1197	return `0xf029`;
1198	}
1199	}
1200	/ 0x01 - 0x1f is simple transformation. /
1201	if (ucs_ch <= `0x1f`) {
1202	return ucs_ch \| `0xf000`;
1203	} else { / 0x20 - 0x7f /
1204	u_int16_t lsb;
1205
1206	assert((ucs_ch - `0x0020`) < MAC2SFM_LEN);
1207	lsb = mac2sfm[ucs_ch - `0x0020`];
1208	if (lsb != ucs_ch) {
1209	return `0xf000` \| lsb;
1210	}
1211	}
1212	return ucs_ch;
1213	}
1214
1215	/*
1216	* Decode any SFM Private Unicode characters
1217	*/
1218	static u_int16_t
1219	sfm_to_ucs(u_int16_t ucs_ch)
1220	{
1221	if (((ucs_ch & `0xffC0`) == SFMCODE_PREFIX_MASK) &&
1222	((ucs_ch & `0x003f`) <= MAX_SFM2MAC)) {
1223	assert((ucs_ch & `0x003f`) < SFM2MAC_LEN);
1224	ucs_ch = sfm2mac[ucs_ch & `0x003f`];
1225	}
1226	return ucs_ch;
1227	}
1228

Browse the source code of xnu/bsd/vfs/vfs_utfconv.c