/*
 * Copyright (c) 2017-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

#ifndef _SKYWALK_COMMON_H_
#define _SKYWALK_COMMON_H_

#if defined(PRIVATE) || defined(BSD_KERNEL_PRIVATE)
/*
 * Routines common to kernel and userland. This file is intended to
 * be included by the Skywalk kernel and libsyscall code.
 */

#include <skywalk/os_skywalk_private.h>

#ifndef KERNEL
#if defined(LIBSYSCALL_INTERFACE)
__BEGIN_DECLS
extern int fprintf_stderr(const char *format, ...);
__END_DECLS

/* CSTYLED */

#define SK_ABORT(msg) do {						\
	(void) fprintf_stderr("%s\n", msg);				\
	__asm__(""); __builtin_trap();					\
} while (0)

#define SK_ABORT_WITH_CAUSE(msg, cause) do {				\
	(void) fprintf_stderr("%s: cause 0x%x\n", msg, cause);		\
	__asm__(""); __builtin_trap();					\
} while (0)

#define SK_ABORT_DYNAMIC(msg)	SK_ABORT(msg)

#define VERIFY(EX) do {							\
	if (__improbable(!(EX))) {					\
		SK_ABORT("assertion failed: " #EX);			\
		/* NOTREACHED */					\
		__builtin_unreachable();				\
	}								\
} while (0)

#if (DEBUG || DEVELOPMENT)
#define ASSERT(EX)	VERIFY(EX)
#else /* !DEBUG && !DEVELOPMENT */
#define ASSERT(EX)	((void)0)
#endif /* !DEBUG && !DEVELOPMENT */
#endif /* !LIBSYSCALL_INTERFACE */
#endif /* !KERNEL */

#ifndef container_of
#define container_of(ptr, type, member) \
	((type*)(((uintptr_t)ptr) - offsetof(type, member)))
#endif

/*
 * Prefetch.
 */
#define SK_PREFETCH(a, n) \
	__builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 0, 3)
#define SK_PREFETCHW(a, n) \
	__builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 1, 3)
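
/*
 * Illustrative usage sketch (not part of the original header): prefetch
 * the start of a buffer for read, and the next cache line for write;
 * "buf" is a hypothetical pointer and 64 a hypothetical stride.
 *
 *	SK_PREFETCH(buf, 0);
 *	SK_PREFETCHW(buf, 64);
 */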

/*
 * Slower roundup function; use when "align" is not a power of 2
 * (otherwise use P2ROUNDUP).
 */
#define SK_ROUNDUP(x, align) \
	((((x) % (align)) == 0) ? (x) : ((x) + ((align) - ((x) % (align)))))
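
/*
 * Worked example (assumed values): SK_ROUNDUP(10, 6) yields 12, and
 * SK_ROUNDUP(12, 6) yields 12 since 12 is already a multiple of 6.
 * For power-of-2 alignments such as 8, prefer P2ROUNDUP.
 */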

/* compile time assert */
#ifndef _CASSERT
#define _CASSERT(x) _Static_assert(x, "compile-time assertion failed")
#endif /* !_CASSERT */

/* power of 2 address alignment */
#ifndef IS_P2ALIGNED
#define IS_P2ALIGNED(v, a) \
	((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0)
#endif /* IS_P2ALIGNED */
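
/*
 * Worked example (assumed values): IS_P2ALIGNED(0x1000, 64) is true,
 * while IS_P2ALIGNED(0x1004, 8) is false; "a" must be a power of 2
 * for the mask test to be meaningful.
 */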

#define __sk_aligned(a)	__attribute__((__aligned__(a)))
#define __sk_packed	__attribute__((__packed__))
#define __sk_unused	__attribute__((__unused__))

#ifdef KERNEL
#include <sys/sdt.h>

/*
 * Copy 8-bytes total, 64-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_8(uint64_t *src, uint64_t *dst)
{
	*dst = *src;            /* [#0*8] */
}

/*
 * Copy 8-bytes total, 32-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy32_8(uint32_t *__counted_by(2)src, uint32_t *__counted_by(2)dst)
{
#if defined(__x86_64__)
	/* use unaligned scalar move on x86_64 */
	__sk_copy64_8((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
#else
	dst[0] = src[0];        /* dw[0] */
	dst[1] = src[1];        /* dw[1] */
#endif
}

/*
 * Copy 16-bytes total, 64-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_16(uint64_t *__counted_by(2)src, uint64_t *__counted_by(2)dst)
{
	dst[0] = src[0];        /* [#0*8] */
	dst[1] = src[1];        /* [#1*8] */
}

/*
 * Copy 16-bytes total, 32-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy32_16(uint32_t *__counted_by(4)src, uint32_t *__counted_by(4)dst)
{
	dst[0] = src[0];        /* [#0*4] */
	dst[1] = src[1];        /* [#1*4] */
	dst[2] = src[2];        /* [#2*4] */
	dst[3] = src[3];        /* [#3*4] */
}

/*
 * Copy 20-bytes total, 64-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_20(uint64_t *__sized_by(20)src, uint64_t *__sized_by(20)dst)
{
	dst[0] = src[0];        /* [#0*8] */
	dst[1] = src[1];        /* [#1*8] */
	*(uint32_t *)(dst + 2) = *(uint32_t *)(src + 2); /* [#2*4] */
}

/*
 * Copy 24-bytes total, 64-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_24(uint64_t *__counted_by(3)src, uint64_t *__counted_by(3)dst)
{
	dst[0] = src[0];        /* [#0*8] */
	dst[1] = src[1];        /* [#1*8] */
	dst[2] = src[2];        /* [#2*8] */
}

/*
 * Copy 32-bytes total, 64-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_32(uint64_t *__counted_by(4)src, uint64_t *__counted_by(4)dst)
{
	dst[0] = src[0];        /* [#0*8] */
	dst[1] = src[1];        /* [#1*8] */
	dst[2] = src[2];        /* [#2*8] */
	dst[3] = src[3];        /* [#3*8] */
}

/*
 * Copy 32-bytes total, 32-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy32_32(uint32_t *__counted_by(8)src, uint32_t *__counted_by(8)dst)
{
	dst[0] = src[0];        /* [#0*4] */
	dst[1] = src[1];        /* [#1*4] */
	dst[2] = src[2];        /* [#2*4] */
	dst[3] = src[3];        /* [#3*4] */
	dst[4] = src[4];        /* [#4*4] */
	dst[5] = src[5];        /* [#5*4] */
	dst[6] = src[6];        /* [#6*4] */
	dst[7] = src[7];        /* [#7*4] */
}

/*
 * Copy 40-bytes total, 64-bit aligned, scalar.
 */
__attribute__((always_inline))
static inline void
__sk_copy64_40(uint64_t *__counted_by(5)src, uint64_t *__counted_by(5)dst)
{
	dst[0] = src[0];        /* [#0*8] */
	dst[1] = src[1];        /* [#1*8] */
	dst[2] = src[2];        /* [#2*8] */
	dst[3] = src[3];        /* [#3*8] */
	dst[4] = src[4];        /* [#4*8] */
}

#if defined(__arm64__)
/*
 * Copy 16-bytes total, 64-bit aligned, SIMD (if available).
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_16(uint64_t *src, uint64_t *dst)
{
	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldr q0, [%[src]]		\n\t"
		"str q0, [%[dst]]		\n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "memory"
	);
	/* END CSTYLED */
}

/*
 * Copy 16-bytes total, 32-bit aligned, SIMD (if available).
 */
__attribute__((always_inline))
static inline void
__sk_vcopy32_16(uint32_t *src, uint32_t *dst)
{
	/* use SIMD unaligned move on arm64 */
	__sk_vcopy64_16((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
}

/*
 * Copy 20-bytes total, 64-bit aligned, SIMD (if available).
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_20(uint64_t *src, uint64_t *dst)
{
	/*
	 * Load/store 16 + 4 bytes;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldr q0, [%[src]]		\n\t"
		"str q0, [%[dst]]		\n\t"
		"ldr s0, [%[src], #16]		\n\t"
		"str s0, [%[dst], #16]		\n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "memory"
	);
	/* END CSTYLED */
}

/*
 * Copy 24-bytes total, 64-bit aligned, SIMD (if available).
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_24(uint64_t *src, uint64_t *dst)
{
	/*
	 * Use 16-bytes load/store and 8-bytes load/store on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldr q0, [%[src]]		\n\t"
		"str q0, [%[dst]]		\n\t"
		"ldr d0, [%[src], #16]		\n\t"
		"str d0, [%[dst], #16]		\n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "memory"
	);
	/* END CSTYLED */
}

/*
 * Copy 32-bytes total, 64-bit aligned, SIMD (if available).
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_32(uint64_t *src, uint64_t *dst)
{
	/* no need to save/restore registers on arm64 (SPILL_REGISTERS) */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldp q0, q1, [%[src]]		\n\t"
		"stp q0, q1, [%[dst]]		\n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "v1", "memory"
	);
	/* END CSTYLED */
}

/*
 * Copy 32-bytes total, 32-bit aligned, SIMD (if available).
 */
__attribute__((always_inline))
static inline void
__sk_vcopy32_32(uint32_t *__counted_by(8)src, uint32_t *__counted_by(8)dst)
{
	/* use SIMD unaligned move on arm64 */
	__sk_vcopy64_32((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
}

/*
 * Copy 40-bytes total, 64-bit aligned, SIMD (if available).
 */
__attribute__((always_inline))
static inline void
__sk_vcopy64_40(uint64_t *src, uint64_t *dst)
{
	/*
	 * Use 32-bytes load/store pair and 8-bytes load/store on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"ldp q0, q1, [%[src]]		\n\t"
		"stp q0, q1, [%[dst]]		\n\t"
		"ldr d0, [%[src], #32]		\n\t"
		"str d0, [%[dst], #32]		\n\t"
		:
		: [src] "r" (src), [dst] "r" (dst)
		: "v0", "v1", "memory"
	);
	/* END CSTYLED */
}

/*
 * On arm64, the following inline assembly fixed-length routines take
 * fewer clock cycles than bzero(). We can directly use vector registers
 * without saving/restoring them, unlike on x86_64/arm32.
 */

/*
 * Zero 16-bytes total, SIMD.
 */
__attribute__((always_inline))
static inline void
__sk_zero_16(void *p)
{
	/*
	 * Use 16-bytes store pair using 64-bit zero register on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"stp xzr, xzr, [%[p]]		\n\t"
		:
		: [p] "r" (p)
		: "memory"
	);
	/* END CSTYLED */
}

/*
 * Zero 32-bytes total, SIMD.
 */
__attribute__((always_inline))
static inline void
__sk_zero_32(void *p)
{
	/*
	 * Use 32-bytes store pair using zeroed v0 register on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"eor.16b v0, v0, v0		\n\t"
		"stp q0, q0, [%[p]]		\n\t"
		:
		: [p] "r" (p)
		: "v0", "memory", "cc"
	);
	/* END CSTYLED */
}

/*
 * Zero 48-bytes total, SIMD.
 */
__attribute__((always_inline))
static inline void
__sk_zero_48(void *p)
{
	/*
	 * Use 32-bytes store pair and 16-byte store using zeroed v0
	 * register on arm64; no need to save/restore registers on
	 * arm64 (SPILL_REGISTERS).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"eor.16b v0, v0, v0		\n\t"
		"stp q0, q0, [%[p]]		\n\t"
		"str q0, [%[p], #32]		\n\t"
		:
		: [p] "r" (p)
		: "v0", "memory", "cc"
	);
	/* END CSTYLED */
}

/*
 * Zero 128-bytes total, SIMD.
 */
__attribute__((always_inline))
static inline void
__sk_zero_128(void *p)
{
	/*
	 * Use 4x 32-bytes store pairs using zeroed v0 register on arm64;
	 * no need to save/restore registers on arm64 (SPILL_REGISTERS).
	 *
	 * Note that we could optimize this routine by utilizing "dc zva"
	 * which zeroes the entire cache line. However, that requires
	 * us to guarantee that the address is cache line aligned which
	 * we cannot (at the moment).
	 */
	/* BEGIN CSTYLED */
	__asm__ __volatile__ (
		"eor.16b v0, v0, v0		\n\t"
		"stp q0, q0, [%[p]]		\n\t"
		"stp q0, q0, [%[p], #32]	\n\t"
		"stp q0, q0, [%[p], #64]	\n\t"
		"stp q0, q0, [%[p], #96]	\n\t"
		:
		: [p] "r" (p)
		: "v0", "memory", "cc"
	);
	/* END CSTYLED */
}
#else /* !__arm64__ */
/*
 * Just use bzero() for simplicity. On x86_64, the "rep stosb" microcoded
 * implementation already uses wider stores and can go much faster than
 * one byte per clock cycle. For arm32, bzero() is also good enough.
 */
#define __sk_zero_16(_p)	bzero(_p, 16)
#define __sk_zero_32(_p)	bzero(_p, 32)
#define __sk_zero_48(_p)	bzero(_p, 48)
#define __sk_zero_128(_p)	bzero(_p, 128)
#endif /* !__arm64__ */

/*
 * The following are optimized routines which rely on the caller
 * rounding up the source and destination buffers to multiples of
 * 4, 8 or 64 bytes, and on both buffers being 64-bit aligned; they
 * are faster than memcpy().
 *
 * Note: they do not support overlapping ranges.
 */
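
/*
 * Illustrative usage sketch (hypothetical caller, not from the original
 * header): copy a payload whose buffers were allocated rounded up to a
 * multiple of 8 bytes and are 64-bit aligned; "src_buf", "dst_buf" and
 * "payload_len" are assumed names.
 *
 *	size_t l = P2ROUNDUP(payload_len, 8);
 *	sk_copy64_8x((uint64_t *)(void *)src_buf,
 *	    (uint64_t *)(void *)dst_buf, l);
 */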

/*
 * Threshold as to when we use memcpy() rather than unrolled copy.
 */
#if defined(__x86_64__)
#define SK_COPY_THRES	2048
#elif defined(__arm64__)
#define SK_COPY_THRES	1024
#else /* !__x86_64__ && !__arm64__ */
#define SK_COPY_THRES	1024
#endif /* !__x86_64__ && !__arm64__ */

#if (DEVELOPMENT || DEBUG)
extern size_t sk_copy_thres;
#endif /* (DEVELOPMENT || DEBUG) */

/*
 * Scalar version, 4-bytes multiple.
 */
__attribute__((always_inline))
static inline void
sk_copy64_4x(uint32_t *__sized_by(l)src, uint32_t *__sized_by(l)dst, size_t l)
{
#if (DEVELOPMENT || DEBUG)
	if (__probable(l <= sk_copy_thres)) {
#else
	if (__probable(l <= SK_COPY_THRES)) {
#endif /* !DEVELOPMENT && !DEBUG */
		int i;

		for (i = 0; i < l / 4; i++) {
			dst[i] = src[i];        /* [#i*4] */
		}
	} else {
		(void) memcpy((void *)dst, (void *)src, l);
	}
}

/*
 * Scalar version, 8-bytes multiple.
 */
__attribute__((always_inline))
static inline void
sk_copy64_8x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l)
{
#if (DEVELOPMENT || DEBUG)
	if (__probable(l <= sk_copy_thres)) {
#else
	if (__probable(l <= SK_COPY_THRES)) {
#endif /* !DEVELOPMENT && !DEBUG */
		int i;

		for (i = 0; i < l / 8; i++) {
			dst[i] = src[i];        /* [#i*8] */
		}
	} else {
		(void) memcpy((void *)dst, (void *)src, l);
	}
}

/*
 * Scalar version (usually faster than SIMD), 32-bytes multiple.
 */
__attribute__((always_inline))
static inline void
sk_copy64_32x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l)
{
#if (DEVELOPMENT || DEBUG)
	if (__probable(l <= sk_copy_thres)) {
#else
	if (__probable(l <= SK_COPY_THRES)) {
#endif /* !DEVELOPMENT && !DEBUG */
		int n, i;

		for (n = 0; n < l / 32; n++) {
			i = n * 4;
			dst[i] = src[i];                /* [#(i+0)*8] */
			dst[i + 1] = src[i + 1];        /* [#(i+1)*8] */
			dst[i + 2] = src[i + 2];        /* [#(i+2)*8] */
			dst[i + 3] = src[i + 3];        /* [#(i+3)*8] */
		}
	} else {
		(void) memcpy((void *)dst, (void *)src, l);
	}
}

/*
 * Scalar version (usually faster than SIMD), 64-bytes multiple.
 */
__attribute__((always_inline))
static inline void
sk_copy64_64x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l)
{
#if (DEVELOPMENT || DEBUG)
	if (__probable(l <= sk_copy_thres)) {
#else
	if (__probable(l <= SK_COPY_THRES)) {
#endif /* !DEVELOPMENT && !DEBUG */
		int n, i;

		for (n = 0; n < l / 64; n++) {
			i = n * 8;
			dst[i] = src[i];                /* [#(i+0)*8] */
			dst[i + 1] = src[i + 1];        /* [#(i+1)*8] */
			dst[i + 2] = src[i + 2];        /* [#(i+2)*8] */
			dst[i + 3] = src[i + 3];        /* [#(i+3)*8] */
			dst[i + 4] = src[i + 4];        /* [#(i+4)*8] */
			dst[i + 5] = src[i + 5];        /* [#(i+5)*8] */
			dst[i + 6] = src[i + 6];        /* [#(i+6)*8] */
			dst[i + 7] = src[i + 7];        /* [#(i+7)*8] */
		}
	} else {
		(void) memcpy((void *)dst, (void *)src, l);
	}
}

/*
 * Use scalar or SIMD based on platform/size.
 */
#if defined(__x86_64__)
#define sk_copy64_8	__sk_copy64_8   /* scalar only */
#define sk_copy32_8	__sk_copy32_8   /* scalar only */
#define sk_copy64_16	__sk_copy64_16  /* scalar */
#define sk_copy32_16	__sk_copy32_16  /* scalar */
#define sk_copy64_20	__sk_copy64_20  /* scalar */
#define sk_copy64_24	__sk_copy64_24  /* scalar */
#define sk_copy64_32	__sk_copy64_32  /* scalar */
#define sk_copy32_32	__sk_copy32_32  /* scalar */
#define sk_copy64_40	__sk_copy64_40  /* scalar */
#define sk_zero_16	__sk_zero_16    /* scalar */
#define sk_zero_32	__sk_zero_32    /* scalar */
#define sk_zero_48	__sk_zero_48    /* scalar */
#define sk_zero_128	__sk_zero_128   /* scalar */
#elif defined(__arm64__)
#define sk_copy64_8	__sk_copy64_8   /* scalar only */
#define sk_copy32_8	__sk_copy32_8   /* scalar only */
#define sk_copy64_16	__sk_vcopy64_16 /* SIMD */
#define sk_copy32_16	__sk_vcopy32_16 /* SIMD */
#define sk_copy64_20	__sk_vcopy64_20 /* SIMD */
#define sk_copy64_24	__sk_vcopy64_24 /* SIMD */
#define sk_copy64_32	__sk_vcopy64_32 /* SIMD */
#define sk_copy32_32	__sk_vcopy32_32 /* SIMD */
#define sk_copy64_40	__sk_vcopy64_40 /* SIMD */
#define sk_zero_16	__sk_zero_16    /* SIMD */
#define sk_zero_32	__sk_zero_32    /* SIMD */
#define sk_zero_48	__sk_zero_48    /* SIMD */
#define sk_zero_128	__sk_zero_128   /* SIMD */
#else
#define sk_copy64_8	__sk_copy64_8   /* scalar only */
#define sk_copy32_8	__sk_copy32_8   /* scalar only */
#define sk_copy64_16	__sk_copy64_16  /* scalar */
#define sk_copy32_16	__sk_copy32_16  /* scalar */
#define sk_copy64_20	__sk_copy64_20  /* scalar */
#define sk_copy64_24	__sk_copy64_24  /* scalar */
#define sk_copy64_32	__sk_copy64_32  /* scalar */
#define sk_copy32_32	__sk_copy32_32  /* scalar */
#define sk_copy64_40	__sk_copy64_40  /* scalar */
#define sk_zero_16	__sk_zero_16    /* scalar */
#define sk_zero_32	__sk_zero_32    /* scalar */
#define sk_zero_48	__sk_zero_48    /* scalar */
#define sk_zero_128	__sk_zero_128   /* scalar */
#endif
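
/*
 * Illustrative usage sketch (hypothetical 32-byte, 64-bit aligned
 * metadata structs "src_meta" and "dst_meta"; not from the original
 * header):
 *
 *	sk_copy64_32((uint64_t *)(void *)&src_meta,
 *	    (uint64_t *)(void *)&dst_meta);
 *
 * On arm64 this expands to the SIMD variant, elsewhere to the scalar one.
 */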

/*
 * Do not use these directly. Use the sk_ wrappers below, or the skn_
 * variants if you need custom probe names.
 */
#define _sk_alloc_type(probename, type, flags, name)			\
({									\
	void *ret;							\
									\
	/* XXX Modify this to use KT_PRIV_ACCT later */			\
	ret = kalloc_type_tag(type, Z_ZERO | (flags), (name)->tag);	\
	DTRACE_SKYWALK3(probename, char *, #type, int, (flags),	\
	    void *, ret);						\
	ret;								\
})

#define _sk_alloc_type_array(probename, type, count, flags, name)	\
({									\
	void *ret;							\
									\
	ret = kalloc_type_tag(type, (count), Z_ZERO | (flags),		\
	    (name)->tag);						\
	DTRACE_SKYWALK4(probename, char *, #type, size_t, (count),	\
	    int, (flags), void *, ret);					\
	ret;								\
})

#define _sk_alloc_type_hash(probename, heap, size, flags, name)		\
({									\
	void *ret;							\
									\
	ret = kalloc_type_var_impl((heap), (size),			\
	    __zone_flags_mix_tag((flags) | Z_ZERO, (name)->tag), NULL);	\
	DTRACE_SKYWALK4(probename, char *, (heap)->kt_name + 5,	\
	    size_t, (size), int, (flags), void *, ret);			\
	ret;								\
})

#define _sk_realloc_type_array(probename, type, oldcount, newcount, elem, flags, name) \
({									\
	void *ret;							\
									\
	ret = krealloc_type_tag(type, (oldcount), (newcount), (elem),	\
	    Z_ZERO | (flags), (name)->tag);				\
	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldcount),	\
	    size_t, (newcount), int, (flags), void *, ret);		\
	ret;								\
})

#define _sk_alloc_type_header_array(probename, htype, type, count, flags, name) \
({									\
	void *ret;							\
									\
	ret = kalloc_type_tag(htype, type, (count), Z_ZERO | (flags),	\
	    (name)->tag);						\
	DTRACE_SKYWALK5(probename, char *, #htype, char *, #type,	\
	    size_t, (count), int, (flags), void *, ret);		\
	ret;								\
})

#define _sk_free_type(probename, type, elem)				\
{									\
	DTRACE_SKYWALK2(probename, char *, #type, void *, (elem));	\
	kfree_type(type, (elem));					\
}

#define _sk_free_type_array(probename, type, count, elem)		\
{									\
	DTRACE_SKYWALK3(probename, char *, #type, size_t, (count),	\
	    void *, (elem));						\
	kfree_type(type, (count), (elem));				\
}

#define _sk_free_type_hash(probename, heap, size, elem)			\
{									\
	DTRACE_SKYWALK3(probename, char *, (heap)->kt_name + 5,	\
	    size_t, (size), void *, (elem));				\
	kfree_type_var_impl((heap), (elem), (size));			\
}

#define _sk_free_type_header_array(probename, htype, type, count, elem) \
{									\
	DTRACE_SKYWALK4(probename, char *, #htype, char *, #type,	\
	    size_t, (count), void *, (elem));				\
	kfree_type(htype, type, (count), (elem));			\
}

#define _sk_alloc_data(probename, size, flags, name)			\
({									\
	void *ret;							\
									\
	ret = kalloc_data_tag((size), Z_ZERO | (flags), (name)->tag);	\
	DTRACE_SKYWALK3(probename, size_t, (size), int, (flags),	\
	    void *, ret);						\
	ret;								\
})

#define _sk_realloc_data(probename, elem, oldsize, newsize, flags, name) \
({									\
	void *ret;							\
									\
	ret = krealloc_data_tag((elem), (oldsize), (newsize),		\
	    Z_ZERO | (flags), (name)->tag);				\
	DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldsize),	\
	    size_t, (newsize), int, (flags), void *, ret);		\
	ret;								\
})

#define _sk_free_data(probename, elem, size)				\
{									\
	DTRACE_SKYWALK2(probename, void *, (elem), size_t, (size));	\
	kfree_data((elem), (size));					\
}

#define sk_alloc_type(type, flags, tag)					\
	_sk_alloc_type(sk_alloc_type, type, flags, tag)

#define sk_alloc_type_array(type, count, flags, tag)			\
	_sk_alloc_type_array(sk_alloc_type_array, type, count, flags, tag)

#define sk_alloc_type_hash(heap, size, flags, tag)			\
	_sk_alloc_type_hash(sk_alloc_type_hash, heap, size, flags, tag)

#define sk_alloc_type_header_array(htype, type, count, flags, tag)	\
	_sk_alloc_type_header_array(sk_alloc_type_header_array, htype,	\
	    type, count, flags, tag)

#define sk_realloc_type_array(type, oldsize, newsize, elem, flags, tag)	\
	_sk_realloc_type_array(sk_realloc_type_array, type,		\
	    oldsize, newsize, elem, flags, tag)

#define sk_free_type(type, elem)					\
	_sk_free_type(sk_free_type, type, elem)

#define sk_free_type_array(type, count, elem)				\
	_sk_free_type_array(sk_free_type_array, type, count, elem)

#define sk_free_type_hash(heap, size, elem)				\
	_sk_free_type_hash(sk_free_type_hash, heap, size, elem)

#define sk_free_type_header_array(htype, type, count, elem)		\
	_sk_free_type_header_array(sk_free_type_header_array, htype,	\
	    type, count, elem)

#define sk_alloc_data(size, flags, tag)					\
	_sk_alloc_data(sk_alloc_data, size, flags, tag)

#define sk_realloc_data(elem, oldsize, newsize, flags, tag)		\
	_sk_realloc_data(sk_realloc_data, elem, oldsize, newsize,	\
	    flags, tag)

#define sk_free_data(elem, size)					\
	_sk_free_data(sk_free_data, elem, size)
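
/*
 * Illustrative usage sketch (hypothetical type "struct foo" and tag
 * "skmem_tag_foo"; not from the original header): zero-filled typed
 * allocation paired with its matching free.
 *
 *	struct foo *f = sk_alloc_type(struct foo, Z_WAITOK, skmem_tag_foo);
 *	if (f != NULL) {
 *		...
 *		sk_free_type(struct foo, f);
 *	}
 */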

/*
 * The skn_ variants are meant to be used if you need to use two or more
 * of the same call within the same function and you want the dtrace
 * probename to be different at each callsite.
 */
#define skn_realloc(name, elem, oldsize, newsize, flags, tag)		\
	_sk_realloc(sk_realloc_ ## name, elem, oldsize, newsize, flags,	\
	    tag)

#define skn_alloc_type(name, type, flags, tag)				\
	_sk_alloc_type(sk_alloc_type_ ## name, type, flags, tag)

#define skn_alloc_type_array(name, type, count, flags, tag)		\
	_sk_alloc_type_array(sk_alloc_type_array_ ## name, type, count,	\
	    flags, tag)

#define skn_alloc_type_hash(name, heap, size, flags, tag)		\
	_sk_alloc_type_hash(sk_alloc_type_hash_ ## name, heap, size,	\
	    flags, tag)

#define skn_alloc_type_header_array(name, htype, type, count, flags, tag) \
	_sk_alloc_type_header_array(sk_alloc_type_header_array_ ## name, \
	    htype, type, count, flags, tag)

#define skn_free_type(name, type, elem)					\
	_sk_free_type(sk_free_type_ ## name, type, elem)

#define skn_free_type_array(name, type, count, elem)			\
	_sk_free_type_array(sk_free_type_array_ ## name, type, count,	\
	    elem)

#define skn_free_type_hash(name, heap, size, elem)			\
	_sk_free_type_hash(sk_free_type_hash_ ## name, heap, size, elem)

#define skn_free_type_header_array(name, htype, type, count, elem)	\
	_sk_free_type_header_array(sk_free_type_header_array_ ## name,	\
	    htype, type, count, elem)

#define skn_alloc_data(name, size, flags, tag)				\
	_sk_alloc_data(sk_alloc_data_ ## name, size, flags, tag)

#define skn_realloc_data(name, elem, oldsize, newsize, flags, tag)	\
	_sk_realloc_data(sk_realloc_data_ ## name, elem, oldsize, newsize, \
	    flags, tag)

#define skn_free_data(name, elem, size)					\
	_sk_free_data(sk_free_data_ ## name, elem, size)
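
/*
 * Illustrative usage sketch (hypothetical call sites; not from the
 * original header): two allocations in the same function get distinct
 * dtrace probe names, sk_alloc_data_rx and sk_alloc_data_tx.
 *
 *	rxbuf = skn_alloc_data(rx, rxlen, Z_WAITOK, skmem_tag_buf);
 *	txbuf = skn_alloc_data(tx, txlen, Z_WAITOK, skmem_tag_buf);
 */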

struct sk_tag_spec {
	kern_allocation_name_t	*skt_var;
	const char		*skt_name;
};

extern void __sk_tag_make(const struct sk_tag_spec *spec);

#define SKMEM_TAG_DEFINE(var, name)					\
	SECURITY_READ_ONLY_LATE(kern_allocation_name_t) var;		\
	__startup_data struct sk_tag_spec __sktag_##var = {		\
		.skt_var = &var, .skt_name = name,			\
	};								\
	STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, __sk_tag_make, &__sktag_##var)
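
/*
 * Illustrative usage sketch (hypothetical tag name and string; not from
 * the original header): define a tag at startup and pass it to the
 * allocators above.
 *
 *	SKMEM_TAG_DEFINE(skmem_tag_example, "com.apple.skywalk.example");
 */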

/*!
 * @abstract Compare the n-byte buffers src1 and src2, applying the byte
 * mask to the input data before comparison. (Scalar version)
 *
 * @discussion
 * Returns zero if the two buffers are identical after applying the byte
 * mask, otherwise non-zero.
 * Zero-length buffers are always identical.
 *
 * @param src1 first input buffer, n bytes long
 * @param src2 second input buffer, n bytes long
 * @param byte_mask byte mask, n bytes long, applied before comparison
 * @param n number of bytes
 */
static inline int
__sk_memcmp_mask_scalar(const uint8_t *__counted_by(n)src1,
    const uint8_t *__counted_by(n)src2,
    const uint8_t *__counted_by(n)byte_mask, size_t n)
{
	uint32_t result = 0;
	for (size_t i = 0; i < n; i++) {
		result |= (src1[i] ^ src2[i]) & byte_mask[i];
	}
	return result;
}

static inline int
__sk_memcmp_mask_16B_scalar(const uint8_t *__counted_by(16)src1,
    const uint8_t *__counted_by(16)src2,
    const uint8_t *__counted_by(16)byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 16);
}

static inline int
__sk_memcmp_mask_32B_scalar(const uint8_t *__counted_by(32)src1,
    const uint8_t *__counted_by(32)src2,
    const uint8_t *__counted_by(32)byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 32);
}

static inline int
__sk_memcmp_mask_48B_scalar(const uint8_t *__counted_by(48)src1,
    const uint8_t *__counted_by(48)src2,
    const uint8_t *__counted_by(48)byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 48);
}

static inline int
__sk_memcmp_mask_64B_scalar(const uint8_t *__counted_by(64)src1,
    const uint8_t *__counted_by(64)src2,
    const uint8_t *__counted_by(64)byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 64);
}

static inline int
__sk_memcmp_mask_80B_scalar(const uint8_t *__counted_by(80)src1,
    const uint8_t *__counted_by(80)src2,
    const uint8_t *__counted_by(80)byte_mask)
{
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 80);
}
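
/*
 * Illustrative usage sketch (hypothetical 16-byte flow keys "key1",
 * "key2" and mask "key_mask"; not from the original header): compare
 * two keys while ignoring the bytes whose mask is zero.
 *
 *	if (__sk_memcmp_mask_16B_scalar(key1, key2, key_mask) == 0) {
 *		... keys match under the mask ...
 *	}
 */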

#if defined(__arm64__) || defined(__arm__) || defined(__x86_64__)
extern int os_memcmp_mask_16B(const uint8_t *__counted_by(16)src1,
    const uint8_t *__counted_by(16)src2,
    const uint8_t *__counted_by(16)byte_mask);
extern int os_memcmp_mask_32B(const uint8_t *__counted_by(32)src1,
    const uint8_t *__counted_by(32)src2,
    const uint8_t *__counted_by(32)byte_mask);
extern int os_memcmp_mask_48B(const uint8_t *__counted_by(48)src1,
    const uint8_t *__counted_by(48)src2,
    const uint8_t *__counted_by(48)byte_mask);
extern int os_memcmp_mask_64B(const uint8_t *__counted_by(64)src1,
    const uint8_t *__counted_by(64)src2,
    const uint8_t *__counted_by(64)byte_mask);
extern int os_memcmp_mask_80B(const uint8_t *__counted_by(80)src1,
    const uint8_t *__counted_by(80)src2,
    const uint8_t *__counted_by(80)byte_mask);

/*
 * Use the SIMD variants on arm, arm64 and x86_64.
 */
#define sk_memcmp_mask		__sk_memcmp_mask
#define sk_memcmp_mask_16B	os_memcmp_mask_16B
#define sk_memcmp_mask_32B	os_memcmp_mask_32B
#define sk_memcmp_mask_48B	os_memcmp_mask_48B
#define sk_memcmp_mask_64B	os_memcmp_mask_64B
#define sk_memcmp_mask_80B	os_memcmp_mask_80B

/*!
 * @abstract Compare the n-byte buffers src1 and src2, applying the byte
 * mask to the input data before comparison. (SIMD version)
 *
 * @discussion
 * Returns zero if the two buffers are identical after applying the byte
 * mask, otherwise non-zero.
 * Zero-length buffers are always identical.
 *
 * @param src1 first input buffer, n bytes long
 * @param src2 second input buffer, n bytes long
 * @param byte_mask byte mask, n bytes long, applied before comparison
 * @param n number of bytes
 */
static inline int
__sk_memcmp_mask(const uint8_t *__counted_by(n)src1,
    const uint8_t *__counted_by(n)src2,
    const uint8_t *__counted_by(n)byte_mask, size_t n)
{
	uint32_t result = 0;
	size_t i = 0;
	for (; i + 64 <= n; i += 64) {
		result |= sk_memcmp_mask_64B(src1 + i, src2 + i,
		    byte_mask + i);
	}
	for (; i + 32 <= n; i += 32) {
		result |= sk_memcmp_mask_32B(src1 + i, src2 + i,
		    byte_mask + i);
	}
	for (; i + 16 <= n; i += 16) {
		result |= sk_memcmp_mask_16B(src1 + i, src2 + i,
		    byte_mask + i);
	}
	if (i < n) {
		if (n >= 16) {
			/* Compare the last 16 bytes with vector code. */
			result |= sk_memcmp_mask_16B(src1 + n - 16,
			    src2 + n - 16, byte_mask + n - 16);
		} else {
			/* Use scalar code if n < 16. */
			for (; i < n; i++) {
				result |= (src1[i] ^ src2[i]) & byte_mask[i];
			}
		}
	}
	return result;
}
#else /* !(__arm64__ || __arm__ || __x86_64__) */
/*
 * Use scalar variants elsewhere.
 */
#define sk_memcmp_mask		__sk_memcmp_mask_scalar
#define sk_memcmp_mask_16B	__sk_memcmp_mask_16B_scalar
#define sk_memcmp_mask_32B	__sk_memcmp_mask_32B_scalar
#define sk_memcmp_mask_48B	__sk_memcmp_mask_48B_scalar
#define sk_memcmp_mask_64B	__sk_memcmp_mask_64B_scalar
#define sk_memcmp_mask_80B	__sk_memcmp_mask_80B_scalar
#endif /* !(__arm64__ || __arm__ || __x86_64__) */

/*
 * Scalar variants are available on all platforms if needed.
 */
#define sk_memcmp_mask_scalar		__sk_memcmp_mask_scalar
#define sk_memcmp_mask_16B_scalar	__sk_memcmp_mask_16B_scalar
#define sk_memcmp_mask_32B_scalar	__sk_memcmp_mask_32B_scalar
#define sk_memcmp_mask_48B_scalar	__sk_memcmp_mask_48B_scalar
#define sk_memcmp_mask_64B_scalar	__sk_memcmp_mask_64B_scalar
#define sk_memcmp_mask_80B_scalar	__sk_memcmp_mask_80B_scalar
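
/*
 * Illustrative usage sketch (hypothetical 40-byte headers "hdr1", "hdr2"
 * and mask "hdr_mask"; not from the original header): sk_memcmp_mask
 * accepts an arbitrary length n; on SIMD platforms it chunks the buffers
 * into 64/32/16-byte compares with an overlapped or scalar tail, and
 * elsewhere it falls back to the byte-wise scalar loop.
 *
 *	if (sk_memcmp_mask(hdr1, hdr2, hdr_mask, 40) == 0) {
 *		... headers are equal under the mask ...
 *	}
 */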

#endif /* KERNEL */
#endif /* PRIVATE || BSD_KERNEL_PRIVATE */
#endif /* !_SKYWALK_COMMON_H_ */