1 | /* |
2 | * Copyright (c) 2017-2021 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | #ifndef _SKYWALK_COMMON_H_ |
30 | #define _SKYWALK_COMMON_H_ |
31 | |
32 | #if defined(PRIVATE) || defined(BSD_KERNEL_PRIVATE) |
33 | /* |
34 | * Routines common to kernel and userland. This file is intended to |
35 | * be included by the Skywalk kernel and libsyscall code. |
36 | */ |
37 | |
38 | #include <skywalk/os_skywalk_private.h> |
39 | |
40 | #ifndef KERNEL |
41 | #if defined(LIBSYSCALL_INTERFACE) |
42 | __BEGIN_DECLS |
43 | extern int fprintf_stderr(const char *format, ...); |
44 | __END_DECLS |
45 | |
46 | /* CSTYLED */ |
47 | |
48 | #define SK_ABORT(msg) do { \ |
49 | (void) fprintf_stderr("%s\n", msg); \ |
50 | __asm__(""); __builtin_trap(); \ |
51 | } while (0) |
52 | |
53 | #define SK_ABORT_WITH_CAUSE(msg, cause) do { \ |
54 | (void) fprintf_stderr("%s: cause 0x%x\n", msg, cause); \ |
55 | __asm__(""); __builtin_trap(); \ |
56 | } while (0) |
57 | |
58 | #define SK_ABORT_DYNAMIC(msg) SK_ABORT(msg) |
59 | |
60 | |
61 | #define VERIFY(EX) do { \ |
62 | if (__improbable(!(EX))) { \ |
63 | SK_ABORT("assertion failed: " #EX); \ |
64 | /* NOTREACHED */ \ |
65 | __builtin_unreachable(); \ |
66 | } \ |
67 | } while (0) |
68 | |
69 | #if (DEBUG || DEVELOPMENT) |
70 | #define ASSERT(EX) VERIFY(EX) |
71 | #else /* !DEBUG && !DEVELOPMENT */ |
72 | #define ASSERT(EX) ((void)0) |
73 | #endif /* !DEBUG && !DEVELOPMENT */ |
#endif /* LIBSYSCALL_INTERFACE */
75 | #endif /* !KERNEL */ |
76 | |
77 | #ifndef container_of |
78 | #define container_of(ptr, type, member) \ |
79 | ((type*)(((uintptr_t)ptr) - offsetof(type, member))) |
80 | #endif |
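
/*
 * Illustrative sketch (hypothetical type and field names, not part of this
 * header): container_of() recovers a pointer to the enclosing structure
 * from a pointer to one of its members.
 *
 *	struct example_entry {
 *		uint32_t	ee_id;
 *		uint64_t	ee_link;
 *	};
 *
 *	static inline struct example_entry *
 *	example_entry_from_link(uint64_t *link)
 *	{
 *		return container_of(link, struct example_entry, ee_link);
 *	}
 */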
81 | |
82 | /* |
83 | * Prefetch. |
84 | */ |
85 | #define SK_PREFETCH(a, n) \ |
86 | __builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 0, 3) |
87 | #define SK_PREFETCHW(a, n) \ |
88 | __builtin_prefetch((const void *)((uintptr_t)(a) + (n)), 1, 3) |
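
/*
 * Illustrative usage (hypothetical pointers, not part of this header):
 * prefetch data that is about to be read or written while the current
 * element is still being processed.
 *
 *	SK_PREFETCH(next_pkt, 0);	prefetch for read, keep in cache
 *	SK_PREFETCHW(next_desc, 64);	prefetch one cache line ahead for write
 */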
89 | |
90 | /* |
 * Slower roundup; use when "align" is not a power of 2 (otherwise use P2ROUNDUP).
92 | */ |
93 | #define SK_ROUNDUP(x, align) \ |
94 | ((((x) % (align)) == 0) ? (x) : ((x) + ((align) - ((x) % (align))))) |
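
/*
 * Example: SK_ROUNDUP(10, 6) == 12 and SK_ROUNDUP(12, 6) == 12.  For a
 * power-of-2 alignment, P2ROUNDUP(10, 8) == 16 yields the same kind of
 * result using a mask instead of the two modulo operations above.
 */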
95 | |
96 | /* compile time assert */ |
97 | #ifndef _CASSERT |
98 | #define _CASSERT(x) _Static_assert(x, "compile-time assertion failed") |
99 | #endif /* !_CASSERT */ |
100 | |
101 | /* power of 2 address alignment */ |
102 | #ifndef IS_P2ALIGNED |
103 | #define IS_P2ALIGNED(v, a) \ |
104 | ((((uintptr_t)(v)) & ((uintptr_t)(a) - 1)) == 0) |
105 | #endif /* IS_P2ALIGNED */ |
106 | |
107 | #define __sk_aligned(a) __attribute__((__aligned__(a))) |
108 | #define __sk_packed __attribute__((__packed__)) |
109 | #define __sk_unused __attribute__((__unused__)) |
110 | |
111 | #ifdef KERNEL |
112 | #include <sys/sdt.h> |
113 | |
114 | /* |
115 | * Copy 8-bytes total, 64-bit aligned, scalar. |
116 | */ |
117 | __attribute__((always_inline)) |
118 | static inline void |
119 | __sk_copy64_8(uint64_t *src, uint64_t *dst) |
120 | { |
121 | *dst = *src; /* [#0*8] */ |
122 | } |
123 | |
124 | /* |
125 | * Copy 8-bytes total, 32-bit aligned, scalar. |
126 | */ |
127 | __attribute__((always_inline)) |
128 | static inline void |
129 | __sk_copy32_8(uint32_t *__counted_by(2)src, uint32_t *__counted_by(2)dst) |
130 | { |
131 | #if defined(__x86_64__) |
132 | /* use unaligned scalar move on x86_64 */ |
133 | __sk_copy64_8((uint64_t *)(void *)src, (uint64_t *)(void *)dst); |
134 | #else |
135 | dst[0] = src[0]; /* dw[0] */ |
136 | dst[1] = src[1]; /* dw[1] */ |
137 | #endif |
138 | } |
139 | |
140 | /* |
141 | * Copy 16-bytes total, 64-bit aligned, scalar. |
142 | */ |
143 | static inline void |
144 | __sk_copy64_16(uint64_t *__counted_by(2)src, uint64_t *__counted_by(2)dst) |
145 | { |
146 | dst[0] = src[0]; /* [#0*8] */ |
147 | dst[1] = src[1]; /* [#1*8] */ |
148 | } |
149 | |
150 | /* |
151 | * Copy 16-bytes total, 32-bit aligned, scalar. |
152 | */ |
153 | __attribute__((always_inline)) |
154 | static inline void |
155 | __sk_copy32_16(uint32_t *__counted_by(4)src, uint32_t *__counted_by(4)dst) |
156 | { |
157 | dst[0] = src[0]; /* [#0*4] */ |
158 | dst[1] = src[1]; /* [#1*4] */ |
159 | dst[2] = src[2]; /* [#2*4] */ |
160 | dst[3] = src[3]; /* [#3*4] */ |
161 | } |
162 | |
163 | /* |
164 | * Copy 20-bytes total, 64-bit aligned, scalar. |
165 | */ |
166 | __attribute__((always_inline)) |
167 | static inline void |
168 | __sk_copy64_20(uint64_t *__sized_by(20)src, uint64_t *__sized_by(20)dst) |
169 | { |
170 | dst[0] = src[0]; /* [#0*8] */ |
171 | dst[1] = src[1]; /* [#1*8] */ |
172 | *(uint32_t *)(dst + 2) = *(uint32_t *)(src + 2); /* [#2*4] */ |
173 | } |
174 | |
175 | /* |
176 | * Copy 24-bytes total, 64-bit aligned, scalar. |
177 | */ |
178 | __attribute__((always_inline)) |
179 | static inline void |
180 | __sk_copy64_24(uint64_t *__counted_by(3)src, uint64_t *__counted_by(3)dst) |
181 | { |
182 | dst[0] = src[0]; /* [#0*8] */ |
183 | dst[1] = src[1]; /* [#1*8] */ |
184 | dst[2] = src[2]; /* [#2*8] */ |
185 | } |
186 | |
187 | /* |
188 | * Copy 32-bytes total, 64-bit aligned, scalar. |
189 | */ |
190 | __attribute__((always_inline)) |
191 | static inline void |
192 | __sk_copy64_32(uint64_t *__counted_by(4)src, uint64_t *__counted_by(4)dst) |
193 | { |
194 | dst[0] = src[0]; /* [#0*8] */ |
195 | dst[1] = src[1]; /* [#1*8] */ |
196 | dst[2] = src[2]; /* [#2*8] */ |
197 | dst[3] = src[3]; /* [#3*8] */ |
198 | } |
199 | |
200 | /* |
201 | * Copy 32-bytes total, 32-bit aligned, scalar. |
202 | */ |
203 | __attribute__((always_inline)) |
204 | static inline void |
205 | __sk_copy32_32(uint32_t *__counted_by(8)src, uint32_t *__counted_by(8)dst) |
206 | { |
207 | dst[0] = src[0]; /* [#0*4] */ |
208 | dst[1] = src[1]; /* [#1*4] */ |
209 | dst[2] = src[2]; /* [#2*4] */ |
210 | dst[3] = src[3]; /* [#3*4] */ |
211 | dst[4] = src[4]; /* [#4*4] */ |
212 | dst[5] = src[5]; /* [#5*4] */ |
213 | dst[6] = src[6]; /* [#6*4] */ |
214 | dst[7] = src[7]; /* [#7*4] */ |
215 | } |
216 | |
217 | /* |
218 | * Copy 40-bytes total, 64-bit aligned, scalar. |
219 | */ |
220 | __attribute__((always_inline)) |
221 | static inline void |
222 | __sk_copy64_40(uint64_t *__counted_by(5)src, uint64_t *__counted_by(5)dst) |
223 | { |
224 | dst[0] = src[0]; /* [#0*8] */ |
225 | dst[1] = src[1]; /* [#1*8] */ |
226 | dst[2] = src[2]; /* [#2*8] */ |
227 | dst[3] = src[3]; /* [#3*8] */ |
228 | dst[4] = src[4]; /* [#4*8] */ |
229 | } |
230 | |
231 | #if defined(__arm64__) |
232 | /* |
233 | * Copy 16-bytes total, 64-bit aligned, SIMD (if available). |
234 | */ |
235 | __attribute__((always_inline)) |
236 | static inline void |
237 | __sk_vcopy64_16(uint64_t *src, uint64_t *dst) |
238 | { |
239 | /* no need to save/restore registers on arm64 (SPILL_REGISTERS) */ |
240 | /* BEGIN CSTYLED */ |
241 | __asm__ __volatile__ ( |
242 | "ldr q0, [%[src]] \n\t" |
243 | "str q0, [%[dst]] \n\t" |
244 | : |
245 | : [src] "r" (src), [dst] "r" (dst) |
246 | : "v0" , "memory" |
247 | ); |
248 | /* END CSTYLED */ |
249 | } |
250 | |
251 | /* |
252 | * Copy 16-bytes total, 32-bit aligned, SIMD (if available). |
253 | */ |
254 | __attribute__((always_inline)) |
255 | static inline void |
256 | __sk_vcopy32_16(uint32_t *src, uint32_t *dst) |
257 | { |
258 | /* use SIMD unaligned move on arm64 */ |
	__sk_vcopy64_16((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
260 | } |
261 | |
262 | /* |
263 | * Copy 20-bytes total, 64-bit aligned, SIMD (if available). |
264 | */ |
265 | __attribute__((always_inline)) |
266 | static inline void |
267 | __sk_vcopy64_20(uint64_t *src, uint64_t *dst) |
268 | { |
269 | /* |
270 | * Load/store 16 + 4 bytes; |
271 | * no need to save/restore registers on arm64 (SPILL_REGISTERS). |
272 | */ |
273 | /* BEGIN CSTYLED */ |
274 | __asm__ __volatile__ ( |
275 | "ldr q0, [%[src]] \n\t" |
276 | "str q0, [%[dst]] \n\t" |
277 | "ldr s0, [%[src], #16] \n\t" |
278 | "str s0, [%[dst], #16] \n\t" |
279 | : |
280 | : [src] "r" (src), [dst] "r" (dst) |
281 | : "v0" , "memory" |
282 | ); |
283 | /* END CSTYLED */ |
284 | } |
285 | |
286 | /* |
287 | * Copy 24-bytes total, 64-bit aligned, SIMD (if available). |
288 | */ |
289 | __attribute__((always_inline)) |
290 | static inline void |
291 | __sk_vcopy64_24(uint64_t *src, uint64_t *dst) |
292 | { |
293 | /* |
294 | * Use 16-bytes load/store and 8-bytes load/store on arm64; |
295 | * no need to save/restore registers on arm64 (SPILL_REGISTERS). |
296 | */ |
297 | /* BEGIN CSTYLED */ |
298 | __asm__ __volatile__ ( |
299 | "ldr q0, [%[src]] \n\t" |
300 | "str q0, [%[dst]] \n\t" |
301 | "ldr d0, [%[src], #16] \n\t" |
302 | "str d0, [%[dst], #16] \n\t" |
303 | : |
304 | : [src] "r" (src), [dst] "r" (dst) |
305 | : "v0" , "memory" |
306 | ); |
307 | /* END CSTYLED */ |
308 | } |
309 | |
310 | /* |
311 | * Copy 32-bytes total, 64-bit aligned, SIMD (if available). |
312 | */ |
313 | __attribute__((always_inline)) |
314 | static inline void |
315 | __sk_vcopy64_32(uint64_t *src, uint64_t *dst) |
316 | { |
317 | /* no need to save/restore registers on arm64 (SPILL_REGISTERS) */ |
318 | /* BEGIN CSTYLED */ |
319 | __asm__ __volatile__ ( |
320 | "ldp q0, q1, [%[src]] \n\t" |
321 | "stp q0, q1, [%[dst]] \n\t" |
322 | : |
323 | : [src] "r" (src), [dst] "r" (dst) |
324 | : "v0" , "v1" , "memory" |
325 | ); |
326 | /* END CSTYLED */ |
327 | } |
328 | |
329 | /* |
330 | * Copy 32-bytes total, 32-bit aligned, SIMD (if available). |
331 | */ |
332 | __attribute__((always_inline)) |
333 | static inline void |
334 | __sk_vcopy32_32(uint32_t *__counted_by(8)src, uint32_t *__counted_by(8)dst) |
335 | { |
336 | /* use SIMD unaligned move on arm64 */ |
	__sk_vcopy64_32((uint64_t *)(void *)src, (uint64_t *)(void *)dst);
338 | } |
339 | |
340 | /* |
341 | * Copy 40-bytes total, 64-bit aligned, SIMD (if available). |
342 | */ |
343 | __attribute__((always_inline)) |
344 | static inline void |
345 | __sk_vcopy64_40(uint64_t *src, uint64_t *dst) |
346 | { |
347 | /* |
348 | * Use 32-bytes load/store pair and 8-bytes load/store on arm64; |
349 | * no need to save/restore registers on arm64 (SPILL_REGISTERS). |
350 | */ |
351 | /* BEGIN CSTYLED */ |
352 | __asm__ __volatile__ ( |
353 | "ldp q0, q1, [%[src]] \n\t" |
354 | "stp q0, q1, [%[dst]] \n\t" |
355 | "ldr d0, [%[src], #32] \n\t" |
356 | "str d0, [%[dst], #32] \n\t" |
357 | : |
358 | : [src] "r" (src), [dst] "r" (dst) |
359 | : "v0" , "v1" , "memory" |
360 | ); |
361 | /* END CSTYLED */ |
362 | } |
363 | |
364 | /* |
 * On arm64, the following inline-assembly fixed-length routines take
 * fewer clock cycles than bzero(). Unlike on x86_64/arm32, we can use
 * vector registers directly without saving/restoring them.
368 | */ |
369 | |
370 | /* |
371 | * Zero 16-bytes total, SIMD. |
372 | */ |
373 | __attribute__((always_inline)) |
374 | static inline void |
375 | __sk_zero_16(void *p) |
376 | { |
377 | /* |
378 | * Use 16-bytes store pair using 64-bit zero register on arm64; |
379 | * no need to save/restore registers on arm64 (SPILL_REGISTERS). |
380 | */ |
381 | /* BEGIN CSTYLED */ |
382 | __asm__ __volatile__ ( |
383 | "stp xzr, xzr, [%[p]] \n\t" |
384 | : |
385 | : [p] "r" (p) |
386 | : "memory" |
387 | ); |
388 | /* END CSTYLED */ |
389 | } |
390 | |
391 | /* |
392 | * Zero 32-bytes total, SIMD. |
393 | */ |
394 | __attribute__((always_inline)) |
395 | static inline void |
396 | __sk_zero_32(void *p) |
397 | { |
398 | /* |
399 | * Use 32-bytes store pair using zeroed v0 register on arm64; |
400 | * no need to save/restore registers on arm64 (SPILL_REGISTERS). |
401 | */ |
402 | /* BEGIN CSTYLED */ |
403 | __asm__ __volatile__ ( |
404 | "eor.16b v0, v0, v0 \n\t" |
405 | "stp q0, q0, [%[p]] \n\t" |
406 | : |
407 | : [p] "r" (p) |
408 | : "v0" , "memory" , "cc" |
409 | ); |
410 | /* END CSTYLED */ |
411 | } |
412 | |
413 | /* |
414 | * Zero 48-bytes total, SIMD. |
415 | */ |
416 | __attribute__((always_inline)) |
417 | static inline void |
418 | __sk_zero_48(void *p) |
419 | { |
420 | /* |
421 | * Use 32-bytes store pair and 16-byte store using zeroed v0 |
422 | * register on arm64; no need to save/restore registers on |
423 | * arm64 (SPILL_REGISTERS). |
424 | */ |
425 | /* BEGIN CSTYLED */ |
426 | __asm__ __volatile__ ( |
427 | "eor.16b v0, v0, v0 \n\t" |
428 | "stp q0, q0, [%[p]] \n\t" |
429 | "str q0, [%[p], #32] \n\t" |
430 | : |
431 | : [p] "r" (p) |
432 | : "v0" , "memory" , "cc" |
433 | ); |
434 | /* END CSTYLED */ |
435 | } |
436 | |
437 | /* |
438 | * Zero 128-bytes total, SIMD. |
439 | */ |
440 | __attribute__((always_inline)) |
441 | static inline void |
442 | __sk_zero_128(void *p) |
443 | { |
444 | /* |
445 | * Use 4x 32-bytes store pairs using zeroed v0 register on arm64; |
446 | * no need to save/restore registers on arm64 (SPILL_REGISTERS). |
447 | * |
448 | * Note that we could optimize this routine by utilizing "dc zva" |
449 | * which zeroes the entire cache line. However, that requires |
450 | * us to guarantee that the address is cache line aligned which |
451 | * we cannot (at the moment). |
452 | */ |
453 | /* BEGIN CSTYLED */ |
454 | __asm__ __volatile__ ( |
455 | "eor.16b v0, v0, v0 \n\t" |
456 | "stp q0, q0, [%[p]] \n\t" |
457 | "stp q0, q0, [%[p], #32] \n\t" |
458 | "stp q0, q0, [%[p], #64] \n\t" |
459 | "stp q0, q0, [%[p], #96] \n\t" |
460 | : |
461 | : [p] "r" (p) |
462 | : "v0" , "memory" , "cc" |
463 | ); |
464 | /* END CSTYLED */ |
465 | } |
466 | #else /* !__arm64__ */ |
467 | /* |
468 | * Just use bzero() for simplicity. On x86_64, "rep stosb" microcoded |
469 | * implementation already uses wider stores and can go much faster than |
470 | * one byte per clock cycle. For arm32, bzero() is also good enough. |
471 | */ |
472 | #define __sk_zero_16(_p) bzero(_p, 16) |
473 | #define __sk_zero_32(_p) bzero(_p, 32) |
474 | #define __sk_zero_48(_p) bzero(_p, 48) |
475 | #define __sk_zero_128(_p) bzero(_p, 128) |
476 | #endif /* !__arm64__ */ |
477 | |
478 | /* |
 * The following optimized routines rely on the caller having rounded up
 * the source and destination buffers to multiples of 4, 8, 32 or 64
 * bytes, and on both being 64-bit aligned; they are faster than memcpy().
482 | * |
483 | * Note: they do not support overlapping ranges. |
484 | */ |
485 | |
486 | /* |
487 | * Threshold as to when we use memcpy() rather than unrolled copy. |
488 | */ |
489 | #if defined(__x86_64__) |
490 | #define SK_COPY_THRES 2048 |
491 | #elif defined(__arm64__) |
492 | #define SK_COPY_THRES 1024 |
493 | #else /* !__x86_64__ && !__arm64__ */ |
494 | #define SK_COPY_THRES 1024 |
495 | #endif /* !__x86_64__ && !__arm64__ */ |
496 | |
497 | #if (DEVELOPMENT || DEBUG) |
498 | extern size_t sk_copy_thres; |
499 | #endif /* (DEVELOPMENT || DEBUG) */ |
500 | |
501 | /* |
502 | * Scalar version, 4-bytes multiple. |
503 | */ |
504 | __attribute__((always_inline)) |
505 | static inline void |
506 | sk_copy64_4x(uint32_t *__sized_by(l)src, uint32_t *__sized_by(l)dst, size_t l) |
507 | { |
508 | #if (DEVELOPMENT || DEBUG) |
509 | if (__probable(l <= sk_copy_thres)) { |
510 | #else |
511 | if (__probable(l <= SK_COPY_THRES)) { |
#endif /* !DEVELOPMENT && !DEBUG */
513 | int i; |
514 | |
515 | for (i = 0; i < l / 4; i++) { |
516 | dst[i] = src[i]; /* [#i*4] */ |
517 | } |
518 | } else { |
		(void) memcpy((void *)dst, (void *)src, l);
520 | } |
521 | } |
522 | |
523 | /* |
524 | * Scalar version, 8-bytes multiple. |
525 | */ |
526 | __attribute__((always_inline)) |
527 | static inline void |
528 | sk_copy64_8x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l) |
529 | { |
530 | #if (DEVELOPMENT || DEBUG) |
531 | if (__probable(l <= sk_copy_thres)) { |
532 | #else |
533 | if (__probable(l <= SK_COPY_THRES)) { |
#endif /* !DEVELOPMENT && !DEBUG */
535 | int i; |
536 | |
537 | for (i = 0; i < l / 8; i++) { |
538 | dst[i] = src[i]; /* [#i*8] */ |
539 | } |
540 | } else { |
		(void) memcpy((void *)dst, (void *)src, l);
542 | } |
543 | } |
544 | |
545 | /* |
546 | * Scalar version (usually faster than SIMD), 32-bytes multiple. |
547 | */ |
548 | __attribute__((always_inline)) |
549 | static inline void |
550 | sk_copy64_32x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l) |
551 | { |
552 | #if (DEVELOPMENT || DEBUG) |
553 | if (__probable(l <= sk_copy_thres)) { |
554 | #else |
555 | if (__probable(l <= SK_COPY_THRES)) { |
#endif /* !DEVELOPMENT && !DEBUG */
557 | int n, i; |
558 | |
559 | for (n = 0; n < l / 32; n++) { |
560 | i = n * 4; |
561 | dst[i] = src[i]; /* [#(i+0)*8] */ |
562 | dst[i + 1] = src[i + 1]; /* [#(i+1)*8] */ |
563 | dst[i + 2] = src[i + 2]; /* [#(i+2)*8] */ |
564 | dst[i + 3] = src[i + 3]; /* [#(i+3)*8] */ |
565 | } |
566 | } else { |
		(void) memcpy((void *)dst, (void *)src, l);
568 | } |
569 | } |
570 | |
571 | /* |
572 | * Scalar version (usually faster than SIMD), 64-bytes multiple. |
573 | */ |
574 | __attribute__((always_inline)) |
575 | static inline void |
576 | sk_copy64_64x(uint64_t *__sized_by(l)src, uint64_t *__sized_by(l)dst, size_t l) |
577 | { |
578 | #if (DEVELOPMENT || DEBUG) |
579 | if (__probable(l <= sk_copy_thres)) { |
580 | #else |
581 | if (__probable(l <= SK_COPY_THRES)) { |
#endif /* !DEVELOPMENT && !DEBUG */
583 | int n, i; |
584 | |
585 | for (n = 0; n < l / 64; n++) { |
586 | i = n * 8; |
587 | dst[i] = src[i]; /* [#(i+0)*8] */ |
588 | dst[i + 1] = src[i + 1]; /* [#(i+1)*8] */ |
589 | dst[i + 2] = src[i + 2]; /* [#(i+2)*8] */ |
590 | dst[i + 3] = src[i + 3]; /* [#(i+3)*8] */ |
591 | dst[i + 4] = src[i + 4]; /* [#(i+4)*8] */ |
592 | dst[i + 5] = src[i + 5]; /* [#(i+5)*8] */ |
593 | dst[i + 6] = src[i + 6]; /* [#(i+6)*8] */ |
594 | dst[i + 7] = src[i + 7]; /* [#(i+7)*8] */ |
595 | } |
596 | } else { |
		(void) memcpy((void *)dst, (void *)src, l);
598 | } |
599 | } |
600 | |
601 | /* |
602 | * Use scalar or SIMD based on platform/size. |
603 | */ |
604 | #if defined(__x86_64__) |
605 | #define sk_copy64_8 __sk_copy64_8 /* scalar only */ |
606 | #define sk_copy32_8 __sk_copy32_8 /* scalar only */ |
607 | #define sk_copy64_16 __sk_copy64_16 /* scalar */ |
608 | #define sk_copy32_16 __sk_copy32_16 /* scalar */ |
609 | #define sk_copy64_20 __sk_copy64_20 /* scalar */ |
610 | #define sk_copy64_24 __sk_copy64_24 /* scalar */ |
611 | #define sk_copy64_32 __sk_copy64_32 /* scalar */ |
612 | #define sk_copy32_32 __sk_copy32_32 /* scalar */ |
613 | #define sk_copy64_40 __sk_copy64_40 /* scalar */ |
614 | #define sk_zero_16 __sk_zero_16 /* scalar */ |
615 | #define sk_zero_32 __sk_zero_32 /* scalar */ |
616 | #define sk_zero_48 __sk_zero_48 /* scalar */ |
617 | #define sk_zero_128 __sk_zero_128 /* scalar */ |
618 | #elif defined(__arm64__) |
619 | #define sk_copy64_8 __sk_copy64_8 /* scalar only */ |
620 | #define sk_copy32_8 __sk_copy32_8 /* scalar only */ |
621 | #define sk_copy64_16 __sk_vcopy64_16 /* SIMD */ |
622 | #define sk_copy32_16 __sk_vcopy32_16 /* SIMD */ |
623 | #define sk_copy64_20 __sk_vcopy64_20 /* SIMD */ |
624 | #define sk_copy64_24 __sk_vcopy64_24 /* SIMD */ |
625 | #define sk_copy64_32 __sk_vcopy64_32 /* SIMD */ |
626 | #define sk_copy32_32 __sk_vcopy32_32 /* SIMD */ |
627 | #define sk_copy64_40 __sk_vcopy64_40 /* SIMD */ |
628 | #define sk_zero_16 __sk_zero_16 /* SIMD */ |
629 | #define sk_zero_32 __sk_zero_32 /* SIMD */ |
630 | #define sk_zero_48 __sk_zero_48 /* SIMD */ |
631 | #define sk_zero_128 __sk_zero_128 /* SIMD */ |
632 | #else |
633 | #define sk_copy64_8 __sk_copy64_8 /* scalar only */ |
634 | #define sk_copy32_8 __sk_copy32_8 /* scalar only */ |
635 | #define sk_copy64_16 __sk_copy64_16 /* scalar */ |
636 | #define sk_copy32_16 __sk_copy32_16 /* scalar */ |
637 | #define sk_copy64_20 __sk_copy64_20 /* scalar */ |
638 | #define sk_copy64_24 __sk_copy64_24 /* scalar */ |
639 | #define sk_copy64_32 __sk_copy64_32 /* scalar */ |
640 | #define sk_copy32_32 __sk_copy32_32 /* scalar */ |
641 | #define sk_copy64_40 __sk_copy64_40 /* scalar */ |
642 | #define sk_zero_16 __sk_zero_16 /* scalar */ |
643 | #define sk_zero_32 __sk_zero_32 /* scalar */ |
644 | #define sk_zero_48 __sk_zero_48 /* scalar */ |
645 | #define sk_zero_128 __sk_zero_128 /* scalar */ |
646 | #endif |
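
/*
 * Illustrative sketch (hypothetical fields, not part of this header): the
 * fixed-length helpers above are intended for small structures of known
 * size, e.g. copying and scrubbing a 32-byte, 64-bit aligned header.
 *
 *	uint64_t hdr_src[4], hdr_dst[4];	32 bytes each, 8-byte aligned
 *	sk_copy64_32(hdr_src, hdr_dst);		scalar on x86_64, SIMD on arm64
 *	sk_zero_32(hdr_src);			scrub the source afterwards
 */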
647 | |
648 | /* |
649 | * Do not use these directly. |
650 | * Use the skn_ variants if you need custom probe names. |
651 | */ |
652 | #define _sk_alloc_type(probename, type, flags, name) \ |
653 | ({ \ |
654 | void *ret; \ |
655 | \ |
656 | /* XXX Modify this to use KT_PRIV_ACCT later */ \ |
657 | ret = kalloc_type_tag(type, Z_ZERO | (flags), (name)->tag); \ |
658 | DTRACE_SKYWALK3(probename, char *, #type, int, (flags), \ |
659 | void *, ret); \ |
660 | ret; \ |
661 | }) |
662 | |
663 | #define _sk_alloc_type_array(probename, type, count, flags, name) \ |
664 | ({ \ |
665 | void *ret; \ |
666 | \ |
667 | ret = kalloc_type_tag(type, (count), Z_ZERO | (flags), \ |
668 | (name)->tag); \ |
669 | DTRACE_SKYWALK4(probename, char *, #type, size_t, (count), \ |
670 | int, (flags), void *, ret); \ |
671 | ret; \ |
672 | }) |
673 | |
674 | #define _sk_alloc_type_hash(probename, heap, size, flags, name) \ |
675 | ({ \ |
676 | void *ret; \ |
677 | \ |
678 | ret = kalloc_type_var_impl((heap), (size), \ |
679 | __zone_flags_mix_tag((flags) | Z_ZERO, (name)->tag), NULL); \ |
680 | DTRACE_SKYWALK4(probename, char *, (heap)->kt_name + 5, \ |
681 | size_t, (size), int, (flags), void *, ret); \ |
682 | ret; \ |
683 | }) |
684 | |
685 | #define _sk_realloc_type_array(probename, type, oldcount, newcount, elem, flags, name) \ |
686 | ({ \ |
687 | void *ret; \ |
688 | \ |
689 | ret = krealloc_type_tag(type, (oldcount), (newcount), (elem), \ |
690 | Z_ZERO | (flags), (name)->tag); \ |
691 | DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldcount), \ |
692 | size_t, (newcount), int, (flags), void *, ret); \ |
693 | ret; \ |
694 | }) |
695 | |
#define _sk_alloc_type_header_array(probename, htype, type, count, flags, name) \
697 | ({ \ |
698 | void *ret; \ |
699 | \ |
700 | ret = kalloc_type_tag(htype, type, (count), Z_ZERO | (flags), \ |
701 | (name)->tag); \ |
702 | DTRACE_SKYWALK5(probename, char *, #htype, char *, #type, \ |
703 | size_t, (count), int, (flags), void *, ret); \ |
704 | ret; \ |
705 | }) |
706 | |
707 | #define _sk_free_type(probename, type, elem) \ |
708 | { \ |
709 | DTRACE_SKYWALK2(probename, char *, #type, void *, (elem)); \ |
710 | kfree_type(type, (elem)); \ |
711 | } |
712 | |
713 | #define _sk_free_type_array(probename, type, count, elem) \ |
714 | { \ |
715 | DTRACE_SKYWALK3(probename, char *, #type, size_t, (count), \ |
716 | void *, (elem)); \ |
717 | kfree_type(type, (count), (elem)); \ |
718 | } |
719 | |
720 | #define _sk_free_type_hash(probename, heap, size, elem) \ |
721 | { \ |
722 | DTRACE_SKYWALK3(probename, char *, (heap)->kt_name + 5, \ |
723 | size_t, (size), void *, (elem)); \ |
724 | kfree_type_var_impl((heap), (elem), (size)); \ |
725 | } |
726 | |
#define _sk_free_type_header_array(probename, htype, type, count, elem) \
728 | { \ |
729 | DTRACE_SKYWALK4(probename, char *, #htype, char *, #type, \ |
730 | size_t, (count), void *, (elem)); \ |
731 | kfree_type(htype, type, (count), (elem)); \ |
732 | } |
733 | |
734 | #define _sk_alloc_data(probename, size, flags, name) \ |
735 | ({ \ |
736 | void *ret; \ |
737 | \ |
738 | ret = kalloc_data_tag((size), Z_ZERO | (flags), (name)->tag); \ |
739 | DTRACE_SKYWALK3(probename, size_t, (size), int, (flags), \ |
740 | void *, ret); \ |
741 | ret; \ |
742 | }) |
743 | |
744 | #define _sk_realloc_data(probename, elem, oldsize, newsize, flags, name) \ |
745 | ({ \ |
746 | void *ret; \ |
747 | \ |
748 | ret = krealloc_data_tag((elem), (oldsize), (newsize), \ |
749 | Z_ZERO | (flags), (name)->tag); \ |
750 | DTRACE_SKYWALK5(probename, void *, (elem), size_t, (oldsize), \ |
751 | size_t, (newsize), int, (flags), void *, ret); \ |
752 | ret; \ |
753 | }) |
754 | |
755 | #define _sk_free_data(probename, elem, size) \ |
756 | { \ |
757 | DTRACE_SKYWALK2(probename, void *, (elem), size_t, (size)); \ |
758 | kfree_data((elem), (size)); \ |
759 | } |
760 | |
761 | #define sk_alloc_type(type, flags, tag) \ |
762 | _sk_alloc_type(sk_alloc_type, type, flags, tag) |
763 | |
764 | #define sk_alloc_type_array(type, count, flags, tag) \ |
765 | _sk_alloc_type_array(sk_alloc_type_array, type, count, flags, tag) |
766 | |
767 | #define sk_alloc_type_hash(heap, size, flags, tag) \ |
768 | _sk_alloc_type_hash(sk_alloc_type_hash, heap, size, flags, tag) |
769 | |
#define sk_alloc_type_header_array(htype, type, count, flags, tag) \
771 | _sk_alloc_type_header_array(sk_alloc_type_header_array, htype, \ |
772 | type, count, flags, tag) |
773 | |
774 | #define sk_realloc_type_array(type, oldsize, newsize, elem, flags, tag) \ |
775 | _sk_realloc_type_array(sk_realloc_type_array, type, \ |
776 | oldsize, newsize, elem, flags, tag) |
777 | |
778 | #define sk_free_type(type, elem) \ |
779 | _sk_free_type(sk_free_type, type, elem) |
780 | |
781 | #define sk_free_type_array(type, count, elem) \ |
782 | _sk_free_type_array(sk_free_type_array, type, count, elem) |
783 | |
784 | #define sk_free_type_hash(heap, size, elem) \ |
785 | _sk_free_type_hash(sk_free_type_hash, heap, size, elem) |
786 | |
#define sk_free_type_header_array(htype, type, count, elem) \
788 | _sk_free_type_header_array(sk_free_type_header_array, htype, \ |
789 | type, count, elem) |
790 | |
791 | #define sk_alloc_data(size, flags, tag) \ |
792 | _sk_alloc_data(sk_alloc_data, size, flags, tag) |
793 | |
794 | #define sk_realloc_data(elem, oldsize, newsize, flags, tag) \ |
795 | _sk_realloc_data(sk_realloc_data, elem, oldsize, newsize, \ |
796 | flags, tag) |
797 | |
798 | #define sk_free_data(elem, size) \ |
799 | _sk_free_data(sk_free_data, elem, size) |
800 | |
801 | /* |
802 | * The skn_ variants are meant to be used if you need to use two or more |
803 | * of the same call within the same function and you want the dtrace |
804 | * probename to be different at each callsite. |
805 | */ |
806 | #define skn_realloc(name, elem, oldsize, newsize, flags, tag) \ |
807 | _sk_realloc(sk_realloc_ ## name, elem, oldsize, newsize, flags, \ |
808 | tag) |
809 | |
810 | #define skn_alloc_type(name, type, flags, tag) \ |
811 | _sk_alloc_type(sk_alloc_type_ ## name, type, flags, tag) |
812 | |
813 | #define skn_alloc_type_array(name, type, count, flags, tag) \ |
814 | _sk_alloc_type_array(sk_alloc_type_array_ ## name, type, count, \ |
815 | flags, tag) |
816 | |
817 | #define skn_alloc_type_hash(name, heap, size, flags, tag) \ |
818 | _sk_alloc_type_hash(sk_alloc_type_hash_ ## name, heap, size, \ |
819 | flags, tag) |
820 | |
#define skn_alloc_type_header_array(name, htype, type, count, flags, tag) \
822 | _sk_alloc_type_header_array(sk_alloc_type_header_array_ ## name, \ |
823 | htype, type, count, flags, tag) |
824 | |
825 | #define skn_free_type(name, type, elem) \ |
826 | _sk_free_type(sk_free_type_ ## name, type, elem) |
827 | |
828 | #define skn_free_type_array(name, type, count, elem) \ |
829 | _sk_free_type_array(sk_free_type_array_ ## name, type, count, \ |
830 | elem) |
831 | |
832 | #define skn_free_type_hash(name, heap, size, elem) \ |
833 | _sk_free_type_hash(sk_free_type_hash_ ## name, heap, size, elem) |
834 | |
#define skn_free_type_header_array(name, htype, type, count, elem) \
836 | _sk_free_type_header_array(sk_free_type_header_array_ ## name, \ |
837 | htype, type, count, elem) |
838 | |
839 | #define skn_alloc_data(name, size, flags, tag) \ |
840 | _sk_alloc_data(sk_alloc_data_ ## name, size, flags, tag) |
841 | |
842 | #define skn_realloc_data(name, elem, oldsize, newsize, flags, tag) \ |
843 | _sk_realloc_data(sk_realloc_data_ ## name, elem, oldsize, newsize,\ |
844 | flags, tag) |
845 | |
846 | #define skn_free_data(name, elem, size) \ |
847 | _sk_free_data(sk_free_data_ ## name, elem, size) |
848 | |
849 | struct sk_tag_spec { |
850 | kern_allocation_name_t *skt_var; |
851 | const char *skt_name; |
852 | }; |
853 | |
854 | extern void __sk_tag_make(const struct sk_tag_spec *spec); |
855 | |
856 | #define SKMEM_TAG_DEFINE(var, name) \ |
857 | SECURITY_READ_ONLY_LATE(kern_allocation_name_t) var; \ |
858 | __startup_data struct sk_tag_spec __sktag_##var = { \ |
859 | .skt_var = &var, .skt_name = name, \ |
860 | }; \ |
861 | STARTUP_ARG(ZALLOC, STARTUP_RANK_LAST, __sk_tag_make, &__sktag_##var) |
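
/*
 * Illustrative sketch (hypothetical names, not part of this header):
 *
 *	SKMEM_TAG_DEFINE(skmem_tag_example, "com.apple.skywalk.example");
 *
 * This expands to a SECURITY_READ_ONLY_LATE kern_allocation_name_t named
 * skmem_tag_example plus a STARTUP_ARG registration that has __sk_tag_make()
 * initialize it during the ZALLOC startup phase.
 */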
862 | |
863 | /*! |
864 | * @abstract Compare byte buffers of n bytes long src1 against src2, applying |
865 | * the byte masks to input data before comparison. (Scalar version) |
866 | * |
867 | * @discussion |
868 | * Returns zero if the two buffers are identical after applying the byte |
869 | * masks, otherwise non-zero. |
870 | * Zero-length buffers are always identical. |
871 | * |
872 | * @param src1 first input buffer of n bytes long |
873 | * @param src2 second input buffer of n bytes long |
874 | * @param byte_mask byte mask of n bytes long applied before comparision |
875 | * @param n number of bytes |
876 | */ |
877 | static inline int |
878 | __sk_memcmp_mask_scalar(const uint8_t *__counted_by(n)src1, |
879 | const uint8_t *__counted_by(n)src2, |
880 | const uint8_t *__counted_by(n)byte_mask, size_t n) |
881 | { |
882 | uint32_t result = 0; |
883 | for (size_t i = 0; i < n; i++) { |
884 | result |= (src1[i] ^ src2[i]) & byte_mask[i]; |
885 | } |
886 | return result; |
887 | } |
888 | |
889 | static inline int |
890 | __sk_memcmp_mask_16B_scalar(const uint8_t *__counted_by(16)src1, |
891 | const uint8_t *__counted_by(16)src2, |
892 | const uint8_t *__counted_by(16)byte_mask) |
893 | { |
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 16);
895 | } |
896 | |
897 | static inline int |
898 | __sk_memcmp_mask_32B_scalar(const uint8_t *__counted_by(32)src1, |
899 | const uint8_t *__counted_by(32)src2, |
900 | const uint8_t *__counted_by(32)byte_mask) |
901 | { |
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 32);
903 | } |
904 | |
905 | static inline int |
906 | __sk_memcmp_mask_48B_scalar(const uint8_t *__counted_by(48)src1, |
907 | const uint8_t *__counted_by(48)src2, |
908 | const uint8_t *__counted_by(48)byte_mask) |
909 | { |
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 48);
911 | } |
912 | |
913 | static inline int |
914 | __sk_memcmp_mask_64B_scalar(const uint8_t *__counted_by(64)src1, |
915 | const uint8_t *__counted_by(64)src2, |
916 | const uint8_t *__counted_by(64)byte_mask) |
917 | { |
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 64);
919 | } |
920 | |
921 | static inline int |
922 | __sk_memcmp_mask_80B_scalar(const uint8_t *__counted_by(80)src1, |
923 | const uint8_t *__counted_by(80)src2, |
924 | const uint8_t *__counted_by(80)byte_mask) |
925 | { |
	return __sk_memcmp_mask_scalar(src1, src2, byte_mask, 80);
927 | } |
928 | |
929 | #if defined(__arm64__) || defined(__arm__) || defined(__x86_64__) |
930 | extern int os_memcmp_mask_16B(const uint8_t *__counted_by(16)src1, |
931 | const uint8_t *__counted_by(16)src2, |
932 | const uint8_t *__counted_by(16)byte_mask); |
933 | extern int os_memcmp_mask_32B(const uint8_t *__counted_by(32)src1, |
934 | const uint8_t *__counted_by(32)src2, |
935 | const uint8_t *__counted_by(32)byte_mask); |
936 | extern int os_memcmp_mask_48B(const uint8_t *__counted_by(48)src1, |
937 | const uint8_t *__counted_by(48)src2, |
938 | const uint8_t *__counted_by(48)byte_mask); |
939 | extern int os_memcmp_mask_64B(const uint8_t *__counted_by(64)src1, |
940 | const uint8_t *__counted_by(64)src2, |
941 | const uint8_t *__counted_by(64)byte_mask); |
942 | extern int os_memcmp_mask_80B(const uint8_t *__counted_by(80)src1, |
943 | const uint8_t *__counted_by(80)src2, |
944 | const uint8_t *__counted_by(80)byte_mask); |
945 | |
946 | /* |
947 | * Use SIMD variants based on ARM64 and x86_64. |
948 | */ |
949 | #define sk_memcmp_mask __sk_memcmp_mask |
950 | #define sk_memcmp_mask_16B os_memcmp_mask_16B |
951 | #define sk_memcmp_mask_32B os_memcmp_mask_32B |
952 | #define sk_memcmp_mask_48B os_memcmp_mask_48B |
953 | #define sk_memcmp_mask_64B os_memcmp_mask_64B |
954 | #define sk_memcmp_mask_80B os_memcmp_mask_80B |
955 | |
956 | /*! |
957 | * @abstract Compare byte buffers of n bytes long src1 against src2, applying |
958 | * the byte masks to input data before comparison. (SIMD version) |
959 | * |
960 | * @discussion |
961 | * Returns zero if the two buffers are identical after applying the byte |
962 | * masks, otherwise non-zero. |
963 | * Zero-length buffers are always identical. |
964 | * |
965 | * @param src1 first input buffer of n bytes long |
966 | * @param src2 second input buffer of n bytes long |
967 | * @param byte_mask byte mask of n bytes long applied before comparision |
968 | * @param n number of bytes |
969 | */ |
970 | static inline int |
971 | __sk_memcmp_mask(const uint8_t *__counted_by(n)src1, |
972 | const uint8_t *__counted_by(n)src2, |
973 | const uint8_t *__counted_by(n)byte_mask, size_t n) |
974 | { |
975 | uint32_t result = 0; |
976 | size_t i = 0; |
	for (; i + 64 <= n; i += 64) {
		result |= sk_memcmp_mask_64B(src1 + i, src2 + i,
		    byte_mask + i);
	}
	for (; i + 32 <= n; i += 32) {
		result |= sk_memcmp_mask_32B(src1 + i, src2 + i,
		    byte_mask + i);
	}
	for (; i + 16 <= n; i += 16) {
		result |= sk_memcmp_mask_16B(src1 + i, src2 + i,
		    byte_mask + i);
	}
	if (i < n) {
		if (n >= 16) {
			/* Compare the last 16 bytes with vector code. */
			result |= sk_memcmp_mask_16B(src1 + n - 16,
			    src2 + n - 16, byte_mask + n - 16);
994 | } else { |
995 | /* Use scalar code if n < 16. */ |
996 | for (; i < n; i++) { |
997 | result |= (src1[i] ^ src2[i]) & byte_mask[i]; |
998 | } |
999 | } |
1000 | } |
1001 | return result; |
1002 | } |
1003 | #else /* !(__arm64__ || __arm__ || __x86_64__) */ |
1004 | /* |
1005 | * Use scalar variants elsewhere. |
1006 | */ |
1007 | #define sk_memcmp_mask __sk_memcmp_mask_scalar |
1008 | #define sk_memcmp_mask_16B __sk_memcmp_mask_16B_scalar |
1009 | #define sk_memcmp_mask_32B __sk_memcmp_mask_32B_scalar |
1010 | #define sk_memcmp_mask_48B __sk_memcmp_mask_48B_scalar |
1011 | #define sk_memcmp_mask_64B __sk_memcmp_mask_64B_scalar |
1012 | #define sk_memcmp_mask_80B __sk_memcmp_mask_80B_scalar |
1013 | #endif /* !(__arm64__ || __arm__ || __x86_64__) */ |
1014 | |
1015 | /* |
1016 | * Scalar variants are available on all platforms if needed. |
1017 | */ |
1018 | #define sk_memcmp_mask_scalar __sk_memcmp_mask_scalar |
1019 | #define sk_memcmp_mask_16B_scalar __sk_memcmp_mask_16B_scalar |
1020 | #define sk_memcmp_mask_32B_scalar __sk_memcmp_mask_32B_scalar |
1021 | #define sk_memcmp_mask_48B_scalar __sk_memcmp_mask_48B_scalar |
1022 | #define sk_memcmp_mask_64B_scalar __sk_memcmp_mask_64B_scalar |
1023 | #define sk_memcmp_mask_80B_scalar __sk_memcmp_mask_80B_scalar |
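
/*
 * Illustrative sketch (hypothetical buffers, not part of this header):
 * compare two 16-byte keys while only considering the first 4 bytes.
 *
 *	uint8_t key_a[16] = { 0 }, key_b[16] = { 0 };
 *	uint8_t mask[16] = { 0xff, 0xff, 0xff, 0xff };	remaining bytes are 0
 *	int diff = sk_memcmp_mask_16B(key_a, key_b, mask);
 *
 * diff is zero iff key_a and key_b agree in their first 4 bytes.
 */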
1024 | |
1025 | #endif /* KERNEL */ |
1026 | #endif /* PRIVATE || BSD_KERNEL_PRIVATE */ |
1027 | #endif /* !_SKYWALK_COMMON_H_ */ |
1028 | |