1/*
2 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
31 * with __arm64__ tagged ARM64_TODO . This code revision is optimized based
32 * on the 64-bit part in netinet/cpu_in_cksum.c
33 *
34 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
35 */
36
/*
 * CKSUM_ERR names the routine that the error path calls to report a
 * truncated chain: _kprintf for kernel builds, _fprintf_stderr otherwise.
 * Non-kernel builds must define LIBSYSCALL_INTERFACE.
 */
#if defined(KERNEL)
#define	CKSUM_ERR	_kprintf
#elif defined(LIBSYSCALL_INTERFACE)
#define	CKSUM_ERR	_fprintf_stderr
#else
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* KERNEL / LIBSYSCALL_INTERFACE */
45
46/*
47 * XXX: adi@apple.com:
48 *
49 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk. Note also that this
 * routine expects an "mbuf-like" argument; it does not require the mbuf to
 * be authentic, because it only reads 3 fields (the next pointer, the data
 * pointer and the length).
53 */
/*
 * Byte offsets of the three fields this file reads from the mbuf-like
 * argument.  The next pointer is at offset 0 for both pointer sizes; the
 * data pointer and the length move with the pointer width.
 */
#define	M_NEXT	0
#if defined(__LP64__)
/* 8-byte address, would be aligned to 8-byte boundary */
#define	M_DATA	16
#define	M_LEN	24
#else
#define	M_DATA	8
#define	M_LEN	12
#endif /* __LP64__ */
63
64 .globl _os_cpu_in_cksum_mbuf
65 .text
66 .align 4
67_os_cpu_in_cksum_mbuf:
68
69
70/*
71 * 64-bit version.
72 *
73 * This function returns the partial 16-bit checksum accumulated in
74 * a 32-bit variable (withouth 1's complement); caller is responsible
75 * for folding the 32-bit sum into 16-bit and performinng the 1's
76 * complement if applicable
77 */
78
79/*
80 * uint32_t
81 * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum)
82 * {
83 * int mlen;
84 * uint64_t sum, partial;
85 * unsigned int final_acc;
86 * uint8_t *data;
87 * boolean_t needs_swap, started_on_odd;
88 *
89 * VERIFY(len >= 0);
90 * VERIFY(off >= 0);
91 *
92 * needs_swap = FALSE;
93 * started_on_odd = FALSE;
94 * sum = initial_sum;
95 */
96
97 #define m x0
98 #define len x1
99 #define off x2
100 #define sum x3
101 #define needs_swap x4
102 #define started_on_odd x5
103 #define mlen x6
104 #define Wmlen w6
105 #define t x7
106 #define data x8
107#if defined(__LP64__)
108 #define ptr_m x0
109 #define ptr_data x8
110#else
111 #define ptr_m w0
112 #define ptr_data w8
113#endif
114
115
116 mov needs_swap, #0 // needs_swap = FALSE;
117 mov started_on_odd, #0 // started_on_odd = FALSE;
118 mov w3, w3 // clear higher half
119
120
121/*
122 * for (;;) {
123 * if (PREDICT_FALSE(m == NULL)) {
124 * CKSUM_ERR("%s: out of data\n", __func__);
125 * return (-1);
126 * }
127 * mlen = m->m_len;
128 * if (mlen > off) {
129 * mlen -= off;
130 * data = mtod(m, uint8_t *) + off;
131 * goto post_initial_offset;
132 * }
133 * off -= mlen;
134 * if (len == 0)
135 * break;
136 * m = m->m_next;
137 * }
138 */
139
1400:
141 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
142 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
143 cmp mlen, off
144 b.le 1f
145 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
146 sub mlen, mlen, off // mlen -= off;
147 add data, data, off // data = mtod(m, uint8_t *) + off;
148 b L_post_initial_offset
1491:
150 sub off, off, mlen
151 cbnz len, 2f
152 mov x0, x3
153 ret lr
1542:
155 ldr ptr_m, [m, #M_NEXT]
156 b 0b
157
158L_loop: // for (; len > 0; m = m->m_next) {
159/*
160 * if (PREDICT_FALSE(m == NULL)) {
161 * CKSUM_ERR("%s: out of data\n", __func__);
162 * return (-1);
163 * }
164 * mlen = m->m_len;
165 * data = mtod(m, uint8_t *);
166 */
167 cbz m, Lin_cksum_whoops // if (m == NULL) return -1;
168 ldr Wmlen, [m, #M_LEN] // mlen = m->m_len;
169 ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *)
170
171L_post_initial_offset:
172/*
173 * if (mlen == 0) continue;
174 * if (mlen > len) mlen = len;
175 * len -= mlen;
176 */
177
178 cbz mlen, L_continue
179 cmp mlen, len
180 csel mlen, mlen, len, le
181 sub len, len, mlen
182
183/*
184 * partial = 0;
185 * if ((uintptr_t)data & 1) {
186 * started_on_odd = !started_on_odd;
187 * partial = *data << 8;
188 * ++data;
189 * --mlen;
190 * }
191 * needs_swap = started_on_odd;
192 */
193
194 tst data, #1
195 mov x7, #0
196 mov x10, #0
197 b.eq 1f
198 ldrb w7, [data], #1
199 eor started_on_odd, started_on_odd, #1
200 sub mlen, mlen, #1
201 lsl w7, w7, #8
2021:
203
204
205/*
206 * if ((uintptr_t)data & 2) {
207 * if (mlen < 2)
208 * goto trailing_bytes;
209 * partial += *(uint16_t *)(void *)data;
210 * data += 2;
211 * mlen -= 2;
212 * }
213 */
214 tst data, #2
215 mov needs_swap, started_on_odd
216 b.eq 1f
217 cmp mlen, #2
218 b.lt L_trailing_bytes
219 ldrh w9, [data], #2
220 sub mlen, mlen, #2
221 add w7, w7, w9
2221:
223
224/*
225 * if ((uintptr_t)data & 4) {
226 * if (mlen < 4)
227 * goto L2_bytes;
228 * partial += *(uint32_t *)(void *)data;
229 * data += 4;
230 * mlen -= 4;
231 * }
232 */
233 // align on 8-bytes boundary if applicable
234 tst data, #4
235 b.eq 1f
236 cmp mlen, #4
237 b.lt L2_bytes
238 ldr w9, [data], #4
239 sub mlen, mlen, #4
240 adds w7, w7, w9
241 adc x7, x7, x10 // assumes x10 still is #0 as set above
2421:
243
244/*
245 * while (mlen >= 64) {
246 * __builtin_prefetch(data + 32);
247 * __builtin_prefetch(data + 64);
248 * partial += *(uint32_t *)(void *)data;
249 * partial += *(uint32_t *)(void *)(data + 4);
250 * partial += *(uint32_t *)(void *)(data + 8);
251 * partial += *(uint32_t *)(void *)(data + 12);
252 * partial += *(uint32_t *)(void *)(data + 16);
253 * partial += *(uint32_t *)(void *)(data + 20);
254 * partial += *(uint32_t *)(void *)(data + 24);
255 * partial += *(uint32_t *)(void *)(data + 28);
256 * partial += *(uint32_t *)(void *)(data + 32);
257 * partial += *(uint32_t *)(void *)(data + 36);
258 * partial += *(uint32_t *)(void *)(data + 40);
259 * partial += *(uint32_t *)(void *)(data + 44);
260 * partial += *(uint32_t *)(void *)(data + 48);
261 * partial += *(uint32_t *)(void *)(data + 52);
262 * partial += *(uint32_t *)(void *)(data + 56);
263 * partial += *(uint32_t *)(void *)(data + 60);
264 * data += 64;
265 * mlen -= 64;
266 * // if (PREDICT_FALSE(partial & (3ULL << 62))) {
267 * // if (needs_swap)
268 * // partial = (partial << 8) +
269 * // (partial >> 56);
270 * // sum += (partial >> 32);
271 * // sum += (partial & 0xffffffff);
272 * // partial = 0;
273 * // }
274 * }
275*/
276
277 // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next
278 subs mlen, mlen, #64
279 b.lt L32_bytes
280
281 // save used vector registers
282 sub sp, sp, #8*16
283 mov x11, sp
284 st1.4s {v0, v1, v2, v3}, [x11], #4*16
285 st1.4s {v4, v5, v6, v7}, [x11], #4*16
286
287 // spread partial into 8 8-byte registers in v0-v3
288 fmov s3, w7
289 eor.16b v0, v0, v0
290 eor.16b v1, v1, v1
291 eor.16b v2, v2, v2
292
293 // load the 1st 64 bytes (16 32-bit words)
294 ld1.4s {v4,v5,v6,v7},[data],#64
295
296 // branch to finish off if mlen<64
297 subs mlen, mlen, #64
298 b.lt L64_finishup
299
300 /*
301 * loop for loading and accumulating 16 32-bit words into
302 * 8 8-byte accumulators per iteration.
303 */
304L64_loop:
305 subs mlen, mlen, #64 // mlen -= 64
306
307 uadalp.2d v0, v4
308 ld1.4s {v4},[data], #16
309
310 uadalp.2d v1, v5
311 ld1.4s {v5},[data], #16
312
313 uadalp.2d v2, v6
314 ld1.4s {v6},[data], #16
315
316 uadalp.2d v3, v7
317 ld1.4s {v7},[data], #16
318
319 b.ge L64_loop
320
321L64_finishup:
322 uadalp.2d v0, v4
323 uadalp.2d v1, v5
324 uadalp.2d v2, v6
325 uadalp.2d v3, v7
326
327 add.2d v0, v0, v1
328 add.2d v2, v2, v3
329 addp.2d d0, v0
330 addp.2d d2, v2
331 add.2d v0, v0, v2
332 fmov x7, d0 // partial in x7 now
333
334 // restore used vector registers
335 ld1.4s {v0, v1, v2, v3}, [sp], #4*16
336 ld1.4s {v4, v5, v6, v7}, [sp], #4*16
337
338L32_bytes:
339 tst mlen, #32
340 b.eq L16_bytes
341 ldp x9, x10, [data], #16
342 ldp x11, x12, [data], #16
343 adds x7, x7, x9
344 mov x9, #0
345 adcs x7, x7, x10
346 adcs x7, x7, x11
347 adcs x7, x7, x12
348 adc x7, x7, x9
349
350L16_bytes:
351 tst mlen, #16
352 b.eq L8_bytes
353 ldp x9, x10, [data], #16
354 adds x7, x7, x9
355 mov x9, #0
356 adcs x7, x7, x10
357 adc x7, x7, x9
358
359L8_bytes:
360 tst mlen, #8
361 mov x10, #0
362 b.eq L4_bytes
363 ldr x9,[data],#8
364 adds x7, x7, x9
365 adc x7, x7, x10
366
367L4_bytes:
368 tst mlen, #4
369 b.eq L2_bytes
370 ldr w9,[data],#4
371 adds x7, x7, x9
372 adc x7, x7, x10
373
374L2_bytes:
375 tst mlen, #2
376 b.eq L_trailing_bytes
377 ldrh w9,[data],#2
378 adds x7, x7, x9
379 adc x7, x7, x10
380
381L_trailing_bytes:
382 tst mlen, #1
383 b.eq L0_bytes
384 ldrb w9,[data],#1
385 adds x7, x7, x9
386 adc x7, x7, x10
387 eor started_on_odd, started_on_odd, #1
388
389L0_bytes:
390/*
391 * if (needs_swap)
392 * partial = (partial << 8) + (partial >> 56);
393 */
394 cbz needs_swap, 1f
395 ror x7, x7, #56
3961:
397/*
398 * sum += (partial >> 32) + (partial & 0xffffffff);
399 * sum = (sum >> 32) + (sum & 0xffffffff);
400 * }
401 */
402
403 add x3, x3, x7, lsr #32
404 mov w7, w7
405 add x3, x3, x7
406 mov w7, w3
407 add x3, x7, x3, lsr #32
408
409L_continue:
410 cmp len, #0
411 ldr ptr_m, [m, #M_NEXT] // m = m->m_next
412 b.gt L_loop
413
414/*
415 * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) +
416 * ((sum >> 16) & 0xffff) + (sum & 0xffff);
417 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
418 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
419 * return (final_acc & 0xffff);
420 * }
421 */
422
423 mov w4, #0x00ffff
424 and x0, x4, x3, lsr #48
425 and x1, x4, x3, lsr #32
426 and x2, x4, x3, lsr #16
427 and x3, x4, x3
428 add w0, w0, w1
429 add w2, w2, w3
430 add w0, w0, w2
431 and w1, w4, w0, lsr #16
432 and w0, w4, w0
433 add w0, w0, w1
434 and w1, w4, w0, lsr #16
435 and w0, w4, w0
436 add w0, w0, w1
437 /*
438 * If we were to 1's complement it (XOR with 0xffff):
439 *
440 * eor w0, w0, w4
441 */
442 and w0, w0, w4
443
444 ret lr
445
/*
 * Error exit, reached when m == NULL before off and len were satisfied.
 * Reports the condition through CKSUM_ERR and returns -1 in x0.
 *
 * bl overwrites lr (x30) with the address of the instruction that follows
 * it, so the return address has to be saved across the call; otherwise the
 * final ret would branch back to the instruction after the bl instead of
 * returning to the caller.  sp is the same as at function entry on every
 * path that branches here, so the 16-byte push keeps it aligned.
 */
Lin_cksum_whoops:
	stp	x29, x30, [sp, #-16]!	// save fp and the return address
	mov	x29, sp
	adrp	x0, Lin_cksum_whoops_str@page
	add	x0, x0, Lin_cksum_whoops_str@pageoff
	bl	#CKSUM_ERR		// message has no format specifiers, so x0 is the only argument
	mov	x0, #-1
	ldp	x29, x30, [sp], #16	// restore fp and the return address
	ret	lr
452
/*
 * NUL-terminated message whose address Lin_cksum_whoops passes to
 * CKSUM_ERR.  It contains no format specifiers.  The trailing .align pads
 * whatever follows the string to a 32-byte (2^5) boundary.
 */
Lin_cksum_whoops_str:
	.asciz	"os_cpu_in_cksum_mbuf: out of data\n"
	.align	5
456