/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO.  This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

/*
 * Select the error-reporting routine: kprintf in the kernel build,
 * fprintf_stderr in the Libsyscall (userland) build.
 */
#ifdef KERNEL
#define	CKSUM_ERR	_kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR	_fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
 * authentic; it only cares about 3 fields: m_next, m_data and m_len.
 */
#if defined(__LP64__)
#define	M_NEXT	0
#define	M_DATA	16	// 8-byte address, would be aligned to 8-byte boundary
#define	M_LEN	24
#else
#define	M_NEXT	0
#define	M_DATA	8
#define	M_LEN	12
#endif

64 | .globl _os_cpu_in_cksum_mbuf |
65 | .text |
66 | .align 4 |
67 | _os_cpu_in_cksum_mbuf: |
68 | |
69 | |
70 | /* |
71 | * 64-bit version. |
72 | * |
73 | * This function returns the partial 16-bit checksum accumulated in |
74 | * a 32-bit variable (withouth 1's complement); caller is responsible |
75 | * for folding the 32-bit sum into 16-bit and performinng the 1's |
76 | * complement if applicable |
77 | */ |
78 | |
79 | /* |
80 | * uint32_t |
81 | * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum) |
82 | * { |
83 | * int mlen; |
84 | * uint64_t sum, partial; |
85 | * unsigned int final_acc; |
86 | * uint8_t *data; |
87 | * boolean_t needs_swap, started_on_odd; |
88 | * |
89 | * VERIFY(len >= 0); |
90 | * VERIFY(off >= 0); |
91 | * |
92 | * needs_swap = FALSE; |
93 | * started_on_odd = FALSE; |
94 | * sum = initial_sum; |
95 | */ |
96 | |
97 | #define m x0 |
98 | #define len x1 |
99 | #define off x2 |
100 | #define sum x3 |
101 | #define needs_swap x4 |
102 | #define started_on_odd x5 |
103 | #define mlen x6 |
104 | #define Wmlen w6 |
105 | #define t x7 |
106 | #define data x8 |
107 | #if defined(__LP64__) |
108 | #define ptr_m x0 |
109 | #define ptr_data x8 |
110 | #else |
111 | #define ptr_m w0 |
112 | #define ptr_data w8 |
113 | #endif |
114 | |
115 | |
116 | mov needs_swap, #0 // needs_swap = FALSE; |
117 | mov started_on_odd, #0 // started_on_odd = FALSE; |
118 | mov w3, w3 // clear higher half |
119 | |
120 | |
121 | /* |
122 | * for (;;) { |
123 | * if (PREDICT_FALSE(m == NULL)) { |
124 | * CKSUM_ERR("%s: out of data\n", __func__); |
125 | * return (-1); |
126 | * } |
127 | * mlen = m->m_len; |
128 | * if (mlen > off) { |
129 | * mlen -= off; |
130 | * data = mtod(m, uint8_t *) + off; |
131 | * goto post_initial_offset; |
132 | * } |
133 | * off -= mlen; |
134 | * if (len == 0) |
135 | * break; |
136 | * m = m->m_next; |
137 | * } |
138 | */ |
139 | |
140 | 0: |
141 | cbz m, Lin_cksum_whoops // if (m == NULL) return -1; |
142 | ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; |
143 | cmp mlen, off |
144 | b.le 1f |
145 | ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) |
146 | sub mlen, mlen, off // mlen -= off; |
147 | add data, data, off // data = mtod(m, uint8_t *) + off; |
148 | b L_post_initial_offset |
149 | 1: |
150 | sub off, off, mlen |
151 | cbnz len, 2f |
152 | mov x0, x3 |
153 | ret lr |
154 | 2: |
155 | ldr ptr_m, [m, #M_NEXT] |
156 | b 0b |
157 | |
158 | L_loop: // for (; len > 0; m = m->m_next) { |
159 | /* |
160 | * if (PREDICT_FALSE(m == NULL)) { |
161 | * CKSUM_ERR("%s: out of data\n", __func__); |
162 | * return (-1); |
163 | * } |
164 | * mlen = m->m_len; |
165 | * data = mtod(m, uint8_t *); |
166 | */ |
167 | cbz m, Lin_cksum_whoops // if (m == NULL) return -1; |
168 | ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; |
169 | ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) |
170 | |
171 | L_post_initial_offset: |
172 | /* |
173 | * if (mlen == 0) continue; |
174 | * if (mlen > len) mlen = len; |
175 | * len -= mlen; |
176 | */ |
177 | |
178 | cbz mlen, L_continue |
179 | cmp mlen, len |
180 | csel mlen, mlen, len, le |
181 | sub len, len, mlen |
182 | |
183 | /* |
184 | * partial = 0; |
185 | * if ((uintptr_t)data & 1) { |
186 | * started_on_odd = !started_on_odd; |
187 | * partial = *data << 8; |
188 | * ++data; |
189 | * --mlen; |
190 | * } |
191 | * needs_swap = started_on_odd; |
192 | */ |
193 | |
194 | tst data, #1 |
195 | mov x7, #0 |
196 | mov x10, #0 |
197 | b.eq 1f |
198 | ldrb w7, [data], #1 |
199 | eor started_on_odd, started_on_odd, #1 |
200 | sub mlen, mlen, #1 |
201 | lsl w7, w7, #8 |
202 | 1: |
203 | |
204 | |
205 | /* |
206 | * if ((uintptr_t)data & 2) { |
207 | * if (mlen < 2) |
208 | * goto trailing_bytes; |
209 | * partial += *(uint16_t *)(void *)data; |
210 | * data += 2; |
211 | * mlen -= 2; |
212 | * } |
213 | */ |
214 | tst data, #2 |
215 | mov needs_swap, started_on_odd |
216 | b.eq 1f |
217 | cmp mlen, #2 |
218 | b.lt L_trailing_bytes |
219 | ldrh w9, [data], #2 |
220 | sub mlen, mlen, #2 |
221 | add w7, w7, w9 |
222 | 1: |
223 | |
224 | /* |
225 | * if ((uintptr_t)data & 4) { |
226 | * if (mlen < 4) |
227 | * goto L2_bytes; |
228 | * partial += *(uint32_t *)(void *)data; |
229 | * data += 4; |
230 | * mlen -= 4; |
231 | * } |
232 | */ |
233 | // align on 8-bytes boundary if applicable |
234 | tst data, #4 |
235 | b.eq 1f |
236 | cmp mlen, #4 |
237 | b.lt L2_bytes |
238 | ldr w9, [data], #4 |
239 | sub mlen, mlen, #4 |
240 | adds w7, w7, w9 |
241 | adc x7, x7, x10 // assumes x10 still is #0 as set above |
242 | 1: |
243 | |
244 | /* |
245 | * while (mlen >= 64) { |
246 | * __builtin_prefetch(data + 32); |
247 | * __builtin_prefetch(data + 64); |
248 | * partial += *(uint32_t *)(void *)data; |
249 | * partial += *(uint32_t *)(void *)(data + 4); |
250 | * partial += *(uint32_t *)(void *)(data + 8); |
251 | * partial += *(uint32_t *)(void *)(data + 12); |
252 | * partial += *(uint32_t *)(void *)(data + 16); |
253 | * partial += *(uint32_t *)(void *)(data + 20); |
254 | * partial += *(uint32_t *)(void *)(data + 24); |
255 | * partial += *(uint32_t *)(void *)(data + 28); |
256 | * partial += *(uint32_t *)(void *)(data + 32); |
257 | * partial += *(uint32_t *)(void *)(data + 36); |
258 | * partial += *(uint32_t *)(void *)(data + 40); |
259 | * partial += *(uint32_t *)(void *)(data + 44); |
260 | * partial += *(uint32_t *)(void *)(data + 48); |
261 | * partial += *(uint32_t *)(void *)(data + 52); |
262 | * partial += *(uint32_t *)(void *)(data + 56); |
263 | * partial += *(uint32_t *)(void *)(data + 60); |
264 | * data += 64; |
265 | * mlen -= 64; |
266 | * // if (PREDICT_FALSE(partial & (3ULL << 62))) { |
267 | * // if (needs_swap) |
268 | * // partial = (partial << 8) + |
269 | * // (partial >> 56); |
270 | * // sum += (partial >> 32); |
271 | * // sum += (partial & 0xffffffff); |
272 | * // partial = 0; |
273 | * // } |
274 | * } |
275 | */ |
276 | |
277 | // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next |
278 | subs mlen, mlen, #64 |
279 | b.lt L32_bytes |
280 | |
281 | // save used vector registers |
282 | sub sp, sp, #8*16 |
283 | mov x11, sp |
284 | st1.4s {v0, v1, v2, v3}, [x11], #4*16 |
285 | st1.4s {v4, v5, v6, v7}, [x11], #4*16 |
286 | |
287 | // spread partial into 8 8-byte registers in v0-v3 |
288 | fmov s3, w7 |
289 | eor.16b v0, v0, v0 |
290 | eor.16b v1, v1, v1 |
291 | eor.16b v2, v2, v2 |
292 | |
293 | // load the 1st 64 bytes (16 32-bit words) |
294 | ld1.4s {v4,v5,v6,v7},[data],#64 |
295 | |
296 | // branch to finish off if mlen<64 |
297 | subs mlen, mlen, #64 |
298 | b.lt L64_finishup |
299 | |
300 | /* |
301 | * loop for loading and accumulating 16 32-bit words into |
302 | * 8 8-byte accumulators per iteration. |
303 | */ |
304 | L64_loop: |
305 | subs mlen, mlen, #64 // mlen -= 64 |
306 | |
307 | uadalp.2d v0, v4 |
308 | ld1.4s {v4},[data], #16 |
309 | |
310 | uadalp.2d v1, v5 |
311 | ld1.4s {v5},[data], #16 |
312 | |
313 | uadalp.2d v2, v6 |
314 | ld1.4s {v6},[data], #16 |
315 | |
316 | uadalp.2d v3, v7 |
317 | ld1.4s {v7},[data], #16 |
318 | |
319 | b.ge L64_loop |
320 | |
321 | L64_finishup: |
322 | uadalp.2d v0, v4 |
323 | uadalp.2d v1, v5 |
324 | uadalp.2d v2, v6 |
325 | uadalp.2d v3, v7 |
326 | |
327 | add.2d v0, v0, v1 |
328 | add.2d v2, v2, v3 |
329 | addp.2d d0, v0 |
330 | addp.2d d2, v2 |
331 | add.2d v0, v0, v2 |
332 | fmov x7, d0 // partial in x7 now |
333 | |
334 | // restore used vector registers |
335 | ld1.4s {v0, v1, v2, v3}, [sp], #4*16 |
336 | ld1.4s {v4, v5, v6, v7}, [sp], #4*16 |
337 | |
338 | L32_bytes: |
339 | tst mlen, #32 |
340 | b.eq L16_bytes |
341 | ldp x9, x10, [data], #16 |
342 | ldp x11, x12, [data], #16 |
343 | adds x7, x7, x9 |
344 | mov x9, #0 |
345 | adcs x7, x7, x10 |
346 | adcs x7, x7, x11 |
347 | adcs x7, x7, x12 |
348 | adc x7, x7, x9 |
349 | |
350 | L16_bytes: |
351 | tst mlen, #16 |
352 | b.eq L8_bytes |
353 | ldp x9, x10, [data], #16 |
354 | adds x7, x7, x9 |
355 | mov x9, #0 |
356 | adcs x7, x7, x10 |
357 | adc x7, x7, x9 |
358 | |
359 | L8_bytes: |
360 | tst mlen, #8 |
361 | mov x10, #0 |
362 | b.eq L4_bytes |
363 | ldr x9,[data],#8 |
364 | adds x7, x7, x9 |
365 | adc x7, x7, x10 |
366 | |
367 | L4_bytes: |
368 | tst mlen, #4 |
369 | b.eq L2_bytes |
370 | ldr w9,[data],#4 |
371 | adds x7, x7, x9 |
372 | adc x7, x7, x10 |
373 | |
374 | L2_bytes: |
375 | tst mlen, #2 |
376 | b.eq L_trailing_bytes |
377 | ldrh w9,[data],#2 |
378 | adds x7, x7, x9 |
379 | adc x7, x7, x10 |
380 | |
381 | L_trailing_bytes: |
382 | tst mlen, #1 |
383 | b.eq L0_bytes |
384 | ldrb w9,[data],#1 |
385 | adds x7, x7, x9 |
386 | adc x7, x7, x10 |
387 | eor started_on_odd, started_on_odd, #1 |
388 | |
389 | L0_bytes: |
390 | /* |
391 | * if (needs_swap) |
392 | * partial = (partial << 8) + (partial >> 56); |
393 | */ |
394 | cbz needs_swap, 1f |
395 | ror x7, x7, #56 |
396 | 1: |
397 | /* |
398 | * sum += (partial >> 32) + (partial & 0xffffffff); |
399 | * sum = (sum >> 32) + (sum & 0xffffffff); |
400 | * } |
401 | */ |
402 | |
403 | add x3, x3, x7, lsr #32 |
404 | mov w7, w7 |
405 | add x3, x3, x7 |
406 | mov w7, w3 |
407 | add x3, x7, x3, lsr #32 |
408 | |
409 | L_continue: |
410 | cmp len, #0 |
411 | ldr ptr_m, [m, #M_NEXT] // m = m->m_next |
412 | b.gt L_loop |
413 | |
414 | /* |
415 | * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) + |
416 | * ((sum >> 16) & 0xffff) + (sum & 0xffff); |
417 | * final_acc = (final_acc >> 16) + (final_acc & 0xffff); |
418 | * final_acc = (final_acc >> 16) + (final_acc & 0xffff); |
419 | * return (final_acc & 0xffff); |
420 | * } |
421 | */ |
422 | |
423 | mov w4, #0x00ffff |
424 | and x0, x4, x3, lsr #48 |
425 | and x1, x4, x3, lsr #32 |
426 | and x2, x4, x3, lsr #16 |
427 | and x3, x4, x3 |
428 | add w0, w0, w1 |
429 | add w2, w2, w3 |
430 | add w0, w0, w2 |
431 | and w1, w4, w0, lsr #16 |
432 | and w0, w4, w0 |
433 | add w0, w0, w1 |
434 | and w1, w4, w0, lsr #16 |
435 | and w0, w4, w0 |
436 | add w0, w0, w1 |
437 | /* |
438 | * If we were to 1's complement it (XOR with 0xffff): |
439 | * |
440 | * eor w0, w0, w4 |
441 | */ |
442 | and w0, w0, w4 |
443 | |
444 | ret lr |
445 | |
446 | Lin_cksum_whoops: |
447 | adrp x0, Lin_cksum_whoops_str@page |
448 | add x0, x0, Lin_cksum_whoops_str@pageoff |
449 | bl #CKSUM_ERR |
450 | mov x0, #-1 |
451 | ret lr |
452 | |
453 | Lin_cksum_whoops_str: |
454 | .asciz "os_cpu_in_cksum_mbuf: out of data\n" |
455 | .align 5 |
456 | |