/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *     uint32_t len, uint32_t sum0);
 *
 * input :
 *        src  : source starting address
 *        dst  : destination starting address
 *        len  : byte stream length
 *        sum0 : initial 32-bit sum
 *
 * output :
 *        the source byte stream is copied into the destination buffer;
 *        the function returns the partial 16-bit checksum accumulated
 *        in a 32-bit variable (without 1's complement); the caller is
 *        responsible for folding the 32-bit sum into 16 bits and
 *        performing the 1's complement if applicable
 */
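
/*
 * Caller-side sketch (illustrative only; the actual caller is not part
 * of this file) of the folding and 1's complement described above:
 *
 *        uint32_t sum = os_cpu_copy_in_cksum(src, dst, len, 0);
 *        sum = (sum >> 16) + (sum & 0xffff);   // fold carries into the low 16 bits
 *        sum = (sum >> 16) + (sum & 0xffff);   // fold once more for any new carry
 *        uint16_t cksum = (uint16_t)(~sum);    // 1's complement, if applicable
 */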

/*
 * The following definitions default the implementation to little-endian
 * architectures.
 */
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN

/*
 * ARM64 kernel mode -- just like user mode -- no longer requires saving
 * the vector registers, since it's done by the exception handler code.
 */
#define SAVE_REGISTERS 0

        .globl _os_cpu_copy_in_cksum
        .text
        .align 4
_os_cpu_copy_in_cksum:

#define src x0
#define dst x1
#define len x2
#define sum x3
#define need_swap x5
#define t x6
#define partial x7
#define wpartial w7

        mov partial, #0                 // partial = 0;
        mov need_swap, #0               // needs_swap = 0;

        cbz len, L_len_0

/*
 * Deal with an odd-addressed leading byte: use w7 to hold the temporary
 * sum, depositing this byte into the high byte of a 16-bit word in w7.
 *
 *        t = 0;
 *        if ((uintptr_t)src & 1) {
 *                t = *src << 8;
 *                *dst++ = *src++;
 *                --len;
 *        }
 */
        tst src, #1
        b.eq 1f
        ldrb wpartial, [src]
        add src, src, #1
        strb wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
        lsl partial, partial, #8
#endif
        sub len, len, #1
        mov need_swap, #1
        cbz len, L_len_0
1:
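
/*
 * From here on, the remaining stream is summed one byte off its original
 * 16-bit word alignment, so the folded result must be byte-swapped at the
 * end (need_swap); on little-endian the leading byte is deposited in the
 * high byte above so that the final swap puts it back in its proper place.
 */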

#if SAVE_REGISTERS
/*
 * We always use v0-v3, and also v4-v7/v16-v19 when len >= 128, so
 * allocate 12*16 bytes on the stack, store v0-v3 now, and keep x11
 * as the pointer.
 */
        sub sp, sp, #12*16
        mov x11, sp
        st1.4s {v0, v1, v2, v3}, [x11], #4*16
#endif

/*
 * If there are fewer than 8*16 bytes, skip ahead and try 4*16 bytes next.
 * v0,v1 will hold the temporary result after we exit the L128 loop.
 */
        eor.16b v0, v0, v0
        eor.16b v1, v1, v1
        cmp len, #8*16
        mov v0.d[0], partial            // move partial to 1st 64b lane in v0
        b.lt L64_bytes

#if SAVE_REGISTERS
/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
        st1.4s {v4, v5, v6, v7}, [x11], #4*16
        st1.4s {v16, v17, v18, v19}, [x11], #4*16
#endif

/*
 * accumulate 4 x 2 x 32-bit pairs into the 8 64-bit lanes of v0-v3:
 * load the 1st 8 vectors, and clear v2-v3 (v0-v1 were cleared above)
 */
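/*
 * uadalp.2d vD, vS adds adjacent pairs of unsigned 32-bit words from vS
 * and accumulates the two results into the two 64-bit lanes of vD, so
 * the running sums cannot overflow for any 32-bit len.
 */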
        ldr q4, [src], #8*16
        eor.16b v2, v2, v2
        ldr q5, [src, #-7*16]
        eor.16b v3, v3, v3
        ldr q6, [src, #-6*16]
        ldr q7, [src, #-5*16]
        ldr q16, [src, #-4*16]
        ldr q17, [src, #-3*16]
        ldr q18, [src, #-2*16]
        ldr q19, [src, #-1*16]

/*
 * pre-decrement len by 2*8*16; branch to finish up if fewer than 8*16
 * bytes remain beyond the 8*16 already loaded
 */
        subs len, len, #2*8*16
        b.lt L128_finishup

/*
 * loop for loading and accumulating 8 16-byte vectors (32 32-bit words)
 * into 8 8-byte accumulators per iteration
 */
L128_loop:
        str q4, [dst], #16*8
        uadalp.2d v0, v4
        str q5, [dst, #-7*16]
        uadalp.2d v1, v5
        ldr q4, [src], #16*8
        ldr q5, [src, #-7*16]

        str q6, [dst, #-6*16]
        uadalp.2d v2, v6
        str q7, [dst, #-5*16]
        uadalp.2d v3, v7
        ldr q6, [src, #-6*16]
        ldr q7, [src, #-5*16]

        str q16, [dst, #-4*16]
        uadalp.2d v0, v16
        str q17, [dst, #-3*16]
        uadalp.2d v1, v17
        ldr q16, [src, #-4*16]
        ldr q17, [src, #-3*16]

        str q18, [dst, #-2*16]
        uadalp.2d v2, v18
        str q19, [dst, #-1*16]
        uadalp.2d v3, v19
        ldr q18, [src, #-2*16]
        ldr q19, [src, #-1*16]

        subs len, len, #8*16
        b.ge L128_loop

L128_finishup:
        str q4, [dst], #16*8
        uadalp.2d v0, v4
        str q5, [dst, #-7*16]
        uadalp.2d v1, v5
        str q6, [dst, #-6*16]
        uadalp.2d v2, v6
        str q7, [dst, #-5*16]
        uadalp.2d v3, v7

        str q16, [dst, #-4*16]
        uadalp.2d v0, v16
        str q17, [dst, #-3*16]
        uadalp.2d v1, v17
        str q18, [dst, #-2*16]
        uadalp.2d v2, v18
        str q19, [dst, #-1*16]
        uadalp.2d v3, v19

        add len, len, #8*16

        add.2d v0, v0, v2
        add.2d v1, v1, v3

#if SAVE_REGISTERS
/* restore v4-v7/v16-v19 as they won't be used any more */
        add x11, sp, #4*16
        ld1.4s {v4, v5, v6, v7}, [x11], #4*16
        ld1.4s {v16, v17, v18, v19}, [x11], #4*16
#endif

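/*
 * Fewer than 8*16 bytes remain at this point; copy and accumulate 64-,
 * 32- and 16-byte chunks, then the sub-16-byte tail below.
 */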
L64_bytes:
        cmp len, #4*16
        b.lt L32_bytes

        ldr q2, [src], #4*16
        ldr q3, [src, #-3*16]
        str q2, [dst], #4*16
        uadalp.2d v0, v2
        str q3, [dst, #-3*16]
        uadalp.2d v1, v3

        ldr q2, [src, #-2*16]
        ldr q3, [src, #-1*16]
        str q2, [dst, #-2*16]
        uadalp.2d v0, v2
        str q3, [dst, #-1*16]
        uadalp.2d v1, v3
        sub len, len, #4*16

L32_bytes:
        cmp len, #2*16
        b.lt L16_bytes
        ldr q2, [src], #2*16
        ldr q3, [src, #-1*16]
        str q2, [dst], #2*16
        uadalp.2d v0, v2
        str q3, [dst, #-1*16]
        uadalp.2d v1, v3
        sub len, len, #2*16

L16_bytes:
        add.2d v0, v0, v1
        cmp len, #16
        b.lt L8_bytes
        ldr q2, [src], #16
        str q2, [dst], #16
        uadalp.2d v0, v2
        sub len, len, #16

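/*
 * Fewer than 16 bytes remain; copy them with scalar FP loads/stores
 * (8 bytes via d1, 4 via s2, 2 via h3, and a final odd byte via b1 once
 * v1 has been folded and cleared), then fold everything into v0 below.
 */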
L8_bytes:
        eor.16b v1, v1, v1
        eor.16b v2, v2, v2
        eor.16b v3, v3, v3

        tst len, #8
        b.eq L4_bytes
        ldr d1, [src], #8
        str d1, [dst], #8

L4_bytes:
        tst len, #4
        b.eq L2_bytes
        ldr s2, [src], #4
        str s2, [dst], #4

L2_bytes:
        uadalp.2d v0, v1
        eor.16b v1, v1, v1
        tst len, #2
        b.eq L_trailing_bytes
        ldr h3, [src], #2
        str h3, [dst], #2

L_trailing_bytes:
        tst len, #1
        b.eq L0_bytes
        ldr b1, [src], #1
        str b1, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
        shl.4h v1, v1, #8               // partial <<= 8;
#endif

L0_bytes:
        uadalp.2d v2, v3
        uadalp.2d v0, v1
        uadalp.2d v0, v2

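/*
 * Reduce: addp sums the two 64-bit lanes of v0 into d0, and fmov moves
 * the 64-bit result into the general-purpose register holding partial.
 */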
        addp.2d d0, v0
        fmov partial, d0

#if SAVE_REGISTERS
/* restore v0-v3 and deallocate stack space */
        ld1.4s {v0, v1, v2, v3}, [sp]
        add sp, sp, #12*16
#endif

/* partial = (partial >> 32) + (partial & 0xffffffff); */
        and t, partial, #0xffffffff
        add partial, t, partial, lsr #32

/* partial = (partial >> 16) + (partial & 0xffff); */
        and t, partial, #0xffff
        add partial, t, partial, lsr #16

L_len_0:
/*
 * if (needs_swap)
 *        partial = (partial << 8) + (partial >> 24);
 */
        cbz need_swap, 1f
        lsl t, partial, #8
        add partial, t, partial, lsr #24
1:
/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
        and x0, sum, #0xffff
        add x0, x0, sum, lsr #16

/* final_acc += (partial >> 16) + (partial & 0xffff); */
        add x0, x0, partial, lsr #16
        and partial, partial, #0xffff
        add x0, x0, partial

/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
        and t, x0, #0xffff
        add x0, t, x0, lsr #16

/*
 * One final fold in case of carry from the previous one.
 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 */
        and t, x0, #0xffff
        add x0, t, x0, lsr #16

/*
 * The 1's complement is intentionally left to the caller (see the header
 * comment above); performing it here would be:
 *
 *        return (~final_acc & 0xffff);
 *
 *        mvn w0, w0
 *        and w0, w0, #0xffff
 */

        ret lr