/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *	uint32_t len, uint32_t sum0);
 *
 * input :
 *	src : source starting address
 *	dst : destination starting address
 *	len : byte stream length
 *	sum0 : initial 32-bit sum
 *
 * output :
 *	the source byte stream is copied into the destination buffer
 *	the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); caller is
 *	responsible for folding the 32-bit sum into 16-bit and
 *	performing the 1's complement if applicable
 */
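
/*
 * For reference, a minimal caller-side sketch of the folding described
 * above (illustrative only; the variable names below are hypothetical and
 * not part of this interface):
 *
 *	uint32_t sum = os_cpu_copy_in_cksum(src, dst, len, sum0);
 *	sum = (sum >> 16) + (sum & 0xffff);	// fold 32-bit sum into 16 bits
 *	sum = (sum >> 16) + (sum & 0xffff);	// fold again in case of carry
 *	uint16_t cksum = ~sum & 0xffff;		// 1's complement, if applicable
 */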

/*
 * The following definitions default the implementation to little-endian
 * architectures.
 */
#define LITTLE_ENDIAN 1
#define BYTE_ORDER LITTLE_ENDIAN

/*
 * ARM64 kernel mode -- just like user mode -- no longer requires saving
 * the vector registers, since it's done by the exception handler code.
 */
#define SAVE_REGISTERS 0

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4
_os_cpu_copy_in_cksum:

#define src		x0
#define dst		x1
#define len		x2
#define sum		x3
#define need_swap	x5
#define t		x6
#define partial		x7
#define wpartial	w7

	mov	partial, #0		// partial = 0;
	mov	need_swap, #0		// needs_swap = 0;

	cbz	len, L_len_0

/*
 * Deal with an odd-addressed leading byte: use w7 to store the temporary
 * sum, and deposit this byte into the high byte of the 16-bit word in w7.
 *
 *	t = 0;
 *	if ((uintptr_t)src & 1) {
 *		t = *src << 8;
 *		*dst++ = *src++;
 *		--len;
 *	}
 */
	tst	src, #1
	b.eq	1f
	ldrb	wpartial, [src]
	add	src, src, #1
	strb	wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	sub	len, len, #1
	mov	need_swap, #1
	cbz	len, L_len_0
1:

#if SAVE_REGISTERS
	/*
	 * we will always use v0-v3, and v4-v7/v16-v19 if len>=128
	 * so allocate 12*16 bytes in the stack, and store v0-v3 now,
	 * keep x11 as the pointer
	 */
	sub	sp, sp, #12*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
#endif

	/*
	 * Compare len against 8*16; if there are fewer than 8*16 bytes,
	 * try 4*16 bytes next.
	 * v0, v1 will hold the temporary result after we exit the L128 loop.
	 */
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	cmp	len, #8*16
	mov	v0.d[0], partial	// move partial to 1st 64b lane in v0
	b.lt	L64_bytes

#if SAVE_REGISTERS
	/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
	st1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

	/*
	 * accumulate 4 x 2 x 32-bit pairs into 8 lanes in v0-v3
	 * load the 1st 8 vectors, and clear v2-v3 (v0, v1 were cleared above)
	 */
	ldr	q4, [src], #8*16
	eor.16b	v2, v2, v2
	ldr	q5, [src, #-7*16]
	eor.16b	v3, v3, v3
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	/* branch to finish off if len<128 */
	subs	len, len, #2*8*16
	b.lt	L128_finishup

	/*
	 * loop for loading and accumulating eight 16-byte vectors into
	 * 8 8-byte accumulators per iteration
	 */
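	/*
	 * For reference, a rough C model of what one 128-byte iteration of
	 * this loop computes (illustrative sketch only; copy_accum_128 is a
	 * hypothetical helper, and the real code below performs the copy and
	 * the widening accumulation with NEON uadalp):
	 *
	 *	#include <stdint.h>
	 *	#include <string.h>
	 *
	 *	static void
	 *	copy_accum_128(const uint8_t *src, uint8_t *dst, uint64_t acc[8])
	 *	{
	 *		for (int i = 0; i < 32; i++) {	// 32 x 32-bit words = 128 bytes
	 *			uint32_t w;
	 *			memcpy(&w, src + 4 * i, sizeof(w));	// unaligned-safe load
	 *			memcpy(dst + 4 * i, &w, sizeof(w));	// copy while summing
	 *			acc[i & 7] += w;	// widen into one of 8 64-bit accumulators
	 *		}
	 *	}
	 *
	 * The lane assignment differs from the NEON code, but since all eight
	 * 64-bit accumulators are added together before folding, the result
	 * is the same.
	 */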
L128_loop:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	ldr	q4, [src], #16*8
	ldr	q5, [src, #-7*16]

	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]

	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	subs	len, len, #8*16
	b.ge	L128_loop

L128_finishup:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19

	add	len, len, #8*16

	add.2d	v0, v0, v2
	add.2d	v1, v1, v3

#if SAVE_REGISTERS
	/* restore v4-v7/v16-v19 as they won't be used any more */
	add	x11, sp, #4*16
	ld1.4s	{v4, v5, v6, v7}, [x11], #4*16
	ld1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

L64_bytes:
	cmp	len, #4*16
	b.lt	L32_bytes

	ldr	q2, [src], #4*16
	ldr	q3, [src, #-3*16]
	str	q2, [dst], #4*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-3*16]
	uadalp.2d	v1, v3

	ldr	q2, [src, #-2*16]
	ldr	q3, [src, #-1*16]
	str	q2, [dst, #-2*16]
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #4*16

L32_bytes:
	cmp	len, #2*16
	b.lt	L16_bytes
	ldr	q2, [src], #2*16
	ldr	q3, [src, #-1*16]
	str	q2, [dst], #2*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #2*16

L16_bytes:
	add.2d	v0, v0, v1
	cmp	len, #16
	b.lt	L8_bytes
	ldr	q2, [src], #16
	str	q2, [dst], #16
	uadalp.2d	v0, v2
	sub	len, len, #16

L8_bytes:
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2
	eor.16b	v3, v3, v3

	tst	len, #8
	b.eq	L4_bytes
	ldr	d1, [src], #8
	str	d1, [dst], #8

L4_bytes:
	tst	len, #4
	b.eq	L2_bytes
	ldr	s2, [src], #4
	str	s2, [dst], #4

L2_bytes:
	uadalp.2d	v0, v1
	eor.16b	v1, v1, v1
	tst	len, #2
	b.eq	L_trailing_bytes
	ldr	h3, [src], #2
	str	h3, [dst], #2

L_trailing_bytes:
	tst	len, #1
	b.eq	L0_bytes
	ldr	b1, [src], #1
	str	b1, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
	shl.4h	v1, v1, #8		// partial <<= 8;
#endif

L0_bytes:
	uadalp.2d	v2, v3
	uadalp.2d	v0, v1
	uadalp.2d	v0, v2

	addp.2d	d0, v0
	fmov	partial, d0

#if SAVE_REGISTERS
	/* restore v0-v3 and deallocate stack space */
	ld1.4s	{v0, v1, v2, v3}, [sp]
	add	sp, sp, #12*16
#endif

	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	and	t, partial, #0xffffffff
	add	partial, t, partial, lsr #32

	/* partial = (partial >> 16) + (partial & 0xffff); */
	and	t, partial, #0xffff
	add	partial, t, partial, lsr #16

L_len_0:
	/*
	 * if (needs_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
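	/*
	 * (Standard 1's-complement property, illustrative example: starting
	 * the copy at an odd address puts even-offset source bytes in the
	 * high byte lane and odd-offset bytes in the low byte lane, so the
	 * folded sum comes out byte-swapped.  For the two bytes 0x01 0x02,
	 * an aligned pass yields 0x0201 on little-endian, while the
	 * odd-aligned pass yields 0x0100 + 0x0002 = 0x0102; the rotate by 8
	 * below, together with the later folds, restores 0x0201.)
	 */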
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24
1:
	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	x0, sum, #0xffff
	add	x0, x0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	x0, x0, partial, lsr #16
	and	partial, partial, #0xffff
	add	x0, x0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
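	/*
	 * (Worked example: if final_acc were 0x1FFFF, the fold above gives
	 * 0x0001 + 0xFFFF = 0x10000, which still carries into bit 16; this
	 * second fold reduces it to 0x0001.)
	 */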
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * The 1's complement, return (~final_acc & 0xffff), is left to the
	 * caller (see the header comment above); it would be:
	 *
	 *	mvn	w0, w0
	 *	and	w0, w0, #0xffff
	 */

	ret	lr
