/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *	uint32_t len, uint32_t sum0);
 *
 * input :
 *	src : source starting address
 *	dst : destination starting address
 *	len : byte stream length
 *	sum0 : initial 32-bit sum
 *
 * output :
 *	the source byte stream is copied into the destination buffer
 *	the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); the caller is
 *	responsible for folding the 32-bit sum into 16 bits and
 *	performing the 1's complement if applicable
 */
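
/*
 * For reference, the even-aligned little-endian case computes the same
 * result as the following portable C sketch (illustrative only; the
 * function name is hypothetical and not part of any API):
 *
 *	static uint32_t
 *	os_cpu_copy_in_cksum_model(const uint8_t *src, uint8_t *dst,
 *	    uint32_t len, uint32_t sum0)
 *	{
 *		uint64_t partial = 0, final_acc;
 *		uint32_t i;
 *
 *		for (i = 0; i + 1 < len; i += 2) {
 *			dst[i] = src[i];
 *			dst[i + 1] = src[i + 1];
 *			// sum native-order 16-bit words; carries pile up
 *			// safely in the upper bits of the 64-bit accumulator
 *			partial += (uint32_t)src[i] |
 *			    ((uint32_t)src[i + 1] << 8);
 *		}
 *		if (len & 1) {
 *			dst[len - 1] = src[len - 1];
 *			partial += src[len - 1];  // trailing byte, low lane
 *		}
 *		partial = (partial >> 32) + (partial & 0xffffffff);
 *		partial = (partial >> 16) + (partial & 0xffff);
 *		final_acc = (sum0 >> 16) + (sum0 & 0xffff) +
 *		    (partial >> 16) + (partial & 0xffff);
 *		final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *		final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *		return (uint32_t)final_acc;
 *	}
 */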

/*
 * The following definitions default the implementation to little-endian
 * architectures.
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

/*
 * ARM64 kernel mode -- just like user mode -- no longer requires saving
 * the vector registers, since it's done by the exception handler code.
 */
#define SAVE_REGISTERS	0

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4
_os_cpu_copy_in_cksum:

#define src		x0
#define dst		x1
#define len		x2
#define sum		x3
#define need_swap	x5
#define t		x6
#define partial		x7
#define wpartial	w7

	mov	partial, #0		// partial = 0;
	mov	need_swap, #0		// need_swap = 0;

	cbz	len, L_len_0

/*
 * Copy the leading byte if the source starts on an odd address; use w7
 * for the temporary sum and deposit this byte into the high byte of its
 * 16-bit word:
 *
 *	t = 0;
 *	if ((uintptr_t)src & 1) {
 *		t = *src << 8;		// into the high byte (little-endian)
 *		*dst++ = *src++;
 *		--len;
 *		need_swap = 1;		// undo the lane swap at the end
 *	}
 */
	tst	src, #1
	b.eq	1f
	ldrb	wpartial, [src]
	add	src, src, #1
	strb	wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	sub	len, len, #1
	mov	need_swap, #1
	cbz	len, L_len_0
1:

#if SAVE_REGISTERS
	/*
	 * we always use v0-v3, and also v4-v7/v16-v19 if len >= 128,
	 * so allocate 12*16 bytes on the stack, store v0-v3 now, and
	 * keep x11 as the running store pointer
	 */
	sub	sp, sp, #12*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
#endif

/*
 * compare len against 8*16; with fewer bytes than that, skip the
 * 128-byte loop and fall through to the 4*16-byte code.
 * v0, v1 hold the running sums once we leave the L128 loop
 */
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	cmp	len, #8*16
	mov	v0.d[0], partial	// move partial to the 1st 64-bit lane of v0
	b.lt	L64_bytes
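	/*
	 * seeding v0.d[0] with the running partial sum lets the vector
	 * path absorb the odd-byte prologue; the final folds treat it
	 * like any other accumulator lane
	 */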

#if SAVE_REGISTERS
	/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
	st1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

/*
 * accumulate adjacent 32-bit pairs into the 8 64-bit lanes of v0-v3:
 * load the 1st 8 vectors, and clear the remaining accumulators v2-v3
 * (v0-v1 were cleared above)
 */
	ldr	q4, [src], #8*16
	eor.16b	v2, v2, v2
	ldr	q5, [src, #-7*16]
	eor.16b	v3, v3, v3
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	/* branch to the finish-up code if fewer than 128 bytes remain */
	subs	len, len, #2*8*16
	b.lt	L128_finishup

/*
 * software-pipelined loop: store and accumulate the 8 vectors loaded on
 * the previous trip (32 32-bit words into 8 64-bit accumulator lanes
 * per iteration) while loading the next 8 vectors
 */
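
/*
 * uadalp.2d vd, vn adds adjacent pairs of unsigned 32-bit elements of vn
 * and accumulates them into the 64-bit lanes of vd, i.e. for i = 0, 1:
 *
 *	vd.d[i] += (uint64_t)vn.s[2*i] + (uint64_t)vn.s[2*i+1];
 *
 * summing the stream as 32-bit words is safe because no carries are lost
 * in the 64-bit lanes; the 64->32->16 folds at the end then recover the
 * same value as a 16-bit 1's complement sum
 */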
L128_loop:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	ldr	q4, [src], #16*8
	ldr	q5, [src, #-7*16]

	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]

	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	subs	len, len, #8*16
	b.ge	L128_loop
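	/*
	 * on loop exit the last 8 vectors loaded are still unconsumed;
	 * L128_finishup below stores and accumulates them without
	 * loading any more data
	 */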

L128_finishup:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19

	add	len, len, #8*16		// undo the loop's pre-decrement

	add.2d	v0, v0, v2		// fold 4 accumulators down to 2
	add.2d	v1, v1, v3

#if SAVE_REGISTERS
	/* restore v4-v7/v16-v19 as they won't be used any more */
	add	x11, sp, #4*16
	ld1.4s	{v4, v5, v6, v7}, [x11], #4*16
	ld1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

L64_bytes:
	cmp	len, #4*16
	b.lt	L32_bytes

	ldr	q2, [src], #4*16
	ldr	q3, [src, #-3*16]
	str	q2, [dst], #4*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-3*16]
	uadalp.2d	v1, v3

	ldr	q2, [src, #-2*16]
	ldr	q3, [src, #-1*16]
	str	q2, [dst, #-2*16]
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #4*16

L32_bytes:
	cmp	len, #2*16
	b.lt	L16_bytes
	ldr	q2, [src], #2*16
	ldr	q3, [src, #-1*16]
	str	q2, [dst], #2*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #2*16

L16_bytes:
	add.2d	v0, v0, v1		// fold the 2 accumulators into v0
	cmp	len, #16
	b.lt	L8_bytes
	ldr	q2, [src], #16
	str	q2, [dst], #16
	uadalp.2d	v0, v2
	sub	len, len, #16

L8_bytes:
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2
	eor.16b	v3, v3, v3
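	/*
	 * each sub-16-byte chunk (8/4/2/1 bytes) lands in its own cleared
	 * vector (v1 is re-cleared between its two uses), so the partial
	 * words occupy disjoint lanes until the uadalp folds below
	 */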

	tst	len, #8
	b.eq	L4_bytes
	ldr	d1, [src], #8
	str	d1, [dst], #8

L4_bytes:
	tst	len, #4
	b.eq	L2_bytes
	ldr	s2, [src], #4
	str	s2, [dst], #4

L2_bytes:
	uadalp.2d	v0, v1		// accumulate the 8-byte chunk
	eor.16b	v1, v1, v1		// v1 is reused for the trailing byte
	tst	len, #2
	b.eq	L_trailing_bytes
	ldr	h3, [src], #2
	str	h3, [dst], #2

L_trailing_bytes:
	tst	len, #1
	b.eq	L0_bytes
	ldr	b1, [src], #1
	str	b1, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
	shl.4h	v1, v1, #8		// partial <<= 8;
#endif

L0_bytes:
	uadalp.2d	v2, v3		// fold the 2-byte chunk into v2
	uadalp.2d	v0, v1		// fold the trailing byte into v0
	uadalp.2d	v0, v2		// fold the 4- and 2-byte chunks into v0

	addp.2d	d0, v0			// d0 = v0.d[0] + v0.d[1]
	fmov	partial, d0		// move the 64-bit sum back to a GPR

#if SAVE_REGISTERS
	/* restore v0-v3 and deallocate the stack space */
	ld1.4s	{v0, v1, v2, v3}, [sp]
	add	sp, sp, #12*16
#endif

	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	and	t, partial, #0xffffffff
	add	partial, t, partial, lsr #32

	/* partial = (partial >> 16) + (partial & 0xffff); */
	and	t, partial, #0xffff
	add	partial, t, partial, lsr #16

L_len_0:
	/*
	 * if (need_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24
1:
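	/*
	 * the rotate above works because summing a stream that starts on
	 * an odd address byte-swaps every 16-bit word; the accumulated
	 * value is therefore the byte swap of the true sum, and one
	 * rotate by 8 puts it right
	 */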
	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	x0, sum, #0xffff
	add	x0, x0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	x0, x0, partial, lsr #16
	and	partial, partial, #0xffff
	add	x0, x0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16
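	/*
	 * a single fold can itself carry into bit 16, hence the repeat;
	 * the value entering the first fold is well under 2^18, so two
	 * folds are guaranteed to fit the result in 16 bits
	 */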

	/*
	 * the 1's complement is intentionally left to the caller (see the
	 * header comment above); a variant that returned the complemented
	 * checksum would finish with:
	 *
	 *	return (~final_acc & 0xffff);
	 *
	 *	mvn	w0, w0
	 *	and	w0, w0, #0xffff
	 */

	ret	lr
