/*
 * Copyright (c) 2012-2018 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * This assembly was previously cloned from ../arm/cpu_in_cksum.s (__arm__)
 * with __arm64__ tagged ARM64_TODO.  This code revision is optimized based
 * on the 64-bit part in netinet/cpu_in_cksum.c
 *
 * cclee - CoreOS - Vector & Numerics. 06/20/2012.
 */

/*
 * Select the error-reporting routine: kprintf in the kernel build,
 * fprintf_stderr in the Libsyscall (userland) build.
 */
#ifdef KERNEL
#define	CKSUM_ERR	_kprintf
#else
#ifndef LIBSYSCALL_INTERFACE
#error "LIBSYSCALL_INTERFACE not defined"
#endif /* !LIBSYSCALL_INTERFACE */
#define	CKSUM_ERR	_fprintf_stderr
#endif /* !KERNEL */

/*
 * XXX: adi@apple.com:
 *
 * Ugly, but we have little choice, since relying on genassym and <assym.s>
 * is not possible unless this code lives in osfmk.  Note also that this
 * routine expects "mbuf-like" argument, and it does not expect the mbuf to be
 * authentic; it only cares about 3 fields: m_next, m_data and m_len.
 */
#if defined(__LP64__)
#define	M_NEXT	0
#define	M_DATA	16	// 8-byte address, would be aligned to 8-byte boundary
#define	M_LEN	24
#else
#define	M_NEXT	0
#define	M_DATA	8
#define	M_LEN	12
#endif

64 | .globl _os_cpu_in_cksum_mbuf |
65 | .text |
66 | .align 4 |
67 | _os_cpu_in_cksum_mbuf: |
68 | |
69 | |
70 | /* |
71 | * 64-bit version. |
72 | * |
73 | * This function returns the partial 16-bit checksum accumulated in |
74 | * a 32-bit variable (withouth 1's complement); caller is responsible |
75 | * for folding the 32-bit sum into 16-bit and performinng the 1's |
76 | * complement if applicable |
77 | */ |
78 | |
79 | /* |
80 | * uint32_t |
81 | * os_cpu_in_cksum_mbuf(struct mbuf *m, int len, int off, uint32_t initial_sum) |
82 | * { |
83 | * int mlen; |
84 | * uint64_t sum, partial; |
85 | * unsigned int final_acc; |
86 | * uint8_t *data; |
87 | * boolean_t needs_swap, started_on_odd; |
88 | * |
89 | * VERIFY(len >= 0); |
90 | * VERIFY(off >= 0); |
91 | * |
92 | * needs_swap = FALSE; |
93 | * started_on_odd = FALSE; |
94 | * sum = initial_sum; |
95 | */ |
96 | |
97 | #define m x0 |
98 | #define len x1 |
99 | #define off x2 |
100 | #define sum x3 |
101 | #define needs_swap x4 |
102 | #define started_on_odd x5 |
103 | #define mlen x6 |
104 | #define Wmlen w6 |
105 | #define t x7 |
106 | #define data x8 |
107 | #if defined(__LP64__) |
108 | #define ptr_m x0 |
109 | #define ptr_data x8 |
110 | #else |
111 | #define ptr_m w0 |
112 | #define ptr_data w8 |
113 | #endif |
114 | |
115 | |
116 | mov needs_swap, #0 // needs_swap = FALSE; |
117 | mov started_on_odd, #0 // started_on_odd = FALSE; |
118 | mov w3, w3 // clear higher half |
119 | |
120 | |
121 | /* |
122 | * for (;;) { |
123 | * if (PREDICT_FALSE(m == NULL)) { |
124 | * CKSUM_ERR("%s: out of data\n", __func__); |
125 | * return (-1); |
126 | * } |
127 | * mlen = m->m_len; |
128 | * if (mlen > off) { |
129 | * mlen -= off; |
130 | * data = mtod(m, uint8_t *) + off; |
131 | * goto post_initial_offset; |
132 | * } |
133 | * off -= mlen; |
134 | * if (len == 0) |
135 | * break; |
136 | * m = m->m_next; |
137 | * } |
138 | */ |
139 | |
140 | 0: |
141 | cbz m, Lin_cksum_whoops // if (m == NULL) return -1; |
142 | ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; |
143 | cmp mlen, off |
144 | b.le 1f |
145 | ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) |
146 | sub mlen, mlen, off // mlen -= off; |
147 | add data, data, off // data = mtod(m, uint8_t *) + off; |
148 | b L_post_initial_offset |
149 | 1: |
150 | sub off, off, mlen |
151 | cbnz len, 2f |
152 | mov x0, x3 |
153 | ret lr |
154 | 2: |
155 | ldr ptr_m, [m, #M_NEXT] |
156 | b 0b |
157 | |
158 | L_loop: // for (; len > 0; m = m->m_next) { |
159 | /* |
160 | * if (PREDICT_FALSE(m == NULL)) { |
161 | * CKSUM_ERR("%s: out of data\n", __func__); |
162 | * return (-1); |
163 | * } |
164 | * mlen = m->m_len; |
165 | * data = mtod(m, uint8_t *); |
166 | */ |
167 | cbz m, Lin_cksum_whoops // if (m == NULL) return -1; |
168 | ldr Wmlen, [m, #M_LEN] // mlen = m->m_len; |
169 | ldr ptr_data, [m, #M_DATA] // mtod(m, uint8_t *) |
170 | |
171 | L_post_initial_offset: |
172 | /* |
173 | * if (mlen == 0) continue; |
174 | * if (mlen > len) mlen = len; |
175 | * len -= mlen; |
176 | */ |
177 | |
178 | cbz mlen, L_continue |
179 | cmp mlen, len |
180 | csel mlen, mlen, len, le |
181 | sub len, len, mlen |
182 | |
183 | /* |
184 | * partial = 0; |
185 | * if ((uintptr_t)data & 1) { |
186 | * started_on_odd = !started_on_odd; |
187 | * partial = *data << 8; |
188 | * ++data; |
189 | * --mlen; |
190 | * } |
191 | * needs_swap = started_on_odd; |
192 | */ |
193 | |
194 | tst data, #1 |
195 | mov x7, #0 |
196 | mov x10, #0 |
197 | b.eq 1f |
198 | ldrb w7, [data], #1 |
199 | eor started_on_odd, started_on_odd, #1 |
200 | sub mlen, mlen, #1 |
201 | lsl w7, w7, #8 |
202 | 1: |
203 | |
204 | |
205 | /* |
206 | * if ((uintptr_t)data & 2) { |
207 | * if (mlen < 2) |
208 | * goto trailing_bytes; |
209 | * partial += *(uint16_t *)(void *)data; |
210 | * data += 2; |
211 | * mlen -= 2; |
212 | * } |
213 | */ |
214 | tst data, #2 |
215 | mov needs_swap, started_on_odd |
216 | b.eq 1f |
217 | cmp mlen, #2 |
218 | b.lt L_trailing_bytes |
219 | ldrh w9, [data], #2 |
220 | sub mlen, mlen, #2 |
221 | add w7, w7, w9 |
222 | 1: |
223 | |
224 | /* |
225 | * if ((uintptr_t)data & 4) { |
226 | * if (mlen < 4) |
227 | * goto L2_bytes; |
228 | * partial += *(uint32_t *)(void *)data; |
229 | * data += 4; |
230 | * mlen -= 4; |
231 | * } |
232 | */ |
233 | // align on 8-bytes boundary if applicable |
234 | tst data, #4 |
235 | b.eq 1f |
236 | cmp mlen, #4 |
237 | b.lt L2_bytes |
238 | ldr w9, [data], #4 |
239 | sub mlen, mlen, #4 |
240 | adds w7, w7, w9 |
241 | adc x7, x7, x10 // assumes x10 still is #0 as set above |
242 | 1: |
243 | |
244 | /* |
245 | * while (mlen >= 64) { |
246 | * __builtin_prefetch(data + 32); |
247 | * __builtin_prefetch(data + 64); |
248 | * partial += *(uint32_t *)(void *)data; |
249 | * partial += *(uint32_t *)(void *)(data + 4); |
250 | * partial += *(uint32_t *)(void *)(data + 8); |
251 | * partial += *(uint32_t *)(void *)(data + 12); |
252 | * partial += *(uint32_t *)(void *)(data + 16); |
253 | * partial += *(uint32_t *)(void *)(data + 20); |
254 | * partial += *(uint32_t *)(void *)(data + 24); |
255 | * partial += *(uint32_t *)(void *)(data + 28); |
256 | * partial += *(uint32_t *)(void *)(data + 32); |
257 | * partial += *(uint32_t *)(void *)(data + 36); |
258 | * partial += *(uint32_t *)(void *)(data + 40); |
259 | * partial += *(uint32_t *)(void *)(data + 44); |
260 | * partial += *(uint32_t *)(void *)(data + 48); |
261 | * partial += *(uint32_t *)(void *)(data + 52); |
262 | * partial += *(uint32_t *)(void *)(data + 56); |
263 | * partial += *(uint32_t *)(void *)(data + 60); |
264 | * data += 64; |
265 | * mlen -= 64; |
266 | * // if (PREDICT_FALSE(partial & (3ULL << 62))) { |
267 | * // if (needs_swap) |
268 | * // partial = (partial << 8) + |
269 | * // (partial >> 56); |
270 | * // sum += (partial >> 32); |
271 | * // sum += (partial & 0xffffffff); |
272 | * // partial = 0; |
273 | * // } |
274 | * } |
275 | */ |
276 | |
277 | // pre-decrement mlen by 64, and if < 64 bytes, try 32 bytes next |
278 | subs mlen, mlen, #64 |
279 | b.lt L32_bytes |
280 | |
281 | // save used vector registers |
282 | sub sp, sp, #8*16 |
283 | mov x11, sp |
284 | st1.4s {v0, v1, v2, v3}, [x11], #4*16 |
285 | st1.4s {v4, v5, v6, v7}, [x11], #4*16 |
286 | |
287 | // spread partial into 8 8-byte registers in v0-v3 |
288 | fmov s3, w7 |
289 | eor.16b v0, v0, v0 |
290 | eor.16b v1, v1, v1 |
291 | eor.16b v2, v2, v2 |
292 | |
293 | // load the 1st 64 bytes (16 32-bit words) |
294 | ld1.4s {v4,v5,v6,v7},[data],#64 |
295 | |
296 | // branch to finish off if mlen<64 |
297 | subs mlen, mlen, #64 |
298 | b.lt L64_finishup |
299 | |
300 | /* |
301 | * loop for loading and accumulating 16 32-bit words into |
302 | * 8 8-byte accumulators per iteration. |
303 | */ |
304 | L64_loop: |
305 | subs mlen, mlen, #64 // mlen -= 64 |
306 | |
307 | uadalp.2d v0, v4 |
308 | ld1.4s {v4},[data], #16 |
309 | |
310 | uadalp.2d v1, v5 |
311 | ld1.4s {v5},[data], #16 |
312 | |
313 | uadalp.2d v2, v6 |
314 | ld1.4s {v6},[data], #16 |
315 | |
316 | uadalp.2d v3, v7 |
317 | ld1.4s {v7},[data], #16 |
318 | |
319 | b.ge L64_loop |
320 | |
321 | L64_finishup: |
322 | uadalp.2d v0, v4 |
323 | uadalp.2d v1, v5 |
324 | uadalp.2d v2, v6 |
325 | uadalp.2d v3, v7 |
326 | |
327 | add.2d v0, v0, v1 |
328 | add.2d v2, v2, v3 |
329 | addp.2d d0, v0 |
330 | addp.2d d2, v2 |
331 | add.2d v0, v0, v2 |
332 | fmov x7, d0 // partial in x7 now |
333 | |
334 | // restore used vector registers |
335 | ld1.4s {v0, v1, v2, v3}, [sp], #4*16 |
336 | ld1.4s {v4, v5, v6, v7}, [sp], #4*16 |
337 | |
338 | L32_bytes: |
339 | tst mlen, #32 |
340 | b.eq L16_bytes |
341 | ldp x9, x10, [data], #16 |
342 | ldp x11, x12, [data], #16 |
343 | adds x7, x7, x9 |
344 | mov x9, #0 |
345 | adcs x7, x7, x10 |
346 | adcs x7, x7, x11 |
347 | adcs x7, x7, x12 |
348 | adc x7, x7, x9 |
349 | |
350 | L16_bytes: |
351 | tst mlen, #16 |
352 | b.eq L8_bytes |
353 | ldp x9, x10, [data], #16 |
354 | adds x7, x7, x9 |
355 | mov x9, #0 |
356 | adcs x7, x7, x10 |
357 | adc x7, x7, x9 |
358 | |
359 | L8_bytes: |
360 | tst mlen, #8 |
361 | mov x10, #0 |
362 | b.eq L4_bytes |
363 | ldr x9,[data],#8 |
364 | adds x7, x7, x9 |
365 | adc x7, x7, x10 |
366 | |
367 | L4_bytes: |
368 | tst mlen, #4 |
369 | b.eq L2_bytes |
370 | ldr w9,[data],#4 |
371 | adds x7, x7, x9 |
372 | adc x7, x7, x10 |
373 | |
374 | L2_bytes: |
375 | tst mlen, #2 |
376 | b.eq L_trailing_bytes |
377 | ldrh w9,[data],#2 |
378 | adds x7, x7, x9 |
379 | adc x7, x7, x10 |
380 | |
381 | L_trailing_bytes: |
382 | tst mlen, #1 |
383 | b.eq L0_bytes |
384 | ldrb w9,[data],#1 |
385 | adds x7, x7, x9 |
386 | adc x7, x7, x10 |
387 | eor started_on_odd, started_on_odd, #1 |
388 | |
389 | L0_bytes: |
390 | /* |
391 | * if (needs_swap) |
392 | * partial = (partial << 8) + (partial >> 56); |
393 | */ |
394 | cbz needs_swap, 1f |
395 | ror x7, x7, #56 |
396 | 1: |
397 | /* |
398 | * sum += (partial >> 32) + (partial & 0xffffffff); |
399 | * sum = (sum >> 32) + (sum & 0xffffffff); |
400 | * } |
401 | */ |
402 | |
403 | add x3, x3, x7, lsr #32 |
404 | mov w7, w7 |
405 | add x3, x3, x7 |
406 | mov w7, w3 |
407 | add x3, x7, x3, lsr #32 |
408 | |
409 | L_continue: |
410 | cmp len, #0 |
411 | ldr ptr_m, [m, #M_NEXT] // m = m->m_next |
412 | b.gt L_loop |
413 | |
414 | /* |
415 | * final_acc = (sum >> 48) + ((sum >> 32) & 0xffff) + |
416 | * ((sum >> 16) & 0xffff) + (sum & 0xffff); |
417 | * final_acc = (final_acc >> 16) + (final_acc & 0xffff); |
418 | * final_acc = (final_acc >> 16) + (final_acc & 0xffff); |
419 | * return (final_acc & 0xffff); |
420 | * } |
421 | */ |
422 | |
423 | mov w4, #0x00ffff |
424 | and x0, x4, x3, lsr #48 |
425 | and x1, x4, x3, lsr #32 |
426 | and x2, x4, x3, lsr #16 |
427 | and x3, x4, x3 |
428 | add w0, w0, w1 |
429 | add w2, w2, w3 |
430 | add w0, w0, w2 |
431 | and w1, w4, w0, lsr #16 |
432 | and w0, w4, w0 |
433 | add w0, w0, w1 |
434 | and w1, w4, w0, lsr #16 |
435 | and w0, w4, w0 |
436 | add w0, w0, w1 |
437 | /* |
438 | * If we were to 1's complement it (XOR with 0xffff): |
439 | * |
440 | * eor w0, w0, w4 |
441 | */ |
442 | and w0, w0, w4 |
443 | |
444 | ret lr |
445 | |
446 | Lin_cksum_whoops: |
447 | adrp x0, Lin_cksum_whoops_str@page |
448 | add x0, x0, Lin_cksum_whoops_str@pageoff |
449 | bl #CKSUM_ERR |
450 | mov x0, #-1 |
451 | ret lr |
452 | |
453 | Lin_cksum_whoops_str: |
454 | .asciz "os_cpu_in_cksum_mbuf: out of data\n" |
455 | .align 5 |
456 | |