1# Copyright (c) (2018-2020,2022) Apple Inc. All rights reserved.
2#
3# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
4# is contained in the License.txt file distributed with corecrypto) and only to
5# people who accept that license. IMPORTANT: Any license rights granted to you by
6# Apple Inc. (if any) are limited to internal use within your organization only on
7# devices and computers you own or control, for the sole purpose of verifying the
8# security characteristics and correct functioning of the Apple Software. You may
9# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
10
11/*
12 * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
13 *
14 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
15 *
16 * This file contains Original Code and/or Modifications of Original Code
17 * as defined in and that are subject to the Apple Public Source License
18 * Version 2.0 (the 'License'). You may not use this file except in
19 * compliance with the License. The rights granted to you under the License
20 * may not be used to create, or enable the creation or redistribution of,
21 * unlawful or unlicensed copies of an Apple operating system, or to
22 * circumvent, violate, or enable the circumvention or violation of, any
23 * terms of an Apple operating system software license agreement.
24 *
25 * Please obtain a copy of the License at
26 * http://www.opensource.apple.com/apsl/ and read it before using this file.
27 *
28 * The Original Code and all software distributed under the License are
29 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
30 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
31 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
32 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
33 * Please see the License for the specific language governing rights and
34 * limitations under the License.
35 *
36 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
37 */
38/*
 This file provides an arm64 (NEON + SHA-2 crypto extensions) hand implementation of the following function
40
41 void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
42
43 which is a C function in sha2.c (from xnu).
44
45 sha256 algorithm per block description:
46
47 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
48 2. load 8 digests a-h from ctx->state
49 3. for r = 0:15
50 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
51 d += T1;
52 h = T1 + Sigma0(a) + Maj(a,b,c)
53 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
54 4. for r = 16:63
55 W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
56 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
57 d += T1;
58 h = T1 + Sigma0(a) + Maj(a,b,c)
59 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
60
61 In the assembly implementation:
62 - a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
63 - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
64 - the 8 digests (a-h) will be stored in GPR or memory
65
66 the implementation per block looks like
67
68 ----------------------------------------------------------------------------
69
70 load W(0:15) (big-endian per 4 bytes) into q0:q3
71 pre_calculate and store W+K(0:15) in stack
72
73 load digests a-h from ctx->state;
74
75 for (r=0;r<48;r+=4) {
76 digests a-h update and permute round r:r+3
77 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
78 }
79
80 for (r=48;r<64;r+=4) {
81 digests a-h update and permute round r:r+3
82 }
83
84 ctx->states += digests a-h;
85
86 ----------------------------------------------------------------------------
87
88 our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
89 into the last 16 rounds of its previous block:
90
91 ----------------------------------------------------------------------------
92
93 load W(0:15) (big-endian per 4 bytes) into q0:q3
94 pre_calculate and store W+K(0:15) in stack
95
96L_loop:
97
98 load digests a-h from ctx->state;
99
100 for (r=0;r<48;r+=4) {
101 digests a-h update and permute round r:r+3
102 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
103 }
104
105 num_block--;
106 if (num_block==0) jmp L_last_block;
107
108 for (r=48;r<64;r+=4) {
109 digests a-h update and permute round r:r+3
110 load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
111 pre_calculate and store W+K([r:r+3]%16) in stack
112 }
113
114 ctx->states += digests a-h;
115
116 jmp L_loop;
117
118L_last_block:
119
120 for (r=48;r<64;r+=4) {
121 digests a-h update and permute round r:r+3
122 }
123
124 ctx->states += digests a-h;
125
126 ------------------------------------------------------------------------
127
128 Apple CoreOS vector & numerics
129*/
130
131#if defined(__arm64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
132
133#include "arm64_isa_compatibility.h"
134#include "ccarm_pac_bti_macros.h"
135
136.subsections_via_symbols
137 .text
138
139 .p2align 4
140
141K256:
142 .long 0x428a2f98
143 .long 0x71374491
144 .long 0xb5c0fbcf
145 .long 0xe9b5dba5
146 .long 0x3956c25b
147 .long 0x59f111f1
148 .long 0x923f82a4
149 .long 0xab1c5ed5
150 .long 0xd807aa98
151 .long 0x12835b01
152 .long 0x243185be
153 .long 0x550c7dc3
154 .long 0x72be5d74
155 .long 0x80deb1fe
156 .long 0x9bdc06a7
157 .long 0xc19bf174
158 .long 0xe49b69c1
159 .long 0xefbe4786
160 .long 0x0fc19dc6
161 .long 0x240ca1cc
162 .long 0x2de92c6f
163 .long 0x4a7484aa
164 .long 0x5cb0a9dc
165 .long 0x76f988da
166 .long 0x983e5152
167 .long 0xa831c66d
168 .long 0xb00327c8
169 .long 0xbf597fc7
170 .long 0xc6e00bf3
171 .long 0xd5a79147
172 .long 0x06ca6351
173 .long 0x14292967
174 .long 0x27b70a85
175 .long 0x2e1b2138
176 .long 0x4d2c6dfc
177 .long 0x53380d13
178 .long 0x650a7354
179 .long 0x766a0abb
180 .long 0x81c2c92e
181 .long 0x92722c85
182 .long 0xa2bfe8a1
183 .long 0xa81a664b
184 .long 0xc24b8b70
185 .long 0xc76c51a3
186 .long 0xd192e819
187 .long 0xd6990624
188 .long 0xf40e3585
189 .long 0x106aa070
190 .long 0x19a4c116
191 .long 0x1e376c08
192 .long 0x2748774c
193 .long 0x34b0bcb5
194 .long 0x391c0cb3
195 .long 0x4ed8aa4a
196 .long 0x5b9cca4f
197 .long 0x682e6ff3
198 .long 0x748f82ee
199 .long 0x78a5636f
200 .long 0x84c87814
201 .long 0x8cc70208
202 .long 0x90befffa
203 .long 0xa4506ceb
204 .long 0xbef9a3f7
205 .long 0xc67178f2
206
207
208 .p2align 4
209
210 .globl _AccelerateCrypto_SHA256_compress
211_AccelerateCrypto_SHA256_compress:
212
213
214 #define hashes x0
215 #define numblocks x1
216 #define data x2
217 #define ktable x3
218 BRANCH_TARGET_CALL
219#ifdef __ILP32__
220 uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
221#endif
222
223
224 adrp ktable, K256@page
225 cbnz numblocks, 1f // if number of blocks is nonzero, go on for sha256 transform operation
226 ret lr // otherwise, return
2271:
228 add ktable, ktable, K256@pageoff
229
230#if BUILDKERNEL
231 // save q0-q7, q16-q24 8+8+1=19
232 sub x4, sp, #17*16
233 sub sp, sp, #17*16
234 st1.4s {v0, v1, v2, v3}, [x4], #64
235 st1.4s {v4, v5, v6, v7}, [x4], #64
236 st1.4s {v16, v17, v18, v19}, [x4], #64
237 st1.4s {v20, v21, v22, v23}, [x4], #64
238 st1.4s {v24}, [x4], #16
239#endif
240
241 ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
242
243 rev32.16b v0, v0 // byte swap of 1st 4 ints
244 ldr q21, [ktable, #16*0]
245 rev32.16b v1, v1 // byte swap of 2nd 4 ints
246 ldr q16, [hashes, #0]
247 rev32.16b v2, v2 // byte swap of 3rd 4 ints
248 ldr q17, [hashes, #16]
249 rev32.16b v3, v3 // byte swap of 4th 4 ints
250 ldr q22, [ktable, #16*1]
251
252 mov.16b v18, v16
253 ldr q23, [ktable, #16*2]
254 add.4s v4, v0, v21 // 1st 4 input + K256
255 ldr q24, [ktable, #16*3]
256 add.4s v5, v1, v22 // 2nd 4 input + K256
257 mov.16b v19, v17
258 add.4s v6, v2, v23 // 3rd 4 input + K256
259 add.4s v7, v3, v24 // 4th 4 input + K256
260 add ktable, ktable, #16*4
261
262
263 .macro sha256_round
264 mov.16b v20, v18
265 SHA256SU0 $0, $1
266 SHA256H 18, 19, $4
267 SHA256SU1 $0, $2, $3
268 SHA256H2 19, 20, $4
269 add.4s $6, $5, $7
270 .endm
271
272 // 4 vector hashes update and load next vector rounds
273 .macro sha256_hash_load_round
274 mov.16b v20, v18
275 SHA256H 18, 19, $0
276 rev32.16b $1, $1
277 SHA256H2 19, 20, $0
278 add.4s $2, $1, $3
279 .endm
280
281 .macro sha256_hash_round
282 mov.16b v20, v18
283 SHA256H 18, 19, $0
284 SHA256H2 19, 20, $0
285 .endm
286
287 // 12 vector hash and sequence update rounds
288 mov w4, #3
289L_i_loop:
290 mov.16b v20, v18
291 ldr q21, [ktable, #0] // k0
292 SHA256SU0 0, 1
293 ldr q22, [ktable, #16] // k1
294 SHA256H 18, 19, 4
295 ldr q23, [ktable, #32] // k2
296 SHA256SU1 0, 2, 3
297 ldr q24, [ktable, #48] // k3
298 SHA256H2 19, 20, 4
299 add ktable, ktable, #64
300 add.4s v4, v0, v21
301
302 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
303 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
304 subs w4, w4, #1
305 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
306 b.gt L_i_loop
307
308 subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
309 b.le L_wrapup
310
311 sub ktable, ktable, #256
312
313L_loop:
314
315 ldr q0, [data, #0]
316 mov.16b v20, v18
317 ldr q21, [ktable,#0]
318 SHA256H 18, 19, 4
319 ldr q1, [data, #16]
320 rev32.16b v0, v0
321 ldr q2, [data, #32]
322 SHA256H2 19, 20, 4
323 ldr q3, [data, #48]
324 add.4s v4, v0, v21
325
326 ldr q22, [ktable,#16]
327 mov.16b v20, v18
328 add data, data, #64
329 SHA256H 18, 19, 5
330 ldr q23, [ktable,#32]
331 rev32.16b v1, v1
332 ldr q24, [ktable,#48]
333 SHA256H2 19, 20, 5
334 add.4s v5, v1, v22
335
336 sha256_hash_load_round 6, v2, v6, v23
337 sha256_hash_load_round 7, v3, v7, v24
338
339 add.4s v18, v16, v18
340 add.4s v19, v17, v19
341 mov.16b v16, v18
342 mov.16b v17, v19
343
344 // 12 vector hash and sequence update rounds
345 mov.16b v20, v18
346 ldr q21, [ktable, #16*4] // k0
347 SHA256SU0 0, 1
348 ldr q22, [ktable, #16*5] // k1
349 SHA256H 18, 19, 4
350 ldr q23, [ktable, #16*6] // k2
351 SHA256SU1 0, 2, 3
352 ldr q24, [ktable, #16*7] // k3
353 SHA256H2 19, 20, 4
354 add.4s v4, v0, v21
355
356 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
357 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
358 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
359 mov.16b v20, v18
360 ldr q21, [ktable, #16*8] // k0
361 SHA256SU0 0, 1
362 ldr q22, [ktable, #16*9] // k1
363 SHA256H 18, 19, 4
364 ldr q23, [ktable, #16*10] // k2
365 SHA256SU1 0, 2, 3
366 ldr q24, [ktable, #16*11] // k3
367 SHA256H2 19, 20, 4
368 add.4s v4, v0, v21
369
370 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
371 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
372 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
373
374 mov.16b v20, v18
375 ldr q21, [ktable, #16*12] // k0
376 SHA256SU0 0, 1
377 ldr q22, [ktable, #16*13] // k1
378 SHA256H 18, 19, 4
379 ldr q23, [ktable, #16*14] // k2
380 SHA256SU1 0, 2, 3
381 ldr q24, [ktable, #16*15] // k3
382 SHA256H2 19, 20, 4
383 add.4s v4, v0, v21
384
385 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
386 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
387 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
388
389 subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
390 b.gt L_loop
391
392L_wrapup:
393
394 sha256_hash_round 4
395 sha256_hash_round 5
396 sha256_hash_round 6
397 sha256_hash_round 7
398
399 add.4s v16, v16, v18
400 add.4s v17, v17, v19
401 st1.4s {v16,v17}, [hashes] // hashes q16 : d,c,b,a q17 : h,g,f,e
402
403#if BUILDKERNEL
404 // restore q9-q13, q0-q7, q16-q31
405 ld1.4s {v0, v1, v2, v3}, [sp], #64
406 ld1.4s {v4, v5, v6, v7}, [sp], #64
407 ld1.4s {v16, v17, v18, v19}, [sp], #64
408 ld1.4s {v20, v21, v22, v23}, [sp], #64
409 ld1.4s {v24}, [sp], #16
410#endif
411
412 ret lr
413
414
415#endif // arm64
416
417