1# Copyright (c) (2018-2020,2022) Apple Inc. All rights reserved.
2#
3# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
4# is contained in the License.txt file distributed with corecrypto) and only to
5# people who accept that license. IMPORTANT: Any license rights granted to you by
6# Apple Inc. (if any) are limited to internal use within your organization only on
7# devices and computers you own or control, for the sole purpose of verifying the
8# security characteristics and correct functioning of the Apple Software. You may
9# not, directly or indirectly, redistribute the Apple Software or any portions thereof.
10
11/*
12 * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
13 *
14 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
15 *
16 * This file contains Original Code and/or Modifications of Original Code
17 * as defined in and that are subject to the Apple Public Source License
18 * Version 2.0 (the 'License'). You may not use this file except in
19 * compliance with the License. The rights granted to you under the License
20 * may not be used to create, or enable the creation or redistribution of,
21 * unlawful or unlicensed copies of an Apple operating system, or to
22 * circumvent, violate, or enable the circumvention or violation of, any
23 * terms of an Apple operating system software license agreement.
24 *
25 * Please obtain a copy of the License at
26 * http://www.opensource.apple.com/apsl/ and read it before using this file.
27 *
28 * The Original Code and all software distributed under the License are
29 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
30 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
31 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
32 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
33 * Please see the License for the specific language governing rights and
34 * limitations under the License.
35 *
36 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
37 */
38/*
 This file provides an arm64 (NEON + SHA-2 crypto extensions) hand implementation of the following function
40
41 void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);
42
43 which is a C function in sha2.c (from xnu).
44
45 sha256 algorithm per block description:
46
47 1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 byte)
48 2. load 8 digests a-h from ctx->state
49 3. for r = 0:15
50 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
51 d += T1;
52 h = T1 + Sigma0(a) + Maj(a,b,c)
53 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
54 4. for r = 16:63
55 W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
56 T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
57 d += T1;
58 h = T1 + Sigma0(a) + Maj(a,b,c)
59 permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
60
61 In the assembly implementation:
62 - a circular window of message schedule W(r:r+15) is updated and stored in q0-q3
63 - its corresponding W+K(r:r+15) is updated and stored in a stack space circular buffer
64 - the 8 digests (a-h) will be stored in GPR or memory
65
66 the implementation per block looks like
67
68 ----------------------------------------------------------------------------
69
70 load W(0:15) (big-endian per 4 bytes) into q0:q3
71 pre_calculate and store W+K(0:15) in stack
72
73 load digests a-h from ctx->state;
74
75 for (r=0;r<48;r+=4) {
76 digests a-h update and permute round r:r+3
77 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
78 }
79
80 for (r=48;r<64;r+=4) {
81 digests a-h update and permute round r:r+3
82 }
83
84 ctx->states += digests a-h;
85
86 ----------------------------------------------------------------------------
87
88 our implementation (allows multiple blocks per call) pipelines the loading of W/WK of a future block
89 into the last 16 rounds of its previous block:
90
91 ----------------------------------------------------------------------------
92
93 load W(0:15) (big-endian per 4 bytes) into q0:q3
94 pre_calculate and store W+K(0:15) in stack
95
96L_loop:
97
98 load digests a-h from ctx->state;
99
100 for (r=0;r<48;r+=4) {
101 digests a-h update and permute round r:r+3
102 update W([r:r+3]%16) and WK([r:r+3]%16) for the next 4th iteration
103 }
104
105 num_block--;
106 if (num_block==0) jmp L_last_block;
107
108 for (r=48;r<64;r+=4) {
109 digests a-h update and permute round r:r+3
110 load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
111 pre_calculate and store W+K([r:r+3]%16) in stack
112 }
113
114 ctx->states += digests a-h;
115
116 jmp L_loop;
117
118L_last_block:
119
120 for (r=48;r<64;r+=4) {
121 digests a-h update and permute round r:r+3
122 }
123
124 ctx->states += digests a-h;
125
126 ------------------------------------------------------------------------
127
128 Apple CoreOS vector & numerics
129*/
130
131#if defined(__arm64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)
132
133#include "arm64_isa_compatibility.h"
134#include "ccarm_pac_bti_macros.h"
135
136.subsections_via_symbols
137 .text
138
139 .p2align 4
140
141K256:
142 .long 0x428a2f98
143 .long 0x71374491
144 .long 0xb5c0fbcf
145 .long 0xe9b5dba5
146 .long 0x3956c25b
147 .long 0x59f111f1
148 .long 0x923f82a4
149 .long 0xab1c5ed5
150 .long 0xd807aa98
151 .long 0x12835b01
152 .long 0x243185be
153 .long 0x550c7dc3
154 .long 0x72be5d74
155 .long 0x80deb1fe
156 .long 0x9bdc06a7
157 .long 0xc19bf174
158 .long 0xe49b69c1
159 .long 0xefbe4786
160 .long 0x0fc19dc6
161 .long 0x240ca1cc
162 .long 0x2de92c6f
163 .long 0x4a7484aa
164 .long 0x5cb0a9dc
165 .long 0x76f988da
166 .long 0x983e5152
167 .long 0xa831c66d
168 .long 0xb00327c8
169 .long 0xbf597fc7
170 .long 0xc6e00bf3
171 .long 0xd5a79147
172 .long 0x06ca6351
173 .long 0x14292967
174 .long 0x27b70a85
175 .long 0x2e1b2138
176 .long 0x4d2c6dfc
177 .long 0x53380d13
178 .long 0x650a7354
179 .long 0x766a0abb
180 .long 0x81c2c92e
181 .long 0x92722c85
182 .long 0xa2bfe8a1
183 .long 0xa81a664b
184 .long 0xc24b8b70
185 .long 0xc76c51a3
186 .long 0xd192e819
187 .long 0xd6990624
188 .long 0xf40e3585
189 .long 0x106aa070
190 .long 0x19a4c116
191 .long 0x1e376c08
192 .long 0x2748774c
193 .long 0x34b0bcb5
194 .long 0x391c0cb3
195 .long 0x4ed8aa4a
196 .long 0x5b9cca4f
197 .long 0x682e6ff3
198 .long 0x748f82ee
199 .long 0x78a5636f
200 .long 0x84c87814
201 .long 0x8cc70208
202 .long 0x90befffa
203 .long 0xa4506ceb
204 .long 0xbef9a3f7
205 .long 0xc67178f2
206
207
208 .p2align 4
209
210 .globl _AccelerateCrypto_SHA256_compress
211_AccelerateCrypto_SHA256_compress:
212
213
214 #define hashes x0
215 #define numblocks x1
216 #define data x2
217 #define ktable x3
218 BRANCH_TARGET_CALL
219#ifdef __ILP32__
220 uxtw numblocks, numblocks // in arm64_32 size_t is 32-bit, so we need to extend it
221#endif
222
223
224 adrp ktable, K256@page
225 cbnz numblocks, 1f // if number of blocks is nonzero, go on for sha256 transform operation
226 ret lr // otherwise, return
2271:
228 add ktable, ktable, K256@pageoff
229
230#if BUILDKERNEL
231 // save q0-q7, q16-q24 8+8+1=19
232 sub x4, sp, #17*16
233 sub sp, sp, #17*16
234 st1.4s {v0, v1, v2, v3}, [x4], #64
235 st1.4s {v4, v5, v6, v7}, [x4], #64
236 st1.4s {v16, v17, v18, v19}, [x4], #64
237 st1.4s {v20, v21, v22, v23}, [x4], #64
238 st1.4s {v24}, [x4], #16
239#endif
240
241 ld1.4s {v0,v1,v2,v3}, [data], #64 // w0,w1,w2,w3 need to bswap into big-endian
242
243 rev32.16b v0, v0 // byte swap of 1st 4 ints
244 ldr q21, [ktable, #16*0]
245 rev32.16b v1, v1 // byte swap of 2nd 4 ints
246 ldr q16, [hashes, #0]
247 rev32.16b v2, v2 // byte swap of 3rd 4 ints
248 ldr q17, [hashes, #16]
249 rev32.16b v3, v3 // byte swap of 4th 4 ints
250 ldr q22, [ktable, #16*1]
251
252 mov.16b v18, v16
253 ldr q23, [ktable, #16*2]
254 add.4s v4, v0, v21 // 1st 4 input + K256
255 ldr q24, [ktable, #16*3]
256 add.4s v5, v1, v22 // 2nd 4 input + K256
257 mov.16b v19, v17
258 add.4s v6, v2, v23 // 3rd 4 input + K256
259 add.4s v7, v3, v24 // 4th 4 input + K256
260 add ktable, ktable, #16*4
261
262
263 .macro sha256_round
264 mov.16b v20, v18
265 SHA256SU0 $0, $1
266 SHA256H 18, 19, $4
267 SHA256SU1 $0, $2, $3
268 SHA256H2 19, 20, $4
269 add.4s $6, $5, $7
270 .endm
271
272 // 4 vector hashes update and load next vector rounds
273 .macro sha256_hash_load_round
274 mov.16b v20, v18
275 SHA256H 18, 19, $0
276 rev32.16b $1, $1
277 SHA256H2 19, 20, $0
278 add.4s $2, $1, $3
279 .endm
280
281 .macro sha256_hash_round
282 mov.16b v20, v18
283 SHA256H 18, 19, $0
284 SHA256H2 19, 20, $0
285 .endm
286
287 // 12 vector hash and sequence update rounds
288 mov w4, #3
289L_i_loop:
290 mov.16b v20, v18
291 ldr q21, [ktable, #0] // k0
292 SHA256SU0 0, 1
293 ldr q22, [ktable, #16] // k1
294 SHA256H 18, 19, 4
295 ldr q23, [ktable, #32] // k2
296 SHA256SU1 0, 2, 3
297 ldr q24, [ktable, #48] // k3
298 SHA256H2 19, 20, 4
299 add ktable, ktable, #64
300 add.4s v4, v0, v21
301
302 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
303 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
304 subs w4, w4, #1
305 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
306 b.gt L_i_loop
307
308 subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
309 b.le L_wrapup
310
311 sub ktable, ktable, #256
312
313L_loop:
314
315 ldr q0, [data, #0]
316 mov.16b v20, v18
317 ldr q21, [ktable,#0]
318 SHA256H 18, 19, 4
319 ldr q1, [data, #16]
320 rev32.16b v0, v0
321 ldr q2, [data, #32]
322 SHA256H2 19, 20, 4
323 ldr q3, [data, #48]
324 add.4s v4, v0, v21
325
326 ldr q22, [ktable,#16]
327 mov.16b v20, v18
328 add data, data, #64
329 SHA256H 18, 19, 5
330 ldr q23, [ktable,#32]
331 rev32.16b v1, v1
332 ldr q24, [ktable,#48]
333 SHA256H2 19, 20, 5
334 add.4s v5, v1, v22
335
336 sha256_hash_load_round 6, v2, v6, v23
337 sha256_hash_load_round 7, v3, v7, v24
338
339 add.4s v18, v16, v18
340 add.4s v19, v17, v19
341 mov.16b v16, v18
342 mov.16b v17, v19
343
344 // 12 vector hash and sequence update rounds
345 mov.16b v20, v18
346 ldr q21, [ktable, #16*4] // k0
347 SHA256SU0 0, 1
348 ldr q22, [ktable, #16*5] // k1
349 SHA256H 18, 19, 4
350 ldr q23, [ktable, #16*6] // k2
351 SHA256SU1 0, 2, 3
352 ldr q24, [ktable, #16*7] // k3
353 SHA256H2 19, 20, 4
354 add.4s v4, v0, v21
355
356 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
357 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
358 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
359 mov.16b v20, v18
360 ldr q21, [ktable, #16*8] // k0
361 SHA256SU0 0, 1
362 ldr q22, [ktable, #16*9] // k1
363 SHA256H 18, 19, 4
364 ldr q23, [ktable, #16*10] // k2
365 SHA256SU1 0, 2, 3
366 ldr q24, [ktable, #16*11] // k3
367 SHA256H2 19, 20, 4
368 add.4s v4, v0, v21
369
370 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
371 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
372 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
373
374 mov.16b v20, v18
375 ldr q21, [ktable, #16*12] // k0
376 SHA256SU0 0, 1
377 ldr q22, [ktable, #16*13] // k1
378 SHA256H 18, 19, 4
379 ldr q23, [ktable, #16*14] // k2
380 SHA256SU1 0, 2, 3
381 ldr q24, [ktable, #16*15] // k3
382 SHA256H2 19, 20, 4
383 add.4s v4, v0, v21
384
385 sha256_round 1, 2, 3, 0, 5, v1, v5, v22
386 sha256_round 2, 3, 0, 1, 6, v2, v6, v23
387 sha256_round 3, 0, 1, 2, 7, v3, v7, v24
388
389 subs numblocks, numblocks, #1 // pre-decrement num_blocks by 1
390 b.gt L_loop
391
392L_wrapup:
393
394 sha256_hash_round 4
395 sha256_hash_round 5
396 sha256_hash_round 6
397 sha256_hash_round 7
398
399 add.4s v16, v16, v18
400 add.4s v17, v17, v19
401 st1.4s {v16,v17}, [hashes] // hashes q16 : d,c,b,a q17 : h,g,f,e
402
403#if BUILDKERNEL
404 // restore q9-q13, q0-q7, q16-q31
405 ld1.4s {v0, v1, v2, v3}, [sp], #64
406 ld1.4s {v4, v5, v6, v7}, [sp], #64
407 ld1.4s {v16, v17, v18, v19}, [sp], #64
408 ld1.4s {v20, v21, v22, v23}, [sp], #64
409 ld1.4s {v24}, [sp], #16
410#endif
411
412 ret lr
413
414
415#endif // arm64
416
417