# Copyright (c) (2018-2020,2022) Apple Inc. All rights reserved.
#
# corecrypto is licensed under Apple Inc.’s Internal Use License Agreement (which
# is contained in the License.txt file distributed with corecrypto) and only to
# people who accept that license. IMPORTANT: Any license rights granted to you by
# Apple Inc. (if any) are limited to internal use within your organization only on
# devices and computers you own or control, for the sole purpose of verifying the
# security characteristics and correct functioning of the Apple Software. You may
# not, directly or indirectly, redistribute the Apple Software or any portions thereof.

/*
 * Copyright (c) 2019-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */
/*
	This file provides an arm64 hand implementation (NEON with the SHA-2 crypto
	extensions) of the following function

	void SHA256_Transform(SHA256_ctx *ctx, char *data, unsigned int num_blocks);

	which is a C function in sha2.c (from xnu).

	sha256 algorithm per-block description:

	1. W(0:15) = big-endian (per 4 bytes) loading of input data (64 bytes)
	2. load 8 digests a-h from ctx->state
	3. for r = 0:15
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
	4. for r = 16:63
			W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
			T1 = h + Sigma1(e) + Ch(e,f,g) + K[r] + W[r];
			d += T1;
			h = T1 + Sigma0(a) + Maj(a,b,c)
			permute a,b,c,d,e,f,g,h into h,a,b,c,d,e,f,g
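
	For reference, the per-block transform above (steps 1-4 plus the final
	ctx->state accumulation) can be written in plain C. This is an illustrative
	sketch only (not the code in sha2.c), with the a-h permutation unrolled
	into explicit shifts of the working variables:

	    #include <stdint.h>

	    static uint32_t rotr(uint32_t x, int n) { return (x >> n) | (x << (32 - n)); }
	    static uint32_t Ch(uint32_t x, uint32_t y, uint32_t z)  { return (x & y) ^ (~x & z); }
	    static uint32_t Maj(uint32_t x, uint32_t y, uint32_t z) { return (x & y) ^ (x & z) ^ (y & z); }
	    static uint32_t Sigma0(uint32_t x) { return rotr(x,2) ^ rotr(x,13) ^ rotr(x,22); }
	    static uint32_t Sigma1(uint32_t x) { return rotr(x,6) ^ rotr(x,11) ^ rotr(x,25); }
	    static uint32_t sigma0(uint32_t x) { return rotr(x,7) ^ rotr(x,18) ^ (x >> 3); }
	    static uint32_t sigma1(uint32_t x) { return rotr(x,17) ^ rotr(x,19) ^ (x >> 10); }

	    static void sha256_block(uint32_t state[8], const uint8_t p[64], const uint32_t K[64])
	    {
	        uint32_t W[64], s[8];
	        for (int r = 0; r < 16; r++)                    // step 1: big-endian load
	            W[r] = ((uint32_t)p[4*r] << 24) | ((uint32_t)p[4*r+1] << 16) |
	                   ((uint32_t)p[4*r+2] <<  8) |  (uint32_t)p[4*r+3];
	        for (int r = 16; r < 64; r++)                   // message schedule expansion
	            W[r] = W[r-16] + sigma1(W[r-2]) + W[r-7] + sigma0(W[r-15]);
	        for (int i = 0; i < 8; i++) s[i] = state[i];    // step 2: load digests a-h
	        for (int r = 0; r < 64; r++) {                  // steps 3 and 4
	            uint32_t T1 = s[7] + Sigma1(s[4]) + Ch(s[4],s[5],s[6]) + K[r] + W[r];
	            uint32_t T2 = Sigma0(s[0]) + Maj(s[0],s[1],s[2]);
	            s[7]=s[6]; s[6]=s[5]; s[5]=s[4]; s[4]=s[3]+T1;  // permute, with d += T1
	            s[3]=s[2]; s[2]=s[1]; s[1]=s[0]; s[0]=T1+T2;    // h = T1 + Sigma0(a) + Maj(a,b,c)
	        }
	        for (int i = 0; i < 8; i++) state[i] += s[i];   // ctx->state += digests a-h
	    }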

	In this arm64 implementation:
	- a circular window of the message schedule W(r:r+15) is updated and kept in q0-q3
	- the corresponding W+K(r:r+15) values are updated and kept in q4-q7
	- the 8 digests (a-h) are kept in q16-q17, with working copies in q18-q19 for the SHA-2 instructions

	the per-block implementation looks like

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in q4:q7

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for use four rounds ahead
	}

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;

	----------------------------------------------------------------------------

	our implementation (which allows multiple blocks per call) pipelines the loading
	of W/WK for the next block into the last 16 rounds of the current block:

	----------------------------------------------------------------------------

	load W(0:15) (big-endian per 4 bytes) into q0:q3
	pre_calculate and store W+K(0:15) in q4:q7

L_loop:

	load digests a-h from ctx->state;

	for (r=0;r<48;r+=4) {
		digests a-h update and permute round r:r+3
		update W([r:r+3]%16) and WK([r:r+3]%16) for use four rounds ahead
	}

	num_block--;
	if (num_block==0) jmp L_last_block;

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
		load W([r:r+3]%16) (big-endian per 4 bytes) into q0:q3
		pre_calculate and store W+K([r:r+3]%16) in q4:q7
	}

	ctx->state += digests a-h;

	jmp L_loop;

L_last_block:

	for (r=48;r<64;r+=4) {
		digests a-h update and permute round r:r+3
	}

	ctx->state += digests a-h;

	------------------------------------------------------------------------

	Apple CoreOS vector & numerics
*/

#if defined(__arm64__) && defined(__ARM_NEON) && defined(__ARM_FEATURE_SHA2)

#include "arm64_isa_compatibility.h"
#include "ccarm_pac_bti_macros.h"

	.subsections_via_symbols
	.text

	.p2align 4

K256:
	.long	0x428a2f98
	.long	0x71374491
	.long	0xb5c0fbcf
	.long	0xe9b5dba5
	.long	0x3956c25b
	.long	0x59f111f1
	.long	0x923f82a4
	.long	0xab1c5ed5
	.long	0xd807aa98
	.long	0x12835b01
	.long	0x243185be
	.long	0x550c7dc3
	.long	0x72be5d74
	.long	0x80deb1fe
	.long	0x9bdc06a7
	.long	0xc19bf174
	.long	0xe49b69c1
	.long	0xefbe4786
	.long	0x0fc19dc6
	.long	0x240ca1cc
	.long	0x2de92c6f
	.long	0x4a7484aa
	.long	0x5cb0a9dc
	.long	0x76f988da
	.long	0x983e5152
	.long	0xa831c66d
	.long	0xb00327c8
	.long	0xbf597fc7
	.long	0xc6e00bf3
	.long	0xd5a79147
	.long	0x06ca6351
	.long	0x14292967
	.long	0x27b70a85
	.long	0x2e1b2138
	.long	0x4d2c6dfc
	.long	0x53380d13
	.long	0x650a7354
	.long	0x766a0abb
	.long	0x81c2c92e
	.long	0x92722c85
	.long	0xa2bfe8a1
	.long	0xa81a664b
	.long	0xc24b8b70
	.long	0xc76c51a3
	.long	0xd192e819
	.long	0xd6990624
	.long	0xf40e3585
	.long	0x106aa070
	.long	0x19a4c116
	.long	0x1e376c08
	.long	0x2748774c
	.long	0x34b0bcb5
	.long	0x391c0cb3
	.long	0x4ed8aa4a
	.long	0x5b9cca4f
	.long	0x682e6ff3
	.long	0x748f82ee
	.long	0x78a5636f
	.long	0x84c87814
	.long	0x8cc70208
	.long	0x90befffa
	.long	0xa4506ceb
	.long	0xbef9a3f7
	.long	0xc67178f2

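/*
 K256 above holds the 64 SHA-256 round constants from FIPS 180-4: the first 32
 bits of the fractional parts of the cube roots of the first 64 primes. A small
 C sketch (illustrative only) that regenerates the table for verification:

     #include <math.h>
     #include <stdint.h>
     #include <stdio.h>

     int main(void) {
         int primes[64], n = 0;
         for (int p = 2; n < 64; p++) {                  // first 64 primes, by trial division
             int is_prime = 1;
             for (int d = 2; d * d <= p; d++)
                 if (p % d == 0) { is_prime = 0; break; }
             if (is_prime) primes[n++] = p;
         }
         for (int i = 0; i < 64; i++) {
             double frac = cbrt((double)primes[i]);
             frac -= floor(frac);                        // fractional part of cbrt(prime)
             printf(".long 0x%08x\n", (uint32_t)(frac * 4294967296.0));
         }
         return 0;
     }
*/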

	.p2align 4

	.globl	_AccelerateCrypto_SHA256_compress
_AccelerateCrypto_SHA256_compress:


#define hashes		x0
#define numblocks	x1
#define data		x2
#define ktable		x3
	BRANCH_TARGET_CALL
#ifdef __ILP32__
	uxtw	numblocks, numblocks	// in arm64_32 size_t is 32-bit, so we need to extend it
#endif
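
// The register assignments above imply a C prototype along these lines (a
// hedged sketch; the authoritative declaration lives in AccelerateCrypto's
// header, not here):
//
//     void AccelerateCrypto_SHA256_compress(uint32_t state[8],   // x0: 8 chaining words
//                                           size_t num_blocks,   // x1: number of 64-byte blocks
//                                           const void *data);   // x2: 64*num_blocks bytes of input
//
// ktable (x3) is not an argument: it is loaded with the address of K256 below.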


	adrp	ktable, K256@page
	cbnz	numblocks, 1f			// if number of blocks is nonzero, go on for sha256 transform operation
	ret	lr				// otherwise, return
1:
	add	ktable, ktable, K256@pageoff

#if BUILDKERNEL
	// save q0-q7 and q16-q24 (8+9 = 17 registers, 17*16 bytes)
	sub	x4, sp, #17*16
	sub	sp, sp, #17*16
	st1.4s	{v0, v1, v2, v3}, [x4], #64
	st1.4s	{v4, v5, v6, v7}, [x4], #64
	st1.4s	{v16, v17, v18, v19}, [x4], #64
	st1.4s	{v20, v21, v22, v23}, [x4], #64
	st1.4s	{v24}, [x4], #16
#endif

	ld1.4s	{v0,v1,v2,v3}, [data], #64	// w0,w1,w2,w3 need to bswap into big-endian

	rev32.16b	v0, v0			// byte swap of 1st 4 ints
	ldr	q21, [ktable, #16*0]
	rev32.16b	v1, v1			// byte swap of 2nd 4 ints
	ldr	q16, [hashes, #0]
	rev32.16b	v2, v2			// byte swap of 3rd 4 ints
	ldr	q17, [hashes, #16]
	rev32.16b	v3, v3			// byte swap of 4th 4 ints
	ldr	q22, [ktable, #16*1]

	mov.16b	v18, v16
	ldr	q23, [ktable, #16*2]
	add.4s	v4, v0, v21			// 1st 4 input + K256
	ldr	q24, [ktable, #16*3]
	add.4s	v5, v1, v22			// 2nd 4 input + K256
	mov.16b	v19, v17
	add.4s	v6, v2, v23			// 3rd 4 input + K256
	add.4s	v7, v3, v24			// 4th 4 input + K256
	add	ktable, ktable, #16*4


.macro sha256_round
	mov.16b	v20, v18
	SHA256SU0	$0, $1
	SHA256H		18, 19, $4
	SHA256SU1	$0, $2, $3
	SHA256H2	19, 20, $4
	add.4s	$6, $5, $7
.endm

// 4 hash-update rounds combined with loading and byte-swapping the next block's message words
.macro sha256_hash_load_round
	mov.16b	v20, v18
	SHA256H		18, 19, $0
	rev32.16b	$1, $1
	SHA256H2	19, 20, $0
	add.4s	$2, $1, $3
.endm

.macro sha256_hash_round
	mov.16b	v20, v18
	SHA256H		18, 19, $0
	SHA256H2	19, 20, $0
.endm
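
// For readers more familiar with intrinsics: one sha256_round group above maps
// to the following ACLE sketch (arm_neon.h; variable names are illustrative,
// with abcd/efgh standing for v18/v19 and wk for the W+K vector being consumed):
//
//     #include <arm_neon.h>
//
//     // 4 digest rounds plus one message-schedule step and one W+K precompute
//     static inline void sha256_4rounds(uint32x4_t *abcd, uint32x4_t *efgh,
//                                       uint32x4_t *w0, uint32x4_t w1,
//                                       uint32x4_t w2, uint32x4_t w3,
//                                       uint32x4_t wk, uint32x4_t k_next,
//                                       uint32x4_t *wk_next)
//     {
//         uint32x4_t abcd_saved = *abcd;                    // mov.16b  v20, v18
//         *w0   = vsha256su0q_u32(*w0, w1);                 // SHA256SU0
//         *abcd = vsha256hq_u32(*abcd, *efgh, wk);          // SHA256H
//         *w0   = vsha256su1q_u32(*w0, w2, w3);             // SHA256SU1
//         *efgh = vsha256h2q_u32(*efgh, abcd_saved, wk);    // SHA256H2
//         *wk_next = vaddq_u32(*w0, k_next);                // add.4s (W+K four rounds ahead)
//     }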

	// 12 vector rounds (48 scalar rounds) of hash update and message schedule update
	mov	w4, #3
L_i_loop:
	mov.16b	v20, v18
	ldr	q21, [ktable, #0]		// k0
	SHA256SU0	0, 1
	ldr	q22, [ktable, #16]		// k1
	SHA256H		18, 19, 4
	ldr	q23, [ktable, #32]		// k2
	SHA256SU1	0, 2, 3
	ldr	q24, [ktable, #48]		// k3
	SHA256H2	19, 20, 4
	add	ktable, ktable, #64
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	subs	w4, w4, #1
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
	b.gt	L_i_loop

	subs	numblocks, numblocks, #1	// decrement num_blocks; if this was the last block, skip the pipelined loop
	b.le	L_wrapup

	sub	ktable, ktable, #256

L_loop:

	ldr	q0, [data, #0]
	mov.16b	v20, v18
	ldr	q21, [ktable, #0]
	SHA256H		18, 19, 4
	ldr	q1, [data, #16]
	rev32.16b	v0, v0
	ldr	q2, [data, #32]
	SHA256H2	19, 20, 4
	ldr	q3, [data, #48]
	add.4s	v4, v0, v21

	ldr	q22, [ktable, #16]
	mov.16b	v20, v18
	add	data, data, #64
	SHA256H		18, 19, 5
	ldr	q23, [ktable, #32]
	rev32.16b	v1, v1
	ldr	q24, [ktable, #48]
	SHA256H2	19, 20, 5
	add.4s	v5, v1, v22

	sha256_hash_load_round	6, v2, v6, v23
	sha256_hash_load_round	7, v3, v7, v24

	add.4s	v18, v16, v18
	add.4s	v19, v17, v19
	mov.16b	v16, v18
	mov.16b	v17, v19

	// 12 vector rounds (48 scalar rounds) of hash update and message schedule update
	mov.16b	v20, v18
	ldr	q21, [ktable, #16*4]		// k0
	SHA256SU0	0, 1
	ldr	q22, [ktable, #16*5]		// k1
	SHA256H		18, 19, 4
	ldr	q23, [ktable, #16*6]		// k2
	SHA256SU1	0, 2, 3
	ldr	q24, [ktable, #16*7]		// k3
	SHA256H2	19, 20, 4
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24
	mov.16b	v20, v18
	ldr	q21, [ktable, #16*8]		// k0
	SHA256SU0	0, 1
	ldr	q22, [ktable, #16*9]		// k1
	SHA256H		18, 19, 4
	ldr	q23, [ktable, #16*10]		// k2
	SHA256SU1	0, 2, 3
	ldr	q24, [ktable, #16*11]		// k3
	SHA256H2	19, 20, 4
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	mov.16b	v20, v18
	ldr	q21, [ktable, #16*12]		// k0
	SHA256SU0	0, 1
	ldr	q22, [ktable, #16*13]		// k1
	SHA256H		18, 19, 4
	ldr	q23, [ktable, #16*14]		// k2
	SHA256SU1	0, 2, 3
	ldr	q24, [ktable, #16*15]		// k3
	SHA256H2	19, 20, 4
	add.4s	v4, v0, v21

	sha256_round	1, 2, 3, 0, 5, v1, v5, v22
	sha256_round	2, 3, 0, 1, 6, v2, v6, v23
	sha256_round	3, 0, 1, 2, 7, v3, v7, v24

	subs	numblocks, numblocks, #1	// decrement num_blocks; loop while more blocks remain
	b.gt	L_loop

L_wrapup:

	sha256_hash_round	4
	sha256_hash_round	5
	sha256_hash_round	6
	sha256_hash_round	7

	add.4s	v16, v16, v18
	add.4s	v17, v17, v19
	st1.4s	{v16,v17}, [hashes]		// hashes q16 : d,c,b,a   q17 : h,g,f,e

#if BUILDKERNEL
	// restore q0-q7 and q16-q24
	ld1.4s	{v0, v1, v2, v3}, [sp], #64
	ld1.4s	{v4, v5, v6, v7}, [sp], #64
	ld1.4s	{v16, v17, v18, v19}, [sp], #64
	ld1.4s	{v20, v21, v22, v23}, [sp], #64
	ld1.4s	{v24}, [sp], #16
#endif

	ret	lr


#endif		// arm64
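
/*
 Usage sketch (hedged; assumes the prototype noted near the entry point above):
 run the compressor over whole 64-byte blocks starting from the FIPS 180-4
 initial hash value. Message padding and finalization remain the caller's
 responsibility (e.g. sha2.c's):

     uint32_t state[8] = {
         0x6a09e667, 0xbb67ae85, 0x3c6ef372, 0xa54ff53a,
         0x510e527f, 0x9b05688c, 0x1f83d9ab, 0x5be0cd19,
     };
     AccelerateCrypto_SHA256_compress(state, nblocks, data);   // data: nblocks*64 bytes
*/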