/*
 * Copyright (c) 2016-2021 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

/*
 * extern uint32_t os_cpu_copy_in_cksum(const void *src, void *dst,
 *	uint32_t len, uint32_t sum0);
 *
 * input :
 *	src : source starting address
 *	dst : destination starting address
 *	len : byte stream length
 *	sum0 : initial 32-bit sum
 *
 * output :
 *	the source byte stream is copied into the destination buffer
 *	the function returns the partial 16-bit checksum accumulated
 *	in a 32-bit variable (without 1's complement); the caller is
 *	responsible for folding the 32-bit sum into 16 bits and
 *	performing the 1's complement if applicable
 */
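
/*
 * For reference, the even-aligned little-endian case computes the same
 * result as the following portable C sketch (illustrative only; the
 * function name is hypothetical and not part of any API):
 *
 *	static uint32_t
 *	os_cpu_copy_in_cksum_model(const uint8_t *src, uint8_t *dst,
 *	    uint32_t len, uint32_t sum0)
 *	{
 *		uint64_t partial = 0, final_acc;
 *		uint32_t i;
 *
 *		for (i = 0; i + 1 < len; i += 2) {
 *			dst[i] = src[i];
 *			dst[i + 1] = src[i + 1];
 *			// sum native-order 16-bit words; carries pile up
 *			// safely in the upper bits of the 64-bit accumulator
 *			partial += (uint32_t)src[i] |
 *			    ((uint32_t)src[i + 1] << 8);
 *		}
 *		if (len & 1) {
 *			dst[len - 1] = src[len - 1];
 *			partial += src[len - 1];  // trailing byte, low lane
 *		}
 *		partial = (partial >> 32) + (partial & 0xffffffff);
 *		partial = (partial >> 16) + (partial & 0xffff);
 *		final_acc = (sum0 >> 16) + (sum0 & 0xffff) +
 *		    (partial >> 16) + (partial & 0xffff);
 *		final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *		final_acc = (final_acc >> 16) + (final_acc & 0xffff);
 *		return (uint32_t)final_acc;
 *	}
 */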

/*
 * The following definitions default the implementation to little-endian
 * architectures.
 */
#define LITTLE_ENDIAN	1
#define BYTE_ORDER	LITTLE_ENDIAN

/*
 * ARM64 kernel mode -- just like user mode -- no longer requires saving
 * the vector registers, since it's done by the exception handler code.
 */
#define SAVE_REGISTERS	0

	.globl	_os_cpu_copy_in_cksum
	.text
	.align	4
_os_cpu_copy_in_cksum:

#define src		x0
#define dst		x1
#define len		x2
#define sum		x3
#define need_swap	x5
#define t		x6
#define partial		x7
#define wpartial	w7

	mov	partial, #0		// partial = 0;
	mov	need_swap, #0		// need_swap = 0;

	cbz	len, L_len_0

/*
 * Copy the leading byte if the source starts on an odd address; use w7
 * for the temporary sum and deposit this byte into the high byte of its
 * 16-bit word:
 *
 *	t = 0;
 *	if ((uintptr_t)src & 1) {
 *		t = *src << 8;		// into the high byte (little-endian)
 *		*dst++ = *src++;
 *		--len;
 *		need_swap = 1;		// undo the lane swap at the end
 *	}
 */
	tst	src, #1
	b.eq	1f
	ldrb	wpartial, [src]
	add	src, src, #1
	strb	wpartial, [dst], #1
#if BYTE_ORDER == LITTLE_ENDIAN
	lsl	partial, partial, #8
#endif
	sub	len, len, #1
	mov	need_swap, #1
	cbz	len, L_len_0
1:

#if SAVE_REGISTERS
	/*
	 * we always use v0-v3, and also v4-v7/v16-v19 if len >= 128,
	 * so allocate 12*16 bytes on the stack, store v0-v3 now, and
	 * keep x11 as the running store pointer
	 */
	sub	sp, sp, #12*16
	mov	x11, sp
	st1.4s	{v0, v1, v2, v3}, [x11], #4*16
#endif

/*
 * compare len against 8*16; with fewer bytes than that, skip the
 * 128-byte loop and fall through to the 4*16-byte code.
 * v0, v1 hold the running sums once we leave the L128 loop
 */
	eor.16b	v0, v0, v0
	eor.16b	v1, v1, v1
	cmp	len, #8*16
	mov	v0.d[0], partial	// move partial to the 1st 64-bit lane of v0
	b.lt	L64_bytes
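	/*
	 * seeding v0.d[0] with the running partial sum lets the vector
	 * path absorb the odd-byte prologue; the final folds treat it
	 * like any other accumulator lane
	 */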

#if SAVE_REGISTERS
	/* if we are here, we need to save v4-v7/v16-v19 for kernel mode */
	st1.4s	{v4, v5, v6, v7}, [x11], #4*16
	st1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

/*
 * accumulate adjacent 32-bit pairs into the 8 64-bit lanes of v0-v3:
 * load the 1st 8 vectors, and clear the remaining accumulators v2-v3
 * (v0-v1 were cleared above)
 */
	ldr	q4, [src], #8*16
	eor.16b	v2, v2, v2
	ldr	q5, [src, #-7*16]
	eor.16b	v3, v3, v3
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	/* branch to the finish-up code if fewer than 128 bytes remain */
	subs	len, len, #2*8*16
	b.lt	L128_finishup

/*
 * software-pipelined loop: store and accumulate the 8 vectors loaded on
 * the previous trip (32 32-bit words into 8 64-bit accumulator lanes
 * per iteration) while loading the next 8 vectors
 */
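
/*
 * uadalp.2d vd, vn adds adjacent pairs of unsigned 32-bit elements of vn
 * and accumulates them into the 64-bit lanes of vd, i.e. for i = 0, 1:
 *
 *	vd.d[i] += (uint64_t)vn.s[2*i] + (uint64_t)vn.s[2*i+1];
 *
 * summing the stream as 32-bit words is safe because no carries are lost
 * in the 64-bit lanes; the 64->32->16 folds at the end then recover the
 * same value as a 16-bit 1's complement sum
 */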
L128_loop:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	ldr	q4, [src], #16*8
	ldr	q5, [src, #-7*16]

	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7
	ldr	q6, [src, #-6*16]
	ldr	q7, [src, #-5*16]

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	ldr	q16, [src, #-4*16]
	ldr	q17, [src, #-3*16]

	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19
	ldr	q18, [src, #-2*16]
	ldr	q19, [src, #-1*16]

	subs	len, len, #8*16
	b.ge	L128_loop
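	/*
	 * on loop exit the last 8 vectors loaded are still unconsumed;
	 * L128_finishup below stores and accumulates them without
	 * loading any more data
	 */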

L128_finishup:
	str	q4, [dst], #16*8
	uadalp.2d	v0, v4
	str	q5, [dst, #-7*16]
	uadalp.2d	v1, v5
	str	q6, [dst, #-6*16]
	uadalp.2d	v2, v6
	str	q7, [dst, #-5*16]
	uadalp.2d	v3, v7

	str	q16, [dst, #-4*16]
	uadalp.2d	v0, v16
	str	q17, [dst, #-3*16]
	uadalp.2d	v1, v17
	str	q18, [dst, #-2*16]
	uadalp.2d	v2, v18
	str	q19, [dst, #-1*16]
	uadalp.2d	v3, v19

	add	len, len, #8*16		// undo the loop's pre-decrement

	add.2d	v0, v0, v2		// fold 4 accumulators down to 2
	add.2d	v1, v1, v3

#if SAVE_REGISTERS
	/* restore v4-v7/v16-v19 as they won't be used any more */
	add	x11, sp, #4*16
	ld1.4s	{v4, v5, v6, v7}, [x11], #4*16
	ld1.4s	{v16, v17, v18, v19}, [x11], #4*16
#endif

L64_bytes:
	cmp	len, #4*16
	b.lt	L32_bytes

	ldr	q2, [src], #4*16
	ldr	q3, [src, #-3*16]
	str	q2, [dst], #4*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-3*16]
	uadalp.2d	v1, v3

	ldr	q2, [src, #-2*16]
	ldr	q3, [src, #-1*16]
	str	q2, [dst, #-2*16]
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #4*16

L32_bytes:
	cmp	len, #2*16
	b.lt	L16_bytes
	ldr	q2, [src], #2*16
	ldr	q3, [src, #-1*16]
	str	q2, [dst], #2*16
	uadalp.2d	v0, v2
	str	q3, [dst, #-1*16]
	uadalp.2d	v1, v3
	sub	len, len, #2*16

L16_bytes:
	add.2d	v0, v0, v1		// fold the 2 accumulators into v0
	cmp	len, #16
	b.lt	L8_bytes
	ldr	q2, [src], #16
	str	q2, [dst], #16
	uadalp.2d	v0, v2
	sub	len, len, #16

L8_bytes:
	eor.16b	v1, v1, v1
	eor.16b	v2, v2, v2
	eor.16b	v3, v3, v3
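	/*
	 * each sub-16-byte chunk (8/4/2/1 bytes) lands in its own cleared
	 * vector (v1 is re-cleared between its two uses), so the partial
	 * words occupy disjoint lanes until the uadalp folds below
	 */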

	tst	len, #8
	b.eq	L4_bytes
	ldr	d1, [src], #8
	str	d1, [dst], #8

L4_bytes:
	tst	len, #4
	b.eq	L2_bytes
	ldr	s2, [src], #4
	str	s2, [dst], #4

L2_bytes:
	uadalp.2d	v0, v1		// accumulate the 8-byte chunk
	eor.16b	v1, v1, v1		// v1 is reused for the trailing byte
	tst	len, #2
	b.eq	L_trailing_bytes
	ldr	h3, [src], #2
	str	h3, [dst], #2

L_trailing_bytes:
	tst	len, #1
	b.eq	L0_bytes
	ldr	b1, [src], #1
	str	b1, [dst], #1
#if BYTE_ORDER != LITTLE_ENDIAN
	shl.4h	v1, v1, #8		// partial <<= 8;
#endif

L0_bytes:
	uadalp.2d	v2, v3		// fold the 2-byte chunk into v2
	uadalp.2d	v0, v1		// fold the trailing byte into v0
	uadalp.2d	v0, v2		// fold the 4- and 2-byte chunks into v0

	addp.2d	d0, v0			// d0 = v0.d[0] + v0.d[1]
	fmov	partial, d0		// move the 64-bit sum back to a GPR

#if SAVE_REGISTERS
	/* restore v0-v3 and deallocate the stack space */
	ld1.4s	{v0, v1, v2, v3}, [sp]
	add	sp, sp, #12*16
#endif

	/* partial = (partial >> 32) + (partial & 0xffffffff); */
	and	t, partial, #0xffffffff
	add	partial, t, partial, lsr #32

	/* partial = (partial >> 16) + (partial & 0xffff); */
	and	t, partial, #0xffff
	add	partial, t, partial, lsr #16

L_len_0:
	/*
	 * if (need_swap)
	 *	partial = (partial << 8) + (partial >> 24);
	 */
	cbz	need_swap, 1f
	lsl	t, partial, #8
	add	partial, t, partial, lsr #24
1:
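	/*
	 * the rotate above works because summing a stream that starts on
	 * an odd address byte-swaps every 16-bit word; the accumulated
	 * value is therefore the byte swap of the true sum, and one
	 * rotate by 8 puts it right
	 */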
	/* final_acc = (sum0 >> 16) + (sum0 & 0xffff); */
	and	x0, sum, #0xffff
	add	x0, x0, sum, lsr #16

	/* final_acc += (partial >> 16) + (partial & 0xffff); */
	add	x0, x0, partial, lsr #16
	and	partial, partial, #0xffff
	add	x0, x0, partial

	/* final_acc = (final_acc >> 16) + (final_acc & 0xffff); */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16

	/*
	 * One final fold in case of carry from the previous one.
	 * final_acc = (final_acc >> 16) + (final_acc & 0xffff);
	 */
	and	t, x0, #0xffff
	add	x0, t, x0, lsr #16
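	/*
	 * a single fold can itself carry into bit 16, hence the repeat;
	 * the value entering the first fold is well under 2^18, so two
	 * folds are guaranteed to fit the result in 16 bits
	 */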

	/*
	 * the 1's complement is intentionally left to the caller (see the
	 * header comment above); a variant that returned the complemented
	 * checksum would finish with:
	 *
	 *	return (~final_acc & 0xffff);
	 *
	 *	mvn	w0, w0
	 *	and	w0, w0, #0xffff
	 */

	ret	lr
