| 1 | /* |
| 2 | * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | * |
| 28 | * This file implements the following functions for the arm64 architecture. |
| 29 | * |
| 30 | * void bcopy(const void * source, |
| 31 | * void * destination, |
| 32 | * size_t length); |
| 33 | * |
| 34 | * void *memmove(void * destination, |
| 35 | * const void * source, |
| 36 | * size_t n); |
| 37 | * |
| 38 | * void *memcpy(void * restrict destination, |
| 39 | * const void * restrict source, |
| 40 | * size_t n); |
| 41 | * |
| 42 | * All copy n successive bytes from source to destination. Memmove and memcpy |
| 43 | * return destination, whereas bcopy has no return value. Copying takes place |
| 44 | * as if it were through a temporary buffer -- after return destination |
| 45 | * contains exactly the bytes from source, even if the buffers overlap (this is |
| 46 | * not required of memcpy by the C standard; its behavior is undefined if the |
| 47 | * buffers overlap, but we are holding ourselves to the historical behavior of |
| 48 | * this function on MacOS). |
| 49 | */ |
| 50 | |
| 51 | #include "asm.h" |
| 52 | |
| 53 | .globl _bcopy |
| 54 | .globl _ovbcopy |
| 55 | .globl _memcpy |
| 56 | .globl _memmove |
| 57 | |
| 58 | /***************************************************************************** |
| 59 | * Macros * |
| 60 | *****************************************************************************/ |
| 61 | |
| 62 | #define kSmallCopy 64 |
| 63 | |
| 64 | /***************************************************************************** |
| 65 | * Entrypoints * |
| 66 | *****************************************************************************/ |
| 67 | |
| 68 | .text |
| 69 | .align 5 |
| 70 | _bcopy: |
| 71 | _ovbcopy: |
| 72 | // Translate bcopy(source, destination, length) into memcpy(destination, source, length) by swapping the first and second arguments (x0 and x1) through the scratch register x3. The length in x2 is left untouched, and since there is no branch or return here, execution falls through into _memcpy. |
| 73 | mov x3, x0 |
| 74 | mov x0, x1 |
| 75 | mov x1, x3 |
| 76 | |
| 77 | .align 4 |
| 78 | _memcpy: |
| 79 | _memmove: |
| 80 | // Entry: x0 = destination, x1 = source, x2 = length. Our preference is to copy the data in ascending address order, but if the |
| 81 | // buffers overlap such that the beginning of the destination buffer aliases |
| 82 | // the end of the source buffer, we need to copy in descending address order |
| 83 | // instead to preserve the memmove semantics. We detect this case with the |
| 84 | // test: |
| 85 | // |
| 86 | // destination - source < length (unsigned compare; "b.cc L_reverse" below is taken when this holds) |
| 87 | // |
| 88 | // If the address of the source buffer is higher than the address of the |
| 89 | // destination buffer, this arithmetic can overflow, but the overflowed value |
| 90 | // can only be smaller than length if the buffers do not overlap, so we don't |
| 91 | // need to worry about false positives due to the overflow (they happen, but |
| 92 | // only in cases where copying in either order is correct). Forward copies shorter than kSmallCopy bytes are sent to L_forwardSmallCopy. ARM64_STACK_PROLOG and PUSH_FRAME are macros that are not defined in this file (presumably they come from asm.h) -- NOTE(review): confirm exactly what they save. |
| 93 | ARM64_STACK_PROLOG |
| 94 | PUSH_FRAME |
| 95 | sub x3, x0, x1 |
| 96 | cmp x3, x2 |
| 97 | b.cc L_reverse |
| 98 | mov x3, x0 // x3 = working copy of the destination pointer; x0 itself is not written by any instruction visible in this file |
| 99 | cmp x2, #(kSmallCopy) |
| 100 | b.cc L_forwardSmallCopy |
| 101 | |
| 102 | /***************************************************************************** |
| 103 | * Forward large copy * |
| 104 | *****************************************************************************/ |
| 105 | |
| 106 | // Load the first 32 bytes from src, and compute the number of bytes to the |
| 107 | // first 32-byte aligned location in dst. Even though we are going to copy |
| 108 | // 32 bytes, only those preceding that 32-byte location "count" towards |
| 109 | // reducing the length of the buffer or advancing the pointers. We will need |
| 110 | // to issue the first load from the advanced src pointer BEFORE the store to |
| 111 | // the unmodified dst pointer. |
| 112 | add x3, x3, #32 |
| 113 | and x3, x3, #-32 // x3 = first 32-byte aligned address strictly above dst |
| 114 | ldp x12,x13,[x1] |
| 115 | ldp x14,x15,[x1, #16] |
| 116 | sub x5, x3, x0 // x5 = bytes between original dst and aligned dst (1..32) |
| 117 | add x1, x1, x5 // advance src by the same amount so it corresponds to x3 |
| 118 | |
| 119 | // At this point, data in the following registers is in flight: |
| 120 | // |
| 121 | // x0 original dst pointer |
| 122 | // x1 corresponding location in src buffer. |
| 123 | // x2 original length; once "sub x2, x2, x5" below has executed it is the length from the aligned location in dst to end of buffer. This is |
| 124 | // guaranteed to be >= (64 - 32). |
| 125 | // x3 aligned location in dst buffer. |
| 126 | // x12:x15 first 32 bytes of src buffer. |
| 127 | // |
| 128 | // We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x0 (the original, unaligned dst). The |
| 129 | // store *may* overlap the first 32 bytes of the load, so in order to get |
| 130 | // correct memmove semantics, the first 32 byte load must occur before the |
| 131 | // store. |
| 132 | // |
| 133 | // After loading these 32 bytes, we advance x1, and decrement the length by |
| 134 | // 64. If the remaining length of the buffer was less than or equal to 64 (b.ls), then we jump |
| 135 | // directly to the cleanup path. |
| 136 | ldp x8, x9, [x1] |
| 137 | ldp x10,x11,[x1, #16] |
| 138 | add x1, x1, #32 |
| 139 | sub x2, x2, x5 // x2 = length measured from the aligned location x3 |
| 140 | stp x12,x13,[x0] // initial unaligned store |
| 141 | stp x14,x15,[x0, #16] // initial unaligned store |
| 142 | subs x2, x2, #64 |
| 143 | b.ls L_forwardCleanup |
| 144 | |
| 145 | L_forwardCopyLoop: |
| 146 | // Main copy loop. On entry x3 is the 32-byte aligned store address, x1 is the next source address to load from, x8-x11 hold 32 bytes that have been loaded but not yet stored, and x2 is the number of bytes still to be loaded minus 32: |
| 147 | // |
| 148 | // 1. store the 32 bytes loaded in the previous loop iteration (stnp: store pair with non-temporal hint) |
| 149 | // 2. advance the destination pointer |
| 150 | // 3. load the next 32 bytes (ldnp: load pair with non-temporal hint) |
| 151 | // 4. advance the source pointer |
| 152 | // 5. subtract 32 from the length |
| 153 | // |
| 154 | // The loop is terminated (b.hi not taken) when 32 or fewer bytes remain to be loaded. Those |
| 155 | // trailing 1-32 bytes will be copied in the loop cleanup. |
| 156 | stnp x8, x9, [x3] |
| 157 | stnp x10,x11,[x3, #16] |
| 158 | add x3, x3, #32 |
| 159 | ldnp x8, x9, [x1] |
| 160 | ldnp x10,x11,[x1, #16] |
| 161 | add x1, x1, #32 |
| 162 | subs x2, x2, #32 |
| 163 | b.hi L_forwardCopyLoop |
| 164 | |
| 165 | L_forwardCleanup: |
| 166 | // There are 32 bytes in x8-x11 that were loaded in the previous loop |
| 167 | // iteration, which need to be stored to [x3,x3+32). In addition, between |
| 168 | // 0 and 32 more bytes need to be copied from x1 to x3 + 32. The exact |
| 169 | // number of bytes to copy is x2 + 32. Instead of using smaller conditional |
| 170 | // copies, we simply copy the last 32 bytes of the buffer, unaligned, from x1+x2 to 32+x3+x2 (so the stores end at 64+x3+x2). |
| 171 | // This copy may overlap with the first store, so the loads must come before |
| 172 | // the store of the data from the previous loop iteration. POP_FRAME and ARM64_STACK_EPILOG are macros not defined in this file; presumably they undo the entry sequence and return -- NOTE(review): confirm against asm.h. |
| 173 | add x1, x1, x2 |
| 174 | ldp x12,x13,[x1] |
| 175 | ldp x14,x15,[x1, #16] |
| 176 | stp x8, x9, [x3] |
| 177 | stp x10,x11,[x3, #16] |
| 178 | add x3, x3, x2 |
| 179 | stp x12,x13,[x3, #32] |
| 180 | stp x14,x15,[x3, #48] |
| 181 | POP_FRAME |
| 182 | ARM64_STACK_EPILOG |
| 183 | |
| 184 | /***************************************************************************** |
| 185 | * forward small copy * |
| 186 | *****************************************************************************/ |
| 187 | |
| 188 | // Copy 8 bytes (one 64-bit register) at a time until fewer than 8 bytes remain to be copied, then copy the remaining 0-7 bytes one at a time. The 8-byte loop body (label 0) sits above the entry label so that the length is tested before the first load. |
| 189 | // At the point of entry to L_forwardSmallCopy, the "calling convention" |
| 190 | // is as follows: |
| 191 | // |
| 192 | // x0 pointer to first byte of destination (not modified here) |
| 193 | // x1 pointer to first byte of source (post-incremented by each load) |
| 194 | // x2 length of buffers (may be zero) |
| 195 | // x3 pointer to first byte of destination (working copy, post-incremented by each store) |
| 196 | 0: ldr x6, [x1],#8 |
| 197 | str x6, [x3],#8 |
| 198 | L_forwardSmallCopy: |
| 199 | subs x2, x2, #8 |
| 200 | b.cs 0b |
| 201 | adds x2, x2, #8 |
| 202 | b.eq 2f |
| 203 | 1: ldrb w6, [x1],#1 |
| 204 | strb w6, [x3],#1 |
| 205 | subs x2, x2, #1 |
| 206 | b.ne 1b |
| 207 | 2: POP_FRAME |
| 208 | ARM64_STACK_EPILOG |
| 209 | |
| 210 | /***************************************************************************** |
| 211 | * Reverse copy engines * |
| 212 | *****************************************************************************/ |
| 213 | |
| 214 | // The reverse copy engines are identical in every way to the forward copy |
| 215 | // engines, except in that they do everything backwards. For this reason, they |
| 216 | // are somewhat more sparsely commented than the forward copy loops. I have |
| 217 | // tried to only comment things that might be somewhat surprising in how they |
| 218 | // differ from the forward implementation. |
| 219 | // |
| 220 | // The one important thing to note is that (almost without fail), x1 and the |
| 221 | // destination pointer (x4 in the small copy, x3 in the large copy loops) will point to ONE BYTE BEYOND the "right-hand edge" of the active buffer |
| 222 | // throughout these copy loops. x1 and x4 are initially advanced to that position |
| 223 | // in the L_reverse jump island. Because of this, whereas the forward copy |
| 224 | // loops generally follow a "copy data, then advance pointers" scheme, in the |
| 225 | // reverse copy loops, we advance the pointers, then copy the data. |
| 226 | |
| 227 | L_reverse: |
| 228 | // As a minor optimization, we early out if dst == src (x3 still holds dst - src from the overlap test at function entry). |
| 229 | cbz x3, L_return |
| 230 | // Advance both pointers to the ends of their respective buffers (x4 = dst + length, x1 = src + length; x0 is left alone) before |
| 231 | // jumping into the appropriate reverse copy loop: lengths below kSmallCopy go to L_reverseSmallCopy, everything else falls through to the reverse large copy. |
| 232 | add x4, x0, x2 |
| 233 | add x1, x1, x2 |
| 234 | cmp x2, #(kSmallCopy) |
| 235 | b.cc L_reverseSmallCopy |
| 236 | |
| 237 | /***************************************************************************** |
| 238 | * Reverse large copy: entered by fall-through with x0 = dst, x1 = src + length, x2 = length, x4 = dst + length. x3 becomes the 32-byte aligned store cursor (one past the bytes still to be stored), x8-x11 hold 32 bytes that are loaded but not yet stored, and x2 is biased by the same subtractions of 64 and 32 as in the forward large copy. * |
| 239 | *****************************************************************************/ |
| 240 | |
| 241 | ldp x12,x13,[x1, #-16] |
| 242 | ldp x14,x15,[x1, #-32] |
| 243 | sub x3, x4, #1 // In the forward copy, we used (dst+32) & -32 |
| 244 | and x3, x3, #-32 // to find an aligned location in the dest |
| 245 | sub x5, x4, x3 // buffer. Here we use (dst_end-1) & -32 instead, where dst_end = dst + length is in x4, |
| 246 | sub x1, x1, x5 // because we are going backwards. x5 = dst_end - aligned location (1..32). |
| 247 | sub x2, x2, x5 |
| 248 | ldp x8, x9, [x1, #-16] |
| 249 | ldp x10,x11,[x1, #-32] |
| 250 | stp x12,x13,[x4, #-16] |
| 251 | stp x14,x15,[x4, #-32] |
| 252 | sub x1, x1, #32 |
| 253 | subs x2, x2, #64 |
| 254 | b.ls L_reverseCleanup |
| 255 | |
| 256 | L_reverseCopyLoop: |
| 257 | stnp x8, x9, [x3, #-16] |
| 258 | stnp x10,x11,[x3, #-32] |
| 259 | sub x3, x3, #32 |
| 260 | ldnp x8, x9, [x1, #-16] |
| 261 | ldnp x10,x11,[x1, #-32] |
| 262 | sub x1, x1, #32 |
| 263 | subs x2, x2, #32 |
| 264 | b.hi L_reverseCopyLoop |
| 265 | |
| 266 | L_reverseCleanup: |
| 267 | sub x1, x1, x2 |
| 268 | ldp x12,x13,[x1, #-16] |
| 269 | ldp x14,x15,[x1, #-32] |
| 270 | stp x8, x9, [x3, #-16] |
| 271 | stp x10,x11,[x3, #-32] |
| 272 | stp x12,x13,[x0, #16] // In the forward copy, we need to compute the |
| 273 | stp x14,x15,[x0] // address of these stores, but here we already |
| 274 | POP_FRAME // have a pointer to the start of the buffer (x0); these two stores write the first 32 bytes of dst. |
| 275 | ARM64_STACK_EPILOG |
| 276 | |
| 277 | /***************************************************************************** |
| 278 | * reverse small copy: entered at L_reverseSmallCopy with x1 = src + length, x4 = dst + length and x2 = length. Copies 8 bytes at a time with pre-decrement addressing while at least 8 bytes remain, then the last 0-7 bytes one at a time. L_return at the end is the early-out target that L_reverse branches to when dst == src. * |
| 279 | *****************************************************************************/ |
| 280 | |
| 281 | 0: ldr x6, [x1,#-8]! |
| 282 | str x6, [x4,#-8]! |
| 283 | L_reverseSmallCopy: |
| 284 | subs x2, x2, #8 |
| 285 | b.cs 0b |
| 286 | adds x2, x2, #8 |
| 287 | b.eq 2f |
| 288 | 1: ldrb w6, [x1,#-1]! |
| 289 | strb w6, [x4,#-1]! |
| 290 | subs x2, x2, #1 |
| 291 | b.ne 1b |
| 292 | 2: POP_FRAME |
| 293 | ARM64_STACK_EPILOG |
| 294 | |
| 295 | |
| 296 | L_return: |
| 297 | POP_FRAME |
| 298 | ARM64_STACK_EPILOG |
| 299 | |