/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following function for the arm64 architecture:
 *
 *     int memcmp_zero_ptr_aligned(const void *s, size_t n);
 *
 * The memcmp_zero_ptr_aligned function checks whether the buffer s of n bytes
 * contains only zero bytes. Both the address and the size of s must be
 * pointer-aligned (multiples of 8 bytes on arm64). It returns 0 if every byte
 * is zero and 1 otherwise; it also returns 0 when n is 0.
 */
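
/*
 * For illustration only, a rough C equivalent of the contract above (a
 * sketch, not the code assembled below; like the contract, it assumes n is
 * a multiple of 8):
 *
 *     int memcmp_zero_ptr_aligned(const void *s, size_t n)
 *     {
 *         const uint64_t *p = s;       // 8-byte words; uint64_t from <stdint.h>
 *         uint64_t acc = 0;
 *         for (size_t i = 0; i < n / 8; i++) {
 *             acc |= p[i];             // accumulate any non-zero bits
 *         }
 *         return acc != 0;             // 0 when all bytes are zero (or n == 0)
 *     }
 */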

/* this guard is used by tests */
#ifdef __arm64__

#include "asm.h"

.globl _memcmp_zero_ptr_aligned

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

.macro EstablishFrame
        ARM64_STACK_PROLOG
        stp     fp, lr, [sp, #-16]!
        mov     fp, sp
.endm

.macro ClearFrameAndReturn
        ldp     fp, lr, [sp], #16
        ARM64_STACK_EPILOG
.endm

/*****************************************************************************
 *  Constants                                                                *
 *****************************************************************************/

.text
.align 5

/*****************************************************************************
 *  memcmp_zero_ptr_aligned entrypoint                                       *
 *****************************************************************************/

_memcmp_zero_ptr_aligned:

        // For the use case in <rdar://problem/59523721>, memory corruption
        // should be rare, so no early exit is needed and the all-zero check
        // can stay simple: load every byte and logically OR the loads
        // together. If the accumulated result is still zero, every byte in
        // the buffer is zero.
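        //
        // The same accumulate-then-reduce idea, sketched with C NEON
        // intrinsics purely for illustration (the hand-written loop below
        // additionally handles alignment and the sub-64-byte tail):
        //
        //     uint8x16_t acc = vdupq_n_u8(0);              // from <arm_neon.h>
        //     for (size_t i = 0; i + 16 <= n; i += 16)
        //         acc = vorrq_u8(acc, vld1q_u8((const uint8_t *)s + i));
        //     return vmaxvq_u8(acc) != 0;                  // max lane is 0 iff all bytes are 0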

        EstablishFrame
        cmp     x1, #64
        b.lo    L_sizeIsSmall

        // Load the first 64 bytes, and compute the number of bytes to the
        // first 64-byte aligned location. Even though we are going to test
        // 64 bytes, only those preceding that 64-byte location "count" towards
        // reducing the length of the buffer or advancing the pointers.
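        //
        // For example, if s were 0x1008, the first 64-byte boundary at or
        // above s would be 0x1040, so x2 below would be 0x38 (56 bytes): the
        // first pair of ldp instructions still tests 64 bytes starting at
        // 0x1008, but only 56 of them shrink the remaining length.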
        mov     x2, x0              // copy the original addr
        add     x0, x0, #64
        and     x0, x0, #-64        // aligned addr
        ldp     q4, q5, [x2]
        ldp     q6, q7, [x2, #32]
        sub     x2, x0, x2          // bytes between original and aligned addr
        sub     x1, x1, x2          // update length
        subs    x1, x1, #64         // check length > 64
        b.ls    L_cleanup

L_loop:
        ldp     q0, q1, [x0]
        ldp     q2, q3, [x0, #32]
        orr.16b v4, v4, v0          // use orr to keep non-zero bytes
        orr.16b v5, v5, v1
        orr.16b v6, v6, v2
        orr.16b v7, v7, v3
        add     x0, x0, #64         // advance pointer
        subs    x1, x1, #64         // check length > 64
        b.hi    L_loop

L_cleanup:
        // Between 0 and 64 more bytes still need to be tested. The exact
        // number of remaining bytes is x1 + 64. Rather than dispatching on a
        // series of smaller conditional checks, we simply test 64 unaligned
        // bytes from x0+x1. This load may overlap bytes that were already
        // tested, which is harmless: ORing the same byte into the
        // accumulators twice does not change the result.
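        //
        // For example, if 40 bytes remain to be tested (x1 == -24), the loads
        // below start 24 bytes before x0, re-testing 24 already-accumulated
        // bytes along with the final 40 bytes of the buffer.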
        add     x0, x0, x1
        ldp     q0, q1, [x0]
        ldp     q2, q3, [x0, #32]
        orr.16b v4, v4, v0          // use orr to keep non-zero bytes
        orr.16b v5, v5, v1
        orr.16b v6, v6, v2
        orr.16b v7, v7, v3

        orr.16b v4, v4, v5          // reduce four regs into two
        orr.16b v6, v6, v7
        orr.16b v4, v4, v6          // reduce two regs into one
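        // The unsigned max across the 16 lanes of v4 is zero iff every
        // accumulated byte is zero, so a single umaxv collapses the test to
        // one byte.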
        umaxv.16b b0, v4            // reduce 16 bytes into one
        umov    w0, v0.b[0]         // move the byte to a GPR for testing
        tst     w0, w0
        cset    x0, ne              // return 1 if non-zero, 0 otherwise
        ClearFrameAndReturn

L_sizeIsSmall:
        cbz     x1, L_sizeIsZero    // return zero if length is zero

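        // Fewer than 64 (and at least 8) bytes remain. Because the size is
        // required to be a multiple of 8, test one 8-byte word per iteration,
        // ORing each word into the x3 accumulator.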
        mov     x3, #0
0:      ldr     x2, [x0], #8
        orr     x3, x3, x2          // use orr to keep non-zero bytes
        subs    x1, x1, #8          // update length
        b.hi    0b

        tst     x3, x3
        cset    x0, ne              // return 1 if non-zero, 0 otherwise
        ClearFrameAndReturn

L_sizeIsZero:
        mov     x0, #0
        ClearFrameAndReturn

#endif // __arm64__