| 1 | /* |
| 2 | * Copyright (c) 2012 Apple Computer, Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | * |
| 28 | * This file implements the following function for the arm64 architecture: |
| 29 | * |
| 30 | * int strncmp(const char *s1, const char *s2, size_t n); |
| 31 | * |
| 32 | * Returns 0 if the two strings are equal up to the first n bytes or to the |
| 33 | * end of the string, whichever comes first. Otherwise, returns the difference |
| 34 | * of the first mismatched characters interpreted as uint8_t. |
| 35 | */ |
| 36 | |
| 37 | #include <arm64/asm.h> |
| 38 | |
| 39 | .globl _strncmp |
| 40 | |
| 41 | /***************************************************************************** |
| 42 | * Macros * |
| 43 | *****************************************************************************/ |
| 44 | |
| 45 | .macro EstablishFrame |
| 46 | ARM64_STACK_PROLOG |
| 47 | stp fp, lr, [sp, #-16]! |
| 48 | mov fp, sp |
| 49 | .endm |
| 50 | |
| 51 | .macro ClearFrameAndReturn |
| 52 | ldp fp, lr, [sp], #16 |
| 53 | ARM64_STACK_EPILOG |
| 54 | .endm |
| 55 | |
| 56 | #include "../mach/arm/vm_param.h" |
| 57 | #define kVectorSize 16 |
| 58 | |
| 59 | /***************************************************************************** |
| 60 | * Constants * |
| 61 | *****************************************************************************/ |
| 62 | |
| 63 | .text |
| 64 | .align 5 |
| 65 | L_mask: |
| 66 | .quad 0x0706050403020100, 0x0f0e0d0c0b0a0908 |
| 67 | |
| 68 | /***************************************************************************** |
| 69 | * Entrypoints * |
| 70 | *****************************************************************************/ |
| 71 | |
| 72 | _strncmp: |
| 73 | EstablishFrame |
| 74 | eor x3, x3, x3 |
| 75 | cbz x2, L_scalarDone |
| 76 | // Compare one byte at a time until s1 has vector alignment. |
| 77 | 0: tst x0, #(kVectorSize-1) |
| 78 | b.eq L_s1aligned |
| 79 | ldrb w4, [x0],#1 // load byte from src1 |
| 80 | ldrb w5, [x1],#1 // load byte from src2 |
| 81 | subs x3, x4, x5 // if the are not equal |
| 82 | ccmp w4, #0, #4, eq // or we find an EOS |
| 83 | b.eq L_scalarDone // return the difference |
| 84 | subs x2, x2, #1 // decrement length |
| 85 | b.ne 0b // continue loop if non-zero |
| 86 | |
| 87 | // We found a mismatch or EOS before s1 became aligned. Simply return the |
| 88 | // difference between the last bytes that we loaded. |
| 89 | L_scalarDone: |
| 90 | mov x0, x3 |
| 91 | ClearFrameAndReturn |
| 92 | |
| 93 | L_s1aligned: |
| 94 | // If s2 is similarly aligned to s1, then we can use a naive vector comparison |
| 95 | // from this point on without worrying about spurious page faults; none of our |
| 96 | // loads will ever cross a page boundary, because they are all aligned. |
| 97 | tst x1, #(kVectorSize-1) |
| 98 | b.eq L_naiveVector |
| 99 | |
| 100 | /***************************************************************************** |
| 101 | * Careful chunk comparison * |
| 102 | *****************************************************************************/ |
| 103 | |
| 104 | // Otherwise, we need to be careful; although vector loads from s1 cannot |
| 105 | // cross a page boundary because they are aligned, s2 is not aligned. We |
| 106 | // compute the multiple of vector size that we can safely load before reaching |
| 107 | // a page boundary, and compare only that far before switching over to scalar |
| 108 | // comparisons to step across the page boundary. If this number happens to |
| 109 | // be zero, we jump directly to the scalar comparison. |
| 110 | neg x7, x1 |
| 111 | ands x7, x7, #(PAGE_MIN_SIZE-kVectorSize) |
| 112 | b.eq 2f |
| 113 | |
| 114 | .align 4 |
| 115 | // If n is less than the number of bytes before a page-crossing load, jump |
| 116 | // into the naive vector path instead, since we will not even reach a page |
| 117 | // crossing. Otherwise, decrement n by that number before we monkey with it, |
| 118 | // and set the decremented value aside. |
| 119 | 0: cmp x2, x7 |
| 120 | b.ls L_naiveVector |
| 121 | sub x6, x2, x7 |
| 122 | // Use vector comparisons until a mismatch or EOS is encountered, or the next |
| 123 | // vector load from s2 would be page-crossing. |
| 124 | 1: ldr q0, [x0],#(kVectorSize) |
| 125 | ldr q1, [x1],#(kVectorSize) |
| 126 | cmeq.16b v1, v0, v1 |
| 127 | and.16b v0, v0, v1 // contains zero byte iff mismatch or EOS |
| 128 | uminv.16b b1, v0 |
| 129 | fmov w3, s1 // zero only iff comparison is finished |
| 130 | cbz w3, L_vectorDone |
| 131 | subs x7, x7, #(kVectorSize) |
| 132 | b.ne 1b |
| 133 | // Restore the updated n to x2 |
| 134 | mov x2, x6 |
| 135 | // The next vector load will cross a page boundary. Instead, compare one byte |
| 136 | // at a time until s1 again has vector alignment, at which point we will have |
| 137 | // compared exactly 16 bytes. |
| 138 | 2: ldrb w4, [x0],#1 // load byte from src1 |
| 139 | ldrb w5, [x1],#1 // load byte from src2 |
| 140 | subs x3, x4, x5 // if the are not equal |
| 141 | ccmp w4, #0, #4, eq // or we find an EOS |
| 142 | b.eq L_scalarDone // return the difference |
| 143 | subs x2, x2, #1 // decrement length |
| 144 | b.eq L_scalarDone // exit loop if zero. |
| 145 | tst x0, #(kVectorSize-1) |
| 146 | b.ne 2b |
| 147 | // Having compared one vector's worth of bytes using a scalar comparison, we |
| 148 | // know that we are safely across the page boundary. Initialize x7 and jump |
| 149 | // back into the vector comparison part of the loop. |
| 150 | mov x7, #(PAGE_MIN_SIZE-kVectorSize) |
| 151 | b 0b |
| 152 | |
| 153 | /***************************************************************************** |
| 154 | * Naive vector comparison * |
| 155 | *****************************************************************************/ |
| 156 | |
| 157 | L_naiveVector: |
| 158 | subs x3, x2, #(kVectorSize) |
| 159 | b.lo L_scalar |
| 160 | add x4, x0, x3 // save the addresses of the last vectors |
| 161 | add x5, x1, x3 |
| 162 | mov x2, x3 // length -= kVectorSize |
| 163 | .align 4 |
| 164 | 0: |
| 165 | ldr q0, [x0],#(kVectorSize) |
| 166 | ldr q1, [x1],#(kVectorSize) |
| 167 | cmeq.16b v1, v0, v1 |
| 168 | and.16b v0, v0, v1 // contains zero byte iff mismatch or EOS |
| 169 | uminv.16b b1, v0 |
| 170 | fmov w3, s1 // zero only iff comparison is finished |
| 171 | cbz w3, L_vectorDone |
| 172 | subs x2, x2, #(kVectorSize) |
| 173 | b.hi 0b |
| 174 | |
| 175 | // compare the last vector |
| 176 | mov x0, x4 |
| 177 | mov x1, x5 |
| 178 | ldr q0, [x0],#(kVectorSize) |
| 179 | ldr q1, [x1],#(kVectorSize) |
| 180 | cmeq.16b v1, v0, v1 |
| 181 | and.16b v0, v0, v1 // contains zero byte iff mismatch or EOS |
| 182 | uminv.16b b1, v0 |
| 183 | fmov w3, s1 // zero only iff comparison is finished |
| 184 | cbz w3, L_vectorDone |
| 185 | |
| 186 | L_readNBytes: |
| 187 | eor x0, x0, x0 |
| 188 | ClearFrameAndReturn |
| 189 | |
| 190 | L_vectorDone: |
| 191 | // Load the bytes corresponding to the first mismatch or EOS and return |
| 192 | // their difference. |
| 193 | eor.16b v1, v1, v1 |
| 194 | cmhi.16b v0, v0, v1 // force non-zero lanes to 0xff |
| 195 | ldr q1, L_mask |
| 196 | orr.16b v0, v0, v1 // lane index in lanes containing mismatch or EOS |
| 197 | uminv.16b b1, v0 |
| 198 | fmov w3, s1 |
| 199 | sub x3, x3, #(kVectorSize) |
| 200 | ldrb w4, [x0, x3] |
| 201 | ldrb w5, [x1, x3] |
| 202 | sub x0, x4, x5 |
| 203 | ClearFrameAndReturn |
| 204 | |
| 205 | L_scalar: |
| 206 | ldrb w4, [x0],#1 // load byte from src1 |
| 207 | ldrb w5, [x1],#1 // load byte from src2 |
| 208 | subs x3, x4, x5 // if the are not equal |
| 209 | ccmp w4, #0, #4, eq // or we find an EOS |
| 210 | b.eq 1f // return the difference |
| 211 | subs x2, x2, #1 // decrement length |
| 212 | b.ne L_scalar // continue loop if non-zero |
| 213 | 1: |
| 214 | mov x0, x3 |
| 215 | ClearFrameAndReturn |
| 216 | |
| 217 | |