/*
 * Copyright (c) 2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following function for the arm64 architecture:
 *
 *	int memcmp_zero_ptr_aligned(const void *s, size_t n);
 *
 * The memcmp_zero_ptr_aligned function checks whether the buffer s of n bytes
 * contains all zeros. The address and size of s must be pointer-aligned
 * (8-byte aligned for arm64).
 * Returns 0 if the buffer is all zeros and 1 otherwise. Also returns 0 if n is 0.
 */
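
/*
 * For reference, here is a minimal C sketch of the semantics implemented
 * below (a hypothetical model for documentation only, not part of this
 * build):
 *
 *	int memcmp_zero_ptr_aligned(const void *s, size_t n)
 *	{
 *		const uint64_t *p = (const uint64_t *)s;
 *		uint64_t acc = 0;
 *		for (size_t i = 0; i < n / 8; i++)	// n is a multiple of 8
 *			acc |= p[i];			// OR keeps any non-zero bits
 *		return acc != 0;			// 0 iff every byte was zero
 *	}
 *
 * Note that n == 0 falls straight through to "return 0", matching the
 * contract above. The assembly below computes the same result, but ORs
 * 64 bytes per loop iteration in NEON registers for large buffers.
 */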

/* this guard is used by tests */
#ifdef __arm64__

#include "asm.h"

.globl _memcmp_zero_ptr_aligned

/*****************************************************************************
 * Macros                                                                    *
 *****************************************************************************/

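// Standard prologue: save fp/lr and establish a frame. ARM64_STACK_PROLOG
// and ARM64_STACK_EPILOG come from asm.h and handle return-address
// protection on targets that require it.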
.macro EstablishFrame
	ARM64_STACK_PROLOG
	stp	fp, lr, [sp, #-16]!
	mov	fp, sp
.endm

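// Matching epilogue: tear down the frame established above and return.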
.macro ClearFrameAndReturn
	ldp	fp, lr, [sp], #16
	ARM64_STACK_EPILOG
.endm

/*****************************************************************************
 * Constants                                                                 *
 *****************************************************************************/

.text
.align 5

/*****************************************************************************
 * memcmp_zero_ptr_aligned entrypoint                                        *
 *****************************************************************************/

_memcmp_zero_ptr_aligned:

	// For the use case in <rdar://problem/59523721>, memory corruption should
	// be rare, so the check for all zeros can be simple: no early out is
	// needed. We just load all the bytes and logically OR them together. If
	// the result is still zero, all the bytes are zero (see the C sketch in
	// the header comment).

	EstablishFrame
	cmp	x1, #64
	b.lo	L_sizeIsSmall

	// Load the first 64 bytes, and compute the number of bytes to the
	// first 64-byte aligned location. Even though we are going to test
	// 64 bytes, only those preceding that 64-byte location "count" towards
	// reducing the length of the buffer or advancing the pointers.
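	// For example (hypothetical values): if s = 0x1008 and n = 256, the
	// first 64-byte aligned address is 0x1040, so x2 = 0x38 (56 bytes) and
	// the remaining length becomes 256 - 56 = 200. The 64 bytes loaded here
	// (0x1008..0x1047) overlap the first loop iteration (0x1040..0x107f) by
	// 8 bytes; re-testing bytes is harmless.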
	mov	x2, x0			// copy the original addr
	add	x0, x0, #64
	and	x0, x0, #-64		// aligned addr
	ldp	q4, q5, [x2]
	ldp	q6, q7, [x2, #32]
	sub	x2, x0, x2		// bytes between original and aligned addr
	sub	x1, x1, x2		// update length
	subs	x1, x1, #64		// pre-bias: x1 = bytes remaining - 64
	b.ls	L_cleanup		// skip the loop if at most 64 bytes remain

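	// Loop invariant (informal): v4-v7 hold the OR of every byte tested
	// so far, and x1 + 64 bytes remain to be tested starting at x0.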
L_loop:
	ldp	q0, q1, [x0]
	ldp	q2, q3, [x0, #32]
	orr.16b	v4, v4, v0		// use orr to keep non-zero bytes
	orr.16b	v5, v5, v1
	orr.16b	v6, v6, v2
	orr.16b	v7, v7, v3
	add	x0, x0, #64		// advance pointer
	subs	x1, x1, #64		// x1 = bytes remaining - 64
	b.hi	L_loop			// loop while more than 64 bytes remain

L_cleanup:
	// Between 0 and 64 more bytes need to be tested. The exact number of
	// bytes to test is x1 + 64 (x1 is in [-64, 0] when read as signed).
	// Instead of using smaller conditional checks, we simply test the 64
	// unaligned bytes ending at the end of the buffer, starting at x0+x1.
	// This load may overlap the previous one, but that's OK.
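	// For example (hypothetical values): if 40 bytes remain, x1 = -24, so
	// this tests the 64 bytes starting at x0 - 24: the final 40 bytes plus
	// 24 bytes already checked by the last loop iteration.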
	add	x0, x0, x1
	ldp	q0, q1, [x0]
	ldp	q2, q3, [x0, #32]
	orr.16b	v4, v4, v0		// use orr to keep non-zero bytes
	orr.16b	v5, v5, v1
	orr.16b	v6, v6, v2
	orr.16b	v7, v7, v3

	orr.16b	v4, v4, v5		// reduce four regs into two
	orr.16b	v6, v6, v7
	orr.16b	v4, v4, v6		// reduce two regs into one
	umaxv.16b b0, v4		// max across 16 bytes lands in b0, i.e. lane 0 of v0
	umov	w0, v0.b[0]		// move that byte to a GPR for testing
	tst	w0, w0
	cset	x0, ne			// return 1 if any byte was non-zero, 0 otherwise
	ClearFrameAndReturn

L_sizeIsSmall:
	cbz	x1, L_sizeIsZero	// return zero if length is zero

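	// n is a non-zero multiple of 8 here (the size must be pointer-aligned
	// per the contract above), so this scalar loop consumes exactly n bytes.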
	mov	x3, #0
0:	ldr	x2, [x0],#8
	orr	x3, x3, x2		// use orr to keep non-zero bytes
	subs	x1, x1, #8		// update length
	b.hi	0b

	tst	x3, x3
	cset	x0, ne			// return 1 if non-zero, 0 otherwise
	ClearFrameAndReturn

L_sizeIsZero:
	mov	x0, #0
	ClearFrameAndReturn

#endif // __arm64__