/*
 * Copyright (c) 2012 Apple Computer, Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following function for the arm64 architecture:
 *
 *   int memcmp_zero_ptr_aligned(const void *s, size_t n);
 *
 * The memcmp_zero_ptr_aligned function checks whether the string s of n bytes
 * contains all zeros. Both the address and the size of the string s must be
 * pointer-aligned (multiples of 8 bytes on arm64).
 * It returns 0 if so and 1 otherwise; it also returns 0 when n is 0.
 */
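
/*
 * A minimal C sketch of the intended behavior, for reference only. This scalar
 * version illustrates the contract described above; it is not the vectorized
 * implementation below, and it assumes n is a multiple of 8 as required.
 *
 *     #include <stddef.h>
 *     #include <stdint.h>
 *
 *     int memcmp_zero_ptr_aligned(const void *s, size_t n)
 *     {
 *         const uint64_t *p = s;          // s is 8-byte aligned by contract
 *         uint64_t acc = 0;
 *         for (size_t i = 0; i < n / 8; i++) {
 *             acc |= p[i];                // OR keeps any non-zero bits; no early out
 *         }
 *         return acc != 0;                // 0 if all zero (and if n == 0), 1 otherwise
 *     }
 */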

/* this guard is used by tests */
#ifdef __arm64__

#include "asm.h"

.globl _memcmp_zero_ptr_aligned

/*****************************************************************************
 *  Macros                                                                   *
 *****************************************************************************/

.macro EstablishFrame
    ARM64_STACK_PROLOG
    stp     fp, lr, [sp, #-16]!
    mov     fp, sp
.endm

.macro ClearFrameAndReturn
    ldp     fp, lr, [sp], #16
    ARM64_STACK_EPILOG
.endm

/*****************************************************************************
 *  Constants                                                                *
 *****************************************************************************/

.text
.align 5

/*****************************************************************************
 *  memcmp_zero_ptr_aligned entrypoint                                       *
 *****************************************************************************/

_memcmp_zero_ptr_aligned:

// For the use case in <rdar://problem/59523721>, memory corruption should be rare,
// so the check for all zeros can stay simple: no early out is needed. We just load
// all the bytes and logically OR them together. If the result is still zero, all
// the bytes are zero.
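//
// Roughly, in C terms (a sketch only; the helper name is hypothetical, and the
// code below keeps the four accumulators in the NEON registers v4-v7, 16 bytes
// each, rather than in scalar variables):
//
//     #include <stddef.h>
//     #include <stdint.h>
//
//     /* ORs 64-byte blocks together; returns 1 if any byte was non-zero. */
//     static int any_nonzero_in_blocks(const uint64_t *p, size_t nblocks)
//     {
//         uint64_t a0 = 0, a1 = 0, a2 = 0, a3 = 0;
//         for (size_t i = 0; i < nblocks; i++, p += 8) {
//             a0 |= p[0] | p[1];          // accumulate; never exit early
//             a1 |= p[2] | p[3];
//             a2 |= p[4] | p[5];
//             a3 |= p[6] | p[7];
//         }
//         return ((a0 | a1) | (a2 | a3)) != 0;
//     }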

    EstablishFrame
    cmp     x1, #64
    b.lo    L_sizeIsSmall

// Load the first 64 bytes, and compute the number of bytes to the
// first 64-byte aligned location. Even though we are going to test
// 64 bytes, only those preceding that 64-byte location "count" towards
// reducing the length of the buffer or advancing the pointers.
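//
// In C terms, this head handling amounts to (a sketch of the pointer arithmetic
// only; the variable names are illustrative):
//
//     uintptr_t addr    = (uintptr_t)s;
//     uintptr_t aligned = (addr + 64) & ~(uintptr_t)63;  // next 64-byte boundary
//     size_t    head    = aligned - addr;                // 1..64 bytes, all covered by the 64-byte load
//     n -= head;                                         // only the head "counts"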
    mov     x2, x0              // copy the original addr
    add     x0, x0, #64
    and     x0, x0, #-64        // aligned addr
    ldp     q4, q5, [x2]
    ldp     q6, q7, [x2, #32]
    sub     x2, x0, x2          // bytes between original and aligned addr
    sub     x1, x1, x2          // update length
    subs    x1, x1, #64         // check length > 64
    b.ls    L_cleanup

L_loop:
    ldp     q0, q1, [x0]
    ldp     q2, q3, [x0, #32]
    orr.16b v4, v4, v0          // use orr to keep non-zero bytes
    orr.16b v5, v5, v1
    orr.16b v6, v6, v2
    orr.16b v7, v7, v3
    add     x0, x0, #64         // advance pointer
    subs    x1, x1, #64         // check length > 64
    b.hi    L_loop

L_cleanup:
// Between 0 and 64 more bytes remain to be tested; the exact count is x1 + 64.
// Instead of a chain of smaller conditional checks, we simply test 64 unaligned
// bytes starting at x0+x1, i.e. the last 64 bytes of the buffer. This load may
// overlap bytes that have already been tested, but re-ORing them is harmless.
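//
// Continuing the scalar C sketch from the top of the file, this tail step is
// roughly (n >= 64 is guaranteed on this path):
//
//     const uint64_t *tail = (const uint64_t *)((const char *)s + n - 64);
//     for (int i = 0; i < 8; i++)
//         acc |= tail[i];      // may re-OR already-tested bytes; harmless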
    add     x0, x0, x1
    ldp     q0, q1, [x0]
    ldp     q2, q3, [x0, #32]
    orr.16b v4, v4, v0          // use orr to keep non-zero bytes
    orr.16b v5, v5, v1
    orr.16b v6, v6, v2
    orr.16b v7, v7, v3

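// Fold the four accumulators into one register, then use umaxv to take the
// unsigned maximum across the 16 byte lanes of v4; the single result byte in b0
// (lane 0 of v0) is non-zero if and only if some tested byte was non-zero.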
    orr.16b v4, v4, v5          // reduce four regs into two
    orr.16b v6, v6, v7
    orr.16b v4, v4, v6          // reduce two regs into one
    umaxv.16b b0, v4            // reduce 16 bytes into one
    umov    w0, v0.b[0]         // move byte to GPR for testing
    tst     w0, w0
    cset    x0, ne              // return 1 if non-zero, 0 otherwise
    ClearFrameAndReturn

L_sizeIsSmall:
    cbz     x1, L_sizeIsZero    // return zero if length is zero

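// Here 0 < n < 64 and, per the alignment contract, n is a multiple of 8, so OR
// the one to seven doublewords together one at a time.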
    mov     x3, #0
0:  ldr     x2, [x0], #8
    orr     x3, x3, x2          // use orr to keep non-zero bytes
    subs    x1, x1, #8          // update length
    b.hi    0b

    tst     x3, x3
    cset    x0, ne              // return 1 if non-zero, 0 otherwise
    ClearFrameAndReturn

L_sizeIsZero:
    mov     x0, #0
    ClearFrameAndReturn

#endif // __arm64__