/*
 * Copyright (c) 2012 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 *
 * This file implements the following functions for the arm64 architecture:
 *
 *     void bcopy(const void * source,
 *                void * destination,
 *                size_t length);
 *
 *     void *memmove(void * destination,
 *                   const void * source,
 *                   size_t n);
 *
 *     void *memcpy(void * restrict destination,
 *                  const void * restrict source,
 *                  size_t n);
 *
 * All three copy n successive bytes from source to destination. Memmove and
 * memcpy return destination, whereas bcopy has no return value. Copying takes
 * place as if it were through a temporary buffer -- after the call returns,
 * destination contains exactly the bytes from source, even if the buffers
 * overlap (this is not required of memcpy by the C standard; its behavior is
 * undefined if the buffers overlap, but we are holding ourselves to the
 * historical behavior of this function on macOS).
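 *
 * As a rough C model of that guarantee (illustrative only -- the explicit
 * temporary buffer and the helper name are ours, not how the code below
 * works):
 *
 *     #include <stddef.h>
 *     #include <stdlib.h>
 *
 *     static void *memmove_ref(void *dst, const void *src, size_t n) {
 *         unsigned char *d = dst, *tmp = malloc(n);      // allocation check elided
 *         const unsigned char *s = src;
 *         for (size_t i = 0; i < n; i++) tmp[i] = s[i];  // read all of source...
 *         for (size_t i = 0; i < n; i++) d[i] = tmp[i];  // ...then write destination
 *         free(tmp);
 *         return dst;
 *     }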
 */

#include "asm.h"

.globl _bcopy
.globl _ovbcopy
.globl _memcpy
.globl _memmove

/*****************************************************************************
 *                                  Macros                                   *
 *****************************************************************************/

#define kSmallCopy 64

/*****************************************************************************
 *                                Entrypoints                                *
 *****************************************************************************/

.text
.align 5
_bcopy:
_ovbcopy:
// Translate bcopy into memcpy by swapping the first and second arguments.
        mov x3, x0
        mov x0, x1
        mov x1, x3
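
// In C terms, the swap amounts to (illustrative only):
//
//     void bcopy(const void *src, void *dst, size_t n) {
//         memmove(dst, src, n);  // same copy, arguments reordered
//     }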

.align 4
_memcpy:
_memmove:
// Our preference is to copy the data in ascending address order, but if the
// buffers overlap such that the beginning of the destination buffer aliases
// the end of the source buffer, we need to copy in descending address order
// instead to preserve the memmove semantics. We detect this case with the
// test:
//
//     destination - source < length    (unsigned compare)
//
// If the address of the source buffer is higher than the address of the
// destination buffer, this arithmetic can overflow, but the overflowed value
// can only be smaller than length if the buffers do not overlap, so we don't
// need to worry about false positives due to the overflow (they happen, but
// only in cases where copying in either order is correct).
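//
// A minimal C model of this dispatch test (illustrative; the helper name is
// ours):
//
//     #include <stddef.h>
//     #include <stdint.h>
//
//     // Nonzero when dst lies within [src, src + n), in which case a forward
//     // copy would clobber source bytes before reading them.
//     static int must_copy_backwards(const void *dst, const void *src, size_t n) {
//         return (uintptr_t)dst - (uintptr_t)src < n;  // unsigned; wraps if src > dst
//     }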
        ARM64_STACK_PROLOG
        PUSH_FRAME
        sub x3, x0, x1
        cmp x3, x2
        b.cc L_reverse
        mov x3, x0               // copy destination pointer
        cmp x2, #(kSmallCopy)
        b.cc L_forwardSmallCopy

/*****************************************************************************
 *                             Forward large copy                            *
 *****************************************************************************/

// Load the first 32 bytes from src, and compute the number of bytes to the
// first 32-byte aligned location in dst. Even though we are going to copy
// 32 bytes, only those preceding that 32-byte location "count" towards
// reducing the length of the buffer or advancing the pointers. We will need
// to issue the first load from the advanced src pointer BEFORE the store to
// the unmodified dst pointer.
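//
// In C terms, the alignment bookkeeping is (illustrative; names are ours):
//
//     uintptr_t d       = (uintptr_t)dst;
//     uintptr_t aligned = (d + 32) & ~(uintptr_t)31;  // first 32-byte boundary past dst
//     size_t    head    = aligned - d;                // 1-32 bytes covered by the
//                                                     // initial unaligned stores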
        add x3, x3, #32
        and x3, x3, #-32         // aligned dst
        ldp x12,x13,[x1]
        ldp x14,x15,[x1, #16]
        sub x5, x3, x0           // bytes between original dst and aligned dst
        add x1, x1, x5           // update src pointer

// At this point, data in the following registers is in flight:
//
//     x0       original dst pointer
//     x1       corresponding location in src buffer.
//     x2       length from aligned location in dst to end of buffer. This is
//              guaranteed to be >= (64 - 32).
//     x3       aligned location in dst buffer.
//     x12:x15  first 32 bytes of src buffer.
//
// We now load 32 bytes from x1, and store 32 bytes from x12:x15 to x3. The
// store *may* overlap the first 32 bytes of the load, so in order to get
// correct memmove semantics, the first 32-byte load must occur before the
// store.
//
// After loading these 32 bytes, we advance x1, and decrement the length by
// 64. If the remaining length of the buffer was less than 64, then we jump
// directly to the cleanup path.
        ldp x8, x9, [x1]
        ldp x10,x11,[x1, #16]
        add x1, x1, #32
        sub x2, x2, x5           // update length
        stp x12,x13,[x0]         // initial unaligned store
        stp x14,x15,[x0, #16]    // initial unaligned store
        subs x2, x2, #64
        b.ls L_forwardCleanup

L_forwardCopyLoop:
// Main copy loop:
//
//     1. store the 32 bytes loaded in the previous loop iteration
//     2. advance the destination pointer
//     3. load the next 32 bytes
//     4. advance the source pointer
//     5. subtract 32 from the length
//
// The loop is terminated when 32 or fewer bytes remain to be loaded. Those
// trailing 1-32 bytes will be copied in the loop cleanup; a C sketch of the
// loop follows.
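//
// Schematically, in C (illustrative; load32/store32 stand in for the ldnp/stnp
// pairs below and are not real helpers):
//
//     do {
//         store32(dst, inflight); dst += 32;  // store previous iteration's data
//         inflight = load32(src); src += 32;  // load data for the next iteration
//         remaining -= 32;
//     } while (remaining > 0);                // subs/b.hi: stop once <= 32 bytes remain to load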
        stnp x8, x9, [x3]
        stnp x10,x11,[x3, #16]
        add x3, x3, #32
        ldnp x8, x9, [x1]
        ldnp x10,x11,[x1, #16]
        add x1, x1, #32
        subs x2, x2, #32
        b.hi L_forwardCopyLoop

L_forwardCleanup:
// There are 32 bytes in x8-x11 that were loaded in the previous loop
// iteration, which need to be stored to [x3,x3+32). In addition, between
// 0 and 32 more bytes need to be copied from x1 to x3 + 32. The exact
// number of bytes to copy is x2 + 32. Instead of using smaller conditional
// copies, we simply copy 32 unaligned bytes from x1+x2 to 32+x3+x2; that
// store ends exactly at the end of the buffer, 64+x3+x2.
// This copy may overlap with the first store, so the loads must come before
// the store of the data from the previous loop iteration.
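//
// In C terms (illustrative; copy32 stands in for the ldp/stp pairs, and
// src_end/dst_end point one past the last byte of each buffer):
//
//     copy32(dst_end - 32, src_end - 32);  // unconditionally recopy the final
//                                          // 32 bytes, overlap permitted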
        add x1, x1, x2
        ldp x12,x13,[x1]
        ldp x14,x15,[x1, #16]
        stp x8, x9, [x3]
        stp x10,x11,[x3, #16]
        add x3, x3, x2
        stp x12,x13,[x3, #32]
        stp x14,x15,[x3, #48]
        POP_FRAME
        ARM64_STACK_EPILOG

/*****************************************************************************
 *                             Forward small copy                            *
 *****************************************************************************/

// Copy one quadword at a time until fewer than 8 bytes remain to be copied,
// then copy the remainder one byte at a time. At the point of entry to
// L_forwardSmallCopy, the "calling convention" is as follows:
//
//     x0      pointer to first byte of destination
//     x1      pointer to first byte of source
//     x2      length of buffers
//     x3      pointer to first byte of destination
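//
// Roughly, in C, with d, s, and n standing for x3, x1, and x2 (illustrative;
// strict C would copy the words with memcpy, but arm64 tolerates the
// unaligned accesses):
//
//     while (n >= 8) { *(uint64_t *)d = *(const uint64_t *)s; d += 8; s += 8; n -= 8; }
//     while (n--)    { *d++ = *s++; }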
0:      ldr x6, [x1],#8
        str x6, [x3],#8
L_forwardSmallCopy:
        subs x2, x2, #8
        b.cs 0b
        adds x2, x2, #8
        b.eq 2f
1:      ldrb w6, [x1],#1
        strb w6, [x3],#1
        subs x2, x2, #1
        b.ne 1b
2:      POP_FRAME
        ARM64_STACK_EPILOG

/*****************************************************************************
 *                            Reverse copy engines                           *
 *****************************************************************************/

// The reverse copy engines are identical in every way to the forward copy
// engines, except that they do everything backwards. For this reason, they
// are somewhat more sparsely commented than the forward copy loops. I have
// tried to comment only the things that might be surprising in how they
// differ from the forward implementation.
//
// The one important thing to note is that (almost without fail) the source
// pointer x1 and the destination cursors (x4, and the aligned x3 in the large
// copy) point ONE BYTE BEYOND the "right-hand edge" of the active buffer
// throughout these copy loops. x1 and x4 are initially advanced to that
// position in the L_reverse jump island. Because of this, whereas the forward
// copy loops generally follow a "copy data, then advance pointers" scheme, in
// the reverse copy loops, we advance the pointers, then copy the data.
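//
// Schematically, in C, with d and s as byte pointers (illustrative):
//
//     d = dst + n; s = src + n;  // one byte past the right-hand edge
//     while (n--) *--d = *--s;   // move the pointers first, then copy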

L_reverse:
// As a minor optimization, we early out if dst == src (x3 still holds
// dst - src from the dispatch test above, so it is zero exactly when the
// pointers are equal).
        cbz x3, L_return
// Advance both pointers to the ends of their respective buffers before
// jumping into the appropriate reverse copy loop.
        add x4, x0, x2
        add x1, x1, x2
        cmp x2, #(kSmallCopy)
        b.cc L_reverseSmallCopy

/*****************************************************************************
 *                             Reverse large copy                            *
 *****************************************************************************/

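// As in the forward engine, the stores are aligned; in C terms (illustrative;
// names are ours, and d_end points one past the last destination byte):
//
//     uintptr_t aligned = (d_end - 1) & ~(uintptr_t)31;  // last 32-byte boundary below d_end
//     size_t    head    = d_end - aligned;               // 1-32 bytes covered by the
//                                                        // initial unaligned stores
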
        ldp x12,x13,[x1, #-16]
        ldp x14,x15,[x1, #-32]
        sub x3, x4, #1           // In the forward copy, we used dst+32 & -32
        and x3, x3, #-32         // to find an aligned location in the dest
        sub x5, x4, x3           // buffer. Here we use dst-1 & -32 instead,
        sub x1, x1, x5           // because we are going backwards.
        sub x2, x2, x5
        ldp x8, x9, [x1, #-16]
        ldp x10,x11,[x1, #-32]
        stp x12,x13,[x4, #-16]
        stp x14,x15,[x4, #-32]
        sub x1, x1, #32
        subs x2, x2, #64
        b.ls L_reverseCleanup

L_reverseCopyLoop:
        stnp x8, x9, [x3, #-16]
        stnp x10,x11,[x3, #-32]
        sub x3, x3, #32
        ldnp x8, x9, [x1, #-16]
        ldnp x10,x11,[x1, #-32]
        sub x1, x1, #32
        subs x2, x2, #32
        b.hi L_reverseCopyLoop

L_reverseCleanup:
        sub x1, x1, x2
        ldp x12,x13,[x1, #-16]
        ldp x14,x15,[x1, #-32]
        stp x8, x9, [x3, #-16]
        stp x10,x11,[x3, #-32]
        stp x12,x13,[x0, #16]    // In the forward copy, we need to compute the
        stp x14,x15,[x0]         // address of these stores, but here we already
        POP_FRAME                // have a pointer to the start of the buffer.
        ARM64_STACK_EPILOG

/*****************************************************************************
 *                             Reverse small copy                            *
 *****************************************************************************/

0:      ldr x6, [x1,#-8]!
        str x6, [x4,#-8]!
L_reverseSmallCopy:
        subs x2, x2, #8
        b.cs 0b
        adds x2, x2, #8
        b.eq 2f
1:      ldrb w6, [x1,#-1]!
        strb w6, [x4,#-1]!
        subs x2, x2, #1
        b.ne 1b
2:      POP_FRAME
        ARM64_STACK_EPILOG

L_return:
        POP_FRAME
        ARM64_STACK_EPILOG