lz4_encode_arm64.s source code [xnu/osfmk/arm64/lz4_encode_arm64.s]

1	/*
2	* Copyright (c) 2016-2016 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	#include <vm/lz4_assembly_select.h>
30	#include <vm/lz4_constants.h>
31	#include <arm64/asm.h>
32
33	#if LZ4_ENABLE_ASSEMBLY_ENCODE_ARM64
34
//-----------------------------------------------------------------------------
// void lz4_encode_2gb(uint8_t ** dst_ptr,
//                     size_t dst_size,
//                     const uint8_t ** src_ptr,
//                     const uint8_t * src_begin,
//                     size_t src_size,
//                     lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
//                     int skip_final_literals)
//
// Syntax: AArch64, clang integrated assembler, run through the C preprocessor.
//
// In:
//   x0 = dst_ptr              in/out: *dst_ptr is read on entry and updated on exit
//   x1 = dst_size             bytes available at *dst_ptr
//   x2 = src_ptr              in/out: *src_ptr is read on entry and updated on exit
//   x3 = src_begin            base used to turn a source address into a position
//   x4 = src_size             bytes available at *src_ptr
//   x5 = hash_table           8-byte entries: {uint32 position, uint32 first 4 bytes}
//   w6 = skip_final_literals  non-zero: do not emit the trailing literal run
//
// Out:
//   *dst_ptr = one past the last committed output byte
//   *src_ptr = one past the last source byte that has been encoded
//   hash_table entries are overwritten as positions are visited
//
// Register roles for the whole routine:
//   w7  = LZ4_COMPRESS_HASH_MULTIPLY
//   w28 = 0x80808081, (x * 0x80808081) >> 39 == x / 255 for 32-bit x
//   v1  = sixteen 0xff bytes, used to emit runs of 255 length tokens
//   x9  = dst cursor (last committed position)
//   x10 = dst_end - LZ4_GOFAST_SAFETY_MARGIN (restored to dst_end for the tail)
//   x11 = src cursor (end of the last emitted match)
//   x12 = src_end - LZ4_GOFAST_SAFETY_MARGIN (restored to src_end for the tail)
//   x13 = match_begin, x14 = match_begin - src_begin (match position)
//
// Clobbers: x7, x9-x15, v0, v1, flags. x19-x28 are used and are saved/restored.
//-----------------------------------------------------------------------------

.globl _lz4_encode_2gb

#define dst_ptr             x0
#define dst_size            x1
#define src_ptr             x2
#define src_begin           x3
#define src_size            x4
#define hash_table          x5
#define skip_final_literals x6

.text
.p2align 4
_lz4_encode_2gb:

  // establish frame
  ARM64_STACK_PROLOG
  stp       fp, lr,   [sp, #-16]!
  mov       fp, sp

  stp       x19, x20, [sp, #-16]!
  stp       x21, x22, [sp, #-16]!
  stp       x23, x24, [sp, #-16]!
  stp       x25, x26, [sp, #-16]!
  stp       x27, x28, [sp, #-16]!

  // constant registers
  adr       x7, L_constant
  ldr       w28, [x7, #4]                             // w28 = 0x80808081 (magic number to compute 1/255)
  ldr       w7, [x7]                                  // w7  = LZ4_COMPRESS_HASH_MULTIPLY
  mov       x27, #-1                                  // x27 = 0xffffffffffffffff
  dup.4s    v1, w27                                   // q1  = {0xffffffff, 0xffffffff, 0xffffffff, 0xffffffff}

  // Load both cursors before any early exit: L_done unconditionally writes
  // x9 and x11 back through dst_ptr / src_ptr, so both must hold valid values.
  ldr       x9, [x0]                                  // x9  = dst
  ldr       x11, [x2]                                 // x11 = src

  // x10 - dst_end - safety_margin
  add       x10, x9, x1                               // dst_end
  sub       x10, x10, #LZ4_GOFAST_SAFETY_MARGIN       // dst_end - safety_margin
  cmp       x10, x9                                   // if dst_size < safety_margin abort
  b.lt      L_done

  // x12 - src_end - safety_margin
  add       x12, x11, x4                              // src_end
  sub       x12, x12, #LZ4_GOFAST_SAFETY_MARGIN       // src_end - safety_margin
  cmp       x12, x11                                  // if src_size < safety_margin skip to trailing_literals
  b.lt      L_trailing_literals


  // this block searches for the next available match
  // set match_begin to current src (which is also where the last match ended)
L_search_next_available_match:
  mov       x13, x11                                  // match_begin = src
  sub       x14, x13, x3                              // match_position = match_begin - src_begin

  // compute hash values for the next 5 "quads" (4-byte windows at byte
  // offsets 0..4 of a single 8-byte load); a usable hit needs a distance D
  // with 0 < D < 0x10000

L_hash_match:
  ldr       x15, [x13]                                // 8 source bytes; w15 = first 4 bytes at match_begin
  umull     x20, w7, w15                              // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
  lsr       w20, w20, #LZ4_COMPRESS_HASH_SHIFT        // use LZ4_COMPRESS_HASH_BITS MSbits as index
  add       x20, x5, x20, lsl #3                      // hash_table_entry ptr (hash + 8*index)

  ldp       w19, w22, [x20]                           // read entry values (w19 - pos, w22 - 4 bytes at pos)
  stp       w14, w15, [x20]                           // write entry values (w14 - current pos, w15 - current 4 bytes)

  add       x26, x14, #1                              // next_match pos
  lsr       x25, x15, #8                              // next_match_first_4_bytes
  umull     x21, w7, w25                              // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
  lsr       w21, w21, #LZ4_COMPRESS_HASH_SHIFT        // use LZ4_COMPRESS_HASH_BITS MSbits as index
  add       x21, x5, x21, lsl #3                      // hash_table_entry ptr (hash + 8*index)

  ldp       w23, w24, [x21]                           // read entry values (w23 - pos, w24 - 4 bytes at pos)
  stp       w26, w25, [x21]                           // write entry values (w26 - next pos, w25 - next 4 bytes)

  cmp       w15, w22
  b.ne      L_try_next_match_0                        // compare the 4 bytes to see if there is a match
  sub       w19, w14, w19                             // x19 - match_dist (current_pos - match_pos)
  cmp       w19, #0x10000
  ccmp      w19, #0, #0xf, lo                         // dist >= 0x10000 forces Z=1, otherwise Z = (dist == 0)
  b.eq      L_try_next_match_0                        // verify that 0 < dist < 0x10000
  b         L_found_valid_match

L_try_next_match_0:
  add       x13, x13, #1
  add       x14, x14, #1

  add       x26, x14, #1                              // next_match pos
  lsr       x15, x15, #16                             // next_match_first_4_bytes
  umull     x20, w7, w15                              // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
  lsr       w20, w20, #LZ4_COMPRESS_HASH_SHIFT        // use LZ4_COMPRESS_HASH_BITS MSbits as index
  add       x20, x5, x20, lsl #3                      // hash_table_entry ptr (hash + 8*index)

  ldp       w21, w22, [x20]                           // read entry values (w21 - pos, w22 - 4 bytes at pos)
  stp       w26, w15, [x20]                           // write entry values (w26 - next pos, w15 - next 4 bytes)

  cmp       w25, w24
  b.ne      L_try_next_match_1                        // compare the 4 bytes to see if there is a match
  sub       w19, w14, w23                             // x19 - match_dist (current_pos - match_pos)
  cmp       w19, #0x10000
  ccmp      w19, #0, #0xf, lo
  b.eq      L_try_next_match_1                        // verify that 0 < dist < 0x10000
  b         L_found_valid_match

L_try_next_match_1:
  add       x13, x13, #1
  add       x14, x14, #1

  add       x26, x14, #1                              // next_match pos
  lsr       x25, x15, #8                              // next_match_first_4_bytes
  umull     x20, w7, w25                              // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
  lsr       w20, w20, #LZ4_COMPRESS_HASH_SHIFT        // use LZ4_COMPRESS_HASH_BITS MSbits as index
  add       x20, x5, x20, lsl #3                      // hash_table_entry ptr (hash + 8*index)

  ldp       w23, w24, [x20]                           // read entry values (w23 - pos, w24 - 4 bytes at pos)
  stp       w26, w25, [x20]                           // write entry values (w26 - next pos, w25 - next 4 bytes)

  cmp       w15, w22
  b.ne      L_try_next_match_2                        // compare the 4 bytes to see if there is a match
  sub       w19, w14, w21                             // x19 - match_dist (current_pos - match_pos)
  cmp       w19, #0x10000
  ccmp      w19, #0, #0xf, lo
  b.eq      L_try_next_match_2                        // verify that 0 < dist < 0x10000
  b         L_found_valid_match

L_try_next_match_2:
  add       x13, x13, #1
  add       x14, x14, #1

  add       x26, x14, #1                              // next_match pos
  lsr       x15, x15, #16                             // next_match_first_4_bytes
  umull     x20, w7, w15                              // match_bytes * LZ4_COMPRESS_HASH_MULTIPLY
  lsr       w20, w20, #LZ4_COMPRESS_HASH_SHIFT        // use LZ4_COMPRESS_HASH_BITS MSbits as index
  add       x20, x5, x20, lsl #3                      // hash_table_entry ptr (hash + 8*index)

  ldp       w21, w22, [x20]                           // read entry values (w21 - pos, w22 - 4 bytes at pos)
  stp       w26, w15, [x20]                           // write entry values (w26 - next pos, w15 - next 4 bytes)

  cmp       w25, w24
  b.ne      L_try_next_match_3                        // compare the 4 bytes to see if there is a match
  sub       w19, w14, w23                             // x19 - match_dist (current_pos - match_pos)
  cmp       w19, #0x10000
  ccmp      w19, #0, #0xf, lo
  b.eq      L_try_next_match_3                        // verify that 0 < dist < 0x10000
  b         L_found_valid_match

L_try_next_match_3:
  add       x13, x13, #1
  add       x14, x14, #1

  cmp       w15, w22
  b.ne      L_try_next_matchs                         // compare the 4 bytes to see if there is a match
  sub       w19, w14, w21                             // x19 - match_dist (current_pos - match_pos)
  cmp       w19, #0x10000
  ccmp      w19, #0, #0xf, lo
  b.eq      L_try_next_matchs                         // verify that 0 < dist < 0x10000
  b         L_found_valid_match

  // this block expands the valid match as much as possible
  // first it tries to expand the match forward
  // next it tries to expand the match backward
L_found_valid_match:
  add       x20, x13, #4                              // match_end = match_begin+4 (already confirmed the first 4 bytes)
  sub       x21, x20, x19                             // ref_end = match_end - dist
L_found_valid_match_expand_forward_loop:
  ldr       x22, [x20], #8                            // load match_current_8_bytes (safe to load because of safety margin)
  ldr       x23, [x21], #8                            // load ref_current_8_bytes
  cmp       x22, x23
  b.ne      L_found_valid_match_expand_forward_partial
  cmp       x20, x12                                  // check if match_end reached src_end
  b.lo      L_found_valid_match_expand_forward_loop
  b         L_found_valid_match_expand_backward
L_found_valid_match_expand_forward_partial:
  sub       x20, x20, #8                              // revert match_end by 8 and compute actual match of current 8 bytes
  eor       x22, x22, x23                             // compare the bits using xor
  rbit      x22, x22                                  // reverse the bits so clz counts from the lowest-addressed byte
  clz       x22, x22                                  // every equal leading byte contributes 8 zero bits
  add       x20, x20, x22, lsr #3                     // number of matching bytes is (clz result)>>3
L_found_valid_match_expand_backward:
  sub       x15, x13, x19                             // ref_begin = match_begin - dist
L_found_valid_match_expand_backward_loop:
  cmp       x13, x11                                  // check if match_begin reached src (previous match end)
  ccmp      x15, x3, #0xd, gt                         // check if ref_begin reached src_begin
  b.le      L_found_valid_match_emit_match
  ldrb      w22, [x13, #-1]!                          // load the byte before match_begin
  ldrb      w23, [x15, #-1]!                          // load the byte before ref_begin
  cmp       w22, w23
  b.eq      L_found_valid_match_expand_backward_loop
  add       x13, x13, #1                              // revert x13, last compare didn't match

  // this block writes the match into dst
  // it writes the ML token [extra L tokens] [literals] <2byte dist> [extra M tokens]
  // it updates src & dst positions and progresses to L_search_next_available_match
  //
  // Nothing is committed until L_found_valid_match_finish_writing_match:
  // x9 and x11 keep their values so that any bail-out to L_done reports the
  // last fully encoded position. x14 (dead here, recomputed by the next
  // search) is used as the literal read cursor for that reason.
L_found_valid_match_emit_match:
  sub       x21, x20, x13                             // match_length = match_end - match_begin
  sub       x21, x21, #4                              // match_length - 4 (first 4 bytes are guaranteed)
  sub       x22, x13, x11                             // literals_length = match_begin - src
  sub       x26, x10, x9                              // dst_remaining_space = dst_end - dst
  sub       x26, x26, x22                             // dst_remaining_space -= literals_length (may go negative)
  subs      x26, x26, #3                              // dst_remaining_space -= 2_dist_bytes + L/M_token
  b.lt      L_done                                    // signed test: exit if dst isn't sufficient

  and       x23, x21, #0xf                            // store M 4 LSbits
  add       x23, x23, x22, lsl #4                     // add L 4 LSbits
  add       x15, x9, #1                               // tmp_dst = dst + 1
  mov       x14, x11                                  // tmp_src = src (literal read cursor)
  cmp       x22, #15                                  // if L >= 15 need to write more L tokens
  b.lo      L_found_valid_match_copy_literals
  orr       x23, x23, #0xf0                           // update L/M token to be 0xfM
  sub       x24, x22, #15                             // reduce 15 from number_of_literals
  subs      x26, x26, #1                              // check if there is space for the extra L token
  b.lo      L_done
  cmp       x24, #255                                 // check if need to compute number of 255 tokens
  b.lo      L_found_valid_match_skip_L_255_tokens
  umull     x25, w24, w28                             // x25 - (literals_to_token * 1_DIV_255_magic_number)
  lsr       x25, x25, #39                             // x25 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
  subs      x26, x26, x25                             // check if there is sufficient space for the 255_tokens
  b.lo      L_done
  mov       x13, #255                                 // match_begin is no longer needed past this point
  umsubl    x24, w25, w13, x24                        // x24 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
L_found_valid_match_L_255_tokens_loop:
  str       q1, [x15], #16                            // store 16 255 tokens into tmp_dst. safe to store because dst has safety_margin
  subs      x25, x25, #16                             // check if there are any 255 tokens left after current 16
  b.hi      L_found_valid_match_L_255_tokens_loop
  add       x15, x15, x25                             // revert tmp_dst if written too many 255 tokens.
L_found_valid_match_skip_L_255_tokens:
  strb      w24, [x15], #1                            // write last L token
L_found_valid_match_copy_literals:
  ldr       q0, [x14], #16                            // load current 16 literals. (safe because src_end has safety margin)
  str       q0, [x15], #16                            // store current 16 literals. (safe because dst_end has safety margin)
  subs      x22, x22, #16
  b.gt      L_found_valid_match_copy_literals
  add       x15, x15, x22                             // revert tmp_dst if written too many literals
  strh      w19, [x15], #2                            // store dist bytes
  cmp       x21, #15                                  // if M >= 15 need to write more M tokens
  b.lo      L_found_valid_match_finish_writing_match
  orr       x23, x23, #0xf                            // update L/M token to be 0xLf
  sub       x24, x21, #15                             // reduce 15 from match_length
  subs      x26, x26, #1                              // check if there is space for the extra M token
  b.lo      L_done
  cmp       x24, #255                                 // check if need to compute number of 255 tokens
  b.lo      L_found_valid_match_skip_M_255_tokens
  umull     x25, w24, w28                             // x25 - (match_length * 1_DIV_255_magic_number)
  lsr       x25, x25, #39                             // x25 - number_of_255_tokens = (match_length * 1_DIV_255_magic_number)>>39
  subs      x26, x26, x25                             // check if there is sufficient space for the 255_tokens
  b.lo      L_done
  mov       x13, #255
  umsubl    x24, w25, w13, x24                        // x24 - value_of_remainder_token = match_length - (number_of_255_tokens*255)
L_found_valid_match_M_255_tokens_loop:
  str       q1, [x15], #16                            // store 16 255 tokens into tmp_dst. safe to store because dst has safety_margin
  subs      x25, x25, #16                             // check if there are any 255 tokens left after current 16
  b.hi      L_found_valid_match_M_255_tokens_loop
  add       x15, x15, x25                             // revert tmp_dst if written too many 255 tokens.
L_found_valid_match_skip_M_255_tokens:
  strb      w24, [x15], #1                            // write last M token
L_found_valid_match_finish_writing_match:
  strb      w23, [x9]                                 // store first token of match in dst
  mov       x9, x15                                   // commit: update dst to last position written
  mov       x11, x20                                  // commit: update src to match_end (last byte that was encoded)
  cmp       x11, x12                                  // check if src reached src_end
  ccmp      x9, x10, #9, lt                           // check if dst reached dst_end
  b.ge      L_trailing_literals
  b         L_search_next_available_match
  // !!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!!
  // attempted to hash three quad values from the end of each emitted match
  // this ended up being slower and less compression (???)
  // this block set match_begin and pos for next hash search and
  // computed the hash values for the last 3 bytes of currently emitted match
  // only needed to compute these hashes because other "quads" were hashed
  // when the original data was read.

L_try_next_matchs:
  add       x13, x13, #1                              // move to next match
  add       x14, x14, #1                              // update next match pos
  cmp       x13, x12                                  // check match_begin didn't reach src_end
  b.lo      L_hash_match

L_trailing_literals:
  // unless skip_final_literals is set
  // write the trailing bytes as literals
  // trailing bytes include the whole src (with the safety margin)
  // need to verify whole dst (with the safety margin) has sufficient space

  // skip_final_literals is a C int: only w6 is defined by the calling
  // convention, so test the 32-bit view.
  cbnz      w6, L_done                                // if skip_final_literals is set skip writing them

  add       x12, x12, #LZ4_GOFAST_SAFETY_MARGIN       // add safety_margin
  subs      x13, x12, x11                             // remaining_src
  b.eq      L_done                                    // finish if there are 0 trailing literals

  add       x10, x10, #LZ4_GOFAST_SAFETY_MARGIN       // add safety_margin
  sub       x14, x10, x9                              // remaining dst (dst_end - dst)
  sub       x14, x14, #1                              // 1 byte is needed at least to write literals token
  subs      x14, x14, x13                             // finish if dst can't contain all remaining literals + 1 literals token
  b.le      L_done                                    // (need to verify that it has room for literals tokens)

  cmp       x13, #15
  b.lt      L_trailing_literals_store_less_than_15_literals
  subs      x14, x14, #1                              // 1-extra byte is needed for literals tokens
  b.mi      L_done
  mov       w15, #0xf0
  strb      w15, [x9], #1                             // write literals first token (Important !!! if 255 tokens exist but dst isn't sufficient need to revert dst by 1)
  sub       x15, x13, #15
  cmp       x15, #255
  b.lo      L_trailing_literals_no_255_tokens
  umull     x19, w15, w28                             // x19 - (literals_to_token * 1_DIV_255_magic_number)
  lsr       x19, x19, #39                             // x19 - number_of_255_tokens = (literals_to_token * 1_DIV_255_magic_number)>>39
  subs      x14, x14, x19
  b.mi      L_revert_x9_and_done
  mov       x26, #255
  umsubl    x15, w26, w19, x15                        // x15 - value_of_remainder_token = literals_to_token - (number_of_255_tokens*255)
L_trailing_literals_write_16_255_tokens:
  str       q1, [x9], #16                             // store 16 255 tokens each iteration (this is safe because there is space for 15 or more literals + remainder token)
  subs      x19, x19, #16
  b.gt      L_trailing_literals_write_16_255_tokens
  add       x9, x9, x19                               // fixes dst to actual number of tokens (x19 might not be a multiple of 16)
L_trailing_literals_no_255_tokens:
  strb      w15, [x9], #1                             // store remainder_token
  lsr       x14, x13, #4                              // x14 = number of whole 16-byte groups of literals
  cbz       x14, L_trailing_literals_copy_less_than_16_literals
L_trailing_literals_copy_16_literals:
  ldr       q0, [x11], #16                            // load current_16_literals
  str       q0, [x9], #16                             // *dst16++ = current_16_literals
  subs      x14, x14, #1
  b.gt      L_trailing_literals_copy_16_literals
  cmp       x11, x12
  b.lo      L_trailing_literals_copy_less_than_16_literals
  b         L_done

L_trailing_literals_store_less_than_15_literals:
  lsl       x14, x13, #4                              // literals_only_token is 0xL0 (where L is 4 bits)
  strb      w14, [x9], #1                             // *dst++ = literals_only_token
L_trailing_literals_copy_less_than_16_literals:
  ldrb      w13, [x11], #1                            // load current_literal
  strb      w13, [x9], #1                             // *dst++ = current_literal
  cmp       x11, x12
  b.lo      L_trailing_literals_copy_less_than_16_literals

  // this block updates dst & src pointers and removes the frame
L_done:
  str       x9, [x0]
  str       x11, [x2]

  ldp       x27, x28, [sp], #16
  ldp       x25, x26, [sp], #16
  ldp       x23, x24, [sp], #16
  ldp       x21, x22, [sp], #16
  ldp       x19, x20, [sp], #16

  // clear frame
  ldp       fp, lr,   [sp], #16
  ARM64_STACK_EPILOG

  // undo the 0xf0 literals token written by the trailing-literals path when
  // the following 255 tokens turn out not to fit
L_revert_x9_and_done:
  sub       x9, x9, #1
  b         L_done

.p2align 2
L_constant:
.long LZ4_COMPRESS_HASH_MULTIPLY
.long 0x80808081
406
407	#endif
408
409

Browse the source code of xnu/osfmk/arm64/lz4_encode_arm64.s