1 | /* |
2 | * Copyright (c) 2017 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | #include <vm/lz4_assembly_select.h> |
29 | #include <arm64/asm.h> |
30 | #if LZ4_ENABLE_ASSEMBLY_DECODE_ARM64 |
31 | |
32 | /* |
33 | |
34 | int64_t lz4_decode_asm( |
35 | uint8_t ** dst_ptr, *dst_ptr points to next output byte to write |
36 | uint8_t * dst_begin, points to first valid output byte we can access, dst_begin <= dst |
37 | uint8_t * dst_end, "relaxed" end of output buffer (see below) |
38 | const uint8_t ** src_ptr, *src_ptr points to next input byte to read |
39 | const uint8_t * src_end) "relaxed" end of input buffer (see below) |
40 | |
41 | We test the position of the pointers only to ensure we don't access past src_end/dst_end + some fixed constant. |
42 | We never read before dst_begin. |
43 | |
44 | Return 0 on success, -1 on failure |
45 | On output, (*src_ptr,*dst_ptr) receives the last position in both buffers corresponding to the beginning of a LZ4 instruction. |
46 | |
47 | */ |
48 | |
49 | .globl _lz4_decode_asm |
50 | |
51 | #define dst x0 // arg0 |
52 | #define dst_begin x1 // arg1 |
53 | #define dst_end x2 // arg2 |
54 | #define src x3 // arg3 |
55 | #define src_end x4 // arg4 |
56 | |
57 | #define w_n_matches w5 // lower 32 bits of n_matches |
58 | #define n_matches x5 |
59 | #define n_literals x6 |
60 | #define copy_src x7 // match/literal copy source |
61 | #define copy_dst x8 // match/literal copy destination |
62 | |
63 | #define w_aux1 w9 // lower 32 bits of aux1 |
64 | #define aux1 x9 |
65 | #define aux2 x10 |
66 | |
67 | #define w_match_distance w11 // lower 32 bits of match_distance |
68 | #define match_distance x11 |
69 | |
70 | #define match_permtable x12 |
71 | #define match_disttable x13 |
72 | |
73 | #define dst_good x19 |
74 | #define src_good x20 |
75 | |
76 | .macro establish_frame |
77 | ARM64_STACK_PROLOG |
78 | stp fp, lr, [sp, #-16]! |
79 | mov fp, sp |
80 | .endm |
81 | |
82 | .macro clear_frame_and_return |
83 | ldp fp, lr, [sp], #16 |
84 | ARM64_STACK_EPILOG |
85 | .endm |
86 | |
87 | // copy_1x16 SOURCE_ADDR DESTINATION_ADDR |
88 | // Copy 16 bytes, clobber: q0 |
89 | .macro copy_1x16 |
90 | ldr q0,[$0] |
91 | str q0,[$1] |
92 | .endm |
93 | |
94 | // copy_1x16_and_increment SOURCE_ADDR DESTINATION_ADDR |
95 | // Copy 16 bytes, and increment both addresses by 16, clobber: q0 |
96 | .macro copy_1x16_and_increment |
97 | ldr q0,[$0],#16 |
98 | str q0,[$1],#16 |
99 | .endm |
100 | |
101 | // copy_2x16_and_increment SOURCE_ADDR DESTINATION_ADDR |
102 | // Copy 2 times 16 bytes, and increment both addresses by 32, clobber: q0 |
103 | .macro copy_2x16_and_increment |
104 | ldr q0,[$0],#16 |
105 | str q0,[$1],#16 |
106 | ldr q0,[$0],#16 |
107 | str q0,[$1],#16 |
108 | .endm |
109 | |
110 | // copy_1x32_and_increment SOURCE_ADDR DESTINATION_ADDR |
111 | // Copy 32 bytes, and increment both addresses by 32, clobber: q0,q1 |
112 | .macro copy_1x32_and_increment |
113 | ldp q0,q1,[$0],#32 |
114 | stp q0,q1,[$1],#32 |
115 | .endm |
116 | |
117 | // If we don't branch, src < src_end after this |
118 | .macro check_src_end |
119 | cmp src,src_end |
120 | b.hs L_done // extremely unlikely, DONE when src >= src_end |
121 | .endm |
122 | |
123 | // If we don't branch, dst < dst_end after this |
124 | .macro check_dst_end |
125 | cmp dst,dst_end |
126 | b.hs L_done // extremely unlikely, DONE when dst >= dst_end |
127 | .endm |
128 | |
129 | .text |
130 | .p2align 4 |
131 | _lz4_decode_asm: |
132 | establish_frame |
133 | stp x19,x20,[sp,#-16]! // need to preserve these |
134 | stp src,dst,[sp,#-16]! // save src_ptr,dst_ptr on stack |
135 | ldr src,[src] // src = *src_ptr |
136 | ldr dst,[dst] // dst = *dst_ptr |
137 | adr match_permtable,L_match_permtable |
138 | adr match_disttable,L_match_disttable |
139 | |
140 | L_decode_command: |
141 | // Keep last known good positions in both streams |
142 | mov dst_good,dst |
143 | mov src_good,src |
144 | |
145 | // Check limits |
146 | check_src_end |
147 | check_dst_end |
148 | |
149 | // Decode 1-byte command |
150 | ldrb w_aux1,[src],#1 // read command byte LLLLMMMM |
151 | lsr n_literals,aux1,#4 // 0000LLLL. n_literals is now 0..15 |
152 | and n_matches,aux1,#0xf // 0000MMMM. n_matches is now 0..15 |
153 | add n_matches,n_matches,#4 // n_matches is now 4..19 |
154 | |
155 | // Test number of literals (do not test if n_literals==0, because branch prediction fails on it) |
156 | cmp n_literals,#14 |
157 | b.ls L_copy_short_literal // 96% likely: n_literals in 0..14 |
158 | // continue to decode_long_literal |
159 | |
160 | // the number of literals is encoded on more bytes, we need to decode them |
161 | L_decode_long_literal: |
162 | check_src_end // required here, since we may loop an arbitrarily high number of times |
163 | ldrb w_aux1,[src],#1 |
164 | add n_literals,n_literals,aux1 |
165 | cmp aux1,#255 |
166 | b.eq L_decode_long_literal // extremely unlikely |
167 | // continue to copy_long_literal |
168 | |
169 | // Copy literals, n_literals >= 15 |
170 | L_copy_long_literal: |
171 | mov copy_src,src // literal copy origin |
172 | mov copy_dst,dst // literal copy destination |
173 | add src,src,n_literals |
174 | add dst,dst,n_literals |
175 | check_src_end // required here, since n_literals can be arbitrarily high |
176 | check_dst_end |
177 | |
178 | // fixed + loop |
179 | copy_1x32_and_increment copy_src,copy_dst |
180 | L_copy_long_literal_loop: |
181 | copy_1x32_and_increment copy_src,copy_dst |
182 | cmp dst,copy_dst |
183 | b.hi L_copy_long_literal_loop // first test occurs after 64 bytes have been copied, and is unlikely to loop back |
184 | b L_expand_match |
185 | |
186 | // Copy literals, n_literals <= 14: copy 16 bytes |
187 | L_copy_short_literal: |
188 | copy_1x16 src,dst |
189 | add src,src,n_literals |
190 | add dst,dst,n_literals |
191 | // continue to expand match |
192 | |
193 | L_expand_match: |
194 | |
195 | // Decode match distance |
196 | ldrh w_match_distance,[src],#2 // 16-bit distance |
197 | cbz match_distance,L_fail // distance == 0 is invalid |
198 | sub copy_src,dst,match_distance // copy_src is the match copy source |
199 | cmp copy_src,dst_begin |
200 | b.lo L_fail // copy_src < dst_begin: FAIL |
201 | mov copy_dst,dst // copy_dst is the match copy destination |
202 | add dst,dst,n_matches // dst is updated to be the byte after the match; n_matches <= 19 here |
203 | |
204 | // Do we need to decode a long match? |
205 | cmp n_matches,#19 |
206 | b.eq L_decode_long_match // unlikely, n_matches >= 19 encoded on more bytes |
207 | cmp n_matches,#16 |
208 | b.hi L_long_match // unlikely, n_matches == 17 or 18 |
209 | // continue to short match (most probable case) |
210 | |
211 | // Copy match, n_matches <= 16 |
212 | L_short_match: |
213 | cmp match_distance,#15 |
214 | b.ls L_copy_short_match_small_distance |
215 | |
216 | // Copy match, n_matches <= 16, match_distance >= 16: copy 16 bytes |
217 | copy_1x16 copy_src,copy_dst |
218 | b L_decode_command |
219 | |
220 | // Copy match, n_matches <= 16, match_distance < 16: |
221 | // load shuffle table, and permute to replicate the pattern on 16 bytes |
222 | L_copy_short_match_small_distance: |
223 | ldr q0,[copy_src] |
224 | add aux1,match_permtable,match_distance,lsl #5 // index in table |
225 | ldr q1,[aux1] // load only permutation for the low 16 bytes |
226 | tbl v0.16b,{v0.16b},v1.16b // low 16 bytes of pattern |
227 | str q0,[copy_dst] |
228 | b L_decode_command |
229 | |
230 | // n_matches == 19: the number of matches in encoded on more bytes, we need to decode them |
231 | L_decode_long_match: |
232 | check_src_end // required here, since we may loop an arbitrarily high number of times |
233 | ldrb w_aux1,[src],#1 |
234 | add dst,dst,aux1 |
235 | cmp aux1,#255 |
236 | b.eq L_decode_long_match // very unlikely |
237 | check_dst_end // required here, since dst was incremented by a arbitrarily high value |
238 | // continue to long_match |
239 | |
240 | // n_matches > 16 |
241 | L_long_match: |
242 | cmp match_distance,#31 |
243 | b.hi L_copy_long_match_32 |
244 | cmp match_distance,#15 |
245 | b.hi L_copy_long_match_16 |
246 | |
247 | // Copy match, n_matches >= 16, match_distance < 16: |
248 | // load shuffle table, and permute to replicate the pattern on 32 bytes |
249 | L_copy_long_match_small_distance: |
250 | ldr q1,[copy_src] // 16 pattern bytes |
251 | add aux1,match_permtable,match_distance,lsl #5 // index in table |
252 | ldp q2,q3,[aux1] // load 32-byte permutation |
253 | tbl v0.16b,{v1.16b},v2.16b // low 16 bytes of pattern in q0 |
254 | tbl v1.16b,{v1.16b},v3.16b // high 16 bytes of pattern in q1 |
255 | ldrb w_aux1,[match_disttable,match_distance] // valid pattern length in aux1 |
256 | // fixed |
257 | stp q0,q1,[copy_dst] |
258 | add copy_dst,copy_dst,aux1 |
259 | L_copy_long_match_small_distance_loop: |
260 | // loop |
261 | stp q0,q1,[copy_dst] |
262 | add copy_dst,copy_dst,aux1 |
263 | stp q0,q1,[copy_dst] |
264 | add copy_dst,copy_dst,aux1 |
265 | cmp dst,copy_dst |
266 | b.hi L_copy_long_match_small_distance_loop |
267 | b L_decode_command |
268 | |
269 | // Copy match, n_matches >= 16, match_distance >= 32 |
270 | L_copy_long_match_32: |
271 | // fixed + loop |
272 | copy_1x16_and_increment copy_src,copy_dst |
273 | L_copy_long_match_32_loop: |
274 | copy_1x32_and_increment copy_src,copy_dst |
275 | cmp dst,copy_dst |
276 | b.hi L_copy_long_match_32_loop |
277 | b L_decode_command |
278 | |
279 | // Copy match, n_matches >= 16, match_distance >= 16 |
280 | L_copy_long_match_16: |
281 | // fixed + loop |
282 | copy_1x16_and_increment copy_src,copy_dst |
283 | L_copy_long_match_16_loop: |
284 | copy_2x16_and_increment copy_src,copy_dst |
285 | cmp dst,copy_dst |
286 | b.hi L_copy_long_match_16_loop |
287 | b L_decode_command |
288 | |
289 | L_fail: |
290 | mov aux1,#-1 // FAIL |
291 | b L_exit |
292 | |
293 | L_done: |
294 | mov aux1,#0 // OK |
295 | // continue to L_exit |
296 | |
297 | L_exit: |
298 | ldp src,dst,[sp],#16 // get back src_ptr,dst_ptr from stack |
299 | str src_good,[src] // *src_ptr = src_good |
300 | str dst_good,[dst] // *dst_ptr = dst_good |
301 | mov x0,aux1 // x0 = return value |
302 | ldp x19,x20,[sp],#16 // restore |
303 | clear_frame_and_return |
304 | |
305 | // permutation tables for short distance matches, 32 byte result, for match_distance = 0 to 15 |
306 | // value(d)[i] = i%d for i = 0..31 |
307 | .p2align 6 |
308 | L_match_permtable: |
309 | .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 0 |
310 | .byte 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 // 1 |
311 | .byte 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 // 2 |
312 | .byte 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1 // 3 |
313 | .byte 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3, 0, 1, 2, 3 // 4 |
314 | .byte 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1, 2, 3, 4, 0, 1 // 5 |
315 | .byte 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1, 2, 3, 4, 5, 0, 1 // 6 |
316 | .byte 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3, 4, 5, 6, 0, 1, 2, 3 // 7 |
317 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7, 0, 1, 2, 3, 4, 5, 6, 7 // 8 |
318 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4, 5, 6, 7, 8, 0, 1, 2, 3, 4 // 9 |
319 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 0, 1 // 10 |
320 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9 // 11 |
321 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11, 0, 1, 2, 3, 4, 5, 6, 7 // 12 |
322 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12, 0, 1, 2, 3, 4, 5 // 13 |
323 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13, 0, 1, 2, 3 // 14 |
324 | .byte 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9,10,11,12,13,14, 0, 1 // 15 |
325 | |
326 | // valid repeating pattern size, for each match_distance = 0 to 15 |
327 | // value(d) = 32 - (32%d), is the largest a multiple of d <= 32 |
328 | .p2align 6 |
329 | L_match_disttable: |
330 | .byte 32,32,32,30 // 0 .. 3 |
331 | .byte 16,30,30,28 // 4 .. 7 |
332 | .byte 16,27,30,22 // 8 .. 11 |
333 | .byte 24,26,28,30 // 12 .. 15 |
334 | |
335 | #endif // LZ4_ENABLE_ASSEMBLY_DECODE_ARM64 |
336 | |