1 | /* |
2 | * Copyright (c) 2000-2014 Apple Inc. All rights reserved. |
3 | * |
4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
5 | * |
6 | * This file contains Original Code and/or Modifications of Original Code |
7 | * as defined in and that are subject to the Apple Public Source License |
8 | * Version 2.0 (the 'License'). You may not use this file except in |
9 | * compliance with the License. The rights granted to you under the License |
10 | * may not be used to create, or enable the creation or redistribution of, |
11 | * unlawful or unlicensed copies of an Apple operating system, or to |
12 | * circumvent, violate, or enable the circumvention or violation of, any |
13 | * terms of an Apple operating system software license agreement. |
14 | * |
15 | * Please obtain a copy of the License at |
16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
17 | * |
18 | * The Original Code and all software distributed under the License are |
19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
23 | * Please see the License for the specific language governing rights and |
24 | * limitations under the License. |
25 | * |
26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
27 | */ |
28 | |
29 | /* |
30 | This file contains arm64 hand optimized implementation of WKdm memory page decompressor. |
31 | |
32 | void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word *scratch, __unused__ unsigned int words); |
33 | |
34 | input : |
35 | src_buf : address of input compressed data buffer |
36 | dest_buf : address of output decompressed buffer |
37 | scratch : an 8-k bytes scratch mempro provided by the caller |
38 | words : this argument is not used in the implementation |
39 | (The 4th argument is, in fact, used by the Mostly Zero Value decoder) |
40 | |
41 | output : |
42 | |
43 | the input buffer is decompressed and the dest_buf is written with decompressed data. |
44 | |
45 | Am algorithm description of the WKdm compress and bit stream format can be found in the WKdm Compress arm64 assembly code WKdmCompress.s |
46 | |
47 | The bit stream (*src_buf) consists of |
48 | a. 12 bytes header |
49 | b. 256 bytes for 1024 packed tags |
50 | c. (varying number of) words for new words not matched to dictionary word. |
51 | d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3) |
52 | e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1) |
53 | |
54 | where the header (of 3 words) specifies the ending boundaries (in 32-bit words) of the bit stream of c,d,e, respectively. |
55 | |
56 | The decompressor 1st unpacking the bit stream component b/d/e into temorary buffers. Then it sequentially decodes the decompressed word as follows |
57 | |
58 | for (i=0;i<1024;i++) { |
59 | tag = *next_tag++ |
60 | switch (tag) { |
61 | case 0 : *dest_buf++ = 0; break; |
62 | case 1 : dict_word = dictionary[*dict_index]; dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; break; |
63 | case 2 : x = *new_word++; k = (x>>10)&255; k = hashTable[k]; dictionary[k] = *dest_buf++ = x; break; |
64 | case 3 : *dest_buf++ = dictionary[*dict_index++]; break; |
65 | } |
66 | |
67 | cclee, Nov 9, '12 |
68 | |
69 | Added zero page, single value page, sparse page, early abort optimizations |
70 | rsrini, 09/14/14 |
71 | */ |
72 | |
73 | #define MZV_MAGIC 17185 // magic value used to identify MZV page encoding |
74 | |
75 | #ifndef PAGES_SIZE_IN_KBYTES |
76 | #define PAGES_SIZE_IN_KBYTES 4 |
77 | #endif |
78 | |
79 | #if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16)) |
80 | #error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported" |
81 | #endif |
82 | |
83 | #define scale (PAGES_SIZE_IN_KBYTES/4) |
84 | |
85 | |
86 | .align 4 |
87 | .text |
88 | |
89 | /* |
90 | void WKdm_decompress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes); |
91 | */ |
92 | |
93 | .globl _WKdm_decompress_4k |
94 | _WKdm_decompress_4k: |
95 | |
96 | /* |
97 | -------- symbolizing registers -------- |
98 | the arm64 code was ported from x86_64 so we name some registers that are used as temp variables with x86_64 register names. |
99 | */ |
100 | |
101 | #define src_buf x0 |
102 | #define dest_buf x1 |
103 | #define scratch x2 |
104 | #define n_bytes x3 |
105 | #define dictionary sp |
106 | #define rax x13 |
107 | #define eax w13 |
108 | #define rbx x4 |
109 | #define ebx w4 |
110 | #define rcx x5 |
111 | #define ecx w5 |
112 | #define rdx x6 |
113 | #define edx w6 |
114 | #define tags_counter x7 |
115 | #define next_tag x12 |
116 | #define r8 x8 |
117 | #define r9 x9 |
118 | #define r10 x10 |
119 | #define r11 x11 |
120 | #define r12 x12 |
121 | |
122 | /* |
123 | |
124 | ------ scratch memory for local variables --------- |
125 | |
126 | [sp,#0] : dictionary |
127 | [scratch,#0] : tempTagsArray |
128 | [scratch,#1024] : tempQPosArray |
129 | [scratch,#2048] : tempLowBitsArray |
130 | |
131 | */ |
132 | |
133 | #if KERNEL |
134 | sub rax, sp, #96 |
135 | sub sp, sp, #96 |
136 | st1.4s {v0,v1,v2},[rax],#48 |
137 | st1.4s {v3,v4,v5},[rax],#48 |
138 | #endif |
139 | |
140 | sub sp, sp, #64 |
141 | |
142 | ldr eax, [src_buf] // read the 1st word from the header |
143 | mov ecx, #MZV_MAGIC |
144 | cmp eax, ecx // is the alternate packer used (i.e. is MZV page)? |
145 | b.ne L_default_decompressor // default decompressor was used |
146 | |
147 | // Mostly Zero Page Handling... |
148 | // { |
149 | add src_buf, src_buf, 4 // skip the header |
150 | mov rax, dest_buf |
151 | mov rcx, #(PAGES_SIZE_IN_KBYTES*1024) // number of bytes to zero out |
152 | 1: |
153 | dc zva, rax // zero 64 bytes. since dest_buf is a page, it will be 4096 or 16384 byte aligned |
154 | add rax, rax, #64 |
155 | dc zva, rax |
156 | add rax, rax, #64 |
157 | dc zva, rax |
158 | add rax, rax, #64 |
159 | dc zva, rax |
160 | add rax, rax, #64 |
161 | subs rcx, rcx, #256 |
162 | b.ne 1b |
163 | |
164 | mov r12, #4 // current byte position in src to read from |
165 | mov rdx, #0 |
166 | 2: |
167 | ldr eax, [src_buf], #4 // get the word |
168 | ldrh edx, [src_buf], #2 // get the index |
169 | str eax, [dest_buf, rdx] // store non-0 word in the destination buffer |
170 | add r12, r12, #6 // 6 more bytes processed |
171 | cmp r12, n_bytes // finished processing all the bytes? |
172 | b.ne 2b |
173 | b L_done |
174 | // } |
175 | |
176 | L_default_decompressor: |
177 | |
178 | /* |
179 | ---------------------- set up registers and PRELOAD_DICTIONARY --------------------------------- |
180 | */ |
181 | // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO TO MIRROR THE COMPRESSOR |
182 | adrp rbx, _table_2bits@GOTPAGE |
183 | stp xzr, xzr, [dictionary, #0] |
184 | add r10, src_buf, #(12+256*scale) // TAGS_AREA_END |
185 | stp xzr, xzr, [dictionary, #16] |
186 | add rax, src_buf, #12 // TAGS_AREA_START |
187 | ldr rbx, [rbx, _table_2bits@GOTPAGEOFF] |
188 | stp xzr, xzr, [dictionary, #32] |
189 | mov rcx, scratch // tempTagsArray |
190 | stp xzr, xzr, [dictionary, #48] |
191 | ld1.4s {v0,v1},[rbx] |
192 | |
193 | |
194 | /* |
195 | ------------------------------ unpacking bit stream ---------------------------------- |
196 | */ |
197 | |
198 | // WK_unpack_2bits(TAGS_AREA_START(src_buf), TAGS_AREA_END(src_buf), tempTagsArray); |
199 | /* |
200 | unpacking 16 2-bit tags (from a 32-bit word) into 16 bytes |
201 | for arm64, this can be done by |
202 | 1. read the input 32-bit word into GPR w |
203 | 2. duplicate GPR into 4 elements in a vector register v0 |
204 | 3. ushl.4s vd, v0, vshift where vshift = {0, -2, -4, -6} |
205 | 4. and.4s vd, vd, vmask where vmask = 0x03030303030303030303030303030303 |
206 | */ |
207 | |
208 | L_WK_unpack_2bits: |
209 | ldr q5, [rax], #16 // read 4 32-bit words for 64 2-bit tags |
210 | dup.4s v2, v5[0] // duplicate to 4 elements |
211 | dup.4s v3, v5[1] // duplicate to 4 elements |
212 | dup.4s v4, v5[2] // duplicate to 4 elements |
213 | dup.4s v5, v5[3] // duplicate to 4 elements |
214 | ushl.4s v2, v2, v0 // v1 = {0, -2, -4, -6} |
215 | ushl.4s v3, v3, v0 // v1 = {0, -2, -4, -6} |
216 | ushl.4s v4, v4, v0 // v1 = {0, -2, -4, -6} |
217 | ushl.4s v5, v5, v0 // v1 = {0, -2, -4, -6} |
218 | and.16b v2, v2, v1 // v2 = {3,3,...,3} |
219 | and.16b v3, v3, v1 // v2 = {3,3,...,3} |
220 | and.16b v4, v4, v1 // v2 = {3,3,...,3} |
221 | and.16b v5, v5, v1 // v2 = {3,3,...,3} |
222 | cmp r10, rax // TAGS_AREA_END vs TAGS_AREA_START |
223 | st1.4s {v2,v3,v4,v5}, [rcx], #64 // write 64 tags into tempTagsArray |
224 | b.hi L_WK_unpack_2bits // if not reach TAGS_AREA_END, repeat L_WK_unpack_2bits |
225 | |
226 | |
227 | // WK_unpack_4bits(QPOS_AREA_START(src_buf), QPOS_AREA_END(src_buf), tempQPosArray); |
228 | |
229 | ldp w8, w9, [src_buf] // WKdm header qpos start and end |
230 | adrp rbx, _table_4bits@GOTPAGE |
231 | subs x14, r9, r8 // x14 = (QPOS_AREA_END - QPOS_AREA_START)/4 |
232 | add r8, src_buf, r8, lsl #2 // QPOS_AREA_START |
233 | add r9, src_buf, r9, lsl #2 // QPOS_AREA_END |
234 | |
235 | b.ls 1f // if QPOS_AREA_END <= QPOS_AREA_START, skip L_WK_unpack_4bits |
236 | ldr rbx, [rbx, _table_4bits@GOTPAGEOFF] |
237 | add rcx, scratch, #(1024*scale) // tempQPosArray |
238 | ld1.4s {v0,v1},[rbx] |
239 | subs w14, w14, #1 |
240 | b.ls 2f // do loop of 2 only if w14 >= 5 |
241 | L_WK_unpack_4bits: |
242 | ldr d2, [r8], #8 // read a 32-bit word for 8 4-bit positions |
243 | subs w14, w14, #2 |
244 | zip1.4s v2, v2, v2 |
245 | ushl.4s v2, v2, v0 // v1 = {0, -4, 0, -4} |
246 | and.16b v2, v2, v1 // v2 = {15,15,...,15} |
247 | str q2, [rcx], #16 |
248 | b.hi L_WK_unpack_4bits |
249 | 2: |
250 | adds w14, w14, #1 |
251 | b.le 1f |
252 | |
253 | ldr s3, [r8], #4 // read a 32-bit word for 8 4-bit positions |
254 | dup.2s v2, v3[0] // duplicate to 2 elements |
255 | ushl.2s v2, v2, v0 // v1 = {0, -4} |
256 | and.8b v2, v2, v1 // v2 = {15,15,...,15} |
257 | str d2, [rcx], #8 // write 16 tags into tempTagsArray |
258 | |
259 | 1: |
260 | |
261 | // WK_unpack_3_tenbits(LOW_BITS_AREA_START(src_buf), LOW_BITS_AREA_END(src_buf), tempLowBitsArray); |
262 | |
263 | ldr eax, [src_buf,#8] // LOW_BITS_AREA_END offset |
264 | add r8, src_buf, rax, lsl #2 // LOW_BITS_AREA_END |
265 | add rcx, scratch, #(2048*scale) // tempLowBitsArray |
266 | #if (scale==1) |
267 | add r11, scratch, #(4096*scale-2) // final tenbits for the rare case |
268 | #else |
269 | add r11, scratch, #(4096*scale) // final tenbits for the rare case |
270 | sub r11, r11, #2 |
271 | #endif |
272 | subs r8, r8, r9 // LOW_BITS_AREA_START vs LOW_BITS_AREA_END |
273 | b.ls 1f // if START>=END, skip L_WK_unpack_3_tenbits |
274 | |
275 | adrp rbx, _table_10bits@GOTPAGE |
276 | ldr rbx, [rbx, _table_10bits@GOTPAGEOFF] |
277 | ld1.4s {v0,v1,v2,v3},[rbx] |
278 | |
279 | /* |
280 | a very rare case : 1024 tenbits, 1023 + 1 -> 341 + final 1 that is padded with 2 zeros |
281 | since the scratch memory is 4k (2k for this section), we need to pay attention to the last case |
282 | so we don't overwrite to the scratch memory |
283 | |
284 | we 1st do a single 3_tenbits, followed by 2x_3_tenbits loop, and detect whether the last 3_tenbits |
285 | hits the raee case |
286 | */ |
287 | #if 1 |
288 | subs r8, r8, #4 // pre-decrement by 8 |
289 | ldr s4, [r9], #4 // read 32-bit words for 3 low 10-bits |
290 | zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. |
291 | ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. |
292 | ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. |
293 | and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. |
294 | and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets |
295 | orr.16b v4, v4, v5 // combine data |
296 | str d4, [rcx], #6 // write 3 low 10-bits |
297 | b.eq 1f |
298 | #endif |
299 | |
300 | subs r8, r8, #8 // pre-decrement by 8 |
301 | b.lt L_WK_unpack_3_tenbits |
302 | |
303 | L_WK_unpack_2x_3_tenbits: |
304 | ldr d4, [r9], #8 // read 2 32-bit words for a pair of 3 low 10-bits |
305 | zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. |
306 | ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. |
307 | ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. |
308 | and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. |
309 | and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets |
310 | orr.16b v4, v4, v5 // combine data |
311 | ins v5.d[0], v4.d[1] |
312 | str d4, [rcx], #6 // write 3 low 10-bits |
313 | str d5, [rcx], #6 // write 3 low 10-bits |
314 | |
315 | subs r8, r8, #8 |
316 | b.ge L_WK_unpack_2x_3_tenbits // repeat loop if LOW_BITS_AREA_END > next_word |
317 | |
318 | tst r8, #4 |
319 | b.eq 1f |
320 | |
321 | L_WK_unpack_3_tenbits: |
322 | ldr s4, [r9] // read 32-bit words for 3 low 10-bits |
323 | zip1.4s v4, v4, v4 // bits 0-63 contain first triplet twice, bits 64-127 contain second triplet twice. |
324 | ushl.4s v5, v4, v0 // v0 = {6, 0, 6, 0}, places second element of triplets into bits 16-25 and 80-89. |
325 | ushl.4s v4, v4, v1 // v1 = {0, -20, 0, -20}, places third element of triplets into bits 32-41 and 96-105. |
326 | and.16b v5, v5, v2 // v2 = {0, 1023, 0, 0, 0, 1023, 0, 0}, isolate second element of triplets. |
327 | and.16b v4, v4, v3 // v3 = {1023, 0, 1023, 0, 1023, 0, 1023, 0}, isolate first and third elements of triplets |
328 | orr.16b v4, v4, v5 // combine data |
329 | #if 0 |
330 | str d4, [rcx] // write 3 low 10-bits |
331 | #else |
332 | cmp rcx, r11 |
333 | b.eq 2f |
334 | str d4, [rcx] // write 3 low 10-bits |
335 | b 1f |
336 | 2: |
337 | str h4, [rcx] // write final 1 low 10-bits |
338 | #endif |
339 | 1: |
340 | |
341 | /* |
342 | set up before going to the main decompress loop |
343 | */ |
344 | mov next_tag, scratch // tempTagsArray |
345 | add r8, scratch, #(1024*scale) // next_qpos |
346 | add r11, scratch, #(2048*scale) // tempLowBitsArray |
347 | adrp rbx, _hashLookupTable@GOTPAGE |
348 | mov tags_counter, #(1024*scale) // tag_area_end |
349 | ldr rbx, [rbx, _hashLookupTable@GOTPAGEOFF] |
350 | |
351 | b L_next |
352 | |
353 | .align 4,0x90 |
354 | L_ZERO_TAG: |
355 | /* |
356 | we can only get here if w9 = 0, meaning this is a zero tag |
357 | *dest_buf++ = 0; |
358 | */ |
359 | str w9, [dest_buf], #4 // *dest_buf++ = 0 |
360 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end |
361 | b.ls L_done // if next_tag >= tag_area_end, we're done |
362 | |
363 | /* WKdm decompress main loop */ |
364 | L_next: |
365 | ldrb w9, [next_tag], #1 // new tag |
366 | cbz w9, L_ZERO_TAG |
367 | cmp w9, #2 // partial match tag ? |
368 | b.eq L_MISS_TAG |
369 | b.gt L_EXACT_TAG |
370 | |
371 | L_PARTIAL_TAG: |
372 | /* |
373 | this is a partial match: |
374 | dict_word = dictionary[*dict_index]; |
375 | dictionary[*dict_index++] = *dest_buf++ = dict_word&0xfffffc00 | *LowBits++; |
376 | */ |
377 | |
378 | ldrb edx, [r8], #1 // qpos = *next_qpos++ |
379 | ldrh ecx, [r11], #2 // lower 10-bits from *next_low_bits++ |
380 | ldr eax, [dictionary, rdx, lsl #2] // read dictionary word |
381 | bfm eax, ecx, #0, #9 // pad the lower 10-bits from *next_low_bits |
382 | str eax, [dictionary,rdx,lsl #2] // *dict_location = newly formed word |
383 | str eax, [dest_buf], #4 // *dest_buf++ = newly formed word |
384 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end |
385 | b.gt L_next // repeat loop until next_tag==tag_area_end |
386 | |
387 | L_done: |
388 | |
389 | // release stack memory, restore registers, and return |
390 | add sp, sp, #64 // deallocate for dictionary |
391 | #if KERNEL |
392 | ld1.4s {v0,v1,v2},[sp],#48 |
393 | ld1.4s {v3,v4,v5},[sp],#48 |
394 | #endif |
395 | ret lr |
396 | |
397 | .align 4,0x90 |
398 | L_MISS_TAG: |
399 | /* |
400 | this is a dictionary miss. |
401 | x = *new_word++; |
402 | k = (x>>10)&255; |
403 | k = hashTable[k]; |
404 | dictionary[k] = *dest_buf++ = x; |
405 | */ |
406 | ldr eax, [r10], #4 // w = *next_full_patt++ |
407 | ubfm edx, eax, #10, #17 // 8-bit hash table index |
408 | str eax, [dest_buf], #4 // *dest_buf++ = word |
409 | ldrb edx, [rbx, rdx] // qpos |
410 | str eax, [dictionary,rdx] // dictionary[qpos] = word |
411 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end |
412 | b.gt L_next // repeat the loop |
413 | b L_done // if next_tag >= tag_area_end, we're done |
414 | |
415 | .align 4,0x90 |
416 | L_EXACT_TAG: |
417 | /* |
418 | this is an exact match; |
419 | *dest_buf++ = dictionary[*dict_index++]; |
420 | */ |
421 | ldrb eax, [r8], #1 // qpos = *next_qpos++ |
422 | ldr eax, [dictionary,rax,lsl #2] // w = dictionary[qpos] |
423 | str eax, [dest_buf], #4 // *dest_buf++ = w |
424 | subs tags_counter, tags_counter, #1 // next_tag vs tag_area_end |
425 | b.gt L_next // repeat the loop |
426 | b L_done // if next_tag >= tag_area_end, we're done |
427 | |
428 | |
429 | |