1/*
2 * Copyright (c) 2000-2014 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29/*
30 This file contains arm64 hand optimized implementation of WKdm memory page compressor.
31
32 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
33
34 input :
35 src_buf : address of input page (length = 1024 words)
36 dest_buf : address of output buffer (may not be 16-byte aligned)
37 scratch : a 16-byte aligned 4k bytes scratch memory provided by the caller,
38 bytes_budget : a given byte target in compression
39
40 output :
41
42 if the input buffer can be compressed within the given byte budget, the dest_buf is written with compressed data and the function returns with number of bytes for the compressed data
43 o.w., the function returns -1 to signal that the input data can not be compressed with the given byte budget.
44 During the scan and tag process, each word that can not be compressed will be written to dest_buf, followed by a 12-bytes header + 256-bytes tag area.
45 When the functions returns -1, dest_buf is filled with all those words that can not be compressed and should be considered undefined.
46 The worst-case scenario is that all words can not be compressed. Hence, the minimum size requirement for dest_buf should be 12+256+4096 = 4364 bytes to prevent from memory fault.
47
48 The 4th argument bytes_budget is the target compress budget in bytes.
49 Should the input page can be compressed within the budget, the compressed data is written to *dest_buf, and the function returns the number of compressed bytes.
50 Otherwise, the function returns -1 (to signal to the caller that the page can not be compressed).
51
52 WKdm Compression algorithm is briefly stated as follows:
53
54 There is a dynamically updated dictionary consisting of 16 words. Each dictionary word is initialized to 1 at the point of entry to the function.
55 For a nonzero input word x, its 8-bits (10-bits scaled up) is used to determine a corresponding word from the dictionary, represented by dict_index (4-bits) and dict_word (32-bits).
56 a. k = (x>>10)&255; // 8-bit hash table index
57 b. dict_index = hashTable[k]; // 4-bit dictionary index, hashTable[] is fixed
58 c. dict_word = dictionary[dict_index]; // 32-bit dictionary word, dictionary[] is dynamically updated
59
60 Each input word x is classified/tagged into 4 classes :
61 0 : x = 0
62 1 : (x>>10) == (dict_word>>10), bits 10:31 of the input word match a dictionary word
63 2 : (x>>10) != (dict_word>>10), the above condition (22 higher bits matched) is not met, meaning a dictionary miss
64 3 : (x == dict_word), the exact input word is in the dictionary
65
66 For each class, different numbers of bits are needed for the decompressor to reproduce the original input word.
67 0 : 2-bits tag (32->2 compression)
68 1 : 2-bits tag + 4-bits dict_index + 10-bits lower bits (32->16 compression)
69 2 : 2-bits tag + 32-bits new word (32->34 expansion)
70 3 : 2-bits tag + 4-bits dict_index (32->6 compression)
71
72 It is obvious now that WKdm compress algorithm works well for pages where there are lots of zero words (32->2) and/or there are freqeunt repeats of some word patterns (32->6).
73
74 the output bit stream (*dest_buf) consists of
75 a. 12 bytes header
76 b. 256 bytes for 1024 packed tags
77 c. (varying number of) words for new words not matched to dictionary word.
78 d. (varying number of) 32-bit words for packed 4-bit dict_indices (for class 1 and 3)
79 e. (varying number of) 32-bit words for packed 10-bit low bits (for class 1)
80
81 the header is actually of 3 words that specify the ending offset (in 32-bit words) from the start of the bit stream of c,d,e, respectively.
82 Note that there might be padding bits in d (if the number of dict_indices does not divide by 8), and there are 2/12/22 padding bits for packing 3/2/1 low 10-bits in a 32-bit word.
83
84
85 The WKdm compress algorithm 1st runs a scan and classification pass, tagging and write unpacked data into temporary buffers. It follows by packing those data into the output buffer.
86
87 The temp buffers are
88
89 uint8_t tempTagsArray[1024]; // temporary saving for tags before final packing
90 uint8_t tempQPosArray[1024]; // temporary saving for dict_indices before final packing
91 uint16_t tempLowBitsArray[1024]; // temporary saving for partially matched lower 10 bits before final packing
92
93 Since the new words (that can not matched fully or partially to the dictionary) are stored right after the header and the tags section and need no packing, we directly write them to
94 the destination buffer.
95
96 uint32_t *new_word = dest_buf+3+64; // 3 words for header, 64 words for tags, new words come right after the tags.
97
98 Now since we are given a byte budget for this compressor, we can monitor the byte (or bit) usage on the fly in the scanning and tagging pass.
99
100 byte_count -= 12 + 256; // bit budget minus header and tags
101
102 whenever an input word is classified as class
103
104 2 : byte_count -= 4;
105
106 the compress function can early exit (return -1) should the page can not be compressed with the given byte budget (i.e., byte_count <= 0).
107
108 without showing the bit budget management, the pseudo code is given as follows:
109
110 uint8_t *tags=tempTagsArray;
111 uint8_t *dict=tempQPosArray;
112 uint8_t *partial=tempLowBitsArray;
113
114 for (i=0;i<1024;i++) {
115 x = *src_buf++;
116 if (x == 0) { // zero, 2-bits tag
117 *tags++ = 0;
118 } else {
119
120 // find dict_index and dict_word from x
121 k = (x>>10)&255;
122 dict_index = hashTable[k];
123 dict_word = dictionary[dict_index];
124
125 if (dict_word == x) { // exactly match
126 // 2-bits tag + 4-bits table index
127 *tags++ = 3;
128 *dict++ = dict_index;
129 } else if (((x^dict_word)>>10)==0) { // 22 higher bits matched
130 // 2-bits tag + 4-bits table index + 10-bits lower partial
131 *tags++ = 1;
132 *dict++ = dict_index;
133 *partial++ = x &0x3ff;
134 dictionary[dict_index] = x;
135 } else { // not matched
136 // 2-bits tag + 32-bits new word
137 *tags++ = 2;
138 *new_word++ = x;
139 dictionary[dict_index] = x;
140 }
141 }
142 }
143
144 after this classification/tagging pass is completed, the 3 temp buffers are packed into the output *dest_buf:
145
146 1. 1024 tags are packed into 256 bytes right after the 12-bytes header
147 2. dictionary indices (4-bits each) are packed into are right after the new words section
148 3. 3 low 10-bits are packed into a 32-bit word, this is after the dictionary indices section.
149
150 cclee, 11/9/12
151
152 Added zero page, single value page, sparse page, early abort optimizations
153 rsrini, 09/14/14
154*/
155
156#ifndef PAGES_SIZE_IN_KBYTES
157#define PAGES_SIZE_IN_KBYTES 4
158#endif
159
160#if !((PAGES_SIZE_IN_KBYTES==4) || (PAGES_SIZE_IN_KBYTES==16))
161#error "Only PAGES_SIZE_IN_KBYTES = 4 or 16 is supported"
162#endif
163
164
165 .text
166 .align 4
167
168/*
169 int WKdm_compress (WK_word* src_buf, WK_word* dest_buf, WK_word* scratch, unsigned int bytes_budget);
170*/
171
172.globl _WKdm_compress_4k
173_WKdm_compress_4k:
174
175/*
176 ------------------------- symbolizing register use -----------------------------------
177*/
178 #define src_buf x0
179 #define next_input_word x0
180 #define dest_buf x1
181 #define scratch x2
182 #define byte_count x3
183 #define next_tag x4
184 #define tempTagsArray x2 // scratch
185 #define dictionary x5
186 #define remaining x6
187 #define next_full_patt x7
188 #define dict_location x8
189 #define wdict_location w8
190 #define next_qp x9
191 #define hashTable x10
192 #define tempQPosArray x11
193 #define next_low_bits x12
194
195/*
196 this arm64 assembly code is ported from x86_64 assembly code,
197 therefore need such symbolization to quickly reuse the x86_64 assembly code
198 for these intermediate/temporary register use
199*/
200 #define rax x13
201 #define eax w13
202 #define rcx x14
203 #define ecx w14
204 #define rdx x15
205 #define edx w15
206 #define rdi x0 /* after some point, x0/rdi becomes free other usage */
207
208
209/*
210 ------------------------- scratch memory --------------------------------------
211
212 need 16*4 (dictionary) + 256*4 (tempTagsArray) + 256*4 (tempQPosArray) + 1024*4 (tempLowBitsArray)
213 total 6208 bytes
214 [sp,#0] : dictionary
215 [scratch,#0] : tempTagsArray
216 [scratch,#1024] : tempQPosArray
217 [scratch,#2048] : tempLowBitsArray
218*/
219
220#define scale (PAGES_SIZE_IN_KBYTES/4)
221
222#define SV_RETURN 0 // return value when SV, ZV page is found
223#define MZV_MAGIC 17185 // magic value used to identify MZV page encoding
224#define CHKPT_BYTES 416 // for early aborts: checkpoint after processing this many bytes. Must be in range [4..4096]
225#define CHKPT_WORDS (CHKPT_BYTES/4) // checkpoint bytes in words
226#define CHKPT_TAG_BYTES (CHKPT_BYTES/16) // size of the tags for CHKPT_BYTES of data
227#define CHKPT_SHRUNK_BYTES 426 // for early aborts: max size of compressed stream to allow further processing ..
228 // .. to disable early aborts, set CHKPT_SHRUNK_BYTES to 4096
229#if CHKPT_BYTES > 4096
230 #error CHKPT_BYTES must be <= 4096
231#endif
232#if CHKPT_BYTES < 4
233 #error CHKPT_BYTES must be >= 4
234#endif
235
236#if KERNEL
237 sub sp, sp, #64
238 st1.4s {v0,v1,v2,v3},[sp]
239#endif
240
241 sub sp, sp, #64 // allocate for dictionary
242 mov dictionary, sp // use x5 to point to sp, so we can use sub xd, xn, sp
243
244 sub sp, sp, #64 // allocate space for saving callee-saved registers
245 mov x15, sp
246 stp x20, x21, [x15, #0] // save x20, x21
247 stp x22, x23, [x15, #16] // save x22, x23
248 stp x24, x25, [x15, #32] // save x24, x25
249 stp x26, x27, [x15, #48] // save x26, x27
250
251/*
252 ------- entwined statck space allocation, registers set up, and PRELOAD_DICTIONARY -------------------
253*/
254
255 // NOTE: ALL THE DICTIONARY VALUES MUST BE INITIALIZED TO ZERO
256 // THIS IS NEEDED TO EFFICIENTLY DETECT SINGLE VALUE PAGES
257 mov next_tag, tempTagsArray // &tempTagsArray[0]
258 add next_qp, scratch, #(1024*scale) // next_qp
259 mov remaining, #(CHKPT_WORDS*scale) // remaining input words .. initially set to checkpoint
260 add next_full_patt, dest_buf, #(12+256*scale) // dest_buf + [TAGS_AREA_OFFSET + (num_input_words / 16)]*4
261 sub byte_count, byte_count, #(12+256*scale) // bit_count - header - tags
262 add next_low_bits, scratch, #(2048*scale) // &tempLowBitsArray[0]
263 stp xzr, xzr, [dictionary, #0] // initialize dictionary
264 adrp hashTable, _hashLookupTable@GOTPAGE
265 stp xzr, xzr, [dictionary, #16] // initialize dictionary
266 stp xzr, xzr, [dictionary, #32] // initialize dictionary
267 ldr hashTable, [hashTable, _hashLookupTable@GOTPAGEOFF]
268 stp xzr, xzr, [dictionary, #48] // initialize dictionary
269
270#define EARLYCHECK 0
271#define NORMAL 1
272
273#define mode w20
274#define start_next_full_patt x21
275#define start_next_input_word x22
276#define start_next_low_bits x23
277#define r11 x24
278#define r13 x25
279#define byte_budget x26
280#define start_next_qp tempQPosArray
281
282 add tempQPosArray, scratch, #(1024*scale) // &tempQPosArray[0]
283 mov mode, EARLYCHECK // indicate we are yet to evaluate the early aborts
284 mov start_next_full_patt, next_full_patt // remember the start of next_full_patt
285 mov start_next_input_word, next_input_word // remember the start of next_input_word
286 mov start_next_low_bits, next_low_bits // remember the start of next_low_bit
287 add byte_budget, byte_count, #(12+256*scale) // remember the byte budget
288
289 b L_loop
290
291 .align 4, 0x90
292
293 /* we've just detected a zero input word in edx */
294L_RECORD_ZERO:
295 strb edx, [next_tag], #1 // *next_tag++ = ZERO; edx is used as input word, and if we are here edx = 0
296 subs remaining, remaining, #1 // remaing--;
297 b.le CHECKPOINT // if remaining = 0, break
298
299 /* -------------- scan/tag pass loop ------------------------- */
300L_loop:
301
302 /* load new input word to edx */
303 ldr edx, [next_input_word], #4
304 cbz edx, L_RECORD_ZERO // if (input_word==0) RECORD_ZERO
305
306 /*
307 now the input word edx is nonzero, we next find the corresponding dictionary word (eax) and dict_location
308 */
309 ubfm eax, edx, #10, #17
310 ldrb wdict_location, [hashTable, rax] // HASH_TO_DICT_BYTE_OFFSET(input_word)
311 ldr eax, [dictionary, dict_location] // dict_word = *dict_location;
312
313 /* detect whether we match input to its corresponding dictionary word */
314 eor eax, eax, edx // dict_word vs input_word
315 cbz eax, L_RECORD_EXACT // if identical, RECORD_EXACT
316 lsr eax, eax, #10 // HIGH_BITS(dict_word^input_word)
317 cbz eax, L_RECORD_PARTIAL // if identical, RECORD_PARTIAL
318
319L_RECORD_MISS:
320/*
321 if we are here, the input word can not be derived from the dictionary,
322 we write the input word as a new word,
323 and update the dictionary with this new word
324*/
325 subs byte_count, byte_count, #4 // byte_count -= 4
326 b.le L_budgetExhausted // return -1 to signal this page is not compressable
327 str edx, [next_full_patt], #4 // *next_full_patt++ = input_word;
328 mov eax, #2 // tag for MISS
329 subs remaining, remaining, #1 // remaing--;
330 str edx, [dictionary, dict_location] // *dict_location = input_word
331 strb eax, [next_tag], #1 // *next_tag++ = 2 for miss
332 b.gt L_loop // // if remaining > 0, repeat
333 b CHECKPOINT
334
335L_done_search:
336
337 // SET_QPOS_AREA_START(dest_buf,next_full_patt);
338 /* 1st word in dest_buf header = 4-byte offset (from start) of end of new word section */
339
340 sub rax, next_full_patt, dest_buf // next_full_patt - dest_buf
341 lsr eax, eax, #2 // offset in 4-bytes
342 str eax, [dest_buf] // dest_buf[0] = next_full_patt - dest_buf
343
344 /* -------------------------- packing 1024 tags into 256 bytes ----------------------------------------*/
345 // boundary_tmp = WK_pack_2bits(tempTagsArray, (WK_word *) next_tag, dest_buf + HEADER_SIZE_IN_WORDS);
346
347 add rdi, dest_buf, #12 // dest_buf
348 mov rcx, tempTagsArray // &tempTagsArray[0]
349
350L_pack_2bits:
351 ld1.2s {v0,v1,v2,v3},[rcx],#32
352
353 shl.2d v1,v1,#4
354 shl.2d v3,v3,#4
355
356 orr.8b v0, v0, v1
357 orr.8b v2, v2, v3
358
359 ushr.2d v1, v0, #30
360 ushr.2d v3, v2, #30
361
362 orr.8b v0, v0, v1
363 orr.8b v2, v2, v3
364
365 zip1.2s v0, v0, v2
366 st1.2s {v0},[rdi],#8
367 cmp next_tag, rcx
368 b.hi L_pack_2bits
369
370 /* --------------------------------- packing 4-bits dict indices into dest_buf ---------------------------------- */
371
372 /* 1st, round up number of 4-bits dict_indices to a multiple of 8 and fill in 0 if needed */
373 sub rax, next_qp, tempQPosArray // eax = num_bytes_to_pack = next_qp - (char *) tempQPosArray;
374 add eax, eax, #7 // num_bytes_to_pack+7
375 lsr eax, eax, #3 // num_packed_words = (num_bytes_to_pack + 7) >> 3
376 add rcx, tempQPosArray, rax, lsl #3 // endQPosArray = tempQPosArray + 2*num_source_words
377 lsl rax, rax, #2
378 subs byte_count, byte_count, rax
379 b.lt L_budgetExhausted
380
381 cmp rcx, next_qp // endQPosArray vs next_qp
382 b.ls 2f // if (next_qp >= endQPosArray) skip the following zero paddings
383 sub rax, rcx, next_qp
384 mov edx, #0
385 tst eax, #4
386 b.eq 1f
387 str edx, [next_qp], #4
3881: tst eax, #2
389 b.eq 1f
390 strh edx, [next_qp], #2
3911: tst eax, #1
392 b.eq 2f
393 strb edx, [next_qp], #1
3942:
395 mov rdi, next_full_patt // next_full_patt
396 cmp rcx, tempQPosArray // endQPosArray vs tempQPosArray
397 ldr eax, [dest_buf]
398 b.ls L20 // if (endQPosArray <= tempQPosArray) skip the following
399 mov rdx, tempQPosArray // tempQPosArray
400
401 /* packing 4-bits dict indices into dest_buf */
402L_pack_4bits:
403 ldr rax, [rdx], #8 // src_next[1]:src_next[0]
404 orr rax, rax, rax, lsr #28 // eax = src_next[0] | (src_next[1] << 4)
405 cmp rcx, rdx // source_end vs src_next
406 str eax, [rdi], #4 // *dest_next++ = temp;
407 b.hi L_pack_4bits // while (src_next < source_end) repeat the loop
408
409 // SET_LOW_BITS_AREA_START(dest_buf,boundary_tmp);
410 sub rax, rdi, dest_buf // boundary_tmp - dest_buf
411 lsr eax, eax, #2 // boundary_tmp - dest_buf in words
412L20:
413 str eax, [dest_buf,#4] // dest_buf[1] = boundary_tmp - dest_buf
414
415
416
417 /* --------------------------- packing 3 10-bits low bits into a 32-bit word in dest_buf[] ----------------------------------------- */
418
419 add rcx, scratch, #(2048*scale) // tempLowBitsArray
420 sub rdx, next_low_bits, rcx // next_low_bits - tempLowBitsArray (in bytes)
421 lsr rdx, rdx, #1 // num_tenbits_to_pack (in half-words)
422 subs edx, edx, #3 // pre-decrement num_tenbits_to_pack by 3
423 b.lt 1f // if num_tenbits_to_pack < 3, skip the following loop
4240:
425 subs byte_count, byte_count, #4 // byte_count -= 4
426 b.le L_budgetExhausted // return -1 to signal this page is not compressable
427 subs edx, edx, #3 // num_tenbits_to_pack-=3
428 ldr rax, [rcx], #6
429 bfm rax, rax, #58, #9 // pack 1st toward 2nd
430 bfm rax, rax, #58, #25 // pack 1st/2nd toward 3rd
431 lsr rax, rax, #12
432 str eax, [rdi], #4 // pack w0,w1,w2 into 1 dest_buf word
433 b.ge 0b // if no less than 3 elements, back to loop head
434
4351: adds edx, edx, #3 // post-increment num_tenbits_to_pack by 3
436 b.eq 3f // if num_tenbits_to_pack is a multiple of 3, skip the following
437 subs byte_count, byte_count, #4 // byte_count -= 4
438 b.le L_budgetExhausted // return -1 to signal this page is not compressable
439 ldrh eax,[rcx] // w0
440 subs edx, edx, #1 // num_tenbits_to_pack--
441 b.eq 2f //
442 ldrh edx, [rcx, #2] // w1
443 orr eax, eax, edx, lsl #10 // w0 | (w1<<10)
444
4452: str eax, [rdi], #4 // write the final dest_buf word
446
4473: sub rax, rdi, dest_buf // boundary_tmp - dest_buf
448 lsr eax, eax, #2 // boundary_tmp - dest_buf in terms of words
449 str eax, [dest_buf, #8] // SET_LOW_BITS_AREA_END(dest_buf,boundary_tmp)
450 lsl w0, eax, #2 // boundary_tmp - dest_buf in terms of bytes
451
452L_done:
453
454 // restore registers and return
455 mov x15, sp
456 ldp x20, x21, [x15, #0] // restore x20, x21
457 ldp x22, x23, [x15, #16] // restore x22, x23
458 ldp x24, x25, [x15, #32] // restore x24, x25
459 ldp x26, x27, [x15, #48] // restore x26, x27
460 add sp, sp, #128 // deallocate for dictionary + saved register space
461
462#if KERNEL
463 ld1.4s {v0,v1,v2,v3},[sp],#64
464#endif
465 ret lr
466
467 .align 4
468L_budgetExhausted:
469 mov x0, #-1
470 b L_done
471
472
473 .align 4,0x90
474L_RECORD_EXACT:
475/*
476 we have an exact match of the input word to its corresponding dictionary word
477 write tag/dict_index to the temorary buffers
478*/
479 mov eax, #3
480 lsr w14, wdict_location, #2 // divide by 4 for word offset
481 subs remaining, remaining, #1 // remaing--;
482 strb eax, [next_tag], #1 // *next_tag++ = 3 for exact
483 strb w14, [next_qp], #1 // *next_qp = word offset (4-bit)
484 b.gt L_loop
485 b CHECKPOINT // if remaining = 0, break
486
487 .align 4,0x90
488L_RECORD_PARTIAL:
489/*
490 we have a partial (high 22-bits) match of the input word to its corresponding dictionary word
491 write tag/dict_index/low 10 bits to the temorary buffers
492*/
493 mov ecx, #1
494 strb ecx, [next_tag], #1 // *next_tag++ = 1 for partial matched
495 str edx, [dictionary, dict_location] // *dict_location = input_word;
496 subs remaining, remaining, #1 // remaing--;
497 lsr eax, wdict_location, #2 // offset in 32-bit word
498 and edx, edx, #1023 // lower 10 bits
499 strb eax, [next_qp], #1 // update *next_qp++
500 strh edx, [next_low_bits], #2 // save next_low_bits++
501 b.gt L_loop
502
503CHECKPOINT:
504
505 cbz mode, L_check_compression_ratio // if this this an early abort check..
506
507L_check_zero_page:
508
509 cmp start_next_full_patt, next_full_patt // check if any dictionary misses in page
510 b.ne L_check_single_value_page
511
512 cmp start_next_qp, next_qp // check if any partial or exact dictionary matches
513 b.ne L_check_single_value_page
514
515 mov x0, #SV_RETURN // Magic return value
516 b L_done
517
518L_check_single_value_page:
519
520 sub rax, next_full_patt, start_next_full_patt // get # dictionary misses
521 lsr rax, rax, #2
522
523 sub r11, next_qp, start_next_qp // get # dictionary hits (exact + partial)
524
525 sub r13, next_low_bits, start_next_low_bits // get # dictionary partial hits
526 lsr r13, r13, #1
527
528 // Single value page if one of the follwoing is true:
529 // partial == 0 AND hits == 1023(for 4K page) AND miss == 1 AND tag[0] == 2 (i.e. miss)
530 // partial == 1 AND hits == 1024(for 4K page) AND tag[0] == 1 (i.e. partial)
531 //
532 cbnz r13, 1f // were there 0 partial hits?
533
534 cmp r11, #(256*PAGES_SIZE_IN_KBYTES - 1) // were there 1023 dictionary hits
535 b.ne 1f
536
537 cmp rax, #1 // was there exacly 1 dictionary miss?
538 b.ne 1f
539
540 ldrb edx, [tempTagsArray] // read the very 1st tag
541 cmp edx, #2 // was the very 1st tag a miss?
542 b.eq L_is_single_value_page
543
5441:
545 cmp r13, #1 // was there 1 partial hit?
546 b.ne L_check_mostly_zero
547
548 cmp r11, #(256*PAGES_SIZE_IN_KBYTES) // were there 1024 dictionary hits
549 b.ne L_check_mostly_zero
550
551 ldrb edx, [tempTagsArray] // read the very 1st tag
552 cmp edx, #1 // was the very 1st tag a partial?
553 b.ne L_check_mostly_zero
554
555L_is_single_value_page:
556
557 mov x0, #SV_RETURN // Magic return value
558 b L_done
559
560L_check_mostly_zero:
561 // how much space will the sparse packer take?
562 add rax, rax, r11 // rax += (next_qp - start_next_qp)
563 mov rdx, #6
564 mov rcx, #4
565 madd r11, rax, rdx, rcx // r11 = rax * 6 (i.e. 4 byte word + 2 byte offset) + 4 byte for header
566
567 sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits
568 mov rdx, #1365
569 mul rax, rax, rdx
570
571 sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses
572 add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
573
574 sub rdx, next_qp, start_next_qp
575 add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2
576 add rax, rax, #(12+256*scale) // rax += bytes taken by the header + tags
577
578 cmp rax, r11 // is the default packer the better option?
579 b.lt L_done_search
580
581 cmp r11, byte_budget // can the sparse packer fit into the given budget?
582 b.gt L_budgetExhausted
583
584L_sparse_packer:
585 mov edx, #MZV_MAGIC
586 str edx, [dest_buf], #4 // header to indicate a sparse packer
587
588 mov rdx, #0 // rdx = byte offset in src of non-0 word
5891:
590 ldr rax, [start_next_input_word, rdx] // rax = read dword
591 cbnz rax, 5f // is dword != 0
5923:
593 add rdx, rdx, #8 // 8 more bytes have been processed
5944:
595 cmp rdx, #(4096*scale) // has the entire page been processed
596 b.ne 1b
597 mov x0, r11 // store the size of the compressed stream
598 b L_done
599
6005:
601 cbz eax, 6f // is lower word == 0
602 str eax, [dest_buf], #4 // store the non-0 word in the dest buffer
603 strh edx, [dest_buf], #2 // store the byte index
6046:
605 lsr rax, rax, 32 // get the upper word into position
606 cbz eax, 3b // is dword == 0
607 add rdx, rdx, #4
608 str eax, [dest_buf], #4 // store the non-0 word in the dest buffer
609 strh edx, [dest_buf], #2 // store the byte index
610 add rdx, rdx, #4
611 b 4b
612
613L_check_compression_ratio:
614
615 mov mode, NORMAL
616 mov remaining, #((1024 - CHKPT_WORDS)*scale) // remaining input words to process
617 cbz remaining, CHECKPOINT // if there are no remaining words to process
618
619 sub rax, next_low_bits, start_next_low_bits // get bytes consumed by lower-10 bits
620 mov rdx, #1365
621 mul rax, rax, rdx
622
623 sub rdx, next_full_patt, start_next_full_patt // get bytes consumed by dictionary misses
624 add rax, rdx, rax, lsr #11 // rax = 2/3*(next_low_bits - start_next_low_bits) + (next_full_patt - start_next_full_patt)
625
626 sub rdx, next_qp, start_next_qp
627 add rax, rax, rdx, lsr #1 // rax += (next_qp - start_next_qp)/2
628 subs rax, rax, #((CHKPT_SHRUNK_BYTES - CHKPT_TAG_BYTES)*scale)
629 // rax += CHKPT_TAG_BYTES; rax -= CHKPT_SHRUNK_BYTES
630
631 b.gt L_budgetExhausted // if rax is > 0, we need to early abort
632 b L_loop // we are done
633