1/*
2 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
3 *
4 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5 *
6 * This file contains Original Code and/or Modifications of Original Code
7 * as defined in and that are subject to the Apple Public Source License
8 * Version 2.0 (the 'License'). You may not use this file except in
9 * compliance with the License. The rights granted to you under the License
10 * may not be used to create, or enable the creation or redistribution of,
11 * unlawful or unlicensed copies of an Apple operating system, or to
12 * circumvent, violate, or enable the circumvention or violation of, any
13 * terms of an Apple operating system software license agreement.
14 *
15 * Please obtain a copy of the License at
16 * http://www.opensource.apple.com/apsl/ and read it before using this file.
17 *
18 * The Original Code and all software distributed under the License are
19 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23 * Please see the License for the specific language governing rights and
24 * limitations under the License.
25 *
26 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27 */
28
29// LZ4_RAW buffer API
30// EB May 2015
31// Mar 2016 Imported from the Compression project, with minor optimisations and
32// early abort detection (Derek Kumar)
33
34#include "lz4.h"
35#define memcpy __builtin_memcpy
36
37size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
38 const uint8_t * __restrict src_buffer, size_t src_size,
39 void * __restrict work __attribute__((unused)))
40{
41 const uint8_t * src = src_buffer;
42 uint8_t * dst = dst_buffer;
43
44 // Go fast if we can, keeping away from the end of buffers
45#if LZ4_ENABLE_ASSEMBLY_DECODE
46 if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN)
47 {
48 if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN))
49 return 0; // FAIL
50 }
51#endif
52//DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?
53
54 // Finish safe
55 if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size))
56 return 0; // FAIL
57
58 return (size_t)(dst - dst_buffer); // bytes produced
59}
60// Debug flags
61#if LZ4DEBUG
62#define DEBUG_LZ4_ENCODE_ERRORS (1)
63#define DEBUG_LZ4_DECODE_ERRORS (1)
64#endif
65
66#if DEBUG_LZ4_ENCODE_ERRORS
67#endif
68
69#if !LZ4_ENABLE_ASSEMBLY_ENCODE
70
71#if defined(__x86_64__) || defined(__x86_64h__)
72# define LZ4_MATCH_SEARCH_INIT_SIZE 32
73# define LZ4_MATCH_SEARCH_LOOP_SIZE 32
74#else
75# define LZ4_MATCH_SEARCH_INIT_SIZE 8
76# define LZ4_MATCH_SEARCH_LOOP_SIZE 8
77#endif
78
79// Return hash for 4-byte sequence X
80static inline uint32_t lz4_hash(uint32_t x) { return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS); }
81
82// Store 0xfff..fff at *PTR
83static inline void lz4_fill16(uint8_t * ptr)
84{
85 store8(ptr,-1);
86 store8(ptr+8,-1);
87}
88
89// Return number of matching bytes 0..4 at positions A and B.
90static inline size_t lz4_nmatch4(const uint8_t * a,const uint8_t * b)
91{
92 uint32_t x = load4(a) ^ load4(b);
93 return (x == 0)?4:(__builtin_ctzl(x) >> 3);
94}
95
96// Return number of matching bytes 0..8 at positions A and B.
97static inline size_t lz4_nmatch8(const uint8_t * a,const uint8_t * b)
98{
99 uint64_t x = load8(a) ^ load8(b);
100 return (x == 0)?8:(__builtin_ctzll(x) >> 3);
101}
102
103// Return number of matching bytes 0..16 at positions A and B.
104static inline size_t lz4_nmatch16(const uint8_t * a,const uint8_t * b)
105{
106 size_t n = lz4_nmatch8(a,b);
107 return (n == 8)?(8 + lz4_nmatch8(a+8,b+8)):n;
108}
109
110// Return number of matching bytes 0..32 at positions A and B.
111static inline size_t lz4_nmatch32(const uint8_t * a,const uint8_t * b)
112{
113 size_t n = lz4_nmatch16(a,b);
114 return (n == 16)?(16 + lz4_nmatch16(a+16,b+16)):n;
115}
116
117// Return number of matching bytes 0..64 at positions A and B.
118static inline size_t lz4_nmatch64(const uint8_t * a,const uint8_t * b)
119{
120 size_t n = lz4_nmatch32(a,b);
121 return (n == 32)?(32 + lz4_nmatch32(a+32,b+32)):n;
122}
123
124// Compile-time selection, return number of matching bytes 0..N at positions A and B.
125static inline size_t lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
126{
127 switch (N) {
128 case 4: return lz4_nmatch4(a,b);
129 case 8: return lz4_nmatch8(a,b);
130 case 16: return lz4_nmatch16(a,b);
131 case 32: return lz4_nmatch32(a,b);
132 case 64: return lz4_nmatch64(a,b);
133 }
134 __builtin_trap(); // FAIL
135}
136
137// Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
138// We are allowed to access a constant number of bytes above DST_END.
139// Return incremented DST pointer on success, and 0 on failure
140static inline uint8_t *lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L) {
141 (void)end;
142 while (L >= 17*255) {
143 lz4_fill16(dst);
144 dst += 16;
145 L -= 16*255;
146 }
147 lz4_fill16(dst);
148 //DRKTODO verify these modulos/divisions are optimally handled by clang
149 dst += L/255;
150 *dst++ = L%255;
151 return dst;
152}
153
154static inline uint32_t clamp(uint32_t x, uint32_t max) __attribute__((overloadable)) { return x > max ? max : x; }
155
156static inline uint8_t *copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L) {
157 uint8_t *end = dst + L;
158 { copy16(dst, src); dst += 16; src += 16; }
159 while (dst < end) { copy32(dst, src); dst += 32; src += 32; }
160 return end;
161}
162
163static uint8_t *lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
164 uint8_t * restrict dst,
165 const uint8_t * const end,
166 const uint8_t * restrict src) {
167 // The LZ4 encoding scheme requires that M is at least 4, because
168 // the actual value stored by the encoding is M - 4. Check this
169 // requirement for debug builds.
170 assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
171 // Having checked that M >= 4, translate M by four.
172 M -= 4;
173 // Similarly, we must have D < 2**16, because we use only two bytes
174 // to represent the value of D in the encoding.
175 assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");
176 // Construct the command byte by clamping both L and M to 0 ... 15
177 // and packing them into a single byte, and store it.
178 *dst++ = clamp(L, 15) << 4 | clamp(M, 15);
179 // If L is 15 or greater, we need to encode extra literal length bytes.
180 if (L >= 15) {
181 dst = lz4_store_length(dst, end, L - 15);
182 if (dst == 0 || dst + L >= end) return NULL;
183 }
184 // Copy the literal itself from src to dst.
185 dst = copy_literal(dst, src, L);
186 // Store match distance.
187 store2(dst, D); dst += 2;
188 // If M is 15 or greater, we need to encode extra match length bytes.
189 if (M >= 15) {
190 dst = lz4_store_length(dst, end, M - 15);
191 if (dst == 0) return NULL;
192 }
193 return dst;
194}
195
196/* #ifndef LZ4_EARLY_ABORT */
197/* #define LZ4_EARLY_ABORT (1) */
198/* #endif */
199
200#if LZ4_EARLY_ABORT
201int lz4_do_early_abort = 1;
202int lz4_early_aborts = 0;
203#define LZ4_EARLY_ABORT_EVAL (448)
204#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
205#endif /* LZ4_EARLY_ABORT */
206
207void lz4_encode_2gb(uint8_t ** dst_ptr,
208 size_t dst_size,
209 const uint8_t ** src_ptr,
210 const uint8_t * src_begin,
211 size_t src_size,
212 lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
213 int skip_final_literals)
214{
215 uint8_t *dst = *dst_ptr; // current output stream position
216 uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
217 const uint8_t *src = *src_ptr; // current input stream literal to encode
218 const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
219 const uint8_t *match_begin = 0; // first byte of matched sequence
220 const uint8_t *match_end = 0; // first byte after matched sequence
221#if LZ4_EARLY_ABORT
222 uint8_t * const dst_begin = dst;
223 uint32_t lz4_do_abort_eval = lz4_do_early_abort;
224#endif
225
226 while (dst < end)
227 {
228 ptrdiff_t match_distance = 0;
229 for (match_begin = src; match_begin < src_end; match_begin += 1) {
230 const uint32_t pos = (uint32_t)(match_begin - src_begin);
231 const uint32_t w0 = load4(match_begin);
232 const uint32_t w1 = load4(match_begin + 1);
233 const uint32_t w2 = load4(match_begin + 2);
234 const uint32_t w3 = load4(match_begin + 3);
235 const int i0 = lz4_hash(w0);
236 const int i1 = lz4_hash(w1);
237 const int i2 = lz4_hash(w2);
238 const int i3 = lz4_hash(w3);
239 const uint8_t *c0 = src_begin + hash_table[i0].offset;
240 const uint8_t *c1 = src_begin + hash_table[i1].offset;
241 const uint8_t *c2 = src_begin + hash_table[i2].offset;
242 const uint8_t *c3 = src_begin + hash_table[i3].offset;
243 const uint32_t m0 = hash_table[i0].word;
244 const uint32_t m1 = hash_table[i1].word;
245 const uint32_t m2 = hash_table[i2].word;
246 const uint32_t m3 = hash_table[i3].word;
247 hash_table[i0].offset = pos;
248 hash_table[i0].word = w0;
249 hash_table[i1].offset = pos + 1;
250 hash_table[i1].word = w1;
251
252 hash_table[i2].offset = pos + 2;
253 hash_table[i2].word = w2;
254 hash_table[i3].offset = pos + 3;
255 hash_table[i3].word = w3;
256
257 match_distance = (match_begin - c0);
258 if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
259 match_end = match_begin + 4;
260 goto EXPAND_FORWARD;
261 }
262
263 match_begin++;
264 match_distance = (match_begin - c1);
265 if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
266 match_end = match_begin + 4;
267 goto EXPAND_FORWARD;
268 }
269
270 match_begin++;
271 match_distance = (match_begin - c2);
272 if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
273 match_end = match_begin + 4;
274 goto EXPAND_FORWARD;
275 }
276
277 match_begin++;
278 match_distance = (match_begin - c3);
279 if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
280 match_end = match_begin + 4;
281 goto EXPAND_FORWARD;
282 }
283
284#if LZ4_EARLY_ABORT
285 //DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
286 if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
287 ptrdiff_t dstd = dst - dst_begin;
288
289 if (dstd == 0) {
290 lz4_early_aborts++;
291 return;
292 }
293
294/* if (dstd >= pos) { */
295/* return; */
296/* } */
297/* ptrdiff_t cbytes = pos - dstd; */
298/* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
299/* return; */
300/* } */
301 lz4_do_abort_eval = 0;
302 }
303#endif
304 }
305
306 if (skip_final_literals) { *src_ptr = src; *dst_ptr = dst; return; } // do not emit the final literal sequence
307
308 // Emit a trailing literal that covers the remainder of the source buffer,
309 // if we can do so without exceeding the bounds of the destination buffer.
310 size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
311 if (src_remaining < 15) {
312 *dst++ = (uint8_t)(src_remaining << 4);
313 memcpy(dst, src, 16); dst += src_remaining;
314 } else {
315 *dst++ = 0xf0;
316 dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
317 if (dst == 0 || dst + src_remaining >= end) return;
318 memcpy(dst, src, src_remaining); dst += src_remaining;
319 }
320 *dst_ptr = dst;
321 *src_ptr = src + src_remaining;
322 return;
323
324 EXPAND_FORWARD:
325
326 // Expand match forward
327 {
328 const uint8_t * ref_end = match_end - match_distance;
329 while (match_end < src_end)
330 {
331 size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
332 if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; }
333 match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
334 ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
335 }
336 }
337
338 // Expand match backward
339 {
340 // match_begin_min = max(src_begin + match_distance,literal)
341 const uint8_t * match_begin_min = src_begin + match_distance;
342 match_begin_min = (match_begin_min < src)?src:match_begin_min;
343 const uint8_t * ref_begin = match_begin - match_distance;
344
345 while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1] ) { match_begin -= 1; ref_begin -= 1; }
346 }
347
348 // Emit match
349 dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
350 if (!dst) return;
351
352 // Update state
353 src = match_end;
354
355 // Update return values to include the last fully encoded match
356 *dst_ptr = dst;
357 *src_ptr = src;
358 }
359}
360
361#endif
362
363size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
364 const uint8_t * __restrict src_buffer, size_t src_size,
365 lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
366{
367 // Initialize hash table
368 const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };
369
370 const uint8_t * src = src_buffer;
371 uint8_t * dst = dst_buffer;
372
373 // We need several blocks because our base function is limited to 2GB input
374 const size_t BLOCK_SIZE = 0x7ffff000;
375 while (src_size > 0)
376 {
377 //DRKTODO either implement pattern4 or figure out optimal unroll
378 //DRKTODO: bizarrely, with plain O3 the compiler generates a single
379 //DRKTODO: scalar STP per loop iteration with the stock loop
380 //DRKTODO If hand unrolled, it switches to NEON store pairs
381 // Reset hash table for each block
382/* #if __STDC_HOSTED__ */
383/* memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
384/* #else */
385/* for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
386/* #endif */
387
388 for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;) {
389 hash_table[i++] = HASH_FILL;
390 hash_table[i++] = HASH_FILL;
391 hash_table[i++] = HASH_FILL;
392 hash_table[i++] = HASH_FILL;
393 }
394
395 // Bytes to encode in this block
396 const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
397
398 // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
399 // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
400 uint8_t * dst_start = dst;
401 const uint8_t * src_start = src;
402 lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
403
404 // Check progress
405 size_t dst_used = dst - dst_start;
406 size_t src_used = src - src_start; // src_used <= src_to_encode
407 if (src_to_encode == src_size && src_used < src_to_encode) return 0; // FAIL to encode last block
408
409 // Note that there is a potential problem here in case of non compressible data requiring more blocks.
410 // We may end up here with src_used very small, or even 0, and will not be able to make progress during
411 // compression. We FAIL unless the length of literals remaining at the end is small enough.
412 if (src_to_encode < src_size && src_to_encode - src_used >= (1<<16)) return 0; // FAIL too many literals
413
414 // Update counters (SRC and DST already have been updated)
415 src_size -= src_used;
416 dst_size -= dst_used;
417 }
418
419 return (size_t)(dst - dst_buffer); // bytes produced
420}
421
422typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));
423
424int lz4_decode(uint8_t ** dst_ptr,
425 uint8_t * dst_begin,
426 uint8_t * dst_end,
427 const uint8_t ** src_ptr,
428 const uint8_t * src_end)
429{
430 uint8_t * dst = *dst_ptr;
431 const uint8_t * src = *src_ptr;
432
433 // Require dst_end > dst.
434 if (dst_end <= dst) goto OUT_FULL;
435
436 while (src < src_end)
437 {
438 // Keep last good position
439 *src_ptr = src;
440 *dst_ptr = dst;
441
442 uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
443 uint32_t literalLength = (cmd >> 4) & 15; // 0..15
444 uint32_t matchLength = 4 + (cmd & 15); // 4..19
445
446 // extra bytes for literalLength
447 if (__improbable(literalLength == 15))
448 {
449 uint8_t s;
450 do {
451#if DEBUG_LZ4_DECODE_ERRORS
452 if (__improbable(src >= src_end)) printf("Truncated SRC literal length\n");
453#endif
454 if (__improbable(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed)
455 s = *src++;
456 literalLength += s;
457 } while (__improbable(s == 255));
458 }
459
460 // copy literal
461#if DEBUG_LZ4_DECODE_ERRORS
462 if (__improbable(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n");
463#endif
464 if (__improbable(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
465 if (__improbable(literalLength > (size_t)(dst_end - dst))) {
466 // literal will take us past the end of the destination buffer,
467 // so we can only copy part of it.
468 literalLength = (uint32_t)(dst_end - dst);
469 memcpy(dst, src, literalLength);
470 dst += literalLength;
471 goto OUT_FULL;
472 }
473 memcpy(dst,src,literalLength);
474 src += literalLength;
475 dst += literalLength;
476
477 if (__improbable(src >= src_end)) goto OUT_FULL; // valid end of stream
478#if DEBUG_LZ4_DECODE_ERRORS
479 if (__improbable(2 > (size_t)(src_end - src))) printf("Truncated SRC distance\n");
480#endif
481 if (__improbable(2 > (size_t)(src_end - src))) goto IN_FAIL; // unexpected end of input (2 bytes needed)
482
483 //DRKTODO: this causes an alignment increase warning (legitimate?)
484 //DRKTODO: cast of char * to uint16_t*
485 #pragma clang diagnostic push
486 #pragma clang diagnostic ignored "-Wcast-align"
487
488 // match distance
489 uint64_t matchDistance = *(const uint16_t *)src; // 0x0000 <= matchDistance <= 0xffff
490 #pragma clang diagnostic pop
491 src += 2;
492#if DEBUG_LZ4_DECODE_ERRORS
493 if (matchDistance == 0) printf("Invalid match distance D = 0\n");
494#endif
495 if (__improbable(matchDistance == 0)) goto IN_FAIL; // 0x0000 invalid
496 uint8_t * ref = dst - matchDistance;
497#if DEBUG_LZ4_DECODE_ERRORS
498 if (__improbable(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end);
499#endif
500 if (__improbable(ref < dst_begin)) goto OUT_FAIL; // out of range
501
502 // extra bytes for matchLength
503 if (__improbable(matchLength == 19))
504 {
505 uint8_t s;
506 do {
507#if DEBUG_LZ4_DECODE_ERRORS
508 if (__improbable(src >= src_end)) printf("Truncated SRC match length\n");
509#endif
510 if (__improbable(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed)
511 s = *src++;
512 matchLength += s;
513 } while (__improbable(s == 255));
514 }
515
516 // copy match (may overlap)
517 if (__improbable(matchLength > (size_t)(dst_end - dst))) {
518 // match will take us past the end of the destination buffer,
519 // so we can only copy part of it.
520 matchLength = (uint32_t)(dst_end - dst);
521 for (uint32_t i=0; i<matchLength; ++i) dst[i] = ref[i];
522 dst += matchLength;
523 goto OUT_FULL;
524 }
525 for (uint64_t i=0;i<matchLength;i++) dst[i] = ref[i];
526 dst += matchLength;
527 }
528
529 // We reached the end of the input buffer after a full instruction
530OUT_FULL:
531 // Or we reached the end of the output buffer
532 *dst_ptr = dst;
533 *src_ptr = src;
534 return 0;
535
536 // Error conditions
537OUT_FAIL:
538IN_FAIL:
539 return 1; // FAIL
540}
541