/*
 * Copyright (c) 2016-2016 Apple Inc. All rights reserved.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_START@
 *
 * This file contains Original Code and/or Modifications of Original Code
 * as defined in and that are subject to the Apple Public Source License
 * Version 2.0 (the 'License'). You may not use this file except in
 * compliance with the License. The rights granted to you under the License
 * may not be used to create, or enable the creation or redistribution of,
 * unlawful or unlicensed copies of an Apple operating system, or to
 * circumvent, violate, or enable the circumvention or violation of, any
 * terms of an Apple operating system software license agreement.
 *
 * Please obtain a copy of the License at
 * http://www.opensource.apple.com/apsl/ and read it before using this file.
 *
 * The Original Code and all software distributed under the License are
 * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
 * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
 * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
 * Please see the License for the specific language governing rights and
 * limitations under the License.
 *
 * @APPLE_OSREFERENCE_LICENSE_HEADER_END@
 */

// LZ4_RAW buffer API
// EB May 2015
// Mar 2016 Imported from the Compression project, with minor optimisations and
// early abort detection (Derek Kumar)

#include <string.h>
#include "lz4.h"

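/*
 * Raw LZ4 block layout produced and consumed here (no frame header):
 *
 *   token byte LLLLMMMM      L = literal count, M = match length - 4
 *   [literal length extension bytes, present when L == 15]
 *   L literal bytes
 *   2-byte little-endian match distance D (1..65535; 0 is invalid)
 *   [match length extension bytes, present when M == 15]
 *
 * Extension bytes are summed until a byte < 0xff is read (see lz4_store_length
 * and the decode loop in lz4_decode below).
 */
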
size_t
lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
    const uint8_t * __restrict src_buffer, size_t src_size,
    void * __restrict work __attribute__((unused)))
{
	const uint8_t * src = src_buffer;
	uint8_t * dst = dst_buffer;

	// Go fast if we can, keeping away from the end of buffers
#if LZ4_ENABLE_ASSEMBLY_DECODE
	if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN) {
		if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN)) {
			return 0; // FAIL
		}
	}
#endif
	//DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?

	// Finish safe
	if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size)) {
		return 0; // FAIL
	}
	return (size_t)(dst - dst_buffer); // bytes produced
}

// Debug flags
#if LZ4DEBUG
#define DEBUG_LZ4_ENCODE_ERRORS (1)
#define DEBUG_LZ4_DECODE_ERRORS (1)
#endif

#if DEBUG_LZ4_ENCODE_ERRORS
#endif

#if !LZ4_ENABLE_ASSEMBLY_ENCODE

#if defined(__x86_64__) || defined(__x86_64h__)
#  define LZ4_MATCH_SEARCH_INIT_SIZE 32
#  define LZ4_MATCH_SEARCH_LOOP_SIZE 32
#else
#  define LZ4_MATCH_SEARCH_INIT_SIZE 8
#  define LZ4_MATCH_SEARCH_LOOP_SIZE 8
#endif

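// lz4_hash below is a multiplicative (Knuth-style) hash: the odd constant
// 2654435761 is a prime close to 2^32 / golden ratio, and the shift keeps the
// top LZ4_COMPRESS_HASH_BITS bits of the product as the hash table index.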
// Return hash for 4-byte sequence X
static inline uint32_t
lz4_hash(uint32_t x)
{
	return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS);
}

// Store 0xfff..fff at *PTR
static inline void
lz4_fill16(uint8_t * ptr)
{
	store8(ptr, -1);
	store8(ptr + 8, -1);
}

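// The lz4_nmatchN helpers below XOR two unaligned loads, so matching bytes
// become zero; counting trailing zero bits and dividing by 8 then gives the
// number of leading bytes that agree (this relies on little-endian byte order).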
// Return number of matching bytes 0..4 at positions A and B.
static inline size_t
lz4_nmatch4(const uint8_t * a, const uint8_t * b)
{
	uint32_t x = load4(a) ^ load4(b);
	return (x == 0)?4:(__builtin_ctzl(x) >> 3);
}

// Return number of matching bytes 0..8 at positions A and B.
static inline size_t
lz4_nmatch8(const uint8_t * a, const uint8_t * b)
{
	uint64_t x = load8(a) ^ load8(b);
	return (x == 0)?8:(__builtin_ctzll(x) >> 3);
}

// Return number of matching bytes 0..16 at positions A and B.
static inline size_t
lz4_nmatch16(const uint8_t * a, const uint8_t * b)
{
	size_t n = lz4_nmatch8(a, b);
	return (n == 8)?(8 + lz4_nmatch8(a + 8, b + 8)):n;
}

// Return number of matching bytes 0..32 at positions A and B.
static inline size_t
lz4_nmatch32(const uint8_t * a, const uint8_t * b)
{
	size_t n = lz4_nmatch16(a, b);
	return (n == 16)?(16 + lz4_nmatch16(a + 16, b + 16)):n;
}

// Return number of matching bytes 0..64 at positions A and B.
static inline size_t
lz4_nmatch64(const uint8_t * a, const uint8_t * b)
{
	size_t n = lz4_nmatch32(a, b);
	return (n == 32)?(32 + lz4_nmatch32(a + 32, b + 32)):n;
}

// Compile-time selection, return number of matching bytes 0..N at positions A and B.
static inline size_t
lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
{
	switch (N) {
	case 4: return lz4_nmatch4(a, b);
	case 8: return lz4_nmatch8(a, b);
	case 16: return lz4_nmatch16(a, b);
	case 32: return lz4_nmatch32(a, b);
	case 64: return lz4_nmatch64(a, b);
	}
	__builtin_trap(); // FAIL
}

// Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
// We are allowed to access a constant number of bytes above DST_END.
// Return incremented DST pointer on success, and 0 on failure
static inline uint8_t *
lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t L)
{
	(void)end;
	while (L >= 17 * 255) {
		lz4_fill16(dst);
		dst += 16;
		L -= 16 * 255;
	}
	lz4_fill16(dst);
	//DRKTODO verify these modulos/divisions are optimally handled by clang
	dst += L / 255;
	*dst++ = L % 255;
	return dst;
}
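// Example: an extension value of 300 is stored as 0xff 0x2d (the decoder sums
// 255 + 45 = 300, on top of the 15 already encoded in the token nibble, for a
// total run of 315). The extra 0xff bytes written by lz4_fill16 beyond the
// consumed prefix are scratch and are overwritten by whatever the caller
// stores next.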

static inline uint32_t
clamp(uint32_t x, uint32_t max) __attribute__((overloadable))
{
	return x > max ? max : x;
}

static inline uint8_t *
copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L)
{
	uint8_t *end = dst + L;
	{ copy16(dst, src); dst += 16; src += 16; }
	while (dst < end) {
		copy32(dst, src); dst += 32; src += 32;
	}
	return end;
}
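// Note: copy_literal copies in 16- and 32-byte chunks, so it can store up to
// 31 bytes beyond dst + L; the LZ4_GOFAST_SAFETY_MARGIN maintained by the
// callers is what keeps that overrun inside the real destination buffer.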

static uint8_t *
lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
    uint8_t * restrict dst,
    const uint8_t * const end,
    const uint8_t * restrict src)
{
	// The LZ4 encoding scheme requires that M is at least 4, because
	// the actual value stored by the encoding is M - 4. Check this
	// requirement for debug builds.
	assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
	// Having checked that M >= 4, translate M by four.
	M -= 4;
	// Similarly, we must have D < 2**16, because we use only two bytes
	// to represent the value of D in the encoding.
	assert(D <= USHRT_MAX && "LZ4 encoding requires that D can be stored in two bytes.");
	// Construct the command byte by clamping both L and M to 0 ... 15
	// and packing them into a single byte, and store it.
	*dst++ = clamp(L, 15) << 4 | clamp(M, 15);
	// If L is 15 or greater, we need to encode extra literal length bytes.
	if (L >= 15) {
		dst = lz4_store_length(dst, end, L - 15);
		if (dst == 0 || dst + L >= end) {
			return NULL;
		}
	}
	// Copy the literal itself from src to dst.
	dst = copy_literal(dst, src, L);
	// Store match distance.
	store2(dst, D); dst += 2;
	// If M is 15 or greater, we need to encode extra match length bytes.
	if (M >= 15) {
		dst = lz4_store_length(dst, end, M - 15);
		if (dst == 0) {
			return NULL;
		}
	}
	return dst;
}
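// Example: L = 5, M = 8, D = 1024 emits the token 0x54 (5 literals, stored
// match length 8 - 4 = 4), the 5 literal bytes, then the distance bytes
// 0x00 0x04 (little-endian 1024); both fields fit in their 4-bit nibbles, so
// no extension bytes are needed.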

/* #ifndef LZ4_EARLY_ABORT */
/* #define LZ4_EARLY_ABORT (1) */
/* #endif */

#if LZ4_EARLY_ABORT
int lz4_do_early_abort = 1;
int lz4_early_aborts = 0;
#define LZ4_EARLY_ABORT_EVAL (448)
#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
#endif /* LZ4_EARLY_ABORT */
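// Early abort heuristic: once LZ4_EARLY_ABORT_EVAL input bytes have been
// scanned without emitting any output (dst has not advanced), the block is
// treated as incompressible and lz4_encode_2gb returns without updating the
// caller's pointers. The evaluation happens only once, the first time the
// scan position reaches the threshold.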

void
lz4_encode_2gb(uint8_t ** dst_ptr,
    size_t dst_size,
    const uint8_t ** src_ptr,
    const uint8_t * src_begin,
    size_t src_size,
    lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
    int skip_final_literals)
{
	uint8_t *dst = *dst_ptr;       // current output stream position
	uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
	const uint8_t *src = *src_ptr; // current input stream literal to encode
	const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
	const uint8_t *match_begin = 0; // first byte of matched sequence
	const uint8_t *match_end = 0;   // first byte after matched sequence
#if LZ4_EARLY_ABORT
	uint8_t * const dst_begin = dst;
	uint32_t lz4_do_abort_eval = lz4_do_early_abort;
#endif

	while (dst < end) {
		ptrdiff_t match_distance = 0;
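		// Probe four consecutive 4-byte windows per iteration: hash each window,
		// look up and refresh the hash table entry, and take the first candidate
		// whose stored word matches and whose distance fits in 16 bits.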
		for (match_begin = src; match_begin < src_end; match_begin += 1) {
			const uint32_t pos = (uint32_t)(match_begin - src_begin);
			const uint32_t w0 = load4(match_begin);
			const uint32_t w1 = load4(match_begin + 1);
			const uint32_t w2 = load4(match_begin + 2);
			const uint32_t w3 = load4(match_begin + 3);
			const int i0 = lz4_hash(w0);
			const int i1 = lz4_hash(w1);
			const int i2 = lz4_hash(w2);
			const int i3 = lz4_hash(w3);
			const uint8_t *c0 = src_begin + hash_table[i0].offset;
			const uint8_t *c1 = src_begin + hash_table[i1].offset;
			const uint8_t *c2 = src_begin + hash_table[i2].offset;
			const uint8_t *c3 = src_begin + hash_table[i3].offset;
			const uint32_t m0 = hash_table[i0].word;
			const uint32_t m1 = hash_table[i1].word;
			const uint32_t m2 = hash_table[i2].word;
			const uint32_t m3 = hash_table[i3].word;
			hash_table[i0].offset = pos;
			hash_table[i0].word = w0;
			hash_table[i1].offset = pos + 1;
			hash_table[i1].word = w1;

			hash_table[i2].offset = pos + 2;
			hash_table[i2].word = w2;
			hash_table[i3].offset = pos + 3;
			hash_table[i3].word = w3;

			match_distance = (match_begin - c0);
			if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

			match_begin++;
			match_distance = (match_begin - c1);
			if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

			match_begin++;
			match_distance = (match_begin - c2);
			if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

			match_begin++;
			match_distance = (match_begin - c3);
			if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

#if LZ4_EARLY_ABORT
			//DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
			if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
				ptrdiff_t dstd = dst - dst_begin;

				if (dstd == 0) {
					lz4_early_aborts++;
					return;
				}

/*				if (dstd >= pos) { */
/*					return; */
/*				} */
/*				ptrdiff_t cbytes = pos - dstd; */
/*				if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
/*					return; */
/*				} */
				lz4_do_abort_eval = 0;
			}
#endif
		}

		if (skip_final_literals) {
			*src_ptr = src; *dst_ptr = dst; return;
		} // do not emit the final literal sequence

		// Emit a trailing literal that covers the remainder of the source buffer,
		// if we can do so without exceeding the bounds of the destination buffer.
		size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
		if (src_remaining < 15) {
			*dst++ = (uint8_t)(src_remaining << 4);
			memcpy(dst, src, 16); dst += src_remaining;
		} else {
			*dst++ = 0xf0;
			dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
			if (dst == 0 || dst + src_remaining >= end) {
				return;
			}
			memcpy(dst, src, src_remaining); dst += src_remaining;
		}
		*dst_ptr = dst;
		*src_ptr = src + src_remaining;
		return;

EXPAND_FORWARD:

		// Expand match forward
		{
			const uint8_t * ref_end = match_end - match_distance;
			while (match_end < src_end) {
				size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
				if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) {
					match_end += n; break;
				}
				match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
				ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
			}
		}

		// Expand match backward
		{
			// match_begin_min = max(src_begin + match_distance,literal)
			const uint8_t * match_begin_min = src_begin + match_distance;
			match_begin_min = (match_begin_min < src)?src:match_begin_min;
			const uint8_t * ref_begin = match_begin - match_distance;

			while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1]) {
				match_begin -= 1; ref_begin -= 1;
			}
		}

		// Emit match
		dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
		if (!dst) {
			return;
		}

		// Update state
		src = match_end;

		// Update return values to include the last fully encoded match
		*dst_ptr = dst;
		*src_ptr = src;
	}
}

#endif

size_t
lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
    const uint8_t * __restrict src_buffer, size_t src_size,
    lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
{
	// Initialize hash table
	const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 };

	const uint8_t * src = src_buffer;
	uint8_t * dst = dst_buffer;

	// We need several blocks because our base function is limited to 2GB input
	const size_t BLOCK_SIZE = 0x7ffff000;
	while (src_size > 0) {
		//DRKTODO either implement pattern4 or figure out optimal unroll
		//DRKTODO: bizarrely, with plain O3 the compiler generates a single
		//DRKTODO: scalar STP per loop iteration with the stock loop
		//DRKTODO If hand unrolled, it switches to NEON store pairs
		// Reset hash table for each block
/* #if __STDC_HOSTED__ */
/*		memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */
/* #else */
/*		for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */
/* #endif */

		for (int i = 0; i < LZ4_COMPRESS_HASH_ENTRIES;) {
			hash_table[i++] = HASH_FILL;
			hash_table[i++] = HASH_FILL;
			hash_table[i++] = HASH_FILL;
			hash_table[i++] = HASH_FILL;
		}

		// Bytes to encode in this block
		const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;

		// Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
		// Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
		uint8_t * dst_start = dst;
		const uint8_t * src_start = src;
		lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);

		// Check progress
		size_t dst_used = dst - dst_start;
		size_t src_used = src - src_start; // src_used <= src_to_encode
		if (src_to_encode == src_size && src_used < src_to_encode) {
			return 0; // FAIL to encode last block
		}
		// Note that there is a potential problem here in case of non-compressible data requiring more blocks.
		// We may end up here with src_used very small, or even 0, and will not be able to make progress during
		// compression. We FAIL unless the length of literals remaining at the end is small enough.
		if (src_to_encode < src_size && src_to_encode - src_used >= (1 << 16)) {
			return 0; // FAIL too many literals
		}
		// Update counters (SRC and DST already have been updated)
		src_size -= src_used;
		dst_size -= dst_used;
	}

	return (size_t)(dst - dst_buffer); // bytes produced
}
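
/*
 * Illustrative usage sketch (hypothetical caller, not taken from this file):
 * the encoder needs a caller-provided hash table of LZ4_COMPRESS_HASH_ENTRIES
 * entries as scratch, and both functions return 0 on failure, otherwise the
 * number of bytes produced.
 *
 *	lz4_hash_entry_t scratch[LZ4_COMPRESS_HASH_ENTRIES];
 *	size_t enc = lz4raw_encode_buffer(dst, dst_size, src, src_size, scratch);
 *	if (enc != 0) {
 *		size_t dec = lz4raw_decode_buffer(out, out_size, dst, enc, NULL);
 *	}
 */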

typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));

int
lz4_decode(uint8_t ** dst_ptr,
    uint8_t * dst_begin,
    uint8_t * dst_end,
    const uint8_t ** src_ptr,
    const uint8_t * src_end)
{
	uint8_t * dst = *dst_ptr;
	const uint8_t * src = *src_ptr;

	// Require dst_end > dst.
	if (dst_end <= dst) {
		goto OUT_FULL;
	}

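	// Each loop iteration decodes one complete LZ4 command (literal run,
	// distance, match). The caller-visible pointers are refreshed at the top,
	// so on IN_FAIL/OUT_FAIL they still describe the last fully decoded command.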
	while (src < src_end) {
		// Keep last good position
		*src_ptr = src;
		*dst_ptr = dst;

		uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
		uint32_t literalLength = (cmd >> 4) & 15; // 0..15
		uint32_t matchLength = 4 + (cmd & 15); // 4..19

		// extra bytes for literalLength
		if (__improbable(literalLength == 15)) {
			uint8_t s;
			do {
#if DEBUG_LZ4_DECODE_ERRORS
				if (__improbable(src >= src_end)) {
					printf("Truncated SRC literal length\n");
				}
#endif
				if (__improbable(src >= src_end)) {
					goto IN_FAIL; // unexpected end of input (1 byte needed)
				}
				s = *src++;
				literalLength += s;
			} while (__improbable(s == 255));
		}

		// copy literal
#if DEBUG_LZ4_DECODE_ERRORS
		if (__improbable(literalLength > (size_t)(src_end - src))) {
			printf("Truncated SRC literal\n");
		}
#endif
		if (__improbable(literalLength > (size_t)(src_end - src))) {
			goto IN_FAIL;
		}
		if (__improbable(literalLength > (size_t)(dst_end - dst))) {
			// literal will take us past the end of the destination buffer,
			// so we can only copy part of it.
			literalLength = (uint32_t)(dst_end - dst);
			memcpy(dst, src, literalLength);
			dst += literalLength;
			goto OUT_FULL;
		}
		memcpy(dst, src, literalLength);
		src += literalLength;
		dst += literalLength;

		if (__improbable(src >= src_end)) {
			goto OUT_FULL; // valid end of stream
		}
#if DEBUG_LZ4_DECODE_ERRORS
		if (__improbable(2 > (size_t)(src_end - src))) {
			printf("Truncated SRC distance\n");
		}
#endif
		if (__improbable(2 > (size_t)(src_end - src))) {
			goto IN_FAIL; // unexpected end of input (2 bytes needed)
		}
		//DRKTODO: this causes an alignment increase warning (legitimate?)
		//DRKTODO: cast of char * to uint16_t*
	#pragma clang diagnostic push
	#pragma clang diagnostic ignored "-Wcast-align"

		// match distance
		uint64_t matchDistance = *(const uint16_t *)src; // 0x0000 <= matchDistance <= 0xffff
	#pragma clang diagnostic pop
		src += 2;
#if DEBUG_LZ4_DECODE_ERRORS
		if (matchDistance == 0) {
			printf("Invalid match distance D = 0\n");
		}
#endif
		if (__improbable(matchDistance == 0)) {
			goto IN_FAIL; // 0x0000 invalid
		}
		uint8_t * ref = dst - matchDistance;
#if DEBUG_LZ4_DECODE_ERRORS
		if (__improbable(ref < dst_begin)) {
			printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n", matchDistance, dst_begin, dst, dst_end);
		}
#endif
		if (__improbable(ref < dst_begin)) {
			goto OUT_FAIL; // out of range
		}
		// extra bytes for matchLength
		if (__improbable(matchLength == 19)) {
			uint8_t s;
			do {
#if DEBUG_LZ4_DECODE_ERRORS
				if (__improbable(src >= src_end)) {
					printf("Truncated SRC match length\n");
				}
#endif
				if (__improbable(src >= src_end)) {
					goto IN_FAIL; // unexpected end of input (1 byte needed)
				}
				s = *src++;
				matchLength += s;
			} while (__improbable(s == 255));
		}

		// copy match (may overlap)
		if (__improbable(matchLength > (size_t)(dst_end - dst))) {
			// match will take us past the end of the destination buffer,
			// so we can only copy part of it.
			matchLength = (uint32_t)(dst_end - dst);
			for (uint32_t i = 0; i < matchLength; ++i) {
				dst[i] = ref[i];
			}
			dst += matchLength;
			goto OUT_FULL;
		}
		for (uint64_t i = 0; i < matchLength; i++) {
			dst[i] = ref[i];
		}
		dst += matchLength;
	}

	// We reached the end of the input buffer after a full instruction
OUT_FULL:
	// Or we reached the end of the output buffer
	*dst_ptr = dst;
	*src_ptr = src;
	return 0;

	// Error conditions
OUT_FAIL:
IN_FAIL:
	return 1; // FAIL
}