| 1 | /* |
| 2 | * Copyright (c) 2016-2016 Apple Inc. All rights reserved. |
| 3 | * |
| 4 | * @APPLE_OSREFERENCE_LICENSE_HEADER_START@ |
| 5 | * |
| 6 | * This file contains Original Code and/or Modifications of Original Code |
| 7 | * as defined in and that are subject to the Apple Public Source License |
| 8 | * Version 2.0 (the 'License'). You may not use this file except in |
| 9 | * compliance with the License. The rights granted to you under the License |
| 10 | * may not be used to create, or enable the creation or redistribution of, |
| 11 | * unlawful or unlicensed copies of an Apple operating system, or to |
| 12 | * circumvent, violate, or enable the circumvention or violation of, any |
| 13 | * terms of an Apple operating system software license agreement. |
| 14 | * |
| 15 | * Please obtain a copy of the License at |
| 16 | * http://www.opensource.apple.com/apsl/ and read it before using this file. |
| 17 | * |
| 18 | * The Original Code and all software distributed under the License are |
| 19 | * distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER |
| 20 | * EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES, |
| 21 | * INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY, |
| 22 | * FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT. |
| 23 | * Please see the License for the specific language governing rights and |
| 24 | * limitations under the License. |
| 25 | * |
| 26 | * @APPLE_OSREFERENCE_LICENSE_HEADER_END@ |
| 27 | */ |
| 28 | |
| 29 | // LZ4_RAW buffer API |
| 30 | // EB May 2015 |
| 31 | // Mar 2016 Imported from the Compression project, with minor optimisations and |
| 32 | // early abort detection (Derek Kumar) |
| 33 | |
| 34 | #include <string.h> |
| 35 | #include "lz4.h" |
| 36 | |
// Decode an LZ4_RAW payload from src_buffer into dst_buffer.
// Returns the number of bytes produced in dst_buffer, or 0 on failure.
// WORK is unused by this implementation (kept for API compatibility).
//
// Fix: the call sites previously carried Swift/IDE-style argument labels
// ("dst_ptr: &dst", etc.), which are not valid C syntax; labels removed.
size_t
lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
    const uint8_t * __restrict src_buffer, size_t src_size,
    void * __restrict work __attribute__((unused)))
{
	const uint8_t * src = src_buffer;
	uint8_t * dst = dst_buffer;

	// Go fast if we can, keeping away from the end of buffers
#if LZ4_ENABLE_ASSEMBLY_DECODE
	if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN) {
		if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN)) {
			return 0; // FAIL
		}
	}
#endif
	//DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?

	// Finish safe: the C decoder checks bounds on every step, so it can run
	// all the way to the true end of both buffers.
	if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size)) {
		return 0; // FAIL
	}
	return (size_t)(dst - dst_buffer); // bytes produced
}
// Debug flags: when LZ4DEBUG is set, the encoder/decoder print a diagnostic
// before taking each failure path (see lz4_decode below).
#if LZ4DEBUG
#define DEBUG_LZ4_ENCODE_ERRORS (1)
#define DEBUG_LZ4_DECODE_ERRORS (1)
#endif

#if DEBUG_LZ4_ENCODE_ERRORS
#endif

// C fallback encoder, used only when no assembly implementation is enabled.
#if !LZ4_ENABLE_ASSEMBLY_ENCODE

// Match-extension granularity (bytes compared per step in lz4_nmatch):
// wider on x86_64 where the 32-byte compare is profitable.
#if defined(__x86_64__) || defined(__x86_64h__)
#  define LZ4_MATCH_SEARCH_INIT_SIZE 32
#  define LZ4_MATCH_SEARCH_LOOP_SIZE 32
#else
#  define LZ4_MATCH_SEARCH_INIT_SIZE 8
#  define LZ4_MATCH_SEARCH_LOOP_SIZE 8
#endif
| 79 | |
| 80 | // Return hash for 4-byte sequence X |
| 81 | static inline uint32_t |
| 82 | lz4_hash(uint32_t x) |
| 83 | { |
| 84 | return (x * 2654435761U) >> (32 - LZ4_COMPRESS_HASH_BITS); |
| 85 | } |
| 86 | |
| 87 | // Store 0xfff..fff at *PTR |
// Store 16 bytes of 0xff at *PTR (two 8-byte stores).
static inline void
lz4_fill16(uint8_t * ptr)
{
	for (int offset = 0; offset < 16; offset += 8) {
		store8(ptr + offset, -1);
	}
}
| 94 | |
| 95 | // Return number of matching bytes 0..4 at positions A and B. |
// Return number of matching bytes 0..4 at positions A and B.
// XORing the two 4-byte loads leaves the first differing byte as the
// lowest set bits; count trailing zero bits and divide by 8.
static inline size_t
lz4_nmatch4(const uint8_t * a, const uint8_t * b)
{
	uint32_t diff = load4(a) ^ load4(b);
	if (diff == 0) {
		return 4;
	}
	return (size_t)(__builtin_ctzl(diff) >> 3);
}
| 102 | |
| 103 | // Return number of matching bytes 0..8 at positions A and B. |
// Return number of matching bytes 0..8 at positions A and B.
// Same XOR/count-trailing-zeros trick as lz4_nmatch4, on 8-byte words.
static inline size_t
lz4_nmatch8(const uint8_t * a, const uint8_t * b)
{
	uint64_t diff = load8(a) ^ load8(b);
	if (diff == 0) {
		return 8;
	}
	return (size_t)(__builtin_ctzll(diff) >> 3);
}
| 110 | |
| 111 | // Return number of matching bytes 0..16 at positions A and B. |
// Return number of matching bytes 0..16 at positions A and B.
// Only continue into the second 8-byte word if the first matched fully.
static inline size_t
lz4_nmatch16(const uint8_t * a, const uint8_t * b)
{
	size_t head = lz4_nmatch8(a, b);
	if (head < 8) {
		return head;
	}
	return 8 + lz4_nmatch8(a + 8, b + 8);
}
| 118 | |
| 119 | // Return number of matching bytes 0..32 at positions A and B. |
// Return number of matching bytes 0..32 at positions A and B.
// Only continue into the second 16-byte half if the first matched fully.
static inline size_t
lz4_nmatch32(const uint8_t * a, const uint8_t * b)
{
	size_t head = lz4_nmatch16(a, b);
	if (head < 16) {
		return head;
	}
	return 16 + lz4_nmatch16(a + 16, b + 16);
}
| 126 | |
| 127 | // Return number of matching bytes 0..64 at positions A and B. |
// Return number of matching bytes 0..64 at positions A and B.
// Only continue into the second 32-byte half if the first matched fully.
static inline size_t
lz4_nmatch64(const uint8_t * a, const uint8_t * b)
{
	size_t head = lz4_nmatch32(a, b);
	if (head < 32) {
		return head;
	}
	return 32 + lz4_nmatch32(a + 32, b + 32);
}
| 134 | |
| 135 | // Compile-time selection, return number of matching bytes 0..N at positions A and B. |
// Compile-time selection, return number of matching bytes 0..N at positions
// A and B. N must be one of {4, 8, 16, 32, 64}; any other value traps.
// With a constant N the whole chain folds to a single direct call.
static inline size_t
lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
{
	if (N == 4) {
		return lz4_nmatch4(a, b);
	}
	if (N == 8) {
		return lz4_nmatch8(a, b);
	}
	if (N == 16) {
		return lz4_nmatch16(a, b);
	}
	if (N == 32) {
		return lz4_nmatch32(a, b);
	}
	if (N == 64) {
		return lz4_nmatch64(a, b);
	}
	__builtin_trap(); // FAIL
}
| 148 | |
| 149 | // Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff. |
| 150 | // We are allowed to access a constant number of bytes above DST_END. |
| 151 | // Return incremented DST pointer on success, and 0 on failure |
// Store LEN in DST using the literal_length/match_length extension scheme:
// the decoder sums bytes until it reads one < 0xff. We always emit in
// 16-byte 0xff chunks (lz4_fill16), so we are allowed to write a constant
// number of bytes past the final position; the caller re-checks bounds.
// Return incremented DST pointer on success, and 0 on failure.
static inline uint8_t *
lz4_store_length(uint8_t * dst, const uint8_t * const end, uint32_t len)
{
	(void)end;
	// While at least 17*255 remains, a full 16 bytes of 0xff are consumed.
	for (; len >= 17 * 255; len -= 16 * 255, dst += 16) {
		lz4_fill16(dst);
	}
	// One more chunk of 0xff covers the remaining full-255 bytes; then the
	// final byte (len % 255 < 0xff) terminates the run for the decoder.
	lz4_fill16(dst);
	dst += len / 255;
	*dst++ = len % 255;
	return dst;
}
| 167 | |
| 168 | static inline uint32_t |
| 169 | clamp(uint32_t x, uint32_t max) |
| 170 | __attribute__((overloadable)) |
| 171 | { |
| 172 | return x > max ? max : x; |
| 173 | } |
| 174 | |
// Copy L literal bytes from SRC to DST, returning DST + L.
// Copies in 16/32-byte chunks, so up to 31 bytes beyond DST + L may be
// written; callers must guarantee that margin exists in the output buffer.
static inline uint8_t *
copy_literal(uint8_t *dst, const uint8_t * restrict src, uint32_t L)
{
	uint8_t *limit = dst + L;
	// Unconditional first chunk covers the common short-literal case.
	copy16(dst, src);
	dst += 16;
	src += 16;
	// Remaining data moves in 32-byte strides until we pass the limit.
	while (dst < limit) {
		copy32(dst, src);
		dst += 32;
		src += 32;
	}
	return limit;
}
| 185 | |
// Emit one LZ4 command to DST: a literal run of L bytes copied from SRC,
// followed by a match of length M at distance D. END bounds the output
// (a constant margin beyond END may be touched by lz4_store_length and
// copy_literal; the explicit checks below keep the final cursor in range).
// Returns the advanced DST on success, NULL if the output would overflow.
static uint8_t *
lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
    uint8_t * restrict dst,
    const uint8_t * const end,
    const uint8_t * restrict src)
{
	// The LZ4 encoding scheme requires that M is at least 4, because
	// the actual value stored by the encoding is M - 4. Check this
	// requirement for debug builds.
	assert(M >= 4 && "LZ4 encoding requires that M is at least 4");
	// Having checked that M >= 4, translate M by four.
	M -= 4;
	// Similarly, we must have D < 2**16, because we use only two bytes
	// to represent the value of D in the encoding.
	assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");
	// Construct the command byte by clamping both L and M to 0 ... 15
	// and packing them into a single byte, and store it.
	*dst++ = clamp(L, 15) << 4 | clamp(M, 15);
	// If L is 15 or greater, we need to encode extra literal length bytes.
	if (L >= 15) {
		dst = lz4_store_length(dst, end, L - 15);
		// Bounds are only re-checked on this path; short literals rely on
		// the caller's safety margin before END.
		if (dst == 0 || dst + L >= end) {
			return NULL;
		}
	}
	// Copy the literal itself from src to dst.
	dst = copy_literal(dst, src, L);
	// Store match distance (2 bytes, little-endian via store2).
	store2(dst, D); dst += 2;
	// If M is 15 or greater, we need to encode extra match length bytes.
	if (M >= 15) {
		dst = lz4_store_length(dst, end, M - 15);
		if (dst == 0) {
			return NULL;
		}
	}
	return dst;
}
| 224 | |
/* #ifndef LZ4_EARLY_ABORT */
/* #define LZ4_EARLY_ABORT (1) */
/* #endif */

#if LZ4_EARLY_ABORT
// Runtime switch for the early-abort heuristic in lz4_encode_2gb, and a
// counter of how often it fired. NOTE(review): both are plain non-atomic
// globals; concurrent encoders would race on lz4_early_aborts — confirm
// callers serialize or tolerate lost increments.
int lz4_do_early_abort = 1;
int lz4_early_aborts = 0;
// After this many input bytes with zero output produced, give up.
#define LZ4_EARLY_ABORT_EVAL (448)
#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
#endif /* LZ4_EARLY_ABORT */
| 235 | |
// Encode up to 2GB of input from *src_ptr into *dst_ptr using HASH_TABLE
// for match finding. On return, *src_ptr/*dst_ptr are advanced past the
// last fully encoded match (or past the final literal when emitted).
// src_begin is the base for hash-table offsets (block origin).
// skip_final_literals suppresses the trailing literal so blocks can be
// concatenated. Progress is reported only through the two pointers; the
// caller detects failure/partial progress by comparing them.
void
lz4_encode_2gb(uint8_t ** dst_ptr,
    size_t dst_size,
    const uint8_t ** src_ptr,
    const uint8_t * src_begin,
    size_t src_size,
    lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
    int skip_final_literals)
{
	uint8_t *dst = *dst_ptr; // current output stream position
	uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
	const uint8_t *src = *src_ptr; // current input stream literal to encode
	const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
	const uint8_t *match_begin = 0; // first byte of matched sequence
	const uint8_t *match_end = 0; // first byte after matched sequence
#if LZ4_EARLY_ABORT
	uint8_t * const dst_begin = dst;
	uint32_t lz4_do_abort_eval = lz4_do_early_abort;
#endif

	while (dst < end) {
		ptrdiff_t match_distance = 0;
		// Match search, 4x unrolled: each iteration hashes the 4-byte
		// windows at positions pos..pos+3 (the inner match_begin++ steps
		// plus the for-increment advance 4 per iteration).
		for (match_begin = src; match_begin < src_end; match_begin += 1) {
			const uint32_t pos = (uint32_t)(match_begin - src_begin);
			const uint32_t w0 = load4(match_begin);
			const uint32_t w1 = load4(match_begin + 1);
			const uint32_t w2 = load4(match_begin + 2);
			const uint32_t w3 = load4(match_begin + 3);
			const int i0 = lz4_hash(w0);
			const int i1 = lz4_hash(w1);
			const int i2 = lz4_hash(w2);
			const int i3 = lz4_hash(w3);
			// Candidate positions previously stored under the same hashes,
			// together with the words that were there (to reject hash
			// collisions without touching the candidate bytes).
			const uint8_t *c0 = src_begin + hash_table[i0].offset;
			const uint8_t *c1 = src_begin + hash_table[i1].offset;
			const uint8_t *c2 = src_begin + hash_table[i2].offset;
			const uint8_t *c3 = src_begin + hash_table[i3].offset;
			const uint32_t m0 = hash_table[i0].word;
			const uint32_t m1 = hash_table[i1].word;
			const uint32_t m2 = hash_table[i2].word;
			const uint32_t m3 = hash_table[i3].word;
			// Refresh the table with the current positions before testing,
			// so future searches see the most recent occurrences.
			hash_table[i0].offset = pos;
			hash_table[i0].word = w0;
			hash_table[i1].offset = pos + 1;
			hash_table[i1].word = w1;

			hash_table[i2].offset = pos + 2;
			hash_table[i2].word = w2;
			hash_table[i3].offset = pos + 3;
			hash_table[i3].word = w3;

			// A usable match needs identical words and a distance that
			// fits the 2-byte encoding (1..0xffff).
			match_distance = (match_begin - c0);
			if (w0 == m0 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

			match_begin++;
			match_distance = (match_begin - c1);
			if (w1 == m1 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

			match_begin++;
			match_distance = (match_begin - c2);
			if (w2 == m2 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

			match_begin++;
			match_distance = (match_begin - c3);
			if (w3 == m3 && match_distance < 0x10000 && match_distance > 0) {
				match_end = match_begin + 4;
				goto EXPAND_FORWARD;
			}

#if LZ4_EARLY_ABORT
			//DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
			// One-shot heuristic: if we've scanned LZ4_EARLY_ABORT_EVAL
			// bytes without emitting any output, assume incompressible.
			if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
				ptrdiff_t dstd = dst - dst_begin;

				if (dstd == 0) {
					lz4_early_aborts++;
					return;
				}

				/* if (dstd >= pos) { */
				/*      return; */
				/* } */
				/* ptrdiff_t cbytes = pos - dstd; */
				/* if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { */
				/*      return; */
				/* } */
				lz4_do_abort_eval = 0;
			}
#endif
		}

		// No further match found before src_end.
		if (skip_final_literals) {
			*src_ptr = src; *dst_ptr = dst; return;
		} // do not emit the final literal sequence

		// Emit a trailing literal that covers the remainder of the source buffer,
		// if we can do so without exceeding the bounds of the destination buffer.
		size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
		if (src_remaining < 15) {
			// Length fits in the command nibble; the 16-byte memcpy relies
			// on the destination safety margin.
			*dst++ = (uint8_t)(src_remaining << 4);
			memcpy(dst, src, 16); dst += src_remaining;
		} else {
			*dst++ = 0xf0;
			dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - 15));
			if (dst == 0 || dst + src_remaining >= end) {
				return;
			}
			memcpy(dst, src, src_remaining); dst += src_remaining;
		}
		*dst_ptr = dst;
		*src_ptr = src + src_remaining;
		return;

EXPAND_FORWARD:

		// Expand match forward, comparing LZ4_MATCH_SEARCH_LOOP_SIZE bytes
		// at a time until the first mismatch or the end of input.
		{
			const uint8_t * ref_end = match_end - match_distance;
			while (match_end < src_end) {
				size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
				if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) {
					match_end += n; break;
				}
				match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
				ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
			}
		}

		// Expand match backward into the pending literal, byte by byte.
		{
			// match_begin_min = max(src_begin + match_distance,literal)
			const uint8_t * match_begin_min = src_begin + match_distance;
			match_begin_min = (match_begin_min < src)?src:match_begin_min;
			const uint8_t * ref_begin = match_begin - match_distance;

			while (match_begin > match_begin_min && ref_begin[-1] == match_begin[-1]) {
				match_begin -= 1; ref_begin -= 1;
			}
		}

		// Emit match: literal run is src..match_begin, then the match itself.
		dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
		if (!dst) {
			return;
		}

		// Update state
		src = match_end;

		// Update return values to include the last fully encoded match
		*dst_ptr = dst;
		*src_ptr = src;
	}
}
| 398 | |
| 399 | #endif |
| 400 | |
| 401 | size_t |
| 402 | lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size, |
| 403 | const uint8_t * __restrict src_buffer, size_t src_size, |
| 404 | lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES]) |
| 405 | { |
| 406 | // Initialize hash table |
| 407 | const lz4_hash_entry_t HASH_FILL = { .offset = 0x80000000, .word = 0x0 }; |
| 408 | |
| 409 | const uint8_t * src = src_buffer; |
| 410 | uint8_t * dst = dst_buffer; |
| 411 | |
| 412 | // We need several blocks because our base function is limited to 2GB input |
| 413 | const size_t BLOCK_SIZE = 0x7ffff000; |
| 414 | while (src_size > 0) { |
| 415 | //DRKTODO either implement pattern4 or figure out optimal unroll |
| 416 | //DRKTODO: bizarrely, with plain O3 the compiler generates a single |
| 417 | //DRKTODO: scalar STP per loop iteration with the stock loop |
| 418 | //DRKTODO If hand unrolled, it switches to NEON store pairs |
| 419 | // Reset hash table for each block |
| 420 | /* #if __STDC_HOSTED__ */ |
| 421 | /* memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); */ |
| 422 | /* #else */ |
| 423 | /* for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; */ |
| 424 | /* #endif */ |
| 425 | |
| 426 | for (int i = 0; i < LZ4_COMPRESS_HASH_ENTRIES;) { |
| 427 | hash_table[i++] = HASH_FILL; |
| 428 | hash_table[i++] = HASH_FILL; |
| 429 | hash_table[i++] = HASH_FILL; |
| 430 | hash_table[i++] = HASH_FILL; |
| 431 | } |
| 432 | |
| 433 | // Bytes to encode in this block |
| 434 | const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size; |
| 435 | |
| 436 | // Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads. |
| 437 | // Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer |
| 438 | uint8_t * dst_start = dst; |
| 439 | const uint8_t * src_start = src; |
| 440 | lz4_encode_2gb(dst_ptr: &dst, dst_size, src_ptr: &src, src_begin: src, src_size: src_to_encode, hash_table, skip_final_literals: src_to_encode < src_size); |
| 441 | |
| 442 | // Check progress |
| 443 | size_t dst_used = dst - dst_start; |
| 444 | size_t src_used = src - src_start; // src_used <= src_to_encode |
| 445 | if (src_to_encode == src_size && src_used < src_to_encode) { |
| 446 | return 0; // FAIL to encode last block |
| 447 | } |
| 448 | // Note that there is a potential problem here in case of non compressible data requiring more blocks. |
| 449 | // We may end up here with src_used very small, or even 0, and will not be able to make progress during |
| 450 | // compression. We FAIL unless the length of literals remaining at the end is small enough. |
| 451 | if (src_to_encode < src_size && src_to_encode - src_used >= (1 << 16)) { |
| 452 | return 0; // FAIL too many literals |
| 453 | } |
| 454 | // Update counters (SRC and DST already have been updated) |
| 455 | src_size -= src_used; |
| 456 | dst_size -= dst_used; |
| 457 | } |
| 458 | |
| 459 | return (size_t)(dst - dst_buffer); // bytes produced |
| 460 | } |
| 461 | |
typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(4))) __attribute__((__aligned__(1)));

// Safe (fully bounds-checked) C decoder for an LZ4_RAW stream.
// Decodes from *src_ptr up to src_end into *dst_ptr, bounded by dst_end;
// dst_begin is the start of the output buffer (lower bound for match
// references). On return, *dst_ptr/*src_ptr point past the last fully
// decoded data. Returns 0 on success (end of input, or output full) and
// 1 on malformed input.
//
// Fixes: the two memcpy calls previously carried a Swift/IDE-style "n:"
// argument label (invalid C); and the match distance is now read with
// memcpy instead of a pragma-suppressed misaligned uint16_t* cast, which
// is alignment- and strict-aliasing-safe with identical behavior on the
// little-endian targets this code serves.
int
lz4_decode(uint8_t ** dst_ptr,
    uint8_t * dst_begin,
    uint8_t * dst_end,
    const uint8_t ** src_ptr,
    const uint8_t * src_end)
{
	uint8_t * dst = *dst_ptr;
	const uint8_t * src = *src_ptr;

	// Require dst_end > dst.
	if (dst_end <= dst) {
		goto OUT_FULL;
	}

	while (src < src_end) {
		// Keep last good position
		*src_ptr = src;
		*dst_ptr = dst;

		uint8_t cmd = *src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM
		uint32_t literalLength = (cmd >> 4) & 15; // 0..15
		uint32_t matchLength = 4 + (cmd & 15); // 4..19

		// extra bytes for literalLength: sum continuation bytes until one < 255
		if (__improbable(literalLength == 15)) {
			uint8_t s;
			do {
#if DEBUG_LZ4_DECODE_ERRORS
				if (__improbable(src >= src_end)) {
					printf("Truncated SRC literal length\n");
				}
#endif
				if (__improbable(src >= src_end)) {
					goto IN_FAIL; // unexpected end of input (1 byte needed)
				}
				s = *src++;
				literalLength += s;
			} while (__improbable(s == 255));
		}

		// copy literal
#if DEBUG_LZ4_DECODE_ERRORS
		if (__improbable(literalLength > (size_t)(src_end - src))) {
			printf("Truncated SRC literal\n");
		}
#endif
		if (__improbable(literalLength > (size_t)(src_end - src))) {
			goto IN_FAIL;
		}
		if (__improbable(literalLength > (size_t)(dst_end - dst))) {
			// literal will take us past the end of the destination buffer,
			// so we can only copy part of it.
			literalLength = (uint32_t)(dst_end - dst);
			memcpy(dst, src, literalLength);
			dst += literalLength;
			goto OUT_FULL;
		}
		memcpy(dst, src, literalLength);
		src += literalLength;
		dst += literalLength;

		if (__improbable(src >= src_end)) {
			goto OUT_FULL; // valid end of stream
		}
#if DEBUG_LZ4_DECODE_ERRORS
		if (__improbable(2 > (size_t)(src_end - src))) {
			printf("Truncated SRC distance\n");
		}
#endif
		if (__improbable(2 > (size_t)(src_end - src))) {
			goto IN_FAIL; // unexpected end of input (2 bytes needed)
		}

		// match distance: read the 2-byte little-endian distance via memcpy
		// (src may be misaligned, and a uint16_t* cast would also violate
		// strict aliasing).
		uint16_t distance16;
		memcpy(&distance16, src, sizeof distance16);
		uint64_t matchDistance = distance16; // 0x0000 <= matchDistance <= 0xffff
		src += 2;
#if DEBUG_LZ4_DECODE_ERRORS
		if (matchDistance == 0) {
			printf("Invalid match distance D = 0\n");
		}
#endif
		if (__improbable(matchDistance == 0)) {
			goto IN_FAIL; // 0x0000 invalid
		}
		uint8_t * ref = dst - matchDistance;
#if DEBUG_LZ4_DECODE_ERRORS
		if (__improbable(ref < dst_begin)) {
			printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n", matchDistance, dst_begin, dst, dst_end);
		}
#endif
		if (__improbable(ref < dst_begin)) {
			goto OUT_FAIL; // out of range
		}
		// extra bytes for matchLength: same continuation scheme as literals
		if (__improbable(matchLength == 19)) {
			uint8_t s;
			do {
#if DEBUG_LZ4_DECODE_ERRORS
				if (__improbable(src >= src_end)) {
					printf("Truncated SRC match length\n");
				}
#endif
				if (__improbable(src >= src_end)) {
					goto IN_FAIL; // unexpected end of input (1 byte needed)
				}
				s = *src++;
				matchLength += s;
			} while (__improbable(s == 255));
		}

		// copy match; byte-by-byte because ref and dst may overlap when
		// matchDistance < matchLength (the LZ4 run-encoding case).
		if (__improbable(matchLength > (size_t)(dst_end - dst))) {
			// match will take us past the end of the destination buffer,
			// so we can only copy part of it.
			matchLength = (uint32_t)(dst_end - dst);
			for (uint32_t i = 0; i < matchLength; ++i) {
				dst[i] = ref[i];
			}
			dst += matchLength;
			goto OUT_FULL;
		}
		for (uint64_t i = 0; i < matchLength; i++) {
			dst[i] = ref[i];
		}
		dst += matchLength;
	}

	// We reached the end of the input buffer after a full instruction
OUT_FULL:
	// Or we reached the end of the output buffer
	*dst_ptr = dst;
	*src_ptr = src;
	return 0;

	// Error conditions
OUT_FAIL:
IN_FAIL:
	return 1; // FAIL
}
| 609 | |