lz4.c source code [xnu/osfmk/vm/lz4.c]

1	/*
2	* Copyright (c) 2016-2016 Apple Inc. All rights reserved.
3	*
4	* @APPLE_OSREFERENCE_LICENSE_HEADER_START@
5	*
6	* This file contains Original Code and/or Modifications of Original Code
7	* as defined in and that are subject to the Apple Public Source License
8	* Version 2.0 (the 'License'). You may not use this file except in
9	* compliance with the License. The rights granted to you under the License
10	* may not be used to create, or enable the creation or redistribution of,
11	* unlawful or unlicensed copies of an Apple operating system, or to
12	* circumvent, violate, or enable the circumvention or violation of, any
13	* terms of an Apple operating system software license agreement.
14	*
15	* Please obtain a copy of the License at
16	* http://www.opensource.apple.com/apsl/ and read it before using this file.
17	*
18	* The Original Code and all software distributed under the License are
19	* distributed on an 'AS IS' basis, WITHOUT WARRANTY OF ANY KIND, EITHER
20	* EXPRESS OR IMPLIED, AND APPLE HEREBY DISCLAIMS ALL SUCH WARRANTIES,
21	* INCLUDING WITHOUT LIMITATION, ANY WARRANTIES OF MERCHANTABILITY,
22	* FITNESS FOR A PARTICULAR PURPOSE, QUIET ENJOYMENT OR NON-INFRINGEMENT.
23	* Please see the License for the specific language governing rights and
24	* limitations under the License.
25	*
26	* @APPLE_OSREFERENCE_LICENSE_HEADER_END@
27	*/
28
29	// LZ4_RAW buffer API
30	// EB May 2015
31	// Mar 2016 Imported from the Compression project, with minor optimisations and
32	// early abort detection (Derek Kumar)
33
34	#include "lz4.h"
35	#define memcpy __builtin_memcpy
36
37	size_t lz4raw_decode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
38	const uint8_t * __restrict src_buffer, size_t src_size,
39	void * __restrict work __attribute__((unused)))
40	{
41	const uint8_t * src = src_buffer;
42	uint8_t * dst = dst_buffer;
43
44	// Go fast if we can, keeping away from the end of buffers
45	#if LZ4_ENABLE_ASSEMBLY_DECODE
46	if (dst_size > LZ4_GOFAST_SAFETY_MARGIN && src_size > LZ4_GOFAST_SAFETY_MARGIN)
47	{
48	if (lz4_decode_asm(&dst, dst_buffer, dst_buffer + dst_size - LZ4_GOFAST_SAFETY_MARGIN, &src, src_buffer + src_size - LZ4_GOFAST_SAFETY_MARGIN))
49	return `0`; // FAIL
50	}
51	#endif
52	//DRKTODO: Can the 'C' "safety" decode be eliminated for 4/16K fixed-sized buffers?
53
54	// Finish safe
55	if (lz4_decode(&dst, dst_buffer, dst_buffer + dst_size, &src, src_buffer + src_size))
56	return `0`; // FAIL
57
58	return (size_t)(dst - dst_buffer); // bytes produced
59	}
60	// Debug flags
61	#if LZ4DEBUG
62	#define DEBUG_LZ4_ENCODE_ERRORS (1)
63	#define DEBUG_LZ4_DECODE_ERRORS (1)
64	#endif
65
66	#if DEBUG_LZ4_ENCODE_ERRORS
67	#endif
68
69	#if !LZ4_ENABLE_ASSEMBLY_ENCODE
70
71	#if defined(__x86_64__) \|\| defined(__x86_64h__)
72	# define LZ4_MATCH_SEARCH_INIT_SIZE 32
73	# define LZ4_MATCH_SEARCH_LOOP_SIZE 32
74	#else
75	# define LZ4_MATCH_SEARCH_INIT_SIZE 8
76	# define LZ4_MATCH_SEARCH_LOOP_SIZE 8
77	#endif
78
79	// Return hash for 4-byte sequence X
80	static inline uint32_t lz4_hash(uint32_t x) { return (x * `2654435761U`) >> (`32` - LZ4_COMPRESS_HASH_BITS); }
81
82	// Store 0xfff..fff at PTR*
83	static inline void lz4_fill16(uint8_t * ptr)
84	{
85	store8(ptr,-`1`);
86	store8(ptr+`8`,-`1`);
87	}
88
89	// Return number of matching bytes 0..4 at positions A and B.
90	static inline size_t lz4_nmatch4(const uint8_t * a,const uint8_t * b)
91	{
92	uint32_t x = load4(a) ^ load4(b);
93	return (x == `0`)?`4`:(__builtin_ctzl(x) >> `3`);
94	}
95
96	// Return number of matching bytes 0..8 at positions A and B.
97	static inline size_t lz4_nmatch8(const uint8_t * a,const uint8_t * b)
98	{
99	uint64_t x = load8(a) ^ load8(b);
100	return (x == `0`)?`8`:(__builtin_ctzll(x) >> `3`);
101	}
102
103	// Return number of matching bytes 0..16 at positions A and B.
104	static inline size_t lz4_nmatch16(const uint8_t * a,const uint8_t * b)
105	{
106	size_t n = lz4_nmatch8(a,b);
107	return (n == `8`)?(`8` + lz4_nmatch8(a+`8`,b+`8`)):n;
108	}
109
110	// Return number of matching bytes 0..32 at positions A and B.
111	static inline size_t lz4_nmatch32(const uint8_t * a,const uint8_t * b)
112	{
113	size_t n = lz4_nmatch16(a,b);
114	return (n == `16`)?(`16` + lz4_nmatch16(a+`16`,b+`16`)):n;
115	}
116
117	// Return number of matching bytes 0..64 at positions A and B.
118	static inline size_t lz4_nmatch64(const uint8_t * a,const uint8_t * b)
119	{
120	size_t n = lz4_nmatch32(a,b);
121	return (n == `32`)?(`32` + lz4_nmatch32(a+`32`,b+`32`)):n;
122	}
123
124	// Compile-time selection, return number of matching bytes 0..N at positions A and B.
125	static inline size_t lz4_nmatch(int N, const uint8_t * a, const uint8_t * b)
126	{
127	switch (N) {
128	case `4`: return lz4_nmatch4(a,b);
129	case `8`: return lz4_nmatch8(a,b);
130	case `16`: return lz4_nmatch16(a,b);
131	case `32`: return lz4_nmatch32(a,b);
132	case `64`: return lz4_nmatch64(a,b);
133	}
134	__builtin_trap(); // FAIL
135	}
136
137	// Store LENGTH in DST using the literal_length/match_length extension scheme: X is the sum of all bytes until we reach a byte < 0xff.
138	// We are allowed to access a constant number of bytes above DST_END.
139	// Return incremented DST pointer on success, and 0 on failure
140	static inline uint8_t lz4_store_length(uint8_t dst, const uint8_t * const end, uint32_t L) {
141	(void)end;
142	while (L >= `17`*`255`) {
143	lz4_fill16(dst);
144	dst += `16`;
145	L -= `16`*`255`;
146	}
147	lz4_fill16(dst);
148	//DRKTODO verify these modulos/divisions are optimally handled by clang
149	dst += L/`255`;
150	*dst++ = L%`255`;
151	return dst;
152	}
153
154	static inline uint32_t clamp(uint32_t x, uint32_t max) __attribute__((overloadable)) { return x > max ? max : x; }
155
156	static inline uint8_t copy_literal(uint8_t dst, const uint8_t * restrict src, uint32_t L) {
157	uint8_t *end = dst + L;
158	{ copy16(dst, src); dst += `16`; src += `16`; }
159	while (dst < end) { copy32(dst, src); dst += `32`; src += `32`; }
160	return end;
161	}
162
163	static uint8_t *lz4_emit_match(uint32_t L, uint32_t M, uint32_t D,
164	uint8_t * restrict dst,
165	const uint8_t * const end,
166	const uint8_t * restrict src) {
167	// The LZ4 encoding scheme requires that M is at least 4, because
168	// the actual value stored by the encoding is M - 4. Check this
169	// requirement for debug builds.
170	assert(M >= `4` && "LZ4 encoding requires that M is at least 4");
171	// Having checked that M >= 4, translate M by four.
172	M -= `4`;
173	// Similarly, we must have D < 216, because we use only two bytes
174	// to represent the value of D in the encoding.
175	assert(D <= USHRT_MAX && "LZ4 encoding requries that D can be stored in two bytes.");
176	// Construct the command byte by clamping both L and M to 0 ... 15
177	// and packing them into a single byte, and store it.
178	*dst++ = clamp(L, `15`) << `4` \| clamp(M, `15`);
179	// If L is 15 or greater, we need to encode extra literal length bytes.
180	if (L >= `15`) {
181	dst = lz4_store_length(dst, end, L - `15`);
182	if (dst == `0` \|\| dst + L >= end) return NULL;
183	}
184	// Copy the literal itself from src to dst.
185	dst = copy_literal(dst, src, L);
186	// Store match distance.
187	store2(dst, D); dst += `2`;
188	// If M is 15 or greater, we need to encode extra match length bytes.
189	if (M >= `15`) {
190	dst = lz4_store_length(dst, end, M - `15`);
191	if (dst == `0`) return NULL;
192	}
193	return dst;
194	}
195
196	/ #ifndef LZ4_EARLY_ABORT /
197	/ #define LZ4_EARLY_ABORT (1) /
198	/ #endif /
199
200	#if LZ4_EARLY_ABORT
201	int lz4_do_early_abort = `1`;
202	int lz4_early_aborts = `0`;
203	#define LZ4_EARLY_ABORT_EVAL (448)
204	#define LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR (20)
205	#endif /* LZ4_EARLY_ABORT */
206
207	void lz4_encode_2gb(uint8_t ** dst_ptr,
208	size_t dst_size,
209	const uint8_t ** src_ptr,
210	const uint8_t * src_begin,
211	size_t src_size,
212	lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES],
213	int skip_final_literals)
214	{
215	uint8_t dst = dst_ptr; // current output stream position
216	uint8_t *end = dst + dst_size - LZ4_GOFAST_SAFETY_MARGIN;
217	const uint8_t src = src_ptr; // current input stream literal to encode
218	const uint8_t *src_end = src + src_size - LZ4_GOFAST_SAFETY_MARGIN;
219	const uint8_t match_begin = `0`; // first byte of matched sequence*
220	const uint8_t match_end = `0`; // first byte after matched sequence*
221	#if LZ4_EARLY_ABORT
222	uint8_t * const dst_begin = dst;
223	uint32_t lz4_do_abort_eval = lz4_do_early_abort;
224	#endif
225
226	while (dst < end)
227	{
228	ptrdiff_t match_distance = `0`;
229	for (match_begin = src; match_begin < src_end; match_begin += `1`) {
230	const uint32_t pos = (uint32_t)(match_begin - src_begin);
231	const uint32_t w0 = load4(match_begin);
232	const uint32_t w1 = load4(match_begin + `1`);
233	const uint32_t w2 = load4(match_begin + `2`);
234	const uint32_t w3 = load4(match_begin + `3`);
235	const int i0 = lz4_hash(w0);
236	const int i1 = lz4_hash(w1);
237	const int i2 = lz4_hash(w2);
238	const int i3 = lz4_hash(w3);
239	const uint8_t *c0 = src_begin + hash_table[i0].offset;
240	const uint8_t *c1 = src_begin + hash_table[i1].offset;
241	const uint8_t *c2 = src_begin + hash_table[i2].offset;
242	const uint8_t *c3 = src_begin + hash_table[i3].offset;
243	const uint32_t m0 = hash_table[i0].word;
244	const uint32_t m1 = hash_table[i1].word;
245	const uint32_t m2 = hash_table[i2].word;
246	const uint32_t m3 = hash_table[i3].word;
247	hash_table[i0].offset = pos;
248	hash_table[i0].word = w0;
249	hash_table[i1].offset = pos + `1`;
250	hash_table[i1].word = w1;
251
252	hash_table[i2].offset = pos + `2`;
253	hash_table[i2].word = w2;
254	hash_table[i3].offset = pos + `3`;
255	hash_table[i3].word = w3;
256
257	match_distance = (match_begin - c0);
258	if (w0 == m0 && match_distance < `0x10000` && match_distance > `0`) {
259	match_end = match_begin + `4`;
260	goto EXPAND_FORWARD;
261	}
262
263	match_begin++;
264	match_distance = (match_begin - c1);
265	if (w1 == m1 && match_distance < `0x10000` && match_distance > `0`) {
266	match_end = match_begin + `4`;
267	goto EXPAND_FORWARD;
268	}
269
270	match_begin++;
271	match_distance = (match_begin - c2);
272	if (w2 == m2 && match_distance < `0x10000` && match_distance > `0`) {
273	match_end = match_begin + `4`;
274	goto EXPAND_FORWARD;
275	}
276
277	match_begin++;
278	match_distance = (match_begin - c3);
279	if (w3 == m3 && match_distance < `0x10000` && match_distance > `0`) {
280	match_end = match_begin + `4`;
281	goto EXPAND_FORWARD;
282	}
283
284	#if LZ4_EARLY_ABORT
285	//DRKTODO: Evaluate unrolling further. 2xunrolling had some modest benefits
286	if (lz4_do_abort_eval && ((pos) >= LZ4_EARLY_ABORT_EVAL)) {
287	ptrdiff_t dstd = dst - dst_begin;
288
289	if (dstd == `0`) {
290	lz4_early_aborts++;
291	return;
292	}
293
294	/ if (dstd >= pos) { /
295	/ return; /
296	/ } /
297	/ ptrdiff_t cbytes = pos - dstd; /
298	/ if ((cbytes * LZ4_EARLY_ABORT_MIN_COMPRESSION_FACTOR) > pos) { /
299	/ return; /
300	/ } /
301	lz4_do_abort_eval = `0`;
302	}
303	#endif
304	}
305
306	if (skip_final_literals) { src_ptr = src; dst_ptr = dst; return; } // do not emit the final literal sequence
307
308	// Emit a trailing literal that covers the remainder of the source buffer,
309	// if we can do so without exceeding the bounds of the destination buffer.
310	size_t src_remaining = src_end + LZ4_GOFAST_SAFETY_MARGIN - src;
311	if (src_remaining < `15`) {
312	*dst++ = (uint8_t)(src_remaining << `4`);
313	memcpy(dst, src, `16`); dst += src_remaining;
314	} else {
315	*dst++ = `0xf0`;
316	dst = lz4_store_length(dst, end, (uint32_t)(src_remaining - `15`));
317	if (dst == `0` \|\| dst + src_remaining >= end) return;
318	memcpy(dst, src, src_remaining); dst += src_remaining;
319	}
320	*dst_ptr = dst;
321	*src_ptr = src + src_remaining;
322	return;
323
324	EXPAND_FORWARD:
325
326	// Expand match forward
327	{
328	const uint8_t * ref_end = match_end - match_distance;
329	while (match_end < src_end)
330	{
331	size_t n = lz4_nmatch(LZ4_MATCH_SEARCH_LOOP_SIZE, ref_end, match_end);
332	if (n < LZ4_MATCH_SEARCH_LOOP_SIZE) { match_end += n; break; }
333	match_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
334	ref_end += LZ4_MATCH_SEARCH_LOOP_SIZE;
335	}
336	}
337
338	// Expand match backward
339	{
340	// match_begin_min = max(src_begin + match_distance,literal)
341	const uint8_t * match_begin_min = src_begin + match_distance;
342	match_begin_min = (match_begin_min < src)?src:match_begin_min;
343	const uint8_t * ref_begin = match_begin - match_distance;
344
345	while (match_begin > match_begin_min && ref_begin[-`1`] == match_begin[-`1`] ) { match_begin -= `1`; ref_begin -= `1`; }
346	}
347
348	// Emit match
349	dst = lz4_emit_match((uint32_t)(match_begin - src), (uint32_t)(match_end - match_begin), (uint32_t)match_distance, dst, end, src);
350	if (!dst) return;
351
352	// Update state
353	src = match_end;
354
355	// Update return values to include the last fully encoded match
356	*dst_ptr = dst;
357	*src_ptr = src;
358	}
359	}
360
361	#endif
362
363	size_t lz4raw_encode_buffer(uint8_t * __restrict dst_buffer, size_t dst_size,
364	const uint8_t * __restrict src_buffer, size_t src_size,
365	lz4_hash_entry_t hash_table[LZ4_COMPRESS_HASH_ENTRIES])
366	{
367	// Initialize hash table
368	const lz4_hash_entry_t HASH_FILL = { .offset = `0x80000000`, .word = `0x0` };
369
370	const uint8_t * src = src_buffer;
371	uint8_t * dst = dst_buffer;
372
373	// We need several blocks because our base function is limited to 2GB input
374	const size_t BLOCK_SIZE = `0x7ffff000`;
375	while (src_size > `0`)
376	{
377	//DRKTODO either implement pattern4 or figure out optimal unroll
378	//DRKTODO: bizarrely, with plain O3 the compiler generates a single
379	//DRKTODO: scalar STP per loop iteration with the stock loop
380	//DRKTODO If hand unrolled, it switches to NEON store pairs
381	// Reset hash table for each block
382	/ #if __STDC_HOSTED__ /
383	/ memset_pattern8(hash_table, &HASH_FILL, lz4_encode_scratch_size); /
384	/ #else /
385	/ for (int i=0;i<LZ4_COMPRESS_HASH_ENTRIES;i++) hash_table[i] = HASH_FILL; /
386	/ #endif /
387
388	for (int i=`0`;i<LZ4_COMPRESS_HASH_ENTRIES;) {
389	hash_table[i++] = HASH_FILL;
390	hash_table[i++] = HASH_FILL;
391	hash_table[i++] = HASH_FILL;
392	hash_table[i++] = HASH_FILL;
393	}
394
395	// Bytes to encode in this block
396	const size_t src_to_encode = src_size > BLOCK_SIZE ? BLOCK_SIZE : src_size;
397
398	// Run the encoder, only the last block emits final literals. Allows concatenation of encoded payloads.
399	// Blocks are encoded independently, so src_begin is set to each block origin instead of src_buffer
400	uint8_t * dst_start = dst;
401	const uint8_t * src_start = src;
402	lz4_encode_2gb(&dst, dst_size, &src, src, src_to_encode, hash_table, src_to_encode < src_size);
403
404	// Check progress
405	size_t dst_used = dst - dst_start;
406	size_t src_used = src - src_start; // src_used <= src_to_encode
407	if (src_to_encode == src_size && src_used < src_to_encode) return `0`; // FAIL to encode last block
408
409	// Note that there is a potential problem here in case of non compressible data requiring more blocks.
410	// We may end up here with src_used very small, or even 0, and will not be able to make progress during
411	// compression. We FAIL unless the length of literals remaining at the end is small enough.
412	if (src_to_encode < src_size && src_to_encode - src_used >= (`1`<<`16`)) return `0`; // FAIL too many literals
413
414	// Update counters (SRC and DST already have been updated)
415	src_size -= src_used;
416	dst_size -= dst_used;
417	}
418
419	return (size_t)(dst - dst_buffer); // bytes produced
420	}
421
422	typedef uint32_t lz4_uint128 __attribute__((ext_vector_type(`4`))) __attribute__((__aligned__(`1`)));
423
424	int lz4_decode(uint8_t ** dst_ptr,
425	uint8_t * dst_begin,
426	uint8_t * dst_end,
427	const uint8_t ** src_ptr,
428	const uint8_t * src_end)
429	{
430	uint8_t * dst = *dst_ptr;
431	const uint8_t * src = *src_ptr;
432
433	// Require dst_end > dst.
434	if (dst_end <= dst) goto OUT_FULL;
435
436	while (src < src_end)
437	{
438	// Keep last good position
439	*src_ptr = src;
440	*dst_ptr = dst;
441
442	uint8_t cmd = src++; // 1 byte encoding literal+(match-4) length: LLLLMMMM*
443	uint32_t literalLength = (cmd >> `4`) & `15`; // 0..15
444	uint32_t matchLength = `4` + (cmd & `15`); // 4..19
445
446	// extra bytes for literalLength
447	if (__improbable(literalLength == `15`))
448	{
449	uint8_t s;
450	do {
451	#if DEBUG_LZ4_DECODE_ERRORS
452	if (__improbable(src >= src_end)) printf("Truncated SRC literal length\n");
453	#endif
454	if (__improbable(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed)
455	s = *src++;
456	literalLength += s;
457	} while (__improbable(s == `255`));
458	}
459
460	// copy literal
461	#if DEBUG_LZ4_DECODE_ERRORS
462	if (__improbable(literalLength > (size_t)(src_end - src))) printf("Truncated SRC literal\n");
463	#endif
464	if (__improbable(literalLength > (size_t)(src_end - src))) goto IN_FAIL;
465	if (__improbable(literalLength > (size_t)(dst_end - dst))) {
466	// literal will take us past the end of the destination buffer,
467	// so we can only copy part of it.
468	literalLength = (uint32_t)(dst_end - dst);
469	memcpy(dst, src, literalLength);
470	dst += literalLength;
471	goto OUT_FULL;
472	}
473	memcpy(dst,src,literalLength);
474	src += literalLength;
475	dst += literalLength;
476
477	if (__improbable(src >= src_end)) goto OUT_FULL; // valid end of stream
478	#if DEBUG_LZ4_DECODE_ERRORS
479	if (__improbable(`2` > (size_t)(src_end - src))) printf("Truncated SRC distance\n");
480	#endif
481	if (__improbable(`2` > (size_t)(src_end - src))) goto IN_FAIL; // unexpected end of input (2 bytes needed)
482
483	//DRKTODO: this causes an alignment increase warning (legitimate?)
484	//DRKTODO: cast of char * to uint16_t*
485	#pragma clang diagnostic push
486	#pragma clang diagnostic ignored "-Wcast-align"
487
488	// match distance
489	uint64_t matchDistance = (const* uint16_t )src; // 0x0000 <= matchDistance <= 0xffff*
490	#pragma clang diagnostic pop
491	src += `2`;
492	#if DEBUG_LZ4_DECODE_ERRORS
493	if (matchDistance == `0`) printf("Invalid match distance D = 0\n");
494	#endif
495	if (__improbable(matchDistance == `0`)) goto IN_FAIL; // 0x0000 invalid
496	uint8_t * ref = dst - matchDistance;
497	#if DEBUG_LZ4_DECODE_ERRORS
498	if (__improbable(ref < dst_begin)) printf("Invalid reference D=0x%llx dst_begin=%p dst=%p dst_end=%p\n",matchDistance,dst_begin,dst,dst_end);
499	#endif
500	if (__improbable(ref < dst_begin)) goto OUT_FAIL; // out of range
501
502	// extra bytes for matchLength
503	if (__improbable(matchLength == `19`))
504	{
505	uint8_t s;
506	do {
507	#if DEBUG_LZ4_DECODE_ERRORS
508	if (__improbable(src >= src_end)) printf("Truncated SRC match length\n");
509	#endif
510	if (__improbable(src >= src_end)) goto IN_FAIL; // unexpected end of input (1 byte needed)
511	s = *src++;
512	matchLength += s;
513	} while (__improbable(s == `255`));
514	}
515
516	// copy match (may overlap)
517	if (__improbable(matchLength > (size_t)(dst_end - dst))) {
518	// match will take us past the end of the destination buffer,
519	// so we can only copy part of it.
520	matchLength = (uint32_t)(dst_end - dst);
521	for (uint32_t i=`0`; i<matchLength; ++i) dst[i] = ref[i];
522	dst += matchLength;
523	goto OUT_FULL;
524	}
525	for (uint64_t i=`0`;i<matchLength;i++) dst[i] = ref[i];
526	dst += matchLength;
527	}
528
529	// We reached the end of the input buffer after a full instruction
530	OUT_FULL:
531	// Or we reached the end of the output buffer
532	*dst_ptr = dst;
533	*src_ptr = src;
534	return `0`;
535
536	// Error conditions
537	OUT_FAIL:
538	IN_FAIL:
539	return `1`; // FAIL
540	}
541

Browse the source code of xnu/osfmk/vm/lz4.c