liblzma: Optimize LZ decoder slightly.

Now extra buffer space is reserved so that repeating bytes for any single match will never need to copy from two places (both the beginning and the end of the buffer). This simplifies dict_repeat() and helps a little with speed. This seems to reduce .lzma decompression time by about 2 %, so with .xz and CRC the improvement could be slightly less. Still, the small things add up.
2024-04-04 12:36:23 +02:00 · 2024-02-12 17:09:10 +02:00 · 2024-02-12 17:09:10 +02:00 · f3872a5947
commit f3872a5947
parent eb518446e5
3 changed files with 88 additions and 60 deletions
--- a/src/liblzma/lz/lz_decoder.c
+++ b/src/liblzma/lz/lz_decoder.c
@ -53,9 +53,10 @@ typedef struct {
 static void
 lz_decoder_reset(lzma_coder *coder)
 {
-	coder->dict.pos = 0;
+	coder->dict.pos = 2 * LZ_DICT_REPEAT_MAX;
 	coder->dict.full = 0;
-	coder->dict.buf[coder->dict.size - 1] = '\0';
+	coder->dict.buf[2 * LZ_DICT_REPEAT_MAX - 1] = '\0';
+	coder->dict.has_wrapped = false;
 	coder->dict.need_reset = false;
 	return;
 }
@ -69,8 +70,15 @@ decode_buffer(lzma_coder *coder,
 {
 	while (true) {
 		// Wrap the dictionary if needed.
-		if (coder->dict.pos == coder->dict.size)
-			coder->dict.pos = 0;
+		if (coder->dict.pos == coder->dict.size) {
+			// See the comment of #define LZ_DICT_REPEAT_MAX.
+			coder->dict.pos = LZ_DICT_REPEAT_MAX;
+			coder->dict.has_wrapped = true;
+			memcpy(coder->dict.buf, coder->dict.buf
+						+ coder->dict.size
+						- LZ_DICT_REPEAT_MAX,
+					LZ_DICT_REPEAT_MAX);
+		}

 		// Store the current dictionary position. It is needed to know
 		// where to start copying to the out[] buffer.
@ -252,21 +260,31 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 	// dictionary to the output buffer, since applications are
 	// recommended to give aligned buffers to liblzma.
 	//
+	// Account for the 2 * LZ_DICT_REPEAT_MAX bytes of extra space
+	// that will be added to get alloc_size.
+	//
 	// Avoid integer overflow.
-	if (lz_options.dict_size > SIZE_MAX - 15)
+	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
 		return LZMA_MEM_ERROR;

 	lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));

+	// Reserve extra space as explained in the comment
+	// of #define LZ_DICT_REPEAT_MAX.
+	const size_t alloc_size
+			= lz_options.dict_size + 2 * LZ_DICT_REPEAT_MAX;
+
 	// Allocate and initialize the dictionary.
-	if (coder->dict.size != lz_options.dict_size) {
+	if (coder->dict.size != alloc_size) {
 		lzma_free(coder->dict.buf, allocator);
-		coder->dict.buf
-				= lzma_alloc(lz_options.dict_size, allocator);
+		coder->dict.buf = lzma_alloc(alloc_size, allocator);
 		if (coder->dict.buf == NULL)
 			return LZMA_MEM_ERROR;

-		coder->dict.size = lz_options.dict_size;
+		// NOTE: Yes, alloc_size, not lz_options.dict_size. The way
+		// coder->dict.full is updated will take care that we will
+		// still reject distances larger than lz_options.dict_size.
+		coder->dict.size = alloc_size;
 	}

 	lz_decoder_reset(next->coder);
@ -279,9 +297,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 		const size_t copy_size = my_min(lz_options.preset_dict_size,
 				lz_options.dict_size);
 		const size_t offset = lz_options.preset_dict_size - copy_size;
-		memcpy(coder->dict.buf, lz_options.preset_dict + offset,
+		memcpy(coder->dict.buf + coder->dict.pos,
+				lz_options.preset_dict + offset,
 				copy_size);
-		coder->dict.pos = copy_size;
+
+		// dict.pos isn't zero after lz_decoder_reset().
+		coder->dict.pos += copy_size;
 		coder->dict.full = copy_size;
 	}

--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@ -16,10 +16,28 @@
 #include "common.h"


+/// Maximum length of a match rounded up to a nice power of 2 which is
+/// a good size for aligned memcpy(). The allocated dictionary buffer will
+/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
+///
+/// (1) Every time the decoder reaches the end of the dictionary buffer,
+///     the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
+///     This way dict_repeat() will only need to copy from one place,
+///     never from both the end and beginning of the buffer.
+///
+///     (2) The other LZ_DICT_REPEAT_MAX bytes are kept as a buffer between
+///     the oldest byte still in the dictionary and the current write
+///     position. This way dict_repeat(dict, dict->size - 1, &len)
+///     won't need memmove() as the copying cannot overlap.
+///
+/// Note that memcpy() still cannot be used if distance < len.
+///
+/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
+#define LZ_DICT_REPEAT_MAX 288
+
+
 typedef struct {
-	/// Pointer to the dictionary buffer. It can be an allocated buffer
-	/// internal to liblzma, or it can a be a buffer given by the
-	/// application when in single-call mode (not implemented yet).
+	/// Pointer to the dictionary buffer.
 	uint8_t *buf;

 	/// Write position in dictionary. The next byte will be written to
@ -34,9 +52,16 @@ typedef struct {
 	/// Write limit
 	size_t limit;

-	/// Size of the dictionary
+	/// Allocated size of buf. This is 2 * LZ_DICT_REPEAT_MAX bytes
+	/// larger than the actual dictionary size. This is enforced by
+	/// how the value for "full" is set; it can be at most
+	/// "size - 2 * LZ_DICT_REPEAT_MAX".
 	size_t size;

+	/// True once the dictionary has become full and the writing position
+	/// has been wrapped in decode_buffer() in lz_decoder.c.
+	bool has_wrapped;
+
 	/// True when dictionary should be reset before decoding more data.
 	bool need_reset;

@ -102,7 +127,16 @@ static inline uint8_t
 dict_get(const lzma_dict *const dict, const uint32_t distance)
 {
 	return dict->buf[dict->pos - distance - 1
-			+ (distance < dict->pos ? 0 : dict->size)];
+			+ (distance < dict->pos
+				? 0 : dict->size - LZ_DICT_REPEAT_MAX)];
+}
+
+
+/// Optimized version of dict_get(dict, 0)
+static inline uint8_t
+dict_get0(const lzma_dict *const dict)
+{
+	return dict->buf[dict->pos - 1];
 }


@ -131,50 +165,27 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
 	uint32_t left = my_min(dict_avail, *len);
 	*len -= left;

+	size_t back = dict->pos - distance - 1;
+	if (distance >= dict->pos)
+		back += dict->size - LZ_DICT_REPEAT_MAX;
+
 	// Repeat a block of data from the history. Because memcpy() is faster
 	// than copying byte by byte in a loop, the copying process gets split
-	// into three cases.
+	// into two cases.
 	if (distance < left) {
 		// Source and target areas overlap, thus we can't use
 		// memcpy() nor even memmove() safely.
 		do {
-			dict->buf[dict->pos] = dict_get(dict, distance);
-			++dict->pos;
+			dict->buf[dict->pos++] = dict->buf[back++];
 		} while (--left > 0);
-
-	} else if (distance < dict->pos) {
-		// The easiest and fastest case
-		memcpy(dict->buf + dict->pos,
-				dict->buf + dict->pos - distance - 1,
-				left);
-		dict->pos += left;
-
 	} else {
-		// The bigger the dictionary, the more rare this
-		// case occurs. We need to "wrap" the dict, thus
-		// we might need two memcpy() to copy all the data.
-		assert(dict->full == dict->size);
-		const uint32_t copy_pos
-				= dict->pos - distance - 1 + dict->size;
-		uint32_t copy_size = dict->size - copy_pos;
-
-		if (copy_size < left) {
-			memmove(dict->buf + dict->pos, dict->buf + copy_pos,
-					copy_size);
-			dict->pos += copy_size;
-			copy_size = left - copy_size;
-			memcpy(dict->buf + dict->pos, dict->buf, copy_size);
-			dict->pos += copy_size;
-		} else {
-			memmove(dict->buf + dict->pos, dict->buf + copy_pos,
-					left);
+		memcpy(dict->buf + dict->pos, dict->buf + back, left);
 		dict->pos += left;
 	}
-	}

 	// Update how full the dictionary is.
-	if (dict->full < dict->pos)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;

 	return unlikely(*len != 0);
 }
@ -185,8 +196,8 @@ dict_put(lzma_dict *dict, uint8_t byte)
 {
 	dict->buf[dict->pos++] = byte;

-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
 }


@ -198,11 +209,7 @@ dict_put_safe(lzma_dict *dict, uint8_t byte)
 	if (dict->pos == dict->limit)
 		return true;

-	dict->buf[dict->pos++] = byte;
-
-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
-
+	dict_put(dict, byte);
 	return false;
 }

@ -226,8 +233,8 @@ dict_write(lzma_dict *restrict dict, const uint8_t *restrict in,
 	*left -= lzma_bufcpy(in, in_pos, in_size,
 			dict->buf, &dict->pos, dict->limit);

-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;

 	return;
 }
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@ -360,7 +360,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			// lc params.
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
-					dict.pos, dict_get(&dict, 0));
+					dict.pos, dict_get0(&dict));

 			if (is_literal_state(state)) {
 				update_literal_normal(state);
@ -685,7 +685,7 @@ slow:

 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
-					dict.pos, dict_get(&dict, 0));
+					dict.pos, dict_get0(&dict));
 			symbol = 1;

 			if (is_literal_state(state)) {