liblzma: LZMA decoder improvements.

This adds macros for bittree decoding which prepares the code for alternative C versions and inline assembly.
2024-04-04 12:36:23 +02:00 · 2024-02-12 17:09:10 +02:00 · 2024-02-12 17:09:10 +02:00 · e0c0ee475c
commit e0c0ee475c
parent de5c5e4176
3 changed files with 210 additions and 200 deletions
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@ -24,9 +24,8 @@
 // Minimum number of input bytes to safely decode one LZMA symbol.
 // The worst case is that we decode 22 bits using probabilities and 26
-// direct bits. This may decode at maximum 20 bytes of input plus one
+// direct bits. This may decode at maximum 20 bytes of input.
-// extra byte after the final EOPM normalization.
+#define LZMA_IN_REQUIRED 20
 #define LZMA_IN_REQUIRED 21
 // Macros for (somewhat) size-optimized code.
@ -73,32 +72,22 @@ do { \
 	symbol = 1; \
 	rc_if_0(ld.choice) { \
 		rc_update_0(ld.choice); \
-		rc_bit(ld.low[pos_state][symbol], , ); \
+		rc_bittree3(ld.low[pos_state], \
-		rc_bit(ld.low[pos_state][symbol], , ); \
+				-LEN_LOW_SYMBOLS + MATCH_LEN_MIN); \
-		rc_bit(ld.low[pos_state][symbol], , ); \
+		target = symbol; \
 		target = symbol - LEN_LOW_SYMBOLS + MATCH_LEN_MIN; \
 	} else { \
 		rc_update_1(ld.choice); \
 		rc_if_0(ld.choice2) { \
 			rc_update_0(ld.choice2); \
-			rc_bit(ld.mid[pos_state][symbol], , ); \
+			rc_bittree3(ld.mid[pos_state], -LEN_MID_SYMBOLS \
-			rc_bit(ld.mid[pos_state][symbol], , ); \
+					+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS); \
-			rc_bit(ld.mid[pos_state][symbol], , ); \
+			target = symbol; \
 			target = symbol - LEN_MID_SYMBOLS \
 					+ MATCH_LEN_MIN + LEN_LOW_SYMBOLS; \
 		} else { \
 			rc_update_1(ld.choice2); \
-			rc_bit(ld.high[symbol], , ); \
+			rc_bittree8(ld.high, -LEN_HIGH_SYMBOLS \
 			rc_bit(ld.high[symbol], , ); \
 			rc_bit(ld.high[symbol], , ); \
 			rc_bit(ld.high[symbol], , ); \
 			rc_bit(ld.high[symbol], , ); \
 			rc_bit(ld.high[symbol], , ); \
 			rc_bit(ld.high[symbol], , ); \
 			rc_bit(ld.high[symbol], , ); \
 			target = symbol - LEN_HIGH_SYMBOLS \
 					+ MATCH_LEN_MIN \
-					+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; \
+					+ LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS); \
 			target = symbol; \
 		} \
 	} \
 } while (0)
@ -369,8 +358,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// If there is not enough room for another LZMA symbol
 		// go to Resumable mode.
-		if (rc_in_pos + LZMA_IN_REQUIRED > in_size
+		if (unlikely(rc_in_end - rc_in_ptr < LZMA_IN_REQUIRED
-			|| dict.pos == dict.limit)
+				|| dict.pos == dict.limit))
 			goto slow;
 		// Decode the first bit from the next LZMA symbol.
@ -390,64 +379,14 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
 					dict.pos, dict_get(&dict, 0));
 			symbol = 1;
 			if (is_literal_state(state)) {
 				// Decode literal without match byte.
-				// We need to decode 8 bits, so instead
+				rc_bittree8(probs, 0);
 				// of looping from 1 - 8, we unroll the
 				// loop for a speed optimization.
 				rc_bit(probs[symbol], , );
 				rc_bit(probs[symbol], , );
 				rc_bit(probs[symbol], , );
 				rc_bit(probs[symbol], , );
 				rc_bit(probs[symbol], , );
 				rc_bit(probs[symbol], , );
 				rc_bit(probs[symbol], , );
 				rc_bit(probs[symbol], , );
 			} else {
 				// Decode literal with match byte.
-				//
+				rc_matched_literal(probs,
-				// We store the byte we compare against
+						dict_get(&dict, rep0));
 				// ("match byte") to "len" to minimize the
 				// number of variables we need to store
 				// between decoder calls.
 				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
 				// The usage of "offset" allows omitting some
 				// branches, which should give tiny speed
 				// improvement on some CPUs. "offset" gets
 				// set to zero if match_bit didn't match.
 				offset = 0x100;
 				// Unroll the loop.
 				uint32_t match_bit;
 				uint32_t subcoder_index;
 #	define decode_with_match_bit \
 			match_bit = len & offset; \
 			subcoder_index = offset + match_bit + symbol; \
 			rc_bit(probs[subcoder_index], \
 					offset &= ~match_bit, \
 					offset &= match_bit)
 				decode_with_match_bit;
 				len <<= 1;
 				decode_with_match_bit;
 				len <<= 1;
 				decode_with_match_bit;
 				len <<= 1;
 				decode_with_match_bit;
 				len <<= 1;
 				decode_with_match_bit;
 				len <<= 1;
 				decode_with_match_bit;
 				len <<= 1;
 				decode_with_match_bit;
 				len <<= 1;
 				decode_with_match_bit;
 #	undef decode_match_bit
 			}
 			state = next_state[state];
@ -501,18 +440,8 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			// The next 6 bits determine how to decode the
 			// rest of the distance.
 			probs = coder->dist_slot[get_dist_state(len)];
 			symbol = 1;
-			rc_bit(probs[symbol], , );
+			rc_bittree6(probs, -DIST_SLOTS);
 			rc_bit(probs[symbol], , );
 			rc_bit(probs[symbol], , );
 			rc_bit(probs[symbol], , );
 			rc_bit(probs[symbol], , );
 			rc_bit(probs[symbol], , );
 			// Get rid of the highest bit that was needed for
 			// indexing of the probability array.
 			symbol -= DIST_SLOTS;
 			assert(symbol <= 63);
 			if (symbol < DIST_MODEL_START) {
@ -540,6 +469,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					assert(limit <= 5);
 					rep0 <<= limit;
 					assert(rep0 <= 96);
 					// -1 is fine, because we start
 					// decoding at probs[1], not probs[0].
 					// NOTE: This violates the C standard,
@ -553,106 +483,51 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					probs = coder->pos_special + rep0
 							- symbol - 1;
 					symbol = 1;
-					offset = 0;
+					offset = 1;
-					switch (limit) {
+					// Variable number (1-5) of bits
-					case 5:
+					// from a reverse bittree. This
-						assert(offset == 0);
+					// isn't worth manual unrolling.
-						rc_bit(probs[symbol], ,
+					do {
-							rep0 += 1U);
+						rc_bit_add_if_1(probs,
-						++offset;
+								rep0, offset);
-						--limit;
+						offset <<= 1;
-					case 4:
+					} while (--limit > 0);
 						rc_bit(probs[symbol], ,
 							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 3:
 						rc_bit(probs[symbol], ,
 							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 2:
 						rc_bit(probs[symbol], ,
 							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 1:
 						// We need "symbol" only for
 						// indexing the probability
 						// array, thus we can use
 						// rc_bit_last() here to
 						// omit the unneeded updating
 						// of "symbol".
 						rc_bit_last(probs[symbol], ,
 							rep0 += 1U << offset);
 					}
 				} else {
 					// The distance is >= 128. Decode the
 					// lower bits without probabilities
 					// except the lowest four bits.
 					assert(symbol >= 14);
 					assert(limit >= 6);
 					limit -= ALIGN_BITS;
 					assert(limit >= 2);
-					// Not worth manual unrolling
+					rc_direct(rep0, limit);
 					do {
 						rc_direct(rep0);
 					} while (--limit > 0);
 					// Decode the lowest four bits using
 					// probabilities.
 					rep0 <<= ALIGN_BITS;
-					symbol = 1;
+					rc_bittree_rev4(coder->pos_align);
 					rep0 += symbol;
-					rc_bit(coder->pos_align[symbol], ,
+					// If the end of payload marker (EOPM)
-							rep0 += 1);
+					// is detected, jump to the safe code.
-
+					// The EOPM handling isn't speed
-					rc_bit(coder->pos_align[symbol], ,
+					// critical at all.
-							rep0 += 2);
+					//
-
+					// A final normalization is needed
-					rc_bit(coder->pos_align[symbol], ,
+					// after the EOPM (there can be a
-							rep0 += 4);
+					// dummy byte to read in some cases).
-
+					// If the normalization was done here
-					// Like when distance [4, 127], we
+					// in the fast code, it would need to
-					// don't need "symbol" for anything
+					// be taken into account in the value
-					// other than indexing the probability
+					// of LZMA_IN_REQUIRED. Using the
-					// array.
+					// safe code allows keeping
-					rc_bit_last(
+					// LZMA_IN_REQUIRED as 20 instead of
-						coder->pos_align[symbol], ,
+					// 21.
-						rep0 += 8);
+					if (rep0 == UINT32_MAX)
-
+						goto eopm;
 					if (rep0 == UINT32_MAX) {
 						///////////////////////////
 						// End of payload marker //
 						///////////////////////////
 						// End of payload marker was
 						// found. It may only be
 						// present if
 						//   - uncompressed size is
 						//     unknown or
 						//   - after known uncompressed
 						//     size amount of bytes has
 						//     been decompressed and
 						//     caller has indicated
 						//     that EOPM might be used
 						//     (it's not allowed in
 						//     LZMA2).
 						if (!eopm_is_valid) {
 							ret = LZMA_DATA_ERROR;
 							goto out;
 						}
 						// LZMA1 stream with
 						// end-of-payload marker.
 						rc_normalize();
 						ret = rc_is_finished(rc)
 							? LZMA_STREAM_END
 							: LZMA_DATA_ERROR;
 						goto out;
 					}
 				}
 			}
@ -948,31 +823,48 @@ slow:
 					limit -= ALIGN_BITS;
 					assert(limit >= 2);
 	case SEQ_DIRECT:
-					do {
+					rc_direct_safe(rep0, limit,
-						rc_direct_safe(rep0,
+							SEQ_DIRECT);
 								SEQ_DIRECT);
 					} while (--limit > 0);
 					rep0 <<= ALIGN_BITS;
-					symbol = 1;
+					symbol = 0;
-
+					offset = 1;
 					offset = 0;
 	case SEQ_ALIGN:
 					do {
-						rc_bit_safe(coder->pos_align[
+						rc_bit_last_safe(
-								symbol], ,
+							coder->pos_align[
-							rep0 += 1U << offset,
+								offset
 								+ symbol],
 							,
 							symbol += offset,
 							SEQ_ALIGN);
-					} while (++offset < ALIGN_BITS);
+						offset <<= 1;
 					} while (offset < ALIGN_SIZE);
 					rep0 += symbol;
 					// End of payload marker
 					if (rep0 == UINT32_MAX) {
 						// End of payload marker was
 						// found. It may only be
 						// present if
 						//   - uncompressed size is
 						//     unknown or
 						//   - after known uncompressed
 						//     size amount of bytes has
 						//     been decompressed and
 						//     caller has indicated
 						//     that EOPM might be used
 						//     (it's not allowed in
 						//     LZMA2).
 eopm:
 						if (!eopm_is_valid) {
 							ret = LZMA_DATA_ERROR;
 							goto out;
 						}
 	case SEQ_EOPM:
 						// LZMA1 stream with
 						// end-of-payload marker.
 						rc_normalize_safe(SEQ_EOPM);
 						ret = rc_is_finished(rc)
 							? LZMA_STREAM_END
--- a/src/liblzma/rangecoder/range_common.h
+++ b/src/liblzma/rangecoder/range_common.h
@ -68,6 +68,10 @@
 ///
 /// I will be sticking to uint16_t unless some specific architectures
 /// are *much* faster (20-50 %) with uint32_t.
 ///
 /// Update in 2024: The branchless C and x86-64 assembly was written so that
 /// probability is assumed to be uint16_t. (In contrast, LZMA SDK 23.01
 /// assembly supports both types.)
 typedef uint16_t probability;
 #endif
--- a/src/liblzma/rangecoder/range_decoder.h
+++ b/src/liblzma/rangecoder/range_decoder.h
@ -16,6 +16,17 @@
 #include "range_common.h"
 // Negative RC_BIT_MODEL_TOTAL but the lowest RC_MOVE_BITS are flipped.
 // This is useful for updating probability variables in branchless decoding:
 //
 //     uint32_t decoded_bit = ...;
 //     probability tmp = RC_BIT_MODEL_OFFSET;
 //     tmp &= decoded_bit - 1;
 //     prob -= (prob + tmp) >> RC_MOVE_BITS;
 #define RC_BIT_MODEL_OFFSET \
 	((UINT32_C(1) << RC_MOVE_BITS) - 1 - RC_BIT_MODEL_TOTAL)
 typedef struct {
 	uint32_t range;
 	uint32_t code;
@ -52,7 +63,8 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
 /// variables 'in' and 'in_size' to be defined.
 #define rc_to_local(range_decoder, in_pos) \
 	lzma_range_decoder rc = range_decoder; \
-	size_t rc_in_pos = (in_pos); \
+	const uint8_t *rc_in_ptr = in + (in_pos); \
 	const uint8_t *rc_in_end = in + in_size; \
 	uint32_t rc_bound
@ -60,7 +72,7 @@ rc_read_init(lzma_range_decoder *rc, const uint8_t *restrict in,
 #define rc_from_local(range_decoder, in_pos) \
 do { \
 	range_decoder = rc; \
-	in_pos = rc_in_pos; \
+	in_pos = (size_t)(rc_in_ptr - in); \
 } while (0)
@ -85,7 +97,7 @@ do { \
 do { \
 	if (rc.range < RC_TOP_VALUE) { \
 		rc.range <<= RC_SHIFT_BITS; \
-		rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+		rc.code = (rc.code << RC_SHIFT_BITS) | *rc_in_ptr++; \
 	} \
 } while (0)
@ -98,12 +110,12 @@ do { \
 #define rc_normalize_safe(seq) \
 do { \
 	if (rc.range < RC_TOP_VALUE) { \
-		if (unlikely(rc_in_pos == in_size)) { \
+		if (rc_in_ptr == rc_in_end) { \
 			coder->sequence = seq; \
 			goto out; \
 		} \
 		rc.range <<= RC_SHIFT_BITS; \
-		rc.code = (rc.code << RC_SHIFT_BITS) | in[rc_in_pos++]; \
+		rc.code = (rc.code << RC_SHIFT_BITS) | *rc_in_ptr++; \
 	} \
 } while (0)
@ -133,10 +145,14 @@ do { \
 /// Update the range decoder state and the used probability variable to
 /// match a decoded bit of 0.
 ///
 /// The x86-64 assemly uses the commented method but it seems that,
 /// at least on x86-64, the first version is slightly faster as C code.
 #define rc_update_0(prob) \
 do { \
 	rc.range = rc_bound; \
 	prob += (RC_BIT_MODEL_TOTAL - (prob)) >> RC_MOVE_BITS; \
 	/* prob -= ((prob) + RC_BIT_MODEL_OFFSET) >> RC_MOVE_BITS; */ \
 } while (0)
@ -192,19 +208,121 @@ do { \
 		symbol = (symbol << 1) + 1; action1, \
 		seq);
-/// Decode a bit without using a probability.
+// Unroll fixed-sized bittree decoding.
-#define rc_direct(dest) \
+//
 // A compile-time constant in final_add can be used to get rid of the high bit
 // from symbol that is used for the array indexing (1U << bittree_bits).
 // final_add may also be used to add offset to the result (LZMA length
 // decoder does that).
 //
 // The reason to have final_add here is that in the asm code the addition
 // can be done for free: in x86-64 there is SBB instruction with -1 as
 // the immediate value, and final_add is combined with that value.
 #define rc_bittree_bit(prob) \
 	rc_bit(prob, , )
 #define rc_bittree3(probs, final_add) \
 do { \
 	symbol = 1; \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	symbol += (uint32_t)(final_add); \
 } while (0)
 #define rc_bittree6(probs, final_add) \
 do { \
 	symbol = 1; \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	symbol += (uint32_t)(final_add); \
 } while (0)
 #define rc_bittree8(probs, final_add) \
 do { \
 	symbol = 1; \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	rc_bittree_bit(probs[symbol]); \
 	symbol += (uint32_t)(final_add); \
 } while (0)
 // Fixed-sized reverse bittree
 #define rc_bittree_rev4(probs) \
 do { \
 	symbol = 0; \
 	rc_bit_last(probs[symbol + 1], , symbol += 1); \
 	rc_bit_last(probs[symbol + 2], , symbol += 2); \
 	rc_bit_last(probs[symbol + 4], , symbol += 4); \
 	rc_bit_last(probs[symbol + 8], , symbol += 8); \
 } while (0)
 // Decode one bit from variable-sized reverse bittree.
 // The loop is done in the code that uses this macro.
 #define rc_bit_add_if_1(probs, dest, value_to_add_if_1) \
 	rc_bit(probs[symbol], \
 		, \
 		dest += value_to_add_if_1);
 // Matched literal
 #define decode_with_match_bit \
 		t_match_byte <<= 1; \
 		t_match_bit = t_match_byte & t_offset; \
 		t_subcoder_index = t_offset + t_match_bit + symbol; \
 		rc_bit(probs[t_subcoder_index], \
 				t_offset &= ~t_match_bit, \
 				t_offset &= t_match_bit)
 #define rc_matched_literal(probs_base_var, match_byte) \
 do { \
 	uint32_t t_match_byte = (match_byte); \
 	uint32_t t_match_bit; \
 	uint32_t t_subcoder_index; \
 	uint32_t t_offset = 0x100; \
 	symbol = 1; \
 	decode_with_match_bit; \
 	decode_with_match_bit; \
 	decode_with_match_bit; \
 	decode_with_match_bit; \
 	decode_with_match_bit; \
 	decode_with_match_bit; \
 	decode_with_match_bit; \
 	decode_with_match_bit; \
 } while (0)
 /// Decode a bit without using a probability.
 //
 // NOTE: GCC 13 and Clang/LLVM 16 can, at least on x86-64, optimize the bound
 // calculation to use an arithmetic right shift so there's no need to provide
 // the alternative code which, according to C99/C11/C23 6.3.1.3-p3 isn't
 // perfectly portable: rc_bound = (uint32_t)((int32_t)rc.code >> 31);
 #define rc_direct(dest, count_var) \
 do { \
 	dest = (dest << 1) + 1; \
 	rc_normalize(); \
 	rc.range >>= 1; \
 	rc.code -= rc.range; \
 	rc_bound = UINT32_C(0) - (rc.code >> 31); \
 	dest += rc_bound; \
 	rc.code += rc.range & rc_bound; \
-	dest = (dest << 1) + (rc_bound + 1); \
+} while (--count_var > 0)
 } while (0)
-#define rc_direct_safe(dest, seq) \
+
 #define rc_direct_safe(dest, count_var, seq) \
 do { \
 	rc_normalize_safe(seq); \
 	rc.range >>= 1; \
@ -212,10 +330,6 @@ do { \
 	rc_bound = UINT32_C(0) - (rc.code >> 31); \
 	rc.code += rc.range & rc_bound; \
 	dest = (dest << 1) + (rc_bound + 1); \
-} while (0)
+} while (--count_var > 0)
 // NOTE: No macros are provided for bittree decoding. It seems to be simpler
 // to just write them open in the code.
 #endif