1
0
Fork 0
mirror of https://git.tukaani.org/xz.git synced 2024-04-04 12:36:23 +02:00

liblzma: Range decoder: Add branchless C code.

It's used only for basic bittrees and fixed-size reverse bittrees
because those showed a clear benefit on x86-64 with GCC and Clang.
The other methods were more mixed and thus are commented out but
they should be tested on other archs.
This commit is contained in:
Lasse Collin 2024-02-12 17:09:10 +02:00
parent e290a72d6d
commit cba2edc991

View file

@ -340,4 +340,80 @@ do { \
dest = (dest << 1) + (rc_bound + 1); \ dest = (dest << 1) + (rc_bound + 1); \
} while (--count_var > 0) } while (--count_var > 0)
//////////////////
// Branchless C //
//////////////////
/// Decode a bit using a branchless method. This reduces the number of
/// mispredicted branches and thus can improve speed.
#define rc_c_bit(prob, action_bit, action_neg) \
do { \
probability *p = &(prob); \
rc_normalize(); \
rc_bound = (rc.range >> RC_BIT_MODEL_TOTAL_BITS) * *p; \
uint32_t rc_mask = rc.code >= rc_bound; /* rc_mask = decoded bit */ \
action_bit; /* action when rc_mask is 0 or 1 */ \
/* rc_mask becomes 0 if bit is 0 and 0xFFFFFFFF if bit is 1: */ \
rc_mask = 0U - rc_mask; \
rc.range &= rc_mask; /* If bit 0: set rc.range = 0 */ \
rc_bound ^= rc_mask; \
rc_bound -= rc_mask; /* If bit 1: rc_bound = 0U - rc_bound */ \
rc.range += rc_bound; \
rc_bound &= rc_mask; \
rc.code += rc_bound; \
action_neg; /* action when rc_mask is 0 or 0xFFFFFFFF */ \
rc_mask = ~rc_mask; /* If bit 0: all bits are set in rc_mask */ \
rc_mask &= RC_BIT_MODEL_OFFSET; \
*p -= (*p + rc_mask) >> RC_MOVE_BITS; \
} while (0)
// TODO: Testing on x86-64 gives the impression that only the main bittrees are
// worth the branchless C code. It should be tested on other archs for which
// there isn't assembly code in this file.
// The new symbol is computed as "(symbol << 1) + rc_mask" instead of
// "(symbol << 1) | rc_mask". The addition form allows use of x86 LEA or
// RISC-V SH1ADD instructions without requiring anything from the compiler.
// Compilers might produce the same from the OR form too, but only if they
// work out that rc_mask is 0 or 1.
//
/// Branchless version of rc_bittree_bit: decode one bit with rc_c_bit
/// using the probability prob, then shift symbol (a variable in the
/// enclosing scope) left by one and add the decoded bit to it.
/// No action_neg is needed, thus the last macro argument is empty.
#undef rc_bittree_bit
#define rc_bittree_bit(prob) \
	rc_c_bit(prob, symbol = (symbol << 1) + rc_mask, )
/// Branchless version of rc_bittree_rev4: decode four bits with rc_c_bit
/// and store them in symbol (a variable in the enclosing scope) so that
/// the first decoded bit becomes bit 0 and the last one becomes bit 3.
///
/// \param probs  Expression that yields the probability array. It is
///               indexed with symbol + 1, symbol + 2, symbol + 4, and
///               symbol + 8, so indices up to 15 can be accessed.
///               The expression is expanded four times and thus must
///               not have side effects.
///
/// probs is wrapped in parentheses so that an argument like "base + n"
/// is subscripted as a whole instead of binding the subscript only to
/// its last operand.
#undef rc_bittree_rev4
#define rc_bittree_rev4(probs) \
do { \
	symbol = 0; \
	/* rc_mask is 0 or 1 in action_bit; shift it to the bit's position. */ \
	rc_c_bit((probs)[symbol + 1], symbol += rc_mask, ); \
	rc_c_bit((probs)[symbol + 2], symbol += rc_mask << 1, ); \
	rc_c_bit((probs)[symbol + 4], symbol += rc_mask << 2, ); \
	rc_c_bit((probs)[symbol + 8], symbol += rc_mask << 3, ); \
} while (0)
// TODO: Test performance on platforms for which there is no assembly code.
/*
#undef rc_bit_add_if_1
#define rc_bit_add_if_1(probs, dest, value_to_add_if_1) \
rc_c_bit(probs[symbol], \
symbol = (symbol << 1) + rc_mask, \
dest += (value_to_add_if_1) & rc_mask)
*/
// TODO: Test on platforms for which there is no assembly code.
/*
#undef decode_with_match_bit
#define decode_with_match_bit \
t_match_byte <<= 1; \
t_match_bit = t_match_byte & t_offset; \
t_subcoder_index = t_offset + t_match_bit + symbol; \
rc_c_bit(probs[t_subcoder_index], \
symbol = (symbol << 1) + rc_mask, \
t_offset &= ~t_match_bit ^ rc_mask)
*/
#endif #endif