diff --git a/src/liblzma/api/lzma/bcj.h b/src/liblzma/api/lzma/bcj.h index 10719b88..b68b6bac 100644 --- a/src/liblzma/api/lzma/bcj.h +++ b/src/liblzma/api/lzma/bcj.h @@ -49,13 +49,14 @@ * Filter for SPARC binaries. */ -#define LZMA_FILTER_ARM64 LZMA_VLI_C(0x3FDB87B33B27000B) - /**< - * Filter for ARM64 binaries. - * - * \note In contrast to the other BCJ filters, this uses - * its own options structure, lzma_options_arm64. - */ +#define LZMA_FILTER_ARM64 LZMA_VLI_C(0x3FDB87B33B27010B) + /**< + * Filter for ARM64 binaries. + * + * \note THIS IS AN EXPERIMENTAL VERSION WHICH WILL + * STILL CHANGE! FILES CREATED WITH THIS + * WILL NOT BE SUPPORTED IN THE FUTURE! + */ /** * \brief Options for BCJ filters (except ARM64) @@ -95,29 +96,3 @@ typedef struct { uint32_t start_offset; } lzma_options_bcj; - -/** - * \brief Options for the ARM64 filter - * - * This filter never changes the size of the data. - * Specifying options is mandatory. - */ -typedef struct { - /** - * \brief How wide range of relative addresses are converted - * - * The ARM64 BL instruction has 26-bit immediate field that encodes - * a relative address as a multiple of four bytes, so the effective - * range is 2^28 bytes (+/-128 MiB). - * - * If width is 28 bits (LZMA_ARM64_WIDTH_MAX), then all BL - * instructions will be converted. This has a downside of some - * false matches that make compression worse. The best value - * depends on the input file and the differences can be significant; - * with large executables the maximum value is sometimes the best. - */ - uint32_t width; -# define LZMA_ARM64_WIDTH_MIN 18 -# define LZMA_ARM64_WIDTH_MAX 28 -# define LZMA_ARM64_WIDTH_DEFAULT 26 -} lzma_options_arm64; diff --git a/src/liblzma/common/filter_common.c b/src/liblzma/common/filter_common.c index 52401aa9..0113035b 100644 --- a/src/liblzma/common/filter_common.c +++ b/src/liblzma/common/filter_common.c @@ -98,13 +98,13 @@ static const struct { }, #endif #if defined(HAVE_ENCODER_ARM64) || defined(HAVE_DECODER_ARM64) - { - .id = LZMA_FILTER_ARM64, - .options_size = sizeof(lzma_options_arm64), - .non_last_ok = true, - .last_ok = false, - .changes_size = false, - }, + { + .id = LZMA_FILTER_ARM64, + .options_size = sizeof(lzma_options_bcj), + .non_last_ok = true, + .last_ok = false, + .changes_size = false, + }, #endif #if defined(HAVE_ENCODER_SPARC) || defined(HAVE_DECODER_SPARC) { diff --git a/src/liblzma/common/filter_decoder.c b/src/liblzma/common/filter_decoder.c index 37af0cc2..b031ac62 100644 --- a/src/liblzma/common/filter_decoder.c +++ b/src/liblzma/common/filter_decoder.c @@ -100,12 +100,12 @@ static const lzma_filter_decoder decoders[] = { }, #endif #ifdef HAVE_DECODER_ARM64 - { - .id = LZMA_FILTER_ARM64, - .init = &lzma_simple_arm64_decoder_init, - .memusage = NULL, - .props_decode = &lzma_arm64_props_decode, - }, + { + .id = LZMA_FILTER_ARM64, + .init = &lzma_simple_arm64_decoder_init, + .memusage = NULL, + .props_decode = &lzma_simple_props_decode, + }, #endif #ifdef HAVE_DECODER_SPARC { diff --git a/src/liblzma/common/filter_encoder.c b/src/liblzma/common/filter_encoder.c index ec9e969d..1db78446 100644 --- a/src/liblzma/common/filter_encoder.c +++ b/src/liblzma/common/filter_encoder.c @@ -127,15 +127,14 @@ static const lzma_filter_encoder encoders[] = { }, #endif #ifdef HAVE_ENCODER_ARM64 - { - .id = LZMA_FILTER_ARM64, - .init = &lzma_simple_arm64_encoder_init, - .memusage = NULL, - .block_size = NULL, - .props_size_get = NULL, - .props_size_fixed = 1, - .props_encode = &lzma_arm64_props_encode, - }, + { + .id = LZMA_FILTER_ARM64, + .init = &lzma_simple_arm64_encoder_init, + .memusage = NULL, + .block_size = NULL, + .props_size_get = &lzma_simple_props_size, + .props_encode = &lzma_simple_props_encode, + }, #endif #ifdef HAVE_ENCODER_SPARC { diff --git a/src/liblzma/simple/arm64.c b/src/liblzma/simple/arm64.c index 0437f736..05b4be29 100644 --- a/src/liblzma/simple/arm64.c +++ b/src/liblzma/simple/arm64.c @@ -3,6 +3,22 @@ /// \file arm64.c /// \brief Filter for ARM64 binaries /// +/// This converts ARM64 relative addresses in the BL and ADRP immediates +/// to absolute values to increase redundancy of ARM64 code. +/// +/// Unlike the older BCJ filters, this handles zeros specially. This way +/// the filter won't be counterproductive on Linux kernel modules, object +/// files, and static libraries where the immediates are all zeros (to be +/// filled later by a linker). Usually this has no downsides but with bad +/// luck it can reduce the effectiveness of the filter and trying a different +/// start offset can mitigate the problem. +/// +/// Converting B or ADR instructions was also tested but it's not useful. +/// A majority of the jumps for the B instruction are very small (+/- 0xFF). +/// These are typical for loops and if-statements. Encoding them to their +/// absolute address reduces redundancy since many of the small relative +/// jump values are repeated, but very few of the absolute addresses are. +// // Authors: Lasse Collin // Jia Tan // @@ -13,126 +29,110 @@ #include "simple_private.h" -#ifdef HAVE_ENCODER_ARM64 -# include "simple_encoder.h" -#endif -#ifdef HAVE_DECODER_ARM64 -# include "simple_decoder.h" -#endif +static uint32_t +arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder) +{ + if (!is_encoder) + pc = 0U - pc; + uint32_t dest = src + pc; + if ((dest & mask) == 0) + dest = pc; -// In ARM64, there are two main branch instructions. -// bl - branch and link: Calls a function and stores the return address. -// b - branch: Jumps to a location, but does not store a return address. -// -// After some benchmarking, it was determined that only the bl instruction -// is beneficial for compression. A majority of the jumps for the b -// instruction are very small (+/- 0xFF). These are typical for loops -// and if-statements. Encoding them to their absolute address reduces -// redundancy since many of the small relative jump values are repeated, -// but very few of the absolute addresses are. -// -// Thus, only the bl instruction will be encoded and decoded. -// The bl instruction is 32 bits in size. The highest 6 bits contain -// the opcode (10 0101 == 0x25) and the remaining 26 bits are -// the immediate value. The immediate is a signed integer that -// encodes the target address as a multiple of four bytes so -// the range is +/-128 MiB. - -// The 6-bit op code for the bl instruction in ARM64 -#define ARM64_BL_OPCODE 0x25 - -// Once the 26-bit immediate is multiple by four, the address is 28 bits -// with the two lowest bits being zero. This mask is used to clear the -// unwanted bits. -#define ADDR28_MASK 0x0FFFFFFCU - - -typedef struct { - uint32_t sign_bit; - uint32_t sign_mask; -} lzma_simple_arm64; + return dest; +} static size_t -arm64_code(void *simple_ptr, uint32_t now_pos, bool is_encoder, +arm64_code(void *simple lzma_attribute((__unused__)), + uint32_t now_pos, bool is_encoder, uint8_t *buffer, size_t size) { - const lzma_simple_arm64 *simple = simple_ptr; - const uint32_t sign_bit = simple->sign_bit; - const uint32_t sign_mask = simple->sign_mask; - size_t i; + + // Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower + // with auto-vectorization that is enabled by default with -O2. + // Even -Os, which doesn't use vectorization, produces faster code. + // Disabling vectorization with -O2 gives good speed (faster than -Os) + // and reasonable code size. + // + // Such vectorization bloat happens with -O2 when targeting ARM64 too + // but performance hasn't been tested. + // + // Clang 14 and 15 won't auto-vectorize this loop if the condition + // for ADRP is replaced with the commented-out version. However, + // at least Clang 14.0.6 doesn't generate as fast code with that + // condition. The commented-out code is also bigger. + // + // GCC 12.2 on x86-64 with -O2 produces good code with both versions + // of the ADRP if-statement although the single-branch version is + // slightly faster and smaller than the commented-out version. + // Speed is similar to non-vectorized clang -O2. +#ifdef __clang__ +# pragma clang loop vectorize(disable) +#endif for (i = 0; i + 4 <= size; i += 4) { - if ((buffer[i + 3] >> 2) == ARM64_BL_OPCODE) { - // Get the relative 28-bit address from - // the 26-bit immediate. - uint32_t src = read32le(buffer + i); - src <<= 2; - src &= ADDR28_MASK; + const uint32_t pc = (uint32_t)(now_pos + i); + uint32_t instr = read32le(buffer + i); - // When the conversion width isn't the maximum, - // check that the highest bits are either all zero - // or all one. - if ((src & sign_mask) != 0 - && (src & sign_mask) != sign_mask) - continue; + if ((instr >> 26) == 0x25) { + // BL instruction: + // The full 26-bit immediate is converted. + // The range is +/-128 MiB. + // + // Using the full range is helps quite a lot with + // big executables. Smaller range would reduce false + // positives in non-code sections of the input though + // so this is a compromise that slightly favors big + // files. With the full range only six bits of the 32 + // need to match to trigger a conversion. + const uint32_t mask26 = 0x03FFFFFF; + const uint32_t src = instr & mask26; + instr = 0x94000000; - // Some files like static libraries or Linux kernel - // modules have the immediate value filled with - // zeros. Converting these placeholder values would - // make compression worse so don't touch them. if (src == 0) continue; - const uint32_t pc = now_pos + (uint32_t)(i); + instr |= arm64_conv(src, pc >> 2, mask26, is_encoder) + & mask26; + write32le(buffer + i, instr); - uint32_t dest; - if (is_encoder) - dest = pc + src; - else - dest = src - pc; +/* + // This is a more readable version of the one below but this + // has two branches. It results in bigger and slower code. + } else if ((instr & 0x9FF00000) == 0x90000000 + || (instr & 0x9FF00000) == 0x90F00000) { +*/ + // This is only a rotation, addition, and testing that + // none of the bits covered by the bitmask are set. + } else if (((((instr << 8) | (instr >> 24)) + + (0x10000000 - 0x90)) & 0xE000009F) == 0) { + // ADRP instruction: + // Only values in the range +/-512 MiB are converted. + // + // Using less than the full +/-4 GiB range reduces + // false positives on non-code sections of the input + // while being excellent for executables up to 512 MiB. + // The positive effect of ADRP conversion is smaller + // than that of BL but it also doesn't hurt so much in + // non-code sections of input because, with +/-512 MiB + // range, nine bits of 32 need to match to trigger a + // conversion (two 10-bit match choices = 9 bits). + const uint32_t src = ((instr >> 29) & 3) + | ((instr >> 3) & 0x0003FFFC); + instr &= 0x9000001F; - dest &= ADDR28_MASK; + if (src == 0) + continue; - // Sign-extend negative values or unset sign bits - // from positive values. - if (dest & sign_bit) - dest |= sign_mask; - else - dest &= ~sign_mask; + const uint32_t dest = arm64_conv( + src, pc >> 12, 0x3FFFF, is_encoder); - assert((dest & sign_mask) == 0 - || (dest & sign_mask) == sign_mask); - - // Since also the decoder will ignore src values - // of 0, we must ensure that nothing is ever encoded - // to 0. This is achieved by encoding such values - // as pc instead. When decoding, pc will be first - // converted to 0 which we will catch here and fix. - if (dest == 0) { - // We cannot get here if pc is zero because - // then src would need to be zero too but we - // already ensured that src != 0. - assert((pc & ADDR28_MASK) != 0); - dest = is_encoder ? pc : 0U - pc; - dest &= ADDR28_MASK; - - if (dest & sign_bit) - dest |= sign_mask; - else - dest &= ~sign_mask; - } - - assert((dest & sign_mask) == 0 - || (dest & sign_mask) == sign_mask); - assert((dest & ~ADDR28_MASK) == 0); - - // Construct and store the modified 32-bit instruction. - dest >>= 2; - dest |= (uint32_t)ARM64_BL_OPCODE << 26; - write32le(buffer + i, dest); + instr |= (dest & 3) << 29; + instr |= (dest & 0x0003FFFC) << 3; + instr |= (0U - (dest & 0x00020000)) & 0x00E00000; + write32le(buffer + i, instr); } } @@ -140,81 +140,12 @@ arm64_code(void *simple_ptr, uint32_t now_pos, bool is_encoder, } -#ifdef HAVE_ENCODER_ARM64 -extern lzma_ret -lzma_arm64_props_encode(const void *options, uint8_t *out) -{ - const lzma_options_arm64 *const opt = options; - - if (opt->width < LZMA_ARM64_WIDTH_MIN - || opt->width > LZMA_ARM64_WIDTH_MAX) - return LZMA_OPTIONS_ERROR; - - out[0] = (uint8_t)(opt->width - LZMA_ARM64_WIDTH_MIN); - return LZMA_OK; -} -#endif - - -#ifdef HAVE_DECODER_ARM64 -extern lzma_ret -lzma_arm64_props_decode(void **options, const lzma_allocator *allocator, - const uint8_t *props, size_t props_size) -{ - if (props_size != 1) - return LZMA_OPTIONS_ERROR; - - if (props[0] > LZMA_ARM64_WIDTH_MAX - LZMA_ARM64_WIDTH_MIN) - return LZMA_OPTIONS_ERROR; - - lzma_options_arm64 *opt = lzma_alloc(sizeof(lzma_options_arm64), - allocator); - if (opt == NULL) - return LZMA_MEM_ERROR; - - opt->width = props[0] + LZMA_ARM64_WIDTH_MIN; - *options = opt; - return LZMA_OK; - -} -#endif - - static lzma_ret arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator, const lzma_filter_info *filters, bool is_encoder) { - if (filters[0].options == NULL) - return LZMA_PROG_ERROR; - - const lzma_options_arm64 *opt = filters[0].options; - if (opt->width < LZMA_ARM64_WIDTH_MIN - || opt->width > LZMA_ARM64_WIDTH_MAX) - return LZMA_OPTIONS_ERROR; - - const lzma_ret ret = lzma_simple_coder_init(next, allocator, filters, - &arm64_code, sizeof(lzma_simple_arm64), 4, 4, - is_encoder, false); - - if (ret == LZMA_OK) { - lzma_simple_coder *coder = next->coder; - lzma_simple_arm64 *simple = coder->simple; - - // This will be used to detect if the value, after - // conversion has been done, is negative. The location - // of the sign bit depends on the conversion width. - simple->sign_bit = UINT32_C(1) << (opt->width - 1); - - // When conversion width isn't the maximum, the highest - // bits must all be either zero or one, that is, they - // all are copies of the sign bit. This mask is used to - // (1) detect if input value is in the range specified - // by the conversion width and (2) clearing or setting - // the high bits after conversion (integers can wrap around). - simple->sign_mask = (UINT32_C(1) << 28) - simple->sign_bit; - } - - return ret; + return lzma_simple_coder_init(next, allocator, filters, + &arm64_code, 0, 4, 4, is_encoder, true); } diff --git a/src/liblzma/simple/simple_decoder.h b/src/liblzma/simple/simple_decoder.h index 188d8370..bed8d37a 100644 --- a/src/liblzma/simple/simple_decoder.h +++ b/src/liblzma/simple/simple_decoder.h @@ -19,8 +19,4 @@ extern lzma_ret lzma_simple_props_decode( void **options, const lzma_allocator *allocator, const uint8_t *props, size_t props_size); -extern lzma_ret lzma_arm64_props_decode( - void **options, const lzma_allocator *allocator, - const uint8_t *props, size_t props_size); - #endif diff --git a/src/liblzma/simple/simple_encoder.h b/src/liblzma/simple/simple_encoder.h index 10828f8f..1cee4823 100644 --- a/src/liblzma/simple/simple_encoder.h +++ b/src/liblzma/simple/simple_encoder.h @@ -20,6 +20,4 @@ extern lzma_ret lzma_simple_props_size(uint32_t *size, const void *options); extern lzma_ret lzma_simple_props_encode(const void *options, uint8_t *out); -extern lzma_ret lzma_arm64_props_encode(const void *options, uint8_t *out); - #endif diff --git a/src/xz/args.c b/src/xz/args.c index 6b388a1e..639c811a 100644 --- a/src/xz/args.c +++ b/src/xz/args.c @@ -374,7 +374,7 @@ parse_real(args_info *args, int argc, char **argv) case OPT_ARM64: coder_add_filter(LZMA_FILTER_ARM64, - options_arm64(optarg)); + options_bcj(optarg)); break; case OPT_SPARC: diff --git a/src/xz/message.c b/src/xz/message.c index 345049e6..d4883709 100644 --- a/src/xz/message.c +++ b/src/xz/message.c @@ -1034,9 +1034,16 @@ message_filters_to_str(char buf[FILTERS_STR_SIZE], } case LZMA_FILTER_ARM64: { - const lzma_options_arm64 *opt = filters[i].options; - my_snprintf(&pos, &left, "arm64=width=%" PRIu32, - opt->width); + // FIXME TODO: Merge with the above generic BCJ list + // once the Filter ID is changed to the final value. + const lzma_options_bcj *opt = filters[i].options; + my_snprintf(&pos, &left, "arm64"); + + // Show the start offset only when really needed. + if (opt != NULL && opt->start_offset != 0) + my_snprintf(&pos, &left, "=start=%" PRIu32, + opt->start_offset); + break; } diff --git a/src/xz/options.c b/src/xz/options.c index 899bd842..0c1ee221 100644 --- a/src/xz/options.c +++ b/src/xz/options.c @@ -224,45 +224,6 @@ options_bcj(const char *str) } -/////////// -// ARM64 // -/////////// - -enum { - OPT_WIDTH, -}; - - -static void -set_arm64(void *options, unsigned key, uint64_t value, - const char *valuestr lzma_attribute((__unused__))) -{ - lzma_options_arm64 *opt = options; - switch (key) { - case OPT_WIDTH: - opt->width = value; - break; - } -} - - -extern lzma_options_arm64 * -options_arm64(const char *str) -{ - static const option_map opts[] = { - { "width", NULL, LZMA_ARM64_WIDTH_MIN, LZMA_ARM64_WIDTH_MAX }, - { NULL, NULL, 0, 0 } - }; - - lzma_options_arm64 *options = xmalloc(sizeof(lzma_options_arm64)); - options->width = LZMA_ARM64_WIDTH_DEFAULT; - - parse_options(str, opts, &set_arm64, options); - - return options; -} - - ////////// // LZMA // ////////// diff --git a/src/xz/options.h b/src/xz/options.h index d56adc8a..61ec8d58 100644 --- a/src/xz/options.h +++ b/src/xz/options.h @@ -24,13 +24,6 @@ extern lzma_options_delta *options_delta(const char *str); extern lzma_options_bcj *options_bcj(const char *str); -/// \brief Parser for ARM64 options -/// -/// \return Pointer to allocated options structure. -/// Doesn't return on error. -extern lzma_options_arm64 *options_arm64(const char *str); - - /// \brief Parser for LZMA options /// /// \return Pointer to allocated options structure.