liblzma: Omit zero-skipping from ARM64 filter.

It has some complicated downsides and its usefulness is more limited than I originally thought. So this change is bad for certain very specific situations but a generic solution that works for other filters (and is otherwise better too) is planned anyway. And this way 7-Zip can use the same compatible filter for the .7z format. This is still marked as experimental with a new temporary Filter ID.
2024-04-04 12:36:23 +02:00 · 2022-12-01 18:51:52 +02:00 · 2022-12-01 18:51:52 +02:00 · f9ca7d4516
commit f9ca7d4516
parent 5baec3f0a9
2 changed files with 24 additions and 59 deletions
--- a/src/liblzma/api/lzma/bcj.h
+++ b/src/liblzma/api/lzma/bcj.h
@ -49,7 +49,7 @@
 	 * Filter for SPARC binaries.
 	 */
-#define LZMA_FILTER_ARM64       LZMA_VLI_C(0x3FDB87B33B27010B)
+#define LZMA_FILTER_ARM64       LZMA_VLI_C(0x3FDB87B33B27020B)
 	/**<
 	 * Filter for ARM64 binaries.
 	 *
--- a/src/liblzma/simple/arm64.c
+++ b/src/liblzma/simple/arm64.c
@ -6,13 +6,6 @@
 /// This converts ARM64 relative addresses in the BL and ADRP immediates
 /// to absolute values to increase redundancy of ARM64 code.
 ///
 /// Unlike the older BCJ filters, this handles zeros specially. This way
 /// the filter won't be counterproductive on Linux kernel modules, object
 /// files, and static libraries where the immediates are all zeros (to be
 /// filled later by a linker). Usually this has no downsides but with bad
 /// luck it can reduce the effectiveness of the filter and trying a different
 /// start offset can mitigate the problem.
 ///
 /// Converting B or ADR instructions was also tested but it's not useful.
 /// A majority of the jumps for the B instruction are very small (+/- 0xFF).
 /// These are typical for loops and if-statements. Encoding them to their
@ -30,20 +23,6 @@
 #include "simple_private.h"
 static uint32_t
 arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
 {
 	if (!is_encoder)
 		pc = 0U - pc;
 	uint32_t dest = src + pc;
 	if ((dest & mask) == 0)
 		dest = pc;
 	return dest;
 }
 static size_t
 arm64_code(void *simple lzma_attribute((__unused__)),
 		uint32_t now_pos, bool is_encoder,
@ -51,29 +30,15 @@ arm64_code(void *simple lzma_attribute((__unused__)),
 {
 	size_t i;
-	// Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower
+	// Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
 	// with auto-vectorization that is enabled by default with -O2.
 	// Even -Os, which doesn't use vectorization, produces faster code.
 	// Disabling vectorization with -O2 gives good speed (faster than -Os)
 	// and reasonable code size.
 	//
 	// Such vectorization bloat happens with -O2 when targeting ARM64 too
 	// but performance hasn't been tested.
 	//
 	// Clang 14 and 15 won't auto-vectorize this loop if the condition
 	// for ADRP is replaced with the commented-out version. However,
 	// at least Clang 14.0.6 doesn't generate as fast code with that
 	// condition. The commented-out code is also bigger.
 	//
 	// GCC 12.2 on x86-64 with -O2 produces good code with both versions
 	// of the ADRP if-statement although the single-branch version is
 	// slightly faster and smaller than the commented-out version.
 	// Speed is similar to non-vectorized clang -O2.
 #ifdef __clang__
 #	pragma clang loop vectorize(disable)
 #endif
 	for (i = 0; i + 4 <= size; i += 4) {
-		const uint32_t pc = (uint32_t)(now_pos + i);
+		uint32_t pc = (uint32_t)(now_pos + i);
 		uint32_t instr = read32le(buffer + i);
 		if ((instr >> 26) == 0x25) {
@ -87,27 +52,17 @@ arm64_code(void *simple lzma_attribute((__unused__)),
 			// so this is a compromise that slightly favors big
 			// files. With the full range only six bits of the 32
 			// need to match to trigger a conversion.
-			const uint32_t mask26 = 0x03FFFFFF;
+			const uint32_t src = instr;
 			const uint32_t src = instr & mask26;
 			instr = 0x94000000;
-			if (src == 0)
+			pc >>= 2;
-				continue;
+			if (!is_encoder)
 				pc = 0U - pc;
-			instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
+			instr |= (src + pc) & 0x03FFFFFF;
 					& mask26;
 			write32le(buffer + i, instr);
-/*
+		} else if ((instr & 0x9F000000) == 0x90000000) {
 		// This is a more readable version of the one below but this
 		// has two branches. It results in bigger and slower code.
 		} else if ((instr & 0x9FF00000) == 0x90000000
 				|| (instr & 0x9FF00000) == 0x90F00000) {
 */
 		// This is only a rotation, addition, and testing that
 		// none of the bits covered by the bitmask are set.
 		} else if (((((instr << 8) | (instr >> 24))
 				+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {
 			// ADRP instruction:
 			// Only values in the range +/-512 MiB are converted.
 			//
@ -120,15 +75,25 @@ arm64_code(void *simple lzma_attribute((__unused__)),
 			// range, nine bits of 32 need to match to trigger a
 			// conversion (two 10-bit match choices = 9 bits).
 			const uint32_t src = ((instr >> 29) & 3)
-					| ((instr >> 3) & 0x0003FFFC);
+					| ((instr >> 3) & 0x001FFFFC);
 			instr &= 0x9000001F;
-			if (src == 0)
+			// With the addition only one branch is needed to
 			// check the +/- range. This is usually false when
 			// processing ARM64 code so branch prediction will
 			// handle it well in terms of performance.
 			//
 			//if ((src & 0x001E0000) != 0
 			// && (src & 0x001E0000) != 0x001E0000)
 			if ((src + 0x00020000) & 0x001C0000)
 				continue;
-			const uint32_t dest = arm64_conv(
+			instr &= 0x9000001F;
 					src, pc >> 12, 0x3FFFF, is_encoder);
 			pc >>= 12;
 			if (!is_encoder)
 				pc = 0U - pc;
 			const uint32_t dest = src + pc;
 			instr |= (dest & 3) << 29;
 			instr |= (dest & 0x0003FFFC) << 3;
 			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;