From 849d0f282a6a890c5cf5a0e0f02980b12d9ebb0f Mon Sep 17 00:00:00 2001 From: Chenxi Mao Date: Tue, 9 Jan 2024 17:23:11 +0800 Subject: [PATCH] Speed up CRC32 calculation on ARM64 The CRC32 instructions in ARM64 can calculate the CRC32 result for 8 bytes in a single operation, making the use of ARM64 instructions much faster compared to the general CRC32 algorithm. Optimized CRC32 will be enabled if ARM64 has CRC extension running on Linux. Signed-off-by: Chenxi Mao --- CMakeLists.txt | 1 + src/liblzma/check/Makefile.inc | 3 +- src/liblzma/check/crc32_aarch64.h | 109 ++++++++++++++++++++++++++++++ src/liblzma/check/crc32_fast.c | 5 +- src/liblzma/check/crc64_fast.c | 5 +- src/liblzma/check/crc_common.h | 16 +++-- 6 files changed, 130 insertions(+), 9 deletions(-) create mode 100644 src/liblzma/check/crc32_aarch64.h diff --git a/CMakeLists.txt b/CMakeLists.txt index c5573d7b..0cb08fc7 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -230,6 +230,7 @@ add_library(liblzma src/liblzma/check/check.h src/liblzma/check/crc_common.h src/liblzma/check/crc_x86_clmul.h + src/liblzma/check/crc32_aarch64.h src/liblzma/common/block_util.c src/liblzma/common/common.c src/liblzma/common/common.h diff --git a/src/liblzma/check/Makefile.inc b/src/liblzma/check/Makefile.inc index acff40c3..e7f87c85 100644 --- a/src/liblzma/check/Makefile.inc +++ b/src/liblzma/check/Makefile.inc @@ -15,7 +15,8 @@ liblzma_la_SOURCES += \ check/check.c \ check/check.h \ check/crc_common.h \ - check/crc_x86_clmul.h + check/crc_x86_clmul.h \ + check/crc32_aarch64.h if COND_SMALL liblzma_la_SOURCES += check/crc32_small.c diff --git a/src/liblzma/check/crc32_aarch64.h b/src/liblzma/check/crc32_aarch64.h new file mode 100644 index 00000000..77b14af4 --- /dev/null +++ b/src/liblzma/check/crc32_aarch64.h @@ -0,0 +1,109 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file crc32_aarch64.c +/// \brief CRC32 calculation with aarch64 optimization +// +// Authors: Chenxi Mao +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// +#ifdef LZMA_CRC_CRC32_AARCH64_H +# error crc_arm64_clmul.h was included twice. +#endif +#define LZMA_CRC_CRC32_AARCH64_H +#include +// EDG-based compilers (Intel's classic compiler and compiler for E2K) can +// define __GNUC__ but the attribute must not be used with them. +// The new Clang-based ICX needs the attribute. +// +// NOTE: Build systems check for this too, keep them in sync with this. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) +# define crc_attr_target \ + __attribute__((__target__("+crc"))) +#else +# define crc_attr_target +#endif +#ifdef BUILDING_CRC32_AARCH64 +crc_attr_target +crc_attr_no_sanitize_address +static uint32_t +crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + while ((uintptr_t)(buf) & 7) { + crc = __builtin_aarch64_crc32b(crc, *buf); + buf++; + size--; + } + for (;size>=8;size-=8,buf+=8) { + crc = __builtin_aarch64_crc32x(crc, aligned_read64le(buf)); + } + for (;size>0;size--,buf++) + crc = __builtin_aarch64_crc32b(crc, *buf); + return ~crc; +} +#endif +#ifdef BUILDING_CRC64_AARCH64 +//FIXME: there is no crc64_arch_optimized implementation, +// to make compiler happy, add crc64_generic here. +#ifdef WORDS_BIGENDIAN +# define A1(x) ((x) >> 56) +#else +# define A1 A +#endif +crc_attr_target +crc_attr_no_sanitize_address +static uint64_t +crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc) +{ + crc = ~crc; + +#ifdef WORDS_BIGENDIAN + crc = bswap64(crc); +#endif + + if (size > 4) { + while ((uintptr_t)(buf) & 3) { + crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc); + --size; + } + + const uint8_t *const limit = buf + (size & ~(size_t)(3)); + size &= (size_t)(3); + + while (buf < limit) { +#ifdef WORDS_BIGENDIAN + const uint32_t tmp = (uint32_t)(crc >> 32) + ^ aligned_read32ne(buf); +#else + const uint32_t tmp = (uint32_t)crc + ^ aligned_read32ne(buf); +#endif + buf += 4; + + crc = lzma_crc64_table[3][A(tmp)] + ^ lzma_crc64_table[2][B(tmp)] + ^ S32(crc) + ^ lzma_crc64_table[1][C(tmp)] + ^ lzma_crc64_table[0][D(tmp)]; + } + } + + while (size-- != 0) + crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc); + +#ifdef WORDS_BIGENDIAN + crc = bswap64(crc); +#endif + + return ~crc; +} +#endif +static inline bool +is_arch_extension_supported(void) +{ + return (getauxval(AT_HWCAP) & HWCAP_CRC32)!=0; +} + diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c index cf7d75da..07d5afb1 100644 --- a/src/liblzma/check/crc32_fast.c +++ b/src/liblzma/check/crc32_fast.c @@ -15,9 +15,12 @@ #include "check.h" #include "crc_common.h" -#ifdef CRC_X86_CLMUL +#if defined(CRC_X86_CLMUL) # define BUILDING_CRC32_CLMUL # include "crc_x86_clmul.h" +#elif defined(CRC32_ARM64) +# define BUILDING_CRC32_AARCH64 +# include "crc32_aarch64.h" #endif diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c index eb1a4ae4..cb5d3e4c 100644 --- a/src/liblzma/check/crc64_fast.c +++ b/src/liblzma/check/crc64_fast.c @@ -14,9 +14,12 @@ #include "check.h" #include "crc_common.h" -#ifdef CRC_X86_CLMUL +#if defined(CRC_X86_CLMUL) # define BUILDING_CRC64_CLMUL # include "crc_x86_clmul.h" +#elif defined(CRC32_ARM64) +# define BUILDING_CRC64_AARCH64 +# include "crc32_aarch64.h" #endif diff --git a/src/liblzma/check/crc_common.h b/src/liblzma/check/crc_common.h index 417d88bb..7c7f098d 100644 --- a/src/liblzma/check/crc_common.h +++ b/src/liblzma/check/crc_common.h @@ -52,29 +52,33 @@ #undef CRC_GENERIC #undef CRC_ARCH_OPTIMIZED #undef CRC_X86_CLMUL +#undef CRC32_ARM64 #undef CRC_USE_IFUNC #undef CRC_USE_GENERIC_FOR_SMALL_INPUTS -// If CLMUL cannot be used then only the generic slice-by-eight (CRC32) -// or slice-by-four (CRC64) is built. -#if !defined(HAVE_USABLE_CLMUL) -# define CRC_GENERIC 1 - // If CLMUL is allowed unconditionally in the compiler options then the // generic version can be omitted. Note that this doesn't work with MSVC // as I don't know how to detect the features here. // // NOTE: Keep this this in sync with crc32_table.c. -#elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \ +#if (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \ || (defined(__e2k__) && __iset__ >= 6) # define CRC_ARCH_OPTIMIZED 1 # define CRC_X86_CLMUL 1 +#elif (defined(__aarch64__)) +# define CRC_ARCH_OPTIMIZED 1 +# define CRC32_ARM64 1 +// If CLMUL cannot be used then only the generic slice-by-eight (CRC32) +// or slice-by-four (CRC64) is built. +#elif !defined(HAVE_USABLE_CLMUL) +# define CRC_GENERIC 1 // Otherwise build both and detect at runtime which version to use. #else # define CRC_GENERIC 1 # define CRC_ARCH_OPTIMIZED 1 # define CRC_X86_CLMUL 1 +# define CRC32_ARM64 1 # ifdef HAVE_FUNC_ATTRIBUTE_IFUNC # define CRC_USE_IFUNC 1