From f644473a211394447824ea00518d0a214ff3f7f2 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 14 Nov 2022 21:34:57 +0200 Subject: [PATCH] liblzma: Add fast CRC64 for 32/64-bit x86 using SSSE3 + SSE4.1 + CLMUL. It also works on E2K as it supports these intrinsics. On x86-64 runtime detection is used so the code keeps working on older processors too. A CLMUL-only build can be done by using -msse4.1 -mpclmul in CFLAGS and this will reduce the library size since the generic implementation and its 8 KiB lookup table will be omitted. On 32-bit x86 this isn't used by default for now because by default on 32-bit x86 the separate assembly file crc64_x86.S is used. If --disable-assembler is used then this new CLMUL code is used the same way as on 64-bit x86. However, a CLMUL-only build (-msse4.1 -mpclmul) won't omit the 8 KiB lookup table on 32-bit x86 due to a currently-missing check for disabled assembler usage. The configure.ac check should be such that the code won't be built if something in the toolchain doesn't support it but --disable-clmul-crc option can be used to unconditionally disable this feature. CLMUL speeds up decompression of files that have compressed very well (assuming CRC64 is used as a check type). It is know that the CLMUL code is significantly slower than the generic code for tiny inputs (especially 1-8 bytes but up to 16 bytes). If that is a real-world problem then there is already a commented-out variant that uses the generic version for small inputs. Thanks to Ilya Kurdyukov for the original patch which was derived from a white paper from Intel [1] (published in 2009) and public domain code from [2] (released in 2016). [1] https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf [2] https://github.com/rawrunprotected/crc --- CMakeLists.txt | 26 +- INSTALL | 12 + configure.ac | 59 ++++- src/liblzma/check/crc64_fast.c | 449 +++++++++++++++++++++++++++++++- src/liblzma/check/crc64_table.c | 21 +- 5 files changed, 554 insertions(+), 13 deletions(-) diff --git a/CMakeLists.txt b/CMakeLists.txt index 11f3a055..ec44bac3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,8 +49,10 @@ cmake_minimum_required(VERSION 3.13...3.16 FATAL_ERROR) include(CMakePushCheckState) +include(CheckIncludeFile) include(CheckSymbolExists) include(CheckStructHasMember) +include(CheckCSourceCompiles) include(cmake/tuklib_integer.cmake) include(cmake/tuklib_cpucores.cmake) include(cmake/tuklib_physmem.cmake) @@ -402,14 +404,16 @@ check_c_source_compiles(" cmake_pop_check_state() tuklib_add_definition_if(liblzma HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR) +# cpuid.h +check_include_file(cpuid.h HAVE_CPUID_H) +tuklib_add_definition_if(liblzma HAVE_CPUID_H) + # immintrin.h: -include(CheckIncludeFile) check_include_file(immintrin.h HAVE_IMMINTRIN_H) if(HAVE_IMMINTRIN_H) target_compile_definitions(liblzma PRIVATE HAVE_IMMINTRIN_H) # SSE2 intrinsics: - include(CheckCSourceCompiles) check_c_source_compiles(" #include int main(void) @@ -421,6 +425,24 @@ if(HAVE_IMMINTRIN_H) " HAVE__MM_MOVEMASK_EPI8) tuklib_add_definition_if(liblzma HAVE__MM_MOVEMASK_EPI8) + + # CLMUL intrinsic: + check_c_source_compiles(" + #include + #if defined(__e2k__) && __iset__ < 6 + # error + #endif + #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) + __attribute__((__target__(\"ssse3,sse4.1,pclmul\"))) + #endif + __m128i my_clmul(__m128i a, __m128i b) + { + return _mm_clmulepi64_si128(a, b, 0); + } + int main(void) { return 0; } + " + HAVE_USABLE_CLMUL) + tuklib_add_definition_if(liblzma HAVE_USABLE_CLMUL) endif() # Support -fvisiblity=hidden when building shared liblzma. diff --git a/INSTALL b/INSTALL index 2c94ecea..bf1e9c31 100644 --- a/INSTALL +++ b/INSTALL @@ -370,6 +370,18 @@ XZ Utils Installation pre-i686 systems, you may want to disable the assembler code. + --disable-clmul-crc + Disable the use carryless multiplication for CRC + calculation even if compiler support for it is detected. + The code uses runtime detection of SSSE3, SSE4.1, and + CLMUL instructions on x86. On 32-bit x86 this currently + is used only if --disable-assembler is used (this might + be fixed in the future). The code works on E2K too. + + If using compiler options that unconditionally allow the + required extensions (-msse4.1 -mpclmul) then runtime + detection isn't used and the generic code is omitted. + --enable-unaligned-access Allow liblzma to use unaligned memory access for 16-bit, 32-bit, and 64-bit loads and stores. This should be diff --git a/configure.ac b/configure.ac index ddc82a3b..60a63825 100644 --- a/configure.ac +++ b/configure.ac @@ -370,6 +370,16 @@ esac AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86) +############# +# CLMUL CRC # +############# + +AC_ARG_ENABLE([clmul-crc], AS_HELP_STRING([--disable-clmul-crc], + [Do not use carryless multiplication for CRC calculation + even if support for it is detected.]), + [], [enable_clmul_crc=yes]) + + ##################### # Size optimization # ##################### @@ -733,8 +743,9 @@ AC_CHECK_HEADERS([fcntl.h limits.h sys/time.h], [], [AC_MSG_ERROR([Required header file(s) are missing.])]) -# This allows the use of the intrinsic functions if they are available. -AC_CHECK_HEADERS([immintrin.h]) +# immintrin.h allows the use of the intrinsic functions if they are available. +# cpuid.h may be used for detecting x86 processor features at runtime. +AC_CHECK_HEADERS([immintrin.h cpuid.h]) ############################################################################### @@ -874,6 +885,50 @@ AC_CHECK_DECL([_mm_movemask_epi8], #include #endif]) +# For faster CRC on 32/64-bit x86 and E2K (see also crc64_fast.c): +# +# - Check for the CLMUL intrinsic _mm_clmulepi64_si128 in . +# +# - Check that __attribute__((__target__("ssse3,sse4.1,pclmul"))) works +# together with _mm_clmulepi64_si128 from . The attribute +# was added in GCC 4.4 but some GCC 4.x versions don't allow intrinsics +# with it. Exception: it must be not be used with EDG-based compilers +# like ICC and the compiler on E2K. +# +# If everything above is supported, runtime detection will be used to keep the +# binaries working on systems that don't support the required extensions. +AC_MSG_CHECKING([if _mm_clmulepi64_si128 is usable]) +if test "x$enable_clmul_crc" = xno ; then + AC_MSG_RESULT([no, --disable-clmul-crc was used]) +else + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +#include + +// CLMUL works on older E2K instruction set but it is slow due to emulation. +#if defined(__e2k__) && __iset__ < 6 +# error +#endif + +// Intel's old compiler (ICC) can define __GNUC__ but the attribute must not +// be used with it. The new Clang-based ICX needs the attribute. +// Checking for !defined(__EDG__) catches ICC and other EDG-based compilers. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) +__attribute__((__target__("ssse3,sse4.1,pclmul"))) +#endif +__m128i my_clmul(__m128i a, __m128i b) +{ + return _mm_clmulepi64_si128(a, b, 0); +} + ]])], [ + AC_DEFINE([HAVE_USABLE_CLMUL], [1], + [Define to 1 if _mm_clmulepi64_si128 is usable. + See configure.ac for details.]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +fi + # Check for sandbox support. If one is found, set enable_sandbox=found. case $enable_sandbox in auto | capsicum) diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c index cf18ebc5..e3cbf1b1 100644 --- a/src/liblzma/check/crc64_fast.c +++ b/src/liblzma/check/crc64_fast.c @@ -3,11 +3,25 @@ /// \file crc64.c /// \brief CRC64 calculation /// -/// Calculate the CRC64 using the slice-by-four algorithm. This is the same -/// idea that is used in crc32_fast.c, but for CRC64 we use only four tables +/// There are two methods in this file. crc64_generic uses the +/// the slice-by-four algorithm. This is the same idea that is +/// used in crc32_fast.c, but for CRC64 we use only four tables /// instead of eight to avoid increasing CPU cache usage. +/// +/// crc64_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions. +/// It was derived from +/// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +/// and the public domain code from https://github.com/rawrunprotected/crc +/// (URLs were checked on 2022-11-07). +/// +/// FIXME: Builds for 32-bit x86 use crc64_x86.S by default instead +/// of this file and thus CLMUL version isn't available on 32-bit x86 +/// unless configured with --disable-assembler. Even then the lookup table +/// isn't omitted in crc64_table.c since it doesn't know that assembly +/// code has been disabled. // -// Author: Lasse Collin +// Authors: Lasse Collin +// Ilya Kurdyukov // // This file has been put into the public domain. // You can do whatever you want with this file. @@ -15,6 +29,54 @@ /////////////////////////////////////////////////////////////////////////////// #include "check.h" + +#undef CRC_GENERIC +#undef CRC_CLMUL +#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS + +// If CLMUL cannot be used then only the generic slice-by-four is built. +#if !defined(HAVE_USABLE_CLMUL) +# define CRC_GENERIC 1 + +// If CLMUL is allowed unconditionally in the compiler options then the +// generic version can be omitted. Note that this doesn't work with MSVC +// as I don't know how to detect the features here. +// +// NOTE: Keep this this in sync with crc64_table.c. +#elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \ + || (defined(__e2k__) && __iset__ >= 6) +# define CRC_CLMUL 1 + +// Otherwise build both and detect at runtime which version to use. +#else +# define CRC_GENERIC 1 +# define CRC_CLMUL 1 + +/* + // The generic code is much faster with 1-8-byte inputs and has + // similar performance up to 16 bytes at least in microbenchmarks + // (it depends on input buffer alignment too). If both versions are + // built, this #define will use the generic version for inputs up to + // 16 bytes and CLMUL for bigger inputs. It saves a little in code + // size since the special cases for 0-16-byte inputs will be omitted + // from the CLMUL code. +# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1 +*/ + +# if defined(_MSC_VER) +# include +# elif defined(HAVE_CPUID_H) +# include +# endif +#endif + + +///////////////////////////////// +// Generic slice-by-four CRC64 // +///////////////////////////////// + +#ifdef CRC_GENERIC + #include "crc_macros.h" @@ -26,8 +88,8 @@ // See the comments in crc32_fast.c. They aren't duplicated here. -extern LZMA_API(uint64_t) -lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) +static uint64_t +crc64_generic(const uint8_t *buf, size_t size, uint64_t crc) { crc = ~crc; @@ -71,3 +133,380 @@ lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) return ~crc; } +#endif + + +///////////////////// +// x86 CLMUL CRC64 // +///////////////////// + +#ifdef CRC_CLMUL + +#include + + +/* +// These functions were used to generate the constants +// at the top of crc64_clmul(). +static uint64_t +calc_lo(uint64_t poly) +{ + uint64_t a = poly; + uint64_t b = 0; + + for (unsigned i = 0; i < 64; ++i) { + b = (b >> 1) | (a << 63); + a = (a >> 1) ^ (a & 1 ? poly : 0); + } + + return b; +} + +static uint64_t +calc_hi(uint64_t poly, uint64_t a) +{ + for (unsigned i = 0; i < 64; ++i) + a = (a >> 1) ^ (a & 1 ? poly : 0); + + return a; +} +*/ + + +#define MASK_L(in, mask, r) \ + r = _mm_shuffle_epi8(in, mask) + +#define MASK_H(in, mask, r) \ + r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign)) + +#define MASK_LH(in, mask, low, high) \ + MASK_L(in, mask, low); \ + MASK_H(in, mask, high) + + +// EDG-based compilers (Intel's classic compiler and compiler for E2K) can +// define __GNUC__ but the attribute must not be used with them. +// The new Clang-based ICX needs the attribute. +// +// NOTE: Build systems check for this too, keep them in sync with this. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) +__attribute__((__target__("ssse3,sse4.1,pclmul"))) +#endif +static uint64_t +crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) +{ + // The prototypes of the intrinsics use signed types while most of + // the values are treated as unsigned here. These warnings in this + // function have been checked and found to be harmless so silence them. +#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__) +# pragma GCC diagnostic push +# pragma GCC diagnostic ignored "-Wsign-conversion" +# pragma GCC diagnostic ignored "-Wconversion" +#endif + +#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS + // The code assumes that there is at least one byte of input. + if (size == 0) + return crc; +#endif + + // const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial + const uint64_t p = 0x92d8af2baf0e1e85; // (poly << 1) | 1 + const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1 + const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1) + const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2) + const __m128i vfold0 = _mm_set_epi64x(p, mu); + const __m128i vfold1 = _mm_set_epi64x(k2, k1); + + // Create a vector with 8-bit values 0 to 15. This is used to + // construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8. + const __m128i vramp = _mm_setr_epi32( + 0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c); + + // This is used to inverse the control mask of _mm_shuffle_epi8 + // so that bytes that wouldn't be picked with the original mask + // will be picked and vice versa. + const __m128i vsign = _mm_set1_epi8(0x80); + + // Memory addresses A to D and the distances between them: + // + // A B C D + // [skip_start][size][skip_end] + // [ size2 ] + // + // A and D are 16-byte aligned. B and C are 1-byte aligned. + // skip_start and skip_end are 0-15 bytes. size is at least 1 byte. + // + // A = aligned_buf will initially point to this address. + // B = The address pointed by the caller-supplied buf. + // C = buf + size == aligned_buf + size2 + // D = buf + size + skip_end == aligned_buf + size2 + skip_end + const size_t skip_start = (size_t)((uintptr_t)buf & 15); + const size_t skip_end = (size_t)(-(uintptr_t)(buf + size) & 15); + const __m128i *aligned_buf = (const __m128i *)( + (uintptr_t)buf & ~(uintptr_t)15); + + // If size2 <= 16 then the whole input fits into a single 16-byte + // vector. If size2 > 16 then at least two 16-byte vectors must + // be processed. If size2 > 16 && size <= 16 then there is only + // one 16-byte vector's worth of input but it is unaligned in memory. + // + // NOTE: There is no integer overflow here if the arguments are valid. + // If this overflowed, buf + size would too. + size_t size2 = skip_start + size; + + // Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8: + // The first skip_start or skip_end bytes in the vectors will have + // the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8 + // will produce zeros for these positions. (Bitwise-xor of these + // masks with vsign will produce the opposite behavior.) + const __m128i mask_start + = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start)); + const __m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end)); + + // Get the first 1-16 bytes into data0. If loading less than 16 bytes, + // the bytes are loaded to the high bits of the vector and the least + // significant positions are filled with zeros. + const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf), + _mm_setzero_si128(), mask_start); + ++aligned_buf; + +#if defined(__i386__) || defined(_M_IX86) + const __m128i initial_crc = _mm_set_epi64x(0, ~crc); +#else + // GCC and Clang would produce good code with _mm_set_epi64x + // but MSVC needs _mm_cvtsi64_si128 on x86-64. + const __m128i initial_crc = _mm_cvtsi64_si128(~crc); +#endif + + __m128i v0, v1, v2, v3; + +#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS + if (size <= 16) { + // Right-shift initial_crc by 1-16 bytes based on "size" + // and store the result in v1 (high bytes) and v0 (low bytes). + // + // NOTE: The highest 8 bytes of initial_crc are zeros so + // v1 will be filled with zeros if size >= 8. The highest 8 + // bytes of v1 will always become zeros. + // + // [ v1 ][ v0 ] + // [ initial_crc ] size == 1 + // [ initial_crc ] size == 2 + // [ initial_crc ] size == 15 + // [ initial_crc ] size == 16 (all in v0) + const __m128i mask_low = _mm_add_epi8( + vramp, _mm_set1_epi8(size - 16)); + MASK_LH(initial_crc, mask_low, v0, v1); + + if (size2 <= 16) { + // There are 1-16 bytes of input and it is all + // in data0. Copy the input bytes to v3. If there + // are fewer than 16 bytes, the low bytes in v3 + // will be filled with zeros. That is, the input + // bytes are stored to the same position as + // (part of) initial_crc is in v0. + MASK_L(data0, mask_end, v3); + } else { + // There are 2-16 bytes of input but not all bytes + // are in data0. + const __m128i data1 = _mm_load_si128(aligned_buf); + + // Collect the 2-16 input bytes from data0 and data1 + // to v2 and v3, and bitwise-xor them with the + // low bits of initial_crc in v0. Note that the + // the second xor is below this else-block as it + // is shared with the other branch. + MASK_H(data0, mask_end, v2); + MASK_L(data1, mask_end, v3); + v0 = _mm_xor_si128(v0, v2); + } + + v0 = _mm_xor_si128(v0, v3); + v1 = _mm_alignr_epi8(v1, v0, 8); + } else +#endif + { + const __m128i data1 = _mm_load_si128(aligned_buf); + MASK_LH(initial_crc, mask_start, v0, v1); + v0 = _mm_xor_si128(v0, data0); + v1 = _mm_xor_si128(v1, data1); + +#define FOLD \ + v1 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x00)); \ + v0 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x11)); + + while (size2 > 32) { + ++aligned_buf; + size2 -= 16; + FOLD + v1 = _mm_load_si128(aligned_buf); + } + + if (size2 < 32) { + MASK_H(v0, mask_end, v2); + MASK_L(v0, mask_end, v0); + MASK_L(v1, mask_end, v3); + v1 = _mm_or_si128(v2, v3); + } + + FOLD + v1 = _mm_srli_si128(v0, 8); +#undef FOLD + } + + v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold1, 0x10), v1); + v0 = _mm_clmulepi64_si128(v1, vfold0, 0x00); + v2 = _mm_clmulepi64_si128(v0, vfold0, 0x10); + v0 = _mm_xor_si128(_mm_xor_si128(v2, _mm_slli_si128(v0, 8)), v1); + +#if defined(__i386__) || defined(_M_IX86) + return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) | + (uint64_t)(uint32_t)_mm_extract_epi32(v0, 2)); +#else + return ~(uint64_t)_mm_extract_epi64(v0, 1); +#endif + +#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__) +# pragma GCC diagnostic pop +#endif +} +#endif + + +//////////////////////// +// Detect CPU support // +//////////////////////// + +#if defined(CRC_GENERIC) && defined(CRC_CLMUL) +static inline bool +is_clmul_supported(void) +{ + int success = 1; + uint32_t r[4]; // eax, ebx, ecx, edx + +#if defined(_MSC_VER) + // This needs with MSVC. ICC has it as a built-in + // on all platforms. + __cpuid(r, 1); +#elif defined(HAVE_CPUID_H) + // Compared to just using __asm__ to run CPUID, this also checks + // that CPUID is supported and saves and restores ebx as that is + // needed with GCC < 5 with position-independent code (PIC). + success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); +#else + // Just a fallback that shouldn't be needed. + __asm__("cpuid\n\t" + : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) + : "a"(1), "c"(0)); +#endif + + // Returns true if these are supported: + // CLMUL (bit 1 in ecx) + // SSSE3 (bit 9 in ecx) + // SSE4.1 (bit 19 in ecx) + const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); + return success && (r[2] & ecx_mask) == ecx_mask; + + // Alternative methods that weren't used: + // - ICC's _may_i_use_cpu_feature: the other methods should work too. + // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") + // + // CPUID decding is needed with MSVC anyway and older GCC. This keeps + // the feature checks in the build system simpler too. The nice thing + // about __builtin_cpu_supports would be that it generates very short + // code as is it only reads a variable set at startup but a few bytes + // doesn't matter here. +} + + +#ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR +# define CRC64_FUNC_INIT +# define CRC64_SET_FUNC_ATTR __attribute__((__constructor__)) +#else +# define CRC64_FUNC_INIT = &crc64_dispatch +# define CRC64_SET_FUNC_ATTR +static uint64_t crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc); +#endif + + +// Pointer to the the selected CRC64 method. +static uint64_t (*crc64_func)(const uint8_t *buf, size_t size, uint64_t crc) + CRC64_FUNC_INIT; + + +CRC64_SET_FUNC_ATTR +static void +crc64_set_func(void) +{ + crc64_func = is_clmul_supported() ? &crc64_clmul : &crc64_generic; + return; +} + + +#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR +static uint64_t +crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc) +{ + // When __attribute__((__constructor__)) isn't supported, set the + // function pointer without any locking. If multiple threads run + // the detection code in parallel, they will all end up setting + // the pointer to the same value. This avoids the use of + // mythread_once() on every call to lzma_crc64() but this likely + // isn't strictly standards compliant. Let's change it if it breaks. + crc64_set_func(); + return crc64_func(buf, size, crc); +} +#endif +#endif + + +extern LZMA_API(uint64_t) +lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) +{ +#if defined(CRC_GENERIC) && defined(CRC_CLMUL) + // If CLMUL is available, it is the best for non-tiny inputs, + // being over twice as fast as the generic slice-by-four version. + // However, for size <= 16 it's different. In the extreme case + // of size == 1 the generic version can be five times faster. + // At size >= 8 the CLMUL starts to become reasonable. It + // varies depending on the alignment of buf too. + // + // The above doesn't include the overhead of mythread_once(). + // At least on x86-64 GNU/Linux, pthread_once() is very fast but + // it still makes lzma_crc64(buf, 1, crc) 50-100 % slower. When + // size reaches 12-16 bytes the overhead becomes negligible. + // + // So using the generic version for size <= 16 may give better + // performance with tiny inputs but if such inputs happen rarely + // it's not so obvious because then the lookup table of the + // generic version may not be in the processor cache. +#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS + if (size <= 16) + return crc64_generic(buf, size, crc); +#endif + +/* +#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR + // See crc64_dispatch(). This would be the alternative which uses + // locking and doesn't use crc64_dispatch(). Note that on Windows + // this method needs Vista threads. + mythread_once(crc64_set_func); +#endif +*/ + + return crc64_func(buf, size, crc); + +#elif defined(CRC_CLMUL) + // If CLMUL is used unconditionally without runtime CPU detection + // then omitting the generic version and its 8 KiB lookup table + // makes the library smaller. + // + // FIXME: Lookup table isn't currently omitted on 32-bit x86, + // see crc64_table.c. + return crc64_clmul(buf, size, crc); + +#else + return crc64_generic(buf, size, crc); +#endif +} diff --git a/src/liblzma/check/crc64_table.c b/src/liblzma/check/crc64_table.c index 7560eb0a..241adcd4 100644 --- a/src/liblzma/check/crc64_table.c +++ b/src/liblzma/check/crc64_table.c @@ -12,11 +12,24 @@ #include "common.h" + +// FIXME: Compared to crc64_fast.c this has to check for __x86_64__ too +// so that in 32-bit builds crc64_x86.S won't break due to a missing table. +#if (defined(__x86_64__) && defined(__SSSE3__) \ + && defined(__SSE4_1__) && defined(__PCLMUL__)) \ + || (defined(__e2k__) && __iset__ >= 6) +// No table needed but something has to be exported to keep some toolchains +// happy. Also use a declaration to silence compiler warnings. +extern const char lzma_crc64_dummy; +const char lzma_crc64_dummy; + +#else // Having the declaration here silences clang -Wmissing-variable-declarations. extern const uint64_t lzma_crc64_table[4][256]; -#ifdef WORDS_BIGENDIAN -# include "crc64_table_be.h" -#else -# include "crc64_table_le.h" +# if defined(WORDS_BIGENDIAN) +# include "crc64_table_be.h" +# else +# include "crc64_table_le.h" +# endif #endif