1
0
Fork 0
mirror of https://git.tukaani.org/xz.git synced 2024-04-04 12:36:23 +02:00

liblzma: Add fast CRC64 for 32/64-bit x86 using SSSE3 + SSE4.1 + CLMUL.

It also works on E2K as it supports these intrinsics.

On x86-64 runtime detection is used so the code keeps working on
older processors too. A CLMUL-only build can be done by using
-msse4.1 -mpclmul in CFLAGS and this will reduce the library
size since the generic implementation and its 8 KiB lookup table
will be omitted.

On 32-bit x86 this isn't used by default for now because by default
on 32-bit x86 the separate assembly file crc64_x86.S is used.
If --disable-assembler is used then this new CLMUL code is used
the same way as on 64-bit x86. However, a CLMUL-only build
(-msse4.1 -mpclmul) won't omit the 8 KiB lookup table on
32-bit x86 due to a currently-missing check for disabled
assembler usage.

The configure.ac check should be such that the code won't be
built if something in the toolchain doesn't support it but
--disable-clmul-crc option can be used to unconditionally
disable this feature.

CLMUL speeds up decompression of files that have compressed very
well (assuming CRC64 is used as a check type). It is know that
the CLMUL code is significantly slower than the generic code for
tiny inputs (especially 1-8 bytes but up to 16 bytes). If that
is a real-world problem then there is already a commented-out
variant that uses the generic version for small inputs.

Thanks to Ilya Kurdyukov for the original patch which was
derived from a white paper from Intel [1] (published in 2009)
and public domain code from [2] (released in 2016).

[1] https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
[2] https://github.com/rawrunprotected/crc
This commit is contained in:
Lasse Collin 2022-11-14 21:34:57 +02:00
parent 3b466bc796
commit f644473a21
5 changed files with 554 additions and 13 deletions

View file

@ -49,8 +49,10 @@
cmake_minimum_required(VERSION 3.13...3.16 FATAL_ERROR)
include(CMakePushCheckState)
include(CheckIncludeFile)
include(CheckSymbolExists)
include(CheckStructHasMember)
include(CheckCSourceCompiles)
include(cmake/tuklib_integer.cmake)
include(cmake/tuklib_cpucores.cmake)
include(cmake/tuklib_physmem.cmake)
@ -402,14 +404,16 @@ check_c_source_compiles("
cmake_pop_check_state()
tuklib_add_definition_if(liblzma HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR)
# cpuid.h
check_include_file(cpuid.h HAVE_CPUID_H)
tuklib_add_definition_if(liblzma HAVE_CPUID_H)
# immintrin.h:
include(CheckIncludeFile)
check_include_file(immintrin.h HAVE_IMMINTRIN_H)
if(HAVE_IMMINTRIN_H)
target_compile_definitions(liblzma PRIVATE HAVE_IMMINTRIN_H)
# SSE2 intrinsics:
include(CheckCSourceCompiles)
check_c_source_compiles("
#include <immintrin.h>
int main(void)
@ -421,6 +425,24 @@ if(HAVE_IMMINTRIN_H)
"
HAVE__MM_MOVEMASK_EPI8)
tuklib_add_definition_if(liblzma HAVE__MM_MOVEMASK_EPI8)
# CLMUL intrinsic:
check_c_source_compiles("
#include <immintrin.h>
#if defined(__e2k__) && __iset__ < 6
# error
#endif
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__(\"ssse3,sse4.1,pclmul\")))
#endif
__m128i my_clmul(__m128i a, __m128i b)
{
return _mm_clmulepi64_si128(a, b, 0);
}
int main(void) { return 0; }
"
HAVE_USABLE_CLMUL)
tuklib_add_definition_if(liblzma HAVE_USABLE_CLMUL)
endif()
# Support -fvisiblity=hidden when building shared liblzma.

12
INSTALL
View file

@ -370,6 +370,18 @@ XZ Utils Installation
pre-i686 systems, you may want to disable the assembler
code.
--disable-clmul-crc
Disable the use carryless multiplication for CRC
calculation even if compiler support for it is detected.
The code uses runtime detection of SSSE3, SSE4.1, and
CLMUL instructions on x86. On 32-bit x86 this currently
is used only if --disable-assembler is used (this might
be fixed in the future). The code works on E2K too.
If using compiler options that unconditionally allow the
required extensions (-msse4.1 -mpclmul) then runtime
detection isn't used and the generic code is omitted.
--enable-unaligned-access
Allow liblzma to use unaligned memory access for 16-bit,
32-bit, and 64-bit loads and stores. This should be

View file

@ -370,6 +370,16 @@ esac
AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86)
#############
# CLMUL CRC #
#############
AC_ARG_ENABLE([clmul-crc], AS_HELP_STRING([--disable-clmul-crc],
[Do not use carryless multiplication for CRC calculation
even if support for it is detected.]),
[], [enable_clmul_crc=yes])
#####################
# Size optimization #
#####################
@ -733,8 +743,9 @@ AC_CHECK_HEADERS([fcntl.h limits.h sys/time.h],
[],
[AC_MSG_ERROR([Required header file(s) are missing.])])
# This allows the use of the intrinsic functions if they are available.
AC_CHECK_HEADERS([immintrin.h])
# immintrin.h allows the use of the intrinsic functions if they are available.
# cpuid.h may be used for detecting x86 processor features at runtime.
AC_CHECK_HEADERS([immintrin.h cpuid.h])
###############################################################################
@ -874,6 +885,50 @@ AC_CHECK_DECL([_mm_movemask_epi8],
#include <immintrin.h>
#endif])
# For faster CRC on 32/64-bit x86 and E2K (see also crc64_fast.c):
#
# - Check for the CLMUL intrinsic _mm_clmulepi64_si128 in <immintrin.h>.
#
# - Check that __attribute__((__target__("ssse3,sse4.1,pclmul"))) works
# together with _mm_clmulepi64_si128 from <immintrin.h>. The attribute
# was added in GCC 4.4 but some GCC 4.x versions don't allow intrinsics
# with it. Exception: it must be not be used with EDG-based compilers
# like ICC and the compiler on E2K.
#
# If everything above is supported, runtime detection will be used to keep the
# binaries working on systems that don't support the required extensions.
AC_MSG_CHECKING([if _mm_clmulepi64_si128 is usable])
if test "x$enable_clmul_crc" = xno ; then
AC_MSG_RESULT([no, --disable-clmul-crc was used])
else
AC_COMPILE_IFELSE([AC_LANG_SOURCE([[
#include <immintrin.h>
// CLMUL works on older E2K instruction set but it is slow due to emulation.
#if defined(__e2k__) && __iset__ < 6
# error
#endif
// Intel's old compiler (ICC) can define __GNUC__ but the attribute must not
// be used with it. The new Clang-based ICX needs the attribute.
// Checking for !defined(__EDG__) catches ICC and other EDG-based compilers.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__("ssse3,sse4.1,pclmul")))
#endif
__m128i my_clmul(__m128i a, __m128i b)
{
return _mm_clmulepi64_si128(a, b, 0);
}
]])], [
AC_DEFINE([HAVE_USABLE_CLMUL], [1],
[Define to 1 if _mm_clmulepi64_si128 is usable.
See configure.ac for details.])
AC_MSG_RESULT([yes])
], [
AC_MSG_RESULT([no])
])
fi
# Check for sandbox support. If one is found, set enable_sandbox=found.
case $enable_sandbox in
auto | capsicum)

View file

@ -3,11 +3,25 @@
/// \file crc64.c
/// \brief CRC64 calculation
///
/// Calculate the CRC64 using the slice-by-four algorithm. This is the same
/// idea that is used in crc32_fast.c, but for CRC64 we use only four tables
/// There are two methods in this file. crc64_generic uses the
/// the slice-by-four algorithm. This is the same idea that is
/// used in crc32_fast.c, but for CRC64 we use only four tables
/// instead of eight to avoid increasing CPU cache usage.
///
/// crc64_clmul uses 32/64-bit x86 SSSE3, SSE4.1, and CLMUL instructions.
/// It was derived from
/// https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
/// and the public domain code from https://github.com/rawrunprotected/crc
/// (URLs were checked on 2022-11-07).
///
/// FIXME: Builds for 32-bit x86 use crc64_x86.S by default instead
/// of this file and thus CLMUL version isn't available on 32-bit x86
/// unless configured with --disable-assembler. Even then the lookup table
/// isn't omitted in crc64_table.c since it doesn't know that assembly
/// code has been disabled.
//
// Author: Lasse Collin
// Authors: Lasse Collin
// Ilya Kurdyukov
//
// This file has been put into the public domain.
// You can do whatever you want with this file.
@ -15,6 +29,54 @@
///////////////////////////////////////////////////////////////////////////////
#include "check.h"
#undef CRC_GENERIC
#undef CRC_CLMUL
#undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
// If CLMUL cannot be used then only the generic slice-by-four is built.
#if !defined(HAVE_USABLE_CLMUL)
# define CRC_GENERIC 1
// If CLMUL is allowed unconditionally in the compiler options then the
// generic version can be omitted. Note that this doesn't work with MSVC
// as I don't know how to detect the features here.
//
// NOTE: Keep this this in sync with crc64_table.c.
#elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
|| (defined(__e2k__) && __iset__ >= 6)
# define CRC_CLMUL 1
// Otherwise build both and detect at runtime which version to use.
#else
# define CRC_GENERIC 1
# define CRC_CLMUL 1
/*
// The generic code is much faster with 1-8-byte inputs and has
// similar performance up to 16 bytes at least in microbenchmarks
// (it depends on input buffer alignment too). If both versions are
// built, this #define will use the generic version for inputs up to
// 16 bytes and CLMUL for bigger inputs. It saves a little in code
// size since the special cases for 0-16-byte inputs will be omitted
// from the CLMUL code.
# define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1
*/
# if defined(_MSC_VER)
# include <intrin.h>
# elif defined(HAVE_CPUID_H)
# include <cpuid.h>
# endif
#endif
/////////////////////////////////
// Generic slice-by-four CRC64 //
/////////////////////////////////
#ifdef CRC_GENERIC
#include "crc_macros.h"
@ -26,8 +88,8 @@
// See the comments in crc32_fast.c. They aren't duplicated here.
extern LZMA_API(uint64_t)
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
static uint64_t
crc64_generic(const uint8_t *buf, size_t size, uint64_t crc)
{
crc = ~crc;
@ -71,3 +133,380 @@ lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
return ~crc;
}
#endif
/////////////////////
// x86 CLMUL CRC64 //
/////////////////////
#ifdef CRC_CLMUL
#include <immintrin.h>
/*
// These functions were used to generate the constants
// at the top of crc64_clmul().
static uint64_t
calc_lo(uint64_t poly)
{
uint64_t a = poly;
uint64_t b = 0;
for (unsigned i = 0; i < 64; ++i) {
b = (b >> 1) | (a << 63);
a = (a >> 1) ^ (a & 1 ? poly : 0);
}
return b;
}
static uint64_t
calc_hi(uint64_t poly, uint64_t a)
{
for (unsigned i = 0; i < 64; ++i)
a = (a >> 1) ^ (a & 1 ? poly : 0);
return a;
}
*/
#define MASK_L(in, mask, r) \
r = _mm_shuffle_epi8(in, mask)
#define MASK_H(in, mask, r) \
r = _mm_shuffle_epi8(in, _mm_xor_si128(mask, vsign))
#define MASK_LH(in, mask, low, high) \
MASK_L(in, mask, low); \
MASK_H(in, mask, high)
// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
// define __GNUC__ but the attribute must not be used with them.
// The new Clang-based ICX needs the attribute.
//
// NOTE: Build systems check for this too, keep them in sync with this.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
__attribute__((__target__("ssse3,sse4.1,pclmul")))
#endif
static uint64_t
crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc)
{
// The prototypes of the intrinsics use signed types while most of
// the values are treated as unsigned here. These warnings in this
// function have been checked and found to be harmless so silence them.
#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__)
# pragma GCC diagnostic push
# pragma GCC diagnostic ignored "-Wsign-conversion"
# pragma GCC diagnostic ignored "-Wconversion"
#endif
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
// The code assumes that there is at least one byte of input.
if (size == 0)
return crc;
#endif
// const uint64_t poly = 0xc96c5795d7870f42; // CRC polynomial
const uint64_t p = 0x92d8af2baf0e1e85; // (poly << 1) | 1
const uint64_t mu = 0x9c3e466c172963d5; // (calc_lo(poly) << 1) | 1
const uint64_t k2 = 0xdabe95afc7875f40; // calc_hi(poly, 1)
const uint64_t k1 = 0xe05dd497ca393ae4; // calc_hi(poly, k2)
const __m128i vfold0 = _mm_set_epi64x(p, mu);
const __m128i vfold1 = _mm_set_epi64x(k2, k1);
// Create a vector with 8-bit values 0 to 15. This is used to
// construct control masks for _mm_blendv_epi8 and _mm_shuffle_epi8.
const __m128i vramp = _mm_setr_epi32(
0x03020100, 0x07060504, 0x0b0a0908, 0x0f0e0d0c);
// This is used to inverse the control mask of _mm_shuffle_epi8
// so that bytes that wouldn't be picked with the original mask
// will be picked and vice versa.
const __m128i vsign = _mm_set1_epi8(0x80);
// Memory addresses A to D and the distances between them:
//
// A B C D
// [skip_start][size][skip_end]
// [ size2 ]
//
// A and D are 16-byte aligned. B and C are 1-byte aligned.
// skip_start and skip_end are 0-15 bytes. size is at least 1 byte.
//
// A = aligned_buf will initially point to this address.
// B = The address pointed by the caller-supplied buf.
// C = buf + size == aligned_buf + size2
// D = buf + size + skip_end == aligned_buf + size2 + skip_end
const size_t skip_start = (size_t)((uintptr_t)buf & 15);
const size_t skip_end = (size_t)(-(uintptr_t)(buf + size) & 15);
const __m128i *aligned_buf = (const __m128i *)(
(uintptr_t)buf & ~(uintptr_t)15);
// If size2 <= 16 then the whole input fits into a single 16-byte
// vector. If size2 > 16 then at least two 16-byte vectors must
// be processed. If size2 > 16 && size <= 16 then there is only
// one 16-byte vector's worth of input but it is unaligned in memory.
//
// NOTE: There is no integer overflow here if the arguments are valid.
// If this overflowed, buf + size would too.
size_t size2 = skip_start + size;
// Masks to be used with _mm_blendv_epi8 and _mm_shuffle_epi8:
// The first skip_start or skip_end bytes in the vectors will have
// the high bit (0x80) set. _mm_blendv_epi8 and _mm_shuffle_epi8
// will produce zeros for these positions. (Bitwise-xor of these
// masks with vsign will produce the opposite behavior.)
const __m128i mask_start
= _mm_sub_epi8(vramp, _mm_set1_epi8(skip_start));
const __m128i mask_end = _mm_sub_epi8(vramp, _mm_set1_epi8(skip_end));
// Get the first 1-16 bytes into data0. If loading less than 16 bytes,
// the bytes are loaded to the high bits of the vector and the least
// significant positions are filled with zeros.
const __m128i data0 = _mm_blendv_epi8(_mm_load_si128(aligned_buf),
_mm_setzero_si128(), mask_start);
++aligned_buf;
#if defined(__i386__) || defined(_M_IX86)
const __m128i initial_crc = _mm_set_epi64x(0, ~crc);
#else
// GCC and Clang would produce good code with _mm_set_epi64x
// but MSVC needs _mm_cvtsi64_si128 on x86-64.
const __m128i initial_crc = _mm_cvtsi64_si128(~crc);
#endif
__m128i v0, v1, v2, v3;
#ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS
if (size <= 16) {
// Right-shift initial_crc by 1-16 bytes based on "size"
// and store the result in v1 (high bytes) and v0 (low bytes).
//
// NOTE: The highest 8 bytes of initial_crc are zeros so
// v1 will be filled with zeros if size >= 8. The highest 8
// bytes of v1 will always become zeros.
//
// [ v1 ][ v0 ]
// [ initial_crc ] size == 1
// [ initial_crc ] size == 2
// [ initial_crc ] size == 15
// [ initial_crc ] size == 16 (all in v0)
const __m128i mask_low = _mm_add_epi8(
vramp, _mm_set1_epi8(size - 16));
MASK_LH(initial_crc, mask_low, v0, v1);
if (size2 <= 16) {
// There are 1-16 bytes of input and it is all
// in data0. Copy the input bytes to v3. If there
// are fewer than 16 bytes, the low bytes in v3
// will be filled with zeros. That is, the input
// bytes are stored to the same position as
// (part of) initial_crc is in v0.
MASK_L(data0, mask_end, v3);
} else {
// There are 2-16 bytes of input but not all bytes
// are in data0.
const __m128i data1 = _mm_load_si128(aligned_buf);
// Collect the 2-16 input bytes from data0 and data1
// to v2 and v3, and bitwise-xor them with the
// low bits of initial_crc in v0. Note that the
// the second xor is below this else-block as it
// is shared with the other branch.
MASK_H(data0, mask_end, v2);
MASK_L(data1, mask_end, v3);
v0 = _mm_xor_si128(v0, v2);
}
v0 = _mm_xor_si128(v0, v3);
v1 = _mm_alignr_epi8(v1, v0, 8);
} else
#endif
{
const __m128i data1 = _mm_load_si128(aligned_buf);
MASK_LH(initial_crc, mask_start, v0, v1);
v0 = _mm_xor_si128(v0, data0);
v1 = _mm_xor_si128(v1, data1);
#define FOLD \
v1 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x00)); \
v0 = _mm_xor_si128(v1, _mm_clmulepi64_si128(v0, vfold1, 0x11));
while (size2 > 32) {
++aligned_buf;
size2 -= 16;
FOLD
v1 = _mm_load_si128(aligned_buf);
}
if (size2 < 32) {
MASK_H(v0, mask_end, v2);
MASK_L(v0, mask_end, v0);
MASK_L(v1, mask_end, v3);
v1 = _mm_or_si128(v2, v3);
}
FOLD
v1 = _mm_srli_si128(v0, 8);
#undef FOLD
}
v1 = _mm_xor_si128(_mm_clmulepi64_si128(v0, vfold1, 0x10), v1);
v0 = _mm_clmulepi64_si128(v1, vfold0, 0x00);
v2 = _mm_clmulepi64_si128(v0, vfold0, 0x10);
v0 = _mm_xor_si128(_mm_xor_si128(v2, _mm_slli_si128(v0, 8)), v1);
#if defined(__i386__) || defined(_M_IX86)
return ~(((uint64_t)(uint32_t)_mm_extract_epi32(v0, 3) << 32) |
(uint64_t)(uint32_t)_mm_extract_epi32(v0, 2));
#else
return ~(uint64_t)_mm_extract_epi64(v0, 1);
#endif
#if TUKLIB_GNUC_REQ(4, 6) || defined(__clang__)
# pragma GCC diagnostic pop
#endif
}
#endif
////////////////////////
// Detect CPU support //
////////////////////////
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
static inline bool
is_clmul_supported(void)
{
int success = 1;
uint32_t r[4]; // eax, ebx, ecx, edx
#if defined(_MSC_VER)
// This needs <intrin.h> with MSVC. ICC has it as a built-in
// on all platforms.
__cpuid(r, 1);
#elif defined(HAVE_CPUID_H)
// Compared to just using __asm__ to run CPUID, this also checks
// that CPUID is supported and saves and restores ebx as that is
// needed with GCC < 5 with position-independent code (PIC).
success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]);
#else
// Just a fallback that shouldn't be needed.
__asm__("cpuid\n\t"
: "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3])
: "a"(1), "c"(0));
#endif
// Returns true if these are supported:
// CLMUL (bit 1 in ecx)
// SSSE3 (bit 9 in ecx)
// SSE4.1 (bit 19 in ecx)
const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19);
return success && (r[2] & ecx_mask) == ecx_mask;
// Alternative methods that weren't used:
// - ICC's _may_i_use_cpu_feature: the other methods should work too.
// - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul")
//
// CPUID decding is needed with MSVC anyway and older GCC. This keeps
// the feature checks in the build system simpler too. The nice thing
// about __builtin_cpu_supports would be that it generates very short
// code as is it only reads a variable set at startup but a few bytes
// doesn't matter here.
}
#ifdef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
# define CRC64_FUNC_INIT
# define CRC64_SET_FUNC_ATTR __attribute__((__constructor__))
#else
# define CRC64_FUNC_INIT = &crc64_dispatch
# define CRC64_SET_FUNC_ATTR
static uint64_t crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc);
#endif
// Pointer to the the selected CRC64 method.
static uint64_t (*crc64_func)(const uint8_t *buf, size_t size, uint64_t crc)
CRC64_FUNC_INIT;
CRC64_SET_FUNC_ATTR
static void
crc64_set_func(void)
{
crc64_func = is_clmul_supported() ? &crc64_clmul : &crc64_generic;
return;
}
#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
static uint64_t
crc64_dispatch(const uint8_t *buf, size_t size, uint64_t crc)
{
// When __attribute__((__constructor__)) isn't supported, set the
// function pointer without any locking. If multiple threads run
// the detection code in parallel, they will all end up setting
// the pointer to the same value. This avoids the use of
// mythread_once() on every call to lzma_crc64() but this likely
// isn't strictly standards compliant. Let's change it if it breaks.
crc64_set_func();
return crc64_func(buf, size, crc);
}
#endif
#endif
extern LZMA_API(uint64_t)
lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc)
{
#if defined(CRC_GENERIC) && defined(CRC_CLMUL)
// If CLMUL is available, it is the best for non-tiny inputs,
// being over twice as fast as the generic slice-by-four version.
// However, for size <= 16 it's different. In the extreme case
// of size == 1 the generic version can be five times faster.
// At size >= 8 the CLMUL starts to become reasonable. It
// varies depending on the alignment of buf too.
//
// The above doesn't include the overhead of mythread_once().
// At least on x86-64 GNU/Linux, pthread_once() is very fast but
// it still makes lzma_crc64(buf, 1, crc) 50-100 % slower. When
// size reaches 12-16 bytes the overhead becomes negligible.
//
// So using the generic version for size <= 16 may give better
// performance with tiny inputs but if such inputs happen rarely
// it's not so obvious because then the lookup table of the
// generic version may not be in the processor cache.
#ifdef CRC_USE_GENERIC_FOR_SMALL_INPUTS
if (size <= 16)
return crc64_generic(buf, size, crc);
#endif
/*
#ifndef HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR
// See crc64_dispatch(). This would be the alternative which uses
// locking and doesn't use crc64_dispatch(). Note that on Windows
// this method needs Vista threads.
mythread_once(crc64_set_func);
#endif
*/
return crc64_func(buf, size, crc);
#elif defined(CRC_CLMUL)
// If CLMUL is used unconditionally without runtime CPU detection
// then omitting the generic version and its 8 KiB lookup table
// makes the library smaller.
//
// FIXME: Lookup table isn't currently omitted on 32-bit x86,
// see crc64_table.c.
return crc64_clmul(buf, size, crc);
#else
return crc64_generic(buf, size, crc);
#endif
}

View file

@ -12,11 +12,24 @@
#include "common.h"
// FIXME: Compared to crc64_fast.c this has to check for __x86_64__ too
// so that in 32-bit builds crc64_x86.S won't break due to a missing table.
#if (defined(__x86_64__) && defined(__SSSE3__) \
&& defined(__SSE4_1__) && defined(__PCLMUL__)) \
|| (defined(__e2k__) && __iset__ >= 6)
// No table needed but something has to be exported to keep some toolchains
// happy. Also use a declaration to silence compiler warnings.
extern const char lzma_crc64_dummy;
const char lzma_crc64_dummy;
#else
// Having the declaration here silences clang -Wmissing-variable-declarations.
extern const uint64_t lzma_crc64_table[4][256];
#ifdef WORDS_BIGENDIAN
# if defined(WORDS_BIGENDIAN)
# include "crc64_table_be.h"
# else
# include "crc64_table_le.h"
# endif
#endif