From e1c8f1d01f4a4e2136173edab2dc63c71ef038f4 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Fri, 25 Jul 2014 20:57:20 +0300 Subject: [PATCH] liblzma: Add lzma_memcmplen() for fast memory comparison. This commit just adds the function. Its uses will be in separate commits. This hasn't been tested much yet and it's perhaps a bit early to commit it but if there are bugs they should get found quite quickly. Thanks to Jun I Jin from Intel for help and for pointing out that string comparison needs to be optimized in liblzma. --- configure.ac | 13 +++ src/liblzma/common/Makefile.inc | 1 + src/liblzma/common/memcmplen.h | 170 ++++++++++++++++++++++++++++++++ 3 files changed, 184 insertions(+) create mode 100644 src/liblzma/common/memcmplen.h diff --git a/configure.ac b/configure.ac index 6787f827..2df67970 100644 --- a/configure.ac +++ b/configure.ac @@ -560,6 +560,7 @@ echo "Initializing gettext:" AM_GNU_GETTEXT_VERSION([0.18]) AM_GNU_GETTEXT([external]) + ############################################################################### # Checks for header files. ############################################################################### @@ -573,6 +574,9 @@ AC_CHECK_HEADERS([fcntl.h limits.h sys/time.h], [], [AC_MSG_ERROR([Required header file(s) are missing.])]) +# This allows the use of the intrinsic functions if they are available. +AC_CHECK_HEADERS([immintrin.h]) + ############################################################################### # Checks for typedefs, structures, and compiler characteristics. @@ -681,6 +685,15 @@ AM_CONDITIONAL([COND_INTERNAL_SHA256], && test "x$ac_cv_func_SHA256Init" != xyes \ && test "x$ac_cv_func_CC_SHA256_Init" != xyes]) +# Check for SSE2 intrinsics. +AC_CHECK_DECL([_mm_movemask_epi8], + [AC_DEFINE([HAVE__MM_MOVEMASK_EPI8], [1], + [Define to 1 if _mm_movemask_epi8 is available.])], + [], +[#ifdef HAVE_IMMINTRIN_H +#include +#endif]) + ############################################################################### # If using GCC, set some additional AM_CFLAGS: diff --git a/src/liblzma/common/Makefile.inc b/src/liblzma/common/Makefile.inc index 8f54a97a..38a63416 100644 --- a/src/liblzma/common/Makefile.inc +++ b/src/liblzma/common/Makefile.inc @@ -8,6 +8,7 @@ liblzma_la_SOURCES += \ common/common.c \ common/common.h \ + common/memcmplen.h \ common/block_util.c \ common/easy_preset.c \ common/easy_preset.h \ diff --git a/src/liblzma/common/memcmplen.h b/src/liblzma/common/memcmplen.h new file mode 100644 index 00000000..f66e7cdb --- /dev/null +++ b/src/liblzma/common/memcmplen.h @@ -0,0 +1,170 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file memcmplen.h +/// \brief Optimized comparison of two buffers +// +// Author: Lasse Collin +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + +#ifndef LZMA_MEMCMPLEN_H +#define LZMA_MEMCMPLEN_H + +#include "common.h" + +#ifdef HAVE_IMMINTRIN_H +# include +#endif + +/// How many extra bytes lzma_memcmplen() may read. This depends on +/// the method but since it is just a few bytes the biggest possible +/// value is used here. +#define LZMA_MEMCMPLEN_EXTRA 16 + + +/// Find out how many equal bytes the two buffers have. +/// +/// \param buf1 First buffer +/// \param buf2 Second buffer +/// \param len How many bytes have already been compared and will +/// be assumed to match +/// \param limit How many bytes to compare at most, including the +/// already-compared bytes. This must be significantly +/// smaller than UINT32_MAX to avoid integer overflows. +/// Up to LZMA_MEMCMPLEN_EXTRA bytes may be read past +/// the specified limit from both buf1 and buf2. +/// +/// \return Number of equal bytes in the buffers is returned. +/// This is always at least len and at most limit. +static inline uint32_t lzma_attribute((__always_inline__)) +lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2, + uint32_t len, uint32_t limit) +{ + assert(len <= limit); + assert(limit <= UINT32_MAX / 2); + +#if defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && ((TUKLIB_GNUC_REQ(3, 4) && defined(__x86_64__)) \ + || (defined(__INTEL_COMPILER) && defined(__x86_64__)) \ + || (defined(__INTEL_COMPILER) && defined(_M_X64)) \ + || (defined(_MSC_VER) && defined(_M_X64))) + // NOTE: This will use 64-bit unaligned access which + // TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit, but + // it's convenient here at least as long as it's x86-64 only. + // + // I keep this x86-64 only for now since that's where I know this + // to be a good method. This may be fine on other 64-bit CPUs too. + // On big endian one should use xor instead of subtraction and switch + // to __builtin_clzll(). + while (len < limit) { + const uint64_t x = *(const uint64_t *)(buf1 + len) + - *(const uint64_t *)(buf2 + len); + if (x != 0) { +# if defined(_M_X64) // MSVC or Intel C compiler on Windows + unsigned long tmp; + _BitScanForward64(&tmp, x); + len += (uint32_t)tmp >> 3; +# else // GCC, clang, or Intel C compiler + len += (uint32_t)__builtin_ctzll(x) >> 3; +# endif + return my_min(len, limit); + } + + len += 8; + } + + return limit; + +#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) \ + && defined(HAVE__MM_MOVEMASK_EPI8) \ + && ((defined(__GNUC__) && defined(__SSE2_MATH__)) \ + || (defined(__INTEL_COMPILER) && defined(__SSE2__)) \ + || (defined(_MSC_VER) && defined(_M_IX86_FP) \ + && _M_IX86_FP >= 2)) + // NOTE: Like above, this will use 128-bit unaligned access which + // TUKLIB_FAST_UNALIGNED_ACCESS wasn't meant to permit. + // + // SSE2 version for 32-bit and 64-bit x86. On x86-64 the above + // version is sometimes significantly faster and sometimes + // slightly slower than this SSE2 version, so this SSE2 + // version isn't used on x86-64. + while (len < limit) { + const uint32_t x = 0xFFFF ^ _mm_movemask_epi8(_mm_cmpeq_epi8( + _mm_loadu_si128((const __m128i *)(buf1 + len)), + _mm_loadu_si128((const __m128i *)(buf2 + len)))); + + if (x != 0) { +# if defined(__INTEL_COMPILER) + len += _bit_scan_forward(x); +# elif defined(_MSC_VER) + unsigned long tmp; + _BitScanForward(&tmp, x); + len += tmp; +# else + len += __builtin_ctz(x); +# endif + return my_min(len, limit); + } + + len += 16; + } + + return limit; + +#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && !defined(WORDS_BIGENDIAN) + // Generic 32-bit little endian method + while (len < limit) { + uint32_t x = *(const uint32_t *)(buf1 + len) + - *(const uint32_t *)(buf2 + len); + if (x != 0) { + if ((x & 0xFFFF) == 0) { + len += 2; + x >>= 16; + } + + if ((x & 0xFF) == 0) + ++len; + + return my_min(len, limit); + } + + len += 4; + } + + return limit; + +#elif defined(TUKLIB_FAST_UNALIGNED_ACCESS) && defined(WORDS_BIGENDIAN) + // Generic 32-bit big endian method + while (len < limit) { + uint32_t x = *(const uint32_t *)(buf1 + len) + ^ *(const uint32_t *)(buf2 + len); + if (x != 0) { + if ((x & 0xFFFF0000) == 0) { + len += 2; + x <<= 16; + } + + if ((x & 0xFF000000) == 0) + ++len; + + return my_min(len, limit); + } + + len += 4; + } + + return limit; + +#else + // Simple portable version that doesn't use unaligned access. + while (len < limit && buf1[len] == buf2[len]) + ++len; + + return len; +#endif +} + +#endif