From aab503fe71da91aa42f0271fea81cf11a6944f7f Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Thu, 11 Apr 2019 20:01:31 -0400
Subject: [PATCH 1/5] common/scope_exit: Replace std::move with std::forward in
 ScopeExit()

The template type here is actually a forwarding reference, not an rvalue
reference in this case, so it's more appropriate to use std::forward to
preserve the value category of the type being moved.
---
 src/common/scope_exit.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)
diff --git a/src/common/scope_exit.h b/src/common/scope_exit.h
index baf1f1c9e..1176a72b1 100644
--- a/src/common/scope_exit.h
+++ b/src/common/scope_exit.h
@@ -20,7 +20,7 @@ struct ScopeExitHelper {
 
 template <typename Func>
 ScopeExitHelper<Func> ScopeExit(Func&& func) {
-    return ScopeExitHelper<Func>(std::move(func));
+    return ScopeExitHelper<Func>(std::forward<Func>(func));
 }
 } // namespace detail
 

From f108d3c6e6784caea13306c5371ba3881b61b690 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Thu, 11 Apr 2019 20:15:44 -0400
Subject: [PATCH 2/5] common/swap: Remove 32-bit ARM path

We don't plan to support host 32-bit ARM execution environments, so this
is essentially dead code.
---
 src/common/swap.h | 13 -------------
 1 file changed, 13 deletions(-)

diff --git a/src/common/swap.h b/src/common/swap.h
index b3eab1324..cb723cb8a 100644
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -71,19 +71,6 @@ inline u32 swap32(u32 _data) {
 inline u64 swap64(u64 _data) {
     return _byteswap_uint64(_data);
 }
-#elif defined(ARCHITECTURE_ARM) && (__ARM_ARCH >= 6)
-inline u16 swap16(u16 _data) {
-    u32 data = _data;
-    __asm__("rev16 %0, %1\n" : "=l"(data) : "l"(data));
-    return (u16)data;
-}
-inline u32 swap32(u32 _data) {
-    __asm__("rev %0, %1\n" : "=l"(_data) : "l"(_data));
-    return _data;
-}
-inline u64 swap64(u64 _data) {
-    return ((u64)swap32(_data) << 32) | swap32(_data >> 32);
-}
 #elif __linux__
 inline u16 swap16(u16 _data) {
     return bswap_16(_data);

From 56646136400bff716cb4d8cb10ae507cb8a0d267 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Thu, 11 Apr 2019 20:26:37 -0400
Subject: [PATCH 3/5] common/swap: Simplify swap function ifdefs

Including every OS' own built-in byte swapping functions is kind of
undesirable, since it adds yet another build path to ensure compilation
succeeds on.

Given we only support clang, GCC, and MSVC for the time being, we can
utilize their built-in functions directly instead of going through the
OS's API functions.

This shrinks the overall code down to just

if (msvc)
  use msvc's functions
else if (clang or gcc)
  use clang/gcc's builtins
else
  use the slow path
---
 src/common/swap.h | 63 +++++++++++------------------------------------
 1 file changed, 15 insertions(+), 48 deletions(-)

diff --git a/src/common/swap.h b/src/common/swap.h
index cb723cb8a..fad0f35a7 100644
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -21,11 +21,6 @@
 
 #if defined(_MSC_VER)
 #include <cstdlib>
-#elif defined(__linux__)
-#include <byteswap.h>
-#elif defined(__Bitrig__) || defined(__DragonFly__) || defined(__FreeBSD__) ||                     \
-    defined(__NetBSD__) || defined(__OpenBSD__)
-#include <sys/endian.h>
 #endif
 #include <cstring>
 #include "common/common_types.h"
@@ -62,58 +57,30 @@
 namespace Common {
 
 #ifdef _MSC_VER
-inline u16 swap16(u16 _data) {
-    return _byteswap_ushort(_data);
+inline u16 swap16(u16 data) {
+    return _byteswap_ushort(data);
 }
-inline u32 swap32(u32 _data) {
-    return _byteswap_ulong(_data);
+inline u32 swap32(u32 data) {
+    return _byteswap_ulong(data);
 }
-inline u64 swap64(u64 _data) {
-    return _byteswap_uint64(_data);
+inline u64 swap64(u64 data) {
+    return _byteswap_uint64(data);
 }
-#elif __linux__
-inline u16 swap16(u16 _data) {
-    return bswap_16(_data);
-}
-inline u32 swap32(u32 _data) {
-    return bswap_32(_data);
-}
-inline u64 swap64(u64 _data) {
-    return bswap_64(_data);
-}
-#elif __APPLE__
-inline __attribute__((always_inline)) u16 swap16(u16 _data) {
-    return (_data >> 8) | (_data << 8);
-}
-inline __attribute__((always_inline)) u32 swap32(u32 _data) {
-    return __builtin_bswap32(_data);
-}
-inline __attribute__((always_inline)) u64 swap64(u64 _data) {
-    return __builtin_bswap64(_data);
-}
-#elif defined(__Bitrig__) || defined(__OpenBSD__)
+#elif defined(__clang__) || defined(__GNUC__)
+#if defined(__Bitrig__) || defined(__OpenBSD__)
 // redefine swap16, swap32, swap64 as inline functions
 #undef swap16
 #undef swap32
 #undef swap64
-inline u16 swap16(u16 _data) {
-    return __swap16(_data);
+#endif
+inline u16 swap16(u16 data) {
+    return __builtin_bswap16(data);
 }
-inline u32 swap32(u32 _data) {
-    return __swap32(_data);
+inline u32 swap32(u32 data) {
+    return __builtin_bswap32(data);
 }
-inline u64 swap64(u64 _data) {
-    return __swap64(_data);
-}
-#elif defined(__DragonFly__) || defined(__FreeBSD__) || defined(__NetBSD__)
-inline u16 swap16(u16 _data) {
-    return bswap16(_data);
-}
-inline u32 swap32(u32 _data) {
-    return bswap32(_data);
-}
-inline u64 swap64(u64 _data) {
-    return bswap64(_data);
+inline u64 swap64(u64 data) {
+    return __builtin_bswap64(data);
 }
 #else
 // Slow generic implementation.

From 686d06727177c673392a2b7dbd98185739fed2c2 Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Thu, 11 Apr 2019 20:42:40 -0400
Subject: [PATCH 4/5] common/swap: Mark byte swapping free functions with
 [[nodiscard]] and noexcept

Allows the compiler to inform when the result of a swap function is
being ignored (which is 100% a bug in all usage scenarios). We also mark
them noexcept to allow other functions using them to be able to be
marked as noexcept and play nicely with things that potentially inspect
"nothrowability".
---
 src/common/swap.h | 22 +++++++++++-----------
 1 file changed, 11 insertions(+), 11 deletions(-)

diff --git a/src/common/swap.h b/src/common/swap.h
index fad0f35a7..086449028 100644
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -57,13 +57,13 @@
 namespace Common {
 
 #ifdef _MSC_VER
-inline u16 swap16(u16 data) {
+[[nodiscard]] inline u16 swap16(u16 data) noexcept {
     return _byteswap_ushort(data);
 }
-inline u32 swap32(u32 data) {
+[[nodiscard]] inline u32 swap32(u32 data) noexcept {
     return _byteswap_ulong(data);
 }
-inline u64 swap64(u64 data) {
+[[nodiscard]] inline u64 swap64(u64 data) noexcept {
     return _byteswap_uint64(data);
 }
 #elif defined(__clang__) || defined(__GNUC__)
@@ -73,29 +73,29 @@ inline u64 swap64(u64 data) {
 #undef swap32
 #undef swap64
 #endif
-inline u16 swap16(u16 data) {
+[[nodiscard]] inline u16 swap16(u16 data) noexcept {
     return __builtin_bswap16(data);
 }
-inline u32 swap32(u32 data) {
+[[nodiscard]] inline u32 swap32(u32 data) noexcept {
     return __builtin_bswap32(data);
 }
-inline u64 swap64(u64 data) {
+[[nodiscard]] inline u64 swap64(u64 data) noexcept {
     return __builtin_bswap64(data);
 }
 #else
 // Slow generic implementation.
-inline u16 swap16(u16 data) {
+[[nodiscard]] inline u16 swap16(u16 data) noexcept {
     return (data >> 8) | (data << 8);
 }
-inline u32 swap32(u32 data) {
+[[nodiscard]] inline u32 swap32(u32 data) noexcept {
     return (swap16(data) << 16) | swap16(data >> 16);
 }
-inline u64 swap64(u64 data) {
+[[nodiscard]] inline u64 swap64(u64 data) noexcept {
     return ((u64)swap32(data) << 32) | swap32(data >> 32);
 }
 #endif
 
-inline float swapf(float f) {
+[[nodiscard]] inline float swapf(float f) noexcept {
     static_assert(sizeof(u32) == sizeof(float), "float must be the same size as uint32_t.");
 
     u32 value;
@@ -107,7 +107,7 @@ inline float swapf(float f) {
     return f;
 }
 
-inline double swapd(double f) {
+[[nodiscard]] inline double swapd(double f) noexcept {
     static_assert(sizeof(u64) == sizeof(double), "double must be the same size as uint64_t.");
 
     u64 value;

From b8d43d4dfb7cc6840aa92f4588ec146a9892342f Mon Sep 17 00:00:00 2001
From: Lioncash <mathew1800@gmail.com>
Date: Thu, 11 Apr 2019 21:20:22 -0400
Subject: [PATCH 5/5] common/swap: Improve codegen of the default swap
 fallbacks

Uses arithmetic that can be identified more trivially by compilers for
optimizations. e.g. Rather than shifting the halves of the value and
then swapping and combining them, we can swap them in place.

e.g. for the original swap32 code on x86-64, clang 8.0 would generate:

    mov     ecx, edi
    rol     cx, 8
    shl     ecx, 16
    shr     edi, 16
    rol     di, 8
    movzx   eax, di
    or      eax, ecx
    ret

while GCC 8.3 would generate the ideal:

    mov     eax, edi
    bswap   eax
    ret

now both generate the same optimal output.

MSVC used to generate the following with the old code:

    mov     eax, ecx
    rol     cx, 8
    shr     eax, 16
    rol     ax, 8
    movzx   ecx, cx
    movzx   eax, ax
    shl     ecx, 16
    or      eax, ecx
    ret     0

Now MSVC also generates a similar, but equally optimal result as clang/GCC:

    bswap   ecx
    mov     eax, ecx
    ret     0

====

In the swap64 case, for the original code, clang 8.0 would generate:

    mov     eax, edi
    bswap   eax
    shl     rax, 32
    shr     rdi, 32
    bswap   edi
    or      rax, rdi
    ret

(almost there, but still missing the mark)

while, again, GCC 8.3 would generate the more ideal:

    mov     rax, rdi
    bswap   rax
    ret

now clang also generates the optimal sequence for this fallback as well.

This is a case where MSVC unfortunately falls short, despite the new
code, this one still generates a doozy of an output.

    mov     r8, rcx
    mov     r9, rcx
    mov     rax, 71776119061217280
    mov     rdx, r8
    and     r9, rax
    and     edx, 65280
    mov     rax, rcx
    shr     rax, 16
    or      r9, rax
    mov     rax, rcx
    shr     r9, 16
    mov     rcx, 280375465082880
    and     rax, rcx
    mov     rcx, 1095216660480
    or      r9, rax
    mov     rax, r8
    and     rax, rcx
    shr     r9, 16
    or      r9, rax
    mov     rcx, r8
    mov     rax, r8
    shr     r9, 8
    shl     rax, 16
    and     ecx, 16711680
    or      rdx, rax
    mov     eax, -16777216
    and     rax, r8
    shl     rdx, 16
    or      rdx, rcx
    shl     rdx, 16
    or      rax, rdx
    shl     rax, 8
    or      rax, r9
    ret     0

which is pretty unfortunate.
---
 src/common/swap.h | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/common/swap.h b/src/common/swap.h
index 086449028..71932c2bb 100644
--- a/src/common/swap.h
+++ b/src/common/swap.h
@@ -83,15 +83,19 @@ namespace Common {
     return __builtin_bswap64(data);
 }
 #else
-// Slow generic implementation.
+// Generic implementation.
 [[nodiscard]] inline u16 swap16(u16 data) noexcept {
     return (data >> 8) | (data << 8);
 }
 [[nodiscard]] inline u32 swap32(u32 data) noexcept {
-    return (swap16(data) << 16) | swap16(data >> 16);
+    return ((data & 0xFF000000U) >> 24) | ((data & 0x00FF0000U) >> 8) |
+           ((data & 0x0000FF00U) << 8) | ((data & 0x000000FFU) << 24);
 }
 [[nodiscard]] inline u64 swap64(u64 data) noexcept {
-    return ((u64)swap32(data) << 32) | swap32(data >> 32);
+    return ((data & 0xFF00000000000000ULL) >> 56) | ((data & 0x00FF000000000000ULL) >> 40) |
+           ((data & 0x0000FF0000000000ULL) >> 24) | ((data & 0x000000FF00000000ULL) >> 8) |
+           ((data & 0x00000000FF000000ULL) << 8) | ((data & 0x0000000000FF0000ULL) << 24) |
+           ((data & 0x000000000000FF00ULL) << 40) | ((data & 0x00000000000000FFULL) << 56);
 }
 #endif