From 72e7f5b4dd08e9ea46ee049d8f2564a8808273d4 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Sat, 3 Jun 2023 09:21:52 +0300 Subject: [PATCH 1/8] renderer_vulkan: Add suport for debug report callback --- .../renderer_vulkan/renderer_vulkan.cpp | 18 +++++- .../renderer_vulkan/renderer_vulkan.h | 5 +- .../vulkan_common/vulkan_debug_callback.cpp | 40 +++++++++++-- .../vulkan_common/vulkan_debug_callback.h | 4 +- .../vulkan_common/vulkan_device.cpp | 2 +- .../vulkan_common/vulkan_instance.cpp | 58 ++++++++++--------- .../vulkan_common/vulkan_wrapper.cpp | 14 +++++ src/video_core/vulkan_common/vulkan_wrapper.h | 9 +++ 8 files changed, 113 insertions(+), 37 deletions(-) diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index ddf28ca282..454bb66a47 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -12,6 +12,7 @@ #include #include "common/logging/log.h" +#include "common/polyfill_ranges.h" #include "common/scope_exit.h" #include "common/settings.h" #include "common/telemetry.h" @@ -65,6 +66,21 @@ std::string BuildCommaSeparatedExtensions( return fmt::format("{}", fmt::join(available_extensions, ",")); } +DebugCallback MakeDebugCallback(const vk::Instance& instance, const vk::InstanceDispatch& dld) { + if (!Settings::values.renderer_debug) { + return DebugCallback{}; + } + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + const auto it = std::ranges::find_if(*properties, [](const auto& prop) { + return std::strcmp(VK_EXT_DEBUG_UTILS_EXTENSION_NAME, prop.extensionName) == 0; + }); + if (it != properties->end()) { + return CreateDebugUtilsCallback(instance); + } else { + return CreateDebugReportCallback(instance); + } +} + } // Anonymous namespace Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, @@ -87,7 +103,7 @@ RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, cpu_memory(cpu_memory_), gpu(gpu_), library(OpenLibrary(context.get())), instance(CreateInstance(*library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, Settings::values.renderer_debug.GetValue())), - debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), + debug_callback(MakeDebugCallback(instance, dld)), surface(CreateSurface(instance, render_window.GetWindowInfo())), device(CreateDevice(instance, dld, *surface)), memory_allocator(device), state_tracker(), scheduler(device, state_tracker), diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index b2e8cbd1bb..ca22c0baa3 100644 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -5,6 +5,7 @@ #include #include +#include #include "common/dynamic_library.h" #include "video_core/renderer_base.h" @@ -33,6 +34,8 @@ class GPU; namespace Vulkan { +using DebugCallback = std::variant; + Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, VkSurfaceKHR surface); @@ -71,7 +74,7 @@ private: vk::InstanceDispatch dld; vk::Instance instance; - vk::DebugUtilsMessenger debug_callback; + DebugCallback debug_callback; vk::SurfaceKHR surface; ScreenInfo screen_info; diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.cpp b/src/video_core/vulkan_common/vulkan_debug_callback.cpp index 9de484c294..67e8065a47 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.cpp +++ b/src/video_core/vulkan_common/vulkan_debug_callback.cpp @@ -7,10 +7,10 @@ namespace Vulkan { namespace { -VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, - VkDebugUtilsMessageTypeFlagsEXT type, - const VkDebugUtilsMessengerCallbackDataEXT* data, - [[maybe_unused]] void* user_data) { +VkBool32 DebugUtilCallback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, + VkDebugUtilsMessageTypeFlagsEXT type, + const VkDebugUtilsMessengerCallbackDataEXT* data, + [[maybe_unused]] void* user_data) { // Skip logging known false-positive validation errors switch (static_cast(data->messageIdNumber)) { #ifdef ANDROID @@ -62,9 +62,26 @@ VkBool32 Callback(VkDebugUtilsMessageSeverityFlagBitsEXT severity, } return VK_FALSE; } + +VkBool32 DebugReportCallback(VkDebugReportFlagsEXT flags, VkDebugReportObjectTypeEXT objectType, + uint64_t object, size_t location, int32_t messageCode, + const char* pLayerPrefix, const char* pMessage, void* pUserData) { + const VkDebugReportFlagBitsEXT severity = static_cast(flags); + const std::string_view message{pMessage}; + if (severity & VK_DEBUG_REPORT_ERROR_BIT_EXT) { + LOG_CRITICAL(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_WARNING_BIT_EXT) { + LOG_WARNING(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_INFORMATION_BIT_EXT) { + LOG_INFO(Render_Vulkan, "{}", message); + } else if (severity & VK_DEBUG_REPORT_DEBUG_BIT_EXT) { + LOG_DEBUG(Render_Vulkan, "{}", message); + } + return VK_FALSE; +} } // Anonymous namespace -vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { +vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance) { return instance.CreateDebugUtilsMessenger(VkDebugUtilsMessengerCreateInfoEXT{ .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_MESSENGER_CREATE_INFO_EXT, .pNext = nullptr, @@ -76,7 +93,18 @@ vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance) { .messageType = VK_DEBUG_UTILS_MESSAGE_TYPE_GENERAL_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_VALIDATION_BIT_EXT | VK_DEBUG_UTILS_MESSAGE_TYPE_PERFORMANCE_BIT_EXT, - .pfnUserCallback = Callback, + .pfnUserCallback = DebugUtilCallback, + .pUserData = nullptr, + }); +} + +vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance) { + return instance.CreateDebugReportCallback({ + .sType = VK_STRUCTURE_TYPE_DEBUG_REPORT_CALLBACK_CREATE_INFO_EXT, + .pNext = nullptr, + .flags = VK_DEBUG_REPORT_DEBUG_BIT_EXT | VK_DEBUG_REPORT_INFORMATION_BIT_EXT | + VK_DEBUG_REPORT_ERROR_BIT_EXT | VK_DEBUG_REPORT_WARNING_BIT_EXT, + .pfnCallback = DebugReportCallback, .pUserData = nullptr, }); } diff --git a/src/video_core/vulkan_common/vulkan_debug_callback.h b/src/video_core/vulkan_common/vulkan_debug_callback.h index 71b1f69eca..a8af7b406b 100644 --- a/src/video_core/vulkan_common/vulkan_debug_callback.h +++ b/src/video_core/vulkan_common/vulkan_debug_callback.h @@ -7,6 +7,8 @@ namespace Vulkan { -vk::DebugUtilsMessenger CreateDebugCallback(const vk::Instance& instance); +vk::DebugUtilsMessenger CreateDebugUtilsCallback(const vk::Instance& instance); + +vk::DebugReportCallback CreateDebugReportCallback(const vk::Instance& instance); } // namespace Vulkan diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index e4ca65b583..9743a82f5c 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -349,7 +349,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR const bool is_s8gen2 = device_id == 0x43050a01; const bool is_arm = driver_id == VK_DRIVER_ID_ARM_PROPRIETARY; - if ((is_mvk || is_qualcomm || is_turnip) && !is_suitable) { + if ((is_mvk || is_qualcomm || is_turnip || is_arm) && !is_suitable) { LOG_WARNING(Render_Vulkan, "Unsuitable driver, continuing anyway"); } else if (!is_suitable) { throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp index b6d83e4468..7624a9b324 100644 --- a/src/video_core/vulkan_common/vulkan_instance.cpp +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -31,10 +31,34 @@ namespace Vulkan { namespace { + +[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, + std::span extensions) { + const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); + if (!properties) { + LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); + return false; + } + for (const char* extension : extensions) { + const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { + return std::strcmp(extension, prop.extensionName) == 0; + }); + if (it == properties->end()) { + LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); + return false; + } + } + return true; +} + [[nodiscard]] std::vector RequiredExtensions( - Core::Frontend::WindowSystemType window_type, bool enable_validation) { + const vk::InstanceDispatch& dld, Core::Frontend::WindowSystemType window_type, + bool enable_validation) { std::vector extensions; extensions.reserve(6); +#ifdef __APPLE__ + extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); +#endif switch (window_type) { case Core::Frontend::WindowSystemType::Headless: break; @@ -66,35 +90,14 @@ namespace { extensions.push_back(VK_KHR_SURFACE_EXTENSION_NAME); } if (enable_validation) { - extensions.push_back(VK_EXT_DEBUG_UTILS_EXTENSION_NAME); + const bool debug_utils = + AreExtensionsSupported(dld, std::array{VK_EXT_DEBUG_UTILS_EXTENSION_NAME}); + extensions.push_back(debug_utils ? VK_EXT_DEBUG_UTILS_EXTENSION_NAME + : VK_EXT_DEBUG_REPORT_EXTENSION_NAME); } - extensions.push_back(VK_KHR_GET_PHYSICAL_DEVICE_PROPERTIES_2_EXTENSION_NAME); - -#ifdef __APPLE__ - extensions.push_back(VK_KHR_PORTABILITY_ENUMERATION_EXTENSION_NAME); -#endif return extensions; } -[[nodiscard]] bool AreExtensionsSupported(const vk::InstanceDispatch& dld, - std::span extensions) { - const std::optional properties = vk::EnumerateInstanceExtensionProperties(dld); - if (!properties) { - LOG_ERROR(Render_Vulkan, "Failed to query extension properties"); - return false; - } - for (const char* extension : extensions) { - const auto it = std::ranges::find_if(*properties, [extension](const auto& prop) { - return std::strcmp(extension, prop.extensionName) == 0; - }); - if (it == properties->end()) { - LOG_ERROR(Render_Vulkan, "Required instance extension {} is not available", extension); - return false; - } - } - return true; -} - [[nodiscard]] std::vector Layers(bool enable_validation) { std::vector layers; if (enable_validation) { @@ -138,7 +141,8 @@ vk::Instance CreateInstance(const Common::DynamicLibrary& library, vk::InstanceD LOG_ERROR(Render_Vulkan, "Failed to load Vulkan function pointers"); throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); } - const std::vector extensions = RequiredExtensions(window_type, enable_validation); + const std::vector extensions = + RequiredExtensions(dld, window_type, enable_validation); if (!AreExtensionsSupported(dld, extensions)) { throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 28fcb21a01..2fa29793ae 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -259,7 +259,9 @@ bool Load(VkInstance instance, InstanceDispatch& dld) noexcept { // These functions may fail to load depending on the enabled extensions. // Don't return a failure on these. X(vkCreateDebugUtilsMessengerEXT); + X(vkCreateDebugReportCallbackEXT); X(vkDestroyDebugUtilsMessengerEXT); + X(vkDestroyDebugReportCallbackEXT); X(vkDestroySurfaceKHR); X(vkGetPhysicalDeviceFeatures2); X(vkGetPhysicalDeviceProperties2); @@ -481,6 +483,11 @@ void Destroy(VkInstance instance, VkDebugUtilsMessengerEXT handle, dld.vkDestroyDebugUtilsMessengerEXT(instance, handle, nullptr); } +void Destroy(VkInstance instance, VkDebugReportCallbackEXT handle, + const InstanceDispatch& dld) noexcept { + dld.vkDestroyDebugReportCallbackEXT(instance, handle, nullptr); +} + void Destroy(VkInstance instance, VkSurfaceKHR handle, const InstanceDispatch& dld) noexcept { dld.vkDestroySurfaceKHR(instance, handle, nullptr); } @@ -549,6 +556,13 @@ DebugUtilsMessenger Instance::CreateDebugUtilsMessenger( return DebugUtilsMessenger(object, handle, *dld); } +DebugReportCallback Instance::CreateDebugReportCallback( + const VkDebugReportCallbackCreateInfoEXT& create_info) const { + VkDebugReportCallbackEXT object; + Check(dld->vkCreateDebugReportCallbackEXT(handle, &create_info, nullptr, &object)); + return DebugReportCallback(object, handle, *dld); +} + void Image::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE, name); } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 44fce47a5a..b5e70fcd46 100644 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -164,8 +164,10 @@ struct InstanceDispatch { PFN_vkEnumerateInstanceLayerProperties vkEnumerateInstanceLayerProperties{}; PFN_vkCreateDebugUtilsMessengerEXT vkCreateDebugUtilsMessengerEXT{}; + PFN_vkCreateDebugReportCallbackEXT vkCreateDebugReportCallbackEXT{}; PFN_vkCreateDevice vkCreateDevice{}; PFN_vkDestroyDebugUtilsMessengerEXT vkDestroyDebugUtilsMessengerEXT{}; + PFN_vkDestroyDebugReportCallbackEXT vkDestroyDebugReportCallbackEXT{}; PFN_vkDestroyDevice vkDestroyDevice{}; PFN_vkDestroySurfaceKHR vkDestroySurfaceKHR{}; PFN_vkEnumerateDeviceExtensionProperties vkEnumerateDeviceExtensionProperties{}; @@ -366,6 +368,7 @@ void Destroy(VkDevice, VkSwapchainKHR, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkSemaphore, const DeviceDispatch&) noexcept; void Destroy(VkDevice, VkShaderModule, const DeviceDispatch&) noexcept; void Destroy(VkInstance, VkDebugUtilsMessengerEXT, const InstanceDispatch&) noexcept; +void Destroy(VkInstance, VkDebugReportCallbackEXT, const InstanceDispatch&) noexcept; void Destroy(VkInstance, VkSurfaceKHR, const InstanceDispatch&) noexcept; VkResult Free(VkDevice, VkDescriptorPool, Span, const DeviceDispatch&) noexcept; @@ -581,6 +584,7 @@ private: }; using DebugUtilsMessenger = Handle; +using DebugReportCallback = Handle; using DescriptorSetLayout = Handle; using DescriptorUpdateTemplate = Handle; using Pipeline = Handle; @@ -613,6 +617,11 @@ public: DebugUtilsMessenger CreateDebugUtilsMessenger( const VkDebugUtilsMessengerCreateInfoEXT& create_info) const; + /// Creates a debug report callback. + /// @throw Exception on creation failure. + DebugReportCallback CreateDebugReportCallback( + const VkDebugReportCallbackCreateInfoEXT& create_info) const; + /// Returns dispatch table. const InstanceDispatch& Dispatch() const noexcept { return *dld; From a9b44d37e101c646b00ca73dffa06edcb7627dcd Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 18:58:21 +0300 Subject: [PATCH 2/8] renderer_vulkan: Don't add transform feedback flag if unsupported --- .../renderer_vulkan/vk_buffer_cache.cpp | 3 ++- .../renderer_vulkan/vk_staging_buffer_pool.cpp | 16 ++++++++++------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index 660f7c9ff8..b72f95235b 100644 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -590,7 +590,8 @@ void BufferCacheRuntime::ReserveNullBuffer() { .pNext = nullptr, .flags = 0, .size = 4, - .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .usage = VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_TRANSFER_DST_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, diff --git a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp index 62b251a9bd..ce92f66ab5 100644 --- a/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp +++ b/src/video_core/renderer_vulkan/vk_staging_buffer_pool.cpp @@ -38,18 +38,20 @@ size_t Region(size_t iterator) noexcept { StagingBufferPool::StagingBufferPool(const Device& device_, MemoryAllocator& memory_allocator_, Scheduler& scheduler_) : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_} { - const VkBufferCreateInfo stream_ci = { + VkBufferCreateInfo stream_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = STREAM_BUFFER_SIZE, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; + if (device.IsExtTransformFeedbackSupported()) { + stream_ci.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; + } stream_buffer = memory_allocator.CreateBuffer(stream_ci, MemoryUsage::Stream); if (device.HasDebuggingToolAttached()) { stream_buffer.SetObjectNameEXT("Stream Buffer"); @@ -164,19 +166,21 @@ std::optional StagingBufferPool::TryGetReservedBuffer(size_t s StagingBufferRef StagingBufferPool::CreateStagingBuffer(size_t size, MemoryUsage usage, bool deferred) { const u32 log2 = Common::Log2Ceil64(size); - const VkBufferCreateInfo buffer_ci = { + VkBufferCreateInfo buffer_ci = { .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, .size = 1ULL << log2, .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | - VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | - VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT, + VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }; + if (device.IsExtTransformFeedbackSupported()) { + buffer_ci.usage |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; + } vk::Buffer buffer = memory_allocator.CreateBuffer(buffer_ci, usage); if (device.HasDebuggingToolAttached()) { ++buffer_index; From c339af37a73de144fbbad706e43aefe278640cd7 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:03:16 +0300 Subject: [PATCH 3/8] renderer_vulkan: Respect viewport limit --- .../renderer_vulkan/vk_graphics_pipeline.cpp | 5 +++-- src/video_core/renderer_vulkan/vk_rasterizer.cpp | 16 ++++++++++++---- src/video_core/vulkan_common/vulkan_device.h | 4 ++++ 3 files changed, 19 insertions(+), 6 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp index c1595642ec..ad35cacacc 100644 --- a/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp +++ b/src/video_core/renderer_vulkan/vk_graphics_pipeline.cpp @@ -652,13 +652,14 @@ void GraphicsPipeline::MakePipeline(VkRenderPass render_pass) { .pNext = nullptr, .negativeOneToOne = key.state.ndc_minus_one_to_one.Value() != 0 ? VK_TRUE : VK_FALSE, }; + const u32 num_viewports = std::min(device.GetMaxViewports(), Maxwell::NumViewports); VkPipelineViewportStateCreateInfo viewport_ci{ .sType = VK_STRUCTURE_TYPE_PIPELINE_VIEWPORT_STATE_CREATE_INFO, .pNext = nullptr, .flags = 0, - .viewportCount = Maxwell::NumViewports, + .viewportCount = num_viewports, .pViewports = nullptr, - .scissorCount = Maxwell::NumViewports, + .scissorCount = num_viewports, .pScissors = nullptr, }; if (device.IsNvViewportSwizzleSupported()) { diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 84e3a30cc5..268b955fb0 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -925,7 +925,7 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg } const bool is_rescaling{texture_cache.IsRescaling()}; const float scale = is_rescaling ? Settings::values.resolution_info.up_factor : 1.0f; - const std::array viewports{ + const std::array viewport_list{ GetViewportState(device, regs, 0, scale), GetViewportState(device, regs, 1, scale), GetViewportState(device, regs, 2, scale), GetViewportState(device, regs, 3, scale), GetViewportState(device, regs, 4, scale), GetViewportState(device, regs, 5, scale), @@ -935,7 +935,11 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg GetViewportState(device, regs, 12, scale), GetViewportState(device, regs, 13, scale), GetViewportState(device, regs, 14, scale), GetViewportState(device, regs, 15, scale), }; - scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); + scheduler.Record([this, viewport_list](vk::CommandBuffer cmdbuf) { + const u32 num_viewports = std::min(device.GetMaxViewports(), Maxwell::NumViewports); + const vk::Span viewports(viewport_list.data(), num_viewports); + cmdbuf.SetViewport(0, viewports); + }); } void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs) { @@ -948,7 +952,7 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs up_scale = Settings::values.resolution_info.up_scale; down_shift = Settings::values.resolution_info.down_shift; } - const std::array scissors{ + const std::array scissor_list{ GetScissorState(regs, 0, up_scale, down_shift), GetScissorState(regs, 1, up_scale, down_shift), GetScissorState(regs, 2, up_scale, down_shift), @@ -966,7 +970,11 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs GetScissorState(regs, 14, up_scale, down_shift), GetScissorState(regs, 15, up_scale, down_shift), }; - scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); + scheduler.Record([this, scissor_list](vk::CommandBuffer cmdbuf) { + const u32 num_scissors = std::min(device.GetMaxViewports(), Maxwell::NumViewports); + const vk::Span scissors(scissor_list.data(), num_scissors); + cmdbuf.SetScissor(0, scissors); + }); } void RasterizerVulkan::UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs) { diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index b84af3dfb7..8a05a4fab7 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -588,6 +588,10 @@ public: return properties.properties.limits.maxVertexInputBindings; } + u32 GetMaxViewports() const { + return properties.properties.limits.maxViewports; + } + bool SupportsConditionalBarriers() const { return supports_conditional_barriers; } From 1522b956583aab463bd1576652d1794fe989437d Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:04:22 +0300 Subject: [PATCH 4/8] renderer_vulkan: Bump minimum SPIRV version * 1.3 is guaranteed on all 1.1 drivers --- src/video_core/vulkan_common/vulkan_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 8a05a4fab7..13ca24ef56 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -518,7 +518,7 @@ public: if (extensions.spirv_1_4) { return 0x00010400U; } - return 0x00010000U; + return 0x00010300U; } /// Returns true when a known debugging tool is attached. From 220a42896d350399f1f2d77432aa571ade3c9cdd Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:07:05 +0300 Subject: [PATCH 5/8] renderer_vulkan: Don't assume debug tool with debug renderer * Causes crashes because mali drivers don't support debug utils --- src/video_core/vulkan_common/vulkan_device.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 13ca24ef56..7be6311220 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -523,7 +523,7 @@ public: /// Returns true when a known debugging tool is attached. bool HasDebuggingToolAttached() const { - return has_renderdoc || has_nsight_graphics || Settings::values.renderer_debug.GetValue(); + return has_renderdoc || has_nsight_graphics; } /// @returns True if compute pipelines can cause crashing. From b8c96cee5f2eb0bd5ba9ef46746daec78ee3bb44 Mon Sep 17 00:00:00 2001 From: GPUCode Date: Mon, 5 Jun 2023 19:27:36 +0300 Subject: [PATCH 6/8] renderer_vulkan: Add more feature checking --- .../renderer_vulkan/vk_pipeline_cache.cpp | 7 ++++--- src/video_core/vulkan_common/vulkan_device.cpp | 4 ++++ src/video_core/vulkan_common/vulkan_device.h | 16 ++++++++++++++++ 3 files changed, 24 insertions(+), 3 deletions(-) diff --git a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp index 9f316113cc..d600c4e61a 100644 --- a/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_pipeline_cache.cpp @@ -309,7 +309,7 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_int16 = device.IsShaderInt16Supported(), .support_int64 = device.IsShaderInt64Supported(), .support_vertex_instance_id = false, - .support_float_controls = true, + .support_float_controls = device.IsKhrShaderFloatControlsSupported(), .support_separate_denorm_behavior = float_control.denormBehaviorIndependence == VK_SHADER_FLOAT_CONTROLS_INDEPENDENCE_ALL, .support_separate_rounding_mode = @@ -325,12 +325,13 @@ PipelineCache::PipelineCache(RasterizerVulkan& rasterizer_, const Device& device .support_fp64_signed_zero_nan_preserve = float_control.shaderSignedZeroInfNanPreserveFloat64 != VK_FALSE, .support_explicit_workgroup_layout = device.IsKhrWorkgroupMemoryExplicitLayoutSupported(), - .support_vote = true, + .support_vote = device.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_VOTE_BIT), .support_viewport_index_layer_non_geometry = device.IsExtShaderViewportIndexLayerSupported(), .support_viewport_mask = device.IsNvViewportArray2Supported(), .support_typeless_image_loads = device.IsFormatlessImageLoadSupported(), - .support_demote_to_helper_invocation = true, + .support_demote_to_helper_invocation = + device.IsExtShaderDemoteToHelperInvocationSupported(), .support_int64_atomics = device.IsExtShaderAtomicInt64Supported(), .support_derivative_control = true, .support_geometry_shader_passthrough = device.IsNvGeometryShaderPassthroughSupported(), diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 9743a82f5c..70436cf1c5 100644 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -905,6 +905,10 @@ bool Device::GetSuitability(bool requires_swapchain) { properties.driver.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_DRIVER_PROPERTIES; SetNext(next, properties.driver); + // Retrieve subgroup properties. + properties.subgroup_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_SUBGROUP_PROPERTIES; + SetNext(next, properties.subgroup_properties); + // Retrieve relevant extension properties. if (extensions.shader_float_controls) { properties.float_controls.sType = diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 7be6311220..e05d04db32 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -323,6 +323,11 @@ public: return properties.subgroup_size_control.requiredSubgroupSizeStages & stage; } + /// Returns true if the device supports the provided subgroup feature. + bool IsSubgroupFeatureSupported(VkSubgroupFeatureFlagBits feature) const { + return properties.subgroup_properties.supportedOperations & feature; + } + /// Returns the maximum number of push descriptors. u32 MaxPushDescriptors() const { return properties.push_descriptor.maxPushDescriptors; @@ -388,6 +393,11 @@ public: return extensions.swapchain_mutable_format; } + /// Returns true if VK_KHR_shader_float_controls is enabled. + bool IsKhrShaderFloatControlsSupported() const { + return extensions.shader_float_controls; + } + /// Returns true if the device supports VK_KHR_workgroup_memory_explicit_layout. bool IsKhrWorkgroupMemoryExplicitLayoutSupported() const { return extensions.workgroup_memory_explicit_layout; @@ -487,6 +497,11 @@ public: return extensions.shader_stencil_export; } + /// Returns true if the device supports VK_EXT_shader_demote_to_helper_invocation + bool IsExtShaderDemoteToHelperInvocationSupported() const { + return extensions.shader_demote_to_helper_invocation; + } + /// Returns true if the device supports VK_EXT_conservative_rasterization. bool IsExtConservativeRasterizationSupported() const { return extensions.conservative_rasterization; @@ -684,6 +699,7 @@ private: struct Properties { VkPhysicalDeviceDriverProperties driver{}; + VkPhysicalDeviceSubgroupProperties subgroup_properties{}; VkPhysicalDeviceFloatControlsProperties float_controls{}; VkPhysicalDevicePushDescriptorPropertiesKHR push_descriptor{}; VkPhysicalDeviceSubgroupSizeControlProperties subgroup_size_control{}; From eac46ad7ceca5e35b396a8b80bfc38dc6ef1a4fe Mon Sep 17 00:00:00 2001 From: GPUCode Date: Tue, 6 Jun 2023 23:10:06 +0300 Subject: [PATCH 7/8] video_core: Add BCn decoding support --- externals/CMakeLists.txt | 3 + externals/bc_decoder/bc_decoder.cpp | 1522 +++++++++++++++++ externals/bc_decoder/bc_decoder.h | 43 + src/video_core/CMakeLists.txt | 6 +- .../renderer_vulkan/maxwell_to_vk.cpp | 20 + .../renderer_vulkan/vk_rasterizer.cpp | 7 + .../renderer_vulkan/vk_texture_cache.cpp | 4 + src/video_core/surface.cpp | 22 + src/video_core/surface.h | 2 + src/video_core/texture_cache/decode_bc.cpp | 129 ++ .../{decode_bc4.h => decode_bc.h} | 6 +- src/video_core/texture_cache/decode_bc4.cpp | 96 -- src/video_core/texture_cache/util.cpp | 24 +- src/video_core/textures/bcn.cpp | 1 - src/video_core/textures/bcn.h | 9 +- src/video_core/vulkan_common/vulkan_device.h | 15 +- 16 files changed, 1789 insertions(+), 120 deletions(-) create mode 100644 externals/bc_decoder/bc_decoder.cpp create mode 100644 externals/bc_decoder/bc_decoder.h create mode 100644 src/video_core/texture_cache/decode_bc.cpp rename src/video_core/texture_cache/{decode_bc4.h => decode_bc.h} (50%) delete mode 100644 src/video_core/texture_cache/decode_bc4.cpp diff --git a/externals/CMakeLists.txt b/externals/CMakeLists.txt index 0184289eba..4ff5888510 100644 --- a/externals/CMakeLists.txt +++ b/externals/CMakeLists.txt @@ -157,6 +157,9 @@ endif() add_library(stb stb/stb_dxt.cpp) target_include_directories(stb PUBLIC ./stb) +add_library(bc_decoder bc_decoder/bc_decoder.cpp) +target_include_directories(bc_decoder PUBLIC ./bc_decoder) + if (ANDROID) if (ARCHITECTURE_arm64) add_subdirectory(libadrenotools) diff --git a/externals/bc_decoder/bc_decoder.cpp b/externals/bc_decoder/bc_decoder.cpp new file mode 100644 index 0000000000..536c44f34b --- /dev/null +++ b/externals/bc_decoder/bc_decoder.cpp @@ -0,0 +1,1522 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) +// Copyright 2019 The SwiftShader Authors. All Rights Reserved. + +// This BCn Decoder is directly derivative of Swiftshader's BCn Decoder found at: https://github.com/google/swiftshader/blob/d070309f7d154d6764cbd514b1a5c8bfcef61d06/src/Device/BC_Decoder.cpp +// This file does not follow the Skyline code conventions but has certain Skyline specific code +// There are a lot of implicit and narrowing conversions in this file due to this (Warnings are disabled as a result) + +#include +#include +#include +#include + +namespace { + constexpr int BlockWidth = 4; + constexpr int BlockHeight = 4; + + struct BC_color { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, bool hasAlphaChannel, bool hasSeparateAlpha) const { + Color c[4]; + c[0].extract565(c0); + c[1].extract565(c1); + if (hasSeparateAlpha || (c0 > c1)) { + c[2] = ((c[0] * 2) + c[1]) / 3; + c[3] = ((c[1] * 2) + c[0]) / 3; + } else { + c[2] = (c[0] + c[1]) >> 1; + if (hasAlphaChannel) { + c[3].clearAlpha(); + } + } + + for (int j = 0; j < BlockHeight && (y + j) < dstH; j++) { + size_t dstOffset = j * dstPitch; + size_t idxOffset = j * BlockHeight; + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, idxOffset++, dstOffset += dstBpp) { + *reinterpret_cast(dst + dstOffset) = c[getIdx(idxOffset)].pack8888(); + } + } + } + + private: + struct Color { + Color() { + c[0] = c[1] = c[2] = 0; + c[3] = 0xFF000000; + } + + void extract565(const unsigned int c565) { + c[0] = ((c565 & 0x0000001F) << 3) | ((c565 & 0x0000001C) >> 2); + c[1] = ((c565 & 0x000007E0) >> 3) | ((c565 & 0x00000600) >> 9); + c[2] = ((c565 & 0x0000F800) >> 8) | ((c565 & 0x0000E000) >> 13); + } + + unsigned int pack8888() const { + return ((c[0] & 0xFF) << 16) | ((c[1] & 0xFF) << 8) | (c[2] & 0xFF) | c[3]; + } + + void clearAlpha() { + c[3] = 0; + } + + Color operator*(int factor) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] * factor; + } + return res; + } + + Color operator/(int factor) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] / factor; + } + return res; + } + + Color operator>>(int shift) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] >> shift; + } + return res; + } + + Color operator+(Color const &obj) const { + Color res; + for (int i = 0; i < 4; ++i) { + res.c[i] = c[i] + obj.c[i]; + } + return res; + } + + private: + int c[4]; + }; + + size_t getIdx(int i) const { + size_t offset = i << 1; // 2 bytes per index + return (idx & (0x3 << offset)) >> offset; + } + + unsigned short c0; + unsigned short c1; + unsigned int idx; + }; + static_assert(sizeof(BC_color) == 8, "BC_color must be 8 bytes"); + + struct BC_channel { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, size_t channel, bool isSigned) const { + int c[8] = {0}; + + if (isSigned) { + c[0] = static_cast(data & 0xFF); + c[1] = static_cast((data & 0xFF00) >> 8); + } else { + c[0] = static_cast(data & 0xFF); + c[1] = static_cast((data & 0xFF00) >> 8); + } + + if (c[0] > c[1]) { + for (int i = 2; i < 8; ++i) { + c[i] = ((8 - i) * c[0] + (i - 1) * c[1]) / 7; + } + } else { + for (int i = 2; i < 6; ++i) { + c[i] = ((6 - i) * c[0] + (i - 1) * c[1]) / 5; + } + c[6] = isSigned ? -128 : 0; + c[7] = isSigned ? 127 : 255; + } + + for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++) { + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++) { + dst[channel + (i * dstBpp) + (j * dstPitch)] = static_cast(c[getIdx((j * BlockHeight) + i)]); + } + } + } + + private: + uint8_t getIdx(int i) const { + int offset = i * 3 + 16; + return static_cast((data & (0x7ull << offset)) >> offset); + } + + uint64_t data; + }; + static_assert(sizeof(BC_channel) == 8, "BC_channel must be 8 bytes"); + + struct BC_alpha { + void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp) const { + dst += 3; // Write only to alpha (channel 3) + for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++, dst += dstPitch) { + uint8_t *dstRow = dst; + for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, dstRow += dstBpp) { + *dstRow = getAlpha(j * BlockHeight + i); + } + } + } + + private: + uint8_t getAlpha(int i) const { + int offset = i << 2; + int alpha = (data & (0xFull << offset)) >> offset; + return static_cast(alpha | (alpha << 4)); + } + + uint64_t data; + }; + static_assert(sizeof(BC_alpha) == 8, "BC_alpha must be 8 bytes"); + + namespace BC6H { + static constexpr int MaxPartitions = 64; + + // @fmt:off + + static constexpr uint8_t PartitionTable2[MaxPartitions][16] = { + { 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1 }, + { 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1 }, + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1 }, + { 0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1 }, + { 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0 }, + { 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0 }, + { 0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0 }, + { 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1 }, + { 0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0 }, + { 0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0 }, + { 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1 }, + { 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1 }, + { 0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0 }, + { 0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0 }, + { 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0 }, + { 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0 }, + { 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1 }, + { 0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1 }, + { 0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0 }, + { 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0 }, + { 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0 }, + { 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0 }, + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, + { 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0 }, + { 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0 }, + { 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1 }, + { 0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1 }, + { 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1 }, + { 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1 }, + { 0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1 }, + { 0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0 }, + { 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0 }, + { 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1 }, + }; + + static constexpr uint8_t AnchorTable2[MaxPartitions] = { + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, + 0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf, + 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2, + 0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf, + 0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6, + 0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2, + 0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf, + }; + + // @fmt:on + + // 1.0f in half-precision floating point format + static constexpr uint16_t halfFloat1 = 0x3C00; + union Color { + struct RGBA { + uint16_t r = 0; + uint16_t g = 0; + uint16_t b = 0; + uint16_t a = halfFloat1; + + RGBA(uint16_t r, uint16_t g, uint16_t b) + : r(r), g(g), b(b) { + } + + RGBA &operator=(const RGBA &other) { + this->r = other.r; + this->g = other.g; + this->b = other.b; + this->a = halfFloat1; + + return *this; + } + }; + + Color(uint16_t r, uint16_t g, uint16_t b) + : rgba(r, g, b) { + } + + Color(int r, int g, int b) + : rgba((uint16_t) r, (uint16_t) g, (uint16_t) b) { + } + + Color() {} + + Color(const Color &other) { + this->rgba = other.rgba; + } + + Color &operator=(const Color &other) { + this->rgba = other.rgba; + + return *this; + } + + RGBA rgba; + uint16_t channel[4]; + }; + static_assert(sizeof(Color) == 8, "BC6h::Color must be 8 bytes long"); + + inline int32_t extendSign(int32_t val, size_t size) { + // Suppose we have a 2-bit integer being stored in 4 bit variable: + // x = 0b00AB + // + // In order to sign extend x, we need to turn the 0s into A's: + // x_extend = 0bAAAB + // + // We can do that by flipping A in x then subtracting 0b0010 from x. + // Suppose A is 1: + // x = 0b001B + // x_flip = 0b000B + // x_minus = 0b111B + // Since A is flipped to 0, subtracting the mask sets it and all the bits above it to 1. + // And if A is 0: + // x = 0b000B + // x_flip = 0b001B + // x_minus = 0b000B + // We unset the bit we flipped, and touch no other bit + uint16_t mask = 1u << (size - 1); + return (val ^ mask) - mask; + } + + static int constexpr RGBfChannels = 3; + struct RGBf { + uint16_t channel[RGBfChannels]; + size_t size[RGBfChannels]; + bool isSigned; + + RGBf() { + static_assert(RGBfChannels == 3, "RGBf must have exactly 3 channels"); + static_assert(sizeof(channel) / sizeof(channel[0]) == RGBfChannels, "RGBf must have exactly 3 channels"); + static_assert(sizeof(channel) / sizeof(channel[0]) == sizeof(size) / sizeof(size[0]), "RGBf requires equally sized arrays for channels and channel sizes"); + + for (int i = 0; i < RGBfChannels; i++) { + channel[i] = 0; + size[i] = 0; + } + + isSigned = false; + } + + void extendSign() { + for (int i = 0; i < RGBfChannels; i++) { + channel[i] = BC6H::extendSign(channel[i], size[i]); + } + } + + // Assuming this is the delta, take the base-endpoint and transform this into + // a proper endpoint. + // + // The final computed endpoint is truncated to the base-endpoint's size; + void resolveDelta(RGBf base) { + for (int i = 0; i < RGBfChannels; i++) { + size[i] = base.size[i]; + channel[i] = (base.channel[i] + channel[i]) & ((1 << base.size[i]) - 1); + } + + // Per the spec: + // "For signed formats, the results of the delta calculation must be sign + // extended as well." + if (isSigned) { + extendSign(); + } + } + + void unquantize() { + if (isSigned) { + unquantizeSigned(); + } else { + unquantizeUnsigned(); + } + } + + void unquantizeUnsigned() { + for (int i = 0; i < RGBfChannels; i++) { + if (size[i] >= 15 || channel[i] == 0) { + continue; + } else if (channel[i] == ((1u << size[i]) - 1)) { + channel[i] = 0xFFFFu; + } else { + // Need 32 bits to avoid overflow + uint32_t tmp = channel[i]; + channel[i] = (uint16_t) (((tmp << 16) + 0x8000) >> size[i]); + } + size[i] = 16; + } + } + + void unquantizeSigned() { + for (int i = 0; i < RGBfChannels; i++) { + if (size[i] >= 16 || channel[i] == 0) { + continue; + } + + int16_t value = (int16_t)channel[i]; + int32_t result = value; + bool signBit = value < 0; + if (signBit) { + value = -value; + } + + if (value >= ((1 << (size[i] - 1)) - 1)) { + result = 0x7FFF; + } else { + // Need 32 bits to avoid overflow + int32_t tmp = value; + result = (((tmp << 15) + 0x4000) >> (size[i] - 1)); + } + + if (signBit) { + result = -result; + } + + channel[i] = (uint16_t) result; + size[i] = 16; + } + } + }; + + struct Data { + uint64_t low64; + uint64_t high64; + + Data() = default; + + Data(uint64_t low64, uint64_t high64) + : low64(low64), high64(high64) { + } + + // Consumes the lowest N bits from from low64 and high64 where N is: + // abs(MSB - LSB) + // MSB and LSB come from the block description of the BC6h spec and specify + // the location of the bits in the returned bitstring. + // + // If MSB < LSB, then the bits are reversed. Otherwise, the bitstring is read and + // shifted without further modification. + // + uint32_t consumeBits(uint32_t MSB, uint32_t LSB) { + bool reversed = MSB < LSB; + if (reversed) { + std::swap(MSB, LSB); + } + assert(MSB - LSB + 1 < sizeof(uint32_t) * 8); + + uint32_t numBits = MSB - LSB + 1; + uint32_t mask = (1 << numBits) - 1; + // Read the low N bits + uint32_t bits = (low64 & mask); + + low64 >>= numBits; + // Put the low N bits of high64 into the high 64-N bits of low64 + low64 |= (high64 & mask) << (sizeof(high64) * 8 - numBits); + high64 >>= numBits; + + if (reversed) { + uint32_t tmp = 0; + for (uint32_t numSwaps = 0; numSwaps < numBits; numSwaps++) { + tmp <<= 1; + tmp |= (bits & 1); + bits >>= 1; + } + + bits = tmp; + } + + return bits << LSB; + } + }; + + struct IndexInfo { + uint64_t value; + int numBits; + }; + +// Interpolates between two endpoints, then does a final unquantization step + Color interpolate(RGBf e0, RGBf e1, const IndexInfo &index, bool isSigned) { + static constexpr uint32_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64}; + static constexpr uint32_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30, + 34, 38, 43, 47, 51, 55, 60, 64}; + static constexpr uint32_t const *weightsN[] = { + nullptr, nullptr, nullptr, weights3, weights4 + }; + auto weights = weightsN[index.numBits]; + assert(weights != nullptr); + Color color; + uint32_t e0Weight = 64 - weights[index.value]; + uint32_t e1Weight = weights[index.value]; + + for (int i = 0; i < RGBfChannels; i++) { + int32_t e0Channel = e0.channel[i]; + int32_t e1Channel = e1.channel[i]; + + if (isSigned) { + e0Channel = extendSign(e0Channel, 16); + e1Channel = extendSign(e1Channel, 16); + } + + int32_t e0Value = e0Channel * e0Weight; + int32_t e1Value = e1Channel * e1Weight; + + uint32_t tmp = ((e0Value + e1Value + 32) >> 6); + + // Need to unquantize value to limit it to the legal range of half-precision + // floats. We do this by scaling by 31/32 or 31/64 depending on if the value + // is signed or unsigned. + if (isSigned) { + tmp = ((tmp & 0x80000000) != 0) ? (((~tmp + 1) * 31) >> 5) | 0x8000 : (tmp * 31) >> 5; + // Don't return -0.0f, just normalize it to 0.0f. + if (tmp == 0x8000) + tmp = 0; + } else { + tmp = (tmp * 31) >> 6; + } + + color.channel[i] = (uint16_t) tmp; + } + + return color; + } + + enum DataType { + // Endpoints + EP0 = 0, + EP1 = 1, + EP2 = 2, + EP3 = 3, + Mode, + Partition, + End, + }; + + enum Channel { + R = 0, + G = 1, + B = 2, + None, + }; + + struct DeltaBits { + size_t channel[3]; + + constexpr DeltaBits() + : channel{0, 0, 0} { + } + + constexpr DeltaBits(size_t r, size_t g, size_t b) + : channel{r, g, b} { + } + }; + + struct ModeDesc { + int number; + bool hasDelta; + int partitionCount; + int endpointBits; + DeltaBits deltaBits; + + constexpr ModeDesc() + : number(-1), hasDelta(false), partitionCount(0), endpointBits(0) { + } + + constexpr ModeDesc(int number, bool hasDelta, int partitionCount, int endpointBits, DeltaBits deltaBits) + : number(number), hasDelta(hasDelta), partitionCount(partitionCount), endpointBits(endpointBits), deltaBits(deltaBits) { + } + }; + + struct BlockDesc { + DataType type; + Channel channel; + int MSB; + int LSB; + ModeDesc modeDesc; + + constexpr BlockDesc() + : type(End), channel(None), MSB(0), LSB(0), modeDesc() { + } + + constexpr BlockDesc(const DataType type, Channel channel, int MSB, int LSB, ModeDesc modeDesc) + : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc(modeDesc) { + } + + constexpr BlockDesc(DataType type, Channel channel, int MSB, int LSB) + : type(type), channel(channel), MSB(MSB), LSB(LSB), modeDesc() { + } + }; + +// Turns a legal mode into an index into the BlockDesc table. +// Illegal or reserved modes return -1. + static int modeToIndex(uint8_t mode) { + if (mode <= 3) { + return mode; + } else if ((mode & 0x2) != 0) { + if (mode <= 18) { +// Turns 6 into 4, 7 into 5, 10 into 6, etc. + return (mode / 2) + 1 + (mode & 0x1); + } else if (mode == 22 || mode == 26 || mode == 30) { +// Turns 22 into 11, 26 into 12, etc. + return mode / 4 + 6; + } + } + + return -1; + } + +// Returns a description of the bitfields for each mode from the LSB +// to the MSB before the index data starts. +// +// The numbers come from the BC6h block description. Each BlockDesc in the +// {Type, Channel, MSB, LSB} +// * Type describes which endpoint this is, or if this is a mode, a partition +// number, or the end of the block description. +// * Channel describes one of the 3 color channels within an endpoint +// * MSB and LSB specificy: +// * The size of the bitfield being read +// * The position of the bitfield within the variable it is being read to +// * If the bitfield is stored in reverse bit order +// If MSB < LSB then the bitfield is stored in reverse order. The size of +// the bitfield is abs(MSB-LSB+1). And the position of the bitfield within +// the variable is min(LSB, MSB). +// +// Invalid or reserved modes return an empty list. + static constexpr int NumBlocks = 14; +// The largest number of descriptions within a block. + static constexpr int MaxBlockDescIndex = 26; + static constexpr BlockDesc blockDescs[NumBlocks][MaxBlockDescIndex] = { +// @fmt:off +// Mode 0, Index 0 +{ +{ Mode, None, 1, 0, { 0, true, 2, 10, { 5, 5, 5 } } }, +{ EP2, G, 4, 4 }, { EP2, B, 4, 4 }, { EP3, B, 4, 4 }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 }, +{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 }, +{ EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 1, Index 1 +{ +{ Mode, None, 1, 0, { 1, true, 2, 7, { 6, 6, 6 } } }, +{ EP2, G, 5, 5 }, { EP3, G, 5, 4 }, { EP0, R, 6, 0 }, +{ EP3, B, 1, 0 }, { EP2, B, 4, 4 }, { EP0, G, 6, 0 }, +{ EP2, B, 5, 5 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 }, +{ EP0, B, 6, 0 }, { EP3, B, 3, 3 }, { EP3, B, 5, 5 }, +{ EP3, B, 4, 4 }, { EP1, R, 5, 0 }, { EP2, G, 3, 0 }, +{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 5, 0 }, +{ EP2, B, 3, 0 }, { EP2, R, 5, 0 }, { EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 2, Index 2 +{ +{ Mode, None, 4, 0, { 2, true, 2, 11, { 5, 4, 4 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 4, 0 }, { EP0, R, 10, 10 }, { EP2, G, 3, 0 }, +{ EP1, G, 3, 0 }, { EP0, G, 10, 10 }, { EP3, B, 0, 0 }, +{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 3, Index 3 +{ +{ Mode, None, 4, 0, { 3, false, 1, 10, { 0, 0, 0 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 9, 0 }, { EP1, G, 9, 0 }, { EP1, B, 9, 0 }, +{ End, None, 0, 0}, +}, +// Mode 6, Index 4 +{ +{ Mode, None, 4, 0, { 6, true, 2, 11, { 4, 5, 4 } } }, // 1 1 +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP3, G, 4, 4 }, +{ EP2, G, 3, 0 }, { EP1, G, 4, 0 }, { EP0, G, 10, 10 }, +{ EP3, G, 3, 0 }, { EP1, B, 3, 0 }, { EP0, B, 10, 10 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 }, +{ EP3, B, 0, 0 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 }, // 18 19 +{ EP2, G, 4, 4 }, { EP3, B, 3, 3 }, // 2 21 +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 7, Index 5 +{ +{ Mode, None, 4, 0, { 7, true, 1, 11, { 9, 9, 9 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 8, 0 }, { EP0, R, 10, 10 }, { EP1, G, 8, 0 }, +{ EP0, G, 10, 10 }, { EP1, B, 8, 0 }, { EP0, B, 10, 10 }, +{ End, None, 0, 0}, +}, +// Mode 10, Index 6 +{ +{ Mode, None, 4, 0, { 10, true, 2, 11, { 4, 4, 5 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 10 }, { EP2, B, 4, 4 }, +{ EP2, G, 3, 0 }, { EP1, G, 3, 0 }, { EP0, G, 10, 10 }, +{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP0, B, 10, 10 }, { EP2, B, 3, 0 }, { EP2, R, 3, 0 }, +{ EP3, B, 1, 1 }, { EP3, B, 2, 2 }, { EP3, R, 3, 0 }, +{ EP3, B, 4, 4 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 11, Index 7 +{ +{ Mode, None, 4, 0, { 11, true, 1, 12, { 8, 8, 8 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 7, 0 }, { EP0, R, 10, 11 }, { EP1, G, 7, 0 }, +{ EP0, G, 10, 11 }, { EP1, B, 7, 0 }, { EP0, B, 10, 11 }, +{ End, None, 0, 0}, +}, +// Mode 14, Index 8 +{ +{ Mode, None, 4, 0, { 14, true, 2, 9, { 5, 5, 5 } } }, +{ EP0, R, 8, 0 }, { EP2, B, 4, 4 }, { EP0, G, 8, 0 }, +{ EP2, G, 4, 4 }, { EP0, B, 8, 0 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 4, 0 }, { EP3, B, 1, 1 }, { EP2, B, 3, 0 }, +{ EP2, R, 4, 0 }, { EP3, B, 2, 2 }, { EP3, R, 4, 0 }, +{ EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 15, Index 9 +{ +{ Mode, None, 4, 0, { 15, true, 1, 16, { 4, 4, 4 } } }, +{ EP0, R, 9, 0 }, { EP0, G, 9, 0 }, { EP0, B, 9, 0 }, +{ EP1, R, 3, 0 }, { EP0, R, 10, 15 }, { EP1, G, 3, 0 }, +{ EP0, G, 10, 15 }, { EP1, B, 3, 0 }, { EP0, B, 10, 15 }, +{ End, None, 0, 0}, +}, +// Mode 18, Index 10 +{ +{ Mode, None, 4, 0, { 18, true, 2, 8, { 6, 5, 5 } } }, +{ EP0, R, 7, 0 }, { EP3, G, 4, 4 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP3, B, 2, 2 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, B, 3, 3 }, { EP3, B, 4, 4 }, +{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 4, 0 }, +{ EP3, B, 0, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 5, 0 }, +{ EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 22, Index 11 +{ +{ Mode, None, 4, 0, { 22, true, 2, 8, { 5, 6, 5 } } }, +{ EP0, R, 7, 0 }, { EP3, B, 0, 0 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP2, G, 5, 5 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, G, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 5, 0 }, { EP3, G, 3, 0 }, { EP1, B, 4, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 26, Index 12 +{ +{ Mode, None, 4, 0, { 26, true, 2, 8, { 5, 5, 6 } } }, +{ EP0, R, 7, 0 }, { EP3, B, 1, 1 }, { EP2, B, 4, 4 }, +{ EP0, G, 7, 0 }, { EP2, B, 5, 5 }, { EP2, G, 4, 4 }, +{ EP0, B, 7, 0 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 4, 0 }, { EP3, G, 4, 4 }, { EP2, G, 3, 0 }, +{ EP1, G, 4, 0 }, { EP3, B, 0, 0 }, { EP3, G, 3, 0 }, +{ EP1, B, 5, 0 }, { EP2, B, 3, 0 }, { EP2, R, 4, 0 }, +{ EP3, B, 2, 2 }, { EP3, R, 4, 0 }, { EP3, B, 3, 3 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +}, +// Mode 30, Index 13 +{ +{ Mode, None, 4, 0, { 30, false, 2, 6, { 0, 0, 0 } } }, +{ EP0, R, 5, 0 }, { EP3, G, 4, 4 }, { EP3, B, 0, 0 }, +{ EP3, B, 1, 1 }, { EP2, B, 4, 4 }, { EP0, G, 5, 0 }, +{ EP2, G, 5, 5 }, { EP2, B, 5, 5 }, { EP3, B, 2, 2 }, +{ EP2, G, 4, 4 }, { EP0, B, 5, 0 }, { EP3, G, 5, 5 }, +{ EP3, B, 3, 3 }, { EP3, B, 5, 5 }, { EP3, B, 4, 4 }, +{ EP1, R, 5, 0 }, { EP2, G, 3, 0 }, { EP1, G, 5, 0 }, +{ EP3, G, 3, 0 }, { EP1, B, 5, 0 }, { EP2, B, 3, 0 }, +{ EP2, R, 5, 0 }, { EP3, R, 5, 0 }, +{ Partition, None, 4, 0 }, +{ End, None, 0, 0}, +} +// @fmt:on + }; + + struct Block { + uint64_t low64; + uint64_t high64; + + void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch, size_t dstBpp, bool isSigned) const { + uint8_t mode = 0; + Data data(low64, high64); + assert(dstBpp == sizeof(Color)); + + if ((data.low64 & 0x2) == 0) { + mode = data.consumeBits(1, 0); + } else { + mode = data.consumeBits(4, 0); + } + + int blockIndex = modeToIndex(mode); + // Handle illegal or reserved mode + if (blockIndex == -1) { + for (int y = 0; y < 4 && y + dstY < dstHeight; y++) { + for (int x = 0; x < 4 && x + dstX < dstWidth; x++) { + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + out->rgba = {0, 0, 0}; + } + } + return; + } + const BlockDesc *blockDesc = blockDescs[blockIndex]; + + RGBf e[4]; + e[0].isSigned = e[1].isSigned = e[2].isSigned = e[3].isSigned = isSigned; + + int partition = 0; + ModeDesc modeDesc; + for (int index = 0; blockDesc[index].type != End; index++) { + const BlockDesc desc = blockDesc[index]; + + switch (desc.type) { + case Mode: + modeDesc = desc.modeDesc; + assert(modeDesc.number == mode); + + e[0].size[0] = e[0].size[1] = e[0].size[2] = modeDesc.endpointBits; + for (int i = 0; i < RGBfChannels; i++) { + if (modeDesc.hasDelta) { + e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.deltaBits.channel[i]; + } else { + e[1].size[i] = e[2].size[i] = e[3].size[i] = modeDesc.endpointBits; + } + } + break; + case Partition: + partition |= data.consumeBits(desc.MSB, desc.LSB); + break; + case EP0: + case EP1: + case EP2: + case EP3: + e[desc.type].channel[desc.channel] |= data.consumeBits(desc.MSB, desc.LSB); + break; + default: + assert(false); + return; + } + } + + // Sign extension + if (isSigned) { + for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].extendSign(); + } + } else if (modeDesc.hasDelta) { + // Don't sign-extend the base endpoint in an unsigned format. + for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].extendSign(); + } + } + + // Turn the deltas into endpoints + if (modeDesc.hasDelta) { + for (int ep = 1; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].resolveDelta(e[0]); + } + } + + for (int ep = 0; ep < modeDesc.partitionCount * 2; ep++) { + e[ep].unquantize(); + } + + // Get the indices, calculate final colors, and output + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + int pixelNum = x + y * 4; + IndexInfo idx; + bool isAnchor = false; + int firstEndpoint = 0; + // Bc6H can have either 1 or 2 petitions depending on the mode. + // The number of petitions affects the number of indices with implicit + // leading 0 bits and the number of bits per index. + if (modeDesc.partitionCount == 1) { + idx.numBits = 4; + // There's an implicit leading 0 bit for the first idx + isAnchor = (pixelNum == 0); + } else { + idx.numBits = 3; + // There are 2 indices with implicit leading 0-bits. + isAnchor = ((pixelNum == 0) || (pixelNum == AnchorTable2[partition])); + firstEndpoint = PartitionTable2[partition][pixelNum] * 2; + } + + idx.value = data.consumeBits(idx.numBits - isAnchor - 1, 0); + + // Don't exit the loop early, we need to consume these index bits regardless if + // we actually output them or not. + if ((y + dstY >= dstHeight) || (x + dstX >= dstWidth)) { + continue; + } + + Color color = interpolate(e[firstEndpoint], e[firstEndpoint + 1], idx, isSigned); + auto out = reinterpret_cast(dst + dstBpp * x + dstPitch * y); + *out = color; + } + } + } + }; + + } // namespace BC6H + + namespace BC7 { +// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_bptc.txt +// https://docs.microsoft.com/en-us/windows/win32/direct3d11/bc7-format + + struct Bitfield { + int offset; + int count; + + constexpr Bitfield Then(const int bits) { return {offset + count, bits}; } + + constexpr bool operator==(const Bitfield &rhs) { + return offset == rhs.offset && count == rhs.count; + } + }; + + struct Mode { + const int IDX; // Mode index + const int NS; // Number of subsets in each partition + const int PB; // Partition bits + const int RB; // Rotation bits + const int ISB; // Index selection bits + const int CB; // Color bits + const int AB; // Alpha bits + const int EPB; // Endpoint P-bits + const int SPB; // Shared P-bits + const int IB; // Primary index bits per element + const int IBC; // Primary index bits total + const int IB2; // Secondary index bits per element + + constexpr int NumColors() const { return NS * 2; } + + constexpr Bitfield Partition() const { return {IDX + 1, PB}; } + + constexpr Bitfield Rotation() const { return Partition().Then(RB); } + + constexpr Bitfield IndexSelection() const { return Rotation().Then(ISB); } + + constexpr Bitfield Red(int idx) const { + return IndexSelection().Then(CB * idx).Then(CB); + } + + constexpr Bitfield Green(int idx) const { + return Red(NumColors() - 1).Then(CB * idx).Then(CB); + } + + constexpr Bitfield Blue(int idx) const { + return Green(NumColors() - 1).Then(CB * idx).Then(CB); + } + + constexpr Bitfield Alpha(int idx) const { + return Blue(NumColors() - 1).Then(AB * idx).Then(AB); + } + + constexpr Bitfield EndpointPBit(int idx) const { + return Alpha(NumColors() - 1).Then(EPB * idx).Then(EPB); + } + + constexpr Bitfield SharedPBit0() const { + return EndpointPBit(NumColors() - 1).Then(SPB); + } + + constexpr Bitfield SharedPBit1() const { + return SharedPBit0().Then(SPB); + } + + constexpr Bitfield PrimaryIndex(int offset, int count) const { + return SharedPBit1().Then(offset).Then(count); + } + + constexpr Bitfield SecondaryIndex(int offset, int count) const { + return SharedPBit1().Then(IBC + offset).Then(count); + } + }; + + static constexpr Mode Modes[] = { + // IDX NS PB RB ISB CB AB EPB SPB IB IBC, IB2 + /**/ {0x0, 0x3, 0x4, 0x0, 0x0, 0x4, 0x0, 0x1, 0x0, 0x3, 0x2d, 0x0}, +/**/ {0x1, 0x2, 0x6, 0x0, 0x0, 0x6, 0x0, 0x0, 0x1, 0x3, 0x2e, 0x0}, +/**/ {0x2, 0x3, 0x6, 0x0, 0x0, 0x5, 0x0, 0x0, 0x0, 0x2, 0x1d, 0x0}, +/**/ {0x3, 0x2, 0x6, 0x0, 0x0, 0x7, 0x0, 0x1, 0x0, 0x2, 0x1e, 0x0}, +/**/ {0x4, 0x1, 0x0, 0x2, 0x1, 0x5, 0x6, 0x0, 0x0, 0x2, 0x1f, 0x3}, +/**/ {0x5, 0x1, 0x0, 0x2, 0x0, 0x7, 0x8, 0x0, 0x0, 0x2, 0x1f, 0x2}, +/**/ {0x6, 0x1, 0x0, 0x0, 0x0, 0x7, 0x7, 0x1, 0x0, 0x4, 0x3f, 0x0}, +/**/ {0x7, 0x2, 0x6, 0x0, 0x0, 0x5, 0x5, 0x1, 0x0, 0x2, 0x1e, 0x0}, +/**/ {-1, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x0, 0x00, 0x0}, + }; + + static constexpr int MaxPartitions = 64; + static constexpr int MaxSubsets = 3; + + static constexpr uint8_t PartitionTable2[MaxPartitions][16] = { + {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, + {0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1}, + {0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 1}, + {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1, 1}, + {0, 1, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0}, + {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0}, + {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}, + {0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 1}, + {0, 0, 1, 1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0}, + {0, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 0, 0}, + {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0}, + {0, 0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 1, 1, 0, 0}, + {0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}, + {0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0}, + {0, 0, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1}, + {0, 1, 0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0}, + {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 1, 0, 0, 0, 0, 1, 1, 1, 1, 0, 0}, + {0, 1, 0, 1, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0}, + {0, 1, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 0, 1}, + {0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1}, + {0, 1, 1, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 1, 0}, + {0, 0, 0, 1, 0, 0, 1, 1, 1, 1, 0, 0, 1, 0, 0, 0}, + {0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 1, 1, 0, 0}, + {0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1, 0, 1, 1, 0}, + {0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 1, 1}, + {0, 1, 1, 0, 0, 1, 1, 0, 1, 0, 0, 1, 1, 0, 0, 1}, + {0, 0, 0, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 0, 0, 0}, + {0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0, 0, 0, 0}, + {0, 0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0}, + {0, 0, 0, 0, 0, 1, 0, 0, 1, 1, 1, 0, 0, 1, 0, 0}, + {0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 1, 1, 0, 1, 1, 0, 0, 1, 0, 0, 1}, + {0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0}, + {0, 0, 1, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0}, + {0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1}, + {0, 1, 1, 0, 0, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 1}, + {0, 1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1}, + {0, 0, 0, 1, 1, 0, 0, 0, 1, 1, 1, 0, 0, 1, 1, 1}, + {0, 0, 0, 0, 1, 1, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 0, 1, 1, 1, 0}, + {0, 1, 0, 0, 0, 1, 0, 0, 0, 1, 1, 1, 0, 1, 1, 1}, + }; + + static constexpr uint8_t PartitionTable3[MaxPartitions][16] = { + {0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 1, 2, 2, 2, 2}, + {0, 0, 0, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 2, 0, 0, 1, 2, 2, 1, 1, 2, 2, 1, 1}, + {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 1, 0, 1, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2}, + {0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 2, 2}, + {0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1}, + {0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2}, + {0, 0, 0, 0, 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 2, 2}, + {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2}, + {0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2, 0, 1, 1, 2}, + {0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2, 0, 1, 2, 2}, + {0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2, 1, 2, 2, 2}, + {0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0, 2, 2, 2, 0}, + {0, 0, 0, 1, 0, 0, 1, 1, 0, 1, 1, 2, 1, 1, 2, 2}, + {0, 1, 1, 1, 0, 0, 1, 1, 2, 0, 0, 1, 2, 2, 0, 0}, + {0, 0, 0, 0, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2}, + {0, 0, 2, 2, 0, 0, 2, 2, 0, 0, 2, 2, 1, 1, 1, 1}, + {0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2, 0, 2, 2, 2}, + {0, 0, 0, 1, 0, 0, 0, 1, 2, 2, 2, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2}, + {0, 0, 0, 0, 1, 1, 0, 0, 2, 2, 1, 0, 2, 2, 1, 0}, + {0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1, 0, 0, 0, 0}, + {0, 0, 1, 2, 0, 0, 1, 2, 1, 1, 2, 2, 2, 2, 2, 2}, + {0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1, 0, 1, 1, 0}, + {0, 0, 0, 0, 0, 1, 1, 0, 1, 2, 2, 1, 1, 2, 2, 1}, + {0, 0, 2, 2, 1, 1, 0, 2, 1, 1, 0, 2, 0, 0, 2, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 2, 0, 0, 2, 2, 2, 2, 2}, + {0, 0, 1, 1, 0, 1, 2, 2, 0, 1, 2, 2, 0, 0, 1, 1}, + {0, 0, 0, 0, 2, 0, 0, 0, 2, 2, 1, 1, 2, 2, 2, 1}, + {0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 2, 2, 2}, + {0, 2, 2, 2, 0, 0, 2, 2, 0, 0, 1, 2, 0, 0, 1, 1}, + {0, 0, 1, 1, 0, 0, 1, 2, 0, 0, 2, 2, 0, 2, 2, 2}, + {0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0, 0, 1, 2, 0}, + {0, 0, 0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0}, + {0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0, 1, 2, 0}, + {0, 1, 2, 0, 2, 0, 1, 2, 1, 2, 0, 1, 0, 1, 2, 0}, + {0, 0, 1, 1, 2, 2, 0, 0, 1, 1, 2, 2, 0, 0, 1, 1}, + {0, 0, 1, 1, 1, 1, 2, 2, 2, 2, 0, 0, 0, 0, 1, 1}, + {0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1}, + {0, 0, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2, 1, 1, 2, 2}, + {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 2, 2, 0, 0, 1, 1}, + {0, 2, 2, 0, 1, 2, 2, 1, 0, 2, 2, 0, 1, 2, 2, 1}, + {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 0, 1, 0, 1}, + {0, 0, 0, 0, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1, 2, 1}, + {0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 2, 2, 2, 2}, + {0, 2, 2, 2, 0, 1, 1, 1, 0, 2, 2, 2, 0, 1, 1, 1}, + {0, 0, 0, 2, 1, 1, 1, 2, 0, 0, 0, 2, 1, 1, 1, 2}, + {0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2, 2, 1, 1, 2}, + {0, 2, 2, 2, 0, 1, 1, 1, 0, 1, 1, 1, 0, 2, 2, 2}, + {0, 0, 0, 2, 1, 1, 1, 2, 1, 1, 1, 2, 0, 0, 0, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2, 2, 1, 1, 2}, + {0, 1, 1, 0, 0, 1, 1, 0, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 0, 2, 2, 0, 0, 1, 1, 0, 0, 1, 1, 0, 0, 2, 2}, + {0, 0, 2, 2, 1, 1, 2, 2, 1, 1, 2, 2, 0, 0, 2, 2}, + {0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 1, 1, 2}, + {0, 0, 0, 2, 0, 0, 0, 1, 0, 0, 0, 2, 0, 0, 0, 1}, + {0, 2, 2, 2, 1, 2, 2, 2, 0, 2, 2, 2, 1, 2, 2, 2}, + {0, 1, 0, 1, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2}, + {0, 1, 1, 1, 2, 0, 1, 1, 2, 2, 0, 1, 2, 2, 2, 0}, + }; + + static constexpr uint8_t AnchorTable2[MaxPartitions] = { +// @fmt:off +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0xf, +0x2, 0x8, 0x2, 0x2, 0x8, 0x8, 0x2, 0x2, +0xf, 0xf, 0x6, 0x8, 0x2, 0x8, 0xf, 0xf, +0x2, 0x8, 0x2, 0x2, 0x2, 0xf, 0xf, 0x6, +0x6, 0x2, 0x6, 0x8, 0xf, 0xf, 0x2, 0x2, +0xf, 0xf, 0xf, 0xf, 0xf, 0x2, 0x2, 0xf, +// @fmt:on + }; + + static constexpr uint8_t AnchorTable3a[MaxPartitions] = { +// @fmt:off +0x3, 0x3, 0xf, 0xf, 0x8, 0x3, 0xf, 0xf, +0x8, 0x8, 0x6, 0x6, 0x6, 0x5, 0x3, 0x3, +0x3, 0x3, 0x8, 0xf, 0x3, 0x3, 0x6, 0xa, +0x5, 0x8, 0x8, 0x6, 0x8, 0x5, 0xf, 0xf, +0x8, 0xf, 0x3, 0x5, 0x6, 0xa, 0x8, 0xf, +0xf, 0x3, 0xf, 0x5, 0xf, 0xf, 0xf, 0xf, +0x3, 0xf, 0x5, 0x5, 0x5, 0x8, 0x5, 0xa, +0x5, 0xa, 0x8, 0xd, 0xf, 0xc, 0x3, 0x3, +// @fmt:on + }; + + static constexpr uint8_t AnchorTable3b[MaxPartitions] = { +// @fmt:off +0xf, 0x8, 0x8, 0x3, 0xf, 0xf, 0x3, 0x8, +0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, 0x8, +0xf, 0x8, 0xf, 0x3, 0xf, 0x8, 0xf, 0x8, +0x3, 0xf, 0x6, 0xa, 0xf, 0xf, 0xa, 0x8, +0xf, 0x3, 0xf, 0xa, 0xa, 0x8, 0x9, 0xa, +0x6, 0xf, 0x8, 0xf, 0x3, 0x6, 0x6, 0x8, +0xf, 0x3, 0xf, 0xf, 0xf, 0xf, 0xf, 0xf, +0xf, 0xf, 0xf, 0xf, 0x3, 0xf, 0xf, 0x8, +// @fmt:on + }; + + struct Color { + struct RGB { + RGB() = default; + + RGB(uint8_t r, uint8_t g, uint8_t b) + : b(b), g(g), r(r) {} + + RGB(int r, int g, int b) + : b(static_cast(b)), g(static_cast(g)), r(static_cast(r)) {} + + RGB operator<<(int shift) const { return {r << shift, g << shift, b << shift}; } + + RGB operator>>(int shift) const { return {r >> shift, g >> shift, b >> shift}; } + + RGB operator|(int bits) const { return {r | bits, g | bits, b | bits}; } + + RGB operator|(const RGB &rhs) const { return {r | rhs.r, g | rhs.g, b | rhs.b}; } + + RGB operator+(const RGB &rhs) const { return {r + rhs.r, g + rhs.g, b + rhs.b}; } + + uint8_t b; + uint8_t g; + uint8_t r; + }; + + RGB rgb; + uint8_t a; + }; + + static_assert(sizeof(Color) == 4, "Color size must be 4 bytes"); + + struct Block { + constexpr uint64_t Get(const Bitfield &bf) const { + uint64_t mask = (1ULL << bf.count) - 1; + if (bf.offset + bf.count <= 64) { + return (low >> bf.offset) & mask; + } + if (bf.offset >= 64) { + return (high >> (bf.offset - 64)) & mask; + } + return ((low >> bf.offset) | (high << (64 - bf.offset))) & mask; + } + + const Mode &mode() const { + if ((low & 0b00000001) != 0) { + return Modes[0]; + } + if ((low & 0b00000010) != 0) { + return Modes[1]; + } + if ((low & 0b00000100) != 0) { + return Modes[2]; + } + if ((low & 0b00001000) != 0) { + return Modes[3]; + } + if ((low & 0b00010000) != 0) { + return Modes[4]; + } + if ((low & 0b00100000) != 0) { + return Modes[5]; + } + if ((low & 0b01000000) != 0) { + return Modes[6]; + } + if ((low & 0b10000000) != 0) { + return Modes[7]; + } + return Modes[8]; // Invalid mode + } + + struct IndexInfo { + uint64_t value; + int numBits; + }; + + uint8_t interpolate(uint8_t e0, uint8_t e1, const IndexInfo &index) const { + static constexpr uint16_t weights2[] = {0, 21, 43, 64}; + static constexpr uint16_t weights3[] = {0, 9, 18, 27, 37, 46, 55, 64}; + static constexpr uint16_t weights4[] = {0, 4, 9, 13, 17, 21, 26, 30, + 34, 38, 43, 47, 51, 55, 60, 64}; + static constexpr uint16_t const *weightsN[] = { + nullptr, nullptr, weights2, weights3, weights4 + }; + auto weights = weightsN[index.numBits]; + assert(weights != nullptr); + return (uint8_t) (((64 - weights[index.value]) * uint16_t(e0) + weights[index.value] * uint16_t(e1) + 32) >> 6); + } + + void decode(uint8_t *dst, size_t dstX, size_t dstY, size_t dstWidth, size_t dstHeight, size_t dstPitch) const { + auto const &mode = this->mode(); + + if (mode.IDX < 0) // Invalid mode: + { + for (size_t y = 0; y < 4 && y + dstY < dstHeight; y++) { + for (size_t x = 0; x < 4 && x + dstX < dstWidth; x++) { + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + out->rgb = {0, 0, 0}; + out->a = 0; + } + } + return; + } + + using Endpoint = std::array; + std::array subsets; + + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + subset[0].rgb.r = Get(mode.Red(i * 2 + 0)); + subset[0].rgb.g = Get(mode.Green(i * 2 + 0)); + subset[0].rgb.b = Get(mode.Blue(i * 2 + 0)); + subset[0].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 0)) : 255; + + subset[1].rgb.r = Get(mode.Red(i * 2 + 1)); + subset[1].rgb.g = Get(mode.Green(i * 2 + 1)); + subset[1].rgb.b = Get(mode.Blue(i * 2 + 1)); + subset[1].a = (mode.AB > 0) ? Get(mode.Alpha(i * 2 + 1)) : 255; + } + + if (mode.SPB > 0) { + auto pbit0 = Get(mode.SharedPBit0()); + auto pbit1 = Get(mode.SharedPBit1()); + subsets[0][0].rgb = (subsets[0][0].rgb << 1) | pbit0; + subsets[0][1].rgb = (subsets[0][1].rgb << 1) | pbit0; + subsets[1][0].rgb = (subsets[1][0].rgb << 1) | pbit1; + subsets[1][1].rgb = (subsets[1][1].rgb << 1) | pbit1; + } + + if (mode.EPB > 0) { + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + auto pbit0 = Get(mode.EndpointPBit(i * 2 + 0)); + auto pbit1 = Get(mode.EndpointPBit(i * 2 + 1)); + subset[0].rgb = (subset[0].rgb << 1) | pbit0; + subset[1].rgb = (subset[1].rgb << 1) | pbit1; + if (mode.AB > 0) { + subset[0].a = (subset[0].a << 1) | pbit0; + subset[1].a = (subset[1].a << 1) | pbit1; + } + } + } + + auto const colorBits = mode.CB + mode.SPB + mode.EPB; + auto const alphaBits = mode.AB + mode.SPB + mode.EPB; + + for (size_t i = 0; i < mode.NS; i++) { + auto &subset = subsets[i]; + subset[0].rgb = subset[0].rgb << (8 - colorBits); + subset[1].rgb = subset[1].rgb << (8 - colorBits); + subset[0].rgb = subset[0].rgb | (subset[0].rgb >> colorBits); + subset[1].rgb = subset[1].rgb | (subset[1].rgb >> colorBits); + + if (mode.AB > 0) { + subset[0].a = subset[0].a << (8 - alphaBits); + subset[1].a = subset[1].a << (8 - alphaBits); + subset[0].a = subset[0].a | (subset[0].a >> alphaBits); + subset[1].a = subset[1].a | (subset[1].a >> alphaBits); + } + } + + int colorIndexBitOffset = 0; + int alphaIndexBitOffset = 0; + for (int y = 0; y < 4; y++) { + for (int x = 0; x < 4; x++) { + auto texelIdx = y * 4 + x; + auto partitionIdx = Get(mode.Partition()); + assert(partitionIdx < MaxPartitions); + auto subsetIdx = subsetIndex(mode, partitionIdx, texelIdx); + assert(subsetIdx < MaxSubsets); + auto const &subset = subsets[subsetIdx]; + + auto anchorIdx = anchorIndex(mode, partitionIdx, subsetIdx); + auto isAnchor = anchorIdx == texelIdx; + auto colorIdx = colorIndex(mode, isAnchor, colorIndexBitOffset); + auto alphaIdx = alphaIndex(mode, isAnchor, alphaIndexBitOffset); + + if (y + dstY >= dstHeight || x + dstX >= dstWidth) { + // Don't be tempted to skip early at the loops: + // The calls to colorIndex() and alphaIndex() adjust bit + // offsets that need to be carefully tracked. + continue; + } + + Color output; + // Note: We flip r and b channels past this point as the texture storage is BGR while the output is RGB + output.rgb.r = interpolate(subset[0].rgb.b, subset[1].rgb.b, colorIdx); + output.rgb.g = interpolate(subset[0].rgb.g, subset[1].rgb.g, colorIdx); + output.rgb.b = interpolate(subset[0].rgb.r, subset[1].rgb.r, colorIdx); + output.a = interpolate(subset[0].a, subset[1].a, alphaIdx); + + switch (Get(mode.Rotation())) { + default: + break; + case 1: + std::swap(output.a, output.rgb.b); + break; + case 2: + std::swap(output.a, output.rgb.g); + break; + case 3: + std::swap(output.a, output.rgb.r); + break; + } + + auto out = reinterpret_cast(dst + sizeof(Color) * x + dstPitch * y); + *out = output; + } + } + } + + int subsetIndex(const Mode &mode, int partitionIdx, int texelIndex) const { + switch (mode.NS) { + default: + return 0; + case 2: + return PartitionTable2[partitionIdx][texelIndex]; + case 3: + return PartitionTable3[partitionIdx][texelIndex]; + } + } + + int anchorIndex(const Mode &mode, int partitionIdx, int subsetIdx) const { + // ARB_texture_compression_bptc states: + // "In partition zero, the anchor index is always index zero. + // In other partitions, the anchor index is specified by tables + // Table.A2 and Table.A3."" + // Note: This is really confusing - I believe they meant subset instead + // of partition here. + switch (subsetIdx) { + default: + return 0; + case 1: + return mode.NS == 2 ? AnchorTable2[partitionIdx] : AnchorTable3a[partitionIdx]; + case 2: + return AnchorTable3b[partitionIdx]; + } + } + + IndexInfo colorIndex(const Mode &mode, bool isAnchor, + int &indexBitOffset) const { + // ARB_texture_compression_bptc states: + // "The index value for interpolating color comes from the secondary + // index for the texel if the format has an index selection bit and its + // value is one and from the primary index otherwise."" + auto idx = Get(mode.IndexSelection()); + assert(idx <= 1); + bool secondary = idx == 1; + auto numBits = secondary ? mode.IB2 : mode.IB; + auto numReadBits = numBits - (isAnchor ? 1 : 0); + auto index = + Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits) + : mode.PrimaryIndex(indexBitOffset, numReadBits)); + indexBitOffset += numReadBits; + return {index, numBits}; + } + + IndexInfo alphaIndex(const Mode &mode, bool isAnchor, + int &indexBitOffset) const { + // ARB_texture_compression_bptc states: + // "The alpha index comes from the secondary index if the block has a + // secondary index and the block either doesn't have an index selection + // bit or that bit is zero and the primary index otherwise." + auto idx = Get(mode.IndexSelection()); + assert(idx <= 1); + bool secondary = (mode.IB2 != 0) && (idx == 0); + auto numBits = secondary ? mode.IB2 : mode.IB; + auto numReadBits = numBits - (isAnchor ? 1 : 0); + auto index = + Get(secondary ? mode.SecondaryIndex(indexBitOffset, numReadBits) + : mode.PrimaryIndex(indexBitOffset, numReadBits)); + indexBitOffset += numReadBits; + return {index, numBits}; + } + + // Assumes little-endian + uint64_t low; + uint64_t high; + }; + + } // namespace BC7 +} // anonymous namespace + +namespace bcn { + constexpr size_t R8Bpp{1}; //!< The amount of bytes per pixel in R8 + constexpr size_t R8g8Bpp{2}; //!< The amount of bytes per pixel in R8G8 + constexpr size_t R8g8b8a8Bpp{4}; //!< The amount of bytes per pixel in R8G8B8A8 + constexpr size_t R16g16b16a16Bpp{8}; //!< The amount of bytes per pixel in R16G16B16 + + void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *color{reinterpret_cast(src)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, true, false); + } + + void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *alpha{reinterpret_cast(src)}; + const auto *color{reinterpret_cast(src + 8)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true); + alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp); + } + + void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *alpha{reinterpret_cast(src)}; + const auto *color{reinterpret_cast(src + 8)}; + size_t pitch{R8g8b8a8Bpp * width}; + color->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, false, true); + alpha->decode(dst, x, y, width, height, pitch, R8g8b8a8Bpp, 3, false); + } + + void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *red{reinterpret_cast(src)}; + size_t pitch{R8Bpp * width}; + red->decode(dst, x, y, width, height, pitch, R8Bpp, 0, isSigned); + } + + void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *red{reinterpret_cast(src)}; + const auto *green{reinterpret_cast(src + 8)}; + size_t pitch{R8g8Bpp * width}; + red->decode(dst, x, y, width, height, pitch, R8g8Bpp, 0, isSigned); + green->decode(dst, x, y, width, height, pitch, R8g8Bpp, 1, isSigned); + } + + void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned) { + const auto *block{reinterpret_cast(src)}; + size_t pitch{R16g16b16a16Bpp * width}; + block->decode(dst, x, y, width, height, pitch, R16g16b16a16Bpp, isSigned); + } + + void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height) { + const auto *block{reinterpret_cast(src)}; + size_t pitch{R8g8b8a8Bpp * width}; + block->decode(dst, x, y, width, height, pitch); + } +} diff --git a/externals/bc_decoder/bc_decoder.h b/externals/bc_decoder/bc_decoder.h new file mode 100644 index 0000000000..4f0ead7d38 --- /dev/null +++ b/externals/bc_decoder/bc_decoder.h @@ -0,0 +1,43 @@ +// SPDX-License-Identifier: MPL-2.0 +// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) + +#pragma once + +#include + +namespace bcn { + /** + * @brief Decodes a BC1 encoded image to R8G8B8A8 + */ + void DecodeBc1(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC2 encoded image to R8G8B8A8 + */ + void DecodeBc2(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC3 encoded image to R8G8B8A8 + */ + void DecodeBc3(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); + + /** + * @brief Decodes a BC4 encoded image to R8 + */ + void DecodeBc4(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC5 encoded image to R8G8 + */ + void DecodeBc5(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC6 encoded image to R16G16B16A16 + */ + void DecodeBc6(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height, bool isSigned); + + /** + * @brief Decodes a BC7 encoded image to R8G8B8A8 + */ + void DecodeBc7(const uint8_t *src, uint8_t *dst, size_t x, size_t y, size_t width, size_t height); +} diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e9e6f278d5..3b2fe01da0 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -220,8 +220,8 @@ add_library(video_core STATIC surface.h texture_cache/accelerated_swizzle.cpp texture_cache/accelerated_swizzle.h - texture_cache/decode_bc4.cpp - texture_cache/decode_bc4.h + texture_cache/decode_bc.cpp + texture_cache/decode_bc.h texture_cache/descriptor_table.h texture_cache/formatter.cpp texture_cache/formatter.h @@ -279,7 +279,7 @@ add_library(video_core STATIC create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC common core) -target_link_libraries(video_core PUBLIC glad shader_recompiler stb) +target_link_libraries(video_core PUBLIC glad shader_recompiler stb bc_decoder) if (YUZU_USE_BUNDLED_FFMPEG AND NOT (WIN32 OR ANDROID)) add_dependencies(video_core ffmpeg-build) diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index 9a0b105683..a8540339db 100644 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -259,6 +259,26 @@ FormatInfo SurfaceFormat(const Device& device, FormatType format_type, bool with break; } } + // Transcode on hardware that doesn't support BCn natively + if (!device.IsOptimalBcnSupported() && VideoCore::Surface::IsPixelFormatBCn(pixel_format)) { + const bool is_srgb = with_srgb && VideoCore::Surface::IsPixelFormatSRGB(pixel_format); + if (pixel_format == PixelFormat::BC4_SNORM) { + tuple.format = VK_FORMAT_R8_SNORM; + } else if (pixel_format == PixelFormat::BC4_UNORM) { + tuple.format = VK_FORMAT_R8_UNORM; + } else if (pixel_format == PixelFormat::BC5_SNORM) { + tuple.format = VK_FORMAT_R8G8_SNORM; + } else if (pixel_format == PixelFormat::BC5_UNORM) { + tuple.format = VK_FORMAT_R8G8_UNORM; + } else if (pixel_format == PixelFormat::BC6H_SFLOAT || + pixel_format == PixelFormat::BC6H_UFLOAT) { + tuple.format = VK_FORMAT_R16G16B16A16_SFLOAT; + } else if (is_srgb) { + tuple.format = VK_FORMAT_A8B8G8R8_SRGB_PACK32; + } else { + tuple.format = VK_FORMAT_A8B8G8R8_UNORM_PACK32; + } + } const bool attachable = (tuple.usage & Attachable) != 0; const bool storage = (tuple.usage & Storage) != 0; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index 268b955fb0..f7c0d939a5 100644 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -315,7 +315,14 @@ void RasterizerVulkan::Clear(u32 layer_count) { FlushWork(); gpu_memory->FlushCaching(); +#if ANDROID + if (Settings::IsGPULevelHigh()) { + // This is problematic on Android, disable on GPU Normal. + query_cache.UpdateCounters(); + } +#else query_cache.UpdateCounters(); +#endif auto& regs = maxwell3d->regs; const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ce6acc30c1..8385b5509d 100644 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -1279,6 +1279,10 @@ Image::Image(TextureCacheRuntime& runtime_, const ImageInfo& info_, GPUVAddr gpu flags |= VideoCommon::ImageFlagBits::Converted; flags |= VideoCommon::ImageFlagBits::CostlyLoad; } + if (IsPixelFormatBCn(info.format) && !runtime->device.IsOptimalBcnSupported()) { + flags |= VideoCommon::ImageFlagBits::Converted; + flags |= VideoCommon::ImageFlagBits::CostlyLoad; + } if (runtime->device.HasDebuggingToolAttached()) { original_image.SetObjectNameEXT(VideoCommon::Name(*this).c_str()); } diff --git a/src/video_core/surface.cpp b/src/video_core/surface.cpp index cb51529e4d..e16cd5e734 100644 --- a/src/video_core/surface.cpp +++ b/src/video_core/surface.cpp @@ -269,6 +269,28 @@ bool IsPixelFormatASTC(PixelFormat format) { } } +bool IsPixelFormatBCn(PixelFormat format) { + switch (format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC2_UNORM: + case PixelFormat::BC3_UNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC2_SRGB: + case PixelFormat::BC3_SRGB: + case PixelFormat::BC7_UNORM: + case PixelFormat::BC6H_UFLOAT: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC7_SRGB: + return true; + default: + return false; + } +} + bool IsPixelFormatSRGB(PixelFormat format) { switch (format) { case PixelFormat::A8B8G8R8_SRGB: diff --git a/src/video_core/surface.h b/src/video_core/surface.h index 0225d3287f..9b9c4d9bc2 100644 --- a/src/video_core/surface.h +++ b/src/video_core/surface.h @@ -501,6 +501,8 @@ SurfaceType GetFormatType(PixelFormat pixel_format); bool IsPixelFormatASTC(PixelFormat format); +bool IsPixelFormatBCn(PixelFormat format); + bool IsPixelFormatSRGB(PixelFormat format); bool IsPixelFormatInteger(PixelFormat format); diff --git a/src/video_core/texture_cache/decode_bc.cpp b/src/video_core/texture_cache/decode_bc.cpp new file mode 100644 index 0000000000..3e26474a3e --- /dev/null +++ b/src/video_core/texture_cache/decode_bc.cpp @@ -0,0 +1,129 @@ +// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project +// SPDX-License-Identifier: GPL-2.0-or-later + +#include +#include +#include +#include + +#include "common/common_types.h" +#include "video_core/texture_cache/decode_bc.h" + +namespace VideoCommon { + +namespace { +constexpr u32 BLOCK_SIZE = 4; + +using VideoCore::Surface::PixelFormat; + +constexpr bool IsSigned(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return true; + default: + return false; + } +} + +constexpr u32 BlockSize(PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_SRGB: + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 8; + default: + return 16; + } +} +} // Anonymous namespace + +u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + return 1; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + return 2; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + return 8; + default: + return 4; + } +} + +template +void DecompressBlocks(std::span input, std::span output, Extent3D extent, + bool is_signed = false) { + const u32 out_bpp = ConvertedBytesPerBlock(pixel_format); + const u32 block_width = std::min(extent.width, BLOCK_SIZE); + const u32 block_height = std::min(extent.height, BLOCK_SIZE); + const u32 pitch = extent.width * out_bpp; + size_t input_offset = 0; + size_t output_offset = 0; + for (u32 slice = 0; slice < extent.depth; ++slice) { + for (u32 y = 0; y < extent.height; y += block_height) { + size_t row_offset = 0; + for (u32 x = 0; x < extent.width; + x += block_width, row_offset += block_width * out_bpp) { + const u8* src = input.data() + input_offset; + u8* const dst = output.data() + output_offset + row_offset; + if constexpr (IsSigned(pixel_format)) { + decompress(src, dst, x, y, extent.width, extent.height, is_signed); + } else { + decompress(src, dst, x, y, extent.width, extent.height); + } + input_offset += BlockSize(pixel_format); + } + output_offset += block_height * pitch; + } + } +} + +void DecompressBCn(std::span input, std::span output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format) { + switch (pixel_format) { + case PixelFormat::BC1_RGBA_UNORM: + case PixelFormat::BC1_RGBA_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC2_UNORM: + case PixelFormat::BC2_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC3_UNORM: + case PixelFormat::BC3_SRGB: + DecompressBlocks(input, output, extent); + break; + case PixelFormat::BC4_SNORM: + case PixelFormat::BC4_UNORM: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC4_SNORM); + break; + case PixelFormat::BC5_SNORM: + case PixelFormat::BC5_UNORM: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC5_SNORM); + break; + case PixelFormat::BC6H_SFLOAT: + case PixelFormat::BC6H_UFLOAT: + DecompressBlocks( + input, output, extent, pixel_format == PixelFormat::BC6H_SFLOAT); + break; + case PixelFormat::BC7_SRGB: + case PixelFormat::BC7_UNORM: + DecompressBlocks(input, output, extent); + break; + default: + LOG_WARNING(HW_GPU, "Unimplemented BCn decompression {}", pixel_format); + } +} + +} // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.h b/src/video_core/texture_cache/decode_bc.h similarity index 50% rename from src/video_core/texture_cache/decode_bc4.h rename to src/video_core/texture_cache/decode_bc.h index ab2f735beb..41d1ec0a3a 100644 --- a/src/video_core/texture_cache/decode_bc4.h +++ b/src/video_core/texture_cache/decode_bc.h @@ -6,10 +6,14 @@ #include #include "common/common_types.h" +#include "video_core/surface.h" #include "video_core/texture_cache/types.h" namespace VideoCommon { -void DecompressBC4(std::span data, Extent3D extent, std::span output); +[[nodiscard]] u32 ConvertedBytesPerBlock(VideoCore::Surface::PixelFormat pixel_format); + +void DecompressBCn(std::span input, std::span output, Extent3D extent, + VideoCore::Surface::PixelFormat pixel_format); } // namespace VideoCommon diff --git a/src/video_core/texture_cache/decode_bc4.cpp b/src/video_core/texture_cache/decode_bc4.cpp deleted file mode 100644 index ef98afdcaa..0000000000 --- a/src/video_core/texture_cache/decode_bc4.cpp +++ /dev/null @@ -1,96 +0,0 @@ -// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project -// SPDX-License-Identifier: GPL-2.0-or-later - -#include -#include -#include - -#include "common/assert.h" -#include "common/common_types.h" -#include "video_core/texture_cache/decode_bc4.h" -#include "video_core/texture_cache/types.h" - -namespace VideoCommon { - -// https://www.khronos.org/registry/OpenGL/extensions/ARB/ARB_texture_compression_rgtc.txt -[[nodiscard]] constexpr u32 DecompressBlock(u64 bits, u32 x, u32 y) { - const u32 code_offset = 16 + 3 * (4 * y + x); - const u32 code = (bits >> code_offset) & 7; - const u32 red0 = (bits >> 0) & 0xff; - const u32 red1 = (bits >> 8) & 0xff; - if (red0 > red1) { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (6 * red0 + 1 * red1) / 7; - case 3: - return (5 * red0 + 2 * red1) / 7; - case 4: - return (4 * red0 + 3 * red1) / 7; - case 5: - return (3 * red0 + 4 * red1) / 7; - case 6: - return (2 * red0 + 5 * red1) / 7; - case 7: - return (1 * red0 + 6 * red1) / 7; - } - } else { - switch (code) { - case 0: - return red0; - case 1: - return red1; - case 2: - return (4 * red0 + 1 * red1) / 5; - case 3: - return (3 * red0 + 2 * red1) / 5; - case 4: - return (2 * red0 + 3 * red1) / 5; - case 5: - return (1 * red0 + 4 * red1) / 5; - case 6: - return 0; - case 7: - return 0xff; - } - } - return 0; -} - -void DecompressBC4(std::span input, Extent3D extent, std::span output) { - UNIMPLEMENTED_IF_MSG(extent.width % 4 != 0, "Unaligned width={}", extent.width); - UNIMPLEMENTED_IF_MSG(extent.height % 4 != 0, "Unaligned height={}", extent.height); - static constexpr u32 BLOCK_SIZE = 4; - size_t input_offset = 0; - for (u32 slice = 0; slice < extent.depth; ++slice) { - for (u32 block_y = 0; block_y < extent.height / 4; ++block_y) { - for (u32 block_x = 0; block_x < extent.width / 4; ++block_x) { - u64 bits; - std::memcpy(&bits, &input[input_offset], sizeof(bits)); - input_offset += sizeof(bits); - - for (u32 y = 0; y < BLOCK_SIZE; ++y) { - for (u32 x = 0; x < BLOCK_SIZE; ++x) { - const u32 linear_z = slice; - const u32 linear_y = block_y * BLOCK_SIZE + y; - const u32 linear_x = block_x * BLOCK_SIZE + x; - const u32 offset_z = linear_z * extent.width * extent.height; - const u32 offset_y = linear_y * extent.width; - const u32 offset_x = linear_x; - const u32 output_offset = (offset_z + offset_y + offset_x) * 4ULL; - const u32 color = DecompressBlock(bits, x, y); - output[output_offset + 0] = static_cast(color); - output[output_offset + 1] = 0; - output[output_offset + 2] = 0; - output[output_offset + 3] = 0xff; - } - } - } - } - } -} - -} // namespace VideoCommon diff --git a/src/video_core/texture_cache/util.cpp b/src/video_core/texture_cache/util.cpp index f781cb7a07..9a618a57a1 100644 --- a/src/video_core/texture_cache/util.cpp +++ b/src/video_core/texture_cache/util.cpp @@ -24,7 +24,7 @@ #include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/surface.h" -#include "video_core/texture_cache/decode_bc4.h" +#include "video_core/texture_cache/decode_bc.h" #include "video_core/texture_cache/format_lookup_table.h" #include "video_core/texture_cache/formatter.h" #include "video_core/texture_cache/samples_helper.h" @@ -61,8 +61,6 @@ using VideoCore::Surface::PixelFormatFromDepthFormat; using VideoCore::Surface::PixelFormatFromRenderTargetFormat; using VideoCore::Surface::SurfaceType; -constexpr u32 CONVERTED_BYTES_PER_BLOCK = BytesPerBlock(PixelFormat::A8B8G8R8_UNORM); - struct LevelInfo { Extent3D size; Extent3D block; @@ -612,7 +610,8 @@ u32 CalculateConvertedSizeBytes(const ImageInfo& info) noexcept { } return output_size; } - return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * CONVERTED_BYTES_PER_BLOCK; + return NumBlocksPerLayer(info, TILE_SIZE) * info.resources.layers * + ConvertedBytesPerBlock(info.format); } u32 CalculateLayerStride(const ImageInfo& info) noexcept { @@ -945,7 +944,8 @@ void ConvertImage(std::span input, const ImageInfo& info, std::span input, const ImageInfo& info, std::span input, const ImageInfo& info, std::span(copy.buffer_size); } else { - DecompressBC4(input_offset, copy.image_extent, output.subspan(output_offset)); - + const Extent3D image_extent{ + .width = copy.image_extent.width, + .height = copy.image_extent.height * copy.image_subresource.num_layers, + .depth = copy.image_extent.depth, + }; + DecompressBCn(input_offset, output.subspan(output_offset), image_extent, info.format); output_offset += copy.image_extent.width * copy.image_extent.height * - copy.image_subresource.num_layers * CONVERTED_BYTES_PER_BLOCK; + copy.image_subresource.num_layers * + ConvertedBytesPerBlock(info.format); } } } diff --git a/src/video_core/textures/bcn.cpp b/src/video_core/textures/bcn.cpp index 671212a49b..16ddbe3202 100644 --- a/src/video_core/textures/bcn.cpp +++ b/src/video_core/textures/bcn.cpp @@ -3,7 +3,6 @@ #include #include - #include "common/alignment.h" #include "video_core/textures/bcn.h" #include "video_core/textures/workers.h" diff --git a/src/video_core/textures/bcn.h b/src/video_core/textures/bcn.h index 6464af8856..d5d2a16c91 100644 --- a/src/video_core/textures/bcn.h +++ b/src/video_core/textures/bcn.h @@ -4,14 +4,13 @@ #pragma once #include -#include + +#include "common/common_types.h" namespace Tegra::Texture::BCN { -void CompressBC1(std::span data, uint32_t width, uint32_t height, uint32_t depth, - std::span output); +void CompressBC1(std::span data, u32 width, u32 height, u32 depth, std::span output); -void CompressBC3(std::span data, uint32_t width, uint32_t height, uint32_t depth, - std::span output); +void CompressBC3(std::span data, u32 width, u32 height, u32 depth, std::span output); } // namespace Tegra::Texture::BCN diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index e05d04db32..1f17265d52 100644 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -293,6 +293,11 @@ public: return features.features.textureCompressionASTC_LDR; } + /// Returns true if BCn is natively supported. + bool IsOptimalBcnSupported() const { + return features.features.textureCompressionBC; + } + /// Returns true if descriptor aliasing is natively supported. bool IsDescriptorAliasingSupported() const { return GetDriverID() != VK_DRIVER_ID_QUALCOMM_PROPRIETARY; @@ -423,6 +428,11 @@ public: return extensions.sampler_filter_minmax; } + /// Returns true if the device supports VK_EXT_shader_stencil_export. + bool IsExtShaderStencilExportSupported() const { + return extensions.shader_stencil_export; + } + /// Returns true if the device supports VK_EXT_depth_range_unrestricted. bool IsExtDepthRangeUnrestrictedSupported() const { return extensions.depth_range_unrestricted; @@ -492,11 +502,6 @@ public: return extensions.vertex_input_dynamic_state; } - /// Returns true if the device supports VK_EXT_shader_stencil_export. - bool IsExtShaderStencilExportSupported() const { - return extensions.shader_stencil_export; - } - /// Returns true if the device supports VK_EXT_shader_demote_to_helper_invocation bool IsExtShaderDemoteToHelperInvocationSupported() const { return extensions.shader_demote_to_helper_invocation; From ddcc95833660c57647d3e99dad76ecfa3b86ee8d Mon Sep 17 00:00:00 2001 From: GPUCode Date: Thu, 15 Jun 2023 12:17:46 +0300 Subject: [PATCH 8/8] renderer_vulkan: Prevent crashes when blitting depth stencil --- src/video_core/renderer_vulkan/blit_image.cpp | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/video_core/renderer_vulkan/blit_image.cpp b/src/video_core/renderer_vulkan/blit_image.cpp index cf2964a3f0..28d4b15a08 100644 --- a/src/video_core/renderer_vulkan/blit_image.cpp +++ b/src/video_core/renderer_vulkan/blit_image.cpp @@ -495,6 +495,9 @@ void BlitImageHelper::BlitDepthStencil(const Framebuffer* dst_framebuffer, const Region2D& dst_region, const Region2D& src_region, Tegra::Engines::Fermi2D::Filter filter, Tegra::Engines::Fermi2D::Operation operation) { + if (!device.IsExtShaderStencilExportSupported()) { + return; + } ASSERT(filter == Tegra::Engines::Fermi2D::Filter::Point); ASSERT(operation == Tegra::Engines::Fermi2D::Operation::SrcCopy); const BlitImagePipelineKey key{