mirror of
https://git.suyu.dev/suyu/suyu.git
synced 2024-11-22 14:52:45 +01:00
Query Cache: Implement host side sample counting.
This commit is contained in:
parent
2fea1b8407
commit
c8237d5c31
5 changed files with 348 additions and 48 deletions
|
@ -41,6 +41,7 @@ set(SHADER_FILES
|
||||||
pitch_unswizzle.comp
|
pitch_unswizzle.comp
|
||||||
present_bicubic.frag
|
present_bicubic.frag
|
||||||
present_gaussian.frag
|
present_gaussian.frag
|
||||||
|
queries_prefix_scan_sum.comp
|
||||||
resolve_conditional_render.comp
|
resolve_conditional_render.comp
|
||||||
smaa_edge_detection.vert
|
smaa_edge_detection.vert
|
||||||
smaa_edge_detection.frag
|
smaa_edge_detection.frag
|
||||||
|
|
124
src/video_core/host_shaders/queries_prefix_scan_sum.comp
Normal file
124
src/video_core/host_shaders/queries_prefix_scan_sum.comp
Normal file
|
@ -0,0 +1,124 @@
|
||||||
|
// SPDX-FileCopyrightText: Copyright 2015 Graham Sellers, Richard Wright Jr. and Nicholas Haemel
|
||||||
|
// SPDX-License-Identifier: MIT
|
||||||
|
|
||||||
|
// Code obtained from OpenGL SuperBible, Seventh Edition by Graham Sellers, Richard Wright Jr. and
|
||||||
|
// Nicholas Haemel. Modified to suit needs and optimize for subgroup
|
||||||
|
|
||||||
|
#version 460 core
|
||||||
|
|
||||||
|
#ifdef VULKAN
|
||||||
|
|
||||||
|
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||||
|
#define HAS_EXTENDED_TYPES 1
|
||||||
|
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
|
||||||
|
#define END_PUSH_CONSTANTS \
|
||||||
|
} \
|
||||||
|
;
|
||||||
|
#define UNIFORM(n)
|
||||||
|
#define BINDING_INPUT_BUFFER 0
|
||||||
|
#define BINDING_OUTPUT_IMAGE 1
|
||||||
|
|
||||||
|
#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv
|
||||||
|
|
||||||
|
#extension GL_KHR_shader_subgroup_arithmetic : enable
|
||||||
|
#extension GL_NV_gpu_shader5 : enable
|
||||||
|
#ifdef GL_NV_gpu_shader5
|
||||||
|
#define HAS_EXTENDED_TYPES 1
|
||||||
|
#else
|
||||||
|
#define HAS_EXTENDED_TYPES 0
|
||||||
|
#endif
|
||||||
|
#define BEGIN_PUSH_CONSTANTS
|
||||||
|
#define END_PUSH_CONSTANTS
|
||||||
|
#define UNIFORM(n) layout(location = n) uniform
|
||||||
|
#define BINDING_INPUT_BUFFER 0
|
||||||
|
#define BINDING_OUTPUT_IMAGE 0
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
|
BEGIN_PUSH_CONSTANTS
|
||||||
|
UNIFORM(0) uint max_accumulation_base;
|
||||||
|
UNIFORM(1) uint accumulation_limit;
|
||||||
|
END_PUSH_CONSTANTS
|
||||||
|
|
||||||
|
layout(local_size_x = 32) in;
|
||||||
|
|
||||||
|
// Values to be scanned, one 64-bit number per element stored as (low, high) words.
// The array is runtime-sized: main() reads elements id * 2 and id * 2 + 1, i.e. up to
// 2 * gl_WorkGroupSize.x entries, so a fixed size of gl_WorkGroupSize.x would be indexed
// out of bounds by the upper half of the work group.
layout(std430, binding = 0) readonly buffer block1 {
    uvec2 input_data[];
};

// Scan results, written at the same indices the inputs were read from.
// Runtime-sized for the same reason as input_data.
layout(std430, binding = 1) writeonly coherent buffer block2 {
    uvec2 output_data[];
};
|
||||||
|
|
||||||
|
layout(std430, binding = 2) coherent buffer block3 {
|
||||||
|
uvec2 accumulated_data;
|
||||||
|
};
|
||||||
|
|
||||||
|
shared uvec2 shared_data[gl_WorkGroupSize.x * 2];
|
||||||
|
|
||||||
|
// Adds two 64-bit unsigned integers, each packed as uvec2(low word, high word).
// The carry out of the low-word addition is propagated into the high word; any carry
// out of the high word is discarded (the sum wraps modulo 2^64).
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
    uint low_carry = 0;
    const uint low_word = uaddCarry(value_1.x, value_2.x, low_carry);
    const uint high_word = value_1.y + value_2.y + low_carry;
    return uvec2(low_word, high_word);
}
|
||||||
|
|
||||||
|
// Runs a work-group wide prefix sum over 64-bit values held as uvec2(low, high).
//
// Every invocation owns two consecutive elements (id * 2 and id * 2 + 1). The scan is
// performed in shared memory, then the carried-over total in accumulated_data is added to
// every element whose index is below max_accumulation_base, and the results are written
// to output_data. Invocation 0 finally stores the new carried-over total.
void main(void) {
    uint id = gl_LocalInvocationID.x;
    // Only elements located before max_accumulation_base inherit the previous total.
    uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0);
    uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0);
    uint work_size = gl_WorkGroupSize.x;
    uint rd_id;
    uint wr_id;
    uint mask;
    uvec2 input_1 = input_data[id * 2];
    uvec2 input_2 = input_data[id * 2 + 1];
    // The number of steps is the log base 2 of the
    // work group size, which should be a power of 2
    const uint steps = uint(log2(work_size)) + 1;
    uint step = 0;

    // Each invocation is responsible for the content of
    // two elements of the output array
    shared_data[id * 2] = input_1;
    shared_data[id * 2 + 1] = input_2;
    // Synchronize to make sure that everyone has initialized
    // their elements of shared_data[] with data loaded from
    // the input arrays
    barrier();
    memoryBarrierShared();
    // For each step...
    for (step = 0; step < steps; step++) {
        // Calculate the read and write index in the
        // shared array
        mask = (1 << step) - 1;
        rd_id = ((id >> step) << (step + 1)) + mask;
        wr_id = rd_id + 1 + (id & mask);
        // Accumulate the read data into our element

        shared_data[wr_id] = AddUint64(shared_data[rd_id], shared_data[wr_id]);
        // Synchronize again to make sure that everyone
        // has caught up with us
        barrier();
        memoryBarrierShared();
    }
    // Add the accumulation
    shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1);
    shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2);
    barrier();
    memoryBarrierShared();

    // Finally write our data back to the output buffer
    output_data[id * 2] = shared_data[id * 2];
    output_data[id * 2 + 1] = shared_data[id * 2 + 1];
    if (id == 0) {
        // Every element up to the limit already includes the previous total, so the
        // value at the limit is the new total as-is.
        if (max_accumulation_base >= accumulation_limit + 1) {
            accumulated_data = shared_data[accumulation_limit];
            return;
        }
        uvec2 value_1 = shared_data[max_accumulation_base];
        uvec2 value_2 = shared_data[accumulation_limit];
        // 64-bit subtraction value_1 - value_2 through two's complement. The negation has
        // to be done on the whole 64-bit number (~v + 1 with carry into the high word);
        // negating each 32-bit component on its own leaves the high word off by one
        // whenever the low word is non-zero.
        // NOTE(review): the operand order is kept from the original code - confirm that
        // value_1 - value_2 (rather than value_2 - value_1) is the intended difference.
        uvec2 negated_value_2 = AddUint64(~value_2, uvec2(1, 0));
        accumulated_data = AddUint64(value_1, negated_value_2);
    }
}
|
|
@ -12,6 +12,7 @@
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
#include "common/div_ceil.h"
|
#include "common/div_ceil.h"
|
||||||
#include "video_core/host_shaders/astc_decoder_comp_spv.h"
|
#include "video_core/host_shaders/astc_decoder_comp_spv.h"
|
||||||
|
#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
|
||||||
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
|
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
|
||||||
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
|
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
|
||||||
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
|
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
|
||||||
|
@ -58,6 +59,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE
|
||||||
},
|
},
|
||||||
}};
|
}};
|
||||||
|
|
||||||
|
constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{
|
||||||
|
{
|
||||||
|
.binding = 0,
|
||||||
|
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||||
|
.descriptorCount = 1,
|
||||||
|
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
.pImmutableSamplers = nullptr,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.binding = 1,
|
||||||
|
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||||
|
.descriptorCount = 1,
|
||||||
|
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
.pImmutableSamplers = nullptr,
|
||||||
|
},
|
||||||
|
{
|
||||||
|
.binding = 2,
|
||||||
|
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
|
||||||
|
.descriptorCount = 1,
|
||||||
|
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
|
||||||
|
.pImmutableSamplers = nullptr,
|
||||||
|
},
|
||||||
|
}};
|
||||||
|
|
||||||
constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
|
constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
|
||||||
.uniform_buffers = 0,
|
.uniform_buffers = 0,
|
||||||
.storage_buffers = 2,
|
.storage_buffers = 2,
|
||||||
|
@ -68,6 +93,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
|
||||||
.score = 2,
|
.score = 2,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Descriptor usage of the queries prefix scan pass: three storage buffers and nothing else.
// NOTE(review): score presumably weights descriptor pool bank selection and follows the
// number of descriptors, as INPUT_OUTPUT_BANK_INFO does - confirm against DescriptorPool.
constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{
    .uniform_buffers = 0,
    .storage_buffers = 3,
    .texture_buffers = 0,
    .image_buffers = 0,
    .textures = 0,
    .images = 0,
    .score = 3,
};
|
||||||
|
|
||||||
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
|
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
|
||||||
{
|
{
|
||||||
.binding = ASTC_BINDING_INPUT_BUFFER,
|
.binding = ASTC_BINDING_INPUT_BUFFER,
|
||||||
|
@ -104,6 +139,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT
|
||||||
.stride = sizeof(DescriptorUpdateEntry),
|
.stride = sizeof(DescriptorUpdateEntry),
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// Single update template entry that fills the three storage buffer bindings of the queries
// prefix scan pass, starting at binding 0, from consecutive DescriptorUpdateEntry elements.
constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{
    .dstBinding = 0,
    .dstArrayElement = 0,
    .descriptorCount = 3,
    .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
    .offset = 0,
    .stride = sizeof(DescriptorUpdateEntry),
};
|
||||||
|
|
||||||
constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
|
constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
|
||||||
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
|
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
|
||||||
{
|
{
|
||||||
|
@ -132,6 +176,11 @@ struct AstcPushConstants {
|
||||||
u32 block_height;
|
u32 block_height;
|
||||||
u32 block_height_mask;
|
u32 block_height_mask;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
struct QueriesPrefixScanPushConstants {
|
||||||
|
u32 max_accumulation_base;
|
||||||
|
u32 accumulation_limit;
|
||||||
|
};
|
||||||
} // Anonymous namespace
|
} // Anonymous namespace
|
||||||
|
|
||||||
ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
|
ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
|
||||||
|
@ -313,8 +362,6 @@ ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
|
||||||
|
|
||||||
void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
|
void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
|
||||||
u32 src_offset, bool compare_to_zero) {
|
u32 src_offset, bool compare_to_zero) {
|
||||||
scheduler.RequestOutsideRenderPassOperationContext();
|
|
||||||
|
|
||||||
const size_t compare_size = compare_to_zero ? 8 : 24;
|
const size_t compare_size = compare_to_zero ? 8 : 24;
|
||||||
|
|
||||||
compute_pass_descriptor_queue.Acquire();
|
compute_pass_descriptor_queue.Acquire();
|
||||||
|
@ -327,7 +374,7 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
|
||||||
static constexpr VkMemoryBarrier read_barrier{
|
static constexpr VkMemoryBarrier read_barrier{
|
||||||
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
|
||||||
.pNext = nullptr,
|
.pNext = nullptr,
|
||||||
.srcAccessMask = VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
|
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||||
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
|
||||||
};
|
};
|
||||||
static constexpr VkMemoryBarrier write_barrier{
|
static constexpr VkMemoryBarrier write_barrier{
|
||||||
|
@ -349,6 +396,63 @@ void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Creates the compute pass around the queries prefix scan shader, using the three storage
// buffer layout declared above and a push constant range sized for
// QueriesPrefixScanPushConstants. The scheduler and descriptor queue are kept by reference.
QueriesPrefixScanPass::QueriesPrefixScanPass(
    const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
    ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
    : ComputePass(device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
                  QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
                  COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
                  QUERIES_PREFIX_SCAN_SUM_COMP_SPV),
      scheduler{scheduler_},
      compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
|
||||||
|
|
||||||
|
// Records a dispatch of the queries prefix scan shader.
//
// accumulation_buffer     8-byte buffer holding the total carried over between runs
//                         (shader binding 2).
// dst_buffer              receives the scanned values (shader binding 1).
// src_buffer              values to scan (shader binding 0).
// number_of_sums          number of 64-bit values to scan; rounded up to a multiple of 32
//                         for the bound ranges and the dispatch size.
// max_accumulation_limit  forwarded to the shader as max_accumulation_base.
//
// Does nothing when number_of_sums is zero.
void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
                                VkBuffer src_buffer, size_t number_of_sums,
                                size_t max_accumulation_limit) {
    if (number_of_sums == 0) {
        // Nothing to scan. Bailing out also avoids binding zero-sized descriptor ranges
        // and the wrap-around of `number_of_sums - 1` below.
        return;
    }
    const size_t aligned_runs = Common::AlignUp(number_of_sums, 32);

    // Queue order must match the shader bindings: input, output, accumulation.
    compute_pass_descriptor_queue.Acquire();
    compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, aligned_runs * sizeof(u64));
    compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, aligned_runs * sizeof(u64));
    compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
    const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};

    scheduler.RequestOutsideRenderPassOperationContext();
    scheduler.Record([this, descriptor_data, max_accumulation_limit, number_of_sums,
                      aligned_runs](vk::CommandBuffer cmdbuf) {
        // The inputs are produced by transfer commands, while the accumulation buffer is
        // also written by previous dispatches of this very shader, so both kinds of write
        // have to be made visible before the shader runs.
        static constexpr VkMemoryBarrier read_barrier{
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
        };
        static constexpr VkMemoryBarrier write_barrier{
            .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
            .pNext = nullptr,
            .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
            .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
                             VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
                             VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
                             VK_ACCESS_UNIFORM_READ_BIT |
                             VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
        };
        // Every access type in write_barrier.dstAccessMask needs a stage that supports it:
        // shader/uniform reads (compute), transfer reads (transfer), vertex attribute and
        // index reads (vertex input), indirect reads (draw indirect) and conditional
        // rendering reads (conditional rendering).
        static constexpr VkPipelineStageFlags write_barrier_dst_stages =
            VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT | VK_PIPELINE_STAGE_TRANSFER_BIT |
            VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | VK_PIPELINE_STAGE_DRAW_INDIRECT_BIT |
            VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT;
        const QueriesPrefixScanPushConstants uniforms{
            .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
            .accumulation_limit = static_cast<u32>(number_of_sums - 1),
        };
        const VkDescriptorSet set = descriptor_allocator.Commit();
        device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);

        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                               VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
        cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
        cmdbuf.Dispatch(static_cast<u32>(aligned_runs / 32U), 1, 1);
        cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, write_barrier_dst_stages,
                               0, write_barrier);
    });
}
|
||||||
|
|
||||||
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
|
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
|
||||||
DescriptorPool& descriptor_pool_,
|
DescriptorPool& descriptor_pool_,
|
||||||
StagingBufferPool& staging_buffer_pool_,
|
StagingBufferPool& staging_buffer_pool_,
|
||||||
|
|
|
@ -95,6 +95,20 @@ private:
|
||||||
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
|
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
|
||||||
};
|
};
|
||||||
|
|
||||||
|
class QueriesPrefixScanPass final : public ComputePass {
|
||||||
|
public:
|
||||||
|
explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_,
|
||||||
|
DescriptorPool& descriptor_pool_,
|
||||||
|
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
|
||||||
|
|
||||||
|
void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
|
||||||
|
size_t number_of_sums, size_t max_accumulation_limit);
|
||||||
|
|
||||||
|
private:
|
||||||
|
Scheduler& scheduler;
|
||||||
|
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
|
||||||
|
};
|
||||||
|
|
||||||
class ASTCDecoderPass final : public ComputePass {
|
class ASTCDecoderPass final : public ComputePass {
|
||||||
public:
|
public:
|
||||||
explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
|
explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
|
||||||
|
|
|
@ -11,6 +11,7 @@
|
||||||
#include <utility>
|
#include <utility>
|
||||||
#include <vector>
|
#include <vector>
|
||||||
|
|
||||||
|
#include "common/bit_util.h"
|
||||||
#include "common/common_types.h"
|
#include "common/common_types.h"
|
||||||
#include "core/memory.h"
|
#include "core/memory.h"
|
||||||
#include "video_core/engines/draw_manager.h"
|
#include "video_core/engines/draw_manager.h"
|
||||||
|
@ -112,14 +113,34 @@ class SamplesStreamer : public BaseStreamer {
|
||||||
public:
|
public:
|
||||||
explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_,
|
explicit SamplesStreamer(size_t id_, QueryCacheRuntime& runtime_,
|
||||||
VideoCore::RasterizerInterface* rasterizer_, const Device& device_,
|
VideoCore::RasterizerInterface* rasterizer_, const Device& device_,
|
||||||
Scheduler& scheduler_, const MemoryAllocator& memory_allocator_)
|
Scheduler& scheduler_, const MemoryAllocator& memory_allocator_,
|
||||||
|
ComputePassDescriptorQueue& compute_pass_descriptor_queue,
|
||||||
|
DescriptorPool& descriptor_pool)
|
||||||
: BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_},
|
: BaseStreamer(id_), runtime{runtime_}, rasterizer{rasterizer_}, device{device_},
|
||||||
scheduler{scheduler_}, memory_allocator{memory_allocator_} {
|
scheduler{scheduler_}, memory_allocator{memory_allocator_} {
|
||||||
BuildResolveBuffer();
|
|
||||||
current_bank = nullptr;
|
current_bank = nullptr;
|
||||||
current_query = nullptr;
|
current_query = nullptr;
|
||||||
ammend_value = 0;
|
ammend_value = 0;
|
||||||
acumulation_value = 0;
|
acumulation_value = 0;
|
||||||
|
queries_prefix_scan_pass = std::make_unique<QueriesPrefixScanPass>(
|
||||||
|
device, scheduler, descriptor_pool, compute_pass_descriptor_queue);
|
||||||
|
|
||||||
|
const VkBufferCreateInfo buffer_ci = {
|
||||||
|
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
|
||||||
|
.pNext = nullptr,
|
||||||
|
.flags = 0,
|
||||||
|
.size = 8,
|
||||||
|
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
|
||||||
|
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
|
||||||
|
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
|
||||||
|
.queueFamilyIndexCount = 0,
|
||||||
|
.pQueueFamilyIndices = nullptr,
|
||||||
|
};
|
||||||
|
accumulation_buffer = memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
|
||||||
|
scheduler.RequestOutsideRenderPassOperationContext();
|
||||||
|
scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
|
||||||
|
cmdbuf.FillBuffer(buffer, 0, 8, 0);
|
||||||
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
~SamplesStreamer() = default;
|
~SamplesStreamer() = default;
|
||||||
|
@ -159,6 +180,8 @@ public:
|
||||||
acumulation_value = 0;
|
acumulation_value = 0;
|
||||||
});
|
});
|
||||||
rasterizer->SyncOperation(std::move(func));
|
rasterizer->SyncOperation(std::move(func));
|
||||||
|
accumulation_since_last_sync = false;
|
||||||
|
last_accumulation_checkpoint = std::min(last_accumulation_checkpoint, num_slots_used);
|
||||||
}
|
}
|
||||||
|
|
||||||
void CloseCounter() override {
|
void CloseCounter() override {
|
||||||
|
@ -175,7 +198,8 @@ public:
|
||||||
}
|
}
|
||||||
|
|
||||||
for (size_t i = 0; i < sync_values_stash.size(); i++) {
|
for (size_t i = 0; i < sync_values_stash.size(); i++) {
|
||||||
runtime.template SyncValues<HostSyncValues>(sync_values_stash[i], *resolve_buffers[i]);
|
runtime.template SyncValues<HostSyncValues>(sync_values_stash[i],
|
||||||
|
*buffers[resolve_buffers[i]]);
|
||||||
}
|
}
|
||||||
|
|
||||||
sync_values_stash.clear();
|
sync_values_stash.clear();
|
||||||
|
@ -189,36 +213,21 @@ public:
|
||||||
sync_values_stash.clear();
|
sync_values_stash.clear();
|
||||||
sync_values_stash.emplace_back();
|
sync_values_stash.emplace_back();
|
||||||
std::vector<HostSyncValues>* sync_values = &sync_values_stash.back();
|
std::vector<HostSyncValues>* sync_values = &sync_values_stash.back();
|
||||||
sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
|
sync_values->reserve(num_slots_used);
|
||||||
std::unordered_map<size_t, std::pair<size_t, size_t>> offsets;
|
std::unordered_map<size_t, std::pair<size_t, size_t>> offsets;
|
||||||
size_t this_bank_slot = std::numeric_limits<size_t>::max();
|
resolve_buffers.clear();
|
||||||
size_t resolve_slots_remaining = resolve_slots;
|
size_t resolve_buffer_index = ObtainBuffer<true>(num_slots_used);
|
||||||
size_t resolve_buffer_index = 0;
|
resolve_buffers.push_back(resolve_buffer_index);
|
||||||
|
size_t base_offset = 0;
|
||||||
|
|
||||||
ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start,
|
ApplyBanksWideOp<true>(pending_sync, [&](SamplesQueryBank* bank, size_t start,
|
||||||
size_t amount) {
|
size_t amount) {
|
||||||
size_t bank_id = bank->GetIndex();
|
size_t bank_id = bank->GetIndex();
|
||||||
if (this_bank_slot != bank_id) {
|
auto& resolve_buffer = buffers[resolve_buffer_index];
|
||||||
this_bank_slot = bank_id;
|
|
||||||
if (resolve_slots_remaining == 0) {
|
|
||||||
resolve_buffer_index++;
|
|
||||||
if (resolve_buffer_index >= resolve_buffers.size()) {
|
|
||||||
BuildResolveBuffer();
|
|
||||||
}
|
|
||||||
resolve_slots_remaining = resolve_slots;
|
|
||||||
sync_values_stash.emplace_back();
|
|
||||||
sync_values = &sync_values_stash.back();
|
|
||||||
sync_values->reserve(resolve_slots * SamplesQueryBank::BANK_SIZE);
|
|
||||||
}
|
|
||||||
resolve_slots_remaining--;
|
|
||||||
}
|
|
||||||
auto& resolve_buffer = resolve_buffers[resolve_buffer_index];
|
|
||||||
const size_t base_offset = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE *
|
|
||||||
(resolve_slots - resolve_slots_remaining - 1);
|
|
||||||
VkQueryPool query_pool = bank->GetInnerPool();
|
VkQueryPool query_pool = bank->GetInnerPool();
|
||||||
scheduler.RequestOutsideRenderPassOperationContext();
|
scheduler.RequestOutsideRenderPassOperationContext();
|
||||||
scheduler.Record([start, amount, base_offset, query_pool,
|
scheduler.Record([start, amount, base_offset, query_pool,
|
||||||
buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
|
buffer = *resolve_buffer](vk::CommandBuffer cmdbuf) {
|
||||||
size_t final_offset = base_offset + start * SamplesQueryBank::QUERY_SIZE;
|
|
||||||
const VkBufferMemoryBarrier copy_query_pool_barrier{
|
const VkBufferMemoryBarrier copy_query_pool_barrier{
|
||||||
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
|
.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER,
|
||||||
.pNext = nullptr,
|
.pNext = nullptr,
|
||||||
|
@ -227,39 +236,60 @@ public:
|
||||||
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||||
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED,
|
||||||
.buffer = buffer,
|
.buffer = buffer,
|
||||||
.offset = final_offset,
|
.offset = base_offset,
|
||||||
.size = amount * SamplesQueryBank::QUERY_SIZE,
|
.size = amount * SamplesQueryBank::QUERY_SIZE,
|
||||||
};
|
};
|
||||||
|
|
||||||
cmdbuf.CopyQueryPoolResults(
|
cmdbuf.CopyQueryPoolResults(
|
||||||
query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer,
|
query_pool, static_cast<u32>(start), static_cast<u32>(amount), buffer,
|
||||||
static_cast<u32>(final_offset), SamplesQueryBank::QUERY_SIZE,
|
static_cast<u32>(base_offset), SamplesQueryBank::QUERY_SIZE,
|
||||||
VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
|
VK_QUERY_RESULT_WAIT_BIT | VK_QUERY_RESULT_64_BIT);
|
||||||
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
|
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT,
|
||||||
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier);
|
VK_PIPELINE_STAGE_TRANSFER_BIT, 0, copy_query_pool_barrier);
|
||||||
});
|
});
|
||||||
offsets[bank_id] = {sync_values_stash.size() - 1, base_offset};
|
offsets[bank_id] = {start, base_offset};
|
||||||
|
base_offset += amount * SamplesQueryBank::QUERY_SIZE;
|
||||||
});
|
});
|
||||||
|
|
||||||
// Convert queries
|
// Convert queries
|
||||||
|
bool has_multi_queries = false;
|
||||||
for (auto q : pending_sync) {
|
for (auto q : pending_sync) {
|
||||||
auto* query = GetQuery(q);
|
auto* query = GetQuery(q);
|
||||||
|
size_t sync_value_slot = 0;
|
||||||
if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
|
if (True(query->flags & VideoCommon::QueryFlagBits::IsRewritten)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
|
if (True(query->flags & VideoCommon::QueryFlagBits::IsInvalidated)) {
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
if (query->size_slots > 1) {
|
if (accumulation_since_last_sync || query->size_slots > 1) {
|
||||||
// This is problematic.
|
if (!has_multi_queries) {
|
||||||
// UNIMPLEMENTED();
|
has_multi_queries = true;
|
||||||
|
sync_values_stash.emplace_back();
|
||||||
|
}
|
||||||
|
sync_value_slot = 1;
|
||||||
}
|
}
|
||||||
query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
|
query->flags |= VideoCommon::QueryFlagBits::IsHostSynced;
|
||||||
auto loc_data = offsets[query->start_bank_id];
|
auto loc_data = offsets[query->start_bank_id];
|
||||||
sync_values_stash[loc_data.first].emplace_back(HostSyncValues{
|
sync_values_stash[sync_value_slot].emplace_back(HostSyncValues{
|
||||||
.address = query->guest_address,
|
.address = query->guest_address,
|
||||||
.size = SamplesQueryBank::QUERY_SIZE,
|
.size = SamplesQueryBank::QUERY_SIZE,
|
||||||
.offset = loc_data.second + query->start_slot * SamplesQueryBank::QUERY_SIZE,
|
.offset =
|
||||||
|
loc_data.second + (query->start_slot - loc_data.first + query->size_slots - 1) *
|
||||||
|
SamplesQueryBank::QUERY_SIZE,
|
||||||
|
});
|
||||||
|
}
|
||||||
|
|
||||||
|
if (has_multi_queries) {
|
||||||
|
size_t intermediary_buffer_index = ObtainBuffer<false>(num_slots_used);
|
||||||
|
resolve_buffers.push_back(intermediary_buffer_index);
|
||||||
|
queries_prefix_scan_pass->Run(*accumulation_buffer, *buffers[intermediary_buffer_index],
|
||||||
|
*buffers[resolve_buffer_index], num_slots_used,
|
||||||
|
std::min(last_accumulation_checkpoint, num_slots_used));
|
||||||
|
} else {
|
||||||
|
scheduler.RequestOutsideRenderPassOperationContext();
|
||||||
|
scheduler.Record([buffer = *accumulation_buffer](vk::CommandBuffer cmdbuf) {
|
||||||
|
cmdbuf.FillBuffer(buffer, 0, 8, 0);
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -267,6 +297,9 @@ public:
|
||||||
std::function<void()> func([this] { ammend_value = acumulation_value; });
|
std::function<void()> func([this] { ammend_value = acumulation_value; });
|
||||||
rasterizer->SyncOperation(std::move(func));
|
rasterizer->SyncOperation(std::move(func));
|
||||||
AbandonCurrentQuery();
|
AbandonCurrentQuery();
|
||||||
|
num_slots_used = 0;
|
||||||
|
last_accumulation_checkpoint = std::numeric_limits<size_t>::max();
|
||||||
|
accumulation_since_last_sync = has_multi_queries;
|
||||||
pending_sync.clear();
|
pending_sync.clear();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -400,6 +433,7 @@ private:
|
||||||
void ReserveHostQuery() {
|
void ReserveHostQuery() {
|
||||||
size_t new_slot = ReserveBankSlot();
|
size_t new_slot = ReserveBankSlot();
|
||||||
current_bank->AddReference(1);
|
current_bank->AddReference(1);
|
||||||
|
num_slots_used++;
|
||||||
if (current_query) {
|
if (current_query) {
|
||||||
size_t bank_id = current_query->start_bank_id;
|
size_t bank_id = current_query->start_bank_id;
|
||||||
size_t banks_set = current_query->size_banks - 1;
|
size_t banks_set = current_query->size_banks - 1;
|
||||||
|
@ -470,32 +504,50 @@ private:
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
|
|
||||||
void BuildResolveBuffer() {
|
template <bool is_resolve>
|
||||||
|
size_t ObtainBuffer(size_t num_needed) {
|
||||||
|
const size_t log_2 = std::max<size_t>(6U, Common::Log2Ceil64(num_needed));
|
||||||
|
if constexpr (is_resolve) {
|
||||||
|
if (resolve_table[log_2] != 0) {
|
||||||
|
return resolve_table[log_2] - 1;
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
if (intermediary_table[log_2] != 0) {
|
||||||
|
return intermediary_table[log_2] - 1;
|
||||||
|
}
|
||||||
|
}
|
||||||
const VkBufferCreateInfo buffer_ci = {
|
const VkBufferCreateInfo buffer_ci = {
|
||||||
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
|
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
|
||||||
.pNext = nullptr,
|
.pNext = nullptr,
|
||||||
.flags = 0,
|
.flags = 0,
|
||||||
.size = SamplesQueryBank::QUERY_SIZE * SamplesQueryBank::BANK_SIZE * resolve_slots,
|
.size = SamplesQueryBank::QUERY_SIZE * (1ULL << log_2),
|
||||||
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
|
.usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_TRANSFER_SRC_BIT |
|
||||||
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
|
VK_BUFFER_USAGE_STORAGE_BUFFER_BIT,
|
||||||
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
|
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
|
||||||
.queueFamilyIndexCount = 0,
|
.queueFamilyIndexCount = 0,
|
||||||
.pQueueFamilyIndices = nullptr,
|
.pQueueFamilyIndices = nullptr,
|
||||||
};
|
};
|
||||||
resolve_buffers.emplace_back(
|
buffers.emplace_back(memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
|
||||||
memory_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal));
|
if constexpr (is_resolve) {
|
||||||
|
resolve_table[log_2] = buffers.size();
|
||||||
|
} else {
|
||||||
|
intermediary_table[log_2] = buffers.size();
|
||||||
|
}
|
||||||
|
return buffers.size() - 1;
|
||||||
}
|
}
|
||||||
|
|
||||||
static constexpr size_t resolve_slots = 8;
|
|
||||||
|
|
||||||
QueryCacheRuntime& runtime;
|
QueryCacheRuntime& runtime;
|
||||||
VideoCore::RasterizerInterface* rasterizer;
|
VideoCore::RasterizerInterface* rasterizer;
|
||||||
const Device& device;
|
const Device& device;
|
||||||
Scheduler& scheduler;
|
Scheduler& scheduler;
|
||||||
const MemoryAllocator& memory_allocator;
|
const MemoryAllocator& memory_allocator;
|
||||||
VideoCommon::BankPool<SamplesQueryBank> bank_pool;
|
VideoCommon::BankPool<SamplesQueryBank> bank_pool;
|
||||||
std::deque<vk::Buffer> resolve_buffers;
|
std::deque<vk::Buffer> buffers;
|
||||||
|
std::array<size_t, 32> resolve_table{};
|
||||||
|
std::array<size_t, 32> intermediary_table{};
|
||||||
|
vk::Buffer accumulation_buffer;
|
||||||
std::deque<std::vector<HostSyncValues>> sync_values_stash;
|
std::deque<std::vector<HostSyncValues>> sync_values_stash;
|
||||||
|
std::vector<size_t> resolve_buffers;
|
||||||
|
|
||||||
// syncing queue
|
// syncing queue
|
||||||
std::vector<size_t> pending_sync;
|
std::vector<size_t> pending_sync;
|
||||||
|
@ -510,10 +562,14 @@ private:
|
||||||
SamplesQueryBank* current_bank;
|
SamplesQueryBank* current_bank;
|
||||||
VkQueryPool current_query_pool;
|
VkQueryPool current_query_pool;
|
||||||
size_t current_query_id;
|
size_t current_query_id;
|
||||||
|
size_t num_slots_used{};
|
||||||
|
size_t last_accumulation_checkpoint{};
|
||||||
|
bool accumulation_since_last_sync{};
|
||||||
VideoCommon::HostQueryBase* current_query;
|
VideoCommon::HostQueryBase* current_query;
|
||||||
bool has_started{};
|
bool has_started{};
|
||||||
bool current_unset{};
|
|
||||||
std::mutex flush_guard;
|
std::mutex flush_guard;
|
||||||
|
|
||||||
|
std::unique_ptr<QueriesPrefixScanPass> queries_prefix_scan_pass;
|
||||||
};
|
};
|
||||||
|
|
||||||
// Transform feedback queries
|
// Transform feedback queries
|
||||||
|
@ -1090,7 +1146,8 @@ struct QueryCacheRuntimeImpl {
|
||||||
memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_},
|
memory_allocator{memory_allocator_}, scheduler{scheduler_}, staging_pool{staging_pool_},
|
||||||
guest_streamer(0, runtime),
|
guest_streamer(0, runtime),
|
||||||
sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer,
|
sample_streamer(static_cast<size_t>(QueryType::ZPassPixelCount64), runtime, rasterizer,
|
||||||
device, scheduler, memory_allocator),
|
device, scheduler, memory_allocator, compute_pass_descriptor_queue,
|
||||||
|
descriptor_pool),
|
||||||
tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
|
tfb_streamer(static_cast<size_t>(QueryType::StreamingByteCount), runtime, device,
|
||||||
scheduler, memory_allocator, staging_pool),
|
scheduler, memory_allocator, staging_pool),
|
||||||
primitives_succeeded_streamer(
|
primitives_succeeded_streamer(
|
||||||
|
@ -1319,10 +1376,10 @@ bool QueryCacheRuntime::HostConditionalRenderingCompareValues(VideoCommon::Looku
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
if (!is_in_bc[0] && !is_in_bc[1]) {
|
/*if (!is_in_bc[0] && !is_in_bc[1]) {
|
||||||
// Both queries are in query cache, it's best to just flush.
|
// Both queries are in query cache, it's best to just flush.
|
||||||
return false;
|
return true;
|
||||||
}
|
}*/
|
||||||
HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
|
HostConditionalRenderingCompareBCImpl(object_1.address, equal_check);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
Loading…
Reference in a new issue