early-access version 3831

This commit is contained in:
pineappleEA 2023-08-24 18:22:48 +02:00
parent e62fb0263c
commit e0a13c3ca8
44 changed files with 2252 additions and 495 deletions

View file

@ -1,7 +1,7 @@
yuzu emulator early access yuzu emulator early access
============= =============
This is the source code for early-access 3830. This is the source code for early-access 3831.
## Legal Notice ## Legal Notice

View file

@ -95,6 +95,7 @@ android {
// builds a release build that doesn't need signing // builds a release build that doesn't need signing
// Attaches 'debug' suffix to version and package name, allowing installation alongside the release build. // Attaches 'debug' suffix to version and package name, allowing installation alongside the release build.
register("relWithDebInfo") { register("relWithDebInfo") {
isDefault = true
resValue("string", "app_name_suffixed", "yuzu Debug Release") resValue("string", "app_name_suffixed", "yuzu Debug Release")
signingConfig = signingConfigs.getByName("debug") signingConfig = signingConfigs.getByName("debug")
isMinifyEnabled = true isMinifyEnabled = true
@ -122,6 +123,7 @@ android {
flavorDimensions.add("version") flavorDimensions.add("version")
productFlavors { productFlavors {
create("mainline") { create("mainline") {
isDefault = true
dimension = "version" dimension = "version"
buildConfigField("Boolean", "PREMIUM", "false") buildConfigField("Boolean", "PREMIUM", "false")
} }

View file

@ -11,6 +11,7 @@ import kotlinx.serialization.json.Json
import org.yuzu.yuzu_emu.NativeLibrary import org.yuzu.yuzu_emu.NativeLibrary
import org.yuzu.yuzu_emu.YuzuApplication import org.yuzu.yuzu_emu.YuzuApplication
import org.yuzu.yuzu_emu.model.Game import org.yuzu.yuzu_emu.model.Game
import org.yuzu.yuzu_emu.model.MinimalDocumentFile
object GameHelper { object GameHelper {
const val KEY_GAME_PATH = "game_path" const val KEY_GAME_PATH = "game_path"
@ -29,15 +30,7 @@ object GameHelper {
// Ensure keys are loaded so that ROM metadata can be decrypted. // Ensure keys are loaded so that ROM metadata can be decrypted.
NativeLibrary.reloadKeys() NativeLibrary.reloadKeys()
val children = FileUtil.listFiles(context, gamesUri) addGamesRecursive(games, FileUtil.listFiles(context, gamesUri), 3)
for (file in children) {
if (!file.isDirectory) {
// Check that the file has an extension we care about before trying to read out of it.
if (Game.extensions.contains(FileUtil.getExtension(file.uri))) {
games.add(getGame(file.uri))
}
}
}
// Cache list of games found on disk // Cache list of games found on disk
val serializedGames = mutableSetOf<String>() val serializedGames = mutableSetOf<String>()
@ -52,6 +45,30 @@ object GameHelper {
return games.toList() return games.toList()
} }
/**
 * Collects games from [files] into [games], descending into sub-directories.
 *
 * Every non-directory entry whose extension is contained in [Game.extensions] is converted
 * with [getGame] and appended to [games]. Directory entries are listed through
 * [FileUtil.listFiles] and processed with [depth] reduced by one.
 *
 * @param games Accumulator that receives each game found.
 * @param files Entries of the directory currently being scanned.
 * @param depth Remaining directory levels to scan; the call does nothing when this is <= 0.
 */
private fun addGamesRecursive(
    games: MutableList<Game>,
    files: Array<MinimalDocumentFile>,
    depth: Int
) {
    // Depth budget exhausted: ignore this level entirely.
    if (depth <= 0) return

    for (entry in files) {
        if (entry.isDirectory) {
            val children = FileUtil.listFiles(YuzuApplication.appContext, entry.uri)
            addGamesRecursive(games, children, depth - 1)
            continue
        }

        // Check the extension first so only candidate files are passed to getGame.
        val extension = FileUtil.getExtension(entry.uri)
        if (extension in Game.extensions) {
            games.add(getGame(entry.uri))
        }
    }
}
private fun getGame(uri: Uri): Game { private fun getGame(uri: Uri): Game {
val filePath = uri.toString() val filePath = uri.toString()
var name = NativeLibrary.getTitle(filePath) var name = NativeLibrary.getTitle(filePath)

View file

@ -129,13 +129,17 @@ void LogSettings() {
log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir)); log_path("DataStorage_SDMCDir", Common::FS::GetYuzuPath(Common::FS::YuzuPath::SDMCDir));
} }
/// Copies the current value of the `gpu_accuracy` setting into `values.current_gpu_accuracy`,
/// the field that IsGPULevelExtreme() and IsGPULevelHigh() compare against.
void UpdateGPUAccuracy() {
    const auto configured_accuracy = values.gpu_accuracy.GetValue();
    values.current_gpu_accuracy = configured_accuracy;
}
bool IsGPULevelExtreme() { bool IsGPULevelExtreme() {
return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme; return values.current_gpu_accuracy == GpuAccuracy::Extreme;
} }
bool IsGPULevelHigh() { bool IsGPULevelHigh() {
return values.gpu_accuracy.GetValue() == GpuAccuracy::Extreme || return values.current_gpu_accuracy == GpuAccuracy::Extreme ||
values.gpu_accuracy.GetValue() == GpuAccuracy::High; values.current_gpu_accuracy == GpuAccuracy::High;
} }
bool IsFastmemEnabled() { bool IsFastmemEnabled() {

View file

@ -307,6 +307,7 @@ struct Values {
Specialization::Default, Specialization::Default,
true, true,
true}; true};
GpuAccuracy current_gpu_accuracy{GpuAccuracy::High};
SwitchableSetting<AnisotropyMode, true> max_anisotropy{ SwitchableSetting<AnisotropyMode, true> max_anisotropy{
linkage, AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16, linkage, AnisotropyMode::Automatic, AnisotropyMode::Automatic, AnisotropyMode::X16,
"max_anisotropy", Category::RendererAdvanced}; "max_anisotropy", Category::RendererAdvanced};
@ -514,6 +515,7 @@ struct Values {
extern Values values; extern Values values;
void UpdateGPUAccuracy();
bool IsGPULevelExtreme(); bool IsGPULevelExtreme();
bool IsGPULevelHigh(); bool IsGPULevelHigh();

View file

@ -557,12 +557,6 @@ static std::optional<u64> FindTicketOffset(const std::array<u8, size>& data) {
} }
std::optional<Key128> KeyManager::ParseTicketTitleKey(const Ticket& ticket) { std::optional<Key128> KeyManager::ParseTicketTitleKey(const Ticket& ticket) {
if (eticket_rsa_keypair == RSAKeyPair<2048>{}) {
LOG_WARNING(Crypto,
"Skipping ticket title key parsing due to missing ETicket RSA key-pair.");
return std::nullopt;
}
if (!ticket.IsValid()) { if (!ticket.IsValid()) {
LOG_WARNING(Crypto, "Attempted to parse title key of invalid ticket."); LOG_WARNING(Crypto, "Attempted to parse title key of invalid ticket.");
return std::nullopt; return std::nullopt;
@ -587,6 +581,13 @@ std::optional<Key128> KeyManager::ParseTicketTitleKey(const Ticket& ticket) {
return ticket.GetData().title_key_common; return ticket.GetData().title_key_common;
} }
if (eticket_rsa_keypair == RSAKeyPair<2048>{}) {
LOG_WARNING(
Crypto,
"Skipping personalized ticket title key parsing due to missing ETicket RSA key-pair.");
return std::nullopt;
}
mbedtls_mpi D; // RSA Private Exponent mbedtls_mpi D; // RSA Private Exponent
mbedtls_mpi N; // RSA Modulus mbedtls_mpi N; // RSA Modulus
mbedtls_mpi S; // Input mbedtls_mpi S; // Input
@ -1188,10 +1189,6 @@ void KeyManager::DeriveETicket(PartitionDataManager& data,
} }
void KeyManager::PopulateTickets() { void KeyManager::PopulateTickets() {
if (eticket_rsa_keypair == RSAKeyPair<2048>{}) {
return;
}
if (!common_tickets.empty() && !personal_tickets.empty()) { if (!common_tickets.empty() && !personal_tickets.empty()) {
return; return;
} }

View file

@ -95,6 +95,12 @@ add_library(video_core STATIC
memory_manager.h memory_manager.h
precompiled_headers.h precompiled_headers.h
pte_kind.h pte_kind.h
query_cache/bank_base.h
query_cache/query_base.h
query_cache/query_cache_base.h
query_cache/query_cache.h
query_cache/query_stream.h
query_cache/types.h
query_cache.h query_cache.h
rasterizer_accelerated.cpp rasterizer_accelerated.cpp
rasterizer_accelerated.h rasterizer_accelerated.h

View file

@ -272,13 +272,19 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
if (!cpu_addr) { if (!cpu_addr) {
return {&slot_buffers[NULL_BUFFER_ID], 0}; return {&slot_buffers[NULL_BUFFER_ID], 0};
} }
const BufferId buffer_id = FindBuffer(*cpu_addr, size); return ObtainCPUBuffer(*cpu_addr, size, sync_info, post_op);
}
template <class P>
std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainCPUBuffer(
VAddr cpu_addr, u32 size, ObtainBufferSynchronize sync_info, ObtainBufferOperation post_op) {
const BufferId buffer_id = FindBuffer(cpu_addr, size);
Buffer& buffer = slot_buffers[buffer_id]; Buffer& buffer = slot_buffers[buffer_id];
// synchronize op // synchronize op
switch (sync_info) { switch (sync_info) {
case ObtainBufferSynchronize::FullSynchronize: case ObtainBufferSynchronize::FullSynchronize:
SynchronizeBuffer(buffer, *cpu_addr, size); SynchronizeBuffer(buffer, cpu_addr, size);
break; break;
default: default:
break; break;
@ -286,10 +292,10 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
switch (post_op) { switch (post_op) {
case ObtainBufferOperation::MarkAsWritten: case ObtainBufferOperation::MarkAsWritten:
MarkWrittenBuffer(buffer_id, *cpu_addr, size); MarkWrittenBuffer(buffer_id, cpu_addr, size);
break; break;
case ObtainBufferOperation::DiscardWrite: { case ObtainBufferOperation::DiscardWrite: {
IntervalType interval{*cpu_addr, size}; IntervalType interval{cpu_addr, size};
ClearDownload(interval); ClearDownload(interval);
break; break;
} }
@ -297,7 +303,7 @@ std::pair<typename P::Buffer*, u32> BufferCache<P>::ObtainBuffer(GPUVAddr gpu_ad
break; break;
} }
return {&buffer, buffer.Offset(*cpu_addr)}; return {&buffer, buffer.Offset(cpu_addr)};
} }
template <class P> template <class P>

View file

@ -295,6 +295,10 @@ public:
[[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size, [[nodiscard]] std::pair<Buffer*, u32> ObtainBuffer(GPUVAddr gpu_addr, u32 size,
ObtainBufferSynchronize sync_info, ObtainBufferSynchronize sync_info,
ObtainBufferOperation post_op); ObtainBufferOperation post_op);
[[nodiscard]] std::pair<Buffer*, u32> ObtainCPUBuffer(VAddr gpu_addr, u32 size,
ObtainBufferSynchronize sync_info,
ObtainBufferOperation post_op);
void FlushCachedWrites(); void FlushCachedWrites();
/// Return true when there are uncommitted buffers to be downloaded /// Return true when there are uncommitted buffers to be downloaded
@ -335,6 +339,14 @@ public:
[[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer(); [[nodiscard]] std::pair<Buffer*, u32> GetDrawIndirectBuffer();
/// Invokes `func` repeatedly until a pass finishes with
/// `channel_state->has_deleted_buffers` still cleared.
/// The flag is reset before every pass, so `func` always runs at least once.
template <typename Func>
void BufferOperations(Func&& func) {
    for (;;) {
        channel_state->has_deleted_buffers = false;
        func();
        if (!channel_state->has_deleted_buffers) {
            break;
        }
    }
}
std::recursive_mutex mutex; std::recursive_mutex mutex;
Runtime& runtime; Runtime& runtime;

View file

@ -51,7 +51,7 @@ public:
virtual void CreateChannel(Tegra::Control::ChannelState& channel); virtual void CreateChannel(Tegra::Control::ChannelState& channel);
/// Bind a channel for execution. /// Bind a channel for execution.
void BindToChannel(s32 id); virtual void BindToChannel(s32 id);
/// Erase channel's state. /// Erase channel's state.
void EraseChannel(s32 id); void EraseChannel(s32 id);

View file

@ -46,6 +46,7 @@ public:
}; };
struct IndirectParams { struct IndirectParams {
bool is_byte_count;
bool is_indexed; bool is_indexed;
bool include_count; bool include_count;
GPUVAddr count_start_address; GPUVAddr count_start_address;

View file

@ -20,8 +20,6 @@
namespace Tegra::Engines { namespace Tegra::Engines {
using VideoCore::QueryType;
/// First register id that is actually a Macro call. /// First register id that is actually a Macro call.
constexpr u32 MacroRegistersStart = 0xE00; constexpr u32 MacroRegistersStart = 0xE00;
@ -496,27 +494,21 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
} }
void Maxwell3D::ProcessQueryGet() { void Maxwell3D::ProcessQueryGet() {
VideoCommon::QueryPropertiesFlags flags{};
if (regs.report_semaphore.query.short_query == 0) {
flags |= VideoCommon::QueryPropertiesFlags::HasTimeout;
}
const GPUVAddr sequence_address{regs.report_semaphore.Address()};
const VideoCommon::QueryType query_type =
static_cast<VideoCommon::QueryType>(regs.report_semaphore.query.report.Value());
const u32 payload = regs.report_semaphore.payload;
const u32 subreport = regs.report_semaphore.query.sub_report;
switch (regs.report_semaphore.query.operation) { switch (regs.report_semaphore.query.operation) {
case Regs::ReportSemaphore::Operation::Release: case Regs::ReportSemaphore::Operation::Release:
if (regs.report_semaphore.query.short_query != 0) { if (regs.report_semaphore.query.short_query != 0) {
const GPUVAddr sequence_address{regs.report_semaphore.Address()}; flags |= VideoCommon::QueryPropertiesFlags::IsAFence;
const u32 payload = regs.report_semaphore.payload;
std::function<void()> operation([this, sequence_address, payload] {
memory_manager.Write<u32>(sequence_address, payload);
});
rasterizer->SignalFence(std::move(operation));
} else {
struct LongQueryResult {
u64_le value;
u64_le timestamp;
};
const GPUVAddr sequence_address{regs.report_semaphore.Address()};
const u32 payload = regs.report_semaphore.payload;
[this, sequence_address, payload] {
memory_manager.Write<u64>(sequence_address + sizeof(u64), system.GPU().GetTicks());
memory_manager.Write<u64>(sequence_address, payload);
}();
} }
rasterizer->Query(sequence_address, query_type, flags, payload, subreport);
break; break;
case Regs::ReportSemaphore::Operation::Acquire: case Regs::ReportSemaphore::Operation::Acquire:
// TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that // TODO(Blinkhawk): Under this operation, the GPU waits for the CPU to write a value that
@ -524,11 +516,7 @@ void Maxwell3D::ProcessQueryGet() {
UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE"); UNIMPLEMENTED_MSG("Unimplemented query operation ACQUIRE");
break; break;
case Regs::ReportSemaphore::Operation::ReportOnly: case Regs::ReportSemaphore::Operation::ReportOnly:
if (const std::optional<u64> result = GetQueryResult()) { rasterizer->Query(sequence_address, query_type, flags, payload, subreport);
// If the query returns an empty optional it means it's cached and deferred.
// In this case we have a non-empty result, so we stamp it immediately.
StampQueryResult(*result, regs.report_semaphore.query.short_query == 0);
}
break; break;
case Regs::ReportSemaphore::Operation::Trap: case Regs::ReportSemaphore::Operation::Trap:
UNIMPLEMENTED_MSG("Unimplemented query operation TRAP"); UNIMPLEMENTED_MSG("Unimplemented query operation TRAP");
@ -540,6 +528,10 @@ void Maxwell3D::ProcessQueryGet() {
} }
void Maxwell3D::ProcessQueryCondition() { void Maxwell3D::ProcessQueryCondition() {
if (rasterizer->AccelerateConditionalRendering()) {
execute_on = true;
return;
}
const GPUVAddr condition_address{regs.render_enable.Address()}; const GPUVAddr condition_address{regs.render_enable.Address()};
switch (regs.render_enable_override) { switch (regs.render_enable_override) {
case Regs::RenderEnable::Override::AlwaysRender: case Regs::RenderEnable::Override::AlwaysRender:
@ -549,10 +541,6 @@ void Maxwell3D::ProcessQueryCondition() {
execute_on = false; execute_on = false;
break; break;
case Regs::RenderEnable::Override::UseRenderEnable: { case Regs::RenderEnable::Override::UseRenderEnable: {
if (rasterizer->AccelerateConditionalRendering()) {
execute_on = true;
return;
}
switch (regs.render_enable.mode) { switch (regs.render_enable.mode) {
case Regs::RenderEnable::Mode::True: { case Regs::RenderEnable::Mode::True: {
execute_on = true; execute_on = true;
@ -594,15 +582,9 @@ void Maxwell3D::ProcessQueryCondition() {
} }
void Maxwell3D::ProcessCounterReset() { void Maxwell3D::ProcessCounterReset() {
#if ANDROID
if (!Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
return;
}
#endif
switch (regs.clear_report_value) { switch (regs.clear_report_value) {
case Regs::ClearReport::ZPassPixelCount: case Regs::ClearReport::ZPassPixelCount:
rasterizer->ResetCounter(QueryType::SamplesPassed); rasterizer->ResetCounter(VideoCommon::QueryType::ZPassPixelCount64);
break; break;
default: default:
LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value); LOG_DEBUG(Render_OpenGL, "Unimplemented counter reset={}", regs.clear_report_value);
@ -616,28 +598,6 @@ void Maxwell3D::ProcessSyncPoint() {
rasterizer->SignalSyncPoint(sync_point); rasterizer->SignalSyncPoint(sync_point);
} }
std::optional<u64> Maxwell3D::GetQueryResult() {
switch (regs.report_semaphore.query.report) {
case Regs::ReportSemaphore::Report::Payload:
return regs.report_semaphore.payload;
case Regs::ReportSemaphore::Report::ZPassPixelCount64:
#if ANDROID
if (!Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
return 120;
}
#endif
// Deferred.
rasterizer->Query(regs.report_semaphore.Address(), QueryType::SamplesPassed,
system.GPU().GetTicks());
return std::nullopt;
default:
LOG_DEBUG(HW_GPU, "Unimplemented query report type {}",
regs.report_semaphore.query.report.Value());
return 1;
}
}
void Maxwell3D::ProcessCBBind(size_t stage_index) { void Maxwell3D::ProcessCBBind(size_t stage_index) {
// Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader
// stage. // stage.

View file

@ -3182,9 +3182,6 @@ private:
/// Handles writes to syncing register. /// Handles writes to syncing register.
void ProcessSyncPoint(); void ProcessSyncPoint();
/// Returns a query's value or an empty object if the value will be deferred through a cache.
std::optional<u64> GetQueryResult();
void RefreshParametersImpl(); void RefreshParametersImpl();
bool IsMethodExecutable(u32 method); bool IsMethodExecutable(u32 method);

View file

@ -361,21 +361,17 @@ void MaxwellDMA::ReleaseSemaphore() {
const auto type = regs.launch_dma.semaphore_type; const auto type = regs.launch_dma.semaphore_type;
const GPUVAddr address = regs.semaphore.address; const GPUVAddr address = regs.semaphore.address;
const u32 payload = regs.semaphore.payload; const u32 payload = regs.semaphore.payload;
VideoCommon::QueryPropertiesFlags flags{VideoCommon::QueryPropertiesFlags::IsAFence};
switch (type) { switch (type) {
case LaunchDMA::SemaphoreType::NONE: case LaunchDMA::SemaphoreType::NONE:
break; break;
case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: { case LaunchDMA::SemaphoreType::RELEASE_ONE_WORD_SEMAPHORE: {
std::function<void()> operation( rasterizer->Query(address, VideoCommon::QueryType::Payload, flags, payload, 0);
[this, address, payload] { memory_manager.Write<u32>(address, payload); });
rasterizer->SignalFence(std::move(operation));
break; break;
} }
case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: { case LaunchDMA::SemaphoreType::RELEASE_FOUR_WORD_SEMAPHORE: {
std::function<void()> operation([this, address, payload] { rasterizer->Query(address, VideoCommon::QueryType::Payload,
memory_manager.Write<u64>(address + sizeof(u64), system.GPU().GetTicks()); flags | VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
memory_manager.Write<u64>(address, payload);
});
rasterizer->SignalFence(std::move(operation));
break; break;
} }
default: default:

View file

@ -77,10 +77,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
if (op == GpuSemaphoreOperation::WriteLong) { if (op == GpuSemaphoreOperation::WriteLong) {
const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
const u32 payload = regs.semaphore_sequence; const u32 payload = regs.semaphore_sequence;
[this, sequence_address, payload] { rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
memory_manager.Write<u64>(sequence_address + sizeof(u64), gpu.GetTicks()); VideoCommon::QueryPropertiesFlags::HasTimeout, payload, 0);
memory_manager.Write<u64>(sequence_address, payload);
}();
} else { } else {
do { do {
const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())}; const u32 word{memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress())};
@ -115,10 +113,8 @@ void Puller::ProcessSemaphoreTriggerMethod() {
void Puller::ProcessSemaphoreRelease() { void Puller::ProcessSemaphoreRelease() {
const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()}; const GPUVAddr sequence_address{regs.semaphore_address.SemaphoreAddress()};
const u32 payload = regs.semaphore_release; const u32 payload = regs.semaphore_release;
std::function<void()> operation([this, sequence_address, payload] { rasterizer->Query(sequence_address, VideoCommon::QueryType::Payload,
memory_manager.Write<u32>(sequence_address, payload); VideoCommon::QueryPropertiesFlags::IsAFence, payload, 0);
});
rasterizer->SignalFence(std::move(operation));
} }
void Puller::ProcessSemaphoreAcquire() { void Puller::ProcessSemaphoreAcquire() {
@ -127,7 +123,6 @@ void Puller::ProcessSemaphoreAcquire() {
while (word != value) { while (word != value) {
regs.acquire_active = true; regs.acquire_active = true;
regs.acquire_value = value; regs.acquire_value = value;
std::this_thread::sleep_for(std::chrono::milliseconds(1));
rasterizer->ReleaseFences(); rasterizer->ReleaseFences();
word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress()); word = memory_manager.Read<u32>(regs.semaphore_address.SemaphoreAddress());
// TODO(kemathe73) figure out how to do the acquire_timeout // TODO(kemathe73) figure out how to do the acquire_timeout

View file

@ -55,6 +55,9 @@ public:
// Unlike other fences, this one doesn't // Unlike other fences, this one doesn't
void SignalOrdering() { void SignalOrdering() {
if constexpr (!can_async_check) {
TryReleasePendingFences<false>();
}
std::scoped_lock lock{buffer_cache.mutex}; std::scoped_lock lock{buffer_cache.mutex};
buffer_cache.AccumulateFlushes(); buffer_cache.AccumulateFlushes();
} }
@ -104,9 +107,25 @@ public:
SignalFence(std::move(func)); SignalFence(std::move(func));
} }
void WaitPendingFences() { void WaitPendingFences([[maybe_unused]] bool force) {
if constexpr (!can_async_check) { if constexpr (!can_async_check) {
TryReleasePendingFences<true>(); TryReleasePendingFences<true>();
} else {
if (!force) {
return;
}
std::mutex wait_mutex;
std::condition_variable wait_cv;
std::atomic<bool> wait_finished{};
std::function<void()> func([&] {
std::scoped_lock lk(wait_mutex);
wait_finished.store(true, std::memory_order_relaxed);
wait_cv.notify_all();
});
SignalFence(std::move(func));
std::unique_lock lk(wait_mutex);
wait_cv.wait(
lk, [&wait_finished] { return wait_finished.load(std::memory_order_relaxed); });
} }
} }

View file

@ -102,7 +102,8 @@ struct GPU::Impl {
/// Signal the ending of command list. /// Signal the ending of command list.
void OnCommandListEnd() { void OnCommandListEnd() {
rasterizer->ReleaseFences(); rasterizer->ReleaseFences(false);
Settings::UpdateGPUAccuracy();
} }
/// Request a host GPU memory flush from the CPU. /// Request a host GPU memory flush from the CPU.
@ -220,6 +221,7 @@ struct GPU::Impl {
/// This can be used to launch any necessary threads and register any necessary /// This can be used to launch any necessary threads and register any necessary
/// core timing events. /// core timing events.
void Start() { void Start() {
Settings::UpdateGPUAccuracy();
gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler); gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler);
} }

View file

@ -41,6 +41,9 @@ set(SHADER_FILES
pitch_unswizzle.comp pitch_unswizzle.comp
present_bicubic.frag present_bicubic.frag
present_gaussian.frag present_gaussian.frag
queries_prefix_scan_sum.comp
queries_prefix_scan_sum_nosubgroups.comp
resolve_conditional_render.comp
smaa_edge_detection.vert smaa_edge_detection.vert
smaa_edge_detection.frag smaa_edge_detection.frag
smaa_blending_weight_calculation.vert smaa_blending_weight_calculation.vert
@ -70,6 +73,7 @@ if ("${GLSLANGVALIDATOR}" STREQUAL "GLSLANGVALIDATOR-NOTFOUND")
endif() endif()
set(GLSL_FLAGS "") set(GLSL_FLAGS "")
set(SPIR_V_VERSION "spirv1.3")
set(QUIET_FLAG "--quiet") set(QUIET_FLAG "--quiet")
set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include) set(SHADER_INCLUDE ${CMAKE_CURRENT_BINARY_DIR}/include)
@ -123,7 +127,7 @@ foreach(FILENAME IN ITEMS ${SHADER_FILES})
OUTPUT OUTPUT
${SPIRV_HEADER_FILE} ${SPIRV_HEADER_FILE}
COMMAND COMMAND
${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} ${GLSLANGVALIDATOR} -V ${QUIET_FLAG} -I"${FIDELITYFX_INCLUDE_DIR}" ${GLSL_FLAGS} --variable-name ${SPIRV_VARIABLE_NAME} -o ${SPIRV_HEADER_FILE} ${SOURCE_FILE} --target-env ${SPIR_V_VERSION}
MAIN_DEPENDENCY MAIN_DEPENDENCY
${SOURCE_FILE} ${SOURCE_FILE}
) )

View file

@ -34,11 +34,16 @@
#endif #endif
BEGIN_PUSH_CONSTANTS BEGIN_PUSH_CONSTANTS
UNIFORM(0) uint max_accumulation_base; UNIFORM(0) uint min_accumulation_base;
UNIFORM(1) uint accumulation_limit; UNIFORM(1) uint max_accumulation_base;
UNIFORM(2) uint accumulation_limit;
UNIFORM(3) uint buffer_offset;
END_PUSH_CONSTANTS END_PUSH_CONSTANTS
layout(local_size_x = 32) in; #define LOCAL_RESULTS 8
#define QUERIES_PER_INVOC 2048
layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
layout(std430, binding = 0) readonly buffer block1 { layout(std430, binding = 0) readonly buffer block1 {
uvec2 input_data[]; uvec2 input_data[];
@ -52,7 +57,7 @@ layout(std430, binding = 2) coherent buffer block3 {
uvec2 accumulated_data; uvec2 accumulated_data;
}; };
shared uvec2 shared_data[2]; shared uvec2 shared_data[128];
// Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64 // Simple Uint64 add that uses 2 uint variables for GPUs that don't support uint64
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
@ -67,8 +72,8 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
uvec2 subgroupInclusiveAddUint64(uvec2 value) { uvec2 subgroupInclusiveAddUint64(uvec2 value) {
uvec2 result = value; uvec2 result = value;
for (uint i = 1; i < gl_SubgroupSize; i *= 2) { for (uint i = 1; i < gl_SubgroupSize; i *= 2) {
if (i <= gl_SubgroupInvocationID) {
uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i; uvec2 other = subgroupShuffleUp(result, i); // get value from subgroup_inv_id - i;
if (i <= gl_SubgroupInvocationID) {
result = AddUint64(result, other); result = AddUint64(result, other);
} }
} }
@ -76,89 +81,93 @@ uvec2 subgroupInclusiveAddUint64(uvec2 value) {
} }
// Writes down the results to the output buffer and to the accumulation buffer // Writes down the results to the output buffer and to the accumulation buffer
void WriteResults(uvec2 result) { void WriteResults(uvec2 results[LOCAL_RESULTS]) {
uint current_global_id = gl_GlobalInvocationID.x; const uint current_id = gl_LocalInvocationID.x;
uvec2 base_data = current_global_id < max_accumulation_base ? accumulated_data : uvec2(0); const uvec2 accum = accumulated_data;
output_data[current_global_id] = result + base_data; for (uint i = 0; i < LOCAL_RESULTS; i++) {
if (max_accumulation_base >= accumulation_limit + 1) { uvec2 base_data = current_id * LOCAL_RESULTS + i < min_accumulation_base ? accum : uvec2(0, 0);
if (current_global_id == accumulation_limit) { AddUint64(results[i], base_data);
accumulated_data = result; }
for (uint i = 0; i < LOCAL_RESULTS; i++) {
output_data[buffer_offset + current_id * LOCAL_RESULTS + i] = results[i];
}
uint index = accumulation_limit % LOCAL_RESULTS;
uint base_id = accumulation_limit / LOCAL_RESULTS;
if (min_accumulation_base >= accumulation_limit + 1) {
if (current_id == base_id) {
accumulated_data = results[index];
} }
return; return;
} }
// We have that ugly case in which the accumulation data is reset in the middle somewhere. // We have that ugly case in which the accumulation data is reset in the middle somewhere.
barrier(); barrier();
groupMemoryBarrier(); groupMemoryBarrier();
if (current_global_id == accumulation_limit) {
uvec2 value_1 = output_data[max_accumulation_base]; if (current_id == base_id) {
accumulated_data = AddUint64(result, -value_1); uvec2 reset_value = output_data[max_accumulation_base - 1];
// Calculate two complement / negate manually
reset_value = AddUint64(uvec2(1,0), ~reset_value);
accumulated_data = AddUint64(results[index], reset_value);
} }
} }
void main() { void main() {
uint subgroup_inv_id = gl_SubgroupInvocationID; const uint subgroup_inv_id = gl_SubgroupInvocationID;
uint subgroup_id = gl_SubgroupID; const uint subgroup_id = gl_SubgroupID + gl_WorkGroupID.x * gl_NumSubgroups;
uint last_subgroup_id = subgroupMax(subgroup_inv_id); const uint last_subgroup_id = subgroupMax(subgroup_inv_id);
uint current_global_id = gl_GlobalInvocationID.x; const uint current_id = gl_LocalInvocationID.x;
uint total_work = gl_NumWorkGroups.x * gl_WorkGroupSize.x; const uint total_work = accumulation_limit;
uvec2 data = input_data[current_global_id]; const uint last_result_id = LOCAL_RESULTS - 1;
uvec2 data[LOCAL_RESULTS];
for (uint i = 0; i < LOCAL_RESULTS; i++) {
data[i] = input_data[buffer_offset + current_id * LOCAL_RESULTS + i];
}
uvec2 results[LOCAL_RESULTS];
results[0] = data[0];
for (uint i = 1; i < LOCAL_RESULTS; i++) {
results[i] = AddUint64(data[i], results[i - 1]);
}
// make sure all input data has been loaded // make sure all input data has been loaded
subgroupBarrier(); subgroupBarrier();
subgroupMemoryBarrier(); subgroupMemoryBarrier();
uvec2 result = subgroupInclusiveAddUint64(data); // on the last local result, do a subgroup inclusive scan sum
results[last_result_id] = subgroupInclusiveAddUint64(results[last_result_id]);
// get the last local result from the subgroup behind the current
uvec2 result_behind = subgroupShuffleUp(results[last_result_id], 1);
if (subgroup_inv_id != 0) {
for (uint i = 1; i < LOCAL_RESULTS; i++) {
results[i - 1] = AddUint64(results[i - 1], result_behind);
}
}
// if we had less queries than our subgroup, just write down the results. // if we had less queries than our subgroup, just write down the results.
if (total_work <= gl_SubgroupSize) { // This condition is constant per dispatch. if (total_work <= gl_SubgroupSize * LOCAL_RESULTS) { // This condition is constant per dispatch.
WriteResults(result); WriteResults(results);
return; return;
} }
// We now have more, so lets write the last result into shared memory. // We now have more, so lets write the last result into shared memory.
// Only pick the last subgroup. // Only pick the last subgroup.
if (subgroup_inv_id == last_subgroup_id) { if (subgroup_inv_id == last_subgroup_id) {
shared_data[subgroup_id] = result; shared_data[subgroup_id] = results[last_result_id];
} }
// wait until everyone loaded their stuffs // wait until everyone loaded their stuffs
barrier(); barrier();
memoryBarrierShared(); memoryBarrierShared();
// Case 1: the total work for the grouped results can be calculated in a single subgroup // only if it's not the first subgroup
// operation (about 1024 queries).
uint total_extra_work = gl_NumSubgroups * gl_NumWorkGroups.x;
if (total_extra_work <= gl_SubgroupSize) { // This condition is constant per dispatch.
if (subgroup_id != 0) { if (subgroup_id != 0) {
// get the results from some previous invocation
uvec2 tmp = shared_data[subgroup_inv_id]; uvec2 tmp = shared_data[subgroup_inv_id];
subgroupBarrier(); subgroupBarrier();
subgroupMemoryBarrierShared(); subgroupMemoryBarrierShared();
tmp = subgroupInclusiveAddUint64(tmp); tmp = subgroupInclusiveAddUint64(tmp);
result = AddUint64(result, subgroupShuffle(tmp, subgroup_id - 1)); // obtain the result that would be equivalent to the previous result
uvec2 shuffled_result = subgroupShuffle(tmp, subgroup_id - 1);
for (uint i = 0; i < LOCAL_RESULTS; i++) {
results[i] = AddUint64(results[i], shuffled_result);
} }
WriteResults(result);
return;
} }
WriteResults(results);
// Case 2: our work amount is huge, so lets do it in O(log n) steps.
const uint extra = (total_extra_work ^ (total_extra_work - 1)) != 0 ? 1 : 0;
const uint steps = 1 << (findMSB(total_extra_work) + extra);
uint step;
// Hillis and Steele's algorithm
for (step = 1; step < steps; step *= 2) {
if (current_global_id < steps && current_global_id >= step) {
uvec2 current = shared_data[current_global_id];
uvec2 other = shared_data[current_global_id - step];
shared_data[current_global_id] = AddUint64(current, other);
}
// steps is constant, so this will always execute in ever workgroup's thread.
barrier();
memoryBarrierShared();
}
// Only add results for groups higher than 0
if (subgroup_id != 0) {
result = AddUint64(result, shared_data[subgroup_id - 1]);
}
// Just write the final results. We are done
WriteResults(result);
} }

View file

@ -32,25 +32,30 @@
#endif #endif
BEGIN_PUSH_CONSTANTS BEGIN_PUSH_CONSTANTS
UNIFORM(0) uint max_accumulation_base; UNIFORM(0) uint min_accumulation_base;
UNIFORM(1) uint accumulation_limit; UNIFORM(1) uint max_accumulation_base;
UNIFORM(2) uint accumulation_limit;
UNIFORM(3) uint buffer_offset;
END_PUSH_CONSTANTS END_PUSH_CONSTANTS
layout(local_size_x = 32) in; #define LOCAL_RESULTS 4
#define QUERIES_PER_INVOC 2048
layout(local_size_x = QUERIES_PER_INVOC / LOCAL_RESULTS) in;
layout(std430, binding = 0) readonly buffer block1 { layout(std430, binding = 0) readonly buffer block1 {
uvec2 input_data[gl_WorkGroupSize.x]; uvec2 input_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
}; };
layout(std430, binding = 1) writeonly coherent buffer block2 { layout(std430, binding = 1) writeonly coherent buffer block2 {
uvec2 output_data[gl_WorkGroupSize.x]; uvec2 output_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
}; };
layout(std430, binding = 2) coherent buffer block3 { layout(std430, binding = 2) coherent buffer block3 {
uvec2 accumulated_data; uvec2 accumulated_data;
}; };
shared uvec2 shared_data[gl_WorkGroupSize.x * 2]; shared uvec2 shared_data[gl_WorkGroupSize.x * LOCAL_RESULTS];
uvec2 AddUint64(uvec2 value_1, uvec2 value_2) { uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
uint carry = 0; uint carry = 0;
@ -62,23 +67,31 @@ uvec2 AddUint64(uvec2 value_1, uvec2 value_2) {
void main(void) { void main(void) {
uint id = gl_LocalInvocationID.x; uint id = gl_LocalInvocationID.x;
uvec2 base_value_1 = (id * 2) < max_accumulation_base ? accumulated_data : uvec2(0); uvec2 base_value[LOCAL_RESULTS];
uvec2 base_value_2 = (id * 2 + 1) < max_accumulation_base ? accumulated_data : uvec2(0); const uvec2 accum = accumulated_data;
for (uint i = 0; i < LOCAL_RESULTS; i++) {
base_value[i] = (buffer_offset + id * LOCAL_RESULTS + i) < min_accumulation_base
? accumulated_data
: uvec2(0);
}
uint work_size = gl_WorkGroupSize.x; uint work_size = gl_WorkGroupSize.x;
uint rd_id; uint rd_id;
uint wr_id; uint wr_id;
uint mask; uint mask;
uvec2 input_1 = input_data[id * 2]; uvec2 inputs[LOCAL_RESULTS];
uvec2 input_2 = input_data[id * 2 + 1]; for (uint i = 0; i < LOCAL_RESULTS; i++) {
inputs[i] = input_data[buffer_offset + id * LOCAL_RESULTS + i];
}
// The number of steps is the log base 2 of the // The number of steps is the log base 2 of the
// work group size, which should be a power of 2 // work group size, which should be a power of 2
const uint steps = uint(log2(work_size)) + 1; const uint steps = uint(log2(work_size)) + uint(log2(LOCAL_RESULTS));
uint step = 0; uint step = 0;
// Each invocation is responsible for the content of // Each invocation is responsible for the content of
// two elements of the output array // two elements of the output array
shared_data[id * 2] = input_1; for (uint i = 0; i < LOCAL_RESULTS; i++) {
shared_data[id * 2 + 1] = input_2; shared_data[id * LOCAL_RESULTS + i] = inputs[i];
}
// Synchronize to make sure that everyone has initialized // Synchronize to make sure that everyone has initialized
// their elements of shared_data[] with data loaded from // their elements of shared_data[] with data loaded from
// the input arrays // the input arrays
@ -100,21 +113,26 @@ void main(void) {
memoryBarrierShared(); memoryBarrierShared();
} }
// Add the accumulation // Add the accumulation
shared_data[id * 2] = AddUint64(shared_data[id * 2], base_value_1); for (uint i = 0; i < LOCAL_RESULTS; i++) {
shared_data[id * 2 + 1] = AddUint64(shared_data[id * 2 + 1], base_value_2); shared_data[id * LOCAL_RESULTS + i] =
AddUint64(shared_data[id * LOCAL_RESULTS + i], base_value[i]);
}
barrier(); barrier();
memoryBarrierShared(); memoryBarrierShared();
// Finally write our data back to the output buffer // Finally write our data back to the output buffer
output_data[id * 2] = shared_data[id * 2]; for (uint i = 0; i < LOCAL_RESULTS; i++) {
output_data[id * 2 + 1] = shared_data[id * 2 + 1]; output_data[buffer_offset + id * LOCAL_RESULTS + i] = shared_data[id * LOCAL_RESULTS + i];
}
if (id == 0) { if (id == 0) {
if (max_accumulation_base >= accumulation_limit + 1) { if (min_accumulation_base >= accumulation_limit + 1) {
accumulated_data = shared_data[accumulation_limit]; accumulated_data = shared_data[accumulation_limit];
return; return;
} }
uvec2 value_1 = shared_data[max_accumulation_base]; uvec2 reset_value = shared_data[max_accumulation_base - 1];
uvec2 value_2 = shared_data[accumulation_limit]; uvec2 final_value = shared_data[accumulation_limit];
accumulated_data = AddUint64(value_1, -value_2); // Two complements
reset_value = AddUint64(uvec2(1, 0), ~reset_value);
accumulated_data = AddUint64(final_value, reset_value);
} }
} }

View file

@ -67,6 +67,7 @@ public:
} }
auto& params = maxwell3d.draw_manager->GetIndirectParams(); auto& params = maxwell3d.draw_manager->GetIndirectParams();
params.is_byte_count = false;
params.is_indexed = false; params.is_indexed = false;
params.include_count = false; params.include_count = false;
params.count_start_address = 0; params.count_start_address = 0;
@ -161,6 +162,7 @@ public:
0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance); 0, 0x644, Maxwell3D::HLEReplacementAttributeType::BaseInstance);
} }
auto& params = maxwell3d.draw_manager->GetIndirectParams(); auto& params = maxwell3d.draw_manager->GetIndirectParams();
params.is_byte_count = false;
params.is_indexed = true; params.is_indexed = true;
params.include_count = false; params.include_count = false;
params.count_start_address = 0; params.count_start_address = 0;
@ -256,6 +258,7 @@ public:
const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize()); const u32 estimate = static_cast<u32>(maxwell3d.EstimateIndexBufferSize());
maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true; maxwell3d.dirty.flags[VideoCommon::Dirty::IndexBuffer] = true;
auto& params = maxwell3d.draw_manager->GetIndirectParams(); auto& params = maxwell3d.draw_manager->GetIndirectParams();
params.is_byte_count = false;
params.is_indexed = true; params.is_indexed = true;
params.include_count = true; params.include_count = true;
params.count_start_address = maxwell3d.GetMacroAddress(4); params.count_start_address = maxwell3d.GetMacroAddress(4);
@ -319,6 +322,47 @@ private:
} }
}; };
/// HLE replacement for the guest "draw by byte count" macro (registered in HLEMacro's
/// constructor under hash 0xB5F74EDB717278EC).
///
/// Macro parameters, as consumed below:
///   parameters[0] - value written to regs.draw.begin; its low 16 bits are the topology.
///   parameters[1] - vertex stride in bytes (regs.draw_auto_stride).
///   parameters[2] - byte count (regs.draw_auto_byte_count).
class HLE_DrawIndirectByteCount final : public HLEMacroImpl {
public:
    explicit HLE_DrawIndirectByteCount(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}

    /// Issues the draw through the indirect path, or through Fallback() when
    /// AnyParametersDirty() reports false or the topology is not considered safe.
    void Execute(const std::vector<u32>& parameters, [[maybe_unused]] u32 method) override {
        auto topology = static_cast<Maxwell3D::Regs::PrimitiveTopology>(parameters[0] & 0xFFFFU);
        if (!maxwell3d.AnyParametersDirty() || !IsTopologySafe(topology)) {
            Fallback(parameters);
            return;
        }

        // Describe the draw to the draw manager: a single, non-indexed draw whose
        // 4-byte count value lives at the address of macro parameter 2.
        auto& params = maxwell3d.draw_manager->GetIndirectParams();
        params.is_byte_count = true;
        params.is_indexed = false;
        params.include_count = false;
        params.count_start_address = 0;
        params.indirect_start_address = maxwell3d.GetMacroAddress(2);
        params.buffer_size = 4;
        params.max_draw_counts = 1;
        params.stride = parameters[1];

        // Mirror the macro's register writes so later state reads stay coherent.
        maxwell3d.regs.draw.begin = parameters[0];
        maxwell3d.regs.draw_auto_stride = parameters[1];
        maxwell3d.regs.draw_auto_byte_count = parameters[2];

        maxwell3d.draw_manager->DrawArrayIndirect(topology);
    }

private:
    /// Direct (non-indirect) path: refreshes the parameters and draws
    /// byte_count / stride vertices as a single instance.
    void Fallback(const std::vector<u32>& parameters) {
        maxwell3d.RefreshParameters();

        maxwell3d.regs.draw.begin = parameters[0];
        maxwell3d.regs.draw_auto_stride = parameters[1];
        maxwell3d.regs.draw_auto_byte_count = parameters[2];

        // The stride is guest-controlled. A zero stride would make the division below
        // undefined behaviour (a hardware trap on x86), and no vertex count can be
        // derived from it anyway, so skip the draw.
        if (maxwell3d.regs.draw_auto_stride == 0) {
            return;
        }

        maxwell3d.draw_manager->DrawArray(
            maxwell3d.regs.draw.topology, 0,
            maxwell3d.regs.draw_auto_byte_count / maxwell3d.regs.draw_auto_stride, 0, 1);
    }
};
class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl { class HLE_C713C83D8F63CCF3 final : public HLEMacroImpl {
public: public:
explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {} explicit HLE_C713C83D8F63CCF3(Maxwell3D& maxwell3d_) : HLEMacroImpl(maxwell3d_) {}
@ -536,6 +580,11 @@ HLEMacro::HLEMacro(Maxwell3D& maxwell3d_) : maxwell3d{maxwell3d_} {
[](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> { [](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__); return std::make_unique<HLE_TransformFeedbackSetup>(maxwell3d__);
})); }));
builders.emplace(0xB5F74EDB717278ECULL,
std::function<std::unique_ptr<CachedMacro>(Maxwell3D&)>(
[](Maxwell3D& maxwell3d__) -> std::unique_ptr<CachedMacro> {
return std::make_unique<HLE_DrawIndirectByteCount>(maxwell3d__);
}));
} }
HLEMacro::~HLEMacro() = default; HLEMacro::~HLEMacro() = default;

View file

@ -25,6 +25,13 @@
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/texture_cache/slot_vector.h" #include "video_core/texture_cache/slot_vector.h"
namespace VideoCore {
// Query types understood by the legacy query cache declared in this header.
// Only one kind is defined.
enum class QueryType {
SamplesPassed,
};
// Number of enumerators in QueryType; must be updated if an enumerator is added.
constexpr std::size_t NumQueryTypes = 1;
} // namespace VideoCore
namespace VideoCommon { namespace VideoCommon {
using AsyncJobId = SlotId; using AsyncJobId = SlotId;
@ -98,9 +105,9 @@ private:
}; };
template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter> template <class QueryCache, class CachedQuery, class CounterStream, class HostCounter>
class QueryCacheBase : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> { class QueryCacheLegacy : public VideoCommon::ChannelSetupCaches<VideoCommon::ChannelInfo> {
public: public:
explicit QueryCacheBase(VideoCore::RasterizerInterface& rasterizer_, explicit QueryCacheLegacy(VideoCore::RasterizerInterface& rasterizer_,
Core::Memory::Memory& cpu_memory_) Core::Memory::Memory& cpu_memory_)
: rasterizer{rasterizer_}, : rasterizer{rasterizer_},
// Use reinterpret_cast instead of static_cast as workaround for // Use reinterpret_cast instead of static_cast as workaround for

View file

@ -256,6 +256,7 @@ void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type
u8* pointer = impl->cpu_memory.GetPointer(cpu_addr); u8* pointer = impl->cpu_memory.GetPointer(cpu_addr);
u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8); u8* pointer_timestamp = impl->cpu_memory.GetPointer(cpu_addr + 8);
bool is_synced = !Settings::IsGPULevelHigh() && is_fence; bool is_synced = !Settings::IsGPULevelHigh() && is_fence;
std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location, std::function<void()> operation([this, is_synced, streamer, query_base = query, query_location,
pointer, pointer_timestamp] { pointer, pointer_timestamp] {
if (True(query_base->flags & QueryFlagBits::IsInvalidated)) { if (True(query_base->flags & QueryFlagBits::IsInvalidated)) {
@ -285,6 +286,18 @@ void QueryCacheBase<Traits>::CounterReport(GPUVAddr addr, QueryType counter_type
if (is_fence) { if (is_fence) {
impl->rasterizer.SignalFence(std::move(operation)); impl->rasterizer.SignalFence(std::move(operation));
} else { } else {
if (!Settings::IsGPULevelHigh() && counter_type == QueryType::Payload) {
if (has_timestamp) {
u64 timestamp = impl->gpu.GetTicks();
u64 value = static_cast<u64>(payload);
std::memcpy(pointer_timestamp, &timestamp, sizeof(timestamp));
std::memcpy(pointer, &value, sizeof(value));
} else {
std::memcpy(pointer, &payload, sizeof(payload));
}
streamer->Free(new_query_id);
return;
}
impl->rasterizer.SyncOperation(std::move(operation)); impl->rasterizer.SyncOperation(std::move(operation));
} }
if (is_synced) { if (is_synced) {

View file

@ -12,6 +12,7 @@
#include "video_core/cache_types.h" #include "video_core/cache_types.h"
#include "video_core/engines/fermi_2d.h" #include "video_core/engines/fermi_2d.h"
#include "video_core/gpu.h" #include "video_core/gpu.h"
#include "video_core/query_cache/types.h"
#include "video_core/rasterizer_download_area.h" #include "video_core/rasterizer_download_area.h"
namespace Tegra { namespace Tegra {
@ -26,11 +27,6 @@ struct ChannelState;
namespace VideoCore { namespace VideoCore {
enum class QueryType {
SamplesPassed,
};
constexpr std::size_t NumQueryTypes = 1;
enum class LoadCallbackStage { enum class LoadCallbackStage {
Prepare, Prepare,
Build, Build,
@ -58,10 +54,11 @@ public:
virtual void DispatchCompute() = 0; virtual void DispatchCompute() = 0;
/// Resets the counter of a query /// Resets the counter of a query
virtual void ResetCounter(QueryType type) = 0; virtual void ResetCounter(VideoCommon::QueryType type) = 0;
/// Records a GPU query and caches it /// Records a GPU query and caches it
virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional<u64> timestamp) = 0; virtual void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) = 0;
/// Signal an uniform buffer binding /// Signal an uniform buffer binding
virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@ -83,7 +80,7 @@ public:
virtual void SignalReference() = 0; virtual void SignalReference() = 0;
/// Release all pending fences. /// Release all pending fences.
virtual void ReleaseFences() = 0; virtual void ReleaseFences(bool force = true) = 0;
/// Notify rasterizer that all caches should be flushed to Switch memory /// Notify rasterizer that all caches should be flushed to Switch memory
virtual void FlushAll() = 0; virtual void FlushAll() = 0;

View file

@ -26,16 +26,18 @@ void RasterizerNull::Draw(bool is_indexed, u32 instance_count) {}
void RasterizerNull::DrawTexture() {} void RasterizerNull::DrawTexture() {}
void RasterizerNull::Clear(u32 layer_count) {} void RasterizerNull::Clear(u32 layer_count) {}
void RasterizerNull::DispatchCompute() {} void RasterizerNull::DispatchCompute() {}
void RasterizerNull::ResetCounter(VideoCore::QueryType type) {} void RasterizerNull::ResetCounter(VideoCommon::QueryType type) {}
void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, void RasterizerNull::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
std::optional<u64> timestamp) { VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
if (!gpu_memory) { if (!gpu_memory) {
return; return;
} }
if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
gpu_memory->Write(gpu_addr, u64{0}); u64 ticks = m_gpu.GetTicks();
if (timestamp) { gpu_memory->Write<u64>(gpu_addr + 8, ticks);
gpu_memory->Write(gpu_addr + 8, *timestamp); gpu_memory->Write<u64>(gpu_addr, static_cast<u64>(payload));
} else {
gpu_memory->Write<u32>(gpu_addr, payload);
} }
} }
void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, void RasterizerNull::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@ -74,7 +76,7 @@ void RasterizerNull::SignalSyncPoint(u32 value) {
syncpoint_manager.IncrementHost(value); syncpoint_manager.IncrementHost(value);
} }
void RasterizerNull::SignalReference() {} void RasterizerNull::SignalReference() {}
void RasterizerNull::ReleaseFences() {} void RasterizerNull::ReleaseFences(bool) {}
void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {} void RasterizerNull::FlushAndInvalidateRegion(VAddr addr, u64 size, VideoCommon::CacheType) {}
void RasterizerNull::WaitForIdle() {} void RasterizerNull::WaitForIdle() {}
void RasterizerNull::FragmentBarrier() {} void RasterizerNull::FragmentBarrier() {}

View file

@ -42,8 +42,9 @@ public:
void DrawTexture() override; void DrawTexture() override;
void Clear(u32 layer_count) override; void Clear(u32 layer_count) override;
void DispatchCompute() override; void DispatchCompute() override;
void ResetCounter(VideoCore::QueryType type) override; void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
void FlushAll() override; void FlushAll() override;
@ -63,7 +64,7 @@ public:
void SyncOperation(std::function<void()>&& func) override; void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override; void SignalSyncPoint(u32 value) override;
void SignalReference() override; void SignalReference() override;
void ReleaseFences() override; void ReleaseFences(bool force) override;
void FlushAndInvalidateRegion( void FlushAndInvalidateRegion(
VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
void WaitForIdle() override; void WaitForIdle() override;

View file

@ -27,7 +27,7 @@ constexpr GLenum GetTarget(VideoCore::QueryType type) {
} // Anonymous namespace } // Anonymous namespace
QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_) QueryCache::QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_)
: QueryCacheBase(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {} : QueryCacheLegacy(rasterizer_, cpu_memory_), gl_rasterizer{rasterizer_} {}
QueryCache::~QueryCache() = default; QueryCache::~QueryCache() = default;

View file

@ -26,7 +26,7 @@ class RasterizerOpenGL;
using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>;
class QueryCache final class QueryCache final
: public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { : public VideoCommon::QueryCacheLegacy<QueryCache, CachedQuery, CounterStream, HostCounter> {
public: public:
explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_); explicit QueryCache(RasterizerOpenGL& rasterizer_, Core::Memory::Memory& cpu_memory_);
~QueryCache(); ~QueryCache();

View file

@ -385,13 +385,39 @@ void RasterizerOpenGL::DispatchCompute() {
has_written_global_memory |= pipeline->WritesGlobalMemory(); has_written_global_memory |= pipeline->WritesGlobalMemory();
} }
void RasterizerOpenGL::ResetCounter(VideoCore::QueryType type) { void RasterizerOpenGL::ResetCounter(VideoCommon::QueryType type) {
query_cache.ResetCounter(type); if (type == VideoCommon::QueryType::ZPassPixelCount64) {
query_cache.ResetCounter(VideoCore::QueryType::SamplesPassed);
}
} }
void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
std::optional<u64> timestamp) { VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
query_cache.Query(gpu_addr, type, timestamp); if (type == VideoCommon::QueryType::ZPassPixelCount64) {
if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, {gpu.GetTicks()});
} else {
query_cache.Query(gpu_addr, VideoCore::QueryType::SamplesPassed, std::nullopt);
}
return;
}
if (type != VideoCommon::QueryType::Payload) {
payload = 1u;
}
std::function<void()> func([this, gpu_addr, flags, memory_manager = gpu_memory, payload]() {
if (True(flags & VideoCommon::QueryPropertiesFlags::HasTimeout)) {
u64 ticks = gpu.GetTicks();
memory_manager->Write<u64>(gpu_addr + 8, ticks);
memory_manager->Write<u64>(gpu_addr, static_cast<u64>(payload));
} else {
memory_manager->Write<u32>(gpu_addr, payload);
}
});
if (True(flags & VideoCommon::QueryPropertiesFlags::IsAFence)) {
SignalFence(std::move(func));
return;
}
func();
} }
void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@ -562,8 +588,8 @@ void RasterizerOpenGL::SignalReference() {
fence_manager.SignalOrdering(); fence_manager.SignalOrdering();
} }
void RasterizerOpenGL::ReleaseFences() { void RasterizerOpenGL::ReleaseFences(bool force) {
fence_manager.WaitPendingFences(); fence_manager.WaitPendingFences(force);
} }
void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size, void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size,

View file

@ -86,8 +86,9 @@ public:
void DrawTexture() override; void DrawTexture() override;
void Clear(u32 layer_count) override; void Clear(u32 layer_count) override;
void DispatchCompute() override; void DispatchCompute() override;
void ResetCounter(VideoCore::QueryType type) override; void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
void FlushAll() override; void FlushAll() override;
@ -107,7 +108,7 @@ public:
void SyncOperation(std::function<void()>&& func) override; void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override; void SignalSyncPoint(u32 value) override;
void SignalReference() override; void SignalReference() override;
void ReleaseFences() override; void ReleaseFences(bool force = true) override;
void FlushAndInvalidateRegion( void FlushAndInvalidateRegion(
VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
void WaitForIdle() override; void WaitForIdle() override;

View file

@ -61,6 +61,9 @@ vk::Buffer CreateBuffer(const Device& device, const MemoryAllocator& memory_allo
if (device.IsExtTransformFeedbackSupported()) { if (device.IsExtTransformFeedbackSupported()) {
flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT; flags |= VK_BUFFER_USAGE_TRANSFORM_FEEDBACK_BUFFER_BIT_EXT;
} }
if (device.IsExtConditionalRendering()) {
flags |= VK_BUFFER_USAGE_CONDITIONAL_RENDERING_BIT_EXT;
}
const VkBufferCreateInfo buffer_ci = { const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr, .pNext = nullptr,

View file

@ -12,6 +12,9 @@
#include "common/common_types.h" #include "common/common_types.h"
#include "common/div_ceil.h" #include "common/div_ceil.h"
#include "video_core/host_shaders/astc_decoder_comp_spv.h" #include "video_core/host_shaders/astc_decoder_comp_spv.h"
#include "video_core/host_shaders/queries_prefix_scan_sum_comp_spv.h"
#include "video_core/host_shaders/queries_prefix_scan_sum_nosubgroups_comp_spv.h"
#include "video_core/host_shaders/resolve_conditional_render_comp_spv.h"
#include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h"
#include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h"
#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_compute_pass.h"
@ -57,6 +60,30 @@ constexpr std::array<VkDescriptorSetLayoutBinding, 2> INPUT_OUTPUT_DESCRIPTOR_SE
}, },
}}; }};
// Descriptor set layout passed to ComputePass by QueriesPrefixScanPass: three storage
// buffers at bindings 0, 1 and 2, each a single descriptor visible to the compute stage.
// In QueriesPrefixScanPass::Run they are filled, in order, with the source buffer, the
// destination buffer and the accumulation buffer.
constexpr std::array<VkDescriptorSetLayoutBinding, 3> QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS{{
{
.binding = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = 1,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
{
.binding = 2,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
.stageFlags = VK_SHADER_STAGE_COMPUTE_BIT,
.pImmutableSamplers = nullptr,
},
}};
constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.uniform_buffers = 0, .uniform_buffers = 0,
.storage_buffers = 2, .storage_buffers = 2,
@ -67,6 +94,16 @@ constexpr DescriptorBankInfo INPUT_OUTPUT_BANK_INFO{
.score = 2, .score = 2,
}; };
// Descriptor bank description for the queries prefix-scan pass: three storage buffers and
// no other descriptor kinds, matching QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS.
// NOTE(review): `score` equals the total descriptor count here (and in
// INPUT_OUTPUT_BANK_INFO); confirm its exact meaning against DescriptorBankInfo.
constexpr DescriptorBankInfo QUERIES_SCAN_BANK_INFO{
.uniform_buffers = 0,
.storage_buffers = 3,
.texture_buffers = 0,
.image_buffers = 0,
.textures = 0,
.images = 0,
.score = 3,
};
constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{ constexpr std::array<VkDescriptorSetLayoutBinding, ASTC_NUM_BINDINGS> ASTC_DESCRIPTOR_SET_BINDINGS{{
{ {
.binding = ASTC_BINDING_INPUT_BUFFER, .binding = ASTC_BINDING_INPUT_BUFFER,
@ -103,6 +140,15 @@ constexpr VkDescriptorUpdateTemplateEntry INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLAT
.stride = sizeof(DescriptorUpdateEntry), .stride = sizeof(DescriptorUpdateEntry),
}; };
// Single update-template entry covering all three storage-buffer descriptors of the
// queries prefix-scan pass: it starts at binding 0 and reads three consecutive
// DescriptorUpdateEntry records from the beginning of the descriptor data.
constexpr VkDescriptorUpdateTemplateEntry QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE{
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 3,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.offset = 0,
.stride = sizeof(DescriptorUpdateEntry),
};
constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS> constexpr std::array<VkDescriptorUpdateTemplateEntry, ASTC_NUM_BINDINGS>
ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{ ASTC_PASS_DESCRIPTOR_UPDATE_TEMPLATE_ENTRY{{
{ {
@ -131,13 +177,21 @@ struct AstcPushConstants {
u32 block_height; u32 block_height;
u32 block_height_mask; u32 block_height_mask;
}; };
// Push constants uploaded by QueriesPrefixScanPass::Run for each dispatch.
// NOTE(review): field order presumably has to match the push-constant block of the
// prefix-scan shaders (min_accumulation_base, max_accumulation_base, accumulation_limit,
// buffer_offset) - confirm against the shader sources before reordering.
struct QueriesPrefixScanPushConstants {
// Set from Run's min_accumulation_limit argument.
u32 min_accumulation_base;
// Set from Run's max_accumulation_limit argument.
u32 max_accumulation_base;
// Index of the last element handled by the dispatch (runs_to_do - 1).
u32 accumulation_limit;
// Index of the first element handled by the dispatch.
u32 buffer_offset;
};
} // Anonymous namespace } // Anonymous namespace
ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool, ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorSetLayoutBinding> bindings,
vk::Span<VkDescriptorUpdateTemplateEntry> templates, vk::Span<VkDescriptorUpdateTemplateEntry> templates,
const DescriptorBankInfo& bank_info, const DescriptorBankInfo& bank_info,
vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code) vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
std::optional<u32> optional_subgroup_size)
: device{device_} { : device{device_} {
descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({ descriptor_set_layout = device.GetLogical().CreateDescriptorSetLayout({
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
@ -178,13 +232,19 @@ ComputePass::ComputePass(const Device& device_, DescriptorPool& descriptor_pool,
.pCode = code.data(), .pCode = code.data(),
}); });
device.SaveShader(code); device.SaveShader(code);
const VkPipelineShaderStageRequiredSubgroupSizeCreateInfoEXT subgroup_size_ci{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_REQUIRED_SUBGROUP_SIZE_CREATE_INFO_EXT,
.pNext = nullptr,
.requiredSubgroupSize = optional_subgroup_size ? *optional_subgroup_size : 32U,
};
bool use_setup_size = device.IsExtSubgroupSizeControlSupported() && optional_subgroup_size;
pipeline = device.GetLogical().CreateComputePipeline({ pipeline = device.GetLogical().CreateComputePipeline({
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.pNext = nullptr, .pNext = nullptr,
.flags = 0, .flags = 0,
.stage{ .stage{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO, .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.pNext = nullptr, .pNext = use_setup_size ? &subgroup_size_ci : nullptr,
.flags = 0, .flags = 0,
.stage = VK_SHADER_STAGE_COMPUTE_BIT, .stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = *module, .module = *module,
@ -302,6 +362,123 @@ std::pair<VkBuffer, VkDeviceSize> QuadIndexedPass::Assemble(
return {staging.buffer, staging.offset}; return {staging.buffer, staging.offset};
} }
// Builds the compute pass around the RESOLVE_CONDITIONAL_RENDER_COMP_SPV shader using the
// shared two-buffer (input/output) descriptor layout. No push-constant range is passed
// (nullptr). The scheduler and descriptor queue references are stored for Resolve().
ConditionalRenderingResolvePass::ConditionalRenderingResolvePass(
const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(device_, descriptor_pool_, INPUT_OUTPUT_DESCRIPTOR_SET_BINDINGS,
INPUT_OUTPUT_DESCRIPTOR_UPDATE_TEMPLATE, INPUT_OUTPUT_BANK_INFO, nullptr,
RESOLVE_CONDITIONAL_RENDER_COMP_SPV),
scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
// Records a one-workgroup compute dispatch that reads comparison data from src_buffer at
// src_offset and writes a u32-sized result to the start of dst_buffer, with barriers so
// the result is visible to conditional-rendering reads.
//   dst_buffer      - receives the result (first sizeof(u32) bytes are bound).
//   src_buffer      - buffer holding the data to compare.
//   src_offset      - byte offset of that data inside src_buffer.
//   compare_to_zero - selects how many source bytes are bound: 8 when true, 24 otherwise.
void ConditionalRenderingResolvePass::Resolve(VkBuffer dst_buffer, VkBuffer src_buffer,
u32 src_offset, bool compare_to_zero) {
// NOTE(review): 8 vs 24 bytes presumably corresponds to one 64-bit value versus the span
// needed to reach a second value to compare against - confirm against the shader.
const size_t compare_size = compare_to_zero ? 8 : 24;
// Queue the two storage-buffer descriptors in binding order (0 = source, 1 = destination).
compute_pass_descriptor_queue.Acquire();
compute_pass_descriptor_queue.AddBuffer(src_buffer, src_offset, compare_size);
compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, sizeof(u32));
const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};
// Compute dispatches are recorded outside of a render pass.
scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([this, descriptor_data](vk::CommandBuffer cmdbuf) {
// Makes earlier transfer/shader writes visible to this pass's shader accesses.
static constexpr VkMemoryBarrier read_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT | VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
};
// Makes the shader's result visible to conditional-rendering predicate reads.
static constexpr VkMemoryBarrier write_barrier{
.sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
.pNext = nullptr,
.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
.dstAccessMask = VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT,
};
const VkDescriptorSet set = descriptor_allocator.Commit();
device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
// A single workgroup is dispatched.
cmdbuf.Dispatch(1, 1, 1);
cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
VK_PIPELINE_STAGE_CONDITIONAL_RENDERING_BIT_EXT, 0, write_barrier);
});
}
// Builds the prefix-scan compute pass with the three-storage-buffer layout and a push
// constant range sized for QueriesPrefixScanPushConstants. The subgroup-based shader is
// used only when the device reports all of the basic, arithmetic, shuffle and
// shuffle-relative subgroup features; otherwise the variant without subgroup operations
// is selected.
QueriesPrefixScanPass::QueriesPrefixScanPass(
const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_)
: ComputePass(
device_, descriptor_pool_, QUERIES_SCAN_DESCRIPTOR_SET_BINDINGS,
QUERIES_SCAN_DESCRIPTOR_UPDATE_TEMPLATE, QUERIES_SCAN_BANK_INFO,
COMPUTE_PUSH_CONSTANT_RANGE<sizeof(QueriesPrefixScanPushConstants)>,
device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_BASIC_BIT) &&
device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_ARITHMETIC_BIT) &&
device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_BIT) &&
device_.IsSubgroupFeatureSupported(VK_SUBGROUP_FEATURE_SHUFFLE_RELATIVE_BIT)
? std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_COMP_SPV)
: std::span<const u32>(QUERIES_PREFIX_SCAN_SUM_NOSUBGROUPS_COMP_SPV)),
scheduler{scheduler_}, compute_pass_descriptor_queue{compute_pass_descriptor_queue_} {}
/// Records the prefix-scan compute work over `number_of_sums` 64-bit values.
///
/// The work is split into chunks of at most DISPATCH_SIZE elements; each chunk is one
/// single-workgroup dispatch that receives its starting element through the
/// `buffer_offset` push constant.
///
/// @param accumulation_buffer    Buffer bound at binding 2 (one u64).
/// @param dst_buffer             Output buffer bound at binding 1.
/// @param src_buffer             Input buffer bound at binding 0.
/// @param number_of_sums         Total number of u64 elements to process.
/// @param min_accumulation_limit Forwarded to the shader as `min_accumulation_base`.
/// @param max_accumulation_limit Forwarded to the shader as `max_accumulation_base`.
void QueriesPrefixScanPass::Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer,
                                VkBuffer src_buffer, size_t number_of_sums,
                                size_t min_accumulation_limit, size_t max_accumulation_limit) {
    size_t current_runs = number_of_sums;
    size_t offset = 0;
    while (current_runs != 0) {
        static constexpr size_t DISPATCH_SIZE = 2048U;
        size_t runs_to_do = std::min<size_t>(current_runs, DISPATCH_SIZE);
        current_runs -= runs_to_do;

        // Every chunk binds the whole buffers; the shader offsets into them itself.
        compute_pass_descriptor_queue.Acquire();
        compute_pass_descriptor_queue.AddBuffer(src_buffer, 0, number_of_sums * sizeof(u64));
        compute_pass_descriptor_queue.AddBuffer(dst_buffer, 0, number_of_sums * sizeof(u64));
        compute_pass_descriptor_queue.AddBuffer(accumulation_buffer, 0, sizeof(u64));
        const void* const descriptor_data{compute_pass_descriptor_queue.UpdateData()};

        size_t used_offset = offset;
        offset += runs_to_do;

        scheduler.RequestOutsideRenderPassOperationContext();
        scheduler.Record([this, descriptor_data, min_accumulation_limit, max_accumulation_limit,
                          runs_to_do, used_offset](vk::CommandBuffer cmdbuf) {
            // Makes earlier transfer writes visible to this dispatch's shader accesses.
            static constexpr VkMemoryBarrier read_barrier{
                .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
                .pNext = nullptr,
                .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT,
                .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT,
            };
            // The conditional-rendering access bit is only valid when the extension is
            // available on the device, so it is added conditionally.
            const VkAccessFlags conditional_access =
                device.IsExtConditionalRendering()
                    ? static_cast<VkAccessFlags>(VK_ACCESS_CONDITIONAL_RENDERING_READ_BIT_EXT)
                    : VkAccessFlags{0};
            // Makes the scan results visible to every kind of read that may consume them.
            const VkMemoryBarrier write_barrier{
                .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER,
                .pNext = nullptr,
                .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT,
                .dstAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_TRANSFER_READ_BIT |
                                 VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT |
                                 VK_ACCESS_INDIRECT_COMMAND_READ_BIT | VK_ACCESS_INDEX_READ_BIT |
                                 VK_ACCESS_UNIFORM_READ_BIT | conditional_access,
            };
            const QueriesPrefixScanPushConstants uniforms{
                .min_accumulation_base = static_cast<u32>(min_accumulation_limit),
                .max_accumulation_base = static_cast<u32>(max_accumulation_limit),
                .accumulation_limit = static_cast<u32>(runs_to_do - 1),
                .buffer_offset = static_cast<u32>(used_offset),
            };
            const VkDescriptorSet set = descriptor_allocator.Commit();
            device.GetLogical().UpdateDescriptorSet(set, *descriptor_template, descriptor_data);

            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT,
                                   VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, 0, read_barrier);
            cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
            cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *layout, 0, set, {});
            cmdbuf.PushConstants(*layout, VK_SHADER_STAGE_COMPUTE_BIT, uniforms);
            cmdbuf.Dispatch(1, 1, 1);
            // The destination stage must support every access type in dstAccessMask.
            // The conditional-rendering stage alone does not cover the shader, transfer,
            // vertex, index, indirect and uniform reads listed above, so the barrier
            // targets all commands instead.
            cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT,
                                   VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, write_barrier);
        });
    }
}
ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, ASTCDecoderPass::ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_, DescriptorPool& descriptor_pool_,
StagingBufferPool& staging_buffer_pool_, StagingBufferPool& staging_buffer_pool_,

View file

@ -3,6 +3,7 @@
#pragma once #pragma once
#include <optional>
#include <span> #include <span>
#include <utility> #include <utility>
@ -31,7 +32,8 @@ public:
vk::Span<VkDescriptorSetLayoutBinding> bindings, vk::Span<VkDescriptorSetLayoutBinding> bindings,
vk::Span<VkDescriptorUpdateTemplateEntry> templates, vk::Span<VkDescriptorUpdateTemplateEntry> templates,
const DescriptorBankInfo& bank_info, const DescriptorBankInfo& bank_info,
vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code); vk::Span<VkPushConstantRange> push_constants, std::span<const u32> code,
std::optional<u32> optional_subgroup_size = std::nullopt);
~ComputePass(); ~ComputePass();
protected: protected:
@ -82,6 +84,33 @@ private:
ComputePassDescriptorQueue& compute_pass_descriptor_queue; ComputePassDescriptorQueue& compute_pass_descriptor_queue;
}; };
/// Compute pass exposing a single Resolve() operation between two buffers.
/// NOTE(review): presumably converts a query value into the predicate consumed by host
/// conditional rendering — confirm against the implementation and its shader.
class ConditionalRenderingResolvePass final : public ComputePass {
public:
/// Stores the scheduler and the compute-pass descriptor queue; the device and descriptor pool
/// are presumably forwarded to the ComputePass base (constructor body not visible here).
explicit ConditionalRenderingResolvePass(
const Device& device_, Scheduler& scheduler_, DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
/// Resolves a value read from `src_buffer` at `src_offset` into `dst_buffer`.
/// NOTE(review): `compare_to_zero` presumably selects the comparison mode — verify in the
/// definition, which is not part of this declaration.
void Resolve(VkBuffer dst_buffer, VkBuffer src_buffer, u32 src_offset, bool compare_to_zero);
private:
// Scheduler used to record this pass' commands.
Scheduler& scheduler;
// Queue that supplies the descriptor payload for this pass.
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
/// Compute pass that runs the queries prefix-scan shader over a buffer of 64-bit entries.
class QueriesPrefixScanPass final : public ComputePass {
public:
/// Stores the scheduler and the compute-pass descriptor queue used by Run(); the remaining
/// arguments configure the ComputePass base.
explicit QueriesPrefixScanPass(const Device& device_, Scheduler& scheduler_,
DescriptorPool& descriptor_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue_);
/// Records the scan over `number_of_sums` entries of `src_buffer` into `dst_buffer`, with
/// `accumulation_buffer` bound as a single 64-bit slot. The two accumulation limits are passed
/// to the shader as push constants.
void Run(VkBuffer accumulation_buffer, VkBuffer dst_buffer, VkBuffer src_buffer,
size_t number_of_sums, size_t min_accumulation_limit, size_t max_accumulation_limit);
private:
// Scheduler used to record this pass' commands.
Scheduler& scheduler;
// Queue that supplies the descriptor payload for each dispatch.
ComputePassDescriptorQueue& compute_pass_descriptor_queue;
};
class ASTCDecoderPass final : public ComputePass { class ASTCDecoderPass final : public ComputePass {
public: public:
explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_, explicit ASTCDecoderPass(const Device& device_, Scheduler& scheduler_,

View file

@ -7,6 +7,7 @@
#include "video_core/fence_manager.h" #include "video_core/fence_manager.h"
#include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_texture_cache.h"
namespace Core { namespace Core {
@ -20,7 +21,6 @@ class RasterizerInterface;
namespace Vulkan { namespace Vulkan {
class Device; class Device;
class QueryCache;
class Scheduler; class Scheduler;
class InnerFence : public VideoCommon::FenceBase { class InnerFence : public VideoCommon::FenceBase {

File diff suppressed because it is too large Load diff

View file

@ -1,101 +1,75 @@
// SPDX-FileCopyrightText: Copyright 2020 yuzu Emulator Project // SPDX-FileCopyrightText: Copyright 2023 yuzu Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later // SPDX-License-Identifier: GPL-3.0-or-later
#pragma once #pragma once
#include <cstddef>
#include <memory> #include <memory>
#include <utility>
#include <vector>
#include "common/common_types.h" #include "video_core/query_cache/query_cache_base.h"
#include "video_core/query_cache.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h"
#include "video_core/renderer_vulkan/vk_resource_pool.h"
#include "video_core/vulkan_common/vulkan_wrapper.h"
namespace VideoCore { namespace VideoCore {
class RasterizerInterface; class RasterizerInterface;
} }
namespace VideoCommon {
class StreamerInterface;
}
namespace Vulkan { namespace Vulkan {
class CachedQuery;
class Device; class Device;
class HostCounter;
class QueryCache;
class Scheduler; class Scheduler;
class StagingBufferPool;
using CounterStream = VideoCommon::CounterStreamBase<QueryCache, HostCounter>; struct QueryCacheRuntimeImpl;
class QueryPool final : public ResourcePool { class QueryCacheRuntime {
public: public:
explicit QueryPool(const Device& device, Scheduler& scheduler, VideoCore::QueryType type); explicit QueryCacheRuntime(VideoCore::RasterizerInterface* rasterizer,
~QueryPool() override; Core::Memory::Memory& cpu_memory_,
Vulkan::BufferCache& buffer_cache_, const Device& device_,
const MemoryAllocator& memory_allocator_, Scheduler& scheduler_,
StagingBufferPool& staging_pool_,
ComputePassDescriptorQueue& compute_pass_descriptor_queue,
DescriptorPool& descriptor_pool);
~QueryCacheRuntime();
std::pair<VkQueryPool, u32> Commit(); template <typename SyncValuesType>
void SyncValues(std::span<SyncValuesType> values, VkBuffer base_src_buffer = nullptr);
void Reserve(std::pair<VkQueryPool, u32> query); void Barriers(bool is_prebarrier);
protected: void EndHostConditionalRendering();
void Allocate(std::size_t begin, std::size_t end) override;
void PauseHostConditionalRendering();
void ResumeHostConditionalRendering();
bool HostConditionalRenderingCompareValue(VideoCommon::LookupData object_1, bool qc_dirty);
bool HostConditionalRenderingCompareValues(VideoCommon::LookupData object_1,
VideoCommon::LookupData object_2, bool qc_dirty,
bool equal_check);
VideoCommon::StreamerInterface* GetStreamerInterface(VideoCommon::QueryType query_type);
void Bind3DEngine(Tegra::Engines::Maxwell3D* maxwell3d);
template <typename Func>
void View3DRegs(Func&& func);
private: private:
static constexpr std::size_t GROW_STEP = 512; void HostConditionalRenderingCompareValueImpl(VideoCommon::LookupData object, bool is_equal);
void HostConditionalRenderingCompareBCImpl(VAddr address, bool is_equal);
const Device& device; friend struct QueryCacheRuntimeImpl;
const VideoCore::QueryType type; std::unique_ptr<QueryCacheRuntimeImpl> impl;
std::vector<vk::QueryPool> pools;
std::vector<bool> usage;
}; };
class QueryCache final struct QueryCacheParams {
: public VideoCommon::QueryCacheBase<QueryCache, CachedQuery, CounterStream, HostCounter> { using RuntimeType = typename Vulkan::QueryCacheRuntime;
public:
explicit QueryCache(VideoCore::RasterizerInterface& rasterizer_,
Core::Memory::Memory& cpu_memory_, const Device& device_,
Scheduler& scheduler_);
~QueryCache();
std::pair<VkQueryPool, u32> AllocateQuery(VideoCore::QueryType type);
void Reserve(VideoCore::QueryType type, std::pair<VkQueryPool, u32> query);
const Device& GetDevice() const noexcept {
return device;
}
Scheduler& GetScheduler() const noexcept {
return scheduler;
}
private:
const Device& device;
Scheduler& scheduler;
std::array<QueryPool, VideoCore::NumQueryTypes> query_pools;
}; };
class HostCounter final : public VideoCommon::HostCounterBase<QueryCache, HostCounter> { using QueryCache = VideoCommon::QueryCacheBase<QueryCacheParams>;
public:
explicit HostCounter(QueryCache& cache_, std::shared_ptr<HostCounter> dependency_,
VideoCore::QueryType type_);
~HostCounter();
void EndQuery();
private:
u64 BlockingQuery(bool async = false) const override;
QueryCache& cache;
const VideoCore::QueryType type;
const std::pair<VkQueryPool, u32> query;
const u64 tick;
};
class CachedQuery : public VideoCommon::CachedQueryBase<HostCounter> {
public:
explicit CachedQuery(QueryCache&, VideoCore::QueryType, VAddr cpu_addr_, u8* host_ptr_)
: CachedQueryBase{cpu_addr_, host_ptr_} {}
};
} // namespace Vulkan } // namespace Vulkan

View file

@ -24,6 +24,7 @@
#include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h"
#include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h"
#include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h"
#include "video_core/renderer_vulkan/vk_query_cache.h"
#include "video_core/renderer_vulkan/vk_rasterizer.h" #include "video_core/renderer_vulkan/vk_rasterizer.h"
#include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_scheduler.h"
#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h"
@ -170,9 +171,11 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra
buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool,
guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool), guest_descriptor_queue, compute_pass_descriptor_queue, descriptor_pool),
buffer_cache(*this, cpu_memory_, buffer_cache_runtime), buffer_cache(*this, cpu_memory_, buffer_cache_runtime),
query_cache_runtime(this, cpu_memory_, buffer_cache, device, memory_allocator, scheduler,
staging_pool, compute_pass_descriptor_queue, descriptor_pool),
query_cache(gpu, *this, cpu_memory_, query_cache_runtime),
pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue, pipeline_cache(*this, device, scheduler, descriptor_pool, guest_descriptor_queue,
render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()), render_pass_cache, buffer_cache, texture_cache, gpu.ShaderNotify()),
query_cache{*this, cpu_memory_, device, scheduler},
accelerate_dma(buffer_cache, texture_cache, scheduler), accelerate_dma(buffer_cache, texture_cache, scheduler),
fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler),
wfi_event(device.GetLogical().CreateEvent()) { wfi_event(device.GetLogical().CreateEvent()) {
@ -189,14 +192,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
FlushWork(); FlushWork();
gpu_memory->FlushCaching(); gpu_memory->FlushCaching();
#if ANDROID query_cache.NotifySegment(true);
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
query_cache.UpdateCounters();
}
#else
query_cache.UpdateCounters();
#endif
GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()}; GraphicsPipeline* const pipeline{pipeline_cache.CurrentGraphicsPipeline()};
if (!pipeline) { if (!pipeline) {
@ -207,13 +203,12 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
pipeline->SetEngine(maxwell3d, gpu_memory); pipeline->SetEngine(maxwell3d, gpu_memory);
pipeline->Configure(is_indexed); pipeline->Configure(is_indexed);
BeginTransformFeedback();
UpdateDynamicStates(); UpdateDynamicStates();
HandleTransformFeedback();
query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
maxwell3d->regs.zpass_pixel_count_enable);
draw_func(); draw_func();
EndTransformFeedback();
} }
void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) { void RasterizerVulkan::Draw(bool is_indexed, u32 instance_count) {
@ -241,6 +236,14 @@ void RasterizerVulkan::DrawIndirect() {
const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer(); const auto indirect_buffer = buffer_cache.GetDrawIndirectBuffer();
const auto& buffer = indirect_buffer.first; const auto& buffer = indirect_buffer.first;
const auto& offset = indirect_buffer.second; const auto& offset = indirect_buffer.second;
if (params.is_byte_count) {
scheduler.Record([buffer_obj = buffer->Handle(), offset,
stride = params.stride](vk::CommandBuffer cmdbuf) {
cmdbuf.DrawIndirectByteCountEXT(1, 0, buffer_obj, offset, 0,
static_cast<u32>(stride));
});
return;
}
if (params.include_count) { if (params.include_count) {
const auto count = buffer_cache.GetDrawIndirectCount(); const auto count = buffer_cache.GetDrawIndirectCount();
const auto& draw_buffer = count.first; const auto& draw_buffer = count.first;
@ -280,20 +283,15 @@ void RasterizerVulkan::DrawTexture() {
SCOPE_EXIT({ gpu.TickWork(); }); SCOPE_EXIT({ gpu.TickWork(); });
FlushWork(); FlushWork();
#if ANDROID query_cache.NotifySegment(true);
if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal.
query_cache.UpdateCounters();
}
#else
query_cache.UpdateCounters();
#endif
texture_cache.SynchronizeGraphicsDescriptors(); texture_cache.SynchronizeGraphicsDescriptors();
texture_cache.UpdateRenderTargets(false); texture_cache.UpdateRenderTargets(false);
UpdateDynamicStates(); UpdateDynamicStates();
query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
maxwell3d->regs.zpass_pixel_count_enable);
const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState(); const auto& draw_texture_state = maxwell3d->draw_manager->GetDrawTextureState();
const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler); const auto& sampler = texture_cache.GetGraphicsSampler(draw_texture_state.src_sampler);
const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture); const auto& texture = texture_cache.GetImageView(draw_texture_state.src_texture);
@ -316,14 +314,9 @@ void RasterizerVulkan::Clear(u32 layer_count) {
FlushWork(); FlushWork();
gpu_memory->FlushCaching(); gpu_memory->FlushCaching();
#if ANDROID query_cache.NotifySegment(true);
if (Settings::IsGPULevelHigh()) { query_cache.CounterEnable(VideoCommon::QueryType::ZPassPixelCount64,
// This is problematic on Android, disable on GPU Normal. maxwell3d->regs.zpass_pixel_count_enable);
query_cache.UpdateCounters();
}
#else
query_cache.UpdateCounters();
#endif
auto& regs = maxwell3d->regs; auto& regs = maxwell3d->regs;
const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B || const bool use_color = regs.clear_surface.R || regs.clear_surface.G || regs.clear_surface.B ||
@ -468,13 +461,13 @@ void RasterizerVulkan::DispatchCompute() {
scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); }); scheduler.Record([dim](vk::CommandBuffer cmdbuf) { cmdbuf.Dispatch(dim[0], dim[1], dim[2]); });
} }
void RasterizerVulkan::ResetCounter(VideoCore::QueryType type) { void RasterizerVulkan::ResetCounter(VideoCommon::QueryType type) {
query_cache.ResetCounter(type); query_cache.CounterReset(type);
} }
void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
std::optional<u64> timestamp) { VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) {
query_cache.Query(gpu_addr, type, timestamp); query_cache.CounterReport(gpu_addr, type, flags, payload, subreport);
} }
void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr,
@ -655,8 +648,8 @@ void RasterizerVulkan::SignalReference() {
fence_manager.SignalReference(); fence_manager.SignalReference();
} }
void RasterizerVulkan::ReleaseFences() { void RasterizerVulkan::ReleaseFences(bool force) {
fence_manager.WaitPendingFences(); fence_manager.WaitPendingFences(force);
} }
void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size, void RasterizerVulkan::FlushAndInvalidateRegion(VAddr addr, u64 size,
@ -680,6 +673,8 @@ void RasterizerVulkan::WaitForIdle() {
flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT; flags |= VK_PIPELINE_STAGE_TRANSFORM_FEEDBACK_BIT_EXT;
} }
query_cache.NotifyWFI();
scheduler.RequestOutsideRenderPassOperationContext(); scheduler.RequestOutsideRenderPassOperationContext();
scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) { scheduler.Record([event = *wfi_event, flags](vk::CommandBuffer cmdbuf) {
cmdbuf.SetEvent(event, flags); cmdbuf.SetEvent(event, flags);
@ -723,19 +718,7 @@ void RasterizerVulkan::TickFrame() {
bool RasterizerVulkan::AccelerateConditionalRendering() { bool RasterizerVulkan::AccelerateConditionalRendering() {
gpu_memory->FlushCaching(); gpu_memory->FlushCaching();
if (Settings::IsGPULevelHigh()) { return query_cache.AccelerateHostConditionalRendering();
// TODO(Blinkhawk): Reimplement Host conditional rendering.
return false;
}
// Medium / Low Hack: stub any checks on queries written into the buffer cache.
const GPUVAddr condition_address{maxwell3d->regs.render_enable.Address()};
Maxwell::ReportSemaphore::Compare cmp;
if (gpu_memory->IsMemoryDirty(condition_address, sizeof(cmp),
VideoCommon::CacheType::BufferCache |
VideoCommon::CacheType::QueryCache)) {
return true;
}
return false;
} }
bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src,
@ -781,6 +764,7 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config,
if (!image_view) { if (!image_view) {
return false; return false;
} }
query_cache.NotifySegment(false);
screen_info.image = image_view->ImageHandle(); screen_info.image = image_view->ImageHandle();
screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D); screen_info.image_view = image_view->Handle(Shader::TextureType::Color2D);
screen_info.width = image_view->size.width; screen_info.width = image_view->size.width;
@ -919,31 +903,18 @@ void RasterizerVulkan::UpdateDynamicStates() {
} }
} }
void RasterizerVulkan::BeginTransformFeedback() { void RasterizerVulkan::HandleTransformFeedback() {
const auto& regs = maxwell3d->regs; const auto& regs = maxwell3d->regs;
if (regs.transform_feedback_enabled == 0) {
return;
}
if (!device.IsExtTransformFeedbackSupported()) { if (!device.IsExtTransformFeedbackSupported()) {
LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported");
return; return;
} }
query_cache.CounterEnable(VideoCommon::QueryType::StreamingByteCount,
regs.transform_feedback_enabled);
if (regs.transform_feedback_enabled != 0) {
UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) || UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderType::TessellationInit) ||
regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation)); regs.IsShaderConfigEnabled(Maxwell::ShaderType::Tessellation));
scheduler.Record(
[](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); });
}
void RasterizerVulkan::EndTransformFeedback() {
const auto& regs = maxwell3d->regs;
if (regs.transform_feedback_enabled == 0) {
return;
} }
if (!device.IsExtTransformFeedbackSupported()) {
return;
}
scheduler.Record(
[](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); });
} }
void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) {

View file

@ -84,8 +84,9 @@ public:
void DrawTexture() override; void DrawTexture() override;
void Clear(u32 layer_count) override; void Clear(u32 layer_count) override;
void DispatchCompute() override; void DispatchCompute() override;
void ResetCounter(VideoCore::QueryType type) override; void ResetCounter(VideoCommon::QueryType type) override;
void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional<u64> timestamp) override; void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type,
VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override;
void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override;
void DisableGraphicsUniformBuffer(size_t stage, u32 index) override; void DisableGraphicsUniformBuffer(size_t stage, u32 index) override;
void FlushAll() override; void FlushAll() override;
@ -106,7 +107,7 @@ public:
void SyncOperation(std::function<void()>&& func) override; void SyncOperation(std::function<void()>&& func) override;
void SignalSyncPoint(u32 value) override; void SignalSyncPoint(u32 value) override;
void SignalReference() override; void SignalReference() override;
void ReleaseFences() override; void ReleaseFences(bool force = true) override;
void FlushAndInvalidateRegion( void FlushAndInvalidateRegion(
VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override; VAddr addr, u64 size, VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
void WaitForIdle() override; void WaitForIdle() override;
@ -146,9 +147,7 @@ private:
void UpdateDynamicStates(); void UpdateDynamicStates();
void BeginTransformFeedback(); void HandleTransformFeedback();
void EndTransformFeedback();
void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs);
void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs);
@ -195,8 +194,9 @@ private:
TextureCache texture_cache; TextureCache texture_cache;
BufferCacheRuntime buffer_cache_runtime; BufferCacheRuntime buffer_cache_runtime;
BufferCache buffer_cache; BufferCache buffer_cache;
PipelineCache pipeline_cache; QueryCacheRuntime query_cache_runtime;
QueryCache query_cache; QueryCache query_cache;
PipelineCache pipeline_cache;
AccelerateDMA accelerate_dma; AccelerateDMA accelerate_dma;
FenceManager fence_manager; FenceManager fence_manager;

View file

@ -243,10 +243,10 @@ void Scheduler::AllocateNewContext() {
#if ANDROID #if ANDROID
if (Settings::IsGPULevelHigh()) { if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal. // This is problematic on Android, disable on GPU Normal.
query_cache->UpdateCounters(); query_cache->NotifySegment(true);
} }
#else #else
query_cache->UpdateCounters(); query_cache->NotifySegment(true);
#endif #endif
} }
} }
@ -261,11 +261,12 @@ void Scheduler::EndPendingOperations() {
#if ANDROID #if ANDROID
if (Settings::IsGPULevelHigh()) { if (Settings::IsGPULevelHigh()) {
// This is problematic on Android, disable on GPU Normal. // This is problematic on Android, disable on GPU Normal.
query_cache->DisableStreams(); // query_cache->DisableStreams();
} }
#else #else
query_cache->DisableStreams(); // query_cache->DisableStreams();
#endif #endif
query_cache->NotifySegment(false);
EndRenderPass(); EndRenderPass();
} }

View file

@ -17,6 +17,11 @@
#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h"
#include "video_core/vulkan_common/vulkan_wrapper.h" #include "video_core/vulkan_common/vulkan_wrapper.h"
namespace VideoCommon {
template <typename Trait>
class QueryCacheBase;
}
namespace Vulkan { namespace Vulkan {
class CommandPool; class CommandPool;
@ -24,7 +29,8 @@ class Device;
class Framebuffer; class Framebuffer;
class GraphicsPipeline; class GraphicsPipeline;
class StateTracker; class StateTracker;
class QueryCache;
struct QueryCacheParams;
/// The scheduler abstracts command buffer and fence management with an interface that's able to do /// The scheduler abstracts command buffer and fence management with an interface that's able to do
/// OpenGL-like operations on Vulkan command buffers. /// OpenGL-like operations on Vulkan command buffers.
@ -63,7 +69,7 @@ public:
void InvalidateState(); void InvalidateState();
/// Assigns the query cache. /// Assigns the query cache.
void SetQueryCache(QueryCache& query_cache_) { void SetQueryCache(VideoCommon::QueryCacheBase<QueryCacheParams>& query_cache_) {
query_cache = &query_cache_; query_cache = &query_cache_;
} }
@ -219,7 +225,7 @@ private:
std::unique_ptr<MasterSemaphore> master_semaphore; std::unique_ptr<MasterSemaphore> master_semaphore;
std::unique_ptr<CommandPool> command_pool; std::unique_ptr<CommandPool> command_pool;
QueryCache* query_cache = nullptr; VideoCommon::QueryCacheBase<QueryCacheParams>* query_cache = nullptr;
vk::CommandBuffer current_cmdbuf; vk::CommandBuffer current_cmdbuf;

View file

@ -60,6 +60,7 @@ VK_DEFINE_HANDLE(VmaAllocator)
// Define miscellaneous extensions which may be used by the implementation here. // Define miscellaneous extensions which may be used by the implementation here.
#define FOR_EACH_VK_EXTENSION(EXTENSION) \ #define FOR_EACH_VK_EXTENSION(EXTENSION) \
EXTENSION(EXT, CONDITIONAL_RENDERING, conditional_rendering) \
EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \ EXTENSION(EXT, CONSERVATIVE_RASTERIZATION, conservative_rasterization) \
EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \ EXTENSION(EXT, DEPTH_RANGE_UNRESTRICTED, depth_range_unrestricted) \
EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \ EXTENSION(EXT, MEMORY_BUDGET, memory_budget) \
@ -92,6 +93,7 @@ VK_DEFINE_HANDLE(VmaAllocator)
// Define extensions where the absence of the extension may result in a degraded experience. // Define extensions where the absence of the extension may result in a degraded experience.
#define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \ #define FOR_EACH_VK_RECOMMENDED_EXTENSION(EXTENSION_NAME) \
EXTENSION_NAME(VK_EXT_CONDITIONAL_RENDERING_EXTENSION_NAME) \
EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_CONSERVATIVE_RASTERIZATION_EXTENSION_NAME) \
EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_DEPTH_RANGE_UNRESTRICTED_EXTENSION_NAME) \
EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \ EXTENSION_NAME(VK_EXT_EXTENDED_DYNAMIC_STATE_EXTENSION_NAME) \
@ -528,6 +530,10 @@ public:
return extensions.shader_atomic_int64; return extensions.shader_atomic_int64;
} }
/// Returns the `conditional_rendering` flag of the device's extension set
/// (the entry registered for VK_EXT_conditional_rendering).
bool IsExtConditionalRendering() const {
return extensions.conditional_rendering;
}
bool HasTimelineSemaphore() const; bool HasTimelineSemaphore() const;
/// Returns the minimum supported version of SPIR-V. /// Returns the minimum supported version of SPIR-V.

View file

@ -75,6 +75,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
X(vkBeginCommandBuffer); X(vkBeginCommandBuffer);
X(vkBindBufferMemory); X(vkBindBufferMemory);
X(vkBindImageMemory); X(vkBindImageMemory);
X(vkCmdBeginConditionalRenderingEXT);
X(vkCmdBeginQuery); X(vkCmdBeginQuery);
X(vkCmdBeginRenderPass); X(vkCmdBeginRenderPass);
X(vkCmdBeginTransformFeedbackEXT); X(vkCmdBeginTransformFeedbackEXT);
@ -91,6 +92,7 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
X(vkCmdCopyBufferToImage); X(vkCmdCopyBufferToImage);
X(vkCmdCopyImage); X(vkCmdCopyImage);
X(vkCmdCopyImageToBuffer); X(vkCmdCopyImageToBuffer);
X(vkCmdCopyQueryPoolResults);
X(vkCmdDispatch); X(vkCmdDispatch);
X(vkCmdDraw); X(vkCmdDraw);
X(vkCmdDrawIndexed); X(vkCmdDrawIndexed);
@ -98,6 +100,8 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept {
X(vkCmdDrawIndexedIndirect); X(vkCmdDrawIndexedIndirect);
X(vkCmdDrawIndirectCount); X(vkCmdDrawIndirectCount);
X(vkCmdDrawIndexedIndirectCount); X(vkCmdDrawIndexedIndirectCount);
X(vkCmdDrawIndirectByteCountEXT);
X(vkCmdEndConditionalRenderingEXT);
X(vkCmdEndQuery); X(vkCmdEndQuery);
X(vkCmdEndRenderPass); X(vkCmdEndRenderPass);
X(vkCmdEndTransformFeedbackEXT); X(vkCmdEndTransformFeedbackEXT);

View file

@ -185,6 +185,7 @@ struct DeviceDispatch : InstanceDispatch {
PFN_vkBeginCommandBuffer vkBeginCommandBuffer{}; PFN_vkBeginCommandBuffer vkBeginCommandBuffer{};
PFN_vkBindBufferMemory vkBindBufferMemory{}; PFN_vkBindBufferMemory vkBindBufferMemory{};
PFN_vkBindImageMemory vkBindImageMemory{}; PFN_vkBindImageMemory vkBindImageMemory{};
PFN_vkCmdBeginConditionalRenderingEXT vkCmdBeginConditionalRenderingEXT{};
PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{}; PFN_vkCmdBeginDebugUtilsLabelEXT vkCmdBeginDebugUtilsLabelEXT{};
PFN_vkCmdBeginQuery vkCmdBeginQuery{}; PFN_vkCmdBeginQuery vkCmdBeginQuery{};
PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{}; PFN_vkCmdBeginRenderPass vkCmdBeginRenderPass{};
@ -202,6 +203,7 @@ struct DeviceDispatch : InstanceDispatch {
PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{}; PFN_vkCmdCopyBufferToImage vkCmdCopyBufferToImage{};
PFN_vkCmdCopyImage vkCmdCopyImage{}; PFN_vkCmdCopyImage vkCmdCopyImage{};
PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{}; PFN_vkCmdCopyImageToBuffer vkCmdCopyImageToBuffer{};
PFN_vkCmdCopyQueryPoolResults vkCmdCopyQueryPoolResults{};
PFN_vkCmdDispatch vkCmdDispatch{}; PFN_vkCmdDispatch vkCmdDispatch{};
PFN_vkCmdDraw vkCmdDraw{}; PFN_vkCmdDraw vkCmdDraw{};
PFN_vkCmdDrawIndexed vkCmdDrawIndexed{}; PFN_vkCmdDrawIndexed vkCmdDrawIndexed{};
@ -209,6 +211,8 @@ struct DeviceDispatch : InstanceDispatch {
PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{}; PFN_vkCmdDrawIndexedIndirect vkCmdDrawIndexedIndirect{};
PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{}; PFN_vkCmdDrawIndirectCount vkCmdDrawIndirectCount{};
PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{}; PFN_vkCmdDrawIndexedIndirectCount vkCmdDrawIndexedIndirectCount{};
PFN_vkCmdDrawIndirectByteCountEXT vkCmdDrawIndirectByteCountEXT{};
PFN_vkCmdEndConditionalRenderingEXT vkCmdEndConditionalRenderingEXT{};
PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{}; PFN_vkCmdEndDebugUtilsLabelEXT vkCmdEndDebugUtilsLabelEXT{};
PFN_vkCmdEndQuery vkCmdEndQuery{}; PFN_vkCmdEndQuery vkCmdEndQuery{};
PFN_vkCmdEndRenderPass vkCmdEndRenderPass{}; PFN_vkCmdEndRenderPass vkCmdEndRenderPass{};
@ -1181,6 +1185,13 @@ public:
count_offset, draw_count, stride); count_offset, draw_count, stride);
} }
/// Records vkCmdDrawIndirectByteCountEXT on this command buffer, forwarding every argument
/// unchanged to the device dispatch table.
/// Declared `const noexcept` like the other command-recording wrappers of this class so it can
/// also be invoked through a const command buffer.
void DrawIndirectByteCountEXT(u32 instance_count, u32 first_instance, VkBuffer counter_buffer,
                              VkDeviceSize counter_buffer_offset, u32 counter_offset,
                              u32 stride) const noexcept {
    dld->vkCmdDrawIndirectByteCountEXT(handle, instance_count, first_instance, counter_buffer,
                                       counter_buffer_offset, counter_offset, stride);
}
void ClearAttachments(Span<VkClearAttachment> attachments, void ClearAttachments(Span<VkClearAttachment> attachments,
Span<VkClearRect> rects) const noexcept { Span<VkClearRect> rects) const noexcept {
dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(), dld->vkCmdClearAttachments(handle, attachments.size(), attachments.data(), rects.size(),
@ -1265,6 +1276,13 @@ public:
regions.data()); regions.data());
} }
// Forwards to vkCmdCopyQueryPoolResults through the dispatch table `dld`, passing this
// wrapper's `handle` first and every parameter unchanged, in declaration order.
void CopyQueryPoolResults(VkQueryPool query_pool,
                          u32 first_query,
                          u32 query_count,
                          VkBuffer dst_buffer,
                          VkDeviceSize dst_offset,
                          VkDeviceSize stride,
                          VkQueryResultFlags flags) const noexcept {
    dld->vkCmdCopyQueryPoolResults(handle,
                                   query_pool,
                                   first_query,
                                   query_count,
                                   dst_buffer,
                                   dst_offset,
                                   stride,
                                   flags);
}
void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size, void FillBuffer(VkBuffer dst_buffer, VkDeviceSize dst_offset, VkDeviceSize size,
u32 data) const noexcept { u32 data) const noexcept {
dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data); dld->vkCmdFillBuffer(handle, dst_buffer, dst_offset, size, data);
@ -1443,6 +1461,15 @@ public:
counter_buffers, counter_buffer_offsets); counter_buffers, counter_buffer_offsets);
} }
void BeginConditionalRenderingEXT(
const VkConditionalRenderingBeginInfoEXT& info) const noexcept {
dld->vkCmdBeginConditionalRenderingEXT(handle, &info);
}
// Forwards to vkCmdEndConditionalRenderingEXT through the dispatch table `dld`, passing
// only this wrapper's `handle`; counterpart of BeginConditionalRenderingEXT.
void EndConditionalRenderingEXT() const noexcept
{
    dld->vkCmdEndConditionalRenderingEXT(handle);
}
void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept { void BeginDebugUtilsLabelEXT(const char* label, std::span<float, 4> color) const noexcept {
const VkDebugUtilsLabelEXT label_info{ const VkDebugUtilsLabelEXT label_info{
.sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT, .sType = VK_STRUCTURE_TYPE_DEBUG_UTILS_LABEL_EXT,

View file

@ -191,8 +191,9 @@ QString FormatPatchNameVersions(const FileSys::PatchManager& patch_manager,
} }
QList<QStandardItem*> MakeGameListEntry(const std::string& path, const std::string& name, QList<QStandardItem*> MakeGameListEntry(const std::string& path, const std::string& name,
const std::vector<u8>& icon, Loader::AppLoader& loader, const std::size_t size, const std::vector<u8>& icon,
u64 program_id, const CompatibilityList& compatibility_list, Loader::AppLoader& loader, u64 program_id,
const CompatibilityList& compatibility_list,
const FileSys::PatchManager& patch) { const FileSys::PatchManager& patch) {
const auto it = FindMatchingCompatibilityEntry(compatibility_list, program_id); const auto it = FindMatchingCompatibilityEntry(compatibility_list, program_id);
@ -210,7 +211,7 @@ QList<QStandardItem*> MakeGameListEntry(const std::string& path, const std::stri
file_type_string, program_id), file_type_string, program_id),
new GameListItemCompat(compatibility), new GameListItemCompat(compatibility),
new GameListItem(file_type_string), new GameListItem(file_type_string),
new GameListItemSize(Common::FS::GetSize(path)), new GameListItemSize(size),
}; };
const auto patch_versions = GetGameListCachedObject( const auto patch_versions = GetGameListCachedObject(
@ -278,8 +279,8 @@ void GameListWorker::AddTitlesToGameList(GameListDir* parent_dir) {
GetMetadataFromControlNCA(patch, *control, icon, name); GetMetadataFromControlNCA(patch, *control, icon, name);
} }
emit EntryReady(MakeGameListEntry(file->GetFullPath(), name, icon, *loader, program_id, emit EntryReady(MakeGameListEntry(file->GetFullPath(), name, file->GetSize(), icon, *loader,
compatibility_list, patch), program_id, compatibility_list, patch),
parent_dir); parent_dir);
} }
} }
@ -354,8 +355,9 @@ void GameListWorker::ScanFileSystem(ScanTarget target, const std::string& dir_pa
const FileSys::PatchManager patch{id, system.GetFileSystemController(), const FileSys::PatchManager patch{id, system.GetFileSystemController(),
system.GetContentProvider()}; system.GetContentProvider()};
emit EntryReady(MakeGameListEntry(physical_name, name, icon, *loader, id, emit EntryReady(MakeGameListEntry(physical_name, name,
compatibility_list, patch), Common::FS::GetSize(physical_name), icon,
*loader, id, compatibility_list, patch),
parent_dir); parent_dir);
} }
} else { } else {
@ -368,8 +370,9 @@ void GameListWorker::ScanFileSystem(ScanTarget target, const std::string& dir_pa
const FileSys::PatchManager patch{program_id, system.GetFileSystemController(), const FileSys::PatchManager patch{program_id, system.GetFileSystemController(),
system.GetContentProvider()}; system.GetContentProvider()};
emit EntryReady(MakeGameListEntry(physical_name, name, icon, *loader, emit EntryReady(
program_id, compatibility_list, patch), MakeGameListEntry(physical_name, name, Common::FS::GetSize(physical_name),
icon, *loader, program_id, compatibility_list, patch),
parent_dir); parent_dir);
} }
} }