mirror of
https://git.suyu.dev/suyu/suyu.git
synced 2024-11-22 23:02:47 +01:00
Merge pull request #9559 from FernandoS27/cached-writes
VideoCore: Implement Cached Writes, use fastmem for reading GPU memory and eliminate old stuffs
This commit is contained in:
commit
5bcbb8de45
15 changed files with 233 additions and 53 deletions
|
@ -436,7 +436,7 @@ struct Memory::Impl {
|
|||
}
|
||||
|
||||
if (Settings::IsFastmemEnabled()) {
|
||||
const bool is_read_enable = Settings::IsGPULevelHigh() || !cached;
|
||||
const bool is_read_enable = !Settings::IsGPULevelExtreme() || !cached;
|
||||
system.DeviceMemory().buffer.Protect(vaddr, size, is_read_enable, !cached);
|
||||
}
|
||||
|
||||
|
|
|
@ -538,7 +538,7 @@ TEST_CASE("BufferBase: Cached write downloads") {
|
|||
int num = 0;
|
||||
buffer.ForEachDownloadRangeAndClear(c, WORD, [&](u64 offset, u64 size) { ++num; });
|
||||
buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; });
|
||||
REQUIRE(num == 0);
|
||||
REQUIRE(num == 1);
|
||||
REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE));
|
||||
REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE));
|
||||
buffer.FlushCachedWrites();
|
||||
|
|
|
@ -85,6 +85,7 @@ add_library(video_core STATIC
|
|||
gpu.h
|
||||
gpu_thread.cpp
|
||||
gpu_thread.h
|
||||
invalidation_accumulator.h
|
||||
memory_manager.cpp
|
||||
memory_manager.h
|
||||
precompiled_headers.h
|
||||
|
|
|
@ -430,7 +430,7 @@ private:
|
|||
if (query_begin >= SizeBytes() || size < 0) {
|
||||
return;
|
||||
}
|
||||
u64* const untracked_words = Array<Type::Untracked>();
|
||||
[[maybe_unused]] u64* const untracked_words = Array<Type::Untracked>();
|
||||
u64* const state_words = Array<type>();
|
||||
const u64 query_end = query_begin + std::min(static_cast<u64>(size), SizeBytes());
|
||||
u64* const words_begin = state_words + query_begin / BYTES_PER_WORD;
|
||||
|
@ -483,7 +483,7 @@ private:
|
|||
NotifyRasterizer<true>(word_index, current_bits, ~u64{0});
|
||||
}
|
||||
// Exclude CPU modified pages when visiting GPU pages
|
||||
const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0);
|
||||
const u64 word = current_word;
|
||||
u64 page = page_begin;
|
||||
page_begin = 0;
|
||||
|
||||
|
@ -531,7 +531,7 @@ private:
|
|||
[[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept {
|
||||
static_assert(type != Type::Untracked);
|
||||
|
||||
const u64* const untracked_words = Array<Type::Untracked>();
|
||||
[[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
|
||||
const u64* const state_words = Array<type>();
|
||||
const u64 num_query_words = size / BYTES_PER_WORD + 1;
|
||||
const u64 word_begin = offset / BYTES_PER_WORD;
|
||||
|
@ -539,8 +539,7 @@ private:
|
|||
const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE);
|
||||
u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD;
|
||||
for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) {
|
||||
const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
|
||||
const u64 word = state_words[word_index] & ~off_word;
|
||||
const u64 word = state_words[word_index];
|
||||
if (word == 0) {
|
||||
continue;
|
||||
}
|
||||
|
@ -564,7 +563,7 @@ private:
|
|||
[[nodiscard]] std::pair<u64, u64> ModifiedRegion(u64 offset, u64 size) const noexcept {
|
||||
static_assert(type != Type::Untracked);
|
||||
|
||||
const u64* const untracked_words = Array<Type::Untracked>();
|
||||
[[maybe_unused]] const u64* const untracked_words = Array<Type::Untracked>();
|
||||
const u64* const state_words = Array<type>();
|
||||
const u64 num_query_words = size / BYTES_PER_WORD + 1;
|
||||
const u64 word_begin = offset / BYTES_PER_WORD;
|
||||
|
@ -574,8 +573,7 @@ private:
|
|||
u64 begin = std::numeric_limits<u64>::max();
|
||||
u64 end = 0;
|
||||
for (u64 word_index = word_begin; word_index < word_end; ++word_index) {
|
||||
const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0;
|
||||
const u64 word = state_words[word_index] & ~off_word;
|
||||
const u64 word = state_words[word_index];
|
||||
if (word == 0) {
|
||||
continue;
|
||||
}
|
||||
|
|
|
@ -76,7 +76,7 @@ void State::ProcessData(std::span<const u8> read_buffer) {
|
|||
regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
|
||||
x_elements, regs.line_count, regs.dest.BlockHeight(),
|
||||
regs.dest.BlockDepth(), regs.line_length_in);
|
||||
memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
|
||||
memory_manager.WriteBlockCached(address, tmp_buffer.data(), dst_size);
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include "common/microprofile.h"
|
||||
#include "video_core/engines/fermi_2d.h"
|
||||
#include "video_core/engines/sw_blitter/blitter.h"
|
||||
#include "video_core/memory_manager.h"
|
||||
#include "video_core/rasterizer_interface.h"
|
||||
#include "video_core/surface.h"
|
||||
#include "video_core/textures/decoders.h"
|
||||
|
@ -20,8 +21,8 @@ namespace Tegra::Engines {
|
|||
|
||||
using namespace Texture;
|
||||
|
||||
Fermi2D::Fermi2D(MemoryManager& memory_manager_) {
|
||||
sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager_);
|
||||
Fermi2D::Fermi2D(MemoryManager& memory_manager_) : memory_manager{memory_manager_} {
|
||||
sw_blitter = std::make_unique<Blitter::SoftwareBlitEngine>(memory_manager);
|
||||
// Nvidia's OpenGL driver seems to assume these values
|
||||
regs.src.depth = 1;
|
||||
regs.dst.depth = 1;
|
||||
|
@ -104,6 +105,7 @@ void Fermi2D::Blit() {
|
|||
config.src_x0 = 0;
|
||||
}
|
||||
|
||||
memory_manager.FlushCaching();
|
||||
if (!rasterizer->AccelerateSurfaceCopy(src, regs.dst, config)) {
|
||||
sw_blitter->Blit(src, regs.dst, config);
|
||||
}
|
||||
|
|
|
@ -305,6 +305,7 @@ public:
|
|||
private:
|
||||
VideoCore::RasterizerInterface* rasterizer = nullptr;
|
||||
std::unique_ptr<Blitter::SoftwareBlitEngine> sw_blitter;
|
||||
MemoryManager& memory_manager;
|
||||
|
||||
/// Performs the copy from the source surface to the destination surface as configured in the
|
||||
/// registers.
|
||||
|
|
|
@ -485,11 +485,6 @@ void Maxwell3D::StampQueryResult(u64 payload, bool long_query) {
|
|||
}
|
||||
|
||||
void Maxwell3D::ProcessQueryGet() {
|
||||
// TODO(Subv): Support the other query units.
|
||||
if (regs.report_semaphore.query.location != Regs::ReportSemaphore::Location::All) {
|
||||
LOG_DEBUG(HW_GPU, "Locations other than ALL are unimplemented");
|
||||
}
|
||||
|
||||
switch (regs.report_semaphore.query.operation) {
|
||||
case Regs::ReportSemaphore::Operation::Release:
|
||||
if (regs.report_semaphore.query.short_query != 0) {
|
||||
|
@ -649,7 +644,7 @@ void Maxwell3D::ProcessCBMultiData(const u32* start_base, u32 amount) {
|
|||
|
||||
const GPUVAddr address{buffer_address + regs.const_buffer.offset};
|
||||
const size_t copy_size = amount * sizeof(u32);
|
||||
memory_manager.WriteBlock(address, start_base, copy_size);
|
||||
memory_manager.WriteBlockCached(address, start_base, copy_size);
|
||||
|
||||
// Increment the current buffer position.
|
||||
regs.const_buffer.offset += static_cast<u32>(copy_size);
|
||||
|
|
|
@ -69,7 +69,7 @@ void MaxwellDMA::Launch() {
|
|||
if (launch.multi_line_enable) {
|
||||
const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
|
||||
const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
|
||||
|
||||
memory_manager.FlushCaching();
|
||||
if (!is_src_pitch && !is_dst_pitch) {
|
||||
// If both the source and the destination are in block layout, assert.
|
||||
CopyBlockLinearToBlockLinear();
|
||||
|
@ -104,6 +104,7 @@ void MaxwellDMA::Launch() {
|
|||
reinterpret_cast<u8*>(tmp_buffer.data()),
|
||||
regs.line_length_in * sizeof(u32));
|
||||
} else {
|
||||
memory_manager.FlushCaching();
|
||||
const auto convert_linear_2_blocklinear_addr = [](u64 address) {
|
||||
return (address & ~0x1f0ULL) | ((address & 0x40) >> 2) | ((address & 0x10) << 1) |
|
||||
((address & 0x180) >> 1) | ((address & 0x20) << 3);
|
||||
|
@ -121,7 +122,7 @@ void MaxwellDMA::Launch() {
|
|||
memory_manager.ReadBlockUnsafe(
|
||||
convert_linear_2_blocklinear_addr(regs.offset_in + offset),
|
||||
tmp_buffer.data(), tmp_buffer.size());
|
||||
memory_manager.WriteBlock(regs.offset_out + offset, tmp_buffer.data(),
|
||||
memory_manager.WriteBlockCached(regs.offset_out + offset, tmp_buffer.data(),
|
||||
tmp_buffer.size());
|
||||
}
|
||||
} else if (is_src_pitch && !is_dst_pitch) {
|
||||
|
@ -132,7 +133,7 @@ void MaxwellDMA::Launch() {
|
|||
for (u32 offset = 0; offset < regs.line_length_in; offset += 16) {
|
||||
memory_manager.ReadBlockUnsafe(regs.offset_in + offset, tmp_buffer.data(),
|
||||
tmp_buffer.size());
|
||||
memory_manager.WriteBlock(
|
||||
memory_manager.WriteBlockCached(
|
||||
convert_linear_2_blocklinear_addr(regs.offset_out + offset),
|
||||
tmp_buffer.data(), tmp_buffer.size());
|
||||
}
|
||||
|
@ -141,7 +142,7 @@ void MaxwellDMA::Launch() {
|
|||
std::vector<u8> tmp_buffer(regs.line_length_in);
|
||||
memory_manager.ReadBlockUnsafe(regs.offset_in, tmp_buffer.data(),
|
||||
regs.line_length_in);
|
||||
memory_manager.WriteBlock(regs.offset_out, tmp_buffer.data(),
|
||||
memory_manager.WriteBlockCached(regs.offset_out, tmp_buffer.data(),
|
||||
regs.line_length_in);
|
||||
}
|
||||
}
|
||||
|
@ -204,7 +205,7 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
|
|||
src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
|
||||
regs.pitch_out);
|
||||
|
||||
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||
memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
|
||||
}
|
||||
|
||||
void MaxwellDMA::CopyPitchToBlockLinear() {
|
||||
|
@ -256,7 +257,7 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
|
|||
dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
|
||||
regs.pitch_in);
|
||||
|
||||
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||
memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
|
||||
}
|
||||
|
||||
void MaxwellDMA::FastCopyBlockLinearToPitch() {
|
||||
|
@ -287,7 +288,7 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
|
|||
regs.src_params.block_size.height, regs.src_params.block_size.depth,
|
||||
regs.pitch_out);
|
||||
|
||||
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||
memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
|
||||
}
|
||||
|
||||
void MaxwellDMA::CopyBlockLinearToBlockLinear() {
|
||||
|
@ -347,7 +348,7 @@ void MaxwellDMA::CopyBlockLinearToBlockLinear() {
|
|||
dst.depth, dst_x_offset, dst.origin.y, x_elements, regs.line_count,
|
||||
dst.block_size.height, dst.block_size.depth, pitch);
|
||||
|
||||
memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
|
||||
memory_manager.WriteBlockCached(regs.offset_out, write_buffer.data(), dst_size);
|
||||
}
|
||||
|
||||
void MaxwellDMA::ReleaseSemaphore() {
|
||||
|
|
79
src/video_core/invalidation_accumulator.h
Normal file
79
src/video_core/invalidation_accumulator.h
Normal file
|
@ -0,0 +1,79 @@
|
|||
// SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
|
||||
// SPDX-License-Identifier: GPL-2.0-or-later
|
||||
|
||||
#pragma once
|
||||
|
||||
#include <utility>
|
||||
#include <vector>
|
||||
|
||||
#include "common/common_types.h"
|
||||
|
||||
namespace VideoCommon {
|
||||
|
||||
class InvalidationAccumulator {
|
||||
public:
|
||||
InvalidationAccumulator() = default;
|
||||
~InvalidationAccumulator() = default;
|
||||
|
||||
void Add(GPUVAddr address, size_t size) {
|
||||
const auto reset_values = [&]() {
|
||||
if (has_collected) {
|
||||
buffer.emplace_back(start_address, accumulated_size);
|
||||
}
|
||||
start_address = address;
|
||||
accumulated_size = size;
|
||||
last_collection = start_address + size;
|
||||
};
|
||||
if (address >= start_address && address + size <= last_collection) [[likely]] {
|
||||
return;
|
||||
}
|
||||
size = ((address + size + atomicity_size_mask) & atomicity_mask) - address;
|
||||
address = address & atomicity_mask;
|
||||
if (!has_collected) [[unlikely]] {
|
||||
reset_values();
|
||||
has_collected = true;
|
||||
return;
|
||||
}
|
||||
if (address != last_collection) [[unlikely]] {
|
||||
reset_values();
|
||||
return;
|
||||
}
|
||||
accumulated_size += size;
|
||||
last_collection += size;
|
||||
}
|
||||
|
||||
void Clear() {
|
||||
buffer.clear();
|
||||
start_address = 0;
|
||||
last_collection = 0;
|
||||
has_collected = false;
|
||||
}
|
||||
|
||||
bool AnyAccumulated() const {
|
||||
return has_collected;
|
||||
}
|
||||
|
||||
template <typename Func>
|
||||
void Callback(Func&& func) {
|
||||
if (!has_collected) {
|
||||
return;
|
||||
}
|
||||
buffer.emplace_back(start_address, accumulated_size);
|
||||
for (auto& [address, size] : buffer) {
|
||||
func(address, size);
|
||||
}
|
||||
}
|
||||
|
||||
private:
|
||||
static constexpr size_t atomicity_bits = 5;
|
||||
static constexpr size_t atomicity_size = 1ULL << atomicity_bits;
|
||||
static constexpr size_t atomicity_size_mask = atomicity_size - 1;
|
||||
static constexpr size_t atomicity_mask = ~atomicity_size_mask;
|
||||
GPUVAddr start_address{};
|
||||
GPUVAddr last_collection{};
|
||||
size_t accumulated_size{};
|
||||
bool has_collected{};
|
||||
std::vector<std::pair<VAddr, size_t>> buffer;
|
||||
};
|
||||
|
||||
} // namespace VideoCommon
|
|
@ -6,11 +6,13 @@
|
|||
#include "common/alignment.h"
|
||||
#include "common/assert.h"
|
||||
#include "common/logging/log.h"
|
||||
#include "common/settings.h"
|
||||
#include "core/core.h"
|
||||
#include "core/device_memory.h"
|
||||
#include "core/hle/kernel/k_page_table.h"
|
||||
#include "core/hle/kernel/k_process.h"
|
||||
#include "core/memory.h"
|
||||
#include "video_core/invalidation_accumulator.h"
|
||||
#include "video_core/memory_manager.h"
|
||||
#include "video_core/rasterizer_interface.h"
|
||||
#include "video_core/renderer_base.h"
|
||||
|
@ -26,7 +28,8 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
|
|||
entries{}, big_entries{}, page_table{address_space_bits, address_space_bits + page_bits - 38,
|
||||
page_bits != big_page_bits ? page_bits : 0},
|
||||
kind_map{PTEKind::INVALID}, unique_identifier{unique_identifier_generator.fetch_add(
|
||||
1, std::memory_order_acq_rel)} {
|
||||
1, std::memory_order_acq_rel)},
|
||||
accumulator{std::make_unique<VideoCommon::InvalidationAccumulator>()} {
|
||||
address_space_size = 1ULL << address_space_bits;
|
||||
page_size = 1ULL << page_bits;
|
||||
page_mask = page_size - 1ULL;
|
||||
|
@ -43,6 +46,11 @@ MemoryManager::MemoryManager(Core::System& system_, u64 address_space_bits_, u64
|
|||
big_page_table_cpu.resize(big_page_table_size);
|
||||
big_page_continous.resize(big_page_table_size / continous_bits, 0);
|
||||
entries.resize(page_table_size / 32, 0);
|
||||
if (!Settings::IsGPULevelExtreme() && Settings::IsFastmemEnabled()) {
|
||||
fastmem_arena = system.DeviceMemory().buffer.VirtualBasePointer();
|
||||
} else {
|
||||
fastmem_arena = nullptr;
|
||||
}
|
||||
}
|
||||
|
||||
MemoryManager::~MemoryManager() = default;
|
||||
|
@ -185,15 +193,12 @@ void MemoryManager::Unmap(GPUVAddr gpu_addr, std::size_t size) {
|
|||
if (size == 0) {
|
||||
return;
|
||||
}
|
||||
const auto submapped_ranges = GetSubmappedRange(gpu_addr, size);
|
||||
GetSubmappedRangeImpl<false>(gpu_addr, size, page_stash);
|
||||
|
||||
for (const auto& [map_addr, map_size] : submapped_ranges) {
|
||||
// Flush and invalidate through the GPU interface, to be asynchronous if possible.
|
||||
const std::optional<VAddr> cpu_addr = GpuToCpuAddress(map_addr);
|
||||
ASSERT(cpu_addr);
|
||||
|
||||
rasterizer->UnmapMemory(*cpu_addr, map_size);
|
||||
for (const auto& [map_addr, map_size] : page_stash) {
|
||||
rasterizer->UnmapMemory(map_addr, map_size);
|
||||
}
|
||||
page_stash.clear();
|
||||
|
||||
BigPageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
|
||||
PageTableOp<EntryType::Free>(gpu_addr, 0, size, PTEKind::INVALID);
|
||||
|
@ -355,7 +360,7 @@ inline void MemoryManager::MemoryOperation(GPUVAddr gpu_src_addr, std::size_t si
|
|||
}
|
||||
}
|
||||
|
||||
template <bool is_safe>
|
||||
template <bool is_safe, bool use_fastmem>
|
||||
void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
|
||||
[[maybe_unused]] VideoCommon::CacheType which) const {
|
||||
auto set_to_zero = [&]([[maybe_unused]] std::size_t page_index,
|
||||
|
@ -369,8 +374,12 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
|
|||
if constexpr (is_safe) {
|
||||
rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
|
||||
}
|
||||
if constexpr (use_fastmem) {
|
||||
std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
|
||||
} else {
|
||||
u8* physical = memory.GetPointer(cpu_addr_base);
|
||||
std::memcpy(dest_buffer, physical, copy_amount);
|
||||
}
|
||||
dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
|
||||
};
|
||||
auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
|
||||
|
@ -379,12 +388,16 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
|
|||
if constexpr (is_safe) {
|
||||
rasterizer->FlushRegion(cpu_addr_base, copy_amount, which);
|
||||
}
|
||||
if constexpr (use_fastmem) {
|
||||
std::memcpy(dest_buffer, &fastmem_arena[cpu_addr_base], copy_amount);
|
||||
} else {
|
||||
if (!IsBigPageContinous(page_index)) [[unlikely]] {
|
||||
memory.ReadBlockUnsafe(cpu_addr_base, dest_buffer, copy_amount);
|
||||
} else {
|
||||
u8* physical = memory.GetPointer(cpu_addr_base);
|
||||
std::memcpy(dest_buffer, physical, copy_amount);
|
||||
}
|
||||
}
|
||||
dest_buffer = static_cast<u8*>(dest_buffer) + copy_amount;
|
||||
};
|
||||
auto read_short_pages = [&](std::size_t page_index, std::size_t offset,
|
||||
|
@ -397,12 +410,20 @@ void MemoryManager::ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std:
|
|||
|
||||
void MemoryManager::ReadBlock(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
|
||||
VideoCommon::CacheType which) const {
|
||||
ReadBlockImpl<true>(gpu_src_addr, dest_buffer, size, which);
|
||||
if (fastmem_arena) [[likely]] {
|
||||
ReadBlockImpl<true, true>(gpu_src_addr, dest_buffer, size, which);
|
||||
return;
|
||||
}
|
||||
ReadBlockImpl<true, false>(gpu_src_addr, dest_buffer, size, which);
|
||||
}
|
||||
|
||||
void MemoryManager::ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer,
|
||||
const std::size_t size) const {
|
||||
ReadBlockImpl<false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
|
||||
if (fastmem_arena) [[likely]] {
|
||||
ReadBlockImpl<false, true>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
|
||||
return;
|
||||
}
|
||||
ReadBlockImpl<false, false>(gpu_src_addr, dest_buffer, size, VideoCommon::CacheType::None);
|
||||
}
|
||||
|
||||
template <bool is_safe>
|
||||
|
@ -454,6 +475,12 @@ void MemoryManager::WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buf
|
|||
WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, VideoCommon::CacheType::None);
|
||||
}
|
||||
|
||||
// Copies `size` bytes from `src_buffer` to `gpu_dest_addr` through WriteBlockImpl<false> with
// CacheType::None, then records the destination range in the invalidation accumulator so it
// can be handed to the rasterizer later by FlushCaching().
void MemoryManager::WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer,
                                     std::size_t size) {
    const auto no_cache = VideoCommon::CacheType::None;
    WriteBlockImpl<false>(gpu_dest_addr, src_buffer, size, no_cache);
    accumulator->Add(gpu_dest_addr, size);
}
|
||||
|
||||
void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size,
|
||||
VideoCommon::CacheType which) const {
|
||||
auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
|
||||
|
@ -663,7 +690,17 @@ bool MemoryManager::IsFullyMappedRange(GPUVAddr gpu_addr, std::size_t size) cons
|
|||
std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
|
||||
GPUVAddr gpu_addr, std::size_t size) const {
|
||||
std::vector<std::pair<GPUVAddr, std::size_t>> result{};
|
||||
std::optional<std::pair<GPUVAddr, std::size_t>> last_segment{};
|
||||
GetSubmappedRangeImpl<true>(gpu_addr, size, result);
|
||||
return result;
|
||||
}
|
||||
|
||||
template <bool is_gpu_address>
|
||||
void MemoryManager::GetSubmappedRangeImpl(
|
||||
GPUVAddr gpu_addr, std::size_t size,
|
||||
std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
|
||||
result) const {
|
||||
std::optional<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>
|
||||
last_segment{};
|
||||
std::optional<VAddr> old_page_addr{};
|
||||
const auto split = [&last_segment, &result]([[maybe_unused]] std::size_t page_index,
|
||||
[[maybe_unused]] std::size_t offset,
|
||||
|
@ -685,8 +722,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
|
|||
}
|
||||
old_page_addr = {cpu_addr_base + copy_amount};
|
||||
if (!last_segment) {
|
||||
if constexpr (is_gpu_address) {
|
||||
const GPUVAddr new_base_addr = (page_index << big_page_bits) + offset;
|
||||
last_segment = {new_base_addr, copy_amount};
|
||||
} else {
|
||||
last_segment = {cpu_addr_base, copy_amount};
|
||||
}
|
||||
} else {
|
||||
last_segment->second += copy_amount;
|
||||
}
|
||||
|
@ -703,8 +744,12 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
|
|||
}
|
||||
old_page_addr = {cpu_addr_base + copy_amount};
|
||||
if (!last_segment) {
|
||||
if constexpr (is_gpu_address) {
|
||||
const GPUVAddr new_base_addr = (page_index << page_bits) + offset;
|
||||
last_segment = {new_base_addr, copy_amount};
|
||||
} else {
|
||||
last_segment = {cpu_addr_base, copy_amount};
|
||||
}
|
||||
} else {
|
||||
last_segment->second += copy_amount;
|
||||
}
|
||||
|
@ -715,7 +760,18 @@ std::vector<std::pair<GPUVAddr, std::size_t>> MemoryManager::GetSubmappedRange(
|
|||
};
|
||||
MemoryOperation<true>(gpu_addr, size, extend_size_big, split, do_short_pages);
|
||||
split(0, 0, 0);
|
||||
return result;
|
||||
}
|
||||
|
||||
// Hands every range recorded by WriteBlockCached() to the rasterizer in one batch and then
// resets the accumulator. Does nothing when no cached write is pending.
void MemoryManager::FlushCaching() {
    if (accumulator->AnyAccumulated()) {
        // Resolve each accumulated GPU range into its mapped sub-ranges (the <false> variant
        // stores the CPU base address of each segment) and gather them in page_stash.
        const auto collect_ranges = [this](GPUVAddr gpu_addr, size_t range_size) {
            GetSubmappedRangeImpl<false>(gpu_addr, range_size, page_stash);
        };
        accumulator->Callback(collect_ranges);

        rasterizer->InnerInvalidation(page_stash);

        // page_stash is scratch storage; leave it empty for the next user.
        page_stash.clear();
        accumulator->Clear();
    }
}
|
||||
|
||||
} // namespace Tegra
|
||||
|
|
|
@ -19,6 +19,10 @@ namespace VideoCore {
|
|||
class RasterizerInterface;
|
||||
}
|
||||
|
||||
namespace VideoCommon {
|
||||
class InvalidationAccumulator;
|
||||
}
|
||||
|
||||
namespace Core {
|
||||
class DeviceMemory;
|
||||
namespace Memory {
|
||||
|
@ -80,6 +84,7 @@ public:
|
|||
*/
|
||||
void ReadBlockUnsafe(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size) const;
|
||||
void WriteBlockUnsafe(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
|
||||
void WriteBlockCached(GPUVAddr gpu_dest_addr, const void* src_buffer, std::size_t size);
|
||||
|
||||
/**
|
||||
* Checks if a gpu region can be simply read with a pointer.
|
||||
|
@ -129,12 +134,14 @@ public:
|
|||
size_t GetMemoryLayoutSize(GPUVAddr gpu_addr,
|
||||
size_t max_size = std::numeric_limits<size_t>::max()) const;
|
||||
|
||||
void FlushCaching();
|
||||
|
||||
private:
|
||||
template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
|
||||
inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
|
||||
FuncReserved&& func_reserved, FuncUnmapped&& func_unmapped) const;
|
||||
|
||||
template <bool is_safe>
|
||||
template <bool is_safe, bool use_fastmem>
|
||||
void ReadBlockImpl(GPUVAddr gpu_src_addr, void* dest_buffer, std::size_t size,
|
||||
VideoCommon::CacheType which) const;
|
||||
|
||||
|
@ -154,6 +161,12 @@ private:
|
|||
inline bool IsBigPageContinous(size_t big_page_index) const;
|
||||
inline void SetBigPageContinous(size_t big_page_index, bool value);
|
||||
|
||||
template <bool is_gpu_address>
|
||||
void GetSubmappedRangeImpl(
|
||||
GPUVAddr gpu_addr, std::size_t size,
|
||||
std::vector<std::pair<std::conditional_t<is_gpu_address, GPUVAddr, VAddr>, std::size_t>>&
|
||||
result) const;
|
||||
|
||||
Core::System& system;
|
||||
Core::Memory::Memory& memory;
|
||||
Core::DeviceMemory& device_memory;
|
||||
|
@ -201,10 +214,13 @@ private:
|
|||
Common::VirtualBuffer<u32> big_page_table_cpu;
|
||||
|
||||
std::vector<u64> big_page_continous;
|
||||
std::vector<std::pair<VAddr, std::size_t>> page_stash{};
|
||||
u8* fastmem_arena{};
|
||||
|
||||
constexpr static size_t continous_bits = 64;
|
||||
|
||||
const size_t unique_identifier;
|
||||
std::unique_ptr<VideoCommon::InvalidationAccumulator> accumulator;
|
||||
|
||||
static std::atomic<size_t> unique_identifier_generator;
|
||||
};
|
||||
|
|
|
@ -6,6 +6,7 @@
|
|||
#include <functional>
|
||||
#include <optional>
|
||||
#include <span>
|
||||
#include <utility>
|
||||
#include "common/common_types.h"
|
||||
#include "common/polyfill_thread.h"
|
||||
#include "video_core/cache_types.h"
|
||||
|
@ -95,6 +96,12 @@ public:
|
|||
virtual void InvalidateRegion(VAddr addr, u64 size,
|
||||
VideoCommon::CacheType which = VideoCommon::CacheType::All) = 0;
|
||||
|
||||
virtual void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
|
||||
for (const auto& [cpu_addr, size] : sequences) {
|
||||
InvalidateRegion(cpu_addr, size);
|
||||
}
|
||||
}
|
||||
|
||||
/// Notify rasterizer that any caches of the specified region are desync with guest
|
||||
virtual void OnCPUWrite(VAddr addr, u64 size) = 0;
|
||||
|
||||
|
|
|
@ -186,6 +186,7 @@ void RasterizerVulkan::PrepareDraw(bool is_indexed, Func&& draw_func) {
|
|||
|
||||
SCOPE_EXIT({ gpu.TickWork(); });
|
||||
FlushWork();
|
||||
gpu_memory->FlushCaching();
|
||||
|
||||
query_cache.UpdateCounters();
|
||||
|
||||
|
@ -393,6 +394,7 @@ void RasterizerVulkan::Clear(u32 layer_count) {
|
|||
|
||||
void RasterizerVulkan::DispatchCompute() {
|
||||
FlushWork();
|
||||
gpu_memory->FlushCaching();
|
||||
|
||||
ComputePipeline* const pipeline{pipeline_cache.CurrentComputePipeline()};
|
||||
if (!pipeline) {
|
||||
|
@ -481,6 +483,27 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size, VideoCommon::Cache
|
|||
}
|
||||
}
|
||||
|
||||
// Batch invalidation: each cache mutex is taken once for the whole list of ranges rather than
// once per range. The texture cache is processed first, then the buffer cache, and finally the
// query and pipeline caches, which are called without taking a lock here.
void RasterizerVulkan::InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) {
    {
        std::scoped_lock texture_guard{texture_cache.mutex};
        for (const auto& range : sequences) {
            texture_cache.WriteMemory(range.first, range.second);
        }
    }
    {
        std::scoped_lock buffer_guard{buffer_cache.mutex};
        for (const auto& range : sequences) {
            buffer_cache.WriteMemory(range.first, range.second);
        }
    }
    for (const auto& range : sequences) {
        query_cache.InvalidateRegion(range.first, range.second);
        pipeline_cache.InvalidateRegion(range.first, range.second);
    }
}
|
||||
|
||||
void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) {
|
||||
if (addr == 0 || size == 0) {
|
||||
return;
|
||||
|
|
|
@ -79,6 +79,7 @@ public:
|
|||
VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
|
||||
void InvalidateRegion(VAddr addr, u64 size,
|
||||
VideoCommon::CacheType which = VideoCommon::CacheType::All) override;
|
||||
void InnerInvalidation(std::span<const std::pair<VAddr, std::size_t>> sequences) override;
|
||||
void OnCPUWrite(VAddr addr, u64 size) override;
|
||||
void InvalidateGPUCache() override;
|
||||
void UnmapMemory(VAddr addr, u64 size) override;
|
||||
|
|
Loading…
Reference in a new issue