From 9d8ca6cc4a6994e2fd7bf1628fee8615a45d2e45 Mon Sep 17 00:00:00 2001 From: ReinUsesLisp Date: Sat, 16 Jan 2021 19:52:45 -0300 Subject: [PATCH] buffer_base: Add support for cached CPU writes Some games usually write memory pages currently used by the GPU, causing rendering issues (e.g. flashing geometry and shadows on Link's Awakening). To work around this issue, Guest CPU writes are delayed until the command buffer finishes processing, but the pages are updated immediately. The overall behavior is: - CPU writes are cached until they are flushed; they update the page state, but don't change the modification state. Cached writes stop pages from being flushed, in case games have meaningful data in them. - Command processing writes (e.g. push constants) update the page state and are marked to the command processor as dirty. They don't remove the state of cached writes. --- src/video_core/buffer_cache/buffer_base.h | 206 +++++++++++++++------- 1 file changed, 145 insertions(+), 61 deletions(-) diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index ee8602ce97..8a5e6a3e7d 100644 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -19,6 +19,7 @@ namespace VideoCommon { enum class BufferFlagBits { Picked = 1 << 0, + CachedWrites = 1 << 1, }; DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits) @@ -40,7 +41,7 @@ class BufferBase { static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; /// Vector tracking modified pages tightly packed with small vector optimization - union WrittenWords { + union WordsArray { /// Returns the pointer to the words state [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { return is_short ? 
&stack : heap; @@ -55,49 +56,59 @@ class BufferBase { u64* heap; ///< Not-small buffers pointer to the storage }; - struct GpuCpuWords { - explicit GpuCpuWords() = default; - explicit GpuCpuWords(u64 size_bytes_) : size_bytes{size_bytes_} { + struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { if (IsShort()) { cpu.stack = ~u64{0}; gpu.stack = 0; + cached_cpu.stack = 0; + untracked.stack = ~u64{0}; } else { // Share allocation between CPU and GPU pages and set their default values const size_t num_words = NumWords(); - u64* const alloc = new u64[num_words * 2]; + u64* const alloc = new u64[num_words * 4]; cpu.heap = alloc; gpu.heap = alloc + num_words; + cached_cpu.heap = alloc + num_words * 2; + untracked.heap = alloc + num_words * 3; std::fill_n(cpu.heap, num_words, ~u64{0}); std::fill_n(gpu.heap, num_words, 0); + std::fill_n(cached_cpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); } // Clean up tailing bits - const u64 last_local_page = - Common::DivCeil(size_bytes % BYTES_PER_WORD, BYTES_PER_PAGE); + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; - u64& last_word = cpu.Pointer(IsShort())[NumWords() - 1]; - last_word = (last_word << shift) >> shift; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; } - ~GpuCpuWords() { + ~Words() { Release(); } - GpuCpuWords& operator=(GpuCpuWords&& rhs) noexcept { + Words& operator=(Words&& rhs) noexcept { Release(); size_bytes = rhs.size_bytes; cpu = rhs.cpu; gpu = rhs.gpu; + cached_cpu = rhs.cached_cpu; + untracked = rhs.untracked; rhs.cpu.heap = nullptr; return *this; } - GpuCpuWords(GpuCpuWords&& rhs) noexcept - : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu} { + 
Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, + cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { rhs.cpu.heap = nullptr; } - GpuCpuWords& operator=(const GpuCpuWords&) = delete; - GpuCpuWords(const GpuCpuWords&) = delete; + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; /// Returns true when the buffer fits in the small vector optimization [[nodiscard]] bool IsShort() const noexcept { @@ -118,8 +129,17 @@ class BufferBase { } u64 size_bytes = 0; - WrittenWords cpu; - WrittenWords gpu; + WordsArray cpu; + WordsArray gpu; + WordsArray cached_cpu; + WordsArray untracked; + }; + + enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, }; public: @@ -132,68 +152,93 @@ public: BufferBase& operator=(const BufferBase&) = delete; BufferBase(const BufferBase&) = delete; + BufferBase& operator=(BufferBase&&) = default; + BufferBase(BufferBase&&) = default; + /// Returns the inclusive CPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return ModifiedRegion(offset, query_size); } /// Returns the inclusive GPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return ModifiedRegion(offset, query_size); } /// Returns true if a region has been modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return IsRegionModified(offset, query_size); } /// Returns true if a region has been modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 
offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return IsRegionModified(offset, query_size); } /// Mark region as CPU modified, notifying the rasterizer about this change void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(words.cpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as CPU modified, notifying the rasterizer about this change void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(words.cpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the host GPU void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(words.gpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as modified from the host GPU void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(words.gpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); + } + + /// Mark region as modified from the CPU + /// but don't mark it as modified until FlushCachedWrites is called. 
+ void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { + flags |= BufferFlagBits::CachedWrites; + ChangeRegionState(dirty_cpu_addr, size); + } + + /// Flushes cached CPU writes, and notify the rasterizer about the deltas + void FlushCachedWrites() noexcept { + flags &= ~BufferFlagBits::CachedWrites; + const u64 num_words = NumWords(); + const u64* const cached_words = Array(); + u64* const untracked_words = Array(); + u64* const cpu_words = Array(); + for (u64 word_index = 0; word_index < num_words; ++word_index) { + const u64 cached_bits = cached_words[word_index]; + NotifyRasterizer(word_index, untracked_words[word_index], cached_bits); + untracked_words[word_index] |= cached_bits; + cpu_words[word_index] |= cached_bits; + } } /// Call 'func' for each CPU modified range and unmark those pages as CPU modified template void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, func); + ForEachModifiedRange(query_cpu_range, size, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, func); + ForEachModifiedRange(query_cpu_range, size, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(Func&& func) { - ForEachModifiedRange(cpu_addr, SizeBytes(), func); + ForEachModifiedRange(cpu_addr, SizeBytes(), func); } /// Mark buffer as picked @@ -216,6 +261,11 @@ public: return True(flags & BufferFlagBits::Picked); } + /// Returns true when the buffer has pending cached writes + [[nodiscard]] bool HasCachedWrites() const noexcept { + return True(flags & BufferFlagBits::CachedWrites); + } + /// Returns the base CPU address of the buffer [[nodiscard]] VAddr CpuAddr() const noexcept { return cpu_addr; @@ -233,26 +283,48 @@ public: } private: + template + u64* Array() noexcept { + if 
constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + /** * Change the state of a range of pages * - * @param written_words Pages to be marked or unmarked as modified * @param dirty_addr Base address to mark or unmark as modified * @param size Size in bytes to mark or unmark as modified - * - * @tparam enable True when the bits will be set to one, false for zero - * @tparam notify_rasterizer True when the rasterizer has to be notified about the changes */ - template - void ChangeRegionState(WrittenWords& written_words, u64 dirty_addr, - s64 size) noexcept(!notify_rasterizer) { + template + void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { const s64 difference = dirty_addr - cpu_addr; const u64 offset = std::max(difference, 0); size += std::min(difference, 0); if (offset >= SizeBytes() || size < 0) { return; } - u64* const state_words = written_words.Pointer(IsShort()); + u64* const untracked_words = Array(); + u64* const state_words = Array(); const u64 offset_end = std::min(offset + size, SizeBytes()); const u64 begin_page_index = offset / BYTES_PER_PAGE; const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; @@ -268,13 +340,19 @@ private: u64 bits = ~u64{0}; bits = (bits >> right_offset) << right_offset; bits = (bits << 
left_offset) >> left_offset; - if constexpr (notify_rasterizer) { - NotifyRasterizer(word_index, state_words[word_index], bits); + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer(word_index, untracked_words[word_index], bits); } if constexpr (enable) { state_words[word_index] |= bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] |= bits; + } } else { state_words[word_index] &= ~bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] &= ~bits; + } } page_index = 0; ++word_index; @@ -291,7 +369,7 @@ private: * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages */ template - void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) { + void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; while (changed_bits != 0) { @@ -315,21 +393,20 @@ private: * @param query_cpu_range Base CPU address to loop over * @param size Size in bytes of the CPU range to loop over * @param func Function to call for each turned off region - * - * @tparam gpu True for host GPU pages, false for CPU pages - * @tparam notify_rasterizer True when the rasterizer should be notified about state changes */ - template + template void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + static_assert(type != Type::Untracked); + const s64 difference = query_cpu_range - cpu_addr; const u64 query_begin = std::max(difference, 0); size += std::min(difference, 0); if (query_begin >= SizeBytes() || size < 0) { return; } - const u64* const cpu_words = words.cpu.Pointer(IsShort()); + u64* const untracked_words = Array(); + u64* const state_words = Array(); const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); - u64* const state_words = (gpu ? 
words.gpu : words.cpu).Pointer(IsShort()); u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); @@ -345,7 +422,8 @@ private: const u64 word_index_end = std::distance(state_words, last_modified_word); const unsigned local_page_begin = std::countr_zero(*first_modified_word); - const unsigned local_page_end = PAGES_PER_WORD - std::countl_zero(last_modified_word[-1]); + const unsigned local_page_end = + static_cast(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; const u64 query_page_begin = query_begin / BYTES_PER_PAGE; @@ -371,11 +449,13 @@ private: const u64 current_word = state_words[word_index] & bits; state_words[word_index] &= ~bits; - // Exclude CPU modified pages when visiting GPU pages - const u64 word = current_word & ~(gpu ? cpu_words[word_index] : 0); - if constexpr (notify_rasterizer) { - NotifyRasterizer(word_index, word, ~u64{0}); + if constexpr (type == Type::CPU) { + const u64 current_bits = untracked_words[word_index] & bits; + untracked_words[word_index] &= ~bits; + NotifyRasterizer(word_index, current_bits, ~u64{0}); } + // Exclude CPU modified pages when visiting GPU pages + const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); u64 page = page_begin; page_begin = 0; @@ -416,17 +496,20 @@ private: * @param offset Offset in bytes from the start of the buffer * @param size Size in bytes of the region to query for modifications */ - template + template [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { - const u64* const cpu_words = words.cpu.Pointer(IsShort()); - const u64* const state_words = (gpu ? 
words.gpu : words.cpu).Pointer(IsShort()); + static_assert(type != Type::Untracked); + + const u64* const untracked_words = Array(); + const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_end = std::min(word_begin + num_query_words, NumWords()); const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; if (word == 0) { continue; } @@ -445,13 +528,13 @@ private: * * @param offset Offset in bytes from the start of the buffer * @param size Size in bytes of the region to query for modifications - * - * @tparam gpu True to query GPU modified pages, false for CPU pages */ - template + template [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { - const u64* const cpu_words = words.cpu.Pointer(IsShort()); - const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); + static_assert(type != Type::Untracked); + + const u64* const untracked_words = Array(); + const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_end = std::min(word_begin + num_query_words, NumWords()); @@ -460,7 +543,8 @@ private: u64 begin = std::numeric_limits::max(); u64 end = 0; for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); + const u64 off_word = type == Type::GPU ? 
untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; if (word == 0) { continue; } @@ -488,7 +572,7 @@ private: RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; - GpuCpuWords words; + Words words; BufferFlagBits flags{}; };