diff --git a/src/core/core_timing.cpp b/src/core/core_timing.cpp index 6ac0007764..1abfa920c4 100644 --- a/src/core/core_timing.cpp +++ b/src/core/core_timing.cpp @@ -26,6 +26,24 @@ std::shared_ptr CreateEvent(std::string name, TimedCallback&& callbac return std::make_shared(std::move(callback), std::move(name)); } +struct CoreTiming::Event { + s64 time; + u64 fifo_order; + std::weak_ptr type; + s64 reschedule_time; + heap_t::handle_type handle{}; + + // Sort by time, unless the times are the same, in which case sort by + // the order added to the queue + friend bool operator>(const Event& left, const Event& right) { + return std::tie(left.time, left.fifo_order) > std::tie(right.time, right.fifo_order); + } + + friend bool operator<(const Event& left, const Event& right) { + return std::tie(left.time, left.fifo_order) < std::tie(right.time, right.fifo_order); + } +}; + CoreTiming::CoreTiming() : clock{Common::CreateOptimalClock()} {} CoreTiming::~CoreTiming() { @@ -69,7 +87,7 @@ void CoreTiming::Pause(bool is_paused) { } void CoreTiming::SyncPause(bool is_paused) { - if (is_paused == paused && paused_set == is_paused) { + if (is_paused == paused && paused_set == paused) { return; } @@ -94,7 +112,7 @@ bool CoreTiming::IsRunning() const { bool CoreTiming::HasPendingEvents() const { std::scoped_lock lock{basic_lock}; - return !event_queue.empty(); + return !(wait_set && event_queue.empty()); } void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future, @@ -103,8 +121,8 @@ void CoreTiming::ScheduleEvent(std::chrono::nanoseconds ns_into_future, std::scoped_lock scope{basic_lock}; const auto next_time{absolute_time ? ns_into_future : GetGlobalTimeNs() + ns_into_future}; - event_queue.emplace_back(Event{next_time.count(), event_fifo_id++, event_type}); - std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>()); + auto h{event_queue.emplace(Event{next_time.count(), event_fifo_id++, event_type, 0})}; + (*h).handle = h; } event.Set(); @@ -118,9 +136,9 @@ void CoreTiming::ScheduleLoopingEvent(std::chrono::nanoseconds start_time, std::scoped_lock scope{basic_lock}; const auto next_time{absolute_time ? 
start_time : GetGlobalTimeNs() + start_time}; - event_queue.emplace_back( - Event{next_time.count(), event_fifo_id++, event_type, resched_time.count()}); - std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>()); + auto h{event_queue.emplace( + Event{next_time.count(), event_fifo_id++, event_type, resched_time.count()})}; + (*h).handle = h; } event.Set(); @@ -131,11 +149,17 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr& event_type, { std::scoped_lock lk{basic_lock}; - event_queue.erase( - std::remove_if(event_queue.begin(), event_queue.end(), - [&](const Event& e) { return e.type.lock().get() == event_type.get(); }), - event_queue.end()); - std::make_heap(event_queue.begin(), event_queue.end(), std::greater<>()); + std::vector to_remove; + for (auto itr = event_queue.begin(); itr != event_queue.end(); itr++) { + const Event& e = *itr; + if (e.type.lock().get() == event_type.get()) { + to_remove.push_back(itr->handle); + } + } + + for (auto& h : to_remove) { + event_queue.erase(h); + } event_type->sequence_number++; } @@ -148,7 +172,7 @@ void CoreTiming::UnscheduleEvent(const std::shared_ptr& event_type, void CoreTiming::AddTicks(u64 ticks_to_add) { cpu_ticks += ticks_to_add; - downcount -= static_cast(ticks_to_add); + downcount -= static_cast(cpu_ticks); } void CoreTiming::Idle() { @@ -156,7 +180,7 @@ void CoreTiming::Idle() { } void CoreTiming::ResetTicks() { - downcount.store(MAX_SLICE_LENGTH, std::memory_order_release); + downcount = MAX_SLICE_LENGTH; } u64 CoreTiming::GetClockTicks() const { @@ -177,38 +201,48 @@ std::optional CoreTiming::Advance() { std::scoped_lock lock{advance_lock, basic_lock}; global_timer = GetGlobalTimeNs().count(); - while (!event_queue.empty() && event_queue.front().time <= global_timer) { - Event evt = std::move(event_queue.front()); - std::pop_heap(event_queue.begin(), event_queue.end(), std::greater<>()); - event_queue.pop_back(); + while (!event_queue.empty() && event_queue.top().time <= global_timer) { + const Event& evt = event_queue.top(); - if (const auto event_type = evt.type.lock()) { + if (const auto event_type{evt.type.lock()}) { const auto evt_time = evt.time; const auto evt_sequence_num = event_type->sequence_number; - basic_lock.unlock(); + if (evt.reschedule_time == 0) { + event_queue.pop(); - const auto new_schedule_time = event_type->callback( - evt_time, std::chrono::nanoseconds{GetGlobalTimeNs().count() - evt_time}); + basic_lock.unlock(); - basic_lock.lock(); + event_type->callback( + evt_time, std::chrono::nanoseconds{GetGlobalTimeNs().count() - evt_time}); - if (evt_sequence_num != event_type->sequence_number) { - continue; - } + basic_lock.lock(); + } else { + basic_lock.unlock(); - if (new_schedule_time.has_value() || evt.reschedule_time != 0) { - const auto next_schedule_time = new_schedule_time.value_or( - std::chrono::nanoseconds{evt.reschedule_time}); + const auto new_schedule_time{event_type->callback( + evt_time, std::chrono::nanoseconds{GetGlobalTimeNs().count() - evt_time})}; - auto next_time = evt.time + next_schedule_time.count(); - if (evt.time < pause_end_time) { - next_time = pause_end_time + next_schedule_time.count(); + basic_lock.lock(); + + if (evt_sequence_num != event_type->sequence_number) { + // Heap handle is invalidated after external modification. 
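+                        // A changed sequence number means UnscheduleEvent ran while basic_lock
+                        // was released for the callback, so this entry was already erased from
+                        // the heap; skip the reschedule.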
+ continue; } - event_queue.emplace_back(Event{next_time, event_fifo_id++, evt.type, - next_schedule_time.count()}); - std::push_heap(event_queue.begin(), event_queue.end(), std::greater<>()); + const auto next_schedule_time{new_schedule_time.has_value() + ? new_schedule_time.value().count() + : evt.reschedule_time}; + + // If this event was scheduled into a pause, its time now is going to be way + // behind. Re-set this event to continue from the end of the pause. + auto next_time{evt.time + next_schedule_time}; + if (evt.time < pause_end_time) { + next_time = pause_end_time + next_schedule_time; + } + + event_queue.update(evt.handle, Event{next_time, event_fifo_id++, evt.type, + next_schedule_time, evt.handle}); } } @@ -216,7 +250,7 @@ std::optional CoreTiming::Advance() { } if (!event_queue.empty()) { - return event_queue.front().time; + return event_queue.top().time; } else { return std::nullopt; } @@ -235,7 +269,7 @@ void CoreTiming::ThreadLoop() { #ifdef _WIN32 while (!paused && !event.IsSet() && wait_time > 0) { wait_time = *next_time - GetGlobalTimeNs().count(); - if (wait_time >= 1'000'000) { // 1ms + if (wait_time >= timer_resolution_ns) { Common::Windows::SleepForOneTick(); } else { #ifdef ARCHITECTURE_x86_64 @@ -256,8 +290,10 @@ void CoreTiming::ThreadLoop() { } else { // Queue is empty, wait until another event is scheduled and signals us to // continue. + wait_set = true; event.Wait(); } + wait_set = false; } paused_set = true; @@ -291,4 +327,10 @@ std::chrono::microseconds CoreTiming::GetGlobalTimeUs() const { return std::chrono::microseconds{Common::WallClock::CPUTickToUS(cpu_ticks)}; } +#ifdef _WIN32 +void CoreTiming::SetTimerResolutionNs(std::chrono::nanoseconds ns) { + timer_resolution_ns = ns.count(); +} +#endif + } // namespace Core::Timing diff --git a/src/core/core_timing.h b/src/core/core_timing.h index 5b42b0e491..7e4dff7f3d 100644 --- a/src/core/core_timing.h +++ b/src/core/core_timing.h @@ -11,7 +11,8 @@ #include #include #include -#include + +#include #include "common/common_types.h" #include "common/thread.h" @@ -42,6 +43,18 @@ enum class UnscheduleEventType { NoWait, }; +/** + * This is a system to schedule events into the emulated machine's future. Time is measured + * in main CPU clock cycles. + * + * To schedule an event, you first have to register its type. This is where you pass in the + * callback. You then schedule events using the type ID you get back. + * + * The s64 ns_late that the callbacks get is how many ns late it was. + * So to schedule a new event on a regular basis: + * inside callback: + * ScheduleEvent(period_in_ns - ns_late, callback, "whatever") + */ class CoreTiming { public: CoreTiming(); @@ -53,56 +66,99 @@ public: CoreTiming& operator=(const CoreTiming&) = delete; CoreTiming& operator=(CoreTiming&&) = delete; + /// CoreTiming begins at the boundary of timing slice -1. An initial call to Advance() is + /// required to end slice - 1 and start slice 0 before the first cycle of code is executed. void Initialize(std::function&& on_thread_init_); + + /// Clear all pending events. This should ONLY be done on exit. void ClearPendingEvents(); + + /// Sets if emulation is multicore or single core, must be set before Initialize void SetMulticore(bool is_multicore_) { is_multicore = is_multicore_; } + + /// Pauses/Unpauses the execution of the timer thread. void Pause(bool is_paused); + + /// Pauses/Unpauses the execution of the timer thread and waits until paused. void SyncPause(bool is_paused); + + /// Checks if core timing is running. 
bool IsRunning() const; + + /// Checks if the timer thread has started. bool HasStarted() const { return has_started; } + + /// Checks if there are any pending time events. bool HasPendingEvents() const; + + /// Schedules an event in core timing void ScheduleEvent(std::chrono::nanoseconds ns_into_future, const std::shared_ptr& event_type, bool absolute_time = false); + + /// Schedules an event which will automatically re-schedule itself with the given time, until + /// unscheduled void ScheduleLoopingEvent(std::chrono::nanoseconds start_time, std::chrono::nanoseconds resched_time, const std::shared_ptr& event_type, bool absolute_time = false); + void UnscheduleEvent(const std::shared_ptr& event_type, UnscheduleEventType type = UnscheduleEventType::Wait); + void AddTicks(u64 ticks_to_add); + void ResetTicks(); + void Idle(); + s64 GetDowncount() const { - return downcount.load(std::memory_order_relaxed); + return downcount; } + + /// Returns the current CNTPCT tick value. u64 GetClockTicks() const; + + /// Returns the current GPU tick value. u64 GetGPUTicks() const; + + /// Returns current time in microseconds. std::chrono::microseconds GetGlobalTimeUs() const; + + /// Returns current time in nanoseconds. std::chrono::nanoseconds GetGlobalTimeNs() const; + + /// Checks for events manually and returns time in nanoseconds for next event, threadsafe. std::optional Advance(); +#ifdef _WIN32 + void SetTimerResolutionNs(std::chrono::nanoseconds ns); +#endif + private: - struct Event { - s64 time; - u64 fifo_order; - std::shared_ptr type; - bool operator>(const Event& other) const { - return std::tie(time, fifo_order) > std::tie(other.time, other.fifo_order); - } - }; + struct Event; static void ThreadEntry(CoreTiming& instance); void ThreadLoop(); + void Reset(); std::unique_ptr clock; - std::atomic global_timer{0}; - std::vector event_queue; - std::atomic event_fifo_id{0}; + + s64 global_timer = 0; + +#ifdef _WIN32 + s64 timer_resolution_ns; +#endif + + using heap_t = + boost::heap::fibonacci_heap>>; + + heap_t event_queue; + u64 event_fifo_id = 0; Common::Event event{}; Common::Event pause_event{}; @@ -117,12 +173,20 @@ private: std::function on_thread_init{}; bool is_multicore{}; - std::atomic pause_end_time{}; + s64 pause_end_time{}; - std::atomic cpu_ticks{}; - std::atomic downcount{}; + /// Cycle timing + u64 cpu_ticks{}; + s64 downcount{}; }; +/// Creates a core timing event with the given name and callback. +/// +/// @param name The name of the core timing event to create. +/// @param callback The callback to execute for the event. +/// +/// @returns An EventType instance representing the created event. +/// std::shared_ptr CreateEvent(std::string name, TimedCallback&& callback); } // namespace Core::Timing diff --git a/src/core/cpu_manager.cpp b/src/core/cpu_manager.cpp index e7e341de16..9b1c773877 100644 --- a/src/core/cpu_manager.cpp +++ b/src/core/cpu_manager.cpp @@ -1,12 +1,6 @@ // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project // SPDX-License-Identifier: GPL-2.0-or-later -#include -#include -#include -#include -#include - #include "common/fiber.h" #include "common/microprofile.h" #include "common/scope_exit.h" @@ -30,7 +24,6 @@ void CpuManager::Initialize() { num_cores = is_multicore ? 
Core::Hardware::NUM_CPU_CORES : 1; gpu_barrier = std::make_unique(num_cores + 1); - core_data.resize(num_cores); for (std::size_t core = 0; core < num_cores; core++) { core_data[core].host_thread = std::jthread([this, core](std::stop_token token) { RunThread(token, core); }); @@ -38,10 +31,10 @@ void CpuManager::Initialize() { } void CpuManager::Shutdown() { - for (auto& data : core_data) { - if (data.host_thread.joinable()) { - data.host_thread.request_stop(); - data.host_thread.join(); + for (std::size_t core = 0; core < num_cores; core++) { + if (core_data[core].host_thread.joinable()) { + core_data[core].host_thread.request_stop(); + core_data[core].host_thread.join(); } } } @@ -73,7 +66,12 @@ void CpuManager::HandleInterrupt() { Kernel::KInterruptManager::HandleInterrupt(kernel, static_cast(core_index)); } +/////////////////////////////////////////////////////////////////////////////// +/// MultiCore /// +/////////////////////////////////////////////////////////////////////////////// + void CpuManager::MultiCoreRunGuestThread() { + // Similar to UserModeThreadStarter in HOS auto& kernel = system.Kernel(); auto* thread = Kernel::GetCurrentThreadPointer(kernel); kernel.CurrentScheduler()->OnThreadStart(); @@ -90,6 +88,10 @@ void CpuManager::MultiCoreRunGuestThread() { } void CpuManager::MultiCoreRunIdleThread() { + // Not accurate to HOS. Remove this entire method when singlecore is removed. + // See notes in KScheduler::ScheduleImpl for more information about why this + // is inaccurate. + auto& kernel = system.Kernel(); kernel.CurrentScheduler()->OnThreadStart(); @@ -103,6 +105,10 @@ void CpuManager::MultiCoreRunIdleThread() { } } +/////////////////////////////////////////////////////////////////////////////// +/// SingleCore /// +/////////////////////////////////////////////////////////////////////////////// + void CpuManager::SingleCoreRunGuestThread() { auto& kernel = system.Kernel(); auto* thread = Kernel::GetCurrentThreadPointer(kernel); @@ -148,16 +154,19 @@ void CpuManager::PreemptSingleCore(bool from_running_environment) { system.CoreTiming().Advance(); kernel.SetIsPhantomModeForSingleCore(false); } - current_core.store((current_core + 1) % Core::Hardware::NUM_CPU_CORES, std::memory_order_release); + current_core.store((current_core + 1) % Core::Hardware::NUM_CPU_CORES); system.CoreTiming().ResetTicks(); kernel.Scheduler(current_core).PreemptSingleCore(); + // We've now been scheduled again, and we may have exchanged schedulers. + // Reload the scheduler in case it's different. if (!kernel.Scheduler(current_core).IsIdle()) { idle_count = 0; } } void CpuManager::GuestActivate() { + // Similar to the HorizonKernelMain callback in HOS auto& kernel = system.Kernel(); auto* scheduler = kernel.CurrentScheduler(); @@ -175,19 +184,27 @@ void CpuManager::ShutdownThread() { } void CpuManager::RunThread(std::stop_token token, std::size_t core) { + /// Initialization system.RegisterCoreThread(core); - std::string name = is_multicore ? 
"CPUCore_" + std::to_string(core) : "CPUThread"; + std::string name; + if (is_multicore) { + name = "CPUCore_" + std::to_string(core); + } else { + name = "CPUThread"; + } MicroProfileOnThreadCreate(name.c_str()); Common::SetCurrentThreadName(name.c_str()); Common::SetCurrentThreadPriority(Common::ThreadPriority::Critical); auto& data = core_data[core]; data.host_context = Common::Fiber::ThreadToFiber(); + // Cleanup SCOPE_EXIT { data.host_context->Exit(); MicroProfileOnThreadExit(); }; + // Running if (!gpu_barrier->Sync(token)) { return; } diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 6e53db3640..f7eae9c598 100644 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -6,7 +6,6 @@ #include #include #include -#include #include "common/assert.h" #include "common/atomic_ops.h" @@ -33,18 +32,17 @@ namespace Core::Memory { namespace { -constexpr size_t PAGE_SIZE = 0x1000; -constexpr size_t PAGE_BITS = 12; -constexpr size_t PAGE_MASK = PAGE_SIZE - 1; - -inline bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr, - const std::size_t size) { +bool AddressSpaceContains(const Common::PageTable& table, const Common::ProcessAddress addr, + const std::size_t size) { const Common::ProcessAddress max_addr = 1ULL << table.GetAddressSpaceBits(); return addr + size >= addr && addr + size <= max_addr; } -} // Anonymous namespace +} // namespace +// Implementation class used to keep the specifics of the memory subsystem hidden +// from outside classes. This also allows modification to the internals of the memory +// subsystem without needing to rebuild all files that make use of the memory interface. struct Memory::Impl { explicit Impl(Core::System& system_) : system{system_} {} @@ -68,11 +66,12 @@ struct Memory::Impl { void MapMemoryRegion(Common::PageTable& page_table, Common::ProcessAddress base, u64 size, Common::PhysicalAddress target, Common::MemoryPermission perms, bool separate_heap) { - ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: {:016X}", size); - ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: {:016X}", GetInteger(base)); + ASSERT_MSG((size & SUYU_PAGEMASK) == 0, "non-page aligned size: {:016X}", size); + ASSERT_MSG((base & SUYU_PAGEMASK) == 0, "non-page aligned base: {:016X}", GetInteger(base)); ASSERT_MSG(target >= DramMemoryMap::Base, "Out of bounds target: {:016X}", GetInteger(target)); - MapPages(page_table, base / PAGE_SIZE, size / PAGE_SIZE, target, Common::PageType::Memory); + MapPages(page_table, base / SUYU_PAGESIZE, size / SUYU_PAGESIZE, target, + Common::PageType::Memory); if (current_page_table->fastmem_arena) { buffer->Map(GetInteger(base), GetInteger(target) - DramMemoryMap::Base, size, perms, @@ -82,9 +81,10 @@ struct Memory::Impl { void UnmapRegion(Common::PageTable& page_table, Common::ProcessAddress base, u64 size, bool separate_heap) { - ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: {:016X}", size); - ASSERT_MSG((base & PAGE_MASK) == 0, "non-page aligned base: {:016X}", GetInteger(base)); - MapPages(page_table, base / PAGE_SIZE, size / PAGE_SIZE, 0, Common::PageType::Unmapped); + ASSERT_MSG((size & SUYU_PAGEMASK) == 0, "non-page aligned size: {:016X}", size); + ASSERT_MSG((base & SUYU_PAGEMASK) == 0, "non-page aligned base: {:016X}", GetInteger(base)); + MapPages(page_table, base / SUYU_PAGESIZE, size / SUYU_PAGESIZE, 0, + Common::PageType::Unmapped); if (current_page_table->fastmem_arena) { buffer->Unmap(GetInteger(base), size, separate_heap); @@ -93,28 +93,55 @@ struct 
Memory::Impl { void ProtectRegion(Common::PageTable& page_table, VAddr vaddr, u64 size, Common::MemoryPermission perms) { - ASSERT_MSG((size & PAGE_MASK) == 0, "non-page aligned size: {:016X}", size); - ASSERT_MSG((vaddr & PAGE_MASK) == 0, "non-page aligned base: {:016X}", vaddr); + ASSERT_MSG((size & SUYU_PAGEMASK) == 0, "non-page aligned size: {:016X}", size); + ASSERT_MSG((vaddr & SUYU_PAGEMASK) == 0, "non-page aligned base: {:016X}", vaddr); if (!current_page_table->fastmem_arena) { return; } - for (u64 addr = vaddr; addr < vaddr + size; addr += PAGE_SIZE) { + u64 protect_bytes{}; + u64 protect_begin{}; + for (u64 addr = vaddr; addr < vaddr + size; addr += SUYU_PAGESIZE) { const Common::PageType page_type{ - current_page_table->pointers[addr >> PAGE_BITS].Type()}; - if (page_type != Common::PageType::RasterizerCachedMemory) { - buffer->Protect(addr, PAGE_SIZE, perms); + current_page_table->pointers[addr >> SUYU_PAGEBITS].Type()}; + switch (page_type) { + case Common::PageType::RasterizerCachedMemory: + if (protect_bytes > 0) { + buffer->Protect(protect_begin, protect_bytes, perms); + protect_bytes = 0; + } + break; + default: + if (protect_bytes == 0) { + protect_begin = addr; + } + protect_bytes += SUYU_PAGESIZE; } } + + if (protect_bytes > 0) { + buffer->Protect(protect_begin, protect_bytes, perms); + } } - u8* GetPointerFromRasterizerCachedMemory(u64 vaddr) const { + [[nodiscard]] u8* GetPointerFromRasterizerCachedMemory(u64 vaddr) const { const Common::PhysicalAddress paddr{ - current_page_table->backing_addr[vaddr >> PAGE_BITS]}; + current_page_table->backing_addr[vaddr >> SUYU_PAGEBITS]}; if (!paddr) { - return nullptr; + return {}; + } + + return system.DeviceMemory().GetPointer(paddr + vaddr); + } + + [[nodiscard]] u8* GetPointerFromDebugMemory(u64 vaddr) const { + const Common::PhysicalAddress paddr{ + current_page_table->backing_addr[vaddr >> SUYU_PAGEBITS]}; + + if (paddr == 0) { + return {}; } return system.DeviceMemory().GetPointer(paddr + vaddr); @@ -128,7 +155,9 @@ struct Memory::Impl { if ((addr & 1) == 0) { return Read(addr); } else { - return Read(addr) | static_cast(Read(addr + sizeof(u8))) << 8; + const u32 a{Read(addr)}; + const u32 b{Read(addr + sizeof(u8))}; + return static_cast((b << 8) | a); } } @@ -136,7 +165,9 @@ struct Memory::Impl { if ((addr & 3) == 0) { return Read(addr); } else { - return Read16(addr) | static_cast(Read16(addr + sizeof(u16))) << 16; + const u32 a{Read16(addr)}; + const u32 b{Read16(addr + sizeof(u16))}; + return (b << 16) | a; } } @@ -144,7 +175,9 @@ struct Memory::Impl { if ((addr & 7) == 0) { return Read(addr); } else { - return Read32(addr) | static_cast(Read32(addr + sizeof(u32))) << 32; + const u32 a{Read32(addr)}; + const u32 b{Read32(addr + sizeof(u32))}; + return (static_cast(b) << 32) | a; } } @@ -199,7 +232,7 @@ struct Memory::Impl { std::string string; string.reserve(max_length); for (std::size_t i = 0; i < max_length; ++i) { - const char c = Read(vaddr); + const char c = Read(vaddr); if (c == '\0') { break; } @@ -210,72 +243,648 @@ struct Memory::Impl { return string; } - template - T Read(const Common::ProcessAddress vaddr) { - T value; - const u8* const ptr = GetPointerFromRasterizerCachedMemory(GetInteger(vaddr)); - if (ptr) { - std::memcpy(&value, ptr, sizeof(T)); - } else { - LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:016X}", sizeof(T) * 8, GetInteger(vaddr)); - value = 0; + bool WalkBlock(const Common::ProcessAddress addr, const std::size_t size, auto on_unmapped, + auto on_memory, auto on_rasterizer, auto increment) { 
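+        // Walks [addr, addr + size) one guest page at a time, dispatching each chunk to
+        // on_unmapped / on_memory / on_rasterizer depending on the page type, and calling
+        // increment(copy_amount) so the caller can advance its own cursor. Returns false
+        // if the range touched an unmapped page.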
+ const auto& page_table = *current_page_table; + std::size_t remaining_size = size; + std::size_t page_index = addr >> SUYU_PAGEBITS; + std::size_t page_offset = addr & SUYU_PAGEMASK; + bool user_accessible = true; + + if (!AddressSpaceContains(page_table, addr, size)) [[unlikely]] { + on_unmapped(size, addr); + return false; } - return value; + + while (remaining_size) { + const std::size_t copy_amount = + std::min(static_cast(SUYU_PAGESIZE) - page_offset, remaining_size); + const auto current_vaddr = + static_cast((page_index << SUYU_PAGEBITS) + page_offset); + + const auto [pointer, type] = page_table.pointers[page_index].PointerType(); + switch (type) { + case Common::PageType::Unmapped: { + user_accessible = false; + on_unmapped(copy_amount, current_vaddr); + break; + } + case Common::PageType::Memory: { + u8* mem_ptr = + reinterpret_cast(pointer + page_offset + (page_index << SUYU_PAGEBITS)); + on_memory(copy_amount, mem_ptr); + break; + } + case Common::PageType::DebugMemory: { + u8* const mem_ptr{GetPointerFromDebugMemory(current_vaddr)}; + on_memory(copy_amount, mem_ptr); + break; + } + case Common::PageType::RasterizerCachedMemory: { + u8* const host_ptr{GetPointerFromRasterizerCachedMemory(current_vaddr)}; + on_rasterizer(current_vaddr, copy_amount, host_ptr); + break; + } + default: + UNREACHABLE(); + } + + page_index++; + page_offset = 0; + increment(copy_amount); + remaining_size -= copy_amount; + } + + return user_accessible; } + template + bool ReadBlockImpl(const Common::ProcessAddress src_addr, void* dest_buffer, + const std::size_t size) { + return WalkBlock( + src_addr, size, + [src_addr, size, &dest_buffer](const std::size_t copy_amount, + const Common::ProcessAddress current_vaddr) { + LOG_ERROR(HW_Memory, + "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", + GetInteger(current_vaddr), GetInteger(src_addr), size); + std::memset(dest_buffer, 0, copy_amount); + }, + [&](const std::size_t copy_amount, const u8* const src_ptr) { + std::memcpy(dest_buffer, src_ptr, copy_amount); + }, + [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount, + const u8* const host_ptr) { + if constexpr (!UNSAFE) { + HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount); + } + std::memcpy(dest_buffer, host_ptr, copy_amount); + }, + [&](const std::size_t copy_amount) { + dest_buffer = static_cast(dest_buffer) + copy_amount; + }); + } + + bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer, + const std::size_t size) { + return ReadBlockImpl(src_addr, dest_buffer, size); + } + + bool ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer, + const std::size_t size) { + return ReadBlockImpl(src_addr, dest_buffer, size); + } + + const u8* GetSpan(const VAddr src_addr, const std::size_t size) const { + if (current_page_table->blocks[src_addr >> SUYU_PAGEBITS] == + current_page_table->blocks[(src_addr + size) >> SUYU_PAGEBITS]) { + return GetPointerSilent(src_addr); + } + return nullptr; + } + + u8* GetSpan(const VAddr src_addr, const std::size_t size) { + if (current_page_table->blocks[src_addr >> SUYU_PAGEBITS] == + current_page_table->blocks[(src_addr + size) >> SUYU_PAGEBITS]) { + return GetPointerSilent(src_addr); + } + return nullptr; + } + + template + bool WriteBlockImpl(const Common::ProcessAddress dest_addr, const void* src_buffer, + const std::size_t size) { + return WalkBlock( + dest_addr, size, + [dest_addr, size](const std::size_t copy_amount, + const Common::ProcessAddress current_vaddr) { 
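+                // Destination page is unmapped: log and skip this chunk; WalkBlock reports
+                // the failure through its return value.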
+ LOG_ERROR(HW_Memory, + "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", + GetInteger(current_vaddr), GetInteger(dest_addr), size); + }, + [&](const std::size_t copy_amount, u8* const dest_ptr) { + std::memcpy(dest_ptr, src_buffer, copy_amount); + }, + [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount, + u8* const host_ptr) { + if constexpr (!UNSAFE) { + HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount); + } + std::memcpy(host_ptr, src_buffer, copy_amount); + }, + [&](const std::size_t copy_amount) { + src_buffer = static_cast(src_buffer) + copy_amount; + }); + } + + bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer, + const std::size_t size) { + return WriteBlockImpl(dest_addr, src_buffer, size); + } + + bool WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer, + const std::size_t size) { + return WriteBlockImpl(dest_addr, src_buffer, size); + } + + bool ZeroBlock(const Common::ProcessAddress dest_addr, const std::size_t size) { + return WalkBlock( + dest_addr, size, + [dest_addr, size](const std::size_t copy_amount, + const Common::ProcessAddress current_vaddr) { + LOG_ERROR(HW_Memory, + "Unmapped ZeroBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", + GetInteger(current_vaddr), GetInteger(dest_addr), size); + }, + [](const std::size_t copy_amount, u8* const dest_ptr) { + std::memset(dest_ptr, 0, copy_amount); + }, + [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount, + u8* const host_ptr) { + HandleRasterizerWrite(GetInteger(current_vaddr), copy_amount); + std::memset(host_ptr, 0, copy_amount); + }, + [](const std::size_t copy_amount) {}); + } + + bool CopyBlock(Common::ProcessAddress dest_addr, Common::ProcessAddress src_addr, + const std::size_t size) { + return WalkBlock( + dest_addr, size, + [&](const std::size_t copy_amount, const Common::ProcessAddress current_vaddr) { + LOG_ERROR(HW_Memory, + "Unmapped CopyBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", + GetInteger(current_vaddr), GetInteger(src_addr), size); + ZeroBlock(dest_addr, copy_amount); + }, + [&](const std::size_t copy_amount, const u8* const src_ptr) { + WriteBlockImpl(dest_addr, src_ptr, copy_amount); + }, + [&](const Common::ProcessAddress current_vaddr, const std::size_t copy_amount, + u8* const host_ptr) { + HandleRasterizerDownload(GetInteger(current_vaddr), copy_amount); + WriteBlockImpl(dest_addr, host_ptr, copy_amount); + }, + [&](const std::size_t copy_amount) { + dest_addr += copy_amount; + src_addr += copy_amount; + }); + } + + template + Result PerformCacheOperation(Common::ProcessAddress dest_addr, std::size_t size, + Callback&& cb) { + class InvalidMemoryException : public std::exception {}; + + try { + WalkBlock( + dest_addr, size, + [&](const std::size_t block_size, const Common::ProcessAddress current_vaddr) { + LOG_ERROR(HW_Memory, "Unmapped cache maintenance @ {:#018X}", + GetInteger(current_vaddr)); + throw InvalidMemoryException(); + }, + [&](const std::size_t block_size, u8* const host_ptr) {}, + [&](const Common::ProcessAddress current_vaddr, const std::size_t block_size, + u8* const host_ptr) { cb(current_vaddr, block_size); }, + [](const std::size_t block_size) {}); + } catch (InvalidMemoryException&) { + return Kernel::ResultInvalidCurrentMemory; + } + + return ResultSuccess; + } + + Result InvalidateDataCache(Common::ProcessAddress dest_addr, std::size_t size) { + auto on_rasterizer = [&](const Common::ProcessAddress 
current_vaddr, + const std::size_t block_size) { + // dc ivac: Invalidate to point of coherency + // GPU flush -> CPU invalidate + HandleRasterizerDownload(GetInteger(current_vaddr), block_size); + }; + return PerformCacheOperation(dest_addr, size, on_rasterizer); + } + + Result StoreDataCache(Common::ProcessAddress dest_addr, std::size_t size) { + auto on_rasterizer = [&](const Common::ProcessAddress current_vaddr, + const std::size_t block_size) { + // dc cvac: Store to point of coherency + // CPU flush -> GPU invalidate + HandleRasterizerWrite(GetInteger(current_vaddr), block_size); + }; + return PerformCacheOperation(dest_addr, size, on_rasterizer); + } + + Result FlushDataCache(Common::ProcessAddress dest_addr, std::size_t size) { + auto on_rasterizer = [&](const Common::ProcessAddress current_vaddr, + const std::size_t block_size) { + // dc civac: Store to point of coherency, and invalidate from cache + // CPU flush -> GPU invalidate + HandleRasterizerWrite(GetInteger(current_vaddr), block_size); + }; + return PerformCacheOperation(dest_addr, size, on_rasterizer); + } + + void MarkRegionDebug(u64 vaddr, u64 size, bool debug) { + if (vaddr == 0 || !AddressSpaceContains(*current_page_table, vaddr, size)) { + return; + } + + if (current_page_table->fastmem_arena) { + const auto perm{debug ? Common::MemoryPermission{} + : Common::MemoryPermission::ReadWrite}; + buffer->Protect(vaddr, size, perm); + } + + // Iterate over a contiguous CPU address space, marking/unmarking the region. + // The region is at a granularity of CPU pages. + + const u64 num_pages = ((vaddr + size - 1) >> SUYU_PAGEBITS) - (vaddr >> SUYU_PAGEBITS) + 1; + for (u64 i = 0; i < num_pages; ++i, vaddr += SUYU_PAGESIZE) { + const Common::PageType page_type{ + current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Type()}; + if (debug) { + // Switch page type to debug if now debug + switch (page_type) { + case Common::PageType::Unmapped: + ASSERT_MSG(false, "Attempted to mark unmapped pages as debug"); + break; + case Common::PageType::RasterizerCachedMemory: + case Common::PageType::DebugMemory: + // Page is already marked. + break; + case Common::PageType::Memory: + current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Store( + 0, Common::PageType::DebugMemory); + break; + default: + UNREACHABLE(); + } + } else { + // Switch page type to non-debug if now non-debug + switch (page_type) { + case Common::PageType::Unmapped: + ASSERT_MSG(false, "Attempted to mark unmapped pages as non-debug"); + break; + case Common::PageType::RasterizerCachedMemory: + case Common::PageType::Memory: + // Don't mess with already non-debug or rasterizer memory. 
+ break; + case Common::PageType::DebugMemory: { + u8* const pointer{GetPointerFromDebugMemory(vaddr & ~SUYU_PAGEMASK)}; + current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Store( + reinterpret_cast(pointer) - (vaddr & ~SUYU_PAGEMASK), + Common::PageType::Memory); + break; + } + default: + UNREACHABLE(); + } + } + } + } + + void RasterizerMarkRegionCached(u64 vaddr, u64 size, bool cached) { + if (vaddr == 0 || !AddressSpaceContains(*current_page_table, vaddr, size)) { + return; + } + + if (current_page_table->fastmem_arena) { + Common::MemoryPermission perm{}; + if (!Settings::values.use_reactive_flushing.GetValue() || !cached) { + perm |= Common::MemoryPermission::Read; + } + if (!cached) { + perm |= Common::MemoryPermission::Write; + } + buffer->Protect(vaddr, size, perm); + } + + // Iterate over a contiguous CPU address space, which corresponds to the specified GPU + // address space, marking the region as un/cached. The region is marked un/cached at a + // granularity of CPU pages, hence why we iterate on a CPU page basis (note: GPU page size + // is different). This assumes the specified GPU address region is contiguous as well. + + const u64 num_pages = ((vaddr + size - 1) >> SUYU_PAGEBITS) - (vaddr >> SUYU_PAGEBITS) + 1; + for (u64 i = 0; i < num_pages; ++i, vaddr += SUYU_PAGESIZE) { + const Common::PageType page_type{ + current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Type()}; + if (cached) { + // Switch page type to cached if now cached + switch (page_type) { + case Common::PageType::Unmapped: + // It is not necessary for a process to have this region mapped into its address + // space, for example, a system module need not have a VRAM mapping. + break; + case Common::PageType::DebugMemory: + case Common::PageType::Memory: + current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Store( + 0, Common::PageType::RasterizerCachedMemory); + break; + case Common::PageType::RasterizerCachedMemory: + // There can be more than one GPU region mapped per CPU region, so it's common + // that this area is already marked as cached. + break; + default: + UNREACHABLE(); + } + } else { + // Switch page type to uncached if now uncached + switch (page_type) { + case Common::PageType::Unmapped: // NOLINT(bugprone-branch-clone) + // It is not necessary for a process to have this region mapped into its address + // space, for example, a system module need not have a VRAM mapping. + break; + case Common::PageType::DebugMemory: + case Common::PageType::Memory: + // There can be more than one GPU region mapped per CPU region, so it's common + // that this area is already unmarked as cached. + break; + case Common::PageType::RasterizerCachedMemory: { + u8* const pointer{GetPointerFromRasterizerCachedMemory(vaddr & ~SUYU_PAGEMASK)}; + if (pointer == nullptr) { + // It's possible that this function has been called while updating the + // pagetable after unmapping a VMA. In that case the underlying VMA will no + // longer exist, and we should just leave the pagetable entry blank. + current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Store( + 0, Common::PageType::Unmapped); + } else { + current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Store( + reinterpret_cast(pointer) - (vaddr & ~SUYU_PAGEMASK), + Common::PageType::Memory); + } + break; + } + default: + UNREACHABLE(); + } + } + } + } + + /** + * Maps a region of pages as a specific type. + * + * @param page_table The page table to use to perform the mapping. + * @param base The base address to begin mapping at. 
+ * @param size The total size of the range in bytes. + * @param target The target address to begin mapping from. + * @param type The page type to map the memory as. + */ + void MapPages(Common::PageTable& page_table, Common::ProcessAddress base_address, u64 size, + Common::PhysicalAddress target, Common::PageType type) { + auto base = GetInteger(base_address); + + LOG_DEBUG(HW_Memory, "Mapping {:016X} onto {:016X}-{:016X}", GetInteger(target), + base * SUYU_PAGESIZE, (base + size) * SUYU_PAGESIZE); + + const auto end = base + size; + ASSERT_MSG(end <= page_table.pointers.size(), "out of range mapping at {:016X}", + base + page_table.pointers.size()); + + if (!target) { + ASSERT_MSG(type != Common::PageType::Memory, + "Mapping memory page without a pointer @ {:016x}", base * SUYU_PAGESIZE); + + while (base != end) { + page_table.pointers[base].Store(0, type); + page_table.backing_addr[base] = 0; + page_table.blocks[base] = 0; + base += 1; + } + } else { + auto orig_base = base; + while (base != end) { + auto host_ptr = + reinterpret_cast(system.DeviceMemory().GetPointer(target)) - + (base << SUYU_PAGEBITS); + auto backing = GetInteger(target) - (base << SUYU_PAGEBITS); + page_table.pointers[base].Store(host_ptr, type); + page_table.backing_addr[base] = backing; + page_table.blocks[base] = orig_base << SUYU_PAGEBITS; + + ASSERT_MSG(page_table.pointers[base].Pointer(), + "memory mapping base yield a nullptr within the table"); + + base += 1; + target += SUYU_PAGESIZE; + } + } + } + + [[nodiscard]] u8* GetPointerImpl(u64 vaddr, auto on_unmapped, auto on_rasterizer) const { + // AARCH64 masks the upper 16 bit of all memory accesses + vaddr = vaddr & 0xffffffffffffULL; + + if (!AddressSpaceContains(*current_page_table, vaddr, 1)) [[unlikely]] { + on_unmapped(); + return nullptr; + } + + // Avoid adding any extra logic to this fast-path block + const uintptr_t raw_pointer = current_page_table->pointers[vaddr >> SUYU_PAGEBITS].Raw(); + if (const uintptr_t pointer = Common::PageTable::PageInfo::ExtractPointer(raw_pointer)) { + return reinterpret_cast(pointer + vaddr); + } + switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) { + case Common::PageType::Unmapped: + on_unmapped(); + return nullptr; + case Common::PageType::Memory: + ASSERT_MSG(false, "Mapped memory page without a pointer @ 0x{:016X}", vaddr); + return nullptr; + case Common::PageType::DebugMemory: + return GetPointerFromDebugMemory(vaddr); + case Common::PageType::RasterizerCachedMemory: { + u8* const host_ptr{GetPointerFromRasterizerCachedMemory(vaddr)}; + on_rasterizer(); + return host_ptr; + } + default: + UNREACHABLE(); + } + return nullptr; + } + + [[nodiscard]] u8* GetPointer(const Common::ProcessAddress vaddr) const { + return GetPointerImpl( + GetInteger(vaddr), + [vaddr]() { + LOG_ERROR(HW_Memory, "Unmapped GetPointer @ 0x{:016X}", GetInteger(vaddr)); + }, + []() {}); + } + + [[nodiscard]] u8* GetPointerSilent(const Common::ProcessAddress vaddr) const { + return GetPointerImpl( + GetInteger(vaddr), []() {}, []() {}); + } + + /** + * Reads a particular data type out of memory at the given virtual address. + * + * @param vaddr The virtual address to read the data type from. + * + * @tparam T The data type to read out of memory. This type *must* be + * trivially copyable, otherwise the behavior of this function + * is undefined. + * + * @returns The instance of T read from the specified virtual address. 
+ */ + template + T Read(Common::ProcessAddress vaddr) { + T result = 0; + const u8* const ptr = GetPointerImpl( + GetInteger(vaddr), + [vaddr]() { + LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:016X}", sizeof(T) * 8, + GetInteger(vaddr)); + }, + [&]() { HandleRasterizerDownload(GetInteger(vaddr), sizeof(T)); }); + if (ptr) { + std::memcpy(&result, ptr, sizeof(T)); + } + return result; + } + + /** + * Writes a particular data type to memory at the given virtual address. + * + * @param vaddr The virtual address to write the data type to. + * + * @tparam T The data type to write to memory. This type *must* be + * trivially copyable, otherwise the behavior of this function + * is undefined. + */ template void Write(Common::ProcessAddress vaddr, const T data) { - u8* const ptr = GetPointerFromRasterizerCachedMemory(GetInteger(vaddr)); + u8* const ptr = GetPointerImpl( + GetInteger(vaddr), + [vaddr, data]() { + LOG_ERROR(HW_Memory, "Unmapped Write{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8, + GetInteger(vaddr), static_cast(data)); + }, + [&]() { HandleRasterizerWrite(GetInteger(vaddr), sizeof(T)); }); if (ptr) { std::memcpy(ptr, &data, sizeof(T)); - system.GPU().InvalidateRegion(GetInteger(vaddr), sizeof(T)); - } else { - LOG_ERROR(HW_Memory, "Unmapped Write{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8, - GetInteger(vaddr), static_cast(data)); } } template bool WriteExclusive(Common::ProcessAddress vaddr, const T data, const T expected) { - u8* const ptr = GetPointerFromRasterizerCachedMemory(GetInteger(vaddr)); + u8* const ptr = GetPointerImpl( + GetInteger(vaddr), + [vaddr, data]() { + LOG_ERROR(HW_Memory, "Unmapped WriteExclusive{} @ 0x{:016X} = 0x{:016X}", + sizeof(T) * 8, GetInteger(vaddr), static_cast(data)); + }, + [&]() { HandleRasterizerWrite(GetInteger(vaddr), sizeof(T)); }); if (ptr) { - const bool result = Common::AtomicCompareAndSwap(reinterpret_cast(ptr), data, expected); - if (result) { - system.GPU().InvalidateRegion(GetInteger(vaddr), sizeof(T)); + return Common::AtomicCompareAndSwap(reinterpret_cast(ptr), data, expected); + } + return true; + } + + bool WriteExclusive128(Common::ProcessAddress vaddr, const u128 data, const u128 expected) { + u8* const ptr = GetPointerImpl( + GetInteger(vaddr), + [vaddr, data]() { + LOG_ERROR(HW_Memory, "Unmapped WriteExclusive128 @ 0x{:016X} = 0x{:016X}{:016X}", + GetInteger(vaddr), static_cast(data[1]), static_cast(data[0])); + }, + [&]() { HandleRasterizerWrite(GetInteger(vaddr), sizeof(u128)); }); + if (ptr) { + return Common::AtomicCompareAndSwap(reinterpret_cast(ptr), data, expected); + } + return true; + } + + void HandleRasterizerDownload(VAddr v_address, size_t size) { + const auto* p = GetPointerImpl( + v_address, []() {}, []() {}); + if (!gpu_device_memory) [[unlikely]] { + gpu_device_memory = &system.Host1x().MemoryManager(); + } + const size_t core = system.GetCurrentHostThreadID(); + auto& current_area = rasterizer_read_areas[core]; + gpu_device_memory->ApplyOpOnPointer(p, scratch_buffers[core], [&](DAddr address) { + const DAddr end_address = address + size; + if (current_area.start_address <= address && end_address <= current_area.end_address) + [[likely]] { + return; } - return result; - } else { - LOG_ERROR(HW_Memory, "Unmapped WriteExclusive{} @ 0x{:016X} = 0x{:016X}", sizeof(T) * 8, - GetInteger(vaddr), static_cast(data)); - return true; - } + current_area = system.GPU().OnCPURead(address, size); + }); } - bool ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer, - const std::size_t size) { - const u8* src_ptr = 
GetPointerFromRasterizerCachedMemory(GetInteger(src_addr)); - if (src_ptr) { - std::memcpy(dest_buffer, src_ptr, size); - return true; + void HandleRasterizerWrite(VAddr v_address, size_t size) { + const auto* p = GetPointerImpl( + v_address, []() {}, []() {}); + constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1; + const size_t core = std::min(system.GetCurrentHostThreadID(), + sys_core); // any other calls threads go to syscore. + if (!gpu_device_memory) [[unlikely]] { + gpu_device_memory = &system.Host1x().MemoryManager(); } - LOG_ERROR(HW_Memory, "Unmapped ReadBlock @ 0x{:016X}", GetInteger(src_addr)); - return false; + // Guard on sys_core; + if (core == sys_core) [[unlikely]] { + sys_core_guard.lock(); + } + SCOPE_EXIT { + if (core == sys_core) [[unlikely]] { + sys_core_guard.unlock(); + } + }; + gpu_device_memory->ApplyOpOnPointer(p, scratch_buffers[core], [&](DAddr address) { + auto& current_area = rasterizer_write_areas[core]; + PAddr subaddress = address >> SUYU_PAGEBITS; + bool do_collection = current_area.last_address == subaddress; + if (!do_collection) [[unlikely]] { + do_collection = system.GPU().OnCPUWrite(address, size); + if (!do_collection) { + return; + } + current_area.last_address = subaddress; + } + gpu_dirty_managers[core].Collect(address, size); + }); } - bool WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer, - const std::size_t size) { - u8* const dest_ptr = GetPointerFromRasterizerCachedMemory(GetInteger(dest_addr)); - if (dest_ptr) { - std::memcpy(dest_ptr, src_buffer, size); - system.GPU().InvalidateRegion(GetInteger(dest_addr), size); - return true; + struct GPUDirtyState { + PAddr last_address; + }; + + void InvalidateGPUMemory(u8* p, size_t size) { + constexpr size_t sys_core = Core::Hardware::NUM_CPU_CORES - 1; + const size_t core = std::min(system.GetCurrentHostThreadID(), + sys_core); // any other calls threads go to syscore. 
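+        // Lazily bind the Host1x device memory manager the first time any thread needs it.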
+ if (!gpu_device_memory) [[unlikely]] { + gpu_device_memory = &system.Host1x().MemoryManager(); } - LOG_ERROR(HW_Memory, "Unmapped WriteBlock @ 0x{:016X}", GetInteger(dest_addr)); - return false; + // Guard on sys_core; + if (core == sys_core) [[unlikely]] { + sys_core_guard.lock(); + } + SCOPE_EXIT { + if (core == sys_core) [[unlikely]] { + sys_core_guard.unlock(); + } + }; + auto& gpu = system.GPU(); + gpu_device_memory->ApplyOpOnPointer( + p, scratch_buffers[core], [&](DAddr address) { gpu.InvalidateRegion(address, size); }); } Core::System& system; + Tegra::MaxwellDeviceMemoryManager* gpu_device_memory{}; Common::PageTable* current_page_table = nullptr; + std::array + rasterizer_read_areas{}; + std::array rasterizer_write_areas{}; + std::array, Core::Hardware::NUM_CPU_CORES> scratch_buffers{}; + std::span gpu_dirty_managers; + std::mutex sys_core_guard; + std::optional heap_tracker; #ifdef __linux__ Common::HeapTracker* buffer{}; @@ -284,10 +893,16 @@ struct Memory::Impl { #endif }; -Memory::Memory(Core::System& system_) : impl{std::make_unique(system_)} {} +Memory::Memory(Core::System& system_) : system{system_} { + Reset(); +} Memory::~Memory() = default; +void Memory::Reset() { + impl = std::make_unique(system); +} + void Memory::SetCurrentPageTable(Kernel::KProcess& process) { impl->SetCurrentPageTable(process); } @@ -310,20 +925,38 @@ void Memory::ProtectRegion(Common::PageTable& page_table, Common::ProcessAddress bool Memory::IsValidVirtualAddress(const Common::ProcessAddress vaddr) const { const auto& page_table = *impl->current_page_table; - const size_t page = vaddr >> PAGE_BITS; + const size_t page = vaddr >> SUYU_PAGEBITS; if (page >= page_table.pointers.size()) { return false; } const auto [pointer, type] = page_table.pointers[page].PointerType(); - return pointer != 0 || type == Common::PageType::RasterizerCachedMemory; + return pointer != 0 || type == Common::PageType::RasterizerCachedMemory || + type == Common::PageType::DebugMemory; +} + +bool Memory::IsValidVirtualAddressRange(Common::ProcessAddress base, u64 size) const { + Common::ProcessAddress end = base + size; + Common::ProcessAddress page = Common::AlignDown(GetInteger(base), SUYU_PAGESIZE); + + for (; page < end; page += SUYU_PAGESIZE) { + if (!IsValidVirtualAddress(page)) { + return false; + } + } + + return true; } u8* Memory::GetPointer(Common::ProcessAddress vaddr) { - return impl->GetPointerFromRasterizerCachedMemory(GetInteger(vaddr)); + return impl->GetPointer(vaddr); +} + +u8* Memory::GetPointerSilent(Common::ProcessAddress vaddr) { + return impl->GetPointerSilent(vaddr); } const u8* Memory::GetPointer(Common::ProcessAddress vaddr) const { - return impl->GetPointerFromRasterizerCachedMemory(GetInteger(vaddr)); + return impl->GetPointer(vaddr); } u8 Memory::Read8(const Common::ProcessAddress addr) { @@ -374,6 +1007,10 @@ bool Memory::WriteExclusive64(Common::ProcessAddress addr, u64 data, u64 expecte return impl->WriteExclusive64(addr, data, expected); } +bool Memory::WriteExclusive128(Common::ProcessAddress addr, u128 data, u128 expected) { + return impl->WriteExclusive128(addr, data, expected); +} + std::string Memory::ReadCString(Common::ProcessAddress vaddr, std::size_t max_length) { return impl->ReadCString(vaddr, max_length); } @@ -383,9 +1020,93 @@ bool Memory::ReadBlock(const Common::ProcessAddress src_addr, void* dest_buffer, return impl->ReadBlock(src_addr, dest_buffer, size); } +bool Memory::ReadBlockUnsafe(const Common::ProcessAddress src_addr, void* dest_buffer, + const std::size_t size) { 
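+    // Unsafe variant: Impl::ReadBlockImpl<true> skips the rasterizer download handling,
+    // so the caller must know the range is not GPU-cached.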
+ return impl->ReadBlockUnsafe(src_addr, dest_buffer, size); +} + +const u8* Memory::GetSpan(const VAddr src_addr, const std::size_t size) const { + return impl->GetSpan(src_addr, size); +} + +u8* Memory::GetSpan(const VAddr src_addr, const std::size_t size) { + return impl->GetSpan(src_addr, size); +} + bool Memory::WriteBlock(const Common::ProcessAddress dest_addr, const void* src_buffer, const std::size_t size) { return impl->WriteBlock(dest_addr, src_buffer, size); } +bool Memory::WriteBlockUnsafe(const Common::ProcessAddress dest_addr, const void* src_buffer, + const std::size_t size) { + return impl->WriteBlockUnsafe(dest_addr, src_buffer, size); +} + +bool Memory::CopyBlock(Common::ProcessAddress dest_addr, Common::ProcessAddress src_addr, + const std::size_t size) { + return impl->CopyBlock(dest_addr, src_addr, size); +} + +bool Memory::ZeroBlock(Common::ProcessAddress dest_addr, const std::size_t size) { + return impl->ZeroBlock(dest_addr, size); +} + +void Memory::SetGPUDirtyManagers(std::span managers) { + impl->gpu_dirty_managers = managers; +} + +Result Memory::InvalidateDataCache(Common::ProcessAddress dest_addr, const std::size_t size) { + return impl->InvalidateDataCache(dest_addr, size); +} + +Result Memory::StoreDataCache(Common::ProcessAddress dest_addr, const std::size_t size) { + return impl->StoreDataCache(dest_addr, size); +} + +Result Memory::FlushDataCache(Common::ProcessAddress dest_addr, const std::size_t size) { + return impl->FlushDataCache(dest_addr, size); +} + +void Memory::RasterizerMarkRegionCached(Common::ProcessAddress vaddr, u64 size, bool cached) { + impl->RasterizerMarkRegionCached(GetInteger(vaddr), size, cached); +} + +void Memory::MarkRegionDebug(Common::ProcessAddress vaddr, u64 size, bool debug) { + impl->MarkRegionDebug(GetInteger(vaddr), size, debug); +} + +bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) { + [[maybe_unused]] bool mapped = true; + [[maybe_unused]] bool rasterizer = false; + + u8* const ptr = impl->GetPointerImpl( + GetInteger(vaddr), + [&] { + LOG_ERROR(HW_Memory, "Unmapped InvalidateNCE for {} bytes @ {:#x}", size, + GetInteger(vaddr)); + mapped = false; + }, + [&] { rasterizer = true; }); + if (rasterizer) { + impl->InvalidateGPUMemory(ptr, size); + } + +#ifdef __linux__ + if (!rasterizer && mapped) { + impl->buffer->DeferredMapSeparateHeap(GetInteger(vaddr)); + } +#endif + + return mapped && ptr != nullptr; +} + +bool Memory::InvalidateSeparateHeap(void* fault_address) { +#ifdef __linux__ + return impl->buffer->DeferredMapSeparateHeap(static_cast(fault_address)); +#else + return false; +#endif +} + } // namespace Core::Memory diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index b8c07d791a..c816f47fec 100644 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -40,23 +40,10 @@ struct GPU::Impl { explicit Impl(GPU& gpu_, Core::System& system_, bool is_async_, bool use_nvdec_) : gpu{gpu_}, system{system_}, host1x{system.Host1x()}, use_nvdec{use_nvdec_}, shader_notify{std::make_unique()}, is_async{is_async_}, - gpu_thread{system_, is_async_}, scheduler{std::make_unique(gpu)} { - Initialize(); - } + gpu_thread{system_, is_async_}, scheduler{std::make_unique(gpu)} {} ~Impl() = default; - void Initialize() { - // Initialize the GPU memory manager - memory_manager = std::make_unique(system); - - // Initialize the command buffer - command_buffer.reserve(COMMAND_BUFFER_SIZE); - - // Initialize the fence manager - fence_manager = std::make_unique(); - } - std::shared_ptr CreateChannel(s32 
channel_id) { auto channel_state = std::make_shared(channel_id); channels.emplace(channel_id, channel_state); @@ -104,15 +91,14 @@ struct GPU::Impl { /// Flush all current written commands into the host GPU for execution. void FlushCommands() { - if (!command_buffer.empty()) { - rasterizer->ExecuteCommands(command_buffer); - command_buffer.clear(); - } + rasterizer->FlushCommands(); } /// Synchronizes CPU writes with Host GPU memory. void InvalidateGPUCache() { - rasterizer->InvalidateGPUCache(); + std::function callback_writes( + [this](PAddr address, size_t size) { rasterizer->OnCacheInvalidation(address, size); }); + system.GatherGPUDirtyMemory(callback_writes); } /// Signal the ending of command list. @@ -122,10 +108,11 @@ struct GPU::Impl { } /// Request a host GPU memory flush from the CPU. - u64 RequestSyncOperation(std::function&& action) { + template + [[nodiscard]] u64 RequestSyncOperation(Func&& action) { std::unique_lock lck{sync_request_mutex}; const u64 fence = ++last_sync_fence; - sync_requests.emplace_back(std::move(action), fence); + sync_requests.emplace_back(action); return fence; } @@ -143,12 +130,12 @@ struct GPU::Impl { void TickWork() { std::unique_lock lck{sync_request_mutex}; while (!sync_requests.empty()) { - auto& request = sync_requests.front(); + auto request = std::move(sync_requests.front()); + sync_requests.pop_front(); sync_request_mutex.unlock(); - request.first(); + request(); current_sync_fence.fetch_add(1, std::memory_order_release); sync_request_mutex.lock(); - sync_requests.pop_front(); sync_request_cv.notify_all(); } } @@ -235,6 +222,7 @@ struct GPU::Impl { /// This can be used to launch any necessary threads and register any necessary /// core timing events. void Start() { + Settings::UpdateGPUAccuracy(); gpu_thread.StartThread(*renderer, renderer->Context(), *scheduler); } @@ -264,7 +252,7 @@ struct GPU::Impl { /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory void FlushRegion(DAddr addr, u64 size) { - rasterizer->FlushRegion(addr, size); + gpu_thread.FlushRegion(addr, size); } VideoCore::RasterizerDownloadArea OnCPURead(DAddr addr, u64 size) { @@ -284,7 +272,7 @@ struct GPU::Impl { /// Notify rasterizer that any caches of the specified region should be invalidated void InvalidateRegion(DAddr addr, u64 size) { - rasterizer->InvalidateRegion(addr, size); + gpu_thread.InvalidateRegion(addr, size); } bool OnCPUWrite(DAddr addr, u64 size) { @@ -293,7 +281,57 @@ struct GPU::Impl { /// Notify rasterizer that any caches of the specified region should be flushed and invalidated void FlushAndInvalidateRegion(DAddr addr, u64 size) { - rasterizer->FlushAndInvalidateRegion(addr, size); + gpu_thread.FlushAndInvalidateRegion(addr, size); + } + + void RequestComposite(std::vector&& layers, + std::vector&& fences) { + size_t num_fences{fences.size()}; + size_t current_request_counter{}; + { + std::unique_lock lk(request_swap_mutex); + if (free_swap_counters.empty()) { + current_request_counter = request_swap_counters.size(); + request_swap_counters.emplace_back(num_fences); + } else { + current_request_counter = free_swap_counters.front(); + request_swap_counters[current_request_counter] = num_fences; + free_swap_counters.pop_front(); + } + } + const auto wait_fence = + RequestSyncOperation([this, current_request_counter, &layers, &fences, num_fences] { + auto& syncpoint_manager = host1x.GetSyncpointManager(); + if (num_fences == 0) { + renderer->Composite(layers); + } + const auto executer = [this, 
current_request_counter, layers_copy = layers]() { + { + std::unique_lock lk(request_swap_mutex); + if (--request_swap_counters[current_request_counter] != 0) { + return; + } + free_swap_counters.push_back(current_request_counter); + } + renderer->Composite(layers_copy); + }; + for (size_t i = 0; i < num_fences; i++) { + syncpoint_manager.RegisterGuestAction(fences[i].id, fences[i].value, executer); + } + }); + gpu_thread.TickGPU(); + WaitForSyncOperation(wait_fence); + } + + std::vector GetAppletCaptureBuffer() { + std::vector out; + + const auto wait_fence = + RequestSyncOperation([&] { out = renderer->GetAppletCaptureBuffer(); }); + gpu_thread.TickGPU(); + WaitForSyncOperation(wait_fence); + + return out; } GPU& gpu; @@ -310,12 +348,16 @@ struct GPU::Impl { /// When true, we are about to shut down emulation session, so terminate outstanding tasks std::atomic_bool shutting_down{}; + std::array, Service::Nvidia::MaxSyncPoints> syncpoints{}; + + std::array, Service::Nvidia::MaxSyncPoints> syncpt_interrupts; + std::mutex sync_mutex; std::mutex device_mutex; std::condition_variable sync_cv; - std::list, u64>> sync_requests; + std::list> sync_requests; std::atomic current_sync_fence{}; u64 last_sync_fence{}; std::mutex sync_request_mutex; @@ -331,13 +373,182 @@ struct GPU::Impl { Tegra::Control::ChannelState* current_channel; s32 bound_channel{-1}; - std::unique_ptr memory_manager; - std::vector command_buffer; - std::unique_ptr fence_manager; - - static constexpr size_t COMMAND_BUFFER_SIZE = 4 * 1024 * 1024; + std::deque free_swap_counters; + std::deque request_swap_counters; + std::mutex request_swap_mutex; }; -// ... (rest of the implementation remains the same) +GPU::GPU(Core::System& system, bool is_async, bool use_nvdec) + : impl{std::make_unique(*this, system, is_async, use_nvdec)} {} + +GPU::~GPU() = default; + +std::shared_ptr GPU::AllocateChannel() { + return impl->AllocateChannel(); +} + +void GPU::InitChannel(Control::ChannelState& to_init, u64 program_id) { + impl->InitChannel(to_init, program_id); +} + +void GPU::BindChannel(s32 channel_id) { + impl->BindChannel(channel_id); +} + +void GPU::ReleaseChannel(Control::ChannelState& to_release) { + impl->ReleaseChannel(to_release); +} + +void GPU::InitAddressSpace(Tegra::MemoryManager& memory_manager) { + impl->InitAddressSpace(memory_manager); +} + +void GPU::BindRenderer(std::unique_ptr renderer) { + impl->BindRenderer(std::move(renderer)); +} + +void GPU::FlushCommands() { + impl->FlushCommands(); +} + +void GPU::InvalidateGPUCache() { + impl->InvalidateGPUCache(); +} + +void GPU::OnCommandListEnd() { + impl->OnCommandListEnd(); +} + +u64 GPU::RequestFlush(DAddr addr, std::size_t size) { + return impl->RequestSyncOperation( + [this, addr, size]() { impl->rasterizer->FlushRegion(addr, size); }); +} + +u64 GPU::CurrentSyncRequestFence() const { + return impl->CurrentSyncRequestFence(); +} + +void GPU::WaitForSyncOperation(u64 fence) { + return impl->WaitForSyncOperation(fence); +} + +void GPU::TickWork() { + impl->TickWork(); +} + +/// Gets a mutable reference to the Host1x interface +Host1x::Host1x& GPU::Host1x() { + return impl->host1x; +} + +/// Gets an immutable reference to the Host1x interface. 
+const Host1x::Host1x& GPU::Host1x() const { + return impl->host1x; +} + +Engines::Maxwell3D& GPU::Maxwell3D() { + return impl->Maxwell3D(); +} + +const Engines::Maxwell3D& GPU::Maxwell3D() const { + return impl->Maxwell3D(); +} + +Engines::KeplerCompute& GPU::KeplerCompute() { + return impl->KeplerCompute(); +} + +const Engines::KeplerCompute& GPU::KeplerCompute() const { + return impl->KeplerCompute(); +} + +Tegra::DmaPusher& GPU::DmaPusher() { + return impl->DmaPusher(); +} + +const Tegra::DmaPusher& GPU::DmaPusher() const { + return impl->DmaPusher(); +} + +VideoCore::RendererBase& GPU::Renderer() { + return impl->Renderer(); +} + +const VideoCore::RendererBase& GPU::Renderer() const { + return impl->Renderer(); +} + +VideoCore::ShaderNotify& GPU::ShaderNotify() { + return impl->ShaderNotify(); +} + +const VideoCore::ShaderNotify& GPU::ShaderNotify() const { + return impl->ShaderNotify(); +} + +void GPU::RequestComposite(std::vector&& layers, + std::vector&& fences) { + impl->RequestComposite(std::move(layers), std::move(fences)); +} + +std::vector GPU::GetAppletCaptureBuffer() { + return impl->GetAppletCaptureBuffer(); +} + +u64 GPU::GetTicks() const { + return impl->GetTicks(); +} + +bool GPU::IsAsync() const { + return impl->IsAsync(); +} + +bool GPU::UseNvdec() const { + return impl->UseNvdec(); +} + +void GPU::RendererFrameEndNotify() { + impl->RendererFrameEndNotify(); +} + +void GPU::Start() { + impl->Start(); +} + +void GPU::NotifyShutdown() { + impl->NotifyShutdown(); +} + +void GPU::ObtainContext() { + impl->ObtainContext(); +} + +void GPU::ReleaseContext() { + impl->ReleaseContext(); +} + +void GPU::PushGPUEntries(s32 channel, Tegra::CommandList&& entries) { + impl->PushGPUEntries(channel, std::move(entries)); +} + +VideoCore::RasterizerDownloadArea GPU::OnCPURead(PAddr addr, u64 size) { + return impl->OnCPURead(addr, size); +} + +void GPU::FlushRegion(DAddr addr, u64 size) { + impl->FlushRegion(addr, size); +} + +void GPU::InvalidateRegion(DAddr addr, u64 size) { + impl->InvalidateRegion(addr, size); +} + +bool GPU::OnCPUWrite(DAddr addr, u64 size) { + return impl->OnCPUWrite(addr, size); +} + +void GPU::FlushAndInvalidateRegion(DAddr addr, u64 size) { + impl->FlushAndInvalidateRegion(addr, size); +} } // namespace Tegra diff --git a/src/video_core/optimized_rasterizer.cpp b/src/video_core/optimized_rasterizer.cpp deleted file mode 100644 index 02631f3c56..0000000000 --- a/src/video_core/optimized_rasterizer.cpp +++ /dev/null @@ -1,221 +0,0 @@ -#include "video_core/optimized_rasterizer.h" -#include "common/settings.h" -#include "video_core/gpu.h" -#include "video_core/memory_manager.h" -#include "video_core/engines/maxwell_3d.h" - -namespace VideoCore { - -OptimizedRasterizer::OptimizedRasterizer(Core::System& system, Tegra::GPU& gpu) - : system{system}, gpu{gpu}, memory_manager{gpu.MemoryManager()} { - InitializeShaderCache(); -} - -OptimizedRasterizer::~OptimizedRasterizer() = default; - -void OptimizedRasterizer::Draw(bool is_indexed, u32 instance_count) { - MICROPROFILE_SCOPE(GPU_Rasterization); - - PrepareRendertarget(); - UpdateDynamicState(); - - if (is_indexed) { - DrawIndexed(instance_count); - } else { - DrawArrays(instance_count); - } -} - -void OptimizedRasterizer::Clear(u32 layer_count) { - MICROPROFILE_SCOPE(GPU_Rasterization); - - PrepareRendertarget(); - ClearFramebuffer(layer_count); -} - -void OptimizedRasterizer::DispatchCompute() { - MICROPROFILE_SCOPE(GPU_Compute); - - PrepareCompute(); - LaunchComputeShader(); -} - -void 
OptimizedRasterizer::ResetCounter(VideoCommon::QueryType type) { - query_cache.ResetCounter(type); -} - -void OptimizedRasterizer::Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) { - query_cache.Query(gpu_addr, type, flags, payload, subreport); -} - -void OptimizedRasterizer::FlushAll() { - MICROPROFILE_SCOPE(GPU_Synchronization); - - FlushShaderCache(); - FlushRenderTargets(); -} - -void OptimizedRasterizer::FlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) { - MICROPROFILE_SCOPE(GPU_Synchronization); - - if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) { - FlushMemoryRegion(addr, size); - } -} - -bool OptimizedRasterizer::MustFlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) { - if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) { - return IsRegionCached(addr, size); - } - return false; -} - -RasterizerDownloadArea OptimizedRasterizer::GetFlushArea(DAddr addr, u64 size) { - return GetFlushableArea(addr, size); -} - -void OptimizedRasterizer::InvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) { - MICROPROFILE_SCOPE(GPU_Synchronization); - - if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) { - InvalidateMemoryRegion(addr, size); - } -} - -void OptimizedRasterizer::OnCacheInvalidation(PAddr addr, u64 size) { - MICROPROFILE_SCOPE(GPU_Synchronization); - - InvalidateCachedRegion(addr, size); -} - -bool OptimizedRasterizer::OnCPUWrite(PAddr addr, u64 size) { - return HandleCPUWrite(addr, size); -} - -void OptimizedRasterizer::InvalidateGPUCache() { - MICROPROFILE_SCOPE(GPU_Synchronization); - - InvalidateAllCache(); -} - -void OptimizedRasterizer::UnmapMemory(DAddr addr, u64 size) { - MICROPROFILE_SCOPE(GPU_Synchronization); - - UnmapGPUMemoryRegion(addr, size); -} - -void OptimizedRasterizer::ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) { - MICROPROFILE_SCOPE(GPU_Synchronization); - - UpdateMappedGPUMemory(as_id, addr, size); -} - -void OptimizedRasterizer::FlushAndInvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) { - MICROPROFILE_SCOPE(GPU_Synchronization); - - if (which == VideoCommon::CacheType::All || which == VideoCommon::CacheType::Unified) { - FlushAndInvalidateMemoryRegion(addr, size); - } -} - -void OptimizedRasterizer::WaitForIdle() { - MICROPROFILE_SCOPE(GPU_Synchronization); - - WaitForGPUIdle(); -} - -void OptimizedRasterizer::FragmentBarrier() { - MICROPROFILE_SCOPE(GPU_Synchronization); - - InsertFragmentBarrier(); -} - -void OptimizedRasterizer::TiledCacheBarrier() { - MICROPROFILE_SCOPE(GPU_Synchronization); - - InsertTiledCacheBarrier(); -} - -void OptimizedRasterizer::FlushCommands() { - MICROPROFILE_SCOPE(GPU_Synchronization); - - SubmitCommands(); -} - -void OptimizedRasterizer::TickFrame() { - MICROPROFILE_SCOPE(GPU_Synchronization); - - EndFrame(); -} - -void OptimizedRasterizer::PrepareRendertarget() { - const auto& regs{gpu.Maxwell3D().regs}; - const auto& framebuffer{regs.framebuffer}; - - render_targets.resize(framebuffer.num_color_buffers); - for (std::size_t index = 0; index < framebuffer.num_color_buffers; ++index) { - render_targets[index] = GetColorBuffer(index); - } - - depth_stencil = GetDepthBuffer(); -} - -void OptimizedRasterizer::UpdateDynamicState() { - const auto& regs{gpu.Maxwell3D().regs}; - - UpdateViewport(regs.viewport_transform); - UpdateScissor(regs.scissor_test); - 
UpdateDepthBias(regs.polygon_offset_units, regs.polygon_offset_clamp, regs.polygon_offset_factor); - UpdateBlendConstants(regs.blend_color); - UpdateStencilFaceMask(regs.stencil_front_func_mask, regs.stencil_back_func_mask); -} - -void OptimizedRasterizer::DrawIndexed(u32 instance_count) { - const auto& draw_state{gpu.Maxwell3D().draw_manager->GetDrawState()}; - const auto& index_buffer{memory_manager.ReadBlockUnsafe(draw_state.index_buffer.Address(), - draw_state.index_buffer.size)}; - - shader_cache.BindComputeShader(); - shader_cache.BindGraphicsShader(); - - DrawElementsInstanced(draw_state.topology, draw_state.index_buffer.count, - draw_state.index_buffer.format, index_buffer.data(), instance_count); -} - -void OptimizedRasterizer::DrawArrays(u32 instance_count) { - const auto& draw_state{gpu.Maxwell3D().draw_manager->GetDrawState()}; - - shader_cache.BindComputeShader(); - shader_cache.BindGraphicsShader(); - - DrawArraysInstanced(draw_state.topology, draw_state.vertex_buffer.first, - draw_state.vertex_buffer.count, instance_count); -} - -void OptimizedRasterizer::ClearFramebuffer(u32 layer_count) { - const auto& regs{gpu.Maxwell3D().regs}; - const auto& clear_state{regs.clear_buffers}; - - if (clear_state.R || clear_state.G || clear_state.B || clear_state.A) { - ClearColorBuffers(clear_state.R, clear_state.G, clear_state.B, clear_state.A, - regs.clear_color[0], regs.clear_color[1], regs.clear_color[2], - regs.clear_color[3], layer_count); - } - - if (clear_state.Z || clear_state.S) { - ClearDepthStencilBuffer(clear_state.Z, clear_state.S, regs.clear_depth, regs.clear_stencil, - layer_count); - } -} - -void OptimizedRasterizer::PrepareCompute() { - shader_cache.BindComputeShader(); -} - -void OptimizedRasterizer::LaunchComputeShader() { - const auto& launch_desc{gpu.KeplerCompute().launch_description}; - DispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); -} - -} // namespace VideoCore \ No newline at end of file diff --git a/src/video_core/optimized_rasterizer.h b/src/video_core/optimized_rasterizer.h deleted file mode 100644 index 9c9fe1f35e..0000000000 --- a/src/video_core/optimized_rasterizer.h +++ /dev/null @@ -1,73 +0,0 @@ -#pragma once - -#include -#include -#include "common/common_types.h" -#include "video_core/rasterizer_interface.h" -#include "video_core/engines/maxwell_3d.h" - -namespace Core { -class System; -} - -namespace Tegra { -class GPU; -class MemoryManager; -} - -namespace VideoCore { - -class ShaderCache; -class QueryCache; - -class OptimizedRasterizer final : public RasterizerInterface { -public: - explicit OptimizedRasterizer(Core::System& system, Tegra::GPU& gpu); - ~OptimizedRasterizer() override; - - void Draw(bool is_indexed, u32 instance_count) override; - void Clear(u32 layer_count) override; - void DispatchCompute() override; - void ResetCounter(VideoCommon::QueryType type) override; - void Query(GPUVAddr gpu_addr, VideoCommon::QueryType type, - VideoCommon::QueryPropertiesFlags flags, u32 payload, u32 subreport) override; - void FlushAll() override; - void FlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override; - bool MustFlushRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override; - RasterizerDownloadArea GetFlushArea(DAddr addr, u64 size) override; - void InvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override; - void OnCacheInvalidation(PAddr addr, u64 size) override; - bool OnCPUWrite(PAddr addr, u64 size) override; - void InvalidateGPUCache() override; - void 
UnmapMemory(DAddr addr, u64 size) override; - void ModifyGPUMemory(size_t as_id, GPUVAddr addr, u64 size) override; - void FlushAndInvalidateRegion(DAddr addr, u64 size, VideoCommon::CacheType which) override; - void WaitForIdle() override; - void FragmentBarrier() override; - void TiledCacheBarrier() override; - void FlushCommands() override; - void TickFrame() override; - -private: - void PrepareRendertarget(); - void UpdateDynamicState(); - void DrawIndexed(u32 instance_count); - void DrawArrays(u32 instance_count); - void ClearFramebuffer(u32 layer_count); - void PrepareCompute(); - void LaunchComputeShader(); - - Core::System& system; - Tegra::GPU& gpu; - Tegra::MemoryManager& memory_manager; - - std::unique_ptr shader_cache; - std::unique_ptr query_cache; - - std::vector render_targets; - DepthStencilConfig depth_stencil; - - // Add any additional member variables needed for the optimized rasterizer -}; - -} // namespace VideoCore \ No newline at end of file diff --git a/src/video_core/shader_cache.cpp b/src/video_core/shader_cache.cpp index c32bd88b22..a281f5d541 100644 --- a/src/video_core/shader_cache.cpp +++ b/src/video_core/shader_cache.cpp @@ -3,18 +3,9 @@ #include #include -#include -#include -#include -#include -#include #include #include "common/assert.h" -#include "common/fs/file.h" -#include "common/fs/path_util.h" -#include "common/logging/log.h" -#include "common/thread_worker.h" #include "shader_recompiler/frontend/maxwell/control_flow.h" #include "shader_recompiler/object_pool.h" #include "video_core/control/channel_state.h" @@ -28,288 +19,233 @@ namespace VideoCommon { -constexpr size_t MAX_SHADER_CACHE_SIZE = 1024 * 1024 * 1024; // 1GB - -class ShaderCacheWorker : public Common::ThreadWorker { -public: - explicit ShaderCacheWorker(const std::string& name) : ThreadWorker(name) {} - ~ShaderCacheWorker() = default; - - void CompileShader(ShaderInfo* shader) { - Push([shader]() { - // Compile shader here - // This is a placeholder for the actual compilation process - std::this_thread::sleep_for(std::chrono::milliseconds(10)); - shader->is_compiled.store(true, std::memory_order_release); - }); - } -}; - -class ShaderCache::Impl { -public: - explicit Impl(Tegra::MaxwellDeviceMemoryManager& device_memory_) - : device_memory{device_memory_}, workers{CreateWorkers()} { - LoadCache(); - } - - ~Impl() { - SaveCache(); - } - - void InvalidateRegion(VAddr addr, size_t size) { - std::scoped_lock lock{invalidation_mutex}; - InvalidatePagesInRegion(addr, size); - RemovePendingShaders(); - } - - void OnCacheInvalidation(VAddr addr, size_t size) { - std::scoped_lock lock{invalidation_mutex}; - InvalidatePagesInRegion(addr, size); - } - - void SyncGuestHost() { - std::scoped_lock lock{invalidation_mutex}; - RemovePendingShaders(); - } - - bool RefreshStages(std::array& unique_hashes); - const ShaderInfo* ComputeShader(); - void GetGraphicsEnvironments(GraphicsEnvironments& result, const std::array& unique_hashes); - - ShaderInfo* TryGet(VAddr addr) const { - std::scoped_lock lock{lookup_mutex}; - - const auto it = lookup_cache.find(addr); - if (it == lookup_cache.end()) { - return nullptr; - } - return it->second->data; - } - - void Register(std::unique_ptr data, VAddr addr, size_t size) { - std::scoped_lock lock{invalidation_mutex, lookup_mutex}; - - const VAddr addr_end = addr + size; - Entry* const entry = NewEntry(addr, addr_end, data.get()); - - const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS; - for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) { 
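
Both the deleted pimpl-style cache and the re-inlined ShaderCache further below track cached shaders at page granularity: Register() appends the new entry to the bucket of every page its address range touches, and invalidation walks only the buckets overlapped by the dirtied range. A minimal sketch of that bookkeeping, assuming hypothetical PageMap and kPageBits names and 4 KiB pages; the real code uses SUYU_PAGEBITS/SUYU_PAGESIZE, stores Entry pointers, and also updates device-memory cached-page counts.

// Sketch of page-bucket bookkeeping; PageMap and kPageBits are hypothetical.
#include <cstddef>
#include <cstdint>
#include <unordered_map>
#include <vector>

constexpr uint64_t kPageBits = 12;  // assumption: 4 KiB pages
constexpr uint64_t kPageSize = 1ULL << kPageBits;

struct PageMap {
    std::unordered_map<uint64_t, std::vector<int>> buckets;  // page -> entry ids

    // Register an entry covering [addr, addr + size) in every page it touches.
    void Register(int id, uint64_t addr, size_t size) {
        const uint64_t addr_end = addr + size;
        const uint64_t page_end = (addr_end + kPageSize - 1) >> kPageBits;
        for (uint64_t page = addr >> kPageBits; page < page_end; ++page) {
            buckets[page].push_back(id);
        }
    }

    // Collect every entry id whose pages overlap [addr, addr + size).
    std::vector<int> Overlapping(uint64_t addr, size_t size) const {
        std::vector<int> out;
        const uint64_t addr_end = addr + size;
        const uint64_t page_end = (addr_end + kPageSize - 1) >> kPageBits;
        for (uint64_t page = addr >> kPageBits; page < page_end; ++page) {
            if (const auto it = buckets.find(page); it != buckets.end()) {
                out.insert(out.end(), it->second.begin(), it->second.end());
            }
        }
        return out;
    }
};

Invalidating a range then reduces to Overlapping(addr, size) followed by whatever eviction the caller needs, which mirrors the shape of InvalidatePagesInRegion() below.
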
- invalidation_cache[page].push_back(entry); - } - - storage.push_back(std::move(data)); - - device_memory.UpdatePagesCachedCount(addr, size, 1); - } - -private: - std::vector> CreateWorkers() { - const size_t num_workers = std::thread::hardware_concurrency(); - std::vector> workers; - workers.reserve(num_workers); - for (size_t i = 0; i < num_workers; ++i) { - workers.emplace_back(std::make_unique(fmt::format("ShaderWorker{}", i))); - } - return workers; - } - - void LoadCache() { - const auto cache_dir = Common::FS::GetSuyuPath(Common::FS::SuyuPath::ShaderDir); - std::filesystem::create_directories(cache_dir); - - const auto cache_file = cache_dir / "shader_cache.bin"; - if (!std::filesystem::exists(cache_file)) { - return; - } - - std::ifstream file(cache_file, std::ios::binary); - if (!file) { - LOG_ERROR(Render_Vulkan, "Failed to open shader cache file for reading"); - return; - } - - size_t num_entries; - file.read(reinterpret_cast(&num_entries), sizeof(num_entries)); - - for (size_t i = 0; i < num_entries; ++i) { - VAddr addr; - size_t size; - file.read(reinterpret_cast(&addr), sizeof(addr)); - file.read(reinterpret_cast(&size), sizeof(size)); - - auto info = std::make_unique(); - file.read(reinterpret_cast(info.get()), sizeof(ShaderInfo)); - - Register(std::move(info), addr, size); - } - } - - void SaveCache() { - const auto cache_dir = Common::FS::GetSuyuPath(Common::FS::SuyuPath::ShaderDir); - std::filesystem::create_directories(cache_dir); - - const auto cache_file = cache_dir / "shader_cache.bin"; - std::ofstream file(cache_file, std::ios::binary | std::ios::trunc); - if (!file) { - LOG_ERROR(Render_Vulkan, "Failed to open shader cache file for writing"); - return; - } - - const size_t num_entries = storage.size(); - file.write(reinterpret_cast(&num_entries), sizeof(num_entries)); - - for (const auto& shader : storage) { - const VAddr addr = shader->addr; - const size_t size = shader->size_bytes; - file.write(reinterpret_cast(&addr), sizeof(addr)); - file.write(reinterpret_cast(&size), sizeof(size)); - file.write(reinterpret_cast(shader.get()), sizeof(ShaderInfo)); - } - } - - void InvalidatePagesInRegion(VAddr addr, size_t size) { - const VAddr addr_end = addr + size; - const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS; - for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) { - auto it = invalidation_cache.find(page); - if (it == invalidation_cache.end()) { - continue; - } - InvalidatePageEntries(it->second, addr, addr_end); - } - } - - void RemovePendingShaders() { - if (marked_for_removal.empty()) { - return; - } - // Remove duplicates - std::sort(marked_for_removal.begin(), marked_for_removal.end()); - marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()), - marked_for_removal.end()); - - std::vector removed_shaders; - - std::scoped_lock lock{lookup_mutex}; - for (Entry* const entry : marked_for_removal) { - removed_shaders.push_back(entry->data); - - const auto it = lookup_cache.find(entry->addr_start); - ASSERT(it != lookup_cache.end()); - lookup_cache.erase(it); - } - marked_for_removal.clear(); - - if (!removed_shaders.empty()) { - RemoveShadersFromStorage(removed_shaders); - } - } - - void InvalidatePageEntries(std::vector& entries, VAddr addr, VAddr addr_end) { - size_t index = 0; - while (index < entries.size()) { - Entry* const entry = entries[index]; - if (!entry->Overlaps(addr, addr_end)) { - ++index; - continue; - } - - UnmarkMemory(entry); - RemoveEntryFromInvalidationCache(entry); - 
marked_for_removal.push_back(entry); - } - } - - void RemoveEntryFromInvalidationCache(const Entry* entry) { - const u64 page_end = (entry->addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS; - for (u64 page = entry->addr_start >> SUYU_PAGEBITS; page < page_end; ++page) { - const auto entries_it = invalidation_cache.find(page); - ASSERT(entries_it != invalidation_cache.end()); - std::vector& entries = entries_it->second; - - const auto entry_it = std::find(entries.begin(), entries.end(), entry); - ASSERT(entry_it != entries.end()); - entries.erase(entry_it); - } - } - - void UnmarkMemory(Entry* entry) { - if (!entry->is_memory_marked) { - return; - } - entry->is_memory_marked = false; - - const VAddr addr = entry->addr_start; - const size_t size = entry->addr_end - addr; - device_memory.UpdatePagesCachedCount(addr, size, -1); - } - - void RemoveShadersFromStorage(const std::vector& removed_shaders) { - storage.erase( - std::remove_if(storage.begin(), storage.end(), - [&removed_shaders](const std::unique_ptr& shader) { - return std::find(removed_shaders.begin(), removed_shaders.end(), - shader.get()) != removed_shaders.end(); - }), - storage.end()); - } - - Entry* NewEntry(VAddr addr, VAddr addr_end, ShaderInfo* data) { - auto entry = std::make_unique(Entry{addr, addr_end, data}); - Entry* const entry_pointer = entry.get(); - - lookup_cache.emplace(addr, std::move(entry)); - return entry_pointer; - } - - Tegra::MaxwellDeviceMemoryManager& device_memory; - std::vector> workers; - - mutable std::mutex lookup_mutex; - std::mutex invalidation_mutex; - - std::unordered_map> lookup_cache; - std::unordered_map> invalidation_cache; - std::vector> storage; - std::vector marked_for_removal; -}; - -ShaderCache::ShaderCache(Tegra::MaxwellDeviceMemoryManager& device_memory_) - : impl{std::make_unique(device_memory_)} {} - -ShaderCache::~ShaderCache() = default; - void ShaderCache::InvalidateRegion(VAddr addr, size_t size) { - impl->InvalidateRegion(addr, size); + std::scoped_lock lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); + RemovePendingShaders(); } void ShaderCache::OnCacheInvalidation(VAddr addr, size_t size) { - impl->OnCacheInvalidation(addr, size); + std::scoped_lock lock{invalidation_mutex}; + InvalidatePagesInRegion(addr, size); } void ShaderCache::SyncGuestHost() { - impl->SyncGuestHost(); + std::scoped_lock lock{invalidation_mutex}; + RemovePendingShaders(); } +ShaderCache::ShaderCache(Tegra::MaxwellDeviceMemoryManager& device_memory_) + : device_memory{device_memory_} {} + bool ShaderCache::RefreshStages(std::array& unique_hashes) { - return impl->RefreshStages(unique_hashes); + auto& dirty{maxwell3d->dirty.flags}; + if (!dirty[VideoCommon::Dirty::Shaders]) { + return last_shaders_valid; + } + dirty[VideoCommon::Dirty::Shaders] = false; + + const GPUVAddr base_addr{maxwell3d->regs.program_region.Address()}; + for (size_t index = 0; index < Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; ++index) { + if (!maxwell3d->regs.IsShaderConfigEnabled(index)) { + unique_hashes[index] = 0; + continue; + } + const auto& shader_config{maxwell3d->regs.pipelines[index]}; + const auto program{static_cast(index)}; + if (program == Tegra::Engines::Maxwell3D::Regs::ShaderType::Pixel && + !maxwell3d->regs.rasterize_enable) { + unique_hashes[index] = 0; + continue; + } + const GPUVAddr shader_addr{base_addr + shader_config.offset}; + const std::optional cpu_shader_addr{gpu_memory->GpuToCpuAddress(shader_addr)}; + if (!cpu_shader_addr) { + LOG_ERROR(HW_GPU, "Invalid GPU address for shader 
0x{:016x}", shader_addr); + last_shaders_valid = false; + return false; + } + const ShaderInfo* shader_info{TryGet(*cpu_shader_addr)}; + if (!shader_info) { + const u32 start_address{shader_config.offset}; + GraphicsEnvironment env{*maxwell3d, *gpu_memory, program, base_addr, start_address}; + shader_info = MakeShaderInfo(env, *cpu_shader_addr); + } + shader_infos[index] = shader_info; + unique_hashes[index] = shader_info->unique_hash; + } + last_shaders_valid = true; + return true; } const ShaderInfo* ShaderCache::ComputeShader() { - return impl->ComputeShader(); + const GPUVAddr program_base{kepler_compute->regs.code_loc.Address()}; + const auto& qmd{kepler_compute->launch_description}; + const GPUVAddr shader_addr{program_base + qmd.program_start}; + const std::optional cpu_shader_addr{gpu_memory->GpuToCpuAddress(shader_addr)}; + if (!cpu_shader_addr) { + LOG_ERROR(HW_GPU, "Invalid GPU address for shader 0x{:016x}", shader_addr); + return nullptr; + } + if (const ShaderInfo* const shader = TryGet(*cpu_shader_addr)) { + return shader; + } + ComputeEnvironment env{*kepler_compute, *gpu_memory, program_base, qmd.program_start}; + return MakeShaderInfo(env, *cpu_shader_addr); } void ShaderCache::GetGraphicsEnvironments(GraphicsEnvironments& result, const std::array& unique_hashes) { - impl->GetGraphicsEnvironments(result, unique_hashes); + size_t env_index{}; + const GPUVAddr base_addr{maxwell3d->regs.program_region.Address()}; + for (size_t index = 0; index < NUM_PROGRAMS; ++index) { + if (unique_hashes[index] == 0) { + continue; + } + const auto program{static_cast(index)}; + auto& env{result.envs[index]}; + const u32 start_address{maxwell3d->regs.pipelines[index].offset}; + env = GraphicsEnvironment{*maxwell3d, *gpu_memory, program, base_addr, start_address}; + env.SetCachedSize(shader_infos[index]->size_bytes); + result.env_ptrs[env_index++] = &env; + } } ShaderInfo* ShaderCache::TryGet(VAddr addr) const { - return impl->TryGet(addr); + std::scoped_lock lock{lookup_mutex}; + + const auto it = lookup_cache.find(addr); + if (it == lookup_cache.end()) { + return nullptr; + } + return it->second->data; } void ShaderCache::Register(std::unique_ptr data, VAddr addr, size_t size) { - impl->Register(std::move(data), addr, size); + std::scoped_lock lock{invalidation_mutex, lookup_mutex}; + + const VAddr addr_end = addr + size; + Entry* const entry = NewEntry(addr, addr_end, data.get()); + + const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS; + for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) { + invalidation_cache[page].push_back(entry); + } + + storage.push_back(std::move(data)); + + device_memory.UpdatePagesCachedCount(addr, size, 1); +} + +void ShaderCache::InvalidatePagesInRegion(VAddr addr, size_t size) { + const VAddr addr_end = addr + size; + const u64 page_end = (addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS; + for (u64 page = addr >> SUYU_PAGEBITS; page < page_end; ++page) { + auto it = invalidation_cache.find(page); + if (it == invalidation_cache.end()) { + continue; + } + InvalidatePageEntries(it->second, addr, addr_end); + } +} + +void ShaderCache::RemovePendingShaders() { + if (marked_for_removal.empty()) { + return; + } + // Remove duplicates + std::ranges::sort(marked_for_removal); + marked_for_removal.erase(std::unique(marked_for_removal.begin(), marked_for_removal.end()), + marked_for_removal.end()); + + boost::container::small_vector removed_shaders; + + std::scoped_lock lock{lookup_mutex}; + for (Entry* const entry : marked_for_removal) { + 
removed_shaders.push_back(entry->data); + + const auto it = lookup_cache.find(entry->addr_start); + ASSERT(it != lookup_cache.end()); + lookup_cache.erase(it); + } + marked_for_removal.clear(); + + if (!removed_shaders.empty()) { + RemoveShadersFromStorage(removed_shaders); + } +} + +void ShaderCache::InvalidatePageEntries(std::vector& entries, VAddr addr, VAddr addr_end) { + size_t index = 0; + while (index < entries.size()) { + Entry* const entry = entries[index]; + if (!entry->Overlaps(addr, addr_end)) { + ++index; + continue; + } + + UnmarkMemory(entry); + RemoveEntryFromInvalidationCache(entry); + marked_for_removal.push_back(entry); + } +} + +void ShaderCache::RemoveEntryFromInvalidationCache(const Entry* entry) { + const u64 page_end = (entry->addr_end + SUYU_PAGESIZE - 1) >> SUYU_PAGEBITS; + for (u64 page = entry->addr_start >> SUYU_PAGEBITS; page < page_end; ++page) { + const auto entries_it = invalidation_cache.find(page); + ASSERT(entries_it != invalidation_cache.end()); + std::vector& entries = entries_it->second; + + const auto entry_it = std::ranges::find(entries, entry); + ASSERT(entry_it != entries.end()); + entries.erase(entry_it); + } +} + +void ShaderCache::UnmarkMemory(Entry* entry) { + if (!entry->is_memory_marked) { + return; + } + entry->is_memory_marked = false; + + const VAddr addr = entry->addr_start; + const size_t size = entry->addr_end - addr; + device_memory.UpdatePagesCachedCount(addr, size, -1); +} + +void ShaderCache::RemoveShadersFromStorage(std::span removed_shaders) { + // Remove them from the cache + std::erase_if(storage, [&removed_shaders](const std::unique_ptr& shader) { + return std::ranges::find(removed_shaders, shader.get()) != removed_shaders.end(); + }); +} + +ShaderCache::Entry* ShaderCache::NewEntry(VAddr addr, VAddr addr_end, ShaderInfo* data) { + auto entry = std::make_unique(Entry{addr, addr_end, data}); + Entry* const entry_pointer = entry.get(); + + lookup_cache.emplace(addr, std::move(entry)); + return entry_pointer; +} + +const ShaderInfo* ShaderCache::MakeShaderInfo(GenericEnvironment& env, VAddr cpu_addr) { + auto info = std::make_unique(); + if (const std::optional cached_hash{env.Analyze()}) { + info->unique_hash = *cached_hash; + info->size_bytes = env.CachedSizeBytes(); + } else { + // Slow path, not really hit on commercial games + // Build a control flow graph to get the real shader size + Shader::ObjectPool flow_block; + Shader::Maxwell::Flow::CFG cfg{env, flow_block, env.StartAddress()}; + info->unique_hash = env.CalculateHash(); + info->size_bytes = env.ReadSizeBytes(); + } + const size_t size_bytes{info->size_bytes}; + const ShaderInfo* const result{info.get()}; + Register(std::move(info), cpu_addr, size_bytes); + return result; } } // namespace VideoCommon
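
For reference, RemovePendingShaders() and RemoveShadersFromStorage() above implement a two-step eviction: deduplicate the marked entries, drop them from the address-keyed lookup map, then erase the owning storage whose raw pointer appears in the removed set. A condensed, self-contained sketch of that flow, using hypothetical Cache/Evict names and an int payload standing in for ShaderInfo:

// Condensed sketch of the "mark, deduplicate, then erase from lookup and
// storage" eviction flow. Cache, Evict and the int payload are hypothetical
// stand-ins for the real Entry/ShaderInfo types.
#include <algorithm>
#include <cstdint>
#include <memory>
#include <unordered_map>
#include <vector>

struct Cache {
    struct Entry {
        uint64_t addr_start;
        int* data;  // non-owning view into `storage`
    };

    std::unordered_map<uint64_t, std::unique_ptr<Entry>> lookup;  // addr -> entry
    std::vector<std::unique_ptr<int>> storage;                    // owning payload storage
    std::vector<Entry*> marked_for_removal;

    void Evict() {
        if (marked_for_removal.empty()) {
            return;
        }
        // Deduplicate: the same entry can be marked once per overlapping page.
        std::sort(marked_for_removal.begin(), marked_for_removal.end());
        marked_for_removal.erase(
            std::unique(marked_for_removal.begin(), marked_for_removal.end()),
            marked_for_removal.end());

        // Collect payload pointers and drop entries from the address lookup.
        std::vector<int*> removed;
        for (Entry* const entry : marked_for_removal) {
            removed.push_back(entry->data);
            lookup.erase(entry->addr_start);
        }
        marked_for_removal.clear();

        // Finally release the owning storage whose raw pointer was removed.
        std::erase_if(storage, [&](const std::unique_ptr<int>& payload) {
            return std::find(removed.begin(), removed.end(), payload.get()) != removed.end();
        });
    }
};
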