diff --git a/README.md b/README.md index f21b3a730..11bafb5c7 100755 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ yuzu emulator early access ============= -This is the source code for early-access 1331. +This is the source code for early-access 1332. ## Legal Notice diff --git a/dist/icons/controller/controller.qrc b/dist/icons/controller/controller.qrc index 1c4e960c0..78eae461c 100755 --- a/dist/icons/controller/controller.qrc +++ b/dist/icons/controller/controller.qrc @@ -1,26 +1,5 @@ - dual_joycon.png - dual_joycon_dark.png - dual_joycon_midnight.png - handheld.png - handheld_dark.png - handheld_midnight.png - pro_controller.png - pro_controller_dark.png - pro_controller_midnight.png - single_joycon_left.png - single_joycon_left_dark.png - single_joycon_left_midnight.png - single_joycon_right.png - single_joycon_right_dark.png - single_joycon_right_midnight.png - single_joycon_left_vertical.png - single_joycon_left_vertical_dark.png - single_joycon_left_vertical_midnight.png - single_joycon_right_vertical.png - single_joycon_right_vertical_dark.png - single_joycon_right_vertical_midnight.png applet_dual_joycon.png applet_dual_joycon_dark.png applet_dual_joycon_midnight.png diff --git a/src/common/uint128.cpp b/src/common/uint128.cpp index 16bf7c828..343693c2c 100755 --- a/src/common/uint128.cpp +++ b/src/common/uint128.cpp @@ -38,34 +38,4 @@ u64 MultiplyAndDivide64(u64 a, u64 b, u64 d) { #endif -u128 Multiply64Into128(u64 a, u64 b) { - u128 result; -#ifdef _MSC_VER - result[0] = _umul128(a, b, &result[1]); -#else - unsigned __int128 tmp = a; - tmp *= b; - std::memcpy(&result, &tmp, sizeof(u128)); -#endif - return result; -} - -std::pair Divide128On32(u128 dividend, u32 divisor) { - u64 remainder = dividend[0] % divisor; - u64 accum = dividend[0] / divisor; - if (dividend[1] == 0) - return {accum, remainder}; - // We ignore dividend[1] / divisor as that overflows - const u64 first_segment = (dividend[1] % divisor) << 32; - accum += (first_segment / divisor) << 32; - const u64 second_segment = (first_segment % divisor) << 32; - accum += (second_segment / divisor); - remainder += second_segment % divisor; - if (remainder >= divisor) { - accum++; - remainder -= divisor; - } - return {accum, remainder}; -} - } // namespace Common diff --git a/src/common/uint128.h b/src/common/uint128.h index 969259ab6..a313765be 100755 --- a/src/common/uint128.h +++ b/src/common/uint128.h @@ -12,11 +12,4 @@ namespace Common { // This function multiplies 2 u64 values and divides it by a u64 value. [[nodiscard]] u64 MultiplyAndDivide64(u64 a, u64 b, u64 d); -// This function multiplies 2 u64 values and produces a u128 value; -[[nodiscard]] u128 Multiply64Into128(u64 a, u64 b); - -// This function divides a u128 by a u32 value and produces two u64 values: -// the result of division and the remainder -[[nodiscard]] std::pair Divide128On32(u128 dividend, u32 divisor); - } // namespace Common diff --git a/src/common/wall_clock.cpp b/src/common/wall_clock.cpp index a8c143f85..c0505e898 100755 --- a/src/common/wall_clock.cpp +++ b/src/common/wall_clock.cpp @@ -2,7 +2,6 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include "common/uint128.h" #include "common/wall_clock.h" #ifdef ARCHITECTURE_x86_64 @@ -41,16 +40,11 @@ public: } u64 GetClockCycles() override { - std::chrono::nanoseconds time_now = GetTimeNS(); - const u128 temporary = - Common::Multiply64Into128(time_now.count(), emulated_clock_frequency); - return Common::Divide128On32(temporary, 1000000000).first; + return GetTimeNS().count() * (emulated_clock_frequency / 1000) / 1000000; } u64 GetCPUCycles() override { - std::chrono::nanoseconds time_now = GetTimeNS(); - const u128 temporary = Common::Multiply64Into128(time_now.count(), emulated_cpu_frequency); - return Common::Divide128On32(temporary, 1000000000).first; + return GetTimeNS().count() * (emulated_cpu_frequency / 1000) / 1000000; } void Pause([[maybe_unused]] bool is_paused) override { diff --git a/src/core/CMakeLists.txt b/src/core/CMakeLists.txt index 99310dc50..3906aaf9b 100755 --- a/src/core/CMakeLists.txt +++ b/src/core/CMakeLists.txt @@ -19,7 +19,6 @@ add_library(core STATIC core.h core_timing.cpp core_timing.h - core_timing_util.cpp core_timing_util.h cpu_manager.cpp cpu_manager.h diff --git a/src/core/core_timing_util.h b/src/core/core_timing_util.h index e4a046bf9..14c36a485 100755 --- a/src/core/core_timing_util.h +++ b/src/core/core_timing_util.h @@ -1,24 +1,59 @@ -// Copyright 2008 Dolphin Emulator Project / 2017 Citra Emulator Project -// Licensed under GPLv2+ +// Copyright 2020 yuzu Emulator Project +// Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once #include + #include "common/common_types.h" +#include "core/hardware_properties.h" namespace Core::Timing { -s64 msToCycles(std::chrono::milliseconds ms); -s64 usToCycles(std::chrono::microseconds us); -s64 nsToCycles(std::chrono::nanoseconds ns); -u64 msToClockCycles(std::chrono::milliseconds ns); -u64 usToClockCycles(std::chrono::microseconds ns); -u64 nsToClockCycles(std::chrono::nanoseconds ns); -std::chrono::milliseconds CyclesToMs(s64 cycles); -std::chrono::nanoseconds CyclesToNs(s64 cycles); -std::chrono::microseconds CyclesToUs(s64 cycles); +namespace detail { +constexpr u64 CNTFREQ_ADJUSTED = Hardware::CNTFREQ / 1000; +constexpr u64 BASE_CLOCK_RATE_ADJUSTED = Hardware::BASE_CLOCK_RATE / 1000; +} // namespace detail -u64 CpuCyclesToClockCycles(u64 ticks); +[[nodiscard]] constexpr s64 msToCycles(std::chrono::milliseconds ms) { + return ms.count() * detail::BASE_CLOCK_RATE_ADJUSTED; +} + +[[nodiscard]] constexpr s64 usToCycles(std::chrono::microseconds us) { + return us.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000; +} + +[[nodiscard]] constexpr s64 nsToCycles(std::chrono::nanoseconds ns) { + return ns.count() * detail::BASE_CLOCK_RATE_ADJUSTED / 1000000; +} + +[[nodiscard]] constexpr u64 msToClockCycles(std::chrono::milliseconds ms) { + return static_cast(ms.count()) * detail::CNTFREQ_ADJUSTED; +} + +[[nodiscard]] constexpr u64 usToClockCycles(std::chrono::microseconds us) { + return us.count() * detail::CNTFREQ_ADJUSTED / 1000; +} + +[[nodiscard]] constexpr u64 nsToClockCycles(std::chrono::nanoseconds ns) { + return ns.count() * detail::CNTFREQ_ADJUSTED / 1000000; +} + +[[nodiscard]] constexpr u64 CpuCyclesToClockCycles(u64 ticks) { + return ticks * detail::CNTFREQ_ADJUSTED / detail::BASE_CLOCK_RATE_ADJUSTED; +} + +[[nodiscard]] constexpr std::chrono::milliseconds CyclesToMs(s64 cycles) { + return std::chrono::milliseconds(cycles / detail::BASE_CLOCK_RATE_ADJUSTED); +} + +[[nodiscard]] constexpr std::chrono::nanoseconds CyclesToNs(s64 cycles) { + return std::chrono::nanoseconds(cycles * 1000000 / detail::BASE_CLOCK_RATE_ADJUSTED); +} + +[[nodiscard]] constexpr std::chrono::microseconds CyclesToUs(s64 cycles) { + return std::chrono::microseconds(cycles * 1000 / detail::BASE_CLOCK_RATE_ADJUSTED); +} } // namespace Core::Timing diff --git a/src/core/frontend/emu_window.cpp b/src/core/frontend/emu_window.cpp index 8c1193894..ee7a58b1c 100755 --- a/src/core/frontend/emu_window.cpp +++ b/src/core/frontend/emu_window.cpp @@ -21,21 +21,18 @@ public: std::mutex mutex; - bool touch_pressed = false; ///< True if touchpad area is currently pressed, otherwise false - - float touch_x = 0.0f; ///< Touchpad X-position - float touch_y = 0.0f; ///< Touchpad Y-position + Input::TouchStatus status; private: class Device : public Input::TouchDevice { public: explicit Device(std::weak_ptr&& touch_state) : touch_state(touch_state) {} - std::tuple GetStatus() const override { + Input::TouchStatus GetStatus() const override { if (auto state = touch_state.lock()) { std::lock_guard guard{state->mutex}; - return std::make_tuple(state->touch_x, state->touch_y, state->touch_pressed); + return state->status; } - return std::make_tuple(0.0f, 0.0f, false); + return {}; } private: @@ -79,36 +76,44 @@ std::tuple EmuWindow::ClipToTouchScreen(unsigned new_x, unsi return std::make_tuple(new_x, new_y); } -void EmuWindow::TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y) { - if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y)) +void EmuWindow::TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id) { + if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y)) { return; + } + if (id >= touch_state->status.size()) { + return; + } std::lock_guard guard{touch_state->mutex}; - touch_state->touch_x = + const float x = static_cast(framebuffer_x - framebuffer_layout.screen.left) / static_cast(framebuffer_layout.screen.right - framebuffer_layout.screen.left); - touch_state->touch_y = + const float y = static_cast(framebuffer_y - framebuffer_layout.screen.top) / static_cast(framebuffer_layout.screen.bottom - framebuffer_layout.screen.top); - touch_state->touch_pressed = true; + touch_state->status[id] = std::make_tuple(x, y, true); } -void EmuWindow::TouchReleased() { +void EmuWindow::TouchReleased(std::size_t id) { + if (id >= touch_state->status.size()) { + return; + } std::lock_guard guard{touch_state->mutex}; - touch_state->touch_pressed = false; - touch_state->touch_x = 0; - touch_state->touch_y = 0; + touch_state->status[id] = std::make_tuple(0.0f, 0.0f, false); } -void EmuWindow::TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y) { - if (!touch_state->touch_pressed) +void EmuWindow::TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id) { + if (id >= touch_state->status.size()) { + return; + } + if (!std::get<2>(touch_state->status[id])) return; if (!IsWithinTouchscreen(framebuffer_layout, framebuffer_x, framebuffer_y)) std::tie(framebuffer_x, framebuffer_y) = ClipToTouchScreen(framebuffer_x, framebuffer_y); - TouchPressed(framebuffer_x, framebuffer_y); + TouchPressed(framebuffer_x, framebuffer_y, id); } void EmuWindow::UpdateCurrentFramebufferLayout(unsigned width, unsigned height) { diff --git a/src/core/frontend/emu_window.h b/src/core/frontend/emu_window.h index 276d2b906..2436c6580 100755 --- a/src/core/frontend/emu_window.h +++ b/src/core/frontend/emu_window.h @@ -117,18 +117,23 @@ public: * Signal that a touch pressed event has occurred (e.g. mouse click pressed) * @param framebuffer_x Framebuffer x-coordinate that was pressed * @param framebuffer_y Framebuffer y-coordinate that was pressed + * @param id Touch event ID */ - void TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y); + void TouchPressed(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id); - /// Signal that a touch released event has occurred (e.g. mouse click released) - void TouchReleased(); + /** + * Signal that a touch released event has occurred (e.g. mouse click released) + * @param id Touch event ID + */ + void TouchReleased(std::size_t id); /** * Signal that a touch movement event has occurred (e.g. mouse was moved over the emu window) * @param framebuffer_x Framebuffer x-coordinate * @param framebuffer_y Framebuffer y-coordinate + * @param id Touch event ID */ - void TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y); + void TouchMoved(unsigned framebuffer_x, unsigned framebuffer_y, std::size_t id); /** * Returns currently active configuration. diff --git a/src/core/frontend/input.h b/src/core/frontend/input.h index de51a754e..88ebc6497 100755 --- a/src/core/frontend/input.h +++ b/src/core/frontend/input.h @@ -21,6 +21,11 @@ enum class AnalogDirection : u8 { UP, DOWN, }; +struct AnalogProperties { + float deadzone; + float range; + float threshold; +}; /// An abstract class template for an input device (a button, an analog input, etc.). template @@ -30,6 +35,12 @@ public: virtual StatusType GetStatus() const { return {}; } + virtual StatusType GetRawStatus() const { + return GetStatus(); + } + virtual AnalogProperties GetAnalogProperties() const { + return {}; + } virtual bool GetAnalogDirectionStatus([[maybe_unused]] AnalogDirection direction) const { return {}; } @@ -163,10 +174,11 @@ using MotionStatus = std::tuple, Common::Vec3, Common using MotionDevice = InputDevice; /** - * A touch status is an object that returns a tuple of two floats and a bool. The floats are - * x and y coordinates in the range 0.0 - 1.0, and the bool indicates whether it is pressed. + * A touch status is an object that returns an array of 16 tuple elements of two floats and a bool. + * The floats are x and y coordinates in the range 0.0 - 1.0, and the bool indicates whether it is + * pressed. */ -using TouchStatus = std::tuple; +using TouchStatus = std::array, 16>; /** * A touch device is an input device that returns a touch status object diff --git a/src/core/hle/service/hid/controllers/touchscreen.cpp b/src/core/hle/service/hid/controllers/touchscreen.cpp index 0df395e85..5219f2dad 100755 --- a/src/core/hle/service/hid/controllers/touchscreen.cpp +++ b/src/core/hle/service/hid/controllers/touchscreen.cpp @@ -2,6 +2,7 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +#include #include #include "common/common_types.h" #include "core/core_timing.h" @@ -16,7 +17,13 @@ constexpr std::size_t SHARED_MEMORY_OFFSET = 0x400; Controller_Touchscreen::Controller_Touchscreen(Core::System& system) : ControllerBase(system) {} Controller_Touchscreen::~Controller_Touchscreen() = default; -void Controller_Touchscreen::OnInit() {} +void Controller_Touchscreen::OnInit() { + for (std::size_t id = 0; id < MAX_FINGERS; ++id) { + mouse_finger_id[id] = MAX_FINGERS; + keyboard_finger_id[id] = MAX_FINGERS; + udp_finger_id[id] = MAX_FINGERS; + } +} void Controller_Touchscreen::OnRelease() {} @@ -40,38 +47,106 @@ void Controller_Touchscreen::OnUpdate(const Core::Timing::CoreTiming& core_timin cur_entry.sampling_number = last_entry.sampling_number + 1; cur_entry.sampling_number2 = cur_entry.sampling_number; - bool pressed = false; - float x, y; - std::tie(x, y, pressed) = touch_device->GetStatus(); - auto& touch_entry = cur_entry.states[0]; - touch_entry.attribute.raw = 0; - if (!pressed && touch_btn_device) { - std::tie(x, y, pressed) = touch_btn_device->GetStatus(); - } - if (pressed && Settings::values.touchscreen.enabled) { - touch_entry.x = static_cast(x * Layout::ScreenUndocked::Width); - touch_entry.y = static_cast(y * Layout::ScreenUndocked::Height); - touch_entry.diameter_x = Settings::values.touchscreen.diameter_x; - touch_entry.diameter_y = Settings::values.touchscreen.diameter_y; - touch_entry.rotation_angle = Settings::values.touchscreen.rotation_angle; - const u64 tick = core_timing.GetCPUTicks(); - touch_entry.delta_time = tick - last_touch; - last_touch = tick; - touch_entry.finger = Settings::values.touchscreen.finger; - cur_entry.entry_count = 1; - } else { - cur_entry.entry_count = 0; + const Input::TouchStatus& mouse_status = touch_mouse_device->GetStatus(); + const Input::TouchStatus& udp_status = touch_udp_device->GetStatus(); + for (std::size_t id = 0; id < mouse_status.size(); ++id) { + mouse_finger_id[id] = UpdateTouchInputEvent(mouse_status[id], mouse_finger_id[id]); + udp_finger_id[id] = UpdateTouchInputEvent(udp_status[id], udp_finger_id[id]); } + if (Settings::values.use_touch_from_button) { + const Input::TouchStatus& keyboard_status = touch_btn_device->GetStatus(); + for (std::size_t id = 0; id < mouse_status.size(); ++id) { + keyboard_finger_id[id] = + UpdateTouchInputEvent(keyboard_status[id], keyboard_finger_id[id]); + } + } + + std::array active_fingers; + const auto end_iter = std::copy_if(fingers.begin(), fingers.end(), active_fingers.begin(), + [](const auto& finger) { return finger.pressed; }); + const auto active_fingers_count = + static_cast(std::distance(active_fingers.begin(), end_iter)); + + const u64 tick = core_timing.GetCPUTicks(); + cur_entry.entry_count = static_cast(active_fingers_count); + for (std::size_t id = 0; id < MAX_FINGERS; ++id) { + auto& touch_entry = cur_entry.states[id]; + if (id < active_fingers_count) { + touch_entry.x = static_cast(active_fingers[id].x * Layout::ScreenUndocked::Width); + touch_entry.y = static_cast(active_fingers[id].y * Layout::ScreenUndocked::Height); + touch_entry.diameter_x = Settings::values.touchscreen.diameter_x; + touch_entry.diameter_y = Settings::values.touchscreen.diameter_y; + touch_entry.rotation_angle = Settings::values.touchscreen.rotation_angle; + touch_entry.delta_time = tick - active_fingers[id].last_touch; + fingers[active_fingers[id].id].last_touch = tick; + touch_entry.finger = active_fingers[id].id; + touch_entry.attribute.raw = active_fingers[id].attribute.raw; + } else { + // Clear touch entry + touch_entry.attribute.raw = 0; + touch_entry.x = 0; + touch_entry.y = 0; + touch_entry.diameter_x = 0; + touch_entry.diameter_y = 0; + touch_entry.rotation_angle = 0; + touch_entry.delta_time = 0; + touch_entry.finger = 0; + } + } std::memcpy(data + SHARED_MEMORY_OFFSET, &shared_memory, sizeof(TouchScreenSharedMemory)); } void Controller_Touchscreen::OnLoadInputDevices() { - touch_device = Input::CreateDevice(Settings::values.touchscreen.device); - if (Settings::values.use_touch_from_button) { - touch_btn_device = Input::CreateDevice("engine:touch_from_button"); - } else { - touch_btn_device.reset(); - } + touch_mouse_device = Input::CreateDevice("engine:emu_window"); + touch_udp_device = Input::CreateDevice("engine:cemuhookudp"); + touch_btn_device = Input::CreateDevice("engine:touch_from_button"); } + +std::optional Controller_Touchscreen::GetUnusedFingerID() const { + std::size_t first_free_id = 0; + while (first_free_id < MAX_FINGERS) { + if (!fingers[first_free_id].pressed) { + return first_free_id; + } else { + first_free_id++; + } + } + return std::nullopt; +} + +std::size_t Controller_Touchscreen::UpdateTouchInputEvent( + const std::tuple& touch_input, std::size_t finger_id) { + const auto& [x, y, pressed] = touch_input; + if (pressed) { + Attributes attribute{}; + if (finger_id == MAX_FINGERS) { + const auto first_free_id = GetUnusedFingerID(); + if (!first_free_id) { + // Invalid finger id do nothing + return MAX_FINGERS; + } + finger_id = first_free_id.value(); + fingers[finger_id].pressed = true; + fingers[finger_id].id = static_cast(finger_id); + attribute.start_touch.Assign(1); + } + fingers[finger_id].x = x; + fingers[finger_id].y = y; + fingers[finger_id].attribute = attribute; + return finger_id; + } + + if (finger_id != MAX_FINGERS) { + if (!fingers[finger_id].attribute.end_touch) { + fingers[finger_id].attribute.end_touch.Assign(1); + fingers[finger_id].attribute.start_touch.Assign(0); + return finger_id; + } + fingers[finger_id].pressed = false; + } + + return MAX_FINGERS; +} + } // namespace Service::HID diff --git a/src/core/hle/service/hid/controllers/touchscreen.h b/src/core/hle/service/hid/controllers/touchscreen.h index 4d9042adc..784124e25 100755 --- a/src/core/hle/service/hid/controllers/touchscreen.h +++ b/src/core/hle/service/hid/controllers/touchscreen.h @@ -30,6 +30,18 @@ public: void OnLoadInputDevices() override; private: + static constexpr std::size_t MAX_FINGERS = 16; + + // Returns an unused finger id, if there is no fingers available std::nullopt will be returned + std::optional GetUnusedFingerID() const; + + // If the touch is new it tries to assing a new finger id, if there is no fingers avaliable no + // changes will be made. Updates the coordinates if the finger id it's already set. If the touch + // ends delays the output by one frame to set the end_touch flag before finally freeing the + // finger id + std::size_t UpdateTouchInputEvent(const std::tuple& touch_input, + std::size_t finger_id); + struct Attributes { union { u32 raw{}; @@ -55,7 +67,7 @@ private: s64_le sampling_number; s64_le sampling_number2; s32_le entry_count; - std::array states; + std::array states; }; static_assert(sizeof(TouchScreenEntry) == 0x298, "TouchScreenEntry is an invalid size"); @@ -66,9 +78,23 @@ private: }; static_assert(sizeof(TouchScreenSharedMemory) == 0x3000, "TouchScreenSharedMemory is an invalid size"); + + struct Finger { + u64_le last_touch{}; + float x{}; + float y{}; + u32_le id{}; + bool pressed{}; + Attributes attribute; + }; + TouchScreenSharedMemory shared_memory{}; - std::unique_ptr touch_device; + std::unique_ptr touch_mouse_device; + std::unique_ptr touch_udp_device; std::unique_ptr touch_btn_device; - s64_le last_touch{}; + std::array mouse_finger_id; + std::array keyboard_finger_id; + std::array udp_finger_id; + std::array fingers; }; } // namespace Service::HID diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp index 36970f828..0ac5fc6ae 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec.cpp @@ -37,6 +37,7 @@ NvResult nvhost_nvdec::Ioctl1(Ioctl command, const std::vector& input, Tegra::ChCommandHeaderList cmdlist(1); cmdlist[0] = Tegra::ChCommandHeader{0xDEADB33F}; system.GPU().PushCommandBuffer(cmdlist); + system.GPU().MemoryManager().InvalidateQueuedCaches(); } return UnmapBuffer(input, output); } diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp index 4898dc27a..d1b5136da 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.cpp @@ -193,7 +193,13 @@ NvResult nvhost_nvdec_common::UnmapBuffer(const std::vector& input, std::vec return NvResult::InvalidState; } if (const auto size{RemoveBufferMap(object->dma_map_addr)}; size) { - gpu.MemoryManager().Unmap(object->dma_map_addr, *size); + if (vic_device) { + // UnmapVicFrame defers texture_cache invalidation of the frame address until + // the stream is over + gpu.MemoryManager().UnmapVicFrame(object->dma_map_addr, *size); + } else { + gpu.MemoryManager().Unmap(object->dma_map_addr, *size); + } } else { // This occurs quite frequently, however does not seem to impact functionality LOG_DEBUG(Service_NVDRV, "invalid offset=0x{:X} dma=0x{:X}", object->addr, diff --git a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h index 4c9d4ba41..2e0aece2f 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h +++ b/src/core/hle/service/nvdrv/devices/nvhost_nvdec_common.h @@ -160,6 +160,7 @@ protected: s32_le nvmap_fd{}; u32_le submit_timeout{}; + bool vic_device{}; std::shared_ptr nvmap_dev; SyncpointManager& syncpoint_manager; std::array device_syncpoints{}; diff --git a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp index 72499654c..9118ab254 100755 --- a/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp +++ b/src/core/hle/service/nvdrv/devices/nvhost_vic.cpp @@ -12,8 +12,9 @@ namespace Service::Nvidia::Devices { nvhost_vic::nvhost_vic(Core::System& system, std::shared_ptr nvmap_dev, SyncpointManager& syncpoint_manager) - : nvhost_nvdec_common(system, std::move(nvmap_dev), syncpoint_manager) {} - + : nvhost_nvdec_common(system, std::move(nvmap_dev), syncpoint_manager) { + vic_device = true; +} nvhost_vic::~nvhost_vic() = default; NvResult nvhost_vic::Ioctl1(Ioctl command, const std::vector& input, std::vector& output) { diff --git a/src/core/memory.cpp b/src/core/memory.cpp index 11609682a..961f1c028 100755 --- a/src/core/memory.cpp +++ b/src/core/memory.cpp @@ -195,8 +195,9 @@ struct Memory::Impl { switch (type) { case Common::PageType::Unmapped: { LOG_ERROR(HW_Memory, - "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", - current_vaddr, src_addr, size); + "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) " + "at PC 0x{:08X}", + current_vaddr, src_addr, size, system.CurrentArmInterface().GetPC()); std::memset(dest_buffer, 0, copy_amount); break; } @@ -240,8 +241,9 @@ struct Memory::Impl { switch (type) { case Common::PageType::Unmapped: { LOG_ERROR(HW_Memory, - "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", - current_vaddr, src_addr, size); + "Unmapped ReadBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) " + "at PC 0x{:08X}", + current_vaddr, src_addr, size, system.CurrentArmInterface().GetPC()); std::memset(dest_buffer, 0, copy_amount); break; } @@ -291,8 +293,9 @@ struct Memory::Impl { switch (type) { case Common::PageType::Unmapped: { LOG_ERROR(HW_Memory, - "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", - current_vaddr, dest_addr, size); + "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) " + "at PC 0x{:08X}", + current_vaddr, dest_addr, size, system.CurrentArmInterface().GetPC()); break; } case Common::PageType::Memory: { @@ -334,8 +337,9 @@ struct Memory::Impl { switch (type) { case Common::PageType::Unmapped: { LOG_ERROR(HW_Memory, - "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", - current_vaddr, dest_addr, size); + "Unmapped WriteBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) " + "at PC 0x{:08X}", + current_vaddr, dest_addr, size, system.CurrentArmInterface().GetPC()); break; } case Common::PageType::Memory: { @@ -383,8 +387,9 @@ struct Memory::Impl { switch (type) { case Common::PageType::Unmapped: { LOG_ERROR(HW_Memory, - "Unmapped ZeroBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", - current_vaddr, dest_addr, size); + "Unmapped ZeroBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) " + "at PC 0x{:08X}", + current_vaddr, dest_addr, size, system.CurrentArmInterface().GetPC()); break; } case Common::PageType::Memory: { @@ -429,8 +434,9 @@ struct Memory::Impl { switch (type) { case Common::PageType::Unmapped: { LOG_ERROR(HW_Memory, - "Unmapped CopyBlock @ 0x{:016X} (start address = 0x{:016X}, size = {})", - current_vaddr, src_addr, size); + "Unmapped CopyBlock @ 0x{:016X} (start address = 0x{:016X}, size = {}) " + "at PC 0x{:08X}", + current_vaddr, src_addr, size, system.CurrentArmInterface().GetPC()); ZeroBlock(process, dest_addr, copy_amount); break; } @@ -601,7 +607,8 @@ struct Memory::Impl { } switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) { case Common::PageType::Unmapped: - LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:08X}", sizeof(T) * 8, vaddr); + LOG_ERROR(HW_Memory, "Unmapped Read{} @ 0x{:08X} at PC 0x{:08X}", sizeof(T) * 8, vaddr, + system.CurrentArmInterface().GetPC()); return 0; case Common::PageType::Memory: ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr); @@ -638,8 +645,9 @@ struct Memory::Impl { } switch (Common::PageTable::PageInfo::ExtractType(raw_pointer)) { case Common::PageType::Unmapped: - LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X}", sizeof(data) * 8, - static_cast(data), vaddr); + LOG_ERROR(HW_Memory, "Unmapped Write{} 0x{:08X} @ 0x{:016X} at PC 0x{:08X}", + sizeof(data) * 8, static_cast(data), vaddr, + system.CurrentArmInterface().GetPC()); return; case Common::PageType::Memory: ASSERT_MSG(false, "Mapped memory page without a pointer @ {:016X}", vaddr); diff --git a/src/input_common/gcadapter/gc_poller.cpp b/src/input_common/gcadapter/gc_poller.cpp index 9670bdeb2..1b6ded8d6 100755 --- a/src/input_common/gcadapter/gc_poller.cpp +++ b/src/input_common/gcadapter/gc_poller.cpp @@ -185,6 +185,16 @@ public: return {0.0f, 0.0f}; } + std::tuple GetRawStatus() const override { + const float x = GetAxis(axis_x); + const float y = GetAxis(axis_y); + return {x, y}; + } + + Input::AnalogProperties GetAnalogProperties() const override { + return {deadzone, range, 0.5f}; + } + bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override { const auto [x, y] = GetStatus(); const float directional_deadzone = 0.5f; diff --git a/src/input_common/sdl/sdl_impl.cpp b/src/input_common/sdl/sdl_impl.cpp index d32eb732a..a87db6084 100755 --- a/src/input_common/sdl/sdl_impl.cpp +++ b/src/input_common/sdl/sdl_impl.cpp @@ -373,6 +373,16 @@ public: return {}; } + std::tuple GetRawStatus() const override { + const float x = joystick->GetAxis(axis_x, range); + const float y = joystick->GetAxis(axis_y, range); + return {x, -y}; + } + + Input::AnalogProperties GetAnalogProperties() const override { + return {deadzone, range, 0.5f}; + } + bool GetAnalogDirectionStatus(Input::AnalogDirection direction) const override { const auto [x, y] = GetStatus(); const float directional_deadzone = 0.5f; diff --git a/src/input_common/touch_from_button.cpp b/src/input_common/touch_from_button.cpp index a07124a86..ffbe4f2ed 100755 --- a/src/input_common/touch_from_button.cpp +++ b/src/input_common/touch_from_button.cpp @@ -25,18 +25,19 @@ public: } } - std::tuple GetStatus() const override { - for (const auto& m : map) { - const bool state = std::get<0>(m)->GetStatus(); + Input::TouchStatus GetStatus() const override { + Input::TouchStatus touch_status{}; + for (std::size_t id = 0; id < map.size() && id < touch_status.size(); ++id) { + const bool state = std::get<0>(map[id])->GetStatus(); if (state) { - const float x = static_cast(std::get<1>(m)) / + const float x = static_cast(std::get<1>(map[id])) / static_cast(Layout::ScreenUndocked::Width); - const float y = static_cast(std::get<2>(m)) / + const float y = static_cast(std::get<2>(map[id])) / static_cast(Layout::ScreenUndocked::Height); - return {x, y, true}; + touch_status[id] = {x, y, true}; } } - return {}; + return touch_status; } private: diff --git a/src/input_common/udp/client.cpp b/src/input_common/udp/client.cpp index 412d57896..e7e50d789 100755 --- a/src/input_common/udp/client.cpp +++ b/src/input_common/udp/client.cpp @@ -136,6 +136,7 @@ static void SocketLoop(Socket* socket) { Client::Client() { LOG_INFO(Input, "Udp Initialization started"); + finger_id.fill(MAX_TOUCH_FINGERS); ReloadSockets(); } @@ -176,7 +177,7 @@ void Client::ReloadSockets() { std::string server_token; std::size_t client = 0; while (std::getline(servers_ss, server_token, ',')) { - if (client == max_udp_clients) { + if (client == MAX_UDP_CLIENTS) { break; } std::stringstream server_ss(server_token); @@ -194,7 +195,7 @@ void Client::ReloadSockets() { for (std::size_t pad = 0; pad < 4; ++pad) { const std::size_t client_number = GetClientNumber(udp_input_address, udp_input_port, pad); - if (client_number != max_udp_clients) { + if (client_number != MAX_UDP_CLIENTS) { LOG_ERROR(Input, "Duplicated UDP servers found"); continue; } @@ -213,7 +214,7 @@ std::size_t Client::GetClientNumber(std::string_view host, u16 port, std::size_t return client; } } - return max_udp_clients; + return MAX_UDP_CLIENTS; } void Client::OnVersion([[maybe_unused]] Response::Version data) { @@ -259,33 +260,14 @@ void Client::OnPadData(Response::PadData data, std::size_t client) { std::lock_guard guard(clients[client].status.update_mutex); clients[client].status.motion_status = clients[client].motion.GetMotion(); - // TODO: add a setting for "click" touch. Click touch refers to a device that differentiates - // between a simple "tap" and a hard press that causes the touch screen to click. - const bool is_active = data.touch_1.is_active != 0; - - float x = 0; - float y = 0; - - if (is_active && clients[client].status.touch_calibration) { - const u16 min_x = clients[client].status.touch_calibration->min_x; - const u16 max_x = clients[client].status.touch_calibration->max_x; - const u16 min_y = clients[client].status.touch_calibration->min_y; - const u16 max_y = clients[client].status.touch_calibration->max_y; - - x = static_cast(std::clamp(static_cast(data.touch_1.x), min_x, max_x) - - min_x) / - static_cast(max_x - min_x); - y = static_cast(std::clamp(static_cast(data.touch_1.y), min_y, max_y) - - min_y) / - static_cast(max_y - min_y); + for (std::size_t id = 0; id < data.touch.size(); ++id) { + UpdateTouchInput(data.touch[id], client, id); } - clients[client].status.touch_status = {x, y, is_active}; - if (configuring) { const Common::Vec3f gyroscope = clients[client].motion.GetGyroscope(); const Common::Vec3f accelerometer = clients[client].motion.GetAcceleration(); - UpdateYuzuSettings(client, accelerometer, gyroscope, is_active); + UpdateYuzuSettings(client, accelerometer, gyroscope); } } } @@ -320,21 +302,17 @@ void Client::Reset() { } void Client::UpdateYuzuSettings(std::size_t client, const Common::Vec3& acc, - const Common::Vec3& gyro, bool touch) { + const Common::Vec3& gyro) { if (gyro.Length() > 0.2f) { - LOG_DEBUG(Input, "UDP Controller {}: gyro=({}, {}, {}), accel=({}, {}, {}), touch={}", - client, gyro[0], gyro[1], gyro[2], acc[0], acc[1], acc[2], touch); + LOG_DEBUG(Input, "UDP Controller {}: gyro=({}, {}, {}), accel=({}, {}, {})", client, + gyro[0], gyro[1], gyro[2], acc[0], acc[1], acc[2]); } UDPPadStatus pad{ .host = clients[client].host, .port = clients[client].port, .pad_index = clients[client].pad_index, }; - if (touch) { - pad.touch = PadTouch::Click; - pad_queue.Push(pad); - } - for (size_t i = 0; i < 3; ++i) { + for (std::size_t i = 0; i < 3; ++i) { if (gyro[i] > 5.0f || gyro[i] < -5.0f) { pad.motion = static_cast(i); pad.motion_value = gyro[i]; @@ -348,6 +326,50 @@ void Client::UpdateYuzuSettings(std::size_t client, const Common::Vec3& a } } +std::optional Client::GetUnusedFingerID() const { + std::size_t first_free_id = 0; + while (first_free_id < MAX_TOUCH_FINGERS) { + if (!std::get<2>(touch_status[first_free_id])) { + return first_free_id; + } else { + first_free_id++; + } + } + return std::nullopt; +} + +void Client::UpdateTouchInput(Response::TouchPad& touch_pad, std::size_t client, std::size_t id) { + // TODO: Use custom calibration per device + const Common::ParamPackage touch_param(Settings::values.touch_device); + const u16 min_x = static_cast(touch_param.Get("min_x", 100)); + const u16 min_y = static_cast(touch_param.Get("min_y", 50)); + const u16 max_x = static_cast(touch_param.Get("max_x", 1800)); + const u16 max_y = static_cast(touch_param.Get("max_y", 850)); + const std::size_t touch_id = client * 2 + id; + if (touch_pad.is_active) { + if (finger_id[touch_id] == MAX_TOUCH_FINGERS) { + const auto first_free_id = GetUnusedFingerID(); + if (!first_free_id) { + // Invalid finger id skip to next input + return; + } + finger_id[touch_id] = *first_free_id; + } + auto& [x, y, pressed] = touch_status[finger_id[touch_id]]; + x = static_cast(std::clamp(static_cast(touch_pad.x), min_x, max_x) - min_x) / + static_cast(max_x - min_x); + y = static_cast(std::clamp(static_cast(touch_pad.y), min_y, max_y) - min_y) / + static_cast(max_y - min_y); + pressed = true; + return; + } + + if (finger_id[touch_id] != MAX_TOUCH_FINGERS) { + touch_status[finger_id[touch_id]] = {}; + finger_id[touch_id] = MAX_TOUCH_FINGERS; + } +} + void Client::BeginConfiguration() { pad_queue.Clear(); configuring = true; @@ -360,7 +382,7 @@ void Client::EndConfiguration() { DeviceStatus& Client::GetPadState(const std::string& host, u16 port, std::size_t pad) { const std::size_t client_number = GetClientNumber(host, port, pad); - if (client_number == max_udp_clients) { + if (client_number == MAX_UDP_CLIENTS) { return clients[0].status; } return clients[client_number].status; @@ -368,12 +390,20 @@ DeviceStatus& Client::GetPadState(const std::string& host, u16 port, std::size_t const DeviceStatus& Client::GetPadState(const std::string& host, u16 port, std::size_t pad) const { const std::size_t client_number = GetClientNumber(host, port, pad); - if (client_number == max_udp_clients) { + if (client_number == MAX_UDP_CLIENTS) { return clients[0].status; } return clients[client_number].status; } +Input::TouchStatus& Client::GetTouchState() { + return touch_status; +} + +const Input::TouchStatus& Client::GetTouchState() const { + return touch_status; +} + Common::SPSCQueue& Client::GetPadQueue() { return pad_queue; } @@ -426,24 +456,24 @@ CalibrationConfigurationJob::CalibrationConfigurationJob( current_status = Status::Ready; status_callback(current_status); } - if (data.touch_1.is_active == 0) { + if (data.touch[0].is_active == 0) { return; } - LOG_DEBUG(Input, "Current touch: {} {}", data.touch_1.x, - data.touch_1.y); - min_x = std::min(min_x, static_cast(data.touch_1.x)); - min_y = std::min(min_y, static_cast(data.touch_1.y)); + LOG_DEBUG(Input, "Current touch: {} {}", data.touch[0].x, + data.touch[0].y); + min_x = std::min(min_x, static_cast(data.touch[0].x)); + min_y = std::min(min_y, static_cast(data.touch[0].y)); if (current_status == Status::Ready) { // First touch - min data (min_x/min_y) current_status = Status::Stage1Completed; status_callback(current_status); } - if (data.touch_1.x - min_x > CALIBRATION_THRESHOLD && - data.touch_1.y - min_y > CALIBRATION_THRESHOLD) { + if (data.touch[0].x - min_x > CALIBRATION_THRESHOLD && + data.touch[0].y - min_y > CALIBRATION_THRESHOLD) { // Set the current position as max value and finishes // configuration - max_x = data.touch_1.x; - max_y = data.touch_1.y; + max_x = data.touch[0].x; + max_y = data.touch[0].y; current_status = Status::Completed; data_callback(min_x, min_y, max_x, max_y); status_callback(current_status); diff --git a/src/input_common/udp/client.h b/src/input_common/udp/client.h index 00c8b09f5..822f9c550 100755 --- a/src/input_common/udp/client.h +++ b/src/input_common/udp/client.h @@ -28,6 +28,7 @@ class Socket; namespace Response { struct PadData; struct PortInfo; +struct TouchPad; struct Version; } // namespace Response @@ -50,7 +51,6 @@ struct UDPPadStatus { std::string host{"127.0.0.1"}; u16 port{26760}; std::size_t pad_index{}; - PadTouch touch{PadTouch::Undefined}; PadMotion motion{PadMotion::Undefined}; f32 motion_value{0.0f}; }; @@ -93,6 +93,9 @@ public: DeviceStatus& GetPadState(const std::string& host, u16 port, std::size_t pad); const DeviceStatus& GetPadState(const std::string& host, u16 port, std::size_t pad) const; + Input::TouchStatus& GetTouchState(); + const Input::TouchStatus& GetTouchState() const; + private: struct ClientData { std::string host{"127.0.0.1"}; @@ -122,14 +125,25 @@ private: void StartCommunication(std::size_t client, const std::string& host, u16 port, std::size_t pad_index, u32 client_id); void UpdateYuzuSettings(std::size_t client, const Common::Vec3& acc, - const Common::Vec3& gyro, bool touch); + const Common::Vec3& gyro); + + // Returns an unused finger id, if there is no fingers available std::nullopt will be + // returned + std::optional GetUnusedFingerID() const; + + // Merges and updates all touch inputs into the touch_status array + void UpdateTouchInput(Response::TouchPad& touch_pad, std::size_t client, std::size_t id); bool configuring = false; // Allocate clients for 8 udp servers - const std::size_t max_udp_clients = 32; - std::array clients; - Common::SPSCQueue pad_queue; + static constexpr std::size_t MAX_UDP_CLIENTS = 4 * 8; + // Each client can have up 2 touch inputs + static constexpr std::size_t MAX_TOUCH_FINGERS = MAX_UDP_CLIENTS * 2; + std::array clients{}; + Common::SPSCQueue pad_queue{}; + Input::TouchStatus touch_status{}; + std::array finger_id{}; }; /// An async job allowing configuration of the touchpad calibration. diff --git a/src/input_common/udp/protocol.h b/src/input_common/udp/protocol.h index fc1aea4b9..a3d276697 100755 --- a/src/input_common/udp/protocol.h +++ b/src/input_common/udp/protocol.h @@ -140,6 +140,14 @@ static_assert(sizeof(PortInfo) == 12, "UDP Response PortInfo struct has wrong si static_assert(std::is_trivially_copyable_v, "UDP Response PortInfo is not trivially copyable"); +struct TouchPad { + u8 is_active{}; + u8 id{}; + u16_le x{}; + u16_le y{}; +}; +static_assert(sizeof(TouchPad) == 6, "UDP Response TouchPad struct has wrong size "); + #pragma pack(push, 1) struct PadData { PortInfo info{}; @@ -190,12 +198,7 @@ struct PadData { u8 button_13{}; } analog_button; - struct TouchPad { - u8 is_active{}; - u8 id{}; - u16_le x{}; - u16_le y{}; - } touch_1, touch_2; + std::array touch; u64_le motion_timestamp; @@ -222,7 +225,6 @@ static_assert(sizeof(Message) == MAX_PACKET_SIZE, static_assert(sizeof(PadData::AnalogButton) == 12, "UDP Response AnalogButton struct has wrong size "); -static_assert(sizeof(PadData::TouchPad) == 6, "UDP Response TouchPad struct has wrong size "); static_assert(sizeof(PadData::Accelerometer) == 12, "UDP Response Accelerometer struct has wrong size "); static_assert(sizeof(PadData::Gyroscope) == 12, "UDP Response Gyroscope struct has wrong size "); diff --git a/src/input_common/udp/udp.cpp b/src/input_common/udp/udp.cpp index c5da27a38..b630281a0 100755 --- a/src/input_common/udp/udp.cpp +++ b/src/input_common/udp/udp.cpp @@ -78,8 +78,8 @@ public: explicit UDPTouch(std::string ip_, u16 port_, u16 pad_, CemuhookUDP::Client* client_) : ip(std::move(ip_)), port(port_), pad(pad_), client(client_) {} - std::tuple GetStatus() const override { - return client->GetPadState(ip, port, pad).touch_status; + Input::TouchStatus GetStatus() const override { + return client->GetTouchState(); } private: @@ -107,32 +107,4 @@ std::unique_ptr UDPTouchFactory::Create(const Common::ParamP return std::make_unique(std::move(ip), port, pad, client.get()); } -void UDPTouchFactory::BeginConfiguration() { - polling = true; - client->BeginConfiguration(); -} - -void UDPTouchFactory::EndConfiguration() { - polling = false; - client->EndConfiguration(); -} - -Common::ParamPackage UDPTouchFactory::GetNextInput() { - Common::ParamPackage params; - CemuhookUDP::UDPPadStatus pad; - auto& queue = client->GetPadQueue(); - while (queue.Pop(pad)) { - if (pad.touch == CemuhookUDP::PadTouch::Undefined) { - continue; - } - params.Set("engine", "cemuhookudp"); - params.Set("ip", pad.host); - params.Set("port", static_cast(pad.port)); - params.Set("pad_index", static_cast(pad.pad_index)); - params.Set("touch", static_cast(pad.touch)); - return params; - } - return params; -} - } // namespace InputCommon diff --git a/src/tests/video_core/buffer_base.cpp b/src/tests/video_core/buffer_base.cpp index 651633e9e..edced69bb 100755 --- a/src/tests/video_core/buffer_base.cpp +++ b/src/tests/video_core/buffer_base.cpp @@ -471,3 +471,79 @@ TEST_CASE("BufferBase: Unaligned page region query") { REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1000)); REQUIRE(buffer.IsRegionCpuModified(c + 4000, 1)); } + +TEST_CASE("BufferBase: Cached write") { + RasterizerInterface rasterizer; + BufferBase buffer(rasterizer, c, WORD); + buffer.UnmarkRegionAsCpuModified(c, WORD); + buffer.CachedCpuWrite(c + PAGE, PAGE); + REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); + buffer.FlushCachedWrites(); + REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); + buffer.MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("BufferBase: Multiple cached write") { + RasterizerInterface rasterizer; + BufferBase buffer(rasterizer, c, WORD); + buffer.UnmarkRegionAsCpuModified(c, WORD); + buffer.CachedCpuWrite(c + PAGE, PAGE); + buffer.CachedCpuWrite(c + PAGE * 3, PAGE); + REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!buffer.IsRegionCpuModified(c + PAGE * 3, PAGE)); + buffer.FlushCachedWrites(); + REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(buffer.IsRegionCpuModified(c + PAGE * 3, PAGE)); + buffer.MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("BufferBase: Cached write unmarked") { + RasterizerInterface rasterizer; + BufferBase buffer(rasterizer, c, WORD); + buffer.UnmarkRegionAsCpuModified(c, WORD); + buffer.CachedCpuWrite(c + PAGE, PAGE); + buffer.UnmarkRegionAsCpuModified(c + PAGE, PAGE); + REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); + buffer.FlushCachedWrites(); + REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); + buffer.MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("BufferBase: Cached write iterated") { + RasterizerInterface rasterizer; + BufferBase buffer(rasterizer, c, WORD); + buffer.UnmarkRegionAsCpuModified(c, WORD); + buffer.CachedCpuWrite(c + PAGE, PAGE); + int num = 0; + buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); + buffer.FlushCachedWrites(); + REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); + buffer.MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} + +TEST_CASE("BufferBase: Cached write downloads") { + RasterizerInterface rasterizer; + BufferBase buffer(rasterizer, c, WORD); + buffer.UnmarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 64); + buffer.CachedCpuWrite(c + PAGE, PAGE); + REQUIRE(rasterizer.Count() == 63); + buffer.MarkRegionAsGpuModified(c + PAGE, PAGE); + int num = 0; + buffer.ForEachDownloadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + buffer.ForEachUploadRange(c, WORD, [&](u64 offset, u64 size) { ++num; }); + REQUIRE(num == 0); + REQUIRE(!buffer.IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE)); + buffer.FlushCachedWrites(); + REQUIRE(buffer.IsRegionCpuModified(c + PAGE, PAGE)); + REQUIRE(!buffer.IsRegionGpuModified(c + PAGE, PAGE)); + buffer.MarkRegionAsCpuModified(c, WORD); + REQUIRE(rasterizer.Count() == 0); +} diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index e01ea55ab..4ab20f20e 100755 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -2,10 +2,8 @@ add_subdirectory(host_shaders) add_library(video_core STATIC buffer_cache/buffer_base.h - buffer_cache/buffer_block.h + buffer_cache/buffer_cache.cpp buffer_cache/buffer_cache.h - buffer_cache/map_interval.cpp - buffer_cache/map_interval.h cdma_pusher.cpp cdma_pusher.h command_classes/codecs/codec.cpp @@ -154,8 +152,6 @@ add_library(video_core STATIC renderer_vulkan/vk_staging_buffer_pool.h renderer_vulkan/vk_state_tracker.cpp renderer_vulkan/vk_state_tracker.h - renderer_vulkan/vk_stream_buffer.cpp - renderer_vulkan/vk_stream_buffer.h renderer_vulkan/vk_swapchain.cpp renderer_vulkan/vk_swapchain.h renderer_vulkan/vk_texture_cache.cpp diff --git a/src/video_core/buffer_cache/buffer_base.h b/src/video_core/buffer_cache/buffer_base.h index fd740c2c1..8a5e6a3e7 100755 --- a/src/video_core/buffer_cache/buffer_base.h +++ b/src/video_core/buffer_cache/buffer_base.h @@ -19,6 +19,7 @@ namespace VideoCommon { enum class BufferFlagBits { Picked = 1 << 0, + CachedWrites = 1 << 1, }; DECLARE_ENUM_FLAG_OPERATORS(BufferFlagBits) @@ -40,7 +41,7 @@ class BufferBase { static constexpr u64 BYTES_PER_WORD = PAGES_PER_WORD * BYTES_PER_PAGE; /// Vector tracking modified pages tightly packed with small vector optimization - union WrittenWords { + union WordsArray { /// Returns the pointer to the words state [[nodiscard]] const u64* Pointer(bool is_short) const noexcept { return is_short ? &stack : heap; @@ -55,49 +56,59 @@ class BufferBase { u64* heap; ///< Not-small buffers pointer to the storage }; - struct GpuCpuWords { - explicit GpuCpuWords() = default; - explicit GpuCpuWords(u64 size_bytes_) : size_bytes{size_bytes_} { + struct Words { + explicit Words() = default; + explicit Words(u64 size_bytes_) : size_bytes{size_bytes_} { if (IsShort()) { cpu.stack = ~u64{0}; gpu.stack = 0; + cached_cpu.stack = 0; + untracked.stack = ~u64{0}; } else { // Share allocation between CPU and GPU pages and set their default values const size_t num_words = NumWords(); - u64* const alloc = new u64[num_words * 2]; + u64* const alloc = new u64[num_words * 4]; cpu.heap = alloc; gpu.heap = alloc + num_words; + cached_cpu.heap = alloc + num_words * 2; + untracked.heap = alloc + num_words * 3; std::fill_n(cpu.heap, num_words, ~u64{0}); std::fill_n(gpu.heap, num_words, 0); + std::fill_n(cached_cpu.heap, num_words, 0); + std::fill_n(untracked.heap, num_words, ~u64{0}); } // Clean up tailing bits - const u64 last_local_page = - Common::DivCeil(size_bytes % BYTES_PER_WORD, BYTES_PER_PAGE); + const u64 last_word_size = size_bytes % BYTES_PER_WORD; + const u64 last_local_page = Common::DivCeil(last_word_size, BYTES_PER_PAGE); const u64 shift = (PAGES_PER_WORD - last_local_page) % PAGES_PER_WORD; - u64& last_word = cpu.Pointer(IsShort())[NumWords() - 1]; - last_word = (last_word << shift) >> shift; + const u64 last_word = (~u64{0} << shift) >> shift; + cpu.Pointer(IsShort())[NumWords() - 1] = last_word; + untracked.Pointer(IsShort())[NumWords() - 1] = last_word; } - ~GpuCpuWords() { + ~Words() { Release(); } - GpuCpuWords& operator=(GpuCpuWords&& rhs) noexcept { + Words& operator=(Words&& rhs) noexcept { Release(); size_bytes = rhs.size_bytes; cpu = rhs.cpu; gpu = rhs.gpu; + cached_cpu = rhs.cached_cpu; + untracked = rhs.untracked; rhs.cpu.heap = nullptr; return *this; } - GpuCpuWords(GpuCpuWords&& rhs) noexcept - : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu} { + Words(Words&& rhs) noexcept + : size_bytes{rhs.size_bytes}, cpu{rhs.cpu}, gpu{rhs.gpu}, + cached_cpu{rhs.cached_cpu}, untracked{rhs.untracked} { rhs.cpu.heap = nullptr; } - GpuCpuWords& operator=(const GpuCpuWords&) = delete; - GpuCpuWords(const GpuCpuWords&) = delete; + Words& operator=(const Words&) = delete; + Words(const Words&) = delete; /// Returns true when the buffer fits in the small vector optimization [[nodiscard]] bool IsShort() const noexcept { @@ -118,8 +129,17 @@ class BufferBase { } u64 size_bytes = 0; - WrittenWords cpu; - WrittenWords gpu; + WordsArray cpu; + WordsArray gpu; + WordsArray cached_cpu; + WordsArray untracked; + }; + + enum class Type { + CPU, + GPU, + CachedCPU, + Untracked, }; public: @@ -132,68 +152,93 @@ public: BufferBase& operator=(const BufferBase&) = delete; BufferBase(const BufferBase&) = delete; + BufferBase& operator=(BufferBase&&) = default; + BufferBase(BufferBase&&) = default; + /// Returns the inclusive CPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedCpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return ModifiedRegion(offset, query_size); } /// Returns the inclusive GPU modified range in a begin end pair [[nodiscard]] std::pair ModifiedGpuRegion(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return ModifiedRegion(offset, query_size); + return ModifiedRegion(offset, query_size); } /// Returns true if a region has been modified from the CPU [[nodiscard]] bool IsRegionCpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return IsRegionModified(offset, query_size); } /// Returns true if a region has been modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr query_cpu_addr, u64 query_size) const noexcept { const u64 offset = query_cpu_addr - cpu_addr; - return IsRegionModified(offset, query_size); + return IsRegionModified(offset, query_size); } /// Mark region as CPU modified, notifying the rasterizer about this change void MarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(words.cpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as CPU modified, notifying the rasterizer about this change void UnmarkRegionAsCpuModified(VAddr dirty_cpu_addr, u64 size) { - ChangeRegionState(words.cpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); } /// Mark region as modified from the host GPU void MarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(words.gpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); } /// Unmark region as modified from the host GPU void UnmarkRegionAsGpuModified(VAddr dirty_cpu_addr, u64 size) noexcept { - ChangeRegionState(words.gpu, dirty_cpu_addr, size); + ChangeRegionState(dirty_cpu_addr, size); + } + + /// Mark region as modified from the CPU + /// but don't mark it as modified until FlusHCachedWrites is called. + void CachedCpuWrite(VAddr dirty_cpu_addr, u64 size) { + flags |= BufferFlagBits::CachedWrites; + ChangeRegionState(dirty_cpu_addr, size); + } + + /// Flushes cached CPU writes, and notify the rasterizer about the deltas + void FlushCachedWrites() noexcept { + flags &= ~BufferFlagBits::CachedWrites; + const u64 num_words = NumWords(); + const u64* const cached_words = Array(); + u64* const untracked_words = Array(); + u64* const cpu_words = Array(); + for (u64 word_index = 0; word_index < num_words; ++word_index) { + const u64 cached_bits = cached_words[word_index]; + NotifyRasterizer(word_index, untracked_words[word_index], cached_bits); + untracked_words[word_index] |= cached_bits; + cpu_words[word_index] |= cached_bits; + } } /// Call 'func' for each CPU modified range and unmark those pages as CPU modified template void ForEachUploadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, func); + ForEachModifiedRange(query_cpu_range, size, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(VAddr query_cpu_range, u64 size, Func&& func) { - ForEachModifiedRange(query_cpu_range, size, func); + ForEachModifiedRange(query_cpu_range, size, func); } /// Call 'func' for each GPU modified range and unmark those pages as GPU modified template void ForEachDownloadRange(Func&& func) { - ForEachModifiedRange(cpu_addr, SizeBytes(), func); + ForEachModifiedRange(cpu_addr, SizeBytes(), func); } /// Mark buffer as picked @@ -216,6 +261,11 @@ public: return True(flags & BufferFlagBits::Picked); } + /// Returns true when the buffer has pending cached writes + [[nodiscard]] bool HasCachedWrites() const noexcept { + return True(flags & BufferFlagBits::CachedWrites); + } + /// Returns the base CPU address of the buffer [[nodiscard]] VAddr CpuAddr() const noexcept { return cpu_addr; @@ -233,26 +283,48 @@ public: } private: + template + u64* Array() noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + + template + const u64* Array() const noexcept { + if constexpr (type == Type::CPU) { + return words.cpu.Pointer(IsShort()); + } else if constexpr (type == Type::GPU) { + return words.gpu.Pointer(IsShort()); + } else if constexpr (type == Type::CachedCPU) { + return words.cached_cpu.Pointer(IsShort()); + } else if constexpr (type == Type::Untracked) { + return words.untracked.Pointer(IsShort()); + } + } + /** * Change the state of a range of pages * - * @param written_words Pages to be marked or unmarked as modified * @param dirty_addr Base address to mark or unmark as modified * @param size Size in bytes to mark or unmark as modified - * - * @tparam enable True when the bits will be set to one, false for zero - * @tparam notify_rasterizer True when the rasterizer has to be notified about the changes */ - template - void ChangeRegionState(WrittenWords& written_words, u64 dirty_addr, - s64 size) noexcept(!notify_rasterizer) { + template + void ChangeRegionState(u64 dirty_addr, s64 size) noexcept(type == Type::GPU) { const s64 difference = dirty_addr - cpu_addr; const u64 offset = std::max(difference, 0); size += std::min(difference, 0); if (offset >= SizeBytes() || size < 0) { return; } - u64* const state_words = written_words.Pointer(IsShort()); + u64* const untracked_words = Array(); + u64* const state_words = Array(); const u64 offset_end = std::min(offset + size, SizeBytes()); const u64 begin_page_index = offset / BYTES_PER_PAGE; const u64 begin_word_index = begin_page_index / PAGES_PER_WORD; @@ -268,13 +340,19 @@ private: u64 bits = ~u64{0}; bits = (bits >> right_offset) << right_offset; bits = (bits << left_offset) >> left_offset; - if constexpr (notify_rasterizer) { - NotifyRasterizer(word_index, state_words[word_index], bits); + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + NotifyRasterizer(word_index, untracked_words[word_index], bits); } if constexpr (enable) { state_words[word_index] |= bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] |= bits; + } } else { state_words[word_index] &= ~bits; + if constexpr (type == Type::CPU || type == Type::CachedCPU) { + untracked_words[word_index] &= ~bits; + } } page_index = 0; ++word_index; @@ -291,7 +369,7 @@ private: * @tparam add_to_rasterizer True when the rasterizer should start tracking the new pages */ template - void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) { + void NotifyRasterizer(u64 word_index, u64 current_bits, u64 new_bits) const { u64 changed_bits = (add_to_rasterizer ? current_bits : ~current_bits) & new_bits; VAddr addr = cpu_addr + word_index * BYTES_PER_WORD; while (changed_bits != 0) { @@ -315,21 +393,20 @@ private: * @param query_cpu_range Base CPU address to loop over * @param size Size in bytes of the CPU range to loop over * @param func Function to call for each turned off region - * - * @tparam gpu True for host GPU pages, false for CPU pages - * @tparam notify_rasterizer True when the rasterizer should be notified about state changes */ - template + template void ForEachModifiedRange(VAddr query_cpu_range, s64 size, Func&& func) { + static_assert(type != Type::Untracked); + const s64 difference = query_cpu_range - cpu_addr; const u64 query_begin = std::max(difference, 0); size += std::min(difference, 0); if (query_begin >= SizeBytes() || size < 0) { return; } - const u64* const cpu_words = words.cpu.Pointer(IsShort()); + u64* const untracked_words = Array(); + u64* const state_words = Array(); const u64 query_end = query_begin + std::min(static_cast(size), SizeBytes()); - u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); u64* const words_begin = state_words + query_begin / BYTES_PER_WORD; u64* const words_end = state_words + Common::DivCeil(query_end, BYTES_PER_WORD); @@ -345,7 +422,8 @@ private: const u64 word_index_end = std::distance(state_words, last_modified_word); const unsigned local_page_begin = std::countr_zero(*first_modified_word); - const unsigned local_page_end = PAGES_PER_WORD - std::countl_zero(last_modified_word[-1]); + const unsigned local_page_end = + static_cast(PAGES_PER_WORD) - std::countl_zero(last_modified_word[-1]); const u64 word_page_begin = word_index_begin * PAGES_PER_WORD; const u64 word_page_end = (word_index_end - 1) * PAGES_PER_WORD; const u64 query_page_begin = query_begin / BYTES_PER_PAGE; @@ -371,11 +449,13 @@ private: const u64 current_word = state_words[word_index] & bits; state_words[word_index] &= ~bits; - // Exclude CPU modified pages when visiting GPU pages - const u64 word = current_word & ~(gpu ? cpu_words[word_index] : 0); - if constexpr (notify_rasterizer) { - NotifyRasterizer(word_index, word, ~u64{0}); + if constexpr (type == Type::CPU) { + const u64 current_bits = untracked_words[word_index] & bits; + untracked_words[word_index] &= ~bits; + NotifyRasterizer(word_index, current_bits, ~u64{0}); } + // Exclude CPU modified pages when visiting GPU pages + const u64 word = current_word & ~(type == Type::GPU ? untracked_words[word_index] : 0); u64 page = page_begin; page_begin = 0; @@ -416,17 +496,20 @@ private: * @param offset Offset in bytes from the start of the buffer * @param size Size in bytes of the region to query for modifications */ - template + template [[nodiscard]] bool IsRegionModified(u64 offset, u64 size) const noexcept { - const u64* const cpu_words = words.cpu.Pointer(IsShort()); - const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); + static_assert(type != Type::Untracked); + + const u64* const untracked_words = Array(); + const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_end = std::min(word_begin + num_query_words, NumWords()); const u64 page_limit = Common::DivCeil(offset + size, BYTES_PER_PAGE); u64 page_index = (offset / BYTES_PER_PAGE) % PAGES_PER_WORD; for (u64 word_index = word_begin; word_index < word_end; ++word_index, page_index = 0) { - const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; if (word == 0) { continue; } @@ -445,13 +528,13 @@ private: * * @param offset Offset in bytes from the start of the buffer * @param size Size in bytes of the region to query for modifications - * - * @tparam True to query GPU modified pages, false for CPU pages */ - template + template [[nodiscard]] std::pair ModifiedRegion(u64 offset, u64 size) const noexcept { - const u64* const cpu_words = words.cpu.Pointer(IsShort()); - const u64* const state_words = (gpu ? words.gpu : words.cpu).Pointer(IsShort()); + static_assert(type != Type::Untracked); + + const u64* const untracked_words = Array(); + const u64* const state_words = Array(); const u64 num_query_words = size / BYTES_PER_WORD + 1; const u64 word_begin = offset / BYTES_PER_WORD; const u64 word_end = std::min(word_begin + num_query_words, NumWords()); @@ -460,7 +543,8 @@ private: u64 begin = std::numeric_limits::max(); u64 end = 0; for (u64 word_index = word_begin; word_index < word_end; ++word_index) { - const u64 word = state_words[word_index] & ~(gpu ? cpu_words[word_index] : 0); + const u64 off_word = type == Type::GPU ? untracked_words[word_index] : 0; + const u64 word = state_words[word_index] & ~off_word; if (word == 0) { continue; } @@ -488,7 +572,7 @@ private: RasterizerInterface* rasterizer = nullptr; VAddr cpu_addr = 0; - GpuCpuWords words; + Words words; BufferFlagBits flags{}; }; diff --git a/src/video_core/buffer_cache/buffer_cache.cpp b/src/video_core/buffer_cache/buffer_cache.cpp new file mode 100755 index 000000000..ab32294c8 --- /dev/null +++ b/src/video_core/buffer_cache/buffer_cache.cpp @@ -0,0 +1,13 @@ +// Copyright 2021 yuzu Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#include "common/microprofile.h" + +namespace VideoCommon { + +MICROPROFILE_DEFINE(GPU_PrepareBuffers, "GPU", "Prepare buffers", MP_RGB(224, 128, 128)); +MICROPROFILE_DEFINE(GPU_BindUploadBuffers, "GPU", "Bind and upload buffers", MP_RGB(224, 128, 128)); +MICROPROFILE_DEFINE(GPU_DownloadMemory, "GPU", "Download buffers", MP_RGB(224, 128, 128)); + +} // namespace VideoCommon diff --git a/src/video_core/buffer_cache/buffer_cache.h b/src/video_core/buffer_cache/buffer_cache.h index 83b9ee871..e4f3c8e35 100755 --- a/src/video_core/buffer_cache/buffer_cache.h +++ b/src/video_core/buffer_cache/buffer_cache.h @@ -4,591 +4,1231 @@ #pragma once -#include +#include +#include +#include #include #include +#include #include -#include -#include #include #include -#include -#include -#include "common/alignment.h" -#include "common/assert.h" #include "common/common_types.h" -#include "common/logging/log.h" -#include "core/core.h" +#include "common/div_ceil.h" +#include "common/microprofile.h" +#include "common/scope_exit.h" #include "core/memory.h" #include "core/settings.h" -#include "video_core/buffer_cache/buffer_block.h" -#include "video_core/buffer_cache/map_interval.h" +#include "video_core/buffer_cache/buffer_base.h" +#include "video_core/delayed_destruction_ring.h" +#include "video_core/dirty_flags.h" +#include "video_core/engines/kepler_compute.h" +#include "video_core/engines/maxwell_3d.h" #include "video_core/memory_manager.h" #include "video_core/rasterizer_interface.h" +#include "video_core/texture_cache/slot_vector.h" +#include "video_core/texture_cache/types.h" namespace VideoCommon { -template +MICROPROFILE_DECLARE(GPU_PrepareBuffers); +MICROPROFILE_DECLARE(GPU_BindUploadBuffers); +MICROPROFILE_DECLARE(GPU_DownloadMemory); + +using BufferId = SlotId; + +constexpr u32 NUM_VERTEX_BUFFERS = 32; +constexpr u32 NUM_TRANSFORM_FEEDBACK_BUFFERS = 4; +constexpr u32 NUM_GRAPHICS_UNIFORM_BUFFERS = 18; +constexpr u32 NUM_COMPUTE_UNIFORM_BUFFERS = 8; +constexpr u32 NUM_STORAGE_BUFFERS = 16; +constexpr u32 NUM_STAGES = 5; + +template class BufferCache { - using IntervalSet = boost::icl::interval_set; - using IntervalType = typename IntervalSet::interval_type; - using VectorMapInterval = boost::container::small_vector; + // Page size for caching purposes. + // This is unrelated to the CPU page size and it can be changed as it seems optimal. + static constexpr u32 PAGE_BITS = 16; + static constexpr u64 PAGE_SIZE = u64{1} << PAGE_BITS; - static constexpr u64 WRITE_PAGE_BIT = 11; - static constexpr u64 BLOCK_PAGE_BITS = 21; - static constexpr u64 BLOCK_PAGE_SIZE = 1ULL << BLOCK_PAGE_BITS; + static constexpr bool IS_OPENGL = P::IS_OPENGL; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = + P::HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = + P::HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = P::NEEDS_BIND_UNIFORM_INDEX; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = P::NEEDS_BIND_STORAGE_INDEX; + static constexpr bool USE_MEMORY_MAPS = P::USE_MEMORY_MAPS; -public: - struct BufferInfo { - BufferType handle; - u64 offset; - u64 address; + static constexpr BufferId NULL_BUFFER_ID{0}; + + using Maxwell = Tegra::Engines::Maxwell3D::Regs; + + using Runtime = typename P::Runtime; + using Buffer = typename P::Buffer; + + struct Empty {}; + + struct Binding { + VAddr cpu_addr{}; + u32 size{}; + BufferId buffer_id; }; - BufferInfo UploadMemory(GPUVAddr gpu_addr, std::size_t size, std::size_t alignment = 4, - bool is_written = false, bool use_fast_cbuf = false) { - std::lock_guard lock{mutex}; + static constexpr Binding NULL_BINDING{ + .cpu_addr = 0, + .size = 0, + .buffer_id = NULL_BUFFER_ID, + }; - const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); - if (!cpu_addr) { - return GetEmptyBuffer(size); - } +public: + static constexpr size_t SKIP_CACHE_SIZE = 4096; - // Cache management is a big overhead, so only cache entries with a given size. - // TODO: Figure out which size is the best for given games. - constexpr std::size_t max_stream_size = 0x800; - if (use_fast_cbuf || size < max_stream_size) { - if (!is_written && !IsRegionWritten(*cpu_addr, *cpu_addr + size - 1)) { - const bool is_granular = gpu_memory.IsGranularRange(gpu_addr, size); - if (use_fast_cbuf) { - u8* dest; - if (is_granular) { - dest = gpu_memory.GetPointer(gpu_addr); - } else { - staging_buffer.resize(size); - dest = staging_buffer.data(); - gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); - } - return ConstBufferUpload(dest, size); - } - if (is_granular) { - u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - return StreamBufferUpload(size, alignment, [host_ptr, size](u8* dest) { - std::memcpy(dest, host_ptr, size); - }); - } else { - return StreamBufferUpload(size, alignment, [this, gpu_addr, size](u8* dest) { - gpu_memory.ReadBlockUnsafe(gpu_addr, dest, size); - }); - } - } - } - - Buffer* const block = GetBlock(*cpu_addr, size); - MapInterval* const map = MapAddress(block, gpu_addr, *cpu_addr, size); - if (!map) { - return GetEmptyBuffer(size); - } - if (is_written) { - map->MarkAsModified(true, GetModifiedTicks()); - if (Settings::IsGPULevelHigh() && - Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - MarkForAsyncFlush(map); - } - if (!map->is_written) { - map->is_written = true; - MarkRegionAsWritten(map->start, map->end - 1); - } - } - - return BufferInfo{block->Handle(), block->Offset(*cpu_addr), block->Address()}; - } - - /// Uploads from a host memory. Returns the OpenGL buffer where it's located and its offset. - BufferInfo UploadHostMemory(const void* raw_pointer, std::size_t size, - std::size_t alignment = 4) { - std::lock_guard lock{mutex}; - return StreamBufferUpload(size, alignment, [raw_pointer, size](u8* dest) { - std::memcpy(dest, raw_pointer, size); - }); - } - - /// Prepares the buffer cache for data uploading - /// @param max_size Maximum number of bytes that will be uploaded - /// @return True when a stream buffer invalidation was required, false otherwise - void Map(std::size_t max_size) { - std::lock_guard lock{mutex}; - - std::tie(buffer_ptr, buffer_offset_base) = stream_buffer.Map(max_size, 4); - buffer_offset = buffer_offset_base; - } - - /// Finishes the upload stream - void Unmap() { - std::lock_guard lock{mutex}; - stream_buffer.Unmap(buffer_offset - buffer_offset_base); - } - - /// Function called at the end of each frame, inteded for deferred operations - void TickFrame() { - ++epoch; - - while (!pending_destruction.empty()) { - // Delay at least 4 frames before destruction. - // This is due to triple buffering happening on some drivers. - static constexpr u64 epochs_to_destroy = 5; - if (pending_destruction.front()->Epoch() + epochs_to_destroy > epoch) { - break; - } - pending_destruction.pop(); - } - } - - /// Write any cached resources overlapping the specified region back to memory - void FlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - VectorMapInterval objects = GetMapsInRange(addr, size); - std::sort(objects.begin(), objects.end(), - [](MapInterval* lhs, MapInterval* rhs) { return lhs->ticks < rhs->ticks; }); - for (MapInterval* object : objects) { - if (object->is_modified && object->is_registered) { - mutex.unlock(); - FlushMap(object); - mutex.lock(); - } - } - } - - bool MustFlushRegion(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - const VectorMapInterval objects = GetMapsInRange(addr, size); - return std::any_of(objects.cbegin(), objects.cend(), [](const MapInterval* map) { - return map->is_modified && map->is_registered; - }); - } - - /// Mark the specified region as being invalidated - void InvalidateRegion(VAddr addr, u64 size) { - std::lock_guard lock{mutex}; - - for (auto& object : GetMapsInRange(addr, size)) { - if (object->is_registered) { - Unregister(object); - } - } - } - - void OnCPUWrite(VAddr addr, std::size_t size) { - std::lock_guard lock{mutex}; - - for (MapInterval* object : GetMapsInRange(addr, size)) { - if (object->is_memory_marked && object->is_registered) { - UnmarkMemory(object); - object->is_sync_pending = true; - marked_for_unregister.emplace_back(object); - } - } - } - - void SyncGuestHost() { - std::lock_guard lock{mutex}; - - for (auto& object : marked_for_unregister) { - if (object->is_registered) { - object->is_sync_pending = false; - Unregister(object); - } - } - marked_for_unregister.clear(); - } - - void CommitAsyncFlushes() { - if (uncommitted_flushes) { - auto commit_list = std::make_shared>(); - for (MapInterval* map : *uncommitted_flushes) { - if (map->is_registered && map->is_modified) { - // TODO(Blinkhawk): Implement backend asynchronous flushing - // AsyncFlushMap(map) - commit_list->push_back(map); - } - } - if (!commit_list->empty()) { - committed_flushes.push_back(commit_list); - } else { - committed_flushes.emplace_back(); - } - } else { - committed_flushes.emplace_back(); - } - uncommitted_flushes.reset(); - } - - bool ShouldWaitAsyncFlushes() const { - return !committed_flushes.empty() && committed_flushes.front() != nullptr; - } - - bool HasUncommittedFlushes() const { - return uncommitted_flushes != nullptr; - } - - void PopAsyncFlushes() { - if (committed_flushes.empty()) { - return; - } - auto& flush_list = committed_flushes.front(); - if (!flush_list) { - committed_flushes.pop_front(); - return; - } - for (MapInterval* map : *flush_list) { - if (map->is_registered) { - // TODO(Blinkhawk): Replace this for reading the asynchronous flush - FlushMap(map); - } - } - committed_flushes.pop_front(); - } - - virtual BufferInfo GetEmptyBuffer(std::size_t size) = 0; - -protected: explicit BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, - StreamBuffer& stream_buffer_) - : rasterizer{rasterizer_}, gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, - stream_buffer{stream_buffer_} {} + Runtime& runtime_); - ~BufferCache() = default; + void TickFrame(); - virtual std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) = 0; + void WriteMemory(VAddr cpu_addr, u64 size); - virtual BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) { - return {}; - } + void CachedWriteMemory(VAddr cpu_addr, u64 size); - /// Register an object into the cache - MapInterval* Register(MapInterval new_map, bool inherit_written = false) { - const VAddr cpu_addr = new_map.start; - if (!cpu_addr) { - LOG_CRITICAL(HW_GPU, "Failed to register buffer with unmapped gpu_address 0x{:016x}", - new_map.gpu_addr); - return nullptr; - } - const std::size_t size = new_map.end - new_map.start; - new_map.is_registered = true; - rasterizer.UpdatePagesCachedCount(cpu_addr, size, 1); - new_map.is_memory_marked = true; - if (inherit_written) { - MarkRegionAsWritten(new_map.start, new_map.end - 1); - new_map.is_written = true; - } - MapInterval* const storage = mapped_addresses_allocator.Allocate(); - *storage = new_map; - mapped_addresses.insert(*storage); - return storage; - } + void DownloadMemory(VAddr cpu_addr, u64 size); - void UnmarkMemory(MapInterval* map) { - if (!map->is_memory_marked) { - return; - } - const std::size_t size = map->end - map->start; - rasterizer.UpdatePagesCachedCount(map->start, size, -1); - map->is_memory_marked = false; - } + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size); - /// Unregisters an object from the cache - void Unregister(MapInterval* map) { - UnmarkMemory(map); - map->is_registered = false; - if (map->is_sync_pending) { - map->is_sync_pending = false; - marked_for_unregister.remove(map); - } - if (map->is_written) { - UnmarkRegionAsWritten(map->start, map->end - 1); - } - const auto it = mapped_addresses.find(*map); - ASSERT(it != mapped_addresses.end()); - mapped_addresses.erase(it); - mapped_addresses_allocator.Release(map); - } + void UpdateGraphicsBuffers(bool is_indexed); + + void UpdateComputeBuffers(); + + void BindHostGeometryBuffers(bool is_indexed); + + void BindHostStageBuffers(size_t stage); + + void BindHostComputeBuffers(); + + void SetEnabledUniformBuffers(size_t stage, u32 enabled); + + void SetEnabledComputeUniformBuffers(u32 enabled); + + void UnbindGraphicsStorageBuffers(size_t stage); + + void BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void UnbindComputeStorageBuffers(); + + void BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written); + + void FlushCachedWrites(); + + /// Return true when there are uncommitted buffers to be downloaded + [[nodiscard]] bool HasUncommittedFlushes() const noexcept; + + /// Return true when the caller should wait for async downloads + [[nodiscard]] bool ShouldWaitAsyncFlushes() const noexcept; + + /// Commit asynchronous downloads + void CommitAsyncFlushes(); + + /// Pop asynchronous downloads + void PopAsyncFlushes(); + + /// Return true when a CPU region is modified from the GPU + [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + + std::mutex mutex; private: - MapInterval* MapAddress(Buffer* block, GPUVAddr gpu_addr, VAddr cpu_addr, std::size_t size) { - const VectorMapInterval overlaps = GetMapsInRange(cpu_addr, size); - if (overlaps.empty()) { - const VAddr cpu_addr_end = cpu_addr + size; - if (gpu_memory.IsGranularRange(gpu_addr, size)) { - u8* const host_ptr = gpu_memory.GetPointer(gpu_addr); - block->Upload(block->Offset(cpu_addr), size, host_ptr); - } else { - staging_buffer.resize(size); - gpu_memory.ReadBlockUnsafe(gpu_addr, staging_buffer.data(), size); - block->Upload(block->Offset(cpu_addr), size, staging_buffer.data()); - } - return Register(MapInterval(cpu_addr, cpu_addr_end, gpu_addr)); + template + static void ForEachEnabledBit(u32 enabled_mask, Func&& func) { + for (u32 index = 0; enabled_mask != 0; ++index, enabled_mask >>= 1) { + const int disabled_bits = std::countr_zero(enabled_mask); + index += disabled_bits; + enabled_mask >>= disabled_bits; + func(index); } - - const VAddr cpu_addr_end = cpu_addr + size; - if (overlaps.size() == 1) { - MapInterval* const current_map = overlaps[0]; - if (current_map->IsInside(cpu_addr, cpu_addr_end)) { - return current_map; - } - } - VAddr new_start = cpu_addr; - VAddr new_end = cpu_addr_end; - bool write_inheritance = false; - bool modified_inheritance = false; - // Calculate new buffer parameters - for (MapInterval* overlap : overlaps) { - new_start = std::min(overlap->start, new_start); - new_end = std::max(overlap->end, new_end); - write_inheritance |= overlap->is_written; - modified_inheritance |= overlap->is_modified; - } - GPUVAddr new_gpu_addr = gpu_addr + new_start - cpu_addr; - for (auto& overlap : overlaps) { - Unregister(overlap); - } - UpdateBlock(block, new_start, new_end, overlaps); - - const MapInterval new_map{new_start, new_end, new_gpu_addr}; - MapInterval* const map = Register(new_map, write_inheritance); - if (!map) { - return nullptr; - } - if (modified_inheritance) { - map->MarkAsModified(true, GetModifiedTicks()); - if (Settings::IsGPULevelHigh() && - Settings::values.use_asynchronous_gpu_emulation.GetValue()) { - MarkForAsyncFlush(map); - } - } - return map; } - void UpdateBlock(Buffer* block, VAddr start, VAddr end, const VectorMapInterval& overlaps) { - const IntervalType base_interval{start, end}; - IntervalSet interval_set{}; - interval_set.add(base_interval); - for (auto& overlap : overlaps) { - const IntervalType subtract{overlap->start, overlap->end}; - interval_set.subtract(subtract); - } - for (auto& interval : interval_set) { - const std::size_t size = interval.upper() - interval.lower(); - if (size == 0) { + template + void ForEachBufferInRange(VAddr cpu_addr, u64 size, Func&& func) { + const u64 page_end = Common::DivCeil(cpu_addr + size, PAGE_SIZE); + for (u64 page = cpu_addr >> PAGE_BITS; page < page_end;) { + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + ++page; continue; } - staging_buffer.resize(size); - cpu_memory.ReadBlockUnsafe(interval.lower(), staging_buffer.data(), size); - block->Upload(block->Offset(interval.lower()), size, staging_buffer.data()); + Buffer& buffer = slot_buffers[buffer_id]; + func(buffer_id, buffer); + + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); } } - VectorMapInterval GetMapsInRange(VAddr addr, std::size_t size) { - VectorMapInterval result; - if (size == 0) { - return result; - } - - const VAddr addr_end = addr + size; - auto it = mapped_addresses.lower_bound(addr); - if (it != mapped_addresses.begin()) { - --it; - } - while (it != mapped_addresses.end() && it->start < addr_end) { - if (it->Overlaps(addr, addr_end)) { - result.push_back(&*it); - } - ++it; - } - return result; + static bool IsRangeGranular(VAddr cpu_addr, size_t size) { + return (cpu_addr & ~Core::Memory::PAGE_MASK) == + ((cpu_addr + size) & ~Core::Memory::PAGE_MASK); } - /// Returns a ticks counter used for tracking when cached objects were last modified - u64 GetModifiedTicks() { - return ++modified_ticks; - } + void BindHostIndexBuffer(); - void FlushMap(MapInterval* map) { - const auto it = blocks.find(map->start >> BLOCK_PAGE_BITS); - ASSERT_OR_EXECUTE(it != blocks.end(), return;); + void BindHostVertexBuffers(); - std::shared_ptr block = it->second; + void BindHostGraphicsUniformBuffers(size_t stage); - const std::size_t size = map->end - map->start; - staging_buffer.resize(size); - block->Download(block->Offset(map->start), size, staging_buffer.data()); - cpu_memory.WriteBlockUnsafe(map->start, staging_buffer.data(), size); - map->MarkAsModified(false, 0); - } + void BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, bool needs_bind); - template - BufferInfo StreamBufferUpload(std::size_t size, std::size_t alignment, Callable&& callable) { - AlignBuffer(alignment); - const std::size_t uploaded_offset = buffer_offset; - callable(buffer_ptr); + void BindHostGraphicsStorageBuffers(size_t stage); - buffer_ptr += size; - buffer_offset += size; - return BufferInfo{stream_buffer.Handle(), uploaded_offset, stream_buffer.Address()}; - } + void BindHostTransformFeedbackBuffers(); - void AlignBuffer(std::size_t alignment) { - // Align the offset, not the mapped pointer - const std::size_t offset_aligned = Common::AlignUp(buffer_offset, alignment); - buffer_ptr += offset_aligned - buffer_offset; - buffer_offset = offset_aligned; - } + void BindHostComputeUniformBuffers(); - std::shared_ptr EnlargeBlock(std::shared_ptr buffer) { - const std::size_t old_size = buffer->Size(); - const std::size_t new_size = old_size + BLOCK_PAGE_SIZE; - const VAddr cpu_addr = buffer->CpuAddr(); - std::shared_ptr new_buffer = CreateBlock(cpu_addr, new_size); - new_buffer->CopyFrom(*buffer, 0, 0, old_size); - QueueDestruction(std::move(buffer)); + void BindHostComputeStorageBuffers(); - const VAddr cpu_addr_end = cpu_addr + new_size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - blocks.insert_or_assign(page_start, new_buffer); - } + void DoUpdateGraphicsBuffers(bool is_indexed); - return new_buffer; - } + void DoUpdateComputeBuffers(); - std::shared_ptr MergeBlocks(std::shared_ptr first, - std::shared_ptr second) { - const std::size_t size_1 = first->Size(); - const std::size_t size_2 = second->Size(); - const VAddr first_addr = first->CpuAddr(); - const VAddr second_addr = second->CpuAddr(); - const VAddr new_addr = std::min(first_addr, second_addr); - const std::size_t new_size = size_1 + size_2; + void UpdateIndexBuffer(); - std::shared_ptr new_buffer = CreateBlock(new_addr, new_size); - new_buffer->CopyFrom(*first, 0, new_buffer->Offset(first_addr), size_1); - new_buffer->CopyFrom(*second, 0, new_buffer->Offset(second_addr), size_2); - QueueDestruction(std::move(first)); - QueueDestruction(std::move(second)); + void UpdateVertexBuffers(); - const VAddr cpu_addr_end = new_addr + new_size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = new_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - blocks.insert_or_assign(page_start, new_buffer); - } - return new_buffer; - } + void UpdateVertexBuffer(u32 index); - Buffer* GetBlock(VAddr cpu_addr, std::size_t size) { - std::shared_ptr found; + void UpdateUniformBuffers(size_t stage); - const VAddr cpu_addr_end = cpu_addr + size - 1; - const u64 page_end = cpu_addr_end >> BLOCK_PAGE_BITS; - for (u64 page_start = cpu_addr >> BLOCK_PAGE_BITS; page_start <= page_end; ++page_start) { - auto it = blocks.find(page_start); - if (it == blocks.end()) { - if (found) { - found = EnlargeBlock(found); - continue; - } - const VAddr start_addr = page_start << BLOCK_PAGE_BITS; - found = CreateBlock(start_addr, BLOCK_PAGE_SIZE); - blocks.insert_or_assign(page_start, found); - continue; - } - if (!found) { - found = it->second; - continue; - } - if (found != it->second) { - found = MergeBlocks(std::move(found), it->second); - } - } - return found.get(); - } + void UpdateStorageBuffers(size_t stage); - void MarkRegionAsWritten(VAddr start, VAddr end) { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - if (const auto [it, inserted] = written_pages.emplace(page_start, 1); !inserted) { - ++it->second; - } - } - } + void UpdateTransformFeedbackBuffers(); - void UnmarkRegionAsWritten(VAddr start, VAddr end) { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - auto it = written_pages.find(page_start); - if (it != written_pages.end()) { - if (it->second > 1) { - --it->second; - } else { - written_pages.erase(it); - } - } - } - } + void UpdateTransformFeedbackBuffer(u32 index); - bool IsRegionWritten(VAddr start, VAddr end) const { - const u64 page_end = end >> WRITE_PAGE_BIT; - for (u64 page_start = start >> WRITE_PAGE_BIT; page_start <= page_end; ++page_start) { - if (written_pages.contains(page_start)) { - return true; - } - } - return false; - } + void UpdateComputeUniformBuffers(); - void QueueDestruction(std::shared_ptr buffer) { - buffer->SetEpoch(epoch); - pending_destruction.push(std::move(buffer)); - } + void UpdateComputeStorageBuffers(); - void MarkForAsyncFlush(MapInterval* map) { - if (!uncommitted_flushes) { - uncommitted_flushes = std::make_shared>(); - } - uncommitted_flushes->insert(map); - } + void MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId FindBuffer(VAddr cpu_addr, u32 size); + + [[nodiscard]] BufferId CreateBuffer(VAddr cpu_addr, u32 wanted_size); + + void Register(BufferId buffer_id); + + void Unregister(BufferId buffer_id); + + template + void ChangeRegister(BufferId buffer_id); + + void SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size); + + void SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size); + + void UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span copies); + + void ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span copies); + + void MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, + std::span copies); + + void DeleteBuffer(BufferId buffer_id); + + void ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id); + + void NotifyBufferDeletion(); + + [[nodiscard]] Binding StorageBufferBinding(GPUVAddr ssbo_addr) const; + + [[nodiscard]] std::span ImmediateBufferWithData(VAddr cpu_addr, size_t size); + + [[nodiscard]] std::span ImmediateBuffer(size_t wanted_capacity); + + [[nodiscard]] bool HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept; VideoCore::RasterizerInterface& rasterizer; + Tegra::Engines::Maxwell3D& maxwell3d; + Tegra::Engines::KeplerCompute& kepler_compute; Tegra::MemoryManager& gpu_memory; Core::Memory::Memory& cpu_memory; - StreamBuffer& stream_buffer; + Runtime& runtime; - u8* buffer_ptr = nullptr; - u64 buffer_offset = 0; - u64 buffer_offset_base = 0; + SlotVector slot_buffers; + DelayedDestructionRing delayed_destruction_ring; - MapIntervalAllocator mapped_addresses_allocator; - boost::intrusive::set> - mapped_addresses; + u32 last_index_count = 0; - std::unordered_map written_pages; - std::unordered_map> blocks; + Binding index_buffer; + std::array vertex_buffers; + std::array, NUM_STAGES> uniform_buffers; + std::array, NUM_STAGES> storage_buffers; + std::array transform_feedback_buffers; - std::queue> pending_destruction; - u64 epoch = 0; - u64 modified_ticks = 0; + std::array compute_uniform_buffers; + std::array compute_storage_buffers; - std::vector staging_buffer; + std::array enabled_uniform_buffers{}; + u32 enabled_compute_uniform_buffers = 0; - std::list marked_for_unregister; + std::array enabled_storage_buffers{}; + std::array written_storage_buffers{}; + u32 enabled_compute_storage_buffers = 0; + u32 written_compute_storage_buffers = 0; - std::shared_ptr> uncommitted_flushes; - std::list>> committed_flushes; + std::array fast_bound_uniform_buffers{}; - std::recursive_mutex mutex; + bool has_deleted_buffers = false; + + std::conditional_t, Empty> + dirty_uniform_buffers{}; + + std::vector cached_write_buffer_ids; + + // TODO: This data structure is not optimal and it should be reworked + std::vector uncommitted_downloads; + std::deque> committed_downloads; + + size_t immediate_buffer_capacity = 0; + std::unique_ptr immediate_buffer_alloc; + + std::array> PAGE_BITS)> page_table; }; +template +BufferCache

::BufferCache(VideoCore::RasterizerInterface& rasterizer_, + Tegra::Engines::Maxwell3D& maxwell3d_, + Tegra::Engines::KeplerCompute& kepler_compute_, + Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, + Runtime& runtime_) + : rasterizer{rasterizer_}, maxwell3d{maxwell3d_}, kepler_compute{kepler_compute_}, + gpu_memory{gpu_memory_}, cpu_memory{cpu_memory_}, runtime{runtime_} { + // Ensure the first slot is used for the null buffer + void(slot_buffers.insert(runtime, NullBufferParams{})); +} + +template +void BufferCache

::TickFrame() { + delayed_destruction_ring.Tick(); +} + +template +void BufferCache

::WriteMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { + buffer.MarkRegionAsCpuModified(cpu_addr, size); + }); +} + +template +void BufferCache

::CachedWriteMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId buffer_id, Buffer& buffer) { + if (!buffer.HasCachedWrites()) { + cached_write_buffer_ids.push_back(buffer_id); + } + buffer.CachedCpuWrite(cpu_addr, size); + }); +} + +template +void BufferCache

::DownloadMemory(VAddr cpu_addr, u64 size) { + ForEachBufferInRange(cpu_addr, size, [&](BufferId, Buffer& buffer) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachDownloadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + const u8* const mapped_memory = download_staging.mapped_span.data(); + const std::span copies_span(copies.data(), copies.data() + copies.size()); + runtime.CopyBuffer(download_staging.buffer, buffer, copies_span); + runtime.Finish(); + for (const BufferCopy& copy : copies) { + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + const u8* copy_mapped_memory = mapped_memory + copy.dst_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, copy_mapped_memory, copy.size); + } + } else { + const std::span immediate_buffer = ImmediateBuffer(largest_copy); + for (const BufferCopy& copy : copies) { + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr copy_cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(copy_cpu_addr, immediate_buffer.data(), copy.size); + } + } + }); +} + +template +void BufferCache

::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr) { + uniform_buffers[stage][index] = NULL_BINDING; + return; + } + const Binding binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = BufferId{}, + }; + uniform_buffers[stage][index] = binding; +} + +template +void BufferCache

::UpdateGraphicsBuffers(bool is_indexed) { + MICROPROFILE_SCOPE(GPU_PrepareBuffers); + do { + has_deleted_buffers = false; + DoUpdateGraphicsBuffers(is_indexed); + } while (has_deleted_buffers); +} + +template +void BufferCache

::UpdateComputeBuffers() { + MICROPROFILE_SCOPE(GPU_PrepareBuffers); + do { + has_deleted_buffers = false; + DoUpdateComputeBuffers(); + } while (has_deleted_buffers); +} + +template +void BufferCache

::BindHostGeometryBuffers(bool is_indexed) { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + if (is_indexed) { + BindHostIndexBuffer(); + } else if constexpr (!HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { + const auto& regs = maxwell3d.regs; + if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) { + runtime.BindQuadArrayIndexBuffer(regs.vertex_buffer.first, regs.vertex_buffer.count); + } + } + BindHostVertexBuffers(); + BindHostTransformFeedbackBuffers(); +} + +template +void BufferCache

::BindHostStageBuffers(size_t stage) { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + BindHostGraphicsUniformBuffers(stage); + BindHostGraphicsStorageBuffers(stage); +} + +template +void BufferCache

::BindHostComputeBuffers() { + MICROPROFILE_SCOPE(GPU_BindUploadBuffers); + BindHostComputeUniformBuffers(); + BindHostComputeStorageBuffers(); +} + +template +void BufferCache

::SetEnabledUniformBuffers(size_t stage, u32 enabled) { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + if (enabled_uniform_buffers[stage] != enabled) { + dirty_uniform_buffers[stage] = ~u32{0}; + } + } + enabled_uniform_buffers[stage] = enabled; +} + +template +void BufferCache

::SetEnabledComputeUniformBuffers(u32 enabled) { + enabled_compute_uniform_buffers = enabled; +} + +template +void BufferCache

::UnbindGraphicsStorageBuffers(size_t stage) { + enabled_storage_buffers[stage] = 0; + written_storage_buffers[stage] = 0; +} + +template +void BufferCache

::BindGraphicsStorageBuffer(size_t stage, size_t ssbo_index, u32 cbuf_index, + u32 cbuf_offset, bool is_written) { + enabled_storage_buffers[stage] |= 1U << ssbo_index; + written_storage_buffers[stage] |= (is_written ? 1U : 0U) << ssbo_index; + + const auto& cbufs = maxwell3d.state.shader_stages[stage]; + const GPUVAddr ssbo_addr = cbufs.const_buffers[cbuf_index].address + cbuf_offset; + storage_buffers[stage][ssbo_index] = StorageBufferBinding(ssbo_addr); +} + +template +void BufferCache

::UnbindComputeStorageBuffers() { + enabled_compute_storage_buffers = 0; + written_compute_storage_buffers = 0; +} + +template +void BufferCache

::BindComputeStorageBuffer(size_t ssbo_index, u32 cbuf_index, u32 cbuf_offset, + bool is_written) { + enabled_compute_storage_buffers |= 1U << ssbo_index; + written_compute_storage_buffers |= (is_written ? 1U : 0U) << ssbo_index; + + const auto& launch_desc = kepler_compute.launch_description; + ASSERT(((launch_desc.const_buffer_enable_mask >> cbuf_index) & 1) != 0); + + const auto& cbufs = launch_desc.const_buffer_config; + const GPUVAddr ssbo_addr = cbufs[cbuf_index].Address() + cbuf_offset; + compute_storage_buffers[ssbo_index] = StorageBufferBinding(ssbo_addr); +} + +template +void BufferCache

::FlushCachedWrites() { + for (const BufferId buffer_id : cached_write_buffer_ids) { + slot_buffers[buffer_id].FlushCachedWrites(); + } + cached_write_buffer_ids.clear(); +} + +template +bool BufferCache

::HasUncommittedFlushes() const noexcept { + return !uncommitted_downloads.empty(); +} + +template +bool BufferCache

::ShouldWaitAsyncFlushes() const noexcept { + return !committed_downloads.empty() && !committed_downloads.front().empty(); +} + +template +void BufferCache

::CommitAsyncFlushes() { + // This is intentionally passing the value by copy + committed_downloads.push_front(uncommitted_downloads); + uncommitted_downloads.clear(); +} + +template +void BufferCache

::PopAsyncFlushes() { + if (committed_downloads.empty()) { + return; + } + auto scope_exit_pop_download = detail::ScopeExit([this] { committed_downloads.pop_back(); }); + const std::span download_ids = committed_downloads.back(); + if (download_ids.empty()) { + return; + } + MICROPROFILE_SCOPE(GPU_DownloadMemory); + + boost::container::small_vector, 1> downloads; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + for (const BufferId buffer_id : download_ids) { + slot_buffers[buffer_id].ForEachDownloadRange([&](u64 range_offset, u64 range_size) { + downloads.push_back({ + BufferCopy{ + .src_offset = range_offset, + .dst_offset = total_size_bytes, + .size = range_size, + }, + buffer_id, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + } + if (downloads.empty()) { + return; + } + if constexpr (USE_MEMORY_MAPS) { + auto download_staging = runtime.DownloadStagingBuffer(total_size_bytes); + for (const auto [copy, buffer_id] : downloads) { + const std::array copies{copy}; + runtime.CopyBuffer(download_staging.buffer, slot_buffers[buffer_id], copies); + } + runtime.Finish(); + for (const auto [copy, buffer_id] : downloads) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + const u8* read_mapped_memory = download_staging.mapped_span.data() + copy.dst_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, read_mapped_memory, copy.size); + } + } else { + const std::span immediate_buffer = ImmediateBuffer(largest_copy); + for (const auto [copy, buffer_id] : downloads) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.ImmediateDownload(copy.src_offset, immediate_buffer.subspan(0, copy.size)); + const VAddr cpu_addr = buffer.CpuAddr() + copy.src_offset; + cpu_memory.WriteBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + } + } +} + +template +bool BufferCache

::IsRegionGpuModified(VAddr addr, size_t size) { + const u64 page_end = Common::DivCeil(addr + size, PAGE_SIZE); + for (u64 page = addr >> PAGE_BITS; page < page_end;) { + const BufferId image_id = page_table[page]; + if (!image_id) { + ++page; + continue; + } + Buffer& buffer = slot_buffers[image_id]; + if (buffer.IsRegionGpuModified(addr, size)) { + return true; + } + const VAddr end_addr = buffer.CpuAddr() + buffer.SizeBytes(); + page = Common::DivCeil(end_addr, PAGE_SIZE); + } + return false; +} + +template +void BufferCache

::BindHostIndexBuffer() { + Buffer& buffer = slot_buffers[index_buffer.buffer_id]; + const u32 offset = buffer.Offset(index_buffer.cpu_addr); + const u32 size = index_buffer.size; + SynchronizeBuffer(buffer, index_buffer.cpu_addr, size); + if constexpr (HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT) { + runtime.BindIndexBuffer(buffer, offset, size); + } else { + runtime.BindIndexBuffer(maxwell3d.regs.draw.topology, maxwell3d.regs.index_array.format, + maxwell3d.regs.index_array.first, maxwell3d.regs.index_array.count, + buffer, offset, size); + } +} + +template +void BufferCache

::BindHostVertexBuffers() { + auto& flags = maxwell3d.dirty.flags; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + const Binding& binding = vertex_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + SynchronizeBuffer(buffer, binding.cpu_addr, binding.size); + if (!flags[Dirty::VertexBuffer0 + index]) { + continue; + } + flags[Dirty::VertexBuffer0 + index] = false; + + const u32 stride = maxwell3d.regs.vertex_array[index].stride; + const u32 offset = buffer.Offset(binding.cpu_addr); + runtime.BindVertexBuffer(index, buffer, offset, binding.size, stride); + } +} + +template +void BufferCache

::BindHostGraphicsUniformBuffers(size_t stage) { + u32 dirty = ~0U; + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty = std::exchange(dirty_uniform_buffers[stage], 0); + } + u32 binding_index = 0; + ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { + const bool needs_bind = ((dirty >> index) & 1) != 0; + BindHostGraphicsUniformBuffer(stage, index, binding_index, needs_bind); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + ++binding_index; + } + }); +} + +template +void BufferCache

::BindHostGraphicsUniformBuffer(size_t stage, u32 index, u32 binding_index, + bool needs_bind) { + const Binding& binding = uniform_buffers[stage][index]; + const VAddr cpu_addr = binding.cpu_addr; + const u32 size = binding.size; + Buffer& buffer = slot_buffers[binding.buffer_id]; + if constexpr (IS_OPENGL) { + if (size <= SKIP_CACHE_SIZE && !buffer.IsRegionGpuModified(cpu_addr, size)) { + if (runtime.HasFastBufferSubData()) { + // Fast path for Nvidia + if (!HasFastUniformBufferBound(stage, binding_index)) { + // We only have to bind when the currently bound buffer is not the fast version + fast_bound_uniform_buffers[stage] |= 1U << binding_index; + runtime.BindFastUniformBuffer(stage, binding_index, size); + } + const auto span = ImmediateBufferWithData(cpu_addr, size); + runtime.PushFastUniformBuffer(stage, binding_index, span); + } else { + // Stream buffer path to avoid stalling on non-Nvidia drivers + const auto span = runtime.BindMappedUniformBuffer(stage, binding_index, size); + cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); + } + return; + } + } + // Classic cached path + SynchronizeBuffer(buffer, cpu_addr, size); + if (!needs_bind && !HasFastUniformBufferBound(stage, binding_index)) { + // Skip binding if it's not needed and if the bound buffer is not the fast version + // This exists to avoid instances where the fast buffer is bound and a GPU write happens + return; + } + fast_bound_uniform_buffers[stage] &= ~(1U << binding_index); + + const u32 offset = buffer.Offset(cpu_addr); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + runtime.BindUniformBuffer(stage, binding_index, buffer, offset, size); + } else { + runtime.BindUniformBuffer(buffer, offset, size); + } +} + +template +void BufferCache

::BindHostGraphicsStorageBuffers(size_t stage) { + u32 binding_index = 0; + ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { + const Binding& binding = storage_buffers[stage][index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + const bool is_written = ((written_storage_buffers[stage] >> index) & 1) != 0; + if constexpr (NEEDS_BIND_STORAGE_INDEX) { + runtime.BindStorageBuffer(stage, binding_index, buffer, offset, size, is_written); + ++binding_index; + } else { + runtime.BindStorageBuffer(buffer, offset, size, is_written); + } + }); +} + +template +void BufferCache

::BindHostTransformFeedbackBuffers() { + if (maxwell3d.regs.tfb_enabled == 0) { + return; + } + for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { + const Binding& binding = transform_feedback_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + runtime.BindTransformFeedbackBuffer(index, buffer, offset, size); + } +} + +template +void BufferCache

::BindHostComputeUniformBuffers() { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + // Mark all uniform buffers as dirty + dirty_uniform_buffers.fill(~u32{0}); + } + u32 binding_index = 0; + ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { + const Binding& binding = compute_uniform_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + if constexpr (NEEDS_BIND_UNIFORM_INDEX) { + runtime.BindComputeUniformBuffer(binding_index, buffer, offset, size); + ++binding_index; + } else { + runtime.BindUniformBuffer(buffer, offset, size); + } + }); +} + +template +void BufferCache

::BindHostComputeStorageBuffers() { + u32 binding_index = 0; + ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { + const Binding& binding = compute_storage_buffers[index]; + Buffer& buffer = slot_buffers[binding.buffer_id]; + const u32 size = binding.size; + SynchronizeBuffer(buffer, binding.cpu_addr, size); + + const u32 offset = buffer.Offset(binding.cpu_addr); + const bool is_written = ((written_compute_storage_buffers >> index) & 1) != 0; + if constexpr (NEEDS_BIND_STORAGE_INDEX) { + runtime.BindComputeStorageBuffer(binding_index, buffer, offset, size, is_written); + ++binding_index; + } else { + runtime.BindStorageBuffer(buffer, offset, size, is_written); + } + }); +} + +template +void BufferCache

::DoUpdateGraphicsBuffers(bool is_indexed) { + if (is_indexed) { + UpdateIndexBuffer(); + } + UpdateVertexBuffers(); + UpdateTransformFeedbackBuffers(); + for (size_t stage = 0; stage < NUM_STAGES; ++stage) { + UpdateUniformBuffers(stage); + UpdateStorageBuffers(stage); + } +} + +template +void BufferCache

::DoUpdateComputeBuffers() { + UpdateComputeUniformBuffers(); + UpdateComputeStorageBuffers(); +} + +template +void BufferCache

::UpdateIndexBuffer() { + // We have to check for the dirty flags and index count + // The index count is currently changed without updating the dirty flags + const auto& index_array = maxwell3d.regs.index_array; + auto& flags = maxwell3d.dirty.flags; + if (!flags[Dirty::IndexBuffer] && last_index_count == index_array.count) { + return; + } + flags[Dirty::IndexBuffer] = false; + last_index_count = index_array.count; + + const GPUVAddr gpu_addr_begin = index_array.StartAddress(); + const GPUVAddr gpu_addr_end = index_array.EndAddress(); + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); + const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); + const u32 draw_size = index_array.count * index_array.FormatSizeInBytes(); + const u32 size = std::min(address_size, draw_size); + if (size == 0 || !cpu_addr) { + index_buffer = NULL_BINDING; + return; + } + index_buffer = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = FindBuffer(*cpu_addr, size), + }; +} + +template +void BufferCache

::UpdateVertexBuffers() { + auto& flags = maxwell3d.dirty.flags; + if (!maxwell3d.dirty.flags[Dirty::VertexBuffers]) { + return; + } + flags[Dirty::VertexBuffers] = false; + + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + UpdateVertexBuffer(index); + } +} + +template +void BufferCache

::UpdateVertexBuffer(u32 index) { + if (!maxwell3d.dirty.flags[Dirty::VertexBuffer0 + index]) { + return; + } + const auto& array = maxwell3d.regs.vertex_array[index]; + const auto& limit = maxwell3d.regs.vertex_array_limit[index]; + const GPUVAddr gpu_addr_begin = array.StartAddress(); + const GPUVAddr gpu_addr_end = limit.LimitAddress() + 1; + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr_begin); + const u32 address_size = static_cast(gpu_addr_end - gpu_addr_begin); + const u32 size = address_size; // TODO: Analyze stride and number of vertices + if (array.enable == 0 || size == 0 || !cpu_addr) { + vertex_buffers[index] = NULL_BINDING; + return; + } + vertex_buffers[index] = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = FindBuffer(*cpu_addr, size), + }; +} + +template +void BufferCache

::UpdateUniformBuffers(size_t stage) { + ForEachEnabledBit(enabled_uniform_buffers[stage], [&](u32 index) { + Binding& binding = uniform_buffers[stage][index]; + if (binding.buffer_id) { + // Already updated + return; + } + // Mark as dirty + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty_uniform_buffers[stage] |= 1U << index; + } + // Resolve buffer + binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); + }); +} + +template +void BufferCache

::UpdateStorageBuffers(size_t stage) { + const u32 written_mask = written_storage_buffers[stage]; + ForEachEnabledBit(enabled_storage_buffers[stage], [&](u32 index) { + // Resolve buffer + Binding& binding = storage_buffers[stage][index]; + const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); + binding.buffer_id = buffer_id; + // Mark buffer as written if needed + if (((written_mask >> index) & 1) != 0) { + MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); + } + }); +} + +template +void BufferCache

::UpdateTransformFeedbackBuffers() { + if (maxwell3d.regs.tfb_enabled == 0) { + return; + } + for (u32 index = 0; index < NUM_TRANSFORM_FEEDBACK_BUFFERS; ++index) { + UpdateTransformFeedbackBuffer(index); + } +} + +template +void BufferCache

::UpdateTransformFeedbackBuffer(u32 index) { + const auto& binding = maxwell3d.regs.tfb_bindings[index]; + const GPUVAddr gpu_addr = binding.Address() + binding.buffer_offset; + const u32 size = binding.buffer_size; + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (binding.buffer_enable == 0 || size == 0 || !cpu_addr) { + transform_feedback_buffers[index] = NULL_BINDING; + return; + } + const BufferId buffer_id = FindBuffer(*cpu_addr, size); + transform_feedback_buffers[index] = Binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = buffer_id, + }; + MarkWrittenBuffer(buffer_id, *cpu_addr, size); +} + +template +void BufferCache

::UpdateComputeUniformBuffers() { + ForEachEnabledBit(enabled_compute_uniform_buffers, [&](u32 index) { + Binding& binding = compute_uniform_buffers[index]; + binding = NULL_BINDING; + const auto& launch_desc = kepler_compute.launch_description; + if (((launch_desc.const_buffer_enable_mask >> index) & 1) != 0) { + const auto& cbuf = launch_desc.const_buffer_config[index]; + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(cbuf.Address()); + if (cpu_addr) { + binding.cpu_addr = *cpu_addr; + binding.size = cbuf.size; + } + } + binding.buffer_id = FindBuffer(binding.cpu_addr, binding.size); + }); +} + +template +void BufferCache

::UpdateComputeStorageBuffers() { + ForEachEnabledBit(enabled_compute_storage_buffers, [&](u32 index) { + // Resolve buffer + Binding& binding = compute_storage_buffers[index]; + const BufferId buffer_id = FindBuffer(binding.cpu_addr, binding.size); + binding.buffer_id = buffer_id; + // Mark as written if needed + if (((written_compute_storage_buffers >> index) & 1) != 0) { + MarkWrittenBuffer(buffer_id, binding.cpu_addr, binding.size); + } + }); +} + +template +void BufferCache

::MarkWrittenBuffer(BufferId buffer_id, VAddr cpu_addr, u32 size) { + Buffer& buffer = slot_buffers[buffer_id]; + buffer.MarkRegionAsGpuModified(cpu_addr, size); + + const bool is_accuracy_high = Settings::IsGPULevelHigh(); + const bool is_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); + if (!is_accuracy_high || !is_async) { + return; + } + if (std::ranges::find(uncommitted_downloads, buffer_id) != uncommitted_downloads.end()) { + // Already inserted + return; + } + uncommitted_downloads.push_back(buffer_id); +} + +template +BufferId BufferCache

::FindBuffer(VAddr cpu_addr, u32 size) { + if (cpu_addr == 0) { + return NULL_BUFFER_ID; + } + const u64 page = cpu_addr >> PAGE_BITS; + const BufferId buffer_id = page_table[page]; + if (!buffer_id) { + return CreateBuffer(cpu_addr, size); + } + const Buffer& buffer = slot_buffers[buffer_id]; + if (buffer.IsInBounds(cpu_addr, size)) { + return buffer_id; + } + return CreateBuffer(cpu_addr, size); +} + +template +BufferId BufferCache

::CreateBuffer(VAddr cpu_addr, u32 wanted_size) { + std::vector overlap_ids; + VAddr cpu_addr_begin = cpu_addr; + VAddr cpu_addr_end = cpu_addr + wanted_size; + for (; cpu_addr >> PAGE_BITS < Common::DivCeil(cpu_addr_end, PAGE_SIZE); + cpu_addr += PAGE_SIZE) { + const BufferId overlap_id = page_table[cpu_addr >> PAGE_BITS]; + if (!overlap_id) { + continue; + } + Buffer& overlap = slot_buffers[overlap_id]; + if (overlap.IsPicked()) { + continue; + } + overlap.Pick(); + overlap_ids.push_back(overlap_id); + const VAddr overlap_cpu_addr = overlap.CpuAddr(); + if (overlap_cpu_addr < cpu_addr_begin) { + cpu_addr = cpu_addr_begin = overlap_cpu_addr; + } + cpu_addr_end = std::max(cpu_addr_end, overlap_cpu_addr + overlap.SizeBytes()); + } + const u32 size = static_cast(cpu_addr_end - cpu_addr_begin); + const BufferId new_buffer_id = slot_buffers.insert(runtime, rasterizer, cpu_addr_begin, size); + Buffer& new_buffer = slot_buffers[new_buffer_id]; + + for (const BufferId overlap_id : overlap_ids) { + Buffer& overlap = slot_buffers[overlap_id]; + overlap.Unpick(); + + std::vector copies; + const size_t dst_base_offset = overlap.CpuAddr() - new_buffer.CpuAddr(); + overlap.ForEachDownloadRange([&](u64 begin, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = begin, + .dst_offset = dst_base_offset + begin, + .size = range_size, + }); + new_buffer.UnmarkRegionAsCpuModified(begin, range_size); + new_buffer.MarkRegionAsGpuModified(begin, range_size); + }); + if (!copies.empty()) { + runtime.CopyBuffer(slot_buffers[new_buffer_id], overlap, copies); + } + ReplaceBufferDownloads(overlap_id, new_buffer_id); + DeleteBuffer(overlap_id); + } + Register(new_buffer_id); + return new_buffer_id; +} + +template +void BufferCache

::Register(BufferId buffer_id) { + ChangeRegister(buffer_id); +} + +template +void BufferCache

::Unregister(BufferId buffer_id) { + ChangeRegister(buffer_id); +} + +template +template +void BufferCache

::ChangeRegister(BufferId buffer_id) { + const Buffer& buffer = slot_buffers[buffer_id]; + const VAddr cpu_addr_begin = buffer.CpuAddr(); + const VAddr cpu_addr_end = cpu_addr_begin + buffer.SizeBytes(); + const u64 page_begin = cpu_addr_begin / PAGE_SIZE; + const u64 page_end = Common::DivCeil(cpu_addr_end, PAGE_SIZE); + for (u64 page = page_begin; page != page_end; ++page) { + if constexpr (insert) { + page_table[page] = buffer_id; + } else { + page_table[page] = BufferId{}; + } + } +} + +template +void BufferCache

::SynchronizeBuffer(Buffer& buffer, VAddr cpu_addr, u32 size) { + if (buffer.CpuAddr() == 0) { + return; + } + SynchronizeBufferImpl(buffer, cpu_addr, size); +} + +template +void BufferCache

::SynchronizeBufferImpl(Buffer& buffer, VAddr cpu_addr, u32 size) { + boost::container::small_vector copies; + u64 total_size_bytes = 0; + u64 largest_copy = 0; + buffer.ForEachUploadRange(cpu_addr, size, [&](u64 range_offset, u64 range_size) { + copies.push_back(BufferCopy{ + .src_offset = total_size_bytes, + .dst_offset = range_offset, + .size = range_size, + }); + total_size_bytes += range_size; + largest_copy = std::max(largest_copy, range_size); + }); + if (total_size_bytes == 0) { + return; + } + const std::span copies_span(copies.data(), copies.size()); + UploadMemory(buffer, total_size_bytes, largest_copy, copies_span); +} + +template +void BufferCache

::UploadMemory(Buffer& buffer, u64 total_size_bytes, u64 largest_copy, + std::span copies) { + if constexpr (USE_MEMORY_MAPS) { + MappedUploadMemory(buffer, total_size_bytes, copies); + } else { + ImmediateUploadMemory(buffer, largest_copy, copies); + } +} + +template +void BufferCache

::ImmediateUploadMemory(Buffer& buffer, u64 largest_copy, + std::span copies) { + std::span immediate_buffer; + for (const BufferCopy& copy : copies) { + std::span upload_span; + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + if (IsRangeGranular(cpu_addr, copy.size)) { + upload_span = std::span(cpu_memory.GetPointer(cpu_addr), copy.size); + } else { + if (immediate_buffer.empty()) { + immediate_buffer = ImmediateBuffer(largest_copy); + } + cpu_memory.ReadBlockUnsafe(cpu_addr, immediate_buffer.data(), copy.size); + upload_span = immediate_buffer.subspan(0, copy.size); + } + buffer.ImmediateUpload(copy.dst_offset, upload_span); + } +} + +template +void BufferCache

::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes, + std::span copies) { + auto upload_staging = runtime.UploadStagingBuffer(total_size_bytes); + const std::span staging_pointer = upload_staging.mapped_span; + for (const BufferCopy& copy : copies) { + const VAddr cpu_addr = buffer.CpuAddr() + copy.dst_offset; + u8* const src_pointer = staging_pointer.data() + copy.src_offset; + cpu_memory.ReadBlockUnsafe(cpu_addr, src_pointer, copy.size); + } + runtime.CopyBuffer(buffer, upload_staging.buffer, copies); +} + +template +void BufferCache

::DeleteBuffer(BufferId buffer_id) { + const auto scalar_replace = [buffer_id](Binding& binding) { + if (binding.buffer_id == buffer_id) { + binding.buffer_id = BufferId{}; + } + }; + const auto replace = [scalar_replace](std::span bindings) { + std::ranges::for_each(bindings, scalar_replace); + }; + scalar_replace(index_buffer); + replace(vertex_buffers); + std::ranges::for_each(uniform_buffers, replace); + std::ranges::for_each(storage_buffers, replace); + replace(transform_feedback_buffers); + replace(compute_uniform_buffers); + replace(compute_storage_buffers); + std::erase(cached_write_buffer_ids, buffer_id); + + // Mark the whole buffer as CPU written to stop tracking CPU writes + Buffer& buffer = slot_buffers[buffer_id]; + buffer.MarkRegionAsCpuModified(buffer.CpuAddr(), buffer.SizeBytes()); + + Unregister(buffer_id); + delayed_destruction_ring.Push(std::move(slot_buffers[buffer_id])); + + NotifyBufferDeletion(); +} + +template +void BufferCache

::ReplaceBufferDownloads(BufferId old_buffer_id, BufferId new_buffer_id) { + const auto replace = [old_buffer_id, new_buffer_id](std::vector& buffers) { + std::ranges::replace(buffers, old_buffer_id, new_buffer_id); + if (auto it = std::ranges::find(buffers, new_buffer_id); it != buffers.end()) { + buffers.erase(std::remove(it + 1, buffers.end(), new_buffer_id), buffers.end()); + } + }; + replace(uncommitted_downloads); + std::ranges::for_each(committed_downloads, replace); +} + +template +void BufferCache

::NotifyBufferDeletion() { + if constexpr (HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS) { + dirty_uniform_buffers.fill(~u32{0}); + } + auto& flags = maxwell3d.dirty.flags; + flags[Dirty::IndexBuffer] = true; + flags[Dirty::VertexBuffers] = true; + for (u32 index = 0; index < NUM_VERTEX_BUFFERS; ++index) { + flags[Dirty::VertexBuffer0 + index] = true; + } + has_deleted_buffers = true; +} + +template +typename BufferCache

::Binding BufferCache

::StorageBufferBinding(GPUVAddr ssbo_addr) const { + const GPUVAddr gpu_addr = gpu_memory.Read(ssbo_addr); + const u32 size = gpu_memory.Read(ssbo_addr + 8); + const std::optional cpu_addr = gpu_memory.GpuToCpuAddress(gpu_addr); + if (!cpu_addr || size == 0) { + return NULL_BINDING; + } + const Binding binding{ + .cpu_addr = *cpu_addr, + .size = size, + .buffer_id = BufferId{}, + }; + return binding; +} + +template +std::span BufferCache

::ImmediateBufferWithData(VAddr cpu_addr, size_t size) { + u8* const base_pointer = cpu_memory.GetPointer(cpu_addr); + if (IsRangeGranular(cpu_addr, size) || + base_pointer + size == cpu_memory.GetPointer(cpu_addr + size)) { + return std::span(base_pointer, size); + } else { + const std::span span = ImmediateBuffer(size); + cpu_memory.ReadBlockUnsafe(cpu_addr, span.data(), size); + return span; + } +} + +template +std::span BufferCache

::ImmediateBuffer(size_t wanted_capacity) { + if (wanted_capacity > immediate_buffer_capacity) { + immediate_buffer_capacity = wanted_capacity; + immediate_buffer_alloc = std::make_unique(wanted_capacity); + } + return std::span(immediate_buffer_alloc.get(), wanted_capacity); +} + +template +bool BufferCache

::HasFastUniformBufferBound(size_t stage, u32 binding_index) const noexcept { + if constexpr (IS_OPENGL) { + return ((fast_bound_uniform_buffers[stage] >> binding_index) & 1) != 0; + } else { + // Only OpenGL has fast uniform buffers + return false; + } +} + } // namespace VideoCommon diff --git a/src/video_core/command_classes/vic.cpp b/src/video_core/command_classes/vic.cpp index 55e632346..2b7569335 100755 --- a/src/video_core/command_classes/vic.cpp +++ b/src/video_core/command_classes/vic.cpp @@ -110,12 +110,10 @@ void Vic::Execute() { converted_frame_buffer.get(), block_height, 0, 0); gpu.MemoryManager().WriteBlock(output_surface_luma_address, swizzled_data.data(), size); - gpu.Maxwell3D().OnMemoryWrite(); } else { // send pitch linear frame gpu.MemoryManager().WriteBlock(output_surface_luma_address, converted_frame_buf_addr, linear_size); - gpu.Maxwell3D().OnMemoryWrite(); } break; } @@ -163,7 +161,6 @@ void Vic::Execute() { } gpu.MemoryManager().WriteBlock(output_surface_chroma_u_address, chroma_buffer.data(), chroma_buffer.size()); - gpu.Maxwell3D().OnMemoryWrite(); break; } default: diff --git a/src/video_core/dirty_flags.cpp b/src/video_core/dirty_flags.cpp index b1eaac00c..7149af290 100755 --- a/src/video_core/dirty_flags.cpp +++ b/src/video_core/dirty_flags.cpp @@ -12,13 +12,30 @@ #define NUM(field_name) (sizeof(::Tegra::Engines::Maxwell3D::Regs::field_name) / (sizeof(u32))) namespace VideoCommon::Dirty { - +namespace { using Tegra::Engines::Maxwell3D; -void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables) { +void SetupDirtyVertexBuffers(Maxwell3D::DirtyState::Tables& tables) { + static constexpr std::size_t num_array = 3; + for (std::size_t i = 0; i < Maxwell3D::Regs::NumVertexArrays; ++i) { + const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); + const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]); + + FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers); + FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers); + } +} + +void SetupIndexBuffer(Maxwell3D::DirtyState::Tables& tables) { + FillBlock(tables[0], OFF(index_array), NUM(index_array), IndexBuffer); +} + +void SetupDirtyDescriptors(Maxwell3D::DirtyState::Tables& tables) { FillBlock(tables[0], OFF(tic), NUM(tic), Descriptors); FillBlock(tables[0], OFF(tsc), NUM(tsc), Descriptors); +} +void SetupDirtyRenderTargets(Maxwell3D::DirtyState::Tables& tables) { static constexpr std::size_t num_per_rt = NUM(rt[0]); static constexpr std::size_t begin = OFF(rt); static constexpr std::size_t num = num_per_rt * Maxwell3D::Regs::NumRenderTargets; @@ -41,5 +58,13 @@ void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tabl FillBlock(table, OFF(zeta), NUM(zeta), flag); } } +} // Anonymous namespace + +void SetupDirtyFlags(Maxwell3D::DirtyState::Tables& tables) { + SetupDirtyVertexBuffers(tables); + SetupIndexBuffer(tables); + SetupDirtyDescriptors(tables); + SetupDirtyRenderTargets(tables); +} } // namespace VideoCommon::Dirty diff --git a/src/video_core/dirty_flags.h b/src/video_core/dirty_flags.h index 875527ddd..702688ace 100755 --- a/src/video_core/dirty_flags.h +++ b/src/video_core/dirty_flags.h @@ -30,6 +30,12 @@ enum : u8 { ColorBuffer7, ZetaBuffer, + VertexBuffers, + VertexBuffer0, + VertexBuffer31 = VertexBuffer0 + 31, + + IndexBuffer, + LastCommonEntry, }; @@ -47,6 +53,6 @@ void FillBlock(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables, std::size_ FillBlock(tables[1], begin, num, index_b); } -void SetupDirtyRenderTargets(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); +void SetupDirtyFlags(Tegra::Engines::Maxwell3D::DirtyState::Tables& tables); } // namespace VideoCommon::Dirty diff --git a/src/video_core/dma_pusher.cpp b/src/video_core/dma_pusher.cpp index 2c8b20024..8b33c04ab 100755 --- a/src/video_core/dma_pusher.cpp +++ b/src/video_core/dma_pusher.cpp @@ -23,8 +23,6 @@ void DmaPusher::DispatchCalls() { MICROPROFILE_SCOPE(DispatchCalls); gpu.SyncGuestHost(); - // On entering GPU code, assume all memory may be touched by the ARM core. - gpu.Maxwell3D().OnMemoryWrite(); dma_pushbuffer_subindex = 0; diff --git a/src/video_core/engines/fermi_2d.cpp b/src/video_core/engines/fermi_2d.cpp index a01d334ad..0f640fdae 100755 --- a/src/video_core/engines/fermi_2d.cpp +++ b/src/video_core/engines/fermi_2d.cpp @@ -18,8 +18,8 @@ Fermi2D::Fermi2D() { Fermi2D::~Fermi2D() = default; -void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void Fermi2D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } void Fermi2D::CallMethod(u32 method, u32 method_argument, bool is_last_call) { diff --git a/src/video_core/engines/fermi_2d.h b/src/video_core/engines/fermi_2d.h index 0de3280a2..c808a577d 100755 --- a/src/video_core/engines/fermi_2d.h +++ b/src/video_core/engines/fermi_2d.h @@ -38,7 +38,7 @@ public: ~Fermi2D(); /// Binds a rasterizer to this engine. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); /// Write the value to the register identified by method. void CallMethod(u32 method, u32 method_argument, bool is_last_call) override; diff --git a/src/video_core/engines/kepler_compute.cpp b/src/video_core/engines/kepler_compute.cpp index ba387506e..a9b75091e 100755 --- a/src/video_core/engines/kepler_compute.cpp +++ b/src/video_core/engines/kepler_compute.cpp @@ -21,8 +21,8 @@ KeplerCompute::KeplerCompute(Core::System& system_, MemoryManager& memory_manage KeplerCompute::~KeplerCompute() = default; -void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void KeplerCompute::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_call) { @@ -39,7 +39,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal case KEPLER_COMPUTE_REG_INDEX(data_upload): { upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().OnMemoryWrite(); } break; } diff --git a/src/video_core/engines/kepler_compute.h b/src/video_core/engines/kepler_compute.h index 9f0a7b76d..7c40cba38 100755 --- a/src/video_core/engines/kepler_compute.h +++ b/src/video_core/engines/kepler_compute.h @@ -46,7 +46,7 @@ public: ~KeplerCompute(); /// Binds a rasterizer to this engine. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); static constexpr std::size_t NumConstBuffers = 8; diff --git a/src/video_core/engines/kepler_memory.cpp b/src/video_core/engines/kepler_memory.cpp index 9911140e9..560551157 100755 --- a/src/video_core/engines/kepler_memory.cpp +++ b/src/video_core/engines/kepler_memory.cpp @@ -33,7 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call case KEPLERMEMORY_REG_INDEX(data): { upload_state.ProcessData(method_argument, is_last_call); if (is_last_call) { - system.GPU().Maxwell3D().OnMemoryWrite(); } break; } diff --git a/src/video_core/engines/maxwell_3d.cpp b/src/video_core/engines/maxwell_3d.cpp index 9be651e24..152bc8592 100755 --- a/src/video_core/engines/maxwell_3d.cpp +++ b/src/video_core/engines/maxwell_3d.cpp @@ -30,8 +30,8 @@ Maxwell3D::Maxwell3D(Core::System& system_, MemoryManager& memory_manager_) Maxwell3D::~Maxwell3D() = default; -void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void Maxwell3D::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } void Maxwell3D::InitializeRegisterDefaults() { @@ -223,7 +223,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume case MAXWELL3D_REG_INDEX(data_upload): upload_state.ProcessData(argument, is_last_call); if (is_last_call) { - OnMemoryWrite(); } return; case MAXWELL3D_REG_INDEX(fragment_barrier): @@ -570,17 +569,18 @@ std::optional Maxwell3D::GetQueryResult() { } } -void Maxwell3D::ProcessCBBind(std::size_t stage_index) { +void Maxwell3D::ProcessCBBind(size_t stage_index) { // Bind the buffer currently in CB_ADDRESS to the specified index in the desired shader stage. - auto& shader = state.shader_stages[stage_index]; - auto& bind_data = regs.cb_bind[stage_index]; - - ASSERT(bind_data.index < Regs::MaxConstBuffers); - auto& buffer = shader.const_buffers[bind_data.index]; - + const auto& bind_data = regs.cb_bind[stage_index]; + auto& buffer = state.shader_stages[stage_index].const_buffers[bind_data.index]; buffer.enabled = bind_data.valid.Value() != 0; buffer.address = regs.const_buffer.BufferAddress(); buffer.size = regs.const_buffer.cb_size; + + const bool is_enabled = bind_data.valid.Value() != 0; + const GPUVAddr gpu_addr = is_enabled ? regs.const_buffer.BufferAddress() : 0; + const u32 size = is_enabled ? regs.const_buffer.cb_size : 0; + rasterizer->BindGraphicsUniformBuffer(stage_index, bind_data.index, gpu_addr, size); } void Maxwell3D::ProcessCBData(u32 value) { @@ -635,7 +635,6 @@ void Maxwell3D::FinishCBData() { const u32 id = cb_data_state.id; memory_manager.WriteBlock(address, cb_data_state.buffer[id].data(), size); - OnMemoryWrite(); cb_data_state.id = null_cb_data; cb_data_state.current = null_cb_data; diff --git a/src/video_core/engines/maxwell_3d.h b/src/video_core/engines/maxwell_3d.h index 326b32228..ac18fb917 100755 --- a/src/video_core/engines/maxwell_3d.h +++ b/src/video_core/engines/maxwell_3d.h @@ -55,7 +55,7 @@ public: ~Maxwell3D(); /// Binds a rasterizer to this engine. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); /// Register structure of the Maxwell3D engine. /// TODO(Subv): This structure will need to be made bigger as more registers are discovered. @@ -1314,8 +1314,7 @@ public: GPUVAddr LimitAddress() const { return static_cast((static_cast(limit_high) << 32) | - limit_low) + - 1; + limit_low); } } vertex_array_limit[NumVertexArrays]; @@ -1403,6 +1402,7 @@ public: }; std::array shader_stages; + u32 current_instance = 0; ///< Current instance to be used to simulate instanced rendering. }; @@ -1452,11 +1452,6 @@ public: return *rasterizer; } - /// Notify a memory write has happened. - void OnMemoryWrite() { - dirty.flags |= dirty.on_write_stores; - } - enum class MMEDrawMode : u32 { Undefined, Array, @@ -1478,7 +1473,6 @@ public: using Tables = std::array; Flags flags; - Flags on_write_stores; Tables tables{}; } dirty; @@ -1541,7 +1535,7 @@ private: void FinishCBData(); /// Handles a write to the CB_BIND register. - void ProcessCBBind(std::size_t stage_index); + void ProcessCBBind(size_t stage_index); /// Handles a write to the VERTEX_END_GL register, triggering a draw. void DrawArrays(); diff --git a/src/video_core/engines/maxwell_dma.cpp b/src/video_core/engines/maxwell_dma.cpp index ba750748c..a2f19559f 100755 --- a/src/video_core/engines/maxwell_dma.cpp +++ b/src/video_core/engines/maxwell_dma.cpp @@ -60,9 +60,6 @@ void MaxwellDMA::Launch() { return; } - // All copies here update the main memory, so mark all rasterizer states as invalid. - system.GPU().Maxwell3D().OnMemoryWrite(); - if (is_src_pitch && is_dst_pitch) { CopyPitchToPitch(); } else { diff --git a/src/video_core/fence_manager.h b/src/video_core/fence_manager.h index 3512283ff..f055b61e9 100755 --- a/src/video_core/fence_manager.h +++ b/src/video_core/fence_manager.h @@ -143,22 +143,26 @@ private: } bool ShouldWait() const { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; return texture_cache.ShouldWaitAsyncFlushes() || buffer_cache.ShouldWaitAsyncFlushes() || query_cache.ShouldWaitAsyncFlushes(); } bool ShouldFlush() const { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; return texture_cache.HasUncommittedFlushes() || buffer_cache.HasUncommittedFlushes() || query_cache.HasUncommittedFlushes(); } void PopAsyncFlushes() { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; texture_cache.PopAsyncFlushes(); buffer_cache.PopAsyncFlushes(); query_cache.PopAsyncFlushes(); } void CommitAsyncFlushes() { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; texture_cache.CommitAsyncFlushes(); buffer_cache.CommitAsyncFlushes(); query_cache.CommitAsyncFlushes(); diff --git a/src/video_core/gpu.cpp b/src/video_core/gpu.cpp index 6ab06775f..2a9bd4121 100755 --- a/src/video_core/gpu.cpp +++ b/src/video_core/gpu.cpp @@ -44,8 +44,8 @@ GPU::~GPU() = default; void GPU::BindRenderer(std::unique_ptr renderer_) { renderer = std::move(renderer_); + rasterizer = renderer->ReadRasterizer(); - VideoCore::RasterizerInterface& rasterizer = renderer->Rasterizer(); memory_manager->BindRasterizer(rasterizer); maxwell_3d->BindRasterizer(rasterizer); fermi_2d->BindRasterizer(rasterizer); @@ -171,7 +171,7 @@ void GPU::TickWork() { const std::size_t size = request.size; flush_requests.pop_front(); flush_request_mutex.unlock(); - renderer->Rasterizer().FlushRegion(addr, size); + rasterizer->FlushRegion(addr, size); current_flush_fence.store(fence); flush_request_mutex.lock(); } @@ -193,11 +193,11 @@ u64 GPU::GetTicks() const { } void GPU::FlushCommands() { - renderer->Rasterizer().FlushCommands(); + rasterizer->FlushCommands(); } void GPU::SyncGuestHost() { - renderer->Rasterizer().SyncGuestHost(); + rasterizer->SyncGuestHost(); } enum class GpuSemaphoreOperation { diff --git a/src/video_core/gpu.h b/src/video_core/gpu.h index b4ce6b154..b2ee45496 100755 --- a/src/video_core/gpu.h +++ b/src/video_core/gpu.h @@ -366,6 +366,7 @@ protected: std::unique_ptr dma_pusher; std::unique_ptr cdma_pusher; std::unique_ptr renderer; + VideoCore::RasterizerInterface* rasterizer = nullptr; const bool use_nvdec; private: diff --git a/src/video_core/gpu_thread.cpp b/src/video_core/gpu_thread.cpp index 7e490bcc3..50319f1d5 100755 --- a/src/video_core/gpu_thread.cpp +++ b/src/video_core/gpu_thread.cpp @@ -38,6 +38,7 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, } auto current_context = context.Acquire(); + VideoCore::RasterizerInterface* const rasterizer = renderer.ReadRasterizer(); CommandDataContainer next; while (state.is_running) { @@ -52,13 +53,13 @@ static void RunThread(Core::System& system, VideoCore::RendererBase& renderer, } else if (const auto* data = std::get_if(&next.data)) { renderer.SwapBuffers(data->framebuffer ? &*data->framebuffer : nullptr); } else if (std::holds_alternative(next.data)) { - renderer.Rasterizer().ReleaseFences(); + rasterizer->ReleaseFences(); } else if (std::holds_alternative(next.data)) { system.GPU().TickWork(); } else if (const auto* flush = std::get_if(&next.data)) { - renderer.Rasterizer().FlushRegion(flush->addr, flush->size); + rasterizer->FlushRegion(flush->addr, flush->size); } else if (const auto* invalidate = std::get_if(&next.data)) { - renderer.Rasterizer().OnCPUWrite(invalidate->addr, invalidate->size); + rasterizer->OnCPUWrite(invalidate->addr, invalidate->size); } else if (std::holds_alternative(next.data)) { return; } else { @@ -84,6 +85,7 @@ ThreadManager::~ThreadManager() { void ThreadManager::StartThread(VideoCore::RendererBase& renderer, Core::Frontend::GraphicsContext& context, Tegra::DmaPusher& dma_pusher, Tegra::CDmaPusher& cdma_pusher) { + rasterizer = renderer.ReadRasterizer(); thread = std::thread(RunThread, std::ref(system), std::ref(renderer), std::ref(context), std::ref(dma_pusher), std::ref(state), std::ref(cdma_pusher)); } @@ -129,12 +131,12 @@ void ThreadManager::FlushRegion(VAddr addr, u64 size) { } void ThreadManager::InvalidateRegion(VAddr addr, u64 size) { - system.Renderer().Rasterizer().OnCPUWrite(addr, size); + rasterizer->OnCPUWrite(addr, size); } void ThreadManager::FlushAndInvalidateRegion(VAddr addr, u64 size) { // Skip flush on asynch mode, as FlushAndInvalidateRegion is not used for anything too important - system.Renderer().Rasterizer().OnCPUWrite(addr, size); + rasterizer->OnCPUWrite(addr, size); } void ThreadManager::WaitIdle() const { diff --git a/src/video_core/gpu_thread.h b/src/video_core/gpu_thread.h index 2775629e7..4cd951169 100755 --- a/src/video_core/gpu_thread.h +++ b/src/video_core/gpu_thread.h @@ -27,6 +27,7 @@ class System; } // namespace Core namespace VideoCore { +class RasterizerInterface; class RendererBase; } // namespace VideoCore @@ -151,11 +152,12 @@ private: /// Pushes a command to be executed by the GPU thread u64 PushCommand(CommandData&& command_data); - SynchState state; Core::System& system; - std::thread thread; - std::thread::id thread_id; const bool is_async; + VideoCore::RasterizerInterface* rasterizer = nullptr; + + SynchState state; + std::thread thread; }; } // namespace VideoCommon::GPUThread diff --git a/src/video_core/host_shaders/CMakeLists.txt b/src/video_core/host_shaders/CMakeLists.txt index 4c7399d5a..0199e9275 100755 --- a/src/video_core/host_shaders/CMakeLists.txt +++ b/src/video_core/host_shaders/CMakeLists.txt @@ -12,7 +12,6 @@ set(SHADER_FILES vulkan_blit_depth_stencil.frag vulkan_present.frag vulkan_present.vert - vulkan_quad_array.comp vulkan_quad_indexed.comp vulkan_uint8.comp ) diff --git a/src/video_core/host_shaders/vulkan_uint8.comp b/src/video_core/host_shaders/vulkan_uint8.comp index ad74d7af9..872291670 100755 --- a/src/video_core/host_shaders/vulkan_uint8.comp +++ b/src/video_core/host_shaders/vulkan_uint8.comp @@ -16,9 +16,16 @@ layout (std430, set = 0, binding = 1) writeonly buffer OutputBuffer { uint16_t output_indexes[]; }; +uint AssembleIndex(uint id) { + // Most primitive restart indices are 0xFF + // Hardcode this to 0xFF for now + uint index = uint(input_indexes[id]); + return index == 0xFF ? 0xFFFF : index; +} + void main() { uint id = gl_GlobalInvocationID.x; if (id < input_indexes.length()) { - output_indexes[id] = uint16_t(input_indexes[id]); + output_indexes[id] = uint16_t(AssembleIndex(id)); } } diff --git a/src/video_core/memory_manager.cpp b/src/video_core/memory_manager.cpp index 65feff588..20e90be3a 100755 --- a/src/video_core/memory_manager.cpp +++ b/src/video_core/memory_manager.cpp @@ -20,8 +20,8 @@ MemoryManager::MemoryManager(Core::System& system_) MemoryManager::~MemoryManager() = default; -void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface& rasterizer_) { - rasterizer = &rasterizer_; +void MemoryManager::BindRasterizer(VideoCore::RasterizerInterface* rasterizer_) { + rasterizer = rasterizer_; } GPUVAddr MemoryManager::UpdateRange(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size) { @@ -101,6 +101,25 @@ void MemoryManager::TryUnlockPage(PageEntry page_entry, std::size_t size) { .IsSuccess()); } +void MemoryManager::UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size) { + if (!size) { + return; + } + + const std::optional cpu_addr = GpuToCpuAddress(gpu_addr); + ASSERT(cpu_addr); + rasterizer->InvalidateExceptTextureCache(*cpu_addr, size); + cache_invalidate_queue.push_back({*cpu_addr, size}); + + UpdateRange(gpu_addr, PageEntry::State::Unmapped, size); +} + +void MemoryManager::InvalidateQueuedCaches() { + for (const auto& entry : cache_invalidate_queue) { + rasterizer->InvalidateTextureCache(entry.first, entry.second); + } + cache_invalidate_queue.clear(); +} PageEntry MemoryManager::GetPageEntry(GPUVAddr gpu_addr) const { return page_table[PageEntryIndex(gpu_addr)]; } diff --git a/src/video_core/memory_manager.h b/src/video_core/memory_manager.h index c35e57689..a57d66022 100755 --- a/src/video_core/memory_manager.h +++ b/src/video_core/memory_manager.h @@ -72,7 +72,7 @@ public: ~MemoryManager(); /// Binds a renderer to the memory manager. - void BindRasterizer(VideoCore::RasterizerInterface& rasterizer); + void BindRasterizer(VideoCore::RasterizerInterface* rasterizer); [[nodiscard]] std::optional GpuToCpuAddress(GPUVAddr addr) const; @@ -121,6 +121,14 @@ public: [[nodiscard]] GPUVAddr Allocate(std::size_t size, std::size_t align); void Unmap(GPUVAddr gpu_addr, std::size_t size); + /** + * Some Decoded NVDEC frames require that texture cache does not get invalidated. + * UnmapVicFrame defers the texture cache invalidation until the stream ends + * by invoking InvalidateQueuedCaches to invalidate all frame texture caches. + */ + void UnmapVicFrame(GPUVAddr gpu_addr, std::size_t size); + void InvalidateQueuedCaches(); + private: [[nodiscard]] PageEntry GetPageEntry(GPUVAddr gpu_addr) const; void SetPageEntry(GPUVAddr gpu_addr, PageEntry page_entry, std::size_t size = page_size); @@ -150,6 +158,7 @@ private: VideoCore::RasterizerInterface* rasterizer = nullptr; std::vector page_table; + std::vector> cache_invalidate_queue; }; } // namespace Tegra diff --git a/src/video_core/rasterizer_interface.h b/src/video_core/rasterizer_interface.h index 0cb0f387d..afc7ff4ae 100755 --- a/src/video_core/rasterizer_interface.h +++ b/src/video_core/rasterizer_interface.h @@ -7,6 +7,7 @@ #include #include #include +#include #include "common/common_types.h" #include "video_core/engines/fermi_2d.h" #include "video_core/gpu.h" @@ -49,6 +50,10 @@ public: /// Records a GPU query and caches it virtual void Query(GPUVAddr gpu_addr, QueryType type, std::optional timestamp) = 0; + /// Signal an uniform buffer binding + virtual void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) = 0; + /// Signal a GPU based semaphore as a fence virtual void SignalSemaphore(GPUVAddr addr, u32 value) = 0; @@ -64,6 +69,12 @@ public: /// Notify rasterizer that any caches of the specified region should be flushed to Switch memory virtual void FlushRegion(VAddr addr, u64 size) = 0; + /// Notify rasterizer to flush the texture cache to Switch memory + virtual void InvalidateExceptTextureCache(VAddr addr, u64 size) = 0; + + /// Notify rasterizer to invalidate the texture cache + virtual void InvalidateTextureCache(VAddr addr, u64 size) = 0; + /// Check if the the specified memory area requires flushing to CPU Memory. virtual bool MustFlushRegion(VAddr addr, u64 size) = 0; diff --git a/src/video_core/renderer_base.h b/src/video_core/renderer_base.h index 51dde8eb5..320ee8d30 100755 --- a/src/video_core/renderer_base.h +++ b/src/video_core/renderer_base.h @@ -37,15 +37,11 @@ public: std::unique_ptr context); virtual ~RendererBase(); - /// Initialize the renderer - [[nodiscard]] virtual bool Init() = 0; - - /// Shutdown the renderer - virtual void ShutDown() = 0; - /// Finalize rendering the guest frame and draw into the presentation texture virtual void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) = 0; + [[nodiscard]] virtual RasterizerInterface* ReadRasterizer() = 0; + // Getter/setter functions: // ------------------------ @@ -57,14 +53,6 @@ public: return m_current_frame; } - [[nodiscard]] RasterizerInterface& Rasterizer() { - return *rasterizer; - } - - [[nodiscard]] const RasterizerInterface& Rasterizer() const { - return *rasterizer; - } - [[nodiscard]] Core::Frontend::GraphicsContext& Context() { return *context; } @@ -98,7 +86,6 @@ public: protected: Core::Frontend::EmuWindow& render_window; ///< Reference to the render window handle. - std::unique_ptr rasterizer; std::unique_ptr context; f32 m_current_fps = 0.0f; ///< Current framerate, should be set by the renderer int m_current_frame = 0; ///< Current frame, should be set by the renderer diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.cpp b/src/video_core/renderer_opengl/gl_buffer_cache.cpp index 5772cad87..889ad6c56 100755 --- a/src/video_core/renderer_opengl/gl_buffer_cache.cpp +++ b/src/video_core/renderer_opengl/gl_buffer_cache.cpp @@ -2,98 +2,235 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include +#include -#include - -#include "common/assert.h" -#include "common/microprofile.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/engines/maxwell_3d.h" -#include "video_core/rasterizer_interface.h" #include "video_core/renderer_opengl/gl_buffer_cache.h" #include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_rasterizer.h" -#include "video_core/renderer_opengl/gl_resource_manager.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_instance.h" +#include "video_core/vulkan_common/vulkan_library.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" namespace OpenGL { +namespace { +struct BindlessSSBO { + GLuint64EXT address; + GLsizei length; + GLsizei padding; +}; +static_assert(sizeof(BindlessSSBO) == sizeof(GLuint) * 4); -using Maxwell = Tegra::Engines::Maxwell3D::Regs; +constexpr std::array PROGRAM_LUT{ + GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, + GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, +}; +} // Anonymous namespace -MICROPROFILE_DEFINE(OpenGL_Buffer_Download, "OpenGL", "Buffer Download", MP_RGB(192, 192, 128)); +Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) + : VideoCommon::BufferBase(null_params) {} -Buffer::Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_) - : BufferBlock{cpu_addr_, size_} { - gl_buffer.Create(); - glNamedBufferData(gl_buffer.handle, static_cast(size_), nullptr, GL_DYNAMIC_DRAW); - if (device_.UseAssemblyShaders() || device_.HasVertexBufferUnifiedMemory()) { - glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_WRITE); - glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); +Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, + VAddr cpu_addr_, u64 size_bytes_) + : VideoCommon::BufferBase(rasterizer_, cpu_addr_, size_bytes_) { + buffer.Create(); + const std::string name = fmt::format("Buffer 0x{:x}", CpuAddr()); + glObjectLabel(GL_BUFFER, buffer.handle, static_cast(name.size()), name.data()); + if (runtime.device.UseAssemblyShaders()) { + CreateMemoryObjects(runtime); + glNamedBufferStorageMemEXT(buffer.handle, SizeBytes(), memory_commit.ExportOpenGLHandle(), + memory_commit.Offset()); + } else { + glNamedBufferData(buffer.handle, SizeBytes(), nullptr, GL_DYNAMIC_DRAW); + } + if (runtime.has_unified_vertex_buffers) { + glGetNamedBufferParameterui64vNV(buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &address); } } -Buffer::~Buffer() = default; - -void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { - glNamedBufferSubData(Handle(), static_cast(offset), - static_cast(data_size), data); +void Buffer::ImmediateUpload(size_t offset, std::span data) noexcept { + glNamedBufferSubData(buffer.handle, static_cast(offset), + static_cast(data.size_bytes()), data.data()); } -void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { - MICROPROFILE_SCOPE(OpenGL_Buffer_Download); - const GLsizeiptr gl_size = static_cast(data_size); - const GLintptr gl_offset = static_cast(offset); - if (read_buffer.handle == 0) { - read_buffer.Create(); - glNamedBufferData(read_buffer.handle, static_cast(Size()), nullptr, - GL_STREAM_READ); - } - glMemoryBarrier(GL_BUFFER_UPDATE_BARRIER_BIT); - glCopyNamedBufferSubData(gl_buffer.handle, read_buffer.handle, gl_offset, gl_offset, gl_size); - glGetNamedBufferSubData(read_buffer.handle, gl_offset, gl_size, data); +void Buffer::ImmediateDownload(size_t offset, std::span data) noexcept { + glGetNamedBufferSubData(buffer.handle, static_cast(offset), + static_cast(data.size_bytes()), data.data()); } -void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t copy_size) { - glCopyNamedBufferSubData(src.Handle(), Handle(), static_cast(src_offset), - static_cast(dst_offset), static_cast(copy_size)); -} - -OGLBufferCache::OGLBufferCache(VideoCore::RasterizerInterface& rasterizer_, - Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, - const Device& device_, OGLStreamBuffer& stream_buffer_, - StateTracker& state_tracker) - : GenericBufferCache{rasterizer_, gpu_memory_, cpu_memory_, stream_buffer_}, device{device_} { - if (!device.HasFastBufferSubData()) { +void Buffer::MakeResident(GLenum access) noexcept { + // Abuse GLenum's order to exit early + // GL_NONE (default) < GL_READ_ONLY < GL_READ_WRITE + if (access <= current_residency_access || buffer.handle == 0) { return; } + if (std::exchange(current_residency_access, access) != GL_NONE) { + // If the buffer is already resident, remove its residency before promoting it + glMakeNamedBufferNonResidentNV(buffer.handle); + } + glMakeNamedBufferResidentNV(buffer.handle, access); +} - static constexpr GLsizeiptr size = static_cast(Maxwell::MaxConstBufferSize); - glCreateBuffers(static_cast(std::size(cbufs)), std::data(cbufs)); - for (const GLuint cbuf : cbufs) { - glNamedBufferData(cbuf, size, nullptr, GL_STREAM_DRAW); +GLuint Buffer::SubBuffer(u32 offset) { + if (offset == 0) { + return buffer.handle; + } + for (const auto& [sub_buffer, sub_offset] : subs) { + if (sub_offset == offset) { + return sub_buffer.handle; + } + } + OGLBuffer sub_buffer; + sub_buffer.Create(); + glNamedBufferStorageMemEXT(sub_buffer.handle, SizeBytes() - offset, + memory_commit.ExportOpenGLHandle(), memory_commit.Offset() + offset); + return subs.emplace_back(std::move(sub_buffer), offset).first.handle; +} + +void Buffer::CreateMemoryObjects(BufferCacheRuntime& runtime) { + auto& allocator = runtime.vulkan_memory_allocator; + auto& device = runtime.vulkan_device->GetLogical(); + auto vulkan_buffer = device.CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = SizeBytes(), + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + const VkMemoryRequirements requirements = device.GetBufferMemoryRequirements(*vulkan_buffer); + memory_commit = allocator->Commit(requirements, Vulkan::MemoryUsage::DeviceLocal); +} + +BufferCacheRuntime::BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_, + Vulkan::MemoryAllocator* vulkan_memory_allocator_) + : device{device_}, vulkan_device{vulkan_device_}, + vulkan_memory_allocator{vulkan_memory_allocator_}, + stream_buffer{device.HasFastBufferSubData() ? std::nullopt + : std::make_optional()} { + GLint gl_max_attributes; + glGetIntegerv(GL_MAX_VERTEX_ATTRIBS, &gl_max_attributes); + max_attributes = static_cast(gl_max_attributes); + use_assembly_shaders = device.UseAssemblyShaders(); + has_unified_vertex_buffers = device.HasVertexBufferUnifiedMemory(); + + for (auto& stage_uniforms : fast_uniforms) { + for (OGLBuffer& buffer : stage_uniforms) { + buffer.Create(); + glNamedBufferData(buffer.handle, BufferCache::SKIP_CACHE_SIZE, nullptr, GL_STREAM_DRAW); + } } } -OGLBufferCache::~OGLBufferCache() { - glDeleteBuffers(static_cast(std::size(cbufs)), std::data(cbufs)); +void BufferCacheRuntime::CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span copies) { + for (const VideoCommon::BufferCopy& copy : copies) { + glCopyNamedBufferSubData( + src_buffer.Handle(), dst_buffer.Handle(), static_cast(copy.src_offset), + static_cast(copy.dst_offset), static_cast(copy.size)); + } } -std::shared_ptr OGLBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared(device, cpu_addr, size); +void BufferCacheRuntime::BindIndexBuffer(Buffer& buffer, u32 offset, u32 size) { + if (has_unified_vertex_buffers) { + buffer.MakeResident(GL_READ_ONLY); + glBufferAddressRangeNV(GL_ELEMENT_ARRAY_ADDRESS_NV, 0, buffer.HostGpuAddr() + offset, + static_cast(size)); + } else { + glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, buffer.Handle()); + index_buffer_offset = offset; + } } -OGLBufferCache::BufferInfo OGLBufferCache::GetEmptyBuffer(std::size_t) { - return {0, 0, 0}; +void BufferCacheRuntime::BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, + u32 stride) { + if (index >= max_attributes) { + return; + } + if (has_unified_vertex_buffers) { + buffer.MakeResident(GL_READ_ONLY); + glBindVertexBuffer(index, 0, 0, static_cast(stride)); + glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, index, + buffer.HostGpuAddr() + offset, static_cast(size)); + } else { + glBindVertexBuffer(index, buffer.Handle(), static_cast(offset), + static_cast(stride)); + } } -OGLBufferCache::BufferInfo OGLBufferCache::ConstBufferUpload(const void* raw_pointer, - std::size_t size) { - DEBUG_ASSERT(cbuf_cursor < std::size(cbufs)); - const GLuint cbuf = cbufs[cbuf_cursor++]; +void BufferCacheRuntime::BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, + u32 offset, u32 size) { + if (use_assembly_shaders) { + const GLuint sub_buffer = buffer.SubBuffer(offset); + glBindBufferRangeNV(PABO_LUT[stage], binding_index, sub_buffer, 0, + static_cast(size)); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, buffer.Handle(), + static_cast(offset), static_cast(size)); + } +} - glNamedBufferSubData(cbuf, 0, static_cast(size), raw_pointer); - return {cbuf, 0, 0}; +void BufferCacheRuntime::BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, + u32 size) { + if (use_assembly_shaders) { + glBindBufferRangeNV(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding_index, + buffer.SubBuffer(offset), 0, static_cast(size)); + } else { + glBindBufferRange(GL_UNIFORM_BUFFER, binding_index, buffer.Handle(), + static_cast(offset), static_cast(size)); + } +} + +void BufferCacheRuntime::BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, + u32 offset, u32 size, bool is_written) { + if (use_assembly_shaders) { + const BindlessSSBO ssbo{ + .address = buffer.HostGpuAddr() + offset, + .length = static_cast(size), + .padding = 0, + }; + buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); + glProgramLocalParametersI4uivNV(PROGRAM_LUT[stage], binding_index, 1, + reinterpret_cast(&ssbo)); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).shader_storage_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, buffer.Handle(), + static_cast(offset), static_cast(size)); + } +} + +void BufferCacheRuntime::BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, + u32 size, bool is_written) { + if (use_assembly_shaders) { + const BindlessSSBO ssbo{ + .address = buffer.HostGpuAddr() + offset, + .length = static_cast(size), + .padding = 0, + }; + buffer.MakeResident(is_written ? GL_READ_WRITE : GL_READ_ONLY); + glProgramLocalParametersI4uivNV(GL_COMPUTE_PROGRAM_NV, binding_index, 1, + reinterpret_cast(&ssbo)); + } else if (size == 0) { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, 0, 0, 0); + } else { + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding_index, buffer.Handle(), + static_cast(offset), static_cast(size)); + } +} + +void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, + u32 size) { + glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, index, buffer.Handle(), + static_cast(offset), static_cast(size)); } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_buffer_cache.h b/src/video_core/renderer_opengl/gl_buffer_cache.h index 17ee90316..f4d8871a9 100755 --- a/src/video_core/renderer_opengl/gl_buffer_cache.h +++ b/src/video_core/renderer_opengl/gl_buffer_cache.h @@ -5,79 +5,167 @@ #pragma once #include -#include +#include +#include "common/alignment.h" #include "common/common_types.h" +#include "common/dynamic_library.h" #include "video_core/buffer_cache/buffer_cache.h" -#include "video_core/engines/maxwell_3d.h" +#include "video_core/rasterizer_interface.h" +#include "video_core/renderer_opengl/gl_device.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" -namespace Core { -class System; -} +namespace Vulkan { +class Device; +class MemoryAllocator; +} // namespace Vulkan namespace OpenGL { -class Device; -class OGLStreamBuffer; -class RasterizerOpenGL; -class StateTracker; +class BufferCacheRuntime; -class Buffer : public VideoCommon::BufferBlock { +class Buffer : public VideoCommon::BufferBase { public: - explicit Buffer(const Device& device_, VAddr cpu_addr_, std::size_t size_); - ~Buffer(); + explicit Buffer(BufferCacheRuntime&, VideoCore::RasterizerInterface& rasterizer, VAddr cpu_addr, + u64 size_bytes); + explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams); - void Upload(std::size_t offset, std::size_t data_size, const u8* data); + void ImmediateUpload(size_t offset, std::span data) noexcept; - void Download(std::size_t offset, std::size_t data_size, u8* data); + void ImmediateDownload(size_t offset, std::span data) noexcept; - void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t copy_size); + void MakeResident(GLenum access) noexcept; - GLuint Handle() const noexcept { - return gl_buffer.handle; + [[nodiscard]] GLuint SubBuffer(u32 offset); + + [[nodiscard]] GLuint64EXT HostGpuAddr() const noexcept { + return address; } - u64 Address() const noexcept { - return gpu_address; + [[nodiscard]] GLuint Handle() const noexcept { + return buffer.handle; } private: - OGLBuffer gl_buffer; - OGLBuffer read_buffer; - u64 gpu_address = 0; + void CreateMemoryObjects(BufferCacheRuntime& runtime); + + GLuint64EXT address = 0; + Vulkan::MemoryCommit memory_commit; + OGLBuffer buffer; + GLenum current_residency_access = GL_NONE; + std::vector> subs; }; -using GenericBufferCache = VideoCommon::BufferCache; -class OGLBufferCache final : public GenericBufferCache { +class BufferCacheRuntime { + friend Buffer; + public: - explicit OGLBufferCache(VideoCore::RasterizerInterface& rasterizer, - Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, - const Device& device, OGLStreamBuffer& stream_buffer, - StateTracker& state_tracker); - ~OGLBufferCache(); + static constexpr u8 INVALID_BINDING = std::numeric_limits::max(); - BufferInfo GetEmptyBuffer(std::size_t) override; + explicit BufferCacheRuntime(const Device& device_, const Vulkan::Device* vulkan_device_, + Vulkan::MemoryAllocator* vulkan_memory_allocator_); - void Acquire() noexcept { - cbuf_cursor = 0; + void CopyBuffer(Buffer& dst_buffer, Buffer& src_buffer, + std::span copies); + + void BindIndexBuffer(Buffer& buffer, u32 offset, u32 size); + + void BindVertexBuffer(u32 index, Buffer& buffer, u32 offset, u32 size, u32 stride); + + void BindUniformBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size); + + void BindComputeUniformBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size); + + void BindStorageBuffer(size_t stage, u32 binding_index, Buffer& buffer, u32 offset, u32 size, + bool is_written); + + void BindComputeStorageBuffer(u32 binding_index, Buffer& buffer, u32 offset, u32 size, + bool is_written); + + void BindTransformFeedbackBuffer(u32 index, Buffer& buffer, u32 offset, u32 size); + + void BindFastUniformBuffer(size_t stage, u32 binding_index, u32 size) { + if (use_assembly_shaders) { + const GLuint handle = fast_uniforms[stage][binding_index].handle; + const GLsizeiptr gl_size = static_cast(size); + glBindBufferRangeNV(PABO_LUT[stage], binding_index, handle, 0, gl_size); + } else { + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, + fast_uniforms[stage][binding_index].handle, 0, + static_cast(size)); + } } -protected: - std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) override; + void PushFastUniformBuffer(size_t stage, u32 binding_index, std::span data) { + if (use_assembly_shaders) { + glProgramBufferParametersIuivNV( + PABO_LUT[stage], binding_index, 0, + static_cast(data.size_bytes() / sizeof(GLuint)), + reinterpret_cast(data.data())); + } else { + glNamedBufferSubData(fast_uniforms[stage][binding_index].handle, 0, + static_cast(data.size_bytes()), data.data()); + } + } - BufferInfo ConstBufferUpload(const void* raw_pointer, std::size_t size) override; + std::span BindMappedUniformBuffer(size_t stage, u32 binding_index, u32 size) noexcept { + const auto [mapped_span, offset] = stream_buffer->Request(static_cast(size)); + const GLuint base_binding = device.GetBaseBindings(stage).uniform_buffer; + const GLuint binding = base_binding + binding_index; + glBindBufferRange(GL_UNIFORM_BUFFER, binding, stream_buffer->Handle(), + static_cast(offset), static_cast(size)); + return mapped_span; + } + + [[nodiscard]] const GLvoid* IndexOffset() const noexcept { + return reinterpret_cast(static_cast(index_buffer_offset)); + } + + [[nodiscard]] bool HasFastBufferSubData() const noexcept { + return device.HasFastBufferSubData(); + } private: - static constexpr std::size_t NUM_CBUFS = Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; + static constexpr std::array PABO_LUT{ + GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, + GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, + GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, + }; const Device& device; + const Vulkan::Device* vulkan_device; + Vulkan::MemoryAllocator* vulkan_memory_allocator; + std::optional stream_buffer; - std::size_t cbuf_cursor = 0; - std::array cbufs{}; + u32 max_attributes = 0; + + bool use_assembly_shaders = false; + bool has_unified_vertex_buffers = false; + + std::array, + VideoCommon::NUM_STAGES> + fast_uniforms; + + u32 index_buffer_offset = 0; }; +struct BufferCacheParams { + using Runtime = OpenGL::BufferCacheRuntime; + using Buffer = OpenGL::Buffer; + + static constexpr bool IS_OPENGL = true; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = true; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = true; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = true; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = true; + static constexpr bool USE_MEMORY_MAPS = false; +}; + +using BufferCache = VideoCommon::BufferCache; + } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_device.cpp b/src/video_core/renderer_opengl/gl_device.cpp index 81b71edfb..4e3e2c1b0 100755 --- a/src/video_core/renderer_opengl/gl_device.cpp +++ b/src/video_core/renderer_opengl/gl_device.cpp @@ -21,9 +21,7 @@ #include "video_core/renderer_opengl/gl_resource_manager.h" namespace OpenGL { - namespace { - // One uniform block is reserved for emulation purposes constexpr u32 ReservedUniformBlocks = 1; @@ -198,10 +196,18 @@ bool IsASTCSupported() { return nsight || HasExtension(extensions, "GL_EXT_debug_tool"); } +[[nodiscard]] std::string UuidString(std::span uuid) { + return fmt::format("{:x}{:x}{:x}{:x}-{:x}{:x}-{:x}{:x}-{:x}{:x}-{:x}{:x}{:x}{:x}{:x}", uuid[0], + uuid[1], uuid[2], uuid[3], uuid[4], uuid[5], uuid[6], uuid[7], uuid[8], + uuid[9], uuid[10], uuid[11], uuid[12], uuid[13], uuid[14], uuid[15]); +} } // Anonymous namespace -Device::Device() - : max_uniform_buffers{BuildMaxUniformBuffers()}, base_bindings{BuildBaseBindings()} { +Device::Device(bool has_vulkan_instance) { + if (!GLAD_GL_VERSION_4_3) { + LOG_ERROR(Render_OpenGL, "OpenGL 4.3 is not available"); + throw std::runtime_error{"Insufficient version"}; + } const std::string_view vendor = reinterpret_cast(glGetString(GL_VENDOR)); const std::string_view version = reinterpret_cast(glGetString(GL_VERSION)); const std::vector extensions = GetExtensions(); @@ -217,6 +223,9 @@ Device::Device() "Beta driver 443.24 is known to have issues. There might be performance issues."); disable_fast_buffer_sub_data = true; } + + max_uniform_buffers = BuildMaxUniformBuffers(); + base_bindings = BuildBaseBindings(); uniform_buffer_alignment = GetInteger(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT); shader_storage_alignment = GetInteger(GL_SHADER_STORAGE_BUFFER_OFFSET_ALIGNMENT); max_vertex_attributes = GetInteger(GL_MAX_VERTEX_ATTRIBS); @@ -243,7 +252,8 @@ Device::Device() use_assembly_shaders = Settings::values.use_assembly_shaders.GetValue() && GLAD_GL_NV_gpu_program5 && GLAD_GL_NV_compute_program5 && - GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2; + GLAD_GL_NV_transform_feedback && GLAD_GL_NV_transform_feedback2 && + has_vulkan_instance; use_asynchronous_shaders = Settings::values.use_asynchronous_shaders.GetValue(); diff --git a/src/video_core/renderer_opengl/gl_device.h b/src/video_core/renderer_opengl/gl_device.h index 3e79d1e37..21a38544f 100755 --- a/src/video_core/renderer_opengl/gl_device.h +++ b/src/video_core/renderer_opengl/gl_device.h @@ -10,18 +10,16 @@ namespace OpenGL { -static constexpr u32 EmulationUniformBlockBinding = 0; - -class Device final { +class Device { public: - struct BaseBindings final { + struct BaseBindings { u32 uniform_buffer{}; u32 shader_storage_buffer{}; u32 sampler{}; u32 image{}; }; - explicit Device(); + explicit Device(bool has_vulkan_instance); explicit Device(std::nullptr_t); u32 GetMaxUniformBuffers(Tegra::Engines::ShaderType shader_type) const noexcept { diff --git a/src/video_core/renderer_opengl/gl_fence_manager.cpp b/src/video_core/renderer_opengl/gl_fence_manager.cpp index 3e9c922f5..151290101 100755 --- a/src/video_core/renderer_opengl/gl_fence_manager.cpp +++ b/src/video_core/renderer_opengl/gl_fence_manager.cpp @@ -47,7 +47,7 @@ void GLInnerFence::Wait() { FenceManagerOpenGL::FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, TextureCache& texture_cache_, - OGLBufferCache& buffer_cache_, QueryCache& query_cache_) + BufferCache& buffer_cache_, QueryCache& query_cache_) : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_} {} Fence FenceManagerOpenGL::CreateFence(u32 value, bool is_stubbed) { diff --git a/src/video_core/renderer_opengl/gl_fence_manager.h b/src/video_core/renderer_opengl/gl_fence_manager.h index 30dbee613..e714aa115 100755 --- a/src/video_core/renderer_opengl/gl_fence_manager.h +++ b/src/video_core/renderer_opengl/gl_fence_manager.h @@ -32,14 +32,13 @@ private: }; using Fence = std::shared_ptr; -using GenericFenceManager = - VideoCommon::FenceManager; +using GenericFenceManager = VideoCommon::FenceManager; class FenceManagerOpenGL final : public GenericFenceManager { public: - explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, - TextureCache& texture_cache_, OGLBufferCache& buffer_cache_, - QueryCache& query_cache_); + explicit FenceManagerOpenGL(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCache& texture_cache, BufferCache& buffer_cache, + QueryCache& query_cache); protected: Fence CreateFence(u32 value, bool is_stubbed) override; diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index 8aa63d329..3e90e6481 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -44,17 +44,10 @@ using VideoCore::Surface::PixelFormat; using VideoCore::Surface::SurfaceTarget; using VideoCore::Surface::SurfaceType; -MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Format Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_VB, "OpenGL", "Vertex Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Shader, "OpenGL", "Shader Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_UBO, "OpenGL", "Const Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Index, "OpenGL", "Index Buffer Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Texture, "OpenGL", "Texture Setup", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_Framebuffer, "OpenGL", "Framebuffer Setup", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192)); +MICROPROFILE_DEFINE(OpenGL_Clears, "OpenGL", "Clears", MP_RGB(128, 128, 192)); MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(128, 128, 192)); -MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100)); -MICROPROFILE_DEFINE(OpenGL_PrimitiveAssembly, "OpenGL", "Prim Asmbl", MP_RGB(255, 100, 100)); +MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Management", MP_RGB(100, 255, 100)); namespace { @@ -104,20 +97,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); } -std::size_t GetConstBufferSize(const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry) { - if (!entry.IsIndirect()) { - return entry.GetSize(); - } - if (buffer.size > Maxwell::MaxConstBufferSize) { - LOG_WARNING(Render_OpenGL, "Indirect constbuffer size {} exceeds maximum {}", buffer.size, - Maxwell::MaxConstBufferSize); - return Maxwell::MaxConstBufferSize; - } - - return buffer.size; -} - /// Translates hardware transform feedback indices /// @param location Hardware location /// @return Pair of ARB_transform_feedback3 token stream first and third arguments @@ -150,14 +129,6 @@ void oglEnable(GLenum cap, bool state) { (state ? glEnable : glDisable)(cap); } -void UpdateBindlessSSBOs(GLenum target, const BindlessSSBO* ssbos, size_t num_ssbos) { - if (num_ssbos == 0) { - return; - } - glProgramLocalParametersI4uivNV(target, 0, static_cast(num_ssbos), - reinterpret_cast(ssbos)); -} - ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { if (entry.is_buffer) { return ImageViewType::Buffer; @@ -199,49 +170,35 @@ ImageViewType ImageViewTypeFromEntry(const ImageEntry& entry) { RasterizerOpenGL::RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, Core::Memory::Memory& cpu_memory_, const Device& device_, + const Vulkan::Device* vulkan_device, + Vulkan::MemoryAllocator* vulkan_memory_allocator, ScreenInfo& screen_info_, ProgramManager& program_manager_, StateTracker& state_tracker_) : RasterizerAccelerated(cpu_memory_), gpu(gpu_), maxwell3d(gpu.Maxwell3D()), kepler_compute(gpu.KeplerCompute()), gpu_memory(gpu.MemoryManager()), device(device_), screen_info(screen_info_), program_manager(program_manager_), state_tracker(state_tracker_), - stream_buffer(device, state_tracker), texture_cache_runtime(device, program_manager, state_tracker), texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), + buffer_cache_runtime(device, vulkan_device, vulkan_memory_allocator), + buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime), shader_cache(*this, emu_window_, gpu, maxwell3d, kepler_compute, gpu_memory, device), query_cache(*this, maxwell3d, gpu_memory), - buffer_cache(*this, gpu_memory, cpu_memory_, device, stream_buffer, state_tracker), fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache), async_shaders(emu_window_) { - unified_uniform_buffer.Create(); - glNamedBufferStorage(unified_uniform_buffer.handle, TOTAL_CONST_BUFFER_BYTES, nullptr, 0); - - if (device.UseAssemblyShaders()) { - glCreateBuffers(static_cast(staging_cbufs.size()), staging_cbufs.data()); - for (const GLuint cbuf : staging_cbufs) { - glNamedBufferStorage(cbuf, static_cast(Maxwell::MaxConstBufferSize), - nullptr, 0); - } - } if (device.UseAsynchronousShaders()) { async_shaders.AllocateWorkers(); } } -RasterizerOpenGL::~RasterizerOpenGL() { - if (device.UseAssemblyShaders()) { - glDeleteBuffers(static_cast(staging_cbufs.size()), staging_cbufs.data()); - } -} +RasterizerOpenGL::~RasterizerOpenGL() = default; -void RasterizerOpenGL::SetupVertexFormat() { +void RasterizerOpenGL::SyncVertexFormats() { auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexFormats]) { return; } flags[Dirty::VertexFormats] = false; - MICROPROFILE_SCOPE(OpenGL_VAO); - // Use the vertex array as-is, assumes that the data is formatted correctly for OpenGL. Enables // the first 16 vertex attributes always, as we don't know which ones are actually used until // shader time. Note, Tegra technically supports 32, but we're capping this to 16 for now to @@ -277,55 +234,7 @@ void RasterizerOpenGL::SetupVertexFormat() { } } -void RasterizerOpenGL::SetupVertexBuffer() { - auto& flags = maxwell3d.dirty.flags; - if (!flags[Dirty::VertexBuffers]) { - return; - } - flags[Dirty::VertexBuffers] = false; - - MICROPROFILE_SCOPE(OpenGL_VB); - - const bool use_unified_memory = device.HasVertexBufferUnifiedMemory(); - - // Upload all guest vertex arrays sequentially to our buffer - const auto& regs = maxwell3d.regs; - for (std::size_t index = 0; index < NUM_SUPPORTED_VERTEX_BINDINGS; ++index) { - if (!flags[Dirty::VertexBuffer0 + index]) { - continue; - } - flags[Dirty::VertexBuffer0 + index] = false; - - const auto& vertex_array = regs.vertex_array[index]; - if (!vertex_array.IsEnabled()) { - continue; - } - - const GPUVAddr start = vertex_array.StartAddress(); - const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - ASSERT(end >= start); - - const GLuint gl_index = static_cast(index); - const u64 size = end - start; - if (size == 0) { - glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); - if (use_unified_memory) { - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, 0, 0); - } - continue; - } - const auto info = buffer_cache.UploadMemory(start, size); - if (use_unified_memory) { - glBindVertexBuffer(gl_index, 0, 0, vertex_array.stride); - glBufferAddressRangeNV(GL_VERTEX_ATTRIB_ARRAY_ADDRESS_NV, gl_index, - info.address + info.offset, size); - } else { - glBindVertexBuffer(gl_index, info.handle, info.offset, vertex_array.stride); - } - } -} - -void RasterizerOpenGL::SetupVertexInstances() { +void RasterizerOpenGL::SyncVertexInstances() { auto& flags = maxwell3d.dirty.flags; if (!flags[Dirty::VertexInstances]) { return; @@ -346,17 +255,7 @@ void RasterizerOpenGL::SetupVertexInstances() { } } -GLintptr RasterizerOpenGL::SetupIndexBuffer() { - MICROPROFILE_SCOPE(OpenGL_Index); - const auto& regs = maxwell3d.regs; - const std::size_t size = CalculateIndexBufferSize(); - const auto info = buffer_cache.UploadMemory(regs.index_array.IndexStart(), size); - glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, info.handle); - return info.offset; -} - -void RasterizerOpenGL::SetupShaders() { - MICROPROFILE_SCOPE(OpenGL_Shader); +void RasterizerOpenGL::SetupShaders(bool is_indexed) { u32 clip_distances = 0; std::array shaders{}; @@ -413,11 +312,19 @@ void RasterizerOpenGL::SetupShaders() { const size_t stage = index == 0 ? 0 : index - 1; shaders[stage] = shader; - SetupDrawConstBuffers(stage, shader); - SetupDrawGlobalMemory(stage, shader); SetupDrawTextures(shader, stage); SetupDrawImages(shader, stage); + buffer_cache.SetEnabledUniformBuffers(stage, shader->GetEntries().enabled_uniform_buffers); + + buffer_cache.UnbindGraphicsStorageBuffers(stage); + u32 ssbo_index = 0; + for (const auto& buffer : shader->GetEntries().global_memory_entries) { + buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index, + buffer.cbuf_offset, buffer.is_written); + ++ssbo_index; + } + // Workaround for Intel drivers. // When a clip distance is enabled but not set in the shader it crops parts of the screen // (sometimes it's half the screen, sometimes three quarters). To avoid this, enable the @@ -433,43 +340,26 @@ void RasterizerOpenGL::SetupShaders() { SyncClipEnabled(clip_distances); maxwell3d.dirty.flags[Dirty::Shaders] = false; + buffer_cache.UpdateGraphicsBuffers(is_indexed); + const std::span indices_span(image_view_indices.data(), image_view_indices.size()); texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + buffer_cache.BindHostGeometryBuffers(is_indexed); + size_t image_view_index = 0; size_t texture_index = 0; size_t image_index = 0; for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { const Shader* const shader = shaders[stage]; - if (shader) { - const auto base = device.GetBaseBindings(stage); - BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index, - texture_index, image_index); - } - } -} - -std::size_t RasterizerOpenGL::CalculateVertexArraysSize() const { - const auto& regs = maxwell3d.regs; - - std::size_t size = 0; - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - if (!regs.vertex_array[index].IsEnabled()) + if (!shader) { continue; - - const GPUVAddr start = regs.vertex_array[index].StartAddress(); - const GPUVAddr end = regs.vertex_array_limit[index].LimitAddress(); - - size += end - start; - ASSERT(end >= start); + } + buffer_cache.BindHostStageBuffers(stage); + const auto& base = device.GetBaseBindings(stage); + BindTextures(shader->GetEntries(), base.sampler, base.image, image_view_index, + texture_index, image_index); } - - return size; -} - -std::size_t RasterizerOpenGL::CalculateIndexBufferSize() const { - return static_cast(maxwell3d.regs.index_array.count) * - static_cast(maxwell3d.regs.index_array.FormatSizeInBytes()); } void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& stop_loading, @@ -478,6 +368,7 @@ void RasterizerOpenGL::LoadDiskResources(u64 title_id, const std::atomic_bool& s } void RasterizerOpenGL::Clear() { + MICROPROFILE_SCOPE(OpenGL_Clears); if (!maxwell3d.ShouldExecute()) { return; } @@ -528,11 +419,9 @@ void RasterizerOpenGL::Clear() { } UNIMPLEMENTED_IF(regs.clear_flags.viewport); - { - auto lock = texture_cache.AcquireLock(); - texture_cache.UpdateRenderTargets(true); - state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); - } + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UpdateRenderTargets(true); + state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); if (use_color) { glClearBufferfv(GL_COLOR, regs.clear_buffers.RT, regs.clear_color); @@ -544,7 +433,6 @@ void RasterizerOpenGL::Clear() { } else if (use_stencil) { glClearBufferiv(GL_STENCIL, 0, ®s.clear_stencil); } - ++num_queued_commands; } @@ -553,75 +441,12 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { query_cache.UpdateCounters(); - SyncViewport(); - SyncRasterizeEnable(); - SyncPolygonModes(); - SyncColorMask(); - SyncFragmentColorClampState(); - SyncMultiSampleState(); - SyncDepthTestState(); - SyncDepthClamp(); - SyncStencilTestState(); - SyncBlendState(); - SyncLogicOpState(); - SyncCullMode(); - SyncPrimitiveRestart(); - SyncScissorTest(); - SyncPointState(); - SyncLineState(); - SyncPolygonOffset(); - SyncAlphaTest(); - SyncFramebufferSRGB(); - - buffer_cache.Acquire(); - current_cbuf = 0; - - std::size_t buffer_size = CalculateVertexArraysSize(); - - // Add space for index buffer - if (is_indexed) { - buffer_size = Common::AlignUp(buffer_size, 4) + CalculateIndexBufferSize(); - } - - // Uniform space for the 5 shader stages - buffer_size = - Common::AlignUp(buffer_size, 4) + - (sizeof(MaxwellUniformData) + device.GetUniformBufferAlignment()) * Maxwell::MaxShaderStage; - - // Add space for at least 18 constant buffers - buffer_size += Maxwell::MaxConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - - // Prepare the vertex array. - buffer_cache.Map(buffer_size); - - // Prepare vertex array format. - SetupVertexFormat(); - - // Upload vertex and index data. - SetupVertexBuffer(); - SetupVertexInstances(); - GLintptr index_buffer_offset = 0; - if (is_indexed) { - index_buffer_offset = SetupIndexBuffer(); - } - - // Setup emulation uniform buffer. - if (!device.UseAssemblyShaders()) { - MaxwellUniformData ubo; - ubo.SetFromRegs(maxwell3d); - const auto info = - buffer_cache.UploadHostMemory(&ubo, sizeof(ubo), device.GetUniformBufferAlignment()); - glBindBufferRange(GL_UNIFORM_BUFFER, EmulationUniformBlockBinding, info.handle, info.offset, - static_cast(sizeof(ubo))); - } + SyncState(); // Setup shaders and their used resources. - auto lock = texture_cache.AcquireLock(); - SetupShaders(); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; + SetupShaders(is_indexed); - // Signal the buffer cache that we are not going to upload more things. - buffer_cache.Unmap(); texture_cache.UpdateRenderTargets(false); state_tracker.BindFramebuffer(texture_cache.GetFramebuffer()->Handle()); program_manager.BindGraphicsPipeline(); @@ -635,7 +460,7 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { if (is_indexed) { const GLint base_vertex = static_cast(maxwell3d.regs.vb_element_base); const GLsizei num_vertices = static_cast(maxwell3d.regs.index_array.count); - const GLvoid* offset = reinterpret_cast(index_buffer_offset); + const GLvoid* const offset = buffer_cache_runtime.IndexOffset(); const GLenum format = MaxwellToGL::IndexFormat(maxwell3d.regs.index_array.format); if (num_instances == 1 && base_instance == 0 && base_vertex == 0) { glDrawElements(primitive_mode, num_vertices, format, offset); @@ -675,22 +500,22 @@ void RasterizerOpenGL::Draw(bool is_indexed, bool is_instanced) { } void RasterizerOpenGL::DispatchCompute(GPUVAddr code_addr) { - buffer_cache.Acquire(); - current_cbuf = 0; - Shader* const kernel = shader_cache.GetComputeKernel(code_addr); - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; BindComputeTextures(kernel); - const size_t buffer_size = Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); - buffer_cache.Map(buffer_size); - - SetupComputeConstBuffers(kernel); - SetupComputeGlobalMemory(kernel); - - buffer_cache.Unmap(); + const auto& entries = kernel->GetEntries(); + buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers); + buffer_cache.UnbindComputeStorageBuffers(); + u32 ssbo_index = 0; + for (const auto& buffer : entries.global_memory_entries) { + buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset, + buffer.is_written); + ++ssbo_index; + } + buffer_cache.UpdateComputeBuffers(); + buffer_cache.BindHostComputeBuffers(); const auto& launch_desc = kepler_compute.launch_description; glDispatchCompute(launch_desc.grid_dim_x, launch_desc.grid_dim_y, launch_desc.grid_dim_z); @@ -706,6 +531,12 @@ void RasterizerOpenGL::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, query_cache.Query(gpu_addr, type, timestamp); } +void RasterizerOpenGL::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size); +} + void RasterizerOpenGL::FlushAll() {} void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { @@ -714,19 +545,43 @@ void RasterizerOpenGL::FlushRegion(VAddr addr, u64 size) { return; } { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.DownloadMemory(addr, size); } - buffer_cache.FlushRegion(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.DownloadMemory(addr, size); + } query_cache.FlushRegion(addr, size); } +void RasterizerOpenGL::InvalidateExceptTextureCache(VAddr addr, u64 size) { + if (addr == 0 || size == 0) { + return; + } + shader_cache.InvalidateRegion(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } + query_cache.InvalidateRegion(addr, size); +} + +void RasterizerOpenGL::InvalidateTextureCache(VAddr addr, u64 size) { + if (addr == 0 || size == 0) { + return; + } + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UnmapMemory(addr, size); +} + bool RasterizerOpenGL::MustFlushRegion(VAddr addr, u64 size) { + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; if (!Settings::IsGPULevelHigh()) { - return buffer_cache.MustFlushRegion(addr, size); + return buffer_cache.IsRegionGpuModified(addr, size); } return texture_cache.IsRegionGpuModified(addr, size) || - buffer_cache.MustFlushRegion(addr, size); + buffer_cache.IsRegionGpuModified(addr, size); } void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { @@ -735,11 +590,14 @@ void RasterizerOpenGL::InvalidateRegion(VAddr addr, u64 size) { return; } { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } shader_cache.InvalidateRegion(addr, size); - buffer_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size); } @@ -748,26 +606,35 @@ void RasterizerOpenGL::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } + shader_cache.OnCPUWrite(addr, size); { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); } - shader_cache.OnCPUWrite(addr, size); - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.CachedWriteMemory(addr, size); + } } void RasterizerOpenGL::SyncGuestHost() { MICROPROFILE_SCOPE(OpenGL_CacheManagement); - buffer_cache.SyncGuestHost(); shader_cache.SyncGuestHost(); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.FlushCachedWrites(); + } } void RasterizerOpenGL::UnmapMemory(VAddr addr, u64 size) { { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.UnmapMemory(addr, size); } - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } shader_cache.OnCPUWrite(addr, size); } @@ -802,14 +669,7 @@ void RasterizerOpenGL::FlushAndInvalidateRegion(VAddr addr, u64 size) { } void RasterizerOpenGL::WaitForIdle() { - // Place a barrier on everything that is not framebuffer related. - // This is related to another flag that is not currently implemented. - glMemoryBarrier(GL_VERTEX_ATTRIB_ARRAY_BARRIER_BIT | GL_ELEMENT_ARRAY_BARRIER_BIT | - GL_UNIFORM_BARRIER_BIT | GL_TEXTURE_FETCH_BARRIER_BIT | - GL_SHADER_IMAGE_ACCESS_BARRIER_BIT | GL_COMMAND_BARRIER_BIT | - GL_PIXEL_BUFFER_BARRIER_BIT | GL_TEXTURE_UPDATE_BARRIER_BIT | - GL_BUFFER_UPDATE_BARRIER_BIT | GL_TRANSFORM_FEEDBACK_BARRIER_BIT | - GL_SHADER_STORAGE_BARRIER_BIT | GL_QUERY_BUFFER_BARRIER_BIT); + glMemoryBarrier(GL_ALL_BARRIER_BITS); } void RasterizerOpenGL::FragmentBarrier() { @@ -834,18 +694,21 @@ void RasterizerOpenGL::TickFrame() { num_queued_commands = 0; fence_manager.TickFrame(); - buffer_cache.TickFrame(); { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.TickFrame(); } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.TickFrame(); + } } bool RasterizerOpenGL::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { MICROPROFILE_SCOPE(OpenGL_Blits); - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.BlitImage(dst, src, copy_config); return true; } @@ -857,7 +720,7 @@ bool RasterizerOpenGL::AccelerateDisplay(const Tegra::FramebufferConfig& config, } MICROPROFILE_SCOPE(OpenGL_CacheManagement); - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; ImageView* const image_view{texture_cache.TryFindFramebufferImageView(framebuffer_addr)}; if (!image_view) { return false; @@ -924,166 +787,6 @@ void RasterizerOpenGL::BindTextures(const ShaderEntries& entries, GLuint base_te } } -void RasterizerOpenGL::SetupDrawConstBuffers(std::size_t stage_index, Shader* shader) { - static constexpr std::array PARAMETER_LUT{ - GL_VERTEX_PROGRAM_PARAMETER_BUFFER_NV, GL_TESS_CONTROL_PROGRAM_PARAMETER_BUFFER_NV, - GL_TESS_EVALUATION_PROGRAM_PARAMETER_BUFFER_NV, GL_GEOMETRY_PROGRAM_PARAMETER_BUFFER_NV, - GL_FRAGMENT_PROGRAM_PARAMETER_BUFFER_NV, - }; - MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& stages = maxwell3d.state.shader_stages; - const auto& shader_stage = stages[stage_index]; - const auto& entries = shader->GetEntries(); - const bool use_unified = entries.use_unified_uniforms; - const std::size_t base_unified_offset = stage_index * NUM_CONST_BUFFERS_BYTES_PER_STAGE; - - const auto base_bindings = device.GetBaseBindings(stage_index); - u32 binding = device.UseAssemblyShaders() ? 0 : base_bindings.uniform_buffer; - for (const auto& entry : entries.const_buffers) { - const u32 index = entry.GetIndex(); - const auto& buffer = shader_stage.const_buffers[index]; - SetupConstBuffer(PARAMETER_LUT[stage_index], binding, buffer, entry, use_unified, - base_unified_offset + index * Maxwell::MaxConstBufferSize); - ++binding; - } - if (use_unified) { - const u32 index = static_cast(base_bindings.shader_storage_buffer + - entries.global_memory_entries.size()); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, - base_unified_offset, NUM_CONST_BUFFERS_BYTES_PER_STAGE); - } -} - -void RasterizerOpenGL::SetupComputeConstBuffers(Shader* kernel) { - MICROPROFILE_SCOPE(OpenGL_UBO); - const auto& launch_desc = kepler_compute.launch_description; - const auto& entries = kernel->GetEntries(); - const bool use_unified = entries.use_unified_uniforms; - - u32 binding = 0; - for (const auto& entry : entries.const_buffers) { - const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; - const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); - Tegra::Engines::ConstBufferInfo buffer; - buffer.address = config.Address(); - buffer.size = config.size; - buffer.enabled = mask[entry.GetIndex()]; - SetupConstBuffer(GL_COMPUTE_PROGRAM_PARAMETER_BUFFER_NV, binding, buffer, entry, - use_unified, entry.GetIndex() * Maxwell::MaxConstBufferSize); - ++binding; - } - if (use_unified) { - const GLuint index = static_cast(entries.global_memory_entries.size()); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, index, unified_uniform_buffer.handle, 0, - NUM_CONST_BUFFERS_BYTES_PER_STAGE); - } -} - -void RasterizerOpenGL::SetupConstBuffer(GLenum stage, u32 binding, - const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry, bool use_unified, - std::size_t unified_offset) { - if (!buffer.enabled) { - // Set values to zero to unbind buffers - if (device.UseAssemblyShaders()) { - glBindBufferRangeNV(stage, entry.GetIndex(), 0, 0, 0); - } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, 0, 0, sizeof(float)); - } - return; - } - - // Align the actual size so it ends up being a multiple of vec4 to meet the OpenGL std140 - // UBO alignment requirements. - const std::size_t size = Common::AlignUp(GetConstBufferSize(buffer, entry), sizeof(GLvec4)); - - const bool fast_upload = !use_unified && device.HasFastBufferSubData(); - - const std::size_t alignment = use_unified ? 4 : device.GetUniformBufferAlignment(); - const GPUVAddr gpu_addr = buffer.address; - auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, false, fast_upload); - - if (device.UseAssemblyShaders()) { - UNIMPLEMENTED_IF(use_unified); - if (info.offset != 0) { - const GLuint staging_cbuf = staging_cbufs[current_cbuf++]; - glCopyNamedBufferSubData(info.handle, staging_cbuf, info.offset, 0, size); - info.handle = staging_cbuf; - info.offset = 0; - } - glBindBufferRangeNV(stage, binding, info.handle, info.offset, size); - return; - } - - if (use_unified) { - glCopyNamedBufferSubData(info.handle, unified_uniform_buffer.handle, info.offset, - unified_offset, size); - } else { - glBindBufferRange(GL_UNIFORM_BUFFER, binding, info.handle, info.offset, size); - } -} - -void RasterizerOpenGL::SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader) { - static constexpr std::array TARGET_LUT = { - GL_VERTEX_PROGRAM_NV, GL_TESS_CONTROL_PROGRAM_NV, GL_TESS_EVALUATION_PROGRAM_NV, - GL_GEOMETRY_PROGRAM_NV, GL_FRAGMENT_PROGRAM_NV, - }; - const auto& cbufs{maxwell3d.state.shader_stages[stage_index]}; - const auto& entries{shader->GetEntries().global_memory_entries}; - - std::array ssbos; - ASSERT(entries.size() < ssbos.size()); - - const bool assembly_shaders = device.UseAssemblyShaders(); - u32 binding = assembly_shaders ? 0 : device.GetBaseBindings(stage_index).shader_storage_buffer; - for (const auto& entry : entries) { - const GPUVAddr addr{cbufs.const_buffers[entry.cbuf_index].address + entry.cbuf_offset}; - const GPUVAddr gpu_addr{gpu_memory.Read(addr)}; - const u32 size{gpu_memory.Read(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); - ++binding; - } - if (assembly_shaders) { - UpdateBindlessSSBOs(TARGET_LUT[stage_index], ssbos.data(), entries.size()); - } -} - -void RasterizerOpenGL::SetupComputeGlobalMemory(Shader* kernel) { - const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; - const auto& entries{kernel->GetEntries().global_memory_entries}; - - std::array ssbos; - ASSERT(entries.size() < ssbos.size()); - - u32 binding = 0; - for (const auto& entry : entries) { - const GPUVAddr addr{cbufs[entry.cbuf_index].Address() + entry.cbuf_offset}; - const GPUVAddr gpu_addr{gpu_memory.Read(addr)}; - const u32 size{gpu_memory.Read(addr + 8)}; - SetupGlobalMemory(binding, entry, gpu_addr, size, &ssbos[binding]); - ++binding; - } - if (device.UseAssemblyShaders()) { - UpdateBindlessSSBOs(GL_COMPUTE_PROGRAM_NV, ssbos.data(), ssbos.size()); - } -} - -void RasterizerOpenGL::SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, - GPUVAddr gpu_addr, size_t size, BindlessSSBO* ssbo) { - const size_t alignment{device.GetShaderStorageBufferAlignment()}; - const auto info = buffer_cache.UploadMemory(gpu_addr, size, alignment, entry.is_written); - if (device.UseAssemblyShaders()) { - *ssbo = BindlessSSBO{ - .address = static_cast(info.address + info.offset), - .length = static_cast(size), - .padding = 0, - }; - } else { - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, binding, info.handle, info.offset, - static_cast(size)); - } -} - void RasterizerOpenGL::SetupDrawTextures(const Shader* shader, size_t stage_index) { const bool via_header_index = maxwell3d.regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; @@ -1131,6 +834,30 @@ void RasterizerOpenGL::SetupComputeImages(const Shader* shader) { } } +void RasterizerOpenGL::SyncState() { + SyncViewport(); + SyncRasterizeEnable(); + SyncPolygonModes(); + SyncColorMask(); + SyncFragmentColorClampState(); + SyncMultiSampleState(); + SyncDepthTestState(); + SyncDepthClamp(); + SyncStencilTestState(); + SyncBlendState(); + SyncLogicOpState(); + SyncCullMode(); + SyncPrimitiveRestart(); + SyncScissorTest(); + SyncPointState(); + SyncLineState(); + SyncPolygonOffset(); + SyncAlphaTest(); + SyncFramebufferSRGB(); + SyncVertexFormats(); + SyncVertexInstances(); +} + void RasterizerOpenGL::SyncViewport() { auto& flags = maxwell3d.dirty.flags; const auto& regs = maxwell3d.regs; @@ -1166,9 +893,11 @@ void RasterizerOpenGL::SyncViewport() { if (regs.screen_y_control.y_negate != 0) { flip_y = !flip_y; } - glClipControl(flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT, - regs.depth_mode == Maxwell::DepthMode::ZeroToOne ? GL_ZERO_TO_ONE - : GL_NEGATIVE_ONE_TO_ONE); + const bool is_zero_to_one = regs.depth_mode == Maxwell::DepthMode::ZeroToOne; + const GLenum origin = flip_y ? GL_UPPER_LEFT : GL_LOWER_LEFT; + const GLenum depth = is_zero_to_one ? GL_ZERO_TO_ONE : GL_NEGATIVE_ONE_TO_ONE; + state_tracker.ClipControl(origin, depth); + state_tracker.SetYNegate(regs.screen_y_control.y_negate != 0); } if (dirty_viewport) { @@ -1652,36 +1381,13 @@ void RasterizerOpenGL::BeginTransformFeedback(GLenum primitive_mode) { if (regs.tfb_enabled == 0) { return; } - if (device.UseAssemblyShaders()) { SyncTransformFeedback(); } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); - - for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { - const auto& binding = regs.tfb_bindings[index]; - if (!binding.buffer_enable) { - if (enabled_transform_feedback_buffers[index]) { - glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast(index), 0, 0, - 0); - } - enabled_transform_feedback_buffers[index] = false; - continue; - } - enabled_transform_feedback_buffers[index] = true; - - auto& tfb_buffer = transform_feedback_buffers[index]; - tfb_buffer.Create(); - - const GLuint handle = tfb_buffer.handle; - const std::size_t size = binding.buffer_size; - glNamedBufferData(handle, static_cast(size), nullptr, GL_STREAM_COPY); - glBindBufferRange(GL_TRANSFORM_FEEDBACK_BUFFER, static_cast(index), handle, 0, - static_cast(size)); - } + UNIMPLEMENTED_IF(primitive_mode != GL_POINTS); // We may have to call BeginTransformFeedbackNV here since they seem to call different // implementations on Nvidia's driver (the pointer is different) but we are using @@ -1695,23 +1401,7 @@ void RasterizerOpenGL::EndTransformFeedback() { if (regs.tfb_enabled == 0) { return; } - glEndTransformFeedback(); - - for (std::size_t index = 0; index < Maxwell::NumTransformFeedbackBuffers; ++index) { - const auto& binding = regs.tfb_bindings[index]; - if (!binding.buffer_enable) { - continue; - } - UNIMPLEMENTED_IF(binding.buffer_offset != 0); - - const GLuint handle = transform_feedback_buffers[index].handle; - const GPUVAddr gpu_addr = binding.Address(); - const std::size_t size = binding.buffer_size; - const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - glCopyNamedBufferSubData(handle, info.handle, 0, info.offset, - static_cast(size)); - } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_rasterizer.h b/src/video_core/renderer_opengl/gl_rasterizer.h index 82e03e677..b1094075a 100755 --- a/src/video_core/renderer_opengl/gl_rasterizer.h +++ b/src/video_core/renderer_opengl/gl_rasterizer.h @@ -30,7 +30,6 @@ #include "video_core/renderer_opengl/gl_shader_decompiler.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" -#include "video_core/renderer_opengl/gl_stream_buffer.h" #include "video_core/renderer_opengl/gl_texture_cache.h" #include "video_core/shader/async_shaders.h" #include "video_core/textures/texture.h" @@ -47,6 +46,11 @@ namespace Tegra { class MemoryManager; } +namespace Vulkan { +class Device; +class MemoryAllocator; +} // namespace Vulkan + namespace OpenGL { struct ScreenInfo; @@ -63,6 +67,8 @@ class RasterizerOpenGL : public VideoCore::RasterizerAccelerated { public: explicit RasterizerOpenGL(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, Core::Memory::Memory& cpu_memory_, const Device& device_, + const Vulkan::Device* vulkan_device, + Vulkan::MemoryAllocator* vulkan_memory_allocator, ScreenInfo& screen_info_, ProgramManager& program_manager_, StateTracker& state_tracker_); ~RasterizerOpenGL() override; @@ -72,8 +78,11 @@ public: void DispatchCompute(GPUVAddr code_addr) override; void ResetCounter(VideoCore::QueryType type) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; + void InvalidateExceptTextureCache(VAddr addr, u64 size) override; + void InvalidateTextureCache(VAddr addr, u64 size) override; bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; void OnCPUWrite(VAddr addr, u64 size) override; @@ -119,27 +128,6 @@ private: void BindTextures(const ShaderEntries& entries, GLuint base_texture, GLuint base_image, size_t& image_view_index, size_t& texture_index, size_t& image_index); - /// Configures the current constbuffers to use for the draw command. - void SetupDrawConstBuffers(std::size_t stage_index, Shader* shader); - - /// Configures the current constbuffers to use for the kernel invocation. - void SetupComputeConstBuffers(Shader* kernel); - - /// Configures a constant buffer. - void SetupConstBuffer(GLenum stage, u32 binding, const Tegra::Engines::ConstBufferInfo& buffer, - const ConstBufferEntry& entry, bool use_unified, - std::size_t unified_offset); - - /// Configures the current global memory entries to use for the draw command. - void SetupDrawGlobalMemory(std::size_t stage_index, Shader* shader); - - /// Configures the current global memory entries to use for the kernel invocation. - void SetupComputeGlobalMemory(Shader* kernel); - - /// Configures a global memory buffer. - void SetupGlobalMemory(u32 binding, const GlobalMemoryEntry& entry, GPUVAddr gpu_addr, - size_t size, BindlessSSBO* ssbo); - /// Configures the current textures to use for the draw command. void SetupDrawTextures(const Shader* shader, size_t stage_index); @@ -152,6 +140,9 @@ private: /// Configures images in a compute shader. void SetupComputeImages(const Shader* shader); + /// Syncs state to match guest's + void SyncState(); + /// Syncs the viewport and depth range to match the guest state void SyncViewport(); @@ -215,6 +206,12 @@ private: /// Syncs the framebuffer sRGB state to match the guest state void SyncFramebufferSRGB(); + /// Syncs vertex formats to match the guest state + void SyncVertexFormats(); + + /// Syncs vertex instances to match the guest state + void SyncVertexInstances(); + /// Syncs transform feedback state to match guest state /// @note Only valid on assembly shaders void SyncTransformFeedback(); @@ -225,19 +222,7 @@ private: /// End a transform feedback void EndTransformFeedback(); - std::size_t CalculateVertexArraysSize() const; - - std::size_t CalculateIndexBufferSize() const; - - /// Updates the current vertex format - void SetupVertexFormat(); - - void SetupVertexBuffer(); - void SetupVertexInstances(); - - GLintptr SetupIndexBuffer(); - - void SetupShaders(); + void SetupShaders(bool is_indexed); Tegra::GPU& gpu; Tegra::Engines::Maxwell3D& maxwell3d; @@ -249,12 +234,12 @@ private: ProgramManager& program_manager; StateTracker& state_tracker; - OGLStreamBuffer stream_buffer; TextureCacheRuntime texture_cache_runtime; TextureCache texture_cache; + BufferCacheRuntime buffer_cache_runtime; + BufferCache buffer_cache; ShaderCacheOpenGL shader_cache; QueryCache query_cache; - OGLBufferCache buffer_cache; FenceManagerOpenGL fence_manager; VideoCommon::Shader::AsyncShaders async_shaders; @@ -262,20 +247,8 @@ private: boost::container::static_vector image_view_indices; std::array image_view_ids; boost::container::static_vector sampler_handles; - std::array texture_handles; - std::array image_handles; - - std::array - transform_feedback_buffers; - std::bitset - enabled_transform_feedback_buffers; - - static constexpr std::size_t NUM_CONSTANT_BUFFERS = - Tegra::Engines::Maxwell3D::Regs::MaxConstBuffers * - Tegra::Engines::Maxwell3D::Regs::MaxShaderProgram; - std::array staging_cbufs{}; - std::size_t current_cbuf = 0; - OGLBuffer unified_uniform_buffer; + std::array texture_handles{}; + std::array image_handles{}; /// Number of commands queued to the OpenGL driver. Resetted on flush. std::size_t num_queued_commands = 0; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.cpp b/src/video_core/renderer_opengl/gl_resource_manager.cpp index 0e34a0f20..3428e5e21 100755 --- a/src/video_core/renderer_opengl/gl_resource_manager.cpp +++ b/src/video_core/renderer_opengl/gl_resource_manager.cpp @@ -171,12 +171,6 @@ void OGLBuffer::Release() { handle = 0; } -void OGLBuffer::MakeStreamCopy(std::size_t buffer_size) { - ASSERT_OR_EXECUTE((handle != 0 && buffer_size != 0), { return; }); - - glNamedBufferData(handle, buffer_size, nullptr, GL_STREAM_COPY); -} - void OGLSync::Create() { if (handle != 0) return; diff --git a/src/video_core/renderer_opengl/gl_resource_manager.h b/src/video_core/renderer_opengl/gl_resource_manager.h index f48398669..552d79db4 100755 --- a/src/video_core/renderer_opengl/gl_resource_manager.h +++ b/src/video_core/renderer_opengl/gl_resource_manager.h @@ -234,9 +234,6 @@ public: /// Deletes the internal OpenGL resource void Release(); - // Converts the buffer into a stream copy buffer with a fixed size - void MakeStreamCopy(std::size_t buffer_size); - GLuint handle = 0; }; diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp index 2e1fa252d..db30dcd65 100755 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.cpp +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.cpp @@ -63,7 +63,7 @@ using TextureIR = std::variant constexpr u32 MAX_CONSTBUFFER_SCALARS = static_cast(Maxwell::MaxConstBufferSize) / sizeof(u32); constexpr u32 MAX_CONSTBUFFER_ELEMENTS = MAX_CONSTBUFFER_SCALARS / sizeof(u32); -constexpr std::string_view CommonDeclarations = R"(#define ftoi floatBitsToInt +constexpr std::string_view COMMON_DECLARATIONS = R"(#define ftoi floatBitsToInt #define ftou floatBitsToUint #define itof intBitsToFloat #define utof uintBitsToFloat @@ -76,10 +76,6 @@ bvec2 HalfFloatNanComparison(bvec2 comparison, vec2 pair1, vec2 pair2) {{ const float fswzadd_modifiers_a[] = float[4](-1.0f, 1.0f, -1.0f, 0.0f ); const float fswzadd_modifiers_b[] = float[4](-1.0f, -1.0f, 1.0f, -1.0f ); - -layout (std140, binding = {}) uniform vs_config {{ - float y_direction; -}}; )"; class ShaderWriter final { @@ -401,13 +397,6 @@ std::string FlowStackTopName(MetaStackClass stack) { return fmt::format("{}_flow_stack_top", GetFlowStackPrefix(stack)); } -bool UseUnifiedUniforms(const Device& device, const ShaderIR& ir, ShaderType stage) { - const u32 num_ubos = static_cast(ir.GetConstantBuffers().size()); - // We waste one UBO for emulation - const u32 num_available_ubos = device.GetMaxUniformBuffers(stage) - 1; - return num_ubos > num_available_ubos; -} - struct GenericVaryingDescription { std::string name; u8 first_element = 0; @@ -419,9 +408,8 @@ public: explicit GLSLDecompiler(const Device& device_, const ShaderIR& ir_, const Registry& registry_, ShaderType stage_, std::string_view identifier_, std::string_view suffix_) - : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, identifier{identifier_}, - suffix{suffix_}, header{ir.GetHeader()}, use_unified_uniforms{ - UseUnifiedUniforms(device_, ir_, stage_)} { + : device{device_}, ir{ir_}, registry{registry_}, stage{stage_}, + identifier{identifier_}, suffix{suffix_}, header{ir.GetHeader()} { if (stage != ShaderType::Compute) { transform_feedback = BuildTransformFeedback(registry.GetGraphicsInfo()); } @@ -515,7 +503,8 @@ private: if (!identifier.empty()) { code.AddLine("// {}", identifier); } - code.AddLine("#version 440 {}", ir.UsesLegacyVaryings() ? "compatibility" : "core"); + const bool use_compatibility = ir.UsesLegacyVaryings() || ir.UsesYNegate(); + code.AddLine("#version 440 {}", use_compatibility ? "compatibility" : "core"); code.AddLine("#extension GL_ARB_separate_shader_objects : enable"); if (device.HasShaderBallot()) { code.AddLine("#extension GL_ARB_shader_ballot : require"); @@ -541,7 +530,7 @@ private: code.AddNewLine(); - code.AddLine(CommonDeclarations, EmulationUniformBlockBinding); + code.AddLine(COMMON_DECLARATIONS); } void DeclareVertex() { @@ -864,17 +853,6 @@ private: } void DeclareConstantBuffers() { - if (use_unified_uniforms) { - const u32 binding = device.GetBaseBindings(stage).shader_storage_buffer + - static_cast(ir.GetGlobalMemory().size()); - code.AddLine("layout (std430, binding = {}) readonly buffer UnifiedUniforms {{", - binding); - code.AddLine(" uint cbufs[];"); - code.AddLine("}};"); - code.AddNewLine(); - return; - } - u32 binding = device.GetBaseBindings(stage).uniform_buffer; for (const auto& [index, info] : ir.GetConstantBuffers()) { const u32 num_elements = Common::AlignUp(info.GetSize(), 4) / 4; @@ -1080,29 +1058,17 @@ private: if (const auto cbuf = std::get_if(&*node)) { const Node offset = cbuf->GetOffset(); - const u32 base_unified_offset = cbuf->GetIndex() * MAX_CONSTBUFFER_SCALARS; if (const auto immediate = std::get_if(&*offset)) { // Direct access const u32 offset_imm = immediate->GetValue(); ASSERT_MSG(offset_imm % 4 == 0, "Unaligned cbuf direct access"); - if (use_unified_uniforms) { - return {fmt::format("cbufs[{}]", base_unified_offset + offset_imm / 4), - Type::Uint}; - } else { - return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), - offset_imm / (4 * 4), (offset_imm / 4) % 4), - Type::Uint}; - } - } - - // Indirect access - if (use_unified_uniforms) { - return {fmt::format("cbufs[{} + ({} >> 2)]", base_unified_offset, - Visit(offset).AsUint()), + return {fmt::format("{}[{}][{}]", GetConstBuffer(cbuf->GetIndex()), + offset_imm / (4 * 4), (offset_imm / 4) % 4), Type::Uint}; } + // Indirect access const std::string final_offset = code.GenerateTemporary(); code.AddLine("uint {} = {} >> 2;", final_offset, Visit(offset).AsUint()); @@ -2292,7 +2258,6 @@ private: } } } - if (header.ps.omap.depth) { // The depth output is always 2 registers after the last color output, and current_reg // already contains one past the last color register. @@ -2336,7 +2301,8 @@ private: } Expression YNegate(Operation operation) { - return {"y_direction", Type::Float}; + // Y_NEGATE is mapped to this uniform value + return {"gl_FrontMaterial.ambient.a", Type::Float}; } template @@ -2786,7 +2752,6 @@ private: const std::string_view identifier; const std::string_view suffix; const Header header; - const bool use_unified_uniforms; std::unordered_map transform_feedback; ShaderWriter code; @@ -3002,8 +2967,10 @@ ShaderEntries MakeEntries(const Device& device, const ShaderIR& ir, ShaderType s for (std::size_t i = 0; i < std::size(clip_distances); ++i) { entries.clip_distances = (clip_distances[i] ? 1U : 0U) << i; } + for (const auto& buffer : entries.const_buffers) { + entries.enabled_uniform_buffers |= 1U << buffer.GetIndex(); + } entries.shader_length = ir.GetLength(); - entries.use_unified_uniforms = UseUnifiedUniforms(device, ir, stage); return entries; } diff --git a/src/video_core/renderer_opengl/gl_shader_decompiler.h b/src/video_core/renderer_opengl/gl_shader_decompiler.h index be68994bb..0397a000c 100755 --- a/src/video_core/renderer_opengl/gl_shader_decompiler.h +++ b/src/video_core/renderer_opengl/gl_shader_decompiler.h @@ -55,7 +55,7 @@ struct ShaderEntries { std::vector images; std::size_t shader_length{}; u32 clip_distances{}; - bool use_unified_uniforms{}; + u32 enabled_uniform_buffers{}; }; ShaderEntries MakeEntries(const Device& device, const VideoCommon::Shader::ShaderIR& ir, diff --git a/src/video_core/renderer_opengl/gl_state_tracker.cpp b/src/video_core/renderer_opengl/gl_state_tracker.cpp index 60e6fa39f..dbdf5230f 100755 --- a/src/video_core/renderer_opengl/gl_state_tracker.cpp +++ b/src/video_core/renderer_opengl/gl_state_tracker.cpp @@ -36,16 +36,10 @@ void SetupDirtyColorMasks(Tables& tables) { FillBlock(tables[1], OFF(color_mask), NUM(color_mask), ColorMasks); } -void SetupDirtyVertexArrays(Tables& tables) { - static constexpr std::size_t num_array = 3; +void SetupDirtyVertexInstances(Tables& tables) { static constexpr std::size_t instance_base_offset = 3; for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { const std::size_t array_offset = OFF(vertex_array) + i * NUM(vertex_array[0]); - const std::size_t limit_offset = OFF(vertex_array_limit) + i * NUM(vertex_array_limit[0]); - - FillBlock(tables, array_offset, num_array, VertexBuffer0 + i, VertexBuffers); - FillBlock(tables, limit_offset, NUM(vertex_array_limit), VertexBuffer0 + i, VertexBuffers); - const std::size_t instance_array_offset = array_offset + instance_base_offset; tables[0][instance_array_offset] = static_cast(VertexInstance0 + i); tables[1][instance_array_offset] = VertexInstances; @@ -217,11 +211,11 @@ void SetupDirtyMisc(Tables& tables) { StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} { auto& dirty = gpu.Maxwell3D().dirty; auto& tables = dirty.tables; - SetupDirtyRenderTargets(tables); + SetupDirtyFlags(tables); SetupDirtyColorMasks(tables); SetupDirtyViewports(tables); SetupDirtyScissors(tables); - SetupDirtyVertexArrays(tables); + SetupDirtyVertexInstances(tables); SetupDirtyVertexFormat(tables); SetupDirtyShaders(tables); SetupDirtyPolygonModes(tables); @@ -241,19 +235,6 @@ StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags} SetupDirtyClipControl(tables); SetupDirtyDepthClampEnabled(tables); SetupDirtyMisc(tables); - - auto& store = dirty.on_write_stores; - store[VertexBuffers] = true; - for (std::size_t i = 0; i < Regs::NumVertexArrays; ++i) { - store[VertexBuffer0 + i] = true; - } -} - -void StateTracker::InvalidateStreamBuffer() { - flags[Dirty::VertexBuffers] = true; - for (int index = Dirty::VertexBuffer0; index <= Dirty::VertexBuffer31; ++index) { - flags[index] = true; - } } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_state_tracker.h b/src/video_core/renderer_opengl/gl_state_tracker.h index 574615d3c..94c905116 100755 --- a/src/video_core/renderer_opengl/gl_state_tracker.h +++ b/src/video_core/renderer_opengl/gl_state_tracker.h @@ -28,10 +28,6 @@ enum : u8 { VertexFormat0, VertexFormat31 = VertexFormat0 + 31, - VertexBuffers, - VertexBuffer0, - VertexBuffer31 = VertexBuffer0 + 31, - VertexInstances, VertexInstance0, VertexInstance31 = VertexInstance0 + 31, @@ -92,8 +88,6 @@ class StateTracker { public: explicit StateTracker(Tegra::GPU& gpu); - void InvalidateStreamBuffer(); - void BindIndexBuffer(GLuint new_index_buffer) { if (index_buffer == new_index_buffer) { return; @@ -110,13 +104,32 @@ public: glBindFramebuffer(GL_DRAW_FRAMEBUFFER, framebuffer); } + void ClipControl(GLenum new_origin, GLenum new_depth) { + if (new_origin == origin && new_depth == depth) { + return; + } + origin = new_origin; + depth = new_depth; + glClipControl(origin, depth); + } + + void SetYNegate(bool new_y_negate) { + if (new_y_negate == y_negate) { + return; + } + // Y_NEGATE is mapped to gl_FrontMaterial.ambient.a + y_negate = new_y_negate; + const std::array ambient{0.0f, 0.0f, 0.0f, y_negate ? -1.0f : 1.0f}; + glMaterialfv(GL_FRONT, GL_AMBIENT, ambient.data()); + } + void NotifyScreenDrawVertexArray() { flags[OpenGL::Dirty::VertexFormats] = true; flags[OpenGL::Dirty::VertexFormat0 + 0] = true; flags[OpenGL::Dirty::VertexFormat0 + 1] = true; - flags[OpenGL::Dirty::VertexBuffers] = true; - flags[OpenGL::Dirty::VertexBuffer0] = true; + flags[VideoCommon::Dirty::VertexBuffers] = true; + flags[VideoCommon::Dirty::VertexBuffer0] = true; flags[OpenGL::Dirty::VertexInstances] = true; flags[OpenGL::Dirty::VertexInstance0 + 0] = true; @@ -202,6 +215,9 @@ private: GLuint framebuffer = 0; GLuint index_buffer = 0; + GLenum origin = GL_LOWER_LEFT; + GLenum depth = GL_NEGATIVE_ONE_TO_ONE; + bool y_negate = false; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.cpp b/src/video_core/renderer_opengl/gl_stream_buffer.cpp index e0819cdf2..bfb992a79 100755 --- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp +++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp @@ -1,70 +1,64 @@ -// Copyright 2018 Citra Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. -#include -#include +#include +#include +#include + +#include #include "common/alignment.h" #include "common/assert.h" -#include "common/microprofile.h" -#include "video_core/renderer_opengl/gl_device.h" -#include "video_core/renderer_opengl/gl_state_tracker.h" #include "video_core/renderer_opengl/gl_stream_buffer.h" -MICROPROFILE_DEFINE(OpenGL_StreamBuffer, "OpenGL", "Stream Buffer Orphaning", - MP_RGB(128, 128, 192)); - namespace OpenGL { -OGLStreamBuffer::OGLStreamBuffer(const Device& device, StateTracker& state_tracker_) - : state_tracker{state_tracker_} { - gl_buffer.Create(); - - static constexpr GLbitfield flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT; - glNamedBufferStorage(gl_buffer.handle, BUFFER_SIZE, nullptr, flags); - mapped_ptr = static_cast( - glMapNamedBufferRange(gl_buffer.handle, 0, BUFFER_SIZE, flags | GL_MAP_FLUSH_EXPLICIT_BIT)); - - if (device.UseAssemblyShaders() || device.HasVertexBufferUnifiedMemory()) { - glMakeNamedBufferResidentNV(gl_buffer.handle, GL_READ_ONLY); - glGetNamedBufferParameterui64vNV(gl_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, &gpu_address); +StreamBuffer::StreamBuffer() { + static constexpr GLenum flags = GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | GL_MAP_COHERENT_BIT; + buffer.Create(); + glObjectLabel(GL_BUFFER, buffer.handle, -1, "Stream Buffer"); + glNamedBufferStorage(buffer.handle, STREAM_BUFFER_SIZE, nullptr, flags); + mapped_pointer = + static_cast(glMapNamedBufferRange(buffer.handle, 0, STREAM_BUFFER_SIZE, flags)); + for (OGLSync& sync : fences) { + sync.Create(); } } -OGLStreamBuffer::~OGLStreamBuffer() { - glUnmapNamedBuffer(gl_buffer.handle); - gl_buffer.Release(); -} - -std::pair OGLStreamBuffer::Map(GLsizeiptr size, GLintptr alignment) { - ASSERT(size <= BUFFER_SIZE); - ASSERT(alignment <= BUFFER_SIZE); - mapped_size = size; - - if (alignment > 0) { - buffer_pos = Common::AlignUp(buffer_pos, alignment); +std::pair, size_t> StreamBuffer::Request(size_t size) noexcept { + ASSERT(size < REGION_SIZE); + for (size_t region = Region(used_iterator), region_end = Region(iterator); region < region_end; + ++region) { + fences[region].Create(); } + used_iterator = iterator; - if (buffer_pos + size > BUFFER_SIZE) { - MICROPROFILE_SCOPE(OpenGL_StreamBuffer); - glInvalidateBufferData(gl_buffer.handle); - state_tracker.InvalidateStreamBuffer(); - - buffer_pos = 0; + for (size_t region = Region(free_iterator) + 1, + region_end = std::min(Region(iterator + size) + 1, NUM_SYNCS); + region < region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); } - - return std::make_pair(mapped_ptr + buffer_pos, buffer_pos); -} - -void OGLStreamBuffer::Unmap(GLsizeiptr size) { - ASSERT(size <= mapped_size); - - if (size > 0) { - glFlushMappedNamedBufferRange(gl_buffer.handle, buffer_pos, size); + if (iterator + size > free_iterator) { + free_iterator = iterator + size; } + if (iterator + size > STREAM_BUFFER_SIZE) { + for (size_t region = Region(used_iterator); region < NUM_SYNCS; ++region) { + fences[region].Create(); + } + used_iterator = 0; + iterator = 0; + free_iterator = size; - buffer_pos += size; + for (size_t region = 0, region_end = Region(size); region <= region_end; ++region) { + glClientWaitSync(fences[region].handle, 0, GL_TIMEOUT_IGNORED); + fences[region].Release(); + } + } + const size_t offset = iterator; + iterator = Common::AlignUp(iterator + size, MAX_ALIGNMENT); + return {std::span(mapped_pointer + offset, size), offset}; } } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_stream_buffer.h b/src/video_core/renderer_opengl/gl_stream_buffer.h index dd9cf67eb..6dbb6bfba 100755 --- a/src/video_core/renderer_opengl/gl_stream_buffer.h +++ b/src/video_core/renderer_opengl/gl_stream_buffer.h @@ -1,9 +1,12 @@ -// Copyright 2018 Citra Emulator Project +// Copyright 2021 yuzu Emulator Project // Licensed under GPLv2 or any later version // Refer to the license.txt file included. #pragma once +#include +#include +#include #include #include @@ -13,48 +16,35 @@ namespace OpenGL { -class Device; -class StateTracker; +class StreamBuffer { + static constexpr size_t STREAM_BUFFER_SIZE = 64 * 1024 * 1024; + static constexpr size_t NUM_SYNCS = 16; + static constexpr size_t REGION_SIZE = STREAM_BUFFER_SIZE / NUM_SYNCS; + static constexpr size_t MAX_ALIGNMENT = 256; + static_assert(STREAM_BUFFER_SIZE % MAX_ALIGNMENT == 0); + static_assert(STREAM_BUFFER_SIZE % NUM_SYNCS == 0); + static_assert(REGION_SIZE % MAX_ALIGNMENT == 0); -class OGLStreamBuffer : private NonCopyable { public: - explicit OGLStreamBuffer(const Device& device, StateTracker& state_tracker_); - ~OGLStreamBuffer(); + explicit StreamBuffer(); - /* - * Allocates a linear chunk of memory in the GPU buffer with at least "size" bytes - * and the optional alignment requirement. - * If the buffer is full, the whole buffer is reallocated which invalidates old chunks. - * The return values are the pointer to the new chunk, and the offset within the buffer. - * The actual used size must be specified on unmapping the chunk. - */ - std::pair Map(GLsizeiptr size, GLintptr alignment = 0); + [[nodiscard]] std::pair, size_t> Request(size_t size) noexcept; - void Unmap(GLsizeiptr size); - - GLuint Handle() const { - return gl_buffer.handle; - } - - u64 Address() const { - return gpu_address; - } - - GLsizeiptr Size() const noexcept { - return BUFFER_SIZE; + [[nodiscard]] GLuint Handle() const noexcept { + return buffer.handle; } private: - static constexpr GLsizeiptr BUFFER_SIZE = 256 * 1024 * 1024; + [[nodiscard]] static size_t Region(size_t offset) noexcept { + return offset / REGION_SIZE; + } - StateTracker& state_tracker; - - OGLBuffer gl_buffer; - - GLuint64EXT gpu_address = 0; - GLintptr buffer_pos = 0; - GLsizeiptr mapped_size = 0; - u8* mapped_ptr = nullptr; + size_t iterator = 0; + size_t used_iterator = 0; + size_t free_iterator = 0; + u8* mapped_pointer = nullptr; + OGLBuffer buffer; + std::array fences; }; } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/gl_texture_cache.cpp b/src/video_core/renderer_opengl/gl_texture_cache.cpp index 546cb6d00..37572ab28 100755 --- a/src/video_core/renderer_opengl/gl_texture_cache.cpp +++ b/src/video_core/renderer_opengl/gl_texture_cache.cpp @@ -398,9 +398,6 @@ void AttachTexture(GLuint fbo, GLenum attachment, const ImageView* image_view) { } // Anonymous namespace -ImageBufferMap::ImageBufferMap(GLuint handle_, u8* map, size_t size, OGLSync* sync_) - : span(map, size), sync{sync_}, handle{handle_} {} - ImageBufferMap::~ImageBufferMap() { if (sync) { sync->Create(); @@ -487,11 +484,11 @@ void TextureCacheRuntime::Finish() { glFinish(); } -ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { +ImageBufferMap TextureCacheRuntime::UploadStagingBuffer(size_t size) { return upload_buffers.RequestMap(size, true); } -ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { +ImageBufferMap TextureCacheRuntime::DownloadStagingBuffer(size_t size) { return download_buffers.RequestMap(size, false); } @@ -596,7 +593,11 @@ ImageBufferMap TextureCacheRuntime::StagingBuffers::RequestMap(size_t requested_ bool insert_fence) { const size_t index = RequestBuffer(requested_size); OGLSync* const sync = insert_fence ? &syncs[index] : nullptr; - return ImageBufferMap(buffers[index].handle, maps[index], requested_size, sync); + return ImageBufferMap{ + .mapped_span = std::span(maps[index], requested_size), + .sync = sync, + .buffer = buffers[index].handle, + }; } size_t TextureCacheRuntime::StagingBuffers::RequestBuffer(size_t requested_size) { @@ -711,7 +712,7 @@ Image::Image(TextureCacheRuntime& runtime, const VideoCommon::ImageInfo& info_, void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, std::span copies) { - glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.Handle()); + glBindBuffer(GL_PIXEL_UNPACK_BUFFER, map.buffer); glFlushMappedBufferRange(GL_PIXEL_UNPACK_BUFFER, buffer_offset, unswizzled_size_bytes); glPixelStorei(GL_UNPACK_ALIGNMENT, 1); @@ -735,7 +736,7 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, std::span copies) { for (const VideoCommon::BufferCopy& copy : copies) { - glCopyNamedBufferSubData(map.Handle(), buffer.handle, copy.src_offset + buffer_offset, + glCopyNamedBufferSubData(map.buffer, buffer.handle, copy.src_offset + buffer_offset, copy.dst_offset, copy.size); } } @@ -744,7 +745,7 @@ void Image::DownloadMemory(ImageBufferMap& map, size_t buffer_offset, std::span copies) { glMemoryBarrier(GL_PIXEL_BUFFER_BARRIER_BIT); // TODO: Move this to its own API - glBindBuffer(GL_PIXEL_PACK_BUFFER, map.Handle()); + glBindBuffer(GL_PIXEL_PACK_BUFFER, map.buffer); glPixelStorei(GL_PACK_ALIGNMENT, 1); u32 current_row_length = std::numeric_limits::max(); diff --git a/src/video_core/renderer_opengl/gl_texture_cache.h b/src/video_core/renderer_opengl/gl_texture_cache.h index 15b7c3676..60d08d6d6 100755 --- a/src/video_core/renderer_opengl/gl_texture_cache.h +++ b/src/video_core/renderer_opengl/gl_texture_cache.h @@ -31,23 +31,12 @@ using VideoCommon::NUM_RT; using VideoCommon::Offset2D; using VideoCommon::RenderTargets; -class ImageBufferMap { -public: - explicit ImageBufferMap(GLuint handle, u8* map, size_t size, OGLSync* sync); +struct ImageBufferMap { ~ImageBufferMap(); - GLuint Handle() const noexcept { - return handle; - } - - std::span Span() const noexcept { - return span; - } - -private: - std::span span; + std::span mapped_span; OGLSync* sync; - GLuint handle; + GLuint buffer; }; struct FormatProperties { @@ -69,9 +58,9 @@ public: void Finish(); - ImageBufferMap MapUploadBuffer(size_t size); + ImageBufferMap UploadStagingBuffer(size_t size); - ImageBufferMap MapDownloadBuffer(size_t size); + ImageBufferMap DownloadStagingBuffer(size_t size); void CopyImage(Image& dst, Image& src, std::span copies); diff --git a/src/video_core/renderer_opengl/renderer_opengl.cpp b/src/video_core/renderer_opengl/renderer_opengl.cpp index dd77a543c..8fcb86581 100755 --- a/src/video_core/renderer_opengl/renderer_opengl.cpp +++ b/src/video_core/renderer_opengl/renderer_opengl.cpp @@ -27,11 +27,14 @@ #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/renderer_opengl.h" #include "video_core/textures/decoders.h" +#include "video_core/vulkan_common/vulkan_debug_callback.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_instance.h" +#include "video_core/vulkan_common/vulkan_library.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" namespace OpenGL { - namespace { - constexpr GLint PositionLocation = 0; constexpr GLint TexCoordLocation = 1; constexpr GLint ModelViewMatrixLocation = 0; @@ -125,14 +128,100 @@ void APIENTRY DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severit } } +Vulkan::vk::PhysicalDevice FindPhysicalDevice(Vulkan::vk::Instance& instance) { + using namespace Vulkan; + using UUID = std::array; + + GLint num_device_uuids; + glGetIntegerv(GL_NUM_DEVICE_UUIDS_EXT, &num_device_uuids); + std::vector device_uuids(num_device_uuids); + for (GLint index = 0; index < num_device_uuids; ++index) { + glGetUnsignedBytei_vEXT(GL_DEVICE_UUID_EXT, 0, device_uuids[index].data()); + } + UUID driver_uuid; + glGetUnsignedBytevEXT(GL_DRIVER_UUID_EXT, driver_uuid.data()); + + for (const VkPhysicalDevice raw_physical_device : instance.EnumeratePhysicalDevices()) { + VkPhysicalDeviceIDProperties device_id_properties{}; + device_id_properties.sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ID_PROPERTIES; + + VkPhysicalDeviceProperties2KHR properties{ + .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_PROPERTIES_2_KHR, + .pNext = &device_id_properties, + .properties{}, + }; + vk::PhysicalDevice physical_device(raw_physical_device, instance.Dispatch()); + physical_device.GetProperties2KHR(properties); + if (!std::ranges::equal(device_id_properties.driverUUID, driver_uuid)) { + continue; + } + const auto it = + std::ranges::find_if(device_uuids, [&device_id_properties, driver_uuid](UUID uuid) { + return std::ranges::equal(device_id_properties.deviceUUID, uuid); + }); + if (it != device_uuids.end()) { + return physical_device; + } + } + throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); +} } // Anonymous namespace +struct VulkanObjects { + static std::unique_ptr TryCreate() { + if (!GLAD_GL_EXT_memory_object) { + // Interop is not present + return nullptr; + } + const std::string_view vendor{reinterpret_cast(glGetString(GL_VENDOR))}; + if (vendor == "ATI Technologies Inc.") { + // Avoid using GL_EXT_memory_object on AMD, as it makes the GL driver crash + return nullptr; + } + if (!Settings::values.use_assembly_shaders.GetValue()) { + // We only need interop when assembly shaders are enabled + return nullptr; + } +#ifdef __linux__ + LOG_WARNING(Render_OpenGL, "Interop doesn't work on Linux at the moment"); + return nullptr; +#endif + try { + return std::make_unique(); + } catch (const Vulkan::vk::Exception& exception) { + LOG_ERROR(Render_OpenGL, "Failed to initialize Vulkan objects with error: {}", + exception.what()); + return nullptr; + } + } + + Common::DynamicLibrary library{Vulkan::OpenLibrary()}; + Vulkan::vk::InstanceDispatch dld; + Vulkan::vk::Instance instance{Vulkan::CreateInstance(library, dld, VK_API_VERSION_1_1)}; + Vulkan::Device device{*instance, FindPhysicalDevice(instance), nullptr, dld}; + Vulkan::MemoryAllocator memory_allocator{device, true}; +}; + RendererOpenGL::RendererOpenGL(Core::TelemetrySession& telemetry_session_, Core::Frontend::EmuWindow& emu_window_, Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, std::unique_ptr context_) : RendererBase{emu_window_, std::move(context_)}, telemetry_session{telemetry_session_}, - emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, program_manager{device} {} + emu_window{emu_window_}, cpu_memory{cpu_memory_}, gpu{gpu_}, + vulkan_objects{VulkanObjects::TryCreate()}, device{vulkan_objects != nullptr}, + state_tracker{gpu}, program_manager{device}, + rasterizer(emu_window, gpu, cpu_memory, device, + vulkan_objects ? &vulkan_objects->device : nullptr, + vulkan_objects ? &vulkan_objects->memory_allocator : nullptr, screen_info, + program_manager, state_tracker) { + if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { + glEnable(GL_DEBUG_OUTPUT); + glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); + glDebugMessageCallback(DebugHandler, nullptr); + } + AddTelemetryFields(); + InitOpenGLObjects(); +} RendererOpenGL::~RendererOpenGL() = default; @@ -148,7 +237,7 @@ void RendererOpenGL::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { ++m_current_frame; - rasterizer->TickFrame(); + rasterizer.TickFrame(); context->SwapBuffers(); render_window.OnFrameDisplayed(); @@ -179,7 +268,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const Tegra::FramebufferConfig& framebuf framebuffer_crop_rect = framebuffer.crop_rect; const VAddr framebuffer_addr{framebuffer.address + framebuffer.offset}; - if (rasterizer->AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { + if (rasterizer.AccelerateDisplay(framebuffer, framebuffer_addr, framebuffer.stride)) { return; } @@ -267,6 +356,7 @@ void RendererOpenGL::InitOpenGLObjects() { // Enable unified vertex attributes and query vertex buffer address when the driver supports it if (device.HasVertexBufferUnifiedMemory()) { glEnableClientState(GL_VERTEX_ATTRIB_ARRAY_UNIFIED_NV); + glEnableClientState(GL_ELEMENT_ARRAY_UNIFIED_NV); glMakeNamedBufferResidentNV(vertex_buffer.handle, GL_READ_ONLY); glGetNamedBufferParameterui64vNV(vertex_buffer.handle, GL_BUFFER_GPU_ADDRESS_NV, @@ -289,14 +379,6 @@ void RendererOpenGL::AddTelemetryFields() { telemetry_session.AddField(user_system, "GPU_OpenGL_Version", std::string(gl_version)); } -void RendererOpenGL::CreateRasterizer() { - if (rasterizer) { - return; - } - rasterizer = std::make_unique(emu_window, gpu, cpu_memory, device, - screen_info, program_manager, state_tracker); -} - void RendererOpenGL::ConfigureFramebufferTexture(TextureInfo& texture, const Tegra::FramebufferConfig& framebuffer) { texture.width = framebuffer.width; @@ -407,6 +489,7 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { program_manager.BindHostPipeline(pipeline.handle); + state_tracker.ClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); glEnable(GL_CULL_FACE); if (screen_info.display_srgb) { glEnable(GL_FRAMEBUFFER_SRGB); @@ -425,7 +508,6 @@ void RendererOpenGL::DrawScreen(const Layout::FramebufferLayout& layout) { glCullFace(GL_BACK); glFrontFace(GL_CW); glColorMaski(0, GL_TRUE, GL_TRUE, GL_TRUE, GL_TRUE); - glClipControl(GL_LOWER_LEFT, GL_ZERO_TO_ONE); glViewportIndexedf(0, 0.0f, 0.0f, static_cast(layout.width), static_cast(layout.height)); glDepthRangeIndexed(0, 0.0, 0.0); @@ -497,25 +579,4 @@ void RendererOpenGL::RenderScreenshot() { renderer_settings.screenshot_requested = false; } -bool RendererOpenGL::Init() { - if (Settings::values.renderer_debug && GLAD_GL_KHR_debug) { - glEnable(GL_DEBUG_OUTPUT); - glEnable(GL_DEBUG_OUTPUT_SYNCHRONOUS); - glDebugMessageCallback(DebugHandler, nullptr); - } - - AddTelemetryFields(); - - if (!GLAD_GL_VERSION_4_3) { - return false; - } - - InitOpenGLObjects(); - CreateRasterizer(); - - return true; -} - -void RendererOpenGL::ShutDown() {} - } // namespace OpenGL diff --git a/src/video_core/renderer_opengl/renderer_opengl.h b/src/video_core/renderer_opengl/renderer_opengl.h index 44e109794..f210190dd 100755 --- a/src/video_core/renderer_opengl/renderer_opengl.h +++ b/src/video_core/renderer_opengl/renderer_opengl.h @@ -10,6 +10,7 @@ #include "common/math_util.h" #include "video_core/renderer_base.h" #include "video_core/renderer_opengl/gl_device.h" +#include "video_core/renderer_opengl/gl_rasterizer.h" #include "video_core/renderer_opengl/gl_resource_manager.h" #include "video_core/renderer_opengl/gl_shader_manager.h" #include "video_core/renderer_opengl/gl_state_tracker.h" @@ -37,6 +38,8 @@ class GPU; namespace OpenGL { +struct VulkanObjects; + /// Structure used for storing information about the textures for the Switch screen struct TextureInfo { OGLTexture resource; @@ -63,18 +66,18 @@ public: std::unique_ptr context_); ~RendererOpenGL() override; - bool Init() override; - void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; + VideoCore::RasterizerInterface* ReadRasterizer() override { + return &rasterizer; + } + private: /// Initializes the OpenGL state and creates persistent objects. void InitOpenGLObjects(); void AddTelemetryFields(); - void CreateRasterizer(); - void ConfigureFramebufferTexture(TextureInfo& texture, const Tegra::FramebufferConfig& framebuffer); @@ -98,8 +101,11 @@ private: Core::Memory::Memory& cpu_memory; Tegra::GPU& gpu; - const Device device; - StateTracker state_tracker{gpu}; + std::unique_ptr vulkan_objects; + Device device; + StateTracker state_tracker; + ProgramManager program_manager; + RasterizerOpenGL rasterizer; // OpenGL object IDs OGLSampler present_sampler; @@ -115,9 +121,6 @@ private: /// Display information for Switch screen ScreenInfo screen_info; - /// Global dummy shader pipeline - ProgramManager program_manager; - /// OpenGL framebuffer data std::vector gl_framebuffer_data; diff --git a/src/video_core/renderer_opengl/util_shaders.cpp b/src/video_core/renderer_opengl/util_shaders.cpp index eb849cbf2..aeb36551c 100755 --- a/src/video_core/renderer_opengl/util_shaders.cpp +++ b/src/video_core/renderer_opengl/util_shaders.cpp @@ -71,7 +71,7 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; program_manager.BindHostCompute(block_linear_unswizzle_2d_program.handle); - glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); const GLenum store_format = StoreFormat(BytesPerBlock(image.info.format)); @@ -91,8 +91,8 @@ void UtilShaders::BlockLinearUpload2D(Image& image, const ImageBufferMap& map, s glUniform1ui(5, params.x_shift); glUniform1ui(6, params.block_height); glUniform1ui(7, params.block_height_mask); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), - input_offset, image.guest_size_bytes - swizzle.buffer_offset); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, GL_WRITE_ONLY, store_format); glDispatchCompute(num_dispatches_x, num_dispatches_y, image.info.resources.layers); @@ -108,7 +108,7 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s static constexpr GLuint BINDING_INPUT_BUFFER = 1; static constexpr GLuint BINDING_OUTPUT_IMAGE = 0; - glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); program_manager.BindHostCompute(block_linear_unswizzle_3d_program.handle); glBindBufferBase(GL_SHADER_STORAGE_BUFFER, BINDING_SWIZZLE_BUFFER, swizzle_table_buffer.handle); @@ -132,8 +132,8 @@ void UtilShaders::BlockLinearUpload3D(Image& image, const ImageBufferMap& map, s glUniform1ui(7, params.block_height_mask); glUniform1ui(8, params.block_depth); glUniform1ui(9, params.block_depth_mask); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), - input_offset, image.guest_size_bytes - swizzle.buffer_offset); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); glBindImageTexture(BINDING_OUTPUT_IMAGE, image.Handle(), swizzle.level, GL_TRUE, 0, GL_WRITE_ONLY, store_format); glDispatchCompute(num_dispatches_x, num_dispatches_y, num_dispatches_z); @@ -159,7 +159,7 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu "Non-power of two images are not implemented"); program_manager.BindHostCompute(pitch_unswizzle_program.handle); - glFlushMappedNamedBufferRange(map.Handle(), buffer_offset, image.guest_size_bytes); + glFlushMappedNamedBufferRange(map.buffer, buffer_offset, image.guest_size_bytes); glUniform2ui(LOC_ORIGIN, 0, 0); glUniform2i(LOC_DESTINATION, 0, 0); glUniform1ui(LOC_BYTES_PER_BLOCK, bytes_per_block); @@ -172,8 +172,8 @@ void UtilShaders::PitchUpload(Image& image, const ImageBufferMap& map, size_t bu const u32 num_dispatches_x = Common::DivCeil(num_tiles.width, WORKGROUP_SIZE.width); const u32 num_dispatches_y = Common::DivCeil(num_tiles.height, WORKGROUP_SIZE.height); - glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.Handle(), - input_offset, image.guest_size_bytes - swizzle.buffer_offset); + glBindBufferRange(GL_SHADER_STORAGE_BUFFER, BINDING_INPUT_BUFFER, map.buffer, input_offset, + image.guest_size_bytes - swizzle.buffer_offset); glDispatchCompute(num_dispatches_x, num_dispatches_y, 1); } program_manager.RestoreGuestCompute(); diff --git a/src/video_core/renderer_opengl/util_shaders.h b/src/video_core/renderer_opengl/util_shaders.h index 359997255..bec026bc3 100755 --- a/src/video_core/renderer_opengl/util_shaders.h +++ b/src/video_core/renderer_opengl/util_shaders.h @@ -15,9 +15,10 @@ namespace OpenGL { class Image; -class ImageBufferMap; class ProgramManager; +struct ImageBufferMap; + class UtilShaders { public: explicit UtilShaders(ProgramManager& program_manager); diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp index ca7c2c579..bb982b2fc 100755 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.cpp +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.cpp @@ -526,13 +526,9 @@ VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison) { return {}; } -VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format) { +VkIndexType IndexFormat(Maxwell::IndexFormat index_format) { switch (index_format) { case Maxwell::IndexFormat::UnsignedByte: - if (!device.IsExtIndexTypeUint8Supported()) { - UNIMPLEMENTED_MSG("Native uint8 indices are not supported on this device"); - return VK_INDEX_TYPE_UINT16; - } return VK_INDEX_TYPE_UINT8_EXT; case Maxwell::IndexFormat::UnsignedShort: return VK_INDEX_TYPE_UINT16; diff --git a/src/video_core/renderer_vulkan/maxwell_to_vk.h b/src/video_core/renderer_vulkan/maxwell_to_vk.h index 537969840..805e40077 100755 --- a/src/video_core/renderer_vulkan/maxwell_to_vk.h +++ b/src/video_core/renderer_vulkan/maxwell_to_vk.h @@ -45,7 +45,7 @@ VkFormat VertexFormat(Maxwell::VertexAttribute::Type type, Maxwell::VertexAttrib VkCompareOp ComparisonOp(Maxwell::ComparisonOp comparison); -VkIndexType IndexFormat(const Device& device, Maxwell::IndexFormat index_format); +VkIndexType IndexFormat(Maxwell::IndexFormat index_format); VkStencilOp StencilOp(Maxwell::StencilOp stencil_op); diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.cpp b/src/video_core/renderer_vulkan/renderer_vulkan.cpp index 61796e33a..1cc720ddd 100755 --- a/src/video_core/renderer_vulkan/renderer_vulkan.cpp +++ b/src/video_core/renderer_vulkan/renderer_vulkan.cpp @@ -80,17 +80,50 @@ std::string BuildCommaSeparatedExtensions(std::vector available_ext return separated_extensions; } +Device CreateDevice(const vk::Instance& instance, const vk::InstanceDispatch& dld, + VkSurfaceKHR surface) { + const std::vector devices = instance.EnumeratePhysicalDevices(); + const s32 device_index = Settings::values.vulkan_device.GetValue(); + if (device_index < 0 || device_index >= static_cast(devices.size())) { + LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); + throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); + } + const vk::PhysicalDevice physical_device(devices[device_index], dld); + return Device(*instance, physical_device, surface, dld); +} } // Anonymous namespace RendererVulkan::RendererVulkan(Core::TelemetrySession& telemetry_session_, Core::Frontend::EmuWindow& emu_window, Core::Memory::Memory& cpu_memory_, Tegra::GPU& gpu_, - std::unique_ptr context_) - : RendererBase{emu_window, std::move(context_)}, telemetry_session{telemetry_session_}, - cpu_memory{cpu_memory_}, gpu{gpu_} {} + std::unique_ptr context_) try + : RendererBase(emu_window, std::move(context_)), + telemetry_session(telemetry_session_), + cpu_memory(cpu_memory_), + gpu(gpu_), + library(OpenLibrary()), + instance(CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, + true, Settings::values.renderer_debug)), + debug_callback(Settings::values.renderer_debug ? CreateDebugCallback(instance) : nullptr), + surface(CreateSurface(instance, render_window)), + device(CreateDevice(instance, dld, *surface)), + memory_allocator(device, false), + state_tracker(gpu), + scheduler(device, state_tracker), + swapchain(*surface, device, scheduler, render_window.GetFramebufferLayout().width, + render_window.GetFramebufferLayout().height, false), + blit_screen(cpu_memory, render_window, device, memory_allocator, swapchain, scheduler, + screen_info), + rasterizer(render_window, gpu, gpu.MemoryManager(), cpu_memory, screen_info, device, + memory_allocator, state_tracker, scheduler) { + Report(); +} catch (const vk::Exception& exception) { + LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what()); + throw std::runtime_error{fmt::format("Vulkan initialization error {}", exception.what())}; +} RendererVulkan::~RendererVulkan() { - ShutDown(); + void(device.GetLogical().WaitIdle()); } void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { @@ -101,101 +134,38 @@ void RendererVulkan::SwapBuffers(const Tegra::FramebufferConfig* framebuffer) { if (layout.width > 0 && layout.height > 0 && render_window.IsShown()) { const VAddr framebuffer_addr = framebuffer->address + framebuffer->offset; const bool use_accelerated = - rasterizer->AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); + rasterizer.AccelerateDisplay(*framebuffer, framebuffer_addr, framebuffer->stride); const bool is_srgb = use_accelerated && screen_info.is_srgb; - if (swapchain->HasFramebufferChanged(layout) || swapchain->GetSrgbState() != is_srgb) { - swapchain->Create(layout.width, layout.height, is_srgb); - blit_screen->Recreate(); + if (swapchain.HasFramebufferChanged(layout) || swapchain.GetSrgbState() != is_srgb) { + swapchain.Create(layout.width, layout.height, is_srgb); + blit_screen.Recreate(); } - scheduler->WaitWorker(); + scheduler.WaitWorker(); - swapchain->AcquireNextImage(); - const VkSemaphore render_semaphore = blit_screen->Draw(*framebuffer, use_accelerated); + swapchain.AcquireNextImage(); + const VkSemaphore render_semaphore = blit_screen.Draw(*framebuffer, use_accelerated); - scheduler->Flush(render_semaphore); + scheduler.Flush(render_semaphore); - if (swapchain->Present(render_semaphore)) { - blit_screen->Recreate(); + if (swapchain.Present(render_semaphore)) { + blit_screen.Recreate(); } - - rasterizer->TickFrame(); + rasterizer.TickFrame(); } render_window.OnFrameDisplayed(); } -bool RendererVulkan::Init() try { - library = OpenLibrary(); - instance = CreateInstance(library, dld, VK_API_VERSION_1_1, render_window.GetWindowInfo().type, - true, Settings::values.renderer_debug); - if (Settings::values.renderer_debug) { - debug_callback = CreateDebugCallback(instance); - } - surface = CreateSurface(instance, render_window); - - InitializeDevice(); - Report(); - - memory_allocator = std::make_unique(*device); - - state_tracker = std::make_unique(gpu); - - scheduler = std::make_unique(*device, *state_tracker); - - const auto& framebuffer = render_window.GetFramebufferLayout(); - swapchain = std::make_unique(*surface, *device, *scheduler); - swapchain->Create(framebuffer.width, framebuffer.height, false); - - rasterizer = std::make_unique(render_window, gpu, gpu.MemoryManager(), - cpu_memory, screen_info, *device, - *memory_allocator, *state_tracker, *scheduler); - - blit_screen = - std::make_unique(cpu_memory, render_window, *rasterizer, *device, - *memory_allocator, *swapchain, *scheduler, screen_info); - return true; - -} catch (const vk::Exception& exception) { - LOG_ERROR(Render_Vulkan, "Vulkan initialization failed with error: {}", exception.what()); - return false; -} - -void RendererVulkan::ShutDown() { - if (!device) { - return; - } - if (const auto& dev = device->GetLogical()) { - dev.WaitIdle(); - } - rasterizer.reset(); - blit_screen.reset(); - scheduler.reset(); - swapchain.reset(); - memory_allocator.reset(); - device.reset(); -} - -void RendererVulkan::InitializeDevice() { - const std::vector devices = instance.EnumeratePhysicalDevices(); - const s32 device_index = Settings::values.vulkan_device.GetValue(); - if (device_index < 0 || device_index >= static_cast(devices.size())) { - LOG_ERROR(Render_Vulkan, "Invalid device index {}!", device_index); - throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); - } - const vk::PhysicalDevice physical_device(devices[static_cast(device_index)], dld); - device = std::make_unique(*instance, physical_device, *surface, dld); -} - void RendererVulkan::Report() const { - const std::string vendor_name{device->GetVendorName()}; - const std::string model_name{device->GetModelName()}; - const std::string driver_version = GetDriverVersion(*device); + const std::string vendor_name{device.GetVendorName()}; + const std::string model_name{device.GetModelName()}; + const std::string driver_version = GetDriverVersion(device); const std::string driver_name = fmt::format("{} {}", vendor_name, driver_version); - const std::string api_version = GetReadableVersion(device->ApiVersion()); + const std::string api_version = GetReadableVersion(device.ApiVersion()); - const std::string extensions = BuildCommaSeparatedExtensions(device->GetAvailableExtensions()); + const std::string extensions = BuildCommaSeparatedExtensions(device.GetAvailableExtensions()); LOG_INFO(Render_Vulkan, "Driver: {}", driver_name); LOG_INFO(Render_Vulkan, "Device: {}", model_name); @@ -209,21 +179,4 @@ void RendererVulkan::Report() const { telemetry_session.AddField(field, "GPU_Vulkan_Extensions", extensions); } -std::vector RendererVulkan::EnumerateDevices() try { - vk::InstanceDispatch dld; - const Common::DynamicLibrary library = OpenLibrary(); - const vk::Instance instance = CreateInstance(library, dld, VK_API_VERSION_1_0); - const std::vector physical_devices = instance.EnumeratePhysicalDevices(); - std::vector names; - names.reserve(physical_devices.size()); - for (const VkPhysicalDevice device : physical_devices) { - names.push_back(vk::PhysicalDevice(device, dld).GetProperties().deviceName); - } - return names; - -} catch (const vk::Exception& exception) { - LOG_ERROR(Render_Vulkan, "Failed to enumerate devices with error: {}", exception.what()); - return {}; -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/renderer_vulkan.h b/src/video_core/renderer_vulkan/renderer_vulkan.h index daf55b9b4..72071316c 100755 --- a/src/video_core/renderer_vulkan/renderer_vulkan.h +++ b/src/video_core/renderer_vulkan/renderer_vulkan.h @@ -9,8 +9,14 @@ #include #include "common/dynamic_library.h" - #include "video_core/renderer_base.h" +#include "video_core/renderer_vulkan/vk_blit_screen.h" +#include "video_core/renderer_vulkan/vk_rasterizer.h" +#include "video_core/renderer_vulkan/vk_scheduler.h" +#include "video_core/renderer_vulkan/vk_state_tracker.h" +#include "video_core/renderer_vulkan/vk_swapchain.h" +#include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Core { @@ -27,20 +33,6 @@ class GPU; namespace Vulkan { -class Device; -class StateTracker; -class MemoryAllocator; -class VKBlitScreen; -class VKSwapchain; -class VKScheduler; - -struct VKScreenInfo { - VkImageView image_view{}; - u32 width{}; - u32 height{}; - bool is_srgb{}; -}; - class RendererVulkan final : public VideoCore::RendererBase { public: explicit RendererVulkan(Core::TelemetrySession& telemtry_session, @@ -49,15 +41,13 @@ public: std::unique_ptr context_); ~RendererVulkan() override; - bool Init() override; - void ShutDown() override; void SwapBuffers(const Tegra::FramebufferConfig* framebuffer) override; - static std::vector EnumerateDevices(); + VideoCore::RasterizerInterface* ReadRasterizer() override { + return &rasterizer; + } private: - void InitializeDevice(); - void Report() const; Core::TelemetrySession& telemetry_session; @@ -68,18 +58,18 @@ private: vk::InstanceDispatch dld; vk::Instance instance; - + vk::DebugUtilsMessenger debug_callback; vk::SurfaceKHR surface; VKScreenInfo screen_info; - vk::DebugUtilsMessenger debug_callback; - std::unique_ptr device; - std::unique_ptr memory_allocator; - std::unique_ptr state_tracker; - std::unique_ptr scheduler; - std::unique_ptr swapchain; - std::unique_ptr blit_screen; + Device device; + MemoryAllocator memory_allocator; + StateTracker state_tracker; + VKScheduler scheduler; + VKSwapchain swapchain; + VKBlitScreen blit_screen; + RasterizerVulkan rasterizer; }; } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.cpp b/src/video_core/renderer_vulkan/vk_blit_screen.cpp index 3e3b895e0..a1a32aabe 100755 --- a/src/video_core/renderer_vulkan/vk_blit_screen.cpp +++ b/src/video_core/renderer_vulkan/vk_blit_screen.cpp @@ -18,7 +18,6 @@ #include "video_core/gpu.h" #include "video_core/host_shaders/vulkan_present_frag_spv.h" #include "video_core/host_shaders/vulkan_present_vert_spv.h" -#include "video_core/rasterizer_interface.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_blit_screen.h" #include "video_core/renderer_vulkan/vk_master_semaphore.h" @@ -113,13 +112,12 @@ struct VKBlitScreen::BufferData { }; VKBlitScreen::VKBlitScreen(Core::Memory::Memory& cpu_memory_, - Core::Frontend::EmuWindow& render_window_, - VideoCore::RasterizerInterface& rasterizer_, const Device& device_, + Core::Frontend::EmuWindow& render_window_, const Device& device_, MemoryAllocator& memory_allocator_, VKSwapchain& swapchain_, VKScheduler& scheduler_, const VKScreenInfo& screen_info_) - : cpu_memory{cpu_memory_}, render_window{render_window_}, rasterizer{rasterizer_}, - device{device_}, memory_allocator{memory_allocator_}, swapchain{swapchain_}, - scheduler{scheduler_}, image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { + : cpu_memory{cpu_memory_}, render_window{render_window_}, device{device_}, + memory_allocator{memory_allocator_}, swapchain{swapchain_}, scheduler{scheduler_}, + image_count{swapchain.GetImageCount()}, screen_info{screen_info_} { resource_ticks.resize(image_count); CreateStaticResources(); @@ -150,8 +148,8 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool SetUniformData(data, framebuffer); SetVertexData(data, framebuffer); - const std::span map = buffer_commit.Map(); - std::memcpy(map.data(), &data, sizeof(data)); + const std::span mapped_span = buffer_commit.Map(); + std::memcpy(mapped_span.data(), &data, sizeof(data)); if (!use_accelerated) { const u64 image_offset = GetRawImageOffset(framebuffer, image_index); @@ -159,14 +157,13 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool const VAddr framebuffer_addr = framebuffer.address + framebuffer.offset; const u8* const host_ptr = cpu_memory.GetPointer(framebuffer_addr); const size_t size_bytes = GetSizeInBytes(framebuffer); - rasterizer.FlushRegion(ToCacheAddr(host_ptr), size_bytes); // TODO(Rodrigo): Read this from HLE constexpr u32 block_height_log2 = 4; const u32 bytes_per_pixel = GetBytesPerPixel(framebuffer); Tegra::Texture::UnswizzleTexture( - map.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), bytes_per_pixel, - framebuffer.width, framebuffer.height, 1, block_height_log2, 0); + mapped_span.subspan(image_offset, size_bytes), std::span(host_ptr, size_bytes), + bytes_per_pixel, framebuffer.width, framebuffer.height, 1, block_height_log2, 0); const VkBufferImageCopy copy{ .bufferOffset = image_offset, @@ -266,7 +263,6 @@ VkSemaphore VKBlitScreen::Draw(const Tegra::FramebufferConfig& framebuffer, bool cmdbuf.Draw(4, 1, 0, 0); cmdbuf.EndRenderPass(); }); - return *semaphores[image_index]; } diff --git a/src/video_core/renderer_vulkan/vk_blit_screen.h b/src/video_core/renderer_vulkan/vk_blit_screen.h index b52576957..5e3177685 100755 --- a/src/video_core/renderer_vulkan/vk_blit_screen.h +++ b/src/video_core/renderer_vulkan/vk_blit_screen.h @@ -38,12 +38,18 @@ class RasterizerVulkan; class VKScheduler; class VKSwapchain; -class VKBlitScreen final { +struct VKScreenInfo { + VkImageView image_view{}; + u32 width{}; + u32 height{}; + bool is_srgb{}; +}; + +class VKBlitScreen { public: explicit VKBlitScreen(Core::Memory::Memory& cpu_memory, - Core::Frontend::EmuWindow& render_window, - VideoCore::RasterizerInterface& rasterizer, const Device& device, - MemoryAllocator& memory_allocator, VKSwapchain& swapchain, + Core::Frontend::EmuWindow& render_window, const Device& device, + MemoryAllocator& memory_manager, VKSwapchain& swapchain, VKScheduler& scheduler, const VKScreenInfo& screen_info); ~VKBlitScreen(); @@ -84,7 +90,6 @@ private: Core::Memory::Memory& cpu_memory; Core::Frontend::EmuWindow& render_window; - VideoCore::RasterizerInterface& rasterizer; const Device& device; MemoryAllocator& memory_allocator; VKSwapchain& swapchain; diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp index d8ad40a0f..48fc5d966 100755 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.cpp @@ -3,188 +3,276 @@ // Refer to the license.txt file included. #include +#include #include -#include +#include +#include -#include "core/core.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" +#include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/vulkan_common/vulkan_device.h" +#include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { +VkBufferCopy MakeBufferCopy(const VideoCommon::BufferCopy& copy) { + return VkBufferCopy{ + .srcOffset = copy.src_offset, + .dstOffset = copy.dst_offset, + .size = copy.size, + }; +} -constexpr VkBufferUsageFlags BUFFER_USAGE = - VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | VK_BUFFER_USAGE_STORAGE_BUFFER_BIT; +VkIndexType IndexTypeFromNumElements(const Device& device, u32 num_elements) { + if (num_elements <= 0xff && device.IsExtIndexTypeUint8Supported()) { + return VK_INDEX_TYPE_UINT8_EXT; + } + if (num_elements <= 0xffff) { + return VK_INDEX_TYPE_UINT16; + } + return VK_INDEX_TYPE_UINT32; +} -constexpr VkPipelineStageFlags UPLOAD_PIPELINE_STAGE = - VK_PIPELINE_STAGE_TRANSFER_BIT | VK_PIPELINE_STAGE_VERTEX_INPUT_BIT | - VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT; - -constexpr VkAccessFlags UPLOAD_ACCESS_BARRIERS = - VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_UNIFORM_READ_BIT | - VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT | VK_ACCESS_INDEX_READ_BIT; - -constexpr VkAccessFlags TRANSFORM_FEEDBACK_WRITE_ACCESS = - VK_ACCESS_TRANSFORM_FEEDBACK_WRITE_BIT_EXT | VK_ACCESS_TRANSFORM_FEEDBACK_COUNTER_WRITE_BIT_EXT; +size_t BytesPerIndex(VkIndexType index_type) { + switch (index_type) { + case VK_INDEX_TYPE_UINT8_EXT: + return 1; + case VK_INDEX_TYPE_UINT16: + return 2; + case VK_INDEX_TYPE_UINT32: + return 4; + default: + UNREACHABLE_MSG("Invalid index type={}", index_type); + return 1; + } +} +template +std::array MakeQuadIndices(u32 quad, u32 first) { + std::array indices{0, 1, 2, 0, 2, 3}; + std::ranges::transform(indices, indices.begin(), + [quad, first](u32 index) { return first + index + quad * 4; }); + return indices; +} } // Anonymous namespace -Buffer::Buffer(const Device& device_, MemoryAllocator& memory_allocator, VKScheduler& scheduler_, - StagingBufferPool& staging_pool_, VAddr cpu_addr_, std::size_t size_) - : BufferBlock{cpu_addr_, size_}, device{device_}, scheduler{scheduler_}, staging_pool{ - staging_pool_} { - buffer = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ +Buffer::Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params) + : VideoCommon::BufferBase(null_params) {} + +Buffer::Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, + VAddr cpu_addr_, u64 size_bytes_) + : VideoCommon::BufferBase(rasterizer_, cpu_addr_, size_bytes_) { + buffer = runtime.device.GetLogical().CreateBuffer(VkBufferCreateInfo{ .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, .pNext = nullptr, .flags = 0, - .size = static_cast(size_), - .usage = BUFFER_USAGE | VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .size = SizeBytes(), + .usage = VK_BUFFER_USAGE_TRANSFER_SRC_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT | + VK_BUFFER_USAGE_UNIFORM_TEXEL_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_TEXEL_BUFFER_BIT | VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT | + VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_INDEX_BUFFER_BIT | + VK_BUFFER_USAGE_VERTEX_BUFFER_BIT, .sharingMode = VK_SHARING_MODE_EXCLUSIVE, .queueFamilyIndexCount = 0, .pQueueFamilyIndices = nullptr, }); - commit = memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); + if (runtime.device.HasDebuggingToolAttached()) { + buffer.SetObjectNameEXT(fmt::format("Buffer 0x{:x}", CpuAddr()).c_str()); + } + commit = runtime.memory_allocator.Commit(buffer, MemoryUsage::DeviceLocal); } -Buffer::~Buffer() = default; +BufferCacheRuntime::BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_allocator_, + VKScheduler& scheduler_, StagingBufferPool& staging_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + VKDescriptorPool& descriptor_pool) + : device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, + staging_pool{staging_pool_}, update_descriptor_queue{update_descriptor_queue_}, + uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), + quad_index_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue) {} -void Buffer::Upload(std::size_t offset, std::size_t data_size, const u8* data) { - const auto& staging = staging_pool.Request(data_size, MemoryUsage::Upload); - std::memcpy(staging.mapped_span.data(), data, data_size); +StagingBufferRef BufferCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_pool.Request(size, MemoryUsage::Upload); +} +StagingBufferRef BufferCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_pool.Request(size, MemoryUsage::Download); +} + +void BufferCacheRuntime::Finish() { + scheduler.Finish(); +} + +void BufferCacheRuntime::CopyBuffer(VkBuffer dst_buffer, VkBuffer src_buffer, + std::span copies) { + static constexpr VkMemoryBarrier READ_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + }; + static constexpr VkMemoryBarrier WRITE_BARRIER{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + // Measuring a popular game, this number never exceeds the specified size once data is warmed up + boost::container::small_vector vk_copies(copies.size()); + std::ranges::transform(copies, vk_copies.begin(), MakeBufferCopy); scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, READ_BARRIER); + cmdbuf.CopyBuffer(src_buffer, dst_buffer, vk_copies); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, WRITE_BARRIER); + }); +} - const VkBuffer handle = Handle(); - scheduler.Record([staging = staging.buffer, handle, offset, data_size, - &device = device](vk::CommandBuffer cmdbuf) { - const VkBufferMemoryBarrier read_barrier{ - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = - VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT | - VK_ACCESS_HOST_WRITE_BIT | - (device.IsExtTransformFeedbackSupported() ? TRANSFORM_FEEDBACK_WRITE_ACCESS : 0), - .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = handle, - .offset = offset, - .size = data_size, +void BufferCacheRuntime::BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, + u32 base_vertex, u32 num_indices, VkBuffer buffer, + u32 offset, [[maybe_unused]] u32 size) { + VkIndexType index_type = MaxwellToVK::IndexFormat(index_format); + if (topology == PrimitiveTopology::Quads) { + index_type = VK_INDEX_TYPE_UINT32; + std::tie(buffer, offset) = + quad_index_pass.Assemble(index_format, num_indices, base_vertex, buffer, offset); + } else if (index_type == VK_INDEX_TYPE_UINT8_EXT && !device.IsExtIndexTypeUint8Supported()) { + index_type = VK_INDEX_TYPE_UINT16; + std::tie(buffer, offset) = uint8_pass.Assemble(num_indices, buffer, offset); + } + scheduler.Record([buffer, offset, index_type](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(buffer, offset, index_type); + }); +} + +void BufferCacheRuntime::BindQuadArrayIndexBuffer(u32 first, u32 count) { + ReserveQuadArrayLUT(first + count, true); + + // The LUT has the indices 0, 1, 2, and 3 copied as an array + // To apply these 'first' offsets we can apply an offset based on the modulus. + const VkIndexType index_type = quad_array_lut_index_type; + const size_t sub_first_offset = static_cast(first % 4) * (current_num_indices / 4); + const size_t offset = (sub_first_offset + first / 4) * 6ULL * BytesPerIndex(index_type); + scheduler.Record([buffer = *quad_array_lut, index_type, offset](vk::CommandBuffer cmdbuf) { + cmdbuf.BindIndexBuffer(buffer, offset, index_type); + }); +} + +void BufferCacheRuntime::BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, + u32 stride) { + if (device.IsExtExtendedDynamicStateSupported()) { + scheduler.Record([index, buffer, offset, size, stride](vk::CommandBuffer cmdbuf) { + const VkDeviceSize vk_offset = offset; + const VkDeviceSize vk_size = buffer != VK_NULL_HANDLE ? size : VK_WHOLE_SIZE; + const VkDeviceSize vk_stride = stride; + cmdbuf.BindVertexBuffers2EXT(index, 1, &buffer, &vk_offset, &vk_size, &vk_stride); + }); + } else { + scheduler.Record([index, buffer, offset](vk::CommandBuffer cmdbuf) { + cmdbuf.BindVertexBuffer(index, buffer, offset); + }); + } +} + +void BufferCacheRuntime::BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, + u32 size) { + if (!device.IsExtTransformFeedbackSupported()) { + // Already logged in the rasterizer + return; + } + scheduler.Record([index, buffer, offset, size](vk::CommandBuffer cmdbuf) { + const VkDeviceSize vk_offset = offset; + const VkDeviceSize vk_size = size; + cmdbuf.BindTransformFeedbackBuffersEXT(index, 1, &buffer, &vk_offset, &vk_size); + }); +} + +void BufferCacheRuntime::BindBuffer(VkBuffer buffer, u32 offset, u32 size) { + update_descriptor_queue.AddBuffer(buffer, offset, size); +} + +void BufferCacheRuntime::ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle) { + if (num_indices <= current_num_indices) { + return; + } + if (wait_for_idle) { + scheduler.Finish(); + } + current_num_indices = num_indices; + quad_array_lut_index_type = IndexTypeFromNumElements(device, num_indices); + + const u32 num_quads = num_indices / 4; + const u32 num_triangle_indices = num_quads * 6; + const u32 num_first_offset_copies = 4; + const size_t bytes_per_index = BytesPerIndex(quad_array_lut_index_type); + const size_t size_bytes = num_triangle_indices * bytes_per_index * num_first_offset_copies; + quad_array_lut = device.GetLogical().CreateBuffer(VkBufferCreateInfo{ + .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, + .pNext = nullptr, + .flags = 0, + .size = size_bytes, + .usage = VK_BUFFER_USAGE_INDEX_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT, + .sharingMode = VK_SHARING_MODE_EXCLUSIVE, + .queueFamilyIndexCount = 0, + .pQueueFamilyIndices = nullptr, + }); + if (device.HasDebuggingToolAttached()) { + quad_array_lut.SetObjectNameEXT("Quad LUT"); + } + quad_array_lut_commit = memory_allocator.Commit(quad_array_lut, MemoryUsage::DeviceLocal); + + const StagingBufferRef staging = staging_pool.Request(size_bytes, MemoryUsage::Upload); + u8* staging_data = staging.mapped_span.data(); + const size_t quad_size = bytes_per_index * 6; + for (u32 first = 0; first < num_first_offset_copies; ++first) { + for (u32 quad = 0; quad < num_quads; ++quad) { + switch (quad_array_lut_index_type) { + case VK_INDEX_TYPE_UINT8_EXT: + std::memcpy(staging_data, MakeQuadIndices(quad, first).data(), quad_size); + break; + case VK_INDEX_TYPE_UINT16: + std::memcpy(staging_data, MakeQuadIndices(quad, first).data(), quad_size); + break; + case VK_INDEX_TYPE_UINT32: + std::memcpy(staging_data, MakeQuadIndices(quad, first).data(), quad_size); + break; + default: + UNREACHABLE(); + break; + } + staging_data += quad_size; + } + } + scheduler.RequestOutsideRenderPassOperationContext(); + scheduler.Record([src_buffer = staging.buffer, dst_buffer = *quad_array_lut, + size_bytes](vk::CommandBuffer cmdbuf) { + const VkBufferCopy copy{ + .srcOffset = 0, + .dstOffset = 0, + .size = size_bytes, }; const VkBufferMemoryBarrier write_barrier{ .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, .pNext = nullptr, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = UPLOAD_ACCESS_BARRIERS, + .dstAccessMask = VK_ACCESS_INDEX_READ_BIT, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = handle, - .offset = offset, - .size = data_size, + .buffer = dst_buffer, + .offset = 0, + .size = size_bytes, }; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, - 0, read_barrier); - cmdbuf.CopyBuffer(staging, handle, VkBufferCopy{0, offset, data_size}); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, - write_barrier); + cmdbuf.CopyBuffer(src_buffer, dst_buffer, copy); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, + 0, write_barrier); }); } -void Buffer::Download(std::size_t offset, std::size_t data_size, u8* data) { - auto staging = staging_pool.Request(data_size, MemoryUsage::Download); - scheduler.RequestOutsideRenderPassOperationContext(); - - const VkBuffer handle = Handle(); - scheduler.Record( - [staging = staging.buffer, handle, offset, data_size](vk::CommandBuffer cmdbuf) { - const VkBufferMemoryBarrier barrier{ - .sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER, - .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, - .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, - .buffer = handle, - .offset = offset, - .size = data_size, - }; - - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_VERTEX_SHADER_BIT | - VK_PIPELINE_STAGE_FRAGMENT_SHADER_BIT | - VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_TRANSFER_BIT, 0, {}, barrier, {}); - cmdbuf.CopyBuffer(handle, staging, VkBufferCopy{offset, 0, data_size}); - }); - scheduler.Finish(); - - std::memcpy(data, staging.mapped_span.data(), data_size); -} - -void Buffer::CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t copy_size) { - scheduler.RequestOutsideRenderPassOperationContext(); - - const VkBuffer dst_buffer = Handle(); - scheduler.Record([src_buffer = src.Handle(), dst_buffer, src_offset, dst_offset, - copy_size](vk::CommandBuffer cmdbuf) { - cmdbuf.CopyBuffer(src_buffer, dst_buffer, VkBufferCopy{src_offset, dst_offset, copy_size}); - - std::array barriers; - barriers[0].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[0].pNext = nullptr; - barriers[0].srcAccessMask = VK_ACCESS_TRANSFER_READ_BIT; - barriers[0].dstAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barriers[0].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[0].buffer = src_buffer; - barriers[0].offset = src_offset; - barriers[0].size = copy_size; - barriers[1].sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barriers[1].pNext = nullptr; - barriers[1].srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT; - barriers[1].dstAccessMask = UPLOAD_ACCESS_BARRIERS; - barriers[1].srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[1].dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barriers[1].buffer = dst_buffer; - barriers[1].offset = dst_offset; - barriers[1].size = copy_size; - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, UPLOAD_PIPELINE_STAGE, 0, {}, - barriers, {}); - }); -} - -VKBufferCache::VKBufferCache(VideoCore::RasterizerInterface& rasterizer_, - Tegra::MemoryManager& gpu_memory_, Core::Memory::Memory& cpu_memory_, - const Device& device_, MemoryAllocator& memory_allocator_, - VKScheduler& scheduler_, VKStreamBuffer& stream_buffer_, - StagingBufferPool& staging_pool_) - : VideoCommon::BufferCache{rasterizer_, gpu_memory_, - cpu_memory_, stream_buffer_}, - device{device_}, memory_allocator{memory_allocator_}, scheduler{scheduler_}, - staging_pool{staging_pool_} {} - -VKBufferCache::~VKBufferCache() = default; - -std::shared_ptr VKBufferCache::CreateBlock(VAddr cpu_addr, std::size_t size) { - return std::make_shared(device, memory_allocator, scheduler, staging_pool, cpu_addr, - size); -} - -VKBufferCache::BufferInfo VKBufferCache::GetEmptyBuffer(std::size_t size) { - size = std::max(size, std::size_t(4)); - const auto& empty = staging_pool.Request(size, MemoryUsage::DeviceLocal); - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([size, buffer = empty.buffer](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, size, 0); - }); - return {empty.buffer, 0, 0}; -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_buffer_cache.h b/src/video_core/renderer_vulkan/vk_buffer_cache.h index 41d577510..d232e1f2d 100755 --- a/src/video_core/renderer_vulkan/vk_buffer_cache.h +++ b/src/video_core/renderer_vulkan/vk_buffer_cache.h @@ -4,69 +4,112 @@ #pragma once -#include - -#include "common/common_types.h" #include "video_core/buffer_cache/buffer_cache.h" +#include "video_core/engines/maxwell_3d.h" +#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { class Device; +class VKDescriptorPool; class VKScheduler; +class VKUpdateDescriptorQueue; -class Buffer final : public VideoCommon::BufferBlock { +class BufferCacheRuntime; + +class Buffer : public VideoCommon::BufferBase { public: - explicit Buffer(const Device& device, MemoryAllocator& memory_allocator, VKScheduler& scheduler, - StagingBufferPool& staging_pool, VAddr cpu_addr_, std::size_t size_); - ~Buffer(); + explicit Buffer(BufferCacheRuntime&, VideoCommon::NullBufferParams null_params); + explicit Buffer(BufferCacheRuntime& runtime, VideoCore::RasterizerInterface& rasterizer_, + VAddr cpu_addr_, u64 size_bytes_); - void Upload(std::size_t offset, std::size_t data_size, const u8* data); - - void Download(std::size_t offset, std::size_t data_size, u8* data); - - void CopyFrom(const Buffer& src, std::size_t src_offset, std::size_t dst_offset, - std::size_t copy_size); - - VkBuffer Handle() const { + [[nodiscard]] VkBuffer Handle() const noexcept { return *buffer; } - u64 Address() const { - return 0; + operator VkBuffer() const noexcept { + return *buffer; } private: - const Device& device; - VKScheduler& scheduler; - StagingBufferPool& staging_pool; - vk::Buffer buffer; MemoryCommit commit; }; -class VKBufferCache final : public VideoCommon::BufferCache { +class BufferCacheRuntime { + friend Buffer; + + using PrimitiveTopology = Tegra::Engines::Maxwell3D::Regs::PrimitiveTopology; + using IndexFormat = Tegra::Engines::Maxwell3D::Regs::IndexFormat; + public: - explicit VKBufferCache(VideoCore::RasterizerInterface& rasterizer, - Tegra::MemoryManager& gpu_memory, Core::Memory::Memory& cpu_memory, - const Device& device, MemoryAllocator& memory_allocator, - VKScheduler& scheduler, VKStreamBuffer& stream_buffer, - StagingBufferPool& staging_pool); - ~VKBufferCache(); + explicit BufferCacheRuntime(const Device& device_, MemoryAllocator& memory_manager_, + VKScheduler& scheduler_, StagingBufferPool& staging_pool_, + VKUpdateDescriptorQueue& update_descriptor_queue_, + VKDescriptorPool& descriptor_pool); - BufferInfo GetEmptyBuffer(std::size_t size) override; + void Finish(); -protected: - std::shared_ptr CreateBlock(VAddr cpu_addr, std::size_t size) override; + [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); + + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); + + void CopyBuffer(VkBuffer src_buffer, VkBuffer dst_buffer, + std::span copies); + + void BindIndexBuffer(PrimitiveTopology topology, IndexFormat index_format, u32 num_indices, + u32 base_vertex, VkBuffer buffer, u32 offset, u32 size); + + void BindQuadArrayIndexBuffer(u32 first, u32 count); + + void BindVertexBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size, u32 stride); + + void BindTransformFeedbackBuffer(u32 index, VkBuffer buffer, u32 offset, u32 size); + + void BindUniformBuffer(VkBuffer buffer, u32 offset, u32 size) { + BindBuffer(buffer, offset, size); + } + + void BindStorageBuffer(VkBuffer buffer, u32 offset, u32 size, + [[maybe_unused]] bool is_written) { + BindBuffer(buffer, offset, size); + } private: + void BindBuffer(VkBuffer buffer, u32 offset, u32 size); + + void ReserveQuadArrayLUT(u32 num_indices, bool wait_for_idle); + const Device& device; MemoryAllocator& memory_allocator; VKScheduler& scheduler; StagingBufferPool& staging_pool; + VKUpdateDescriptorQueue& update_descriptor_queue; + + vk::Buffer quad_array_lut; + MemoryCommit quad_array_lut_commit; + VkIndexType quad_array_lut_index_type{}; + u32 current_num_indices = 0; + + Uint8Pass uint8_pass; + QuadIndexedPass quad_index_pass; }; +struct BufferCacheParams { + using Runtime = Vulkan::BufferCacheRuntime; + using Buffer = Vulkan::Buffer; + + static constexpr bool IS_OPENGL = false; + static constexpr bool HAS_PERSISTENT_UNIFORM_BUFFER_BINDINGS = false; + static constexpr bool HAS_FULL_INDEX_AND_PRIMITIVE_SUPPORT = false; + static constexpr bool NEEDS_BIND_UNIFORM_INDEX = false; + static constexpr bool NEEDS_BIND_STORAGE_INDEX = false; + static constexpr bool USE_MEMORY_MAPS = true; +}; + +using BufferCache = VideoCommon::BufferCache; + } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.cpp b/src/video_core/renderer_vulkan/vk_compute_pass.cpp index 5eb6a54be..979a0bc9b 100755 --- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp +++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp @@ -10,7 +10,6 @@ #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" -#include "video_core/host_shaders/vulkan_quad_array_comp_spv.h" #include "video_core/host_shaders/vulkan_quad_indexed_comp_spv.h" #include "video_core/host_shaders/vulkan_uint8_comp_spv.h" #include "video_core/renderer_vulkan/vk_compute_pass.h" @@ -22,19 +21,7 @@ #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { - -VkDescriptorSetLayoutBinding BuildQuadArrayPassDescriptorSetLayoutBinding() { - return { - .binding = 0, - .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER, - .descriptorCount = 1, - .stageFlags = VK_SHADER_STAGE_COMPUTE_BIT, - .pImmutableSamplers = nullptr, - }; -} - VkDescriptorUpdateTemplateEntryKHR BuildQuadArrayPassDescriptorUpdateTemplateEntry() { return { .dstBinding = 0, @@ -162,55 +149,6 @@ VkDescriptorSet VKComputePass::CommitDescriptorSet( return set; } -QuadArrayPass::QuadArrayPass(const Device& device_, VKScheduler& scheduler_, - VKDescriptorPool& descriptor_pool_, - StagingBufferPool& staging_buffer_pool_, - VKUpdateDescriptorQueue& update_descriptor_queue_) - : VKComputePass(device_, descriptor_pool_, BuildQuadArrayPassDescriptorSetLayoutBinding(), - BuildQuadArrayPassDescriptorUpdateTemplateEntry(), - BuildComputePushConstantRange(sizeof(u32)), VULKAN_QUAD_ARRAY_COMP_SPV), - scheduler{scheduler_}, staging_buffer_pool{staging_buffer_pool_}, - update_descriptor_queue{update_descriptor_queue_} {} - -QuadArrayPass::~QuadArrayPass() = default; - -std::pair QuadArrayPass::Assemble(u32 num_vertices, u32 first) { - const u32 num_triangle_vertices = (num_vertices / 4) * 6; - const std::size_t staging_size = num_triangle_vertices * sizeof(u32); - const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); - - update_descriptor_queue.Acquire(); - update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size); - const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); - - scheduler.RequestOutsideRenderPassOperationContext(); - - ASSERT(num_vertices % 4 == 0); - const u32 num_quads = num_vertices / 4; - scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, - num_quads, first, set](vk::CommandBuffer cmdbuf) { - constexpr u32 dispatch_size = 1024; - cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); - cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, layout, 0, set, {}); - cmdbuf.PushConstants(layout, VK_SHADER_STAGE_COMPUTE_BIT, 0, sizeof(first), &first); - cmdbuf.Dispatch(Common::AlignUp(num_quads, dispatch_size) / dispatch_size, 1, 1); - - VkBufferMemoryBarrier barrier; - barrier.sType = VK_STRUCTURE_TYPE_BUFFER_MEMORY_BARRIER; - barrier.pNext = nullptr; - barrier.srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT; - barrier.dstAccessMask = VK_ACCESS_VERTEX_ATTRIBUTE_READ_BIT; - barrier.srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED; - barrier.buffer = buffer; - barrier.offset = 0; - barrier.size = static_cast(num_quads) * 6 * sizeof(u32); - cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, - VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, {barrier}, {}); - }); - return {staging_ref.buffer, 0}; -} - Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_, VKDescriptorPool& descriptor_pool, StagingBufferPool& staging_buffer_pool_, VKUpdateDescriptorQueue& update_descriptor_queue_) @@ -221,18 +159,18 @@ Uint8Pass::Uint8Pass(const Device& device, VKScheduler& scheduler_, Uint8Pass::~Uint8Pass() = default; -std::pair Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, - u64 src_offset) { +std::pair Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buffer, + u32 src_offset) { const u32 staging_size = static_cast(num_vertices * sizeof(u16)); - const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); + const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, num_vertices); - update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size); + update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set, + scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, num_vertices](vk::CommandBuffer cmdbuf) { constexpr u32 dispatch_size = 1024; cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, pipeline); @@ -252,7 +190,7 @@ std::pair Uint8Pass::Assemble(u32 num_vertices, VkBuffer src_buff cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); }); - return {staging_ref.buffer, 0}; + return {staging.buffer, 0}; } QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, @@ -267,9 +205,9 @@ QuadIndexedPass::QuadIndexedPass(const Device& device_, VKScheduler& scheduler_, QuadIndexedPass::~QuadIndexedPass() = default; -std::pair QuadIndexedPass::Assemble( +std::pair QuadIndexedPass::Assemble( Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, - VkBuffer src_buffer, u64 src_offset) { + VkBuffer src_buffer, u32 src_offset) { const u32 index_shift = [index_format] { switch (index_format) { case Tegra::Engines::Maxwell3D::Regs::IndexFormat::UnsignedByte: @@ -286,15 +224,15 @@ std::pair QuadIndexedPass::Assemble( const u32 num_tri_vertices = (num_vertices / 4) * 6; const std::size_t staging_size = num_tri_vertices * sizeof(u32); - const auto staging_ref = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); + const auto staging = staging_buffer_pool.Request(staging_size, MemoryUsage::DeviceLocal); update_descriptor_queue.Acquire(); update_descriptor_queue.AddBuffer(src_buffer, src_offset, input_size); - update_descriptor_queue.AddBuffer(staging_ref.buffer, 0, staging_size); + update_descriptor_queue.AddBuffer(staging.buffer, 0, staging_size); const VkDescriptorSet set = CommitDescriptorSet(update_descriptor_queue); scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging_ref.buffer, set, + scheduler.Record([layout = *layout, pipeline = *pipeline, buffer = staging.buffer, set, num_tri_vertices, base_vertex, index_shift](vk::CommandBuffer cmdbuf) { static constexpr u32 dispatch_size = 1024; const std::array push_constants = {base_vertex, index_shift}; @@ -317,7 +255,7 @@ std::pair QuadIndexedPass::Assemble( cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_COMPUTE_SHADER_BIT, VK_PIPELINE_STAGE_VERTEX_INPUT_BIT, 0, {}, barrier, {}); }); - return {staging_ref.buffer, 0}; + return {staging.buffer, 0}; } } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_compute_pass.h b/src/video_core/renderer_vulkan/vk_compute_pass.h index f5c6f5f17..4904019f5 100755 --- a/src/video_core/renderer_vulkan/vk_compute_pass.h +++ b/src/video_core/renderer_vulkan/vk_compute_pass.h @@ -41,22 +41,6 @@ private: vk::ShaderModule module; }; -class QuadArrayPass final : public VKComputePass { -public: - explicit QuadArrayPass(const Device& device_, VKScheduler& scheduler_, - VKDescriptorPool& descriptor_pool_, - StagingBufferPool& staging_buffer_pool_, - VKUpdateDescriptorQueue& update_descriptor_queue_); - ~QuadArrayPass(); - - std::pair Assemble(u32 num_vertices, u32 first); - -private: - VKScheduler& scheduler; - StagingBufferPool& staging_buffer_pool; - VKUpdateDescriptorQueue& update_descriptor_queue; -}; - class Uint8Pass final : public VKComputePass { public: explicit Uint8Pass(const Device& device_, VKScheduler& scheduler_, @@ -64,7 +48,9 @@ public: VKUpdateDescriptorQueue& update_descriptor_queue_); ~Uint8Pass(); - std::pair Assemble(u32 num_vertices, VkBuffer src_buffer, u64 src_offset); + /// Assemble uint8 indices into an uint16 index buffer + /// Returns a pair with the staging buffer, and the offset where the assembled data is + std::pair Assemble(u32 num_vertices, VkBuffer src_buffer, u32 src_offset); private: VKScheduler& scheduler; @@ -80,9 +66,9 @@ public: VKUpdateDescriptorQueue& update_descriptor_queue_); ~QuadIndexedPass(); - std::pair Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, + std::pair Assemble(Tegra::Engines::Maxwell3D::Regs::IndexFormat index_format, u32 num_vertices, u32 base_vertex, VkBuffer src_buffer, - u64 src_offset); + u32 src_offset); private: VKScheduler& scheduler; diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.cpp b/src/video_core/renderer_vulkan/vk_fence_manager.cpp index 6cd00884d..3bec48d14 100755 --- a/src/video_core/renderer_vulkan/vk_fence_manager.cpp +++ b/src/video_core/renderer_vulkan/vk_fence_manager.cpp @@ -45,8 +45,8 @@ void InnerFence::Wait() { } VKFenceManager::VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, - Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, - VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, + TextureCache& texture_cache_, BufferCache& buffer_cache_, + VKQueryCache& query_cache_, const Device& device_, VKScheduler& scheduler_) : GenericFenceManager{rasterizer_, gpu_, texture_cache_, buffer_cache_, query_cache_}, scheduler{scheduler_} {} diff --git a/src/video_core/renderer_vulkan/vk_fence_manager.h b/src/video_core/renderer_vulkan/vk_fence_manager.h index 9c5e5aa8f..2f8322d29 100755 --- a/src/video_core/renderer_vulkan/vk_fence_manager.h +++ b/src/video_core/renderer_vulkan/vk_fence_manager.h @@ -22,7 +22,6 @@ class RasterizerInterface; namespace Vulkan { class Device; -class VKBufferCache; class VKQueryCache; class VKScheduler; @@ -45,14 +44,14 @@ private: using Fence = std::shared_ptr; using GenericFenceManager = - VideoCommon::FenceManager; + VideoCommon::FenceManager; class VKFenceManager final : public GenericFenceManager { public: - explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer_, Tegra::GPU& gpu_, - Tegra::MemoryManager& memory_manager_, TextureCache& texture_cache_, - VKBufferCache& buffer_cache_, VKQueryCache& query_cache_, - VKScheduler& scheduler_); + explicit VKFenceManager(VideoCore::RasterizerInterface& rasterizer, Tegra::GPU& gpu, + TextureCache& texture_cache, BufferCache& buffer_cache, + VKQueryCache& query_cache, const Device& device, + VKScheduler& scheduler); protected: Fence CreateFence(u32 value, bool is_stubbed) override; diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.cpp b/src/video_core/renderer_vulkan/vk_rasterizer.cpp index f0a111829..f1ccfe7b0 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp +++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp @@ -8,8 +8,6 @@ #include #include -#include - #include "common/alignment.h" #include "common/assert.h" #include "common/logging/log.h" @@ -24,7 +22,6 @@ #include "video_core/renderer_vulkan/maxwell_to_vk.h" #include "video_core/renderer_vulkan/renderer_vulkan.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_compute_pipeline.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_graphics_pipeline.h" @@ -50,15 +47,16 @@ MICROPROFILE_DEFINE(Vulkan_WaitForWorker, "Vulkan", "Wait for worker", MP_RGB(25 MICROPROFILE_DEFINE(Vulkan_Drawing, "Vulkan", "Record drawing", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_Compute, "Vulkan", "Record compute", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_Clearing, "Vulkan", "Record clearing", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_Geometry, "Vulkan", "Setup geometry", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_ConstBuffers, "Vulkan", "Setup constant buffers", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_GlobalBuffers, "Vulkan", "Setup global buffers", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_RenderTargets, "Vulkan", "Setup render targets", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_Textures, "Vulkan", "Setup textures", MP_RGB(192, 128, 128)); -MICROPROFILE_DEFINE(Vulkan_Images, "Vulkan", "Setup images", MP_RGB(192, 128, 128)); MICROPROFILE_DEFINE(Vulkan_PipelineCache, "Vulkan", "Pipeline cache", MP_RGB(192, 128, 128)); namespace { +struct DrawParams { + u32 base_instance; + u32 num_instances; + u32 base_vertex; + u32 num_vertices; + bool is_indexed; +}; constexpr auto COMPUTE_SHADER_INDEX = static_cast(Tegra::Engines::ShaderType::Compute); @@ -67,7 +65,6 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in const float width = src.scale_x * 2.0f; const float height = src.scale_y * 2.0f; const float reduce_z = regs.depth_mode == Maxwell::DepthMode::MinusOneToOne ? 1.0f : 0.0f; - VkViewport viewport{ .x = src.translate_x - src.scale_x, .y = src.translate_y - src.scale_y, @@ -76,12 +73,10 @@ VkViewport GetViewportState(const Device& device, const Maxwell& regs, size_t in .minDepth = src.translate_z - src.scale_z * reduce_z, .maxDepth = src.translate_z + src.scale_z, }; - if (!device.IsExtDepthRangeUnrestrictedSupported()) { viewport.minDepth = std::clamp(viewport.minDepth, 0.0f, 1.0f); viewport.maxDepth = std::clamp(viewport.maxDepth, 0.0f, 1.0f); } - return viewport; } @@ -146,13 +141,6 @@ TextureHandle GetTextureInfo(const Engine& engine, bool via_header_index, const return TextureHandle(engine.AccessConstBuffer32(shader_type, buffer, offset), via_header_index); } -template -std::array ExpandStrides(const std::array& strides) { - std::array expanded; - std::copy(strides.begin(), strides.end(), expanded.begin()); - return expanded; -} - ImageViewType ImageViewTypeFromEntry(const SamplerEntry& entry) { if (entry.is_buffer) { return ImageViewType::e2D; @@ -221,190 +209,25 @@ void PushImageDescriptors(const ShaderEntries& entries, TextureCache& texture_ca } } -} // Anonymous namespace - -class BufferBindings final { -public: - void AddVertexBinding(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size, u32 stride) { - vertex.buffers[vertex.num_buffers] = buffer; - vertex.offsets[vertex.num_buffers] = offset; - vertex.sizes[vertex.num_buffers] = size; - vertex.strides[vertex.num_buffers] = static_cast(stride); - ++vertex.num_buffers; - } - - void SetIndexBinding(VkBuffer buffer, VkDeviceSize offset, VkIndexType type) { - index.buffer = buffer; - index.offset = offset; - index.type = type; - } - - void Bind(const Device& device, VKScheduler& scheduler) const { - // Use this large switch case to avoid dispatching more memory in the record lambda than - // what we need. It looks horrible, but it's the best we can do on standard C++. - switch (vertex.num_buffers) { - case 0: - return BindStatic<0>(device, scheduler); - case 1: - return BindStatic<1>(device, scheduler); - case 2: - return BindStatic<2>(device, scheduler); - case 3: - return BindStatic<3>(device, scheduler); - case 4: - return BindStatic<4>(device, scheduler); - case 5: - return BindStatic<5>(device, scheduler); - case 6: - return BindStatic<6>(device, scheduler); - case 7: - return BindStatic<7>(device, scheduler); - case 8: - return BindStatic<8>(device, scheduler); - case 9: - return BindStatic<9>(device, scheduler); - case 10: - return BindStatic<10>(device, scheduler); - case 11: - return BindStatic<11>(device, scheduler); - case 12: - return BindStatic<12>(device, scheduler); - case 13: - return BindStatic<13>(device, scheduler); - case 14: - return BindStatic<14>(device, scheduler); - case 15: - return BindStatic<15>(device, scheduler); - case 16: - return BindStatic<16>(device, scheduler); - case 17: - return BindStatic<17>(device, scheduler); - case 18: - return BindStatic<18>(device, scheduler); - case 19: - return BindStatic<19>(device, scheduler); - case 20: - return BindStatic<20>(device, scheduler); - case 21: - return BindStatic<21>(device, scheduler); - case 22: - return BindStatic<22>(device, scheduler); - case 23: - return BindStatic<23>(device, scheduler); - case 24: - return BindStatic<24>(device, scheduler); - case 25: - return BindStatic<25>(device, scheduler); - case 26: - return BindStatic<26>(device, scheduler); - case 27: - return BindStatic<27>(device, scheduler); - case 28: - return BindStatic<28>(device, scheduler); - case 29: - return BindStatic<29>(device, scheduler); - case 30: - return BindStatic<30>(device, scheduler); - case 31: - return BindStatic<31>(device, scheduler); - case 32: - return BindStatic<32>(device, scheduler); - } - UNREACHABLE(); - } - -private: - // Some of these fields are intentionally left uninitialized to avoid initializing them twice. - struct { - size_t num_buffers = 0; - std::array buffers; - std::array offsets; - std::array sizes; - std::array strides; - } vertex; - - struct { - VkBuffer buffer = nullptr; - VkDeviceSize offset; - VkIndexType type; - } index; - - template - void BindStatic(const Device& device, VKScheduler& scheduler) const { - if (device.IsExtExtendedDynamicStateSupported()) { - if (index.buffer) { - BindStatic(scheduler); - } else { - BindStatic(scheduler); - } - } else { - if (index.buffer) { - BindStatic(scheduler); - } else { - BindStatic(scheduler); - } - } - } - - template - void BindStatic(VKScheduler& scheduler) const { - static_assert(N <= Maxwell::NumVertexArrays); - if constexpr (N == 0) { - return; - } - - std::array buffers; - std::array offsets; - std::copy(vertex.buffers.begin(), vertex.buffers.begin() + N, buffers.begin()); - std::copy(vertex.offsets.begin(), vertex.offsets.begin() + N, offsets.begin()); - - if constexpr (has_extended_dynamic_state) { - // With extended dynamic states we can specify the length and stride of a vertex buffer - std::array sizes; - std::array strides; - std::copy(vertex.sizes.begin(), vertex.sizes.begin() + N, sizes.begin()); - std::copy(vertex.strides.begin(), vertex.strides.begin() + N, strides.begin()); - - if constexpr (is_indexed) { - scheduler.Record( - [buffers, offsets, sizes, strides, index = index](vk::CommandBuffer cmdbuf) { - cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); - cmdbuf.BindVertexBuffers2EXT(0, static_cast(N), buffers.data(), - offsets.data(), sizes.data(), - ExpandStrides(strides).data()); - }); - } else { - scheduler.Record([buffers, offsets, sizes, strides](vk::CommandBuffer cmdbuf) { - cmdbuf.BindVertexBuffers2EXT(0, static_cast(N), buffers.data(), - offsets.data(), sizes.data(), - ExpandStrides(strides).data()); - }); - } - return; - } - - if constexpr (is_indexed) { - // Indexed draw - scheduler.Record([buffers, offsets, index = index](vk::CommandBuffer cmdbuf) { - cmdbuf.BindIndexBuffer(index.buffer, index.offset, index.type); - cmdbuf.BindVertexBuffers(0, static_cast(N), buffers.data(), offsets.data()); - }); - } else { - // Array draw - scheduler.Record([buffers, offsets](vk::CommandBuffer cmdbuf) { - cmdbuf.BindVertexBuffers(0, static_cast(N), buffers.data(), offsets.data()); - }); - } - } -}; - -void RasterizerVulkan::DrawParameters::Draw(vk::CommandBuffer cmdbuf) const { - if (is_indexed) { - cmdbuf.DrawIndexed(num_vertices, num_instances, 0, base_vertex, base_instance); - } else { - cmdbuf.Draw(num_vertices, num_instances, base_vertex, base_instance); +DrawParams MakeDrawParams(const Maxwell& regs, u32 num_instances, bool is_instanced, + bool is_indexed) { + DrawParams params{ + .base_instance = regs.vb_base_instance, + .num_instances = is_instanced ? num_instances : 1, + .base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first, + .num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count, + .is_indexed = is_indexed, + }; + if (regs.draw.topology == Maxwell::PrimitiveTopology::Quads) { + // 6 triangle vertices per quad, base vertex is part of the index + // See BindQuadArrayIndexBuffer for more details + params.num_vertices = (params.num_vertices / 4) * 6; + params.base_vertex = 0; + params.is_indexed = true; } + return params; } +} // Anonymous namespace RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra::GPU& gpu_, Tegra::MemoryManager& gpu_memory_, @@ -414,21 +237,19 @@ RasterizerVulkan::RasterizerVulkan(Core::Frontend::EmuWindow& emu_window_, Tegra : RasterizerAccelerated{cpu_memory_}, gpu{gpu_}, gpu_memory{gpu_memory_}, maxwell3d{gpu.Maxwell3D()}, kepler_compute{gpu.KeplerCompute()}, screen_info{screen_info_}, device{device_}, memory_allocator{memory_allocator_}, - state_tracker{state_tracker_}, scheduler{scheduler_}, stream_buffer(device, scheduler), + state_tracker{state_tracker_}, scheduler{scheduler_}, staging_pool(device, memory_allocator, scheduler), descriptor_pool(device, scheduler), update_descriptor_queue(device, scheduler), blit_image(device, scheduler, state_tracker, descriptor_pool), - quad_array_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - quad_indexed_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), - uint8_pass(device, scheduler, descriptor_pool, staging_pool, update_descriptor_queue), texture_cache_runtime{device, scheduler, memory_allocator, staging_pool, blit_image}, texture_cache(texture_cache_runtime, *this, maxwell3d, kepler_compute, gpu_memory), + buffer_cache_runtime(device, memory_allocator, scheduler, staging_pool, + update_descriptor_queue, descriptor_pool), + buffer_cache(*this, maxwell3d, kepler_compute, gpu_memory, cpu_memory_, buffer_cache_runtime), pipeline_cache(*this, gpu, maxwell3d, kepler_compute, gpu_memory, device, scheduler, descriptor_pool, update_descriptor_queue), - buffer_cache(*this, gpu_memory, cpu_memory_, device, memory_allocator, scheduler, - stream_buffer, staging_pool), query_cache{*this, maxwell3d, gpu_memory, device, scheduler}, - fence_manager(*this, gpu, gpu_memory, texture_cache, buffer_cache, query_cache, scheduler), + fence_manager(*this, gpu, texture_cache, buffer_cache, query_cache, device, scheduler), wfi_event(device.GetLogical().CreateEvent()), async_shaders(emu_window_) { scheduler.SetQueryCache(query_cache); if (device.UseAsynchronousShaders()) { @@ -449,22 +270,14 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { GraphicsPipelineCacheKey key; key.fixed_state.Fill(maxwell3d.regs, device.IsExtExtendedDynamicStateSupported()); - buffer_cache.Map(CalculateGraphicsStreamBufferSize(is_indexed)); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; - BufferBindings buffer_bindings; - const DrawParameters draw_params = - SetupGeometry(key.fixed_state, buffer_bindings, is_indexed, is_instanced); - - auto lock = texture_cache.AcquireLock(); texture_cache.SynchronizeGraphicsDescriptors(); - texture_cache.UpdateRenderTargets(false); const auto shaders = pipeline_cache.GetShaders(); key.shaders = GetShaderAddresses(shaders); - SetupShaderDescriptors(shaders); - - buffer_cache.Unmap(); + SetupShaderDescriptors(shaders, is_indexed); const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); key.renderpass = framebuffer->RenderPass(); @@ -476,22 +289,29 @@ void RasterizerVulkan::Draw(bool is_indexed, bool is_instanced) { return; } - buffer_bindings.Bind(device, scheduler); - BeginTransformFeedback(); scheduler.RequestRenderpass(framebuffer); scheduler.BindGraphicsPipeline(pipeline->GetHandle()); UpdateDynamicStates(); - const auto pipeline_layout = pipeline->GetLayout(); - const auto descriptor_set = pipeline->CommitDescriptorSet(); + const auto& regs = maxwell3d.regs; + const u32 num_instances = maxwell3d.mme_draw.instance_count; + const DrawParams draw_params = MakeDrawParams(regs, num_instances, is_instanced, is_indexed); + const VkPipelineLayout pipeline_layout = pipeline->GetLayout(); + const VkDescriptorSet descriptor_set = pipeline->CommitDescriptorSet(); scheduler.Record([pipeline_layout, descriptor_set, draw_params](vk::CommandBuffer cmdbuf) { if (descriptor_set) { cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_GRAPHICS, pipeline_layout, - DESCRIPTOR_SET, descriptor_set, {}); + DESCRIPTOR_SET, descriptor_set, nullptr); + } + if (draw_params.is_indexed) { + cmdbuf.DrawIndexed(draw_params.num_vertices, draw_params.num_instances, 0, + draw_params.base_vertex, draw_params.base_instance); + } else { + cmdbuf.Draw(draw_params.num_vertices, draw_params.num_instances, + draw_params.base_vertex, draw_params.base_instance); } - draw_params.Draw(cmdbuf); }); EndTransformFeedback(); @@ -515,7 +335,7 @@ void RasterizerVulkan::Clear() { return; } - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.UpdateRenderTargets(true); const Framebuffer* const framebuffer = texture_cache.GetFramebuffer(); const VkExtent2D render_area = framebuffer->RenderArea(); @@ -559,7 +379,6 @@ void RasterizerVulkan::Clear() { if (use_stencil) { aspect_flags |= VK_IMAGE_ASPECT_STENCIL_BIT; } - scheduler.Record([clear_depth = regs.clear_depth, clear_stencil = regs.clear_stencil, clear_rect, aspect_flags](vk::CommandBuffer cmdbuf) { VkClearAttachment attachment; @@ -580,12 +399,11 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { auto& pipeline = pipeline_cache.GetComputePipeline({ .shader = code_addr, .shared_memory_size = launch_desc.shared_alloc, - .workgroup_size = - { - launch_desc.block_dim_x, - launch_desc.block_dim_y, - launch_desc.block_dim_z, - }, + .workgroup_size{ + launch_desc.block_dim_x, + launch_desc.block_dim_y, + launch_desc.block_dim_z, + }, }); // Compute dispatches can't be executed inside a renderpass @@ -594,10 +412,21 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { image_view_indices.clear(); sampler_handles.clear(); - auto lock = texture_cache.AcquireLock(); - texture_cache.SynchronizeComputeDescriptors(); + std::scoped_lock lock{buffer_cache.mutex, texture_cache.mutex}; const auto& entries = pipeline.GetEntries(); + buffer_cache.SetEnabledComputeUniformBuffers(entries.enabled_uniform_buffers); + buffer_cache.UnbindComputeStorageBuffers(); + u32 ssbo_index = 0; + for (const auto& buffer : entries.global_buffers) { + buffer_cache.BindComputeStorageBuffer(ssbo_index, buffer.cbuf_index, buffer.cbuf_offset, + buffer.is_written); + ++ssbo_index; + } + buffer_cache.UpdateComputeBuffers(); + + texture_cache.SynchronizeComputeDescriptors(); + SetupComputeUniformTexels(entries); SetupComputeTextures(entries); SetupComputeStorageTexels(entries); @@ -606,20 +435,15 @@ void RasterizerVulkan::DispatchCompute(GPUVAddr code_addr) { const std::span indices_span(image_view_indices.data(), image_view_indices.size()); texture_cache.FillComputeImageViews(indices_span, image_view_ids); - buffer_cache.Map(CalculateComputeStreamBufferSize()); - update_descriptor_queue.Acquire(); - SetupComputeConstBuffers(entries); - SetupComputeGlobalBuffers(entries); + buffer_cache.BindHostComputeBuffers(); ImageViewId* image_view_id_ptr = image_view_ids.data(); VkSampler* sampler_ptr = sampler_handles.data(); PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr, sampler_ptr); - buffer_cache.Unmap(); - const VkPipeline pipeline_handle = pipeline.GetHandle(); const VkPipelineLayout pipeline_layout = pipeline.GetLayout(); const VkDescriptorSet descriptor_set = pipeline.CommitDescriptorSet(); @@ -644,6 +468,11 @@ void RasterizerVulkan::Query(GPUVAddr gpu_addr, VideoCore::QueryType type, query_cache.Query(gpu_addr, type, timestamp); } +void RasterizerVulkan::BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, + u32 size) { + buffer_cache.BindGraphicsUniformBuffer(stage, index, gpu_addr, size); +} + void RasterizerVulkan::FlushAll() {} void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { @@ -651,19 +480,43 @@ void RasterizerVulkan::FlushRegion(VAddr addr, u64 size) { return; } { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.DownloadMemory(addr, size); } - buffer_cache.FlushRegion(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.DownloadMemory(addr, size); + } query_cache.FlushRegion(addr, size); } +void Vulkan::RasterizerVulkan::InvalidateExceptTextureCache(VAddr addr, u64 size) { + if (addr == 0 || size == 0) { + return; + } + pipeline_cache.InvalidateRegion(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } + query_cache.InvalidateRegion(addr, size); +} + +void Vulkan::RasterizerVulkan::InvalidateTextureCache(VAddr addr, u64 size) { + if (addr == 0 || size == 0) { + return; + } + std::scoped_lock lock{texture_cache.mutex}; + texture_cache.UnmapMemory(addr, size); +} + bool RasterizerVulkan::MustFlushRegion(VAddr addr, u64 size) { + std::scoped_lock lock{texture_cache.mutex, buffer_cache.mutex}; if (!Settings::IsGPULevelHigh()) { - return buffer_cache.MustFlushRegion(addr, size); + return buffer_cache.IsRegionGpuModified(addr, size); } return texture_cache.IsRegionGpuModified(addr, size) || - buffer_cache.MustFlushRegion(addr, size); + buffer_cache.IsRegionGpuModified(addr, size); } void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { @@ -671,11 +524,14 @@ void RasterizerVulkan::InvalidateRegion(VAddr addr, u64 size) { return; } { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } pipeline_cache.InvalidateRegion(addr, size); - buffer_cache.InvalidateRegion(addr, size); query_cache.InvalidateRegion(addr, size); } @@ -683,25 +539,34 @@ void RasterizerVulkan::OnCPUWrite(VAddr addr, u64 size) { if (addr == 0 || size == 0) { return; } + pipeline_cache.OnCPUWrite(addr, size); { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.WriteMemory(addr, size); } - pipeline_cache.OnCPUWrite(addr, size); - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.CachedWriteMemory(addr, size); + } } void RasterizerVulkan::SyncGuestHost() { - buffer_cache.SyncGuestHost(); pipeline_cache.SyncGuestHost(); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.FlushCachedWrites(); + } } void RasterizerVulkan::UnmapMemory(VAddr addr, u64 size) { { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.UnmapMemory(addr, size); } - buffer_cache.OnCPUWrite(addr, size); + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.WriteMemory(addr, size); + } pipeline_cache.OnCPUWrite(addr, size); } @@ -774,18 +639,21 @@ void RasterizerVulkan::TickFrame() { draw_counter = 0; update_descriptor_queue.TickFrame(); fence_manager.TickFrame(); - buffer_cache.TickFrame(); staging_pool.TickFrame(); { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.TickFrame(); } + { + std::scoped_lock lock{buffer_cache.mutex}; + buffer_cache.TickFrame(); + } } bool RasterizerVulkan::AccelerateSurfaceCopy(const Tegra::Engines::Fermi2D::Surface& src, const Tegra::Engines::Fermi2D::Surface& dst, const Tegra::Engines::Fermi2D::Config& copy_config) { - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; texture_cache.BlitImage(dst, src, copy_config); return true; } @@ -795,13 +663,11 @@ bool RasterizerVulkan::AccelerateDisplay(const Tegra::FramebufferConfig& config, if (!framebuffer_addr) { return false; } - - auto lock = texture_cache.AcquireLock(); + std::scoped_lock lock{texture_cache.mutex}; ImageView* const image_view = texture_cache.TryFindFramebufferImageView(framebuffer_addr); if (!image_view) { return false; } - screen_info.image_view = image_view->Handle(VideoCommon::ImageViewType::e2D); screen_info.width = image_view->size.width; screen_info.height = image_view->size.height; @@ -830,29 +696,8 @@ void RasterizerVulkan::FlushWork() { draw_counter = 0; } -RasterizerVulkan::DrawParameters RasterizerVulkan::SetupGeometry(FixedPipelineState& fixed_state, - BufferBindings& buffer_bindings, - bool is_indexed, - bool is_instanced) { - MICROPROFILE_SCOPE(Vulkan_Geometry); - - const auto& regs = maxwell3d.regs; - - SetupVertexArrays(buffer_bindings); - - const u32 base_instance = regs.vb_base_instance; - const u32 num_instances = is_instanced ? maxwell3d.mme_draw.instance_count : 1; - const u32 base_vertex = is_indexed ? regs.vb_element_base : regs.vertex_buffer.first; - const u32 num_vertices = is_indexed ? regs.index_array.count : regs.vertex_buffer.count; - - DrawParameters params{base_instance, num_instances, base_vertex, num_vertices, is_indexed}; - SetupIndexBuffer(buffer_bindings, params, is_indexed); - - return params; -} - void RasterizerVulkan::SetupShaderDescriptors( - const std::array& shaders) { + const std::array& shaders, bool is_indexed) { image_view_indices.clear(); sampler_handles.clear(); for (size_t stage = 0; stage < Maxwell::MaxShaderStage; ++stage) { @@ -860,15 +705,27 @@ void RasterizerVulkan::SetupShaderDescriptors( if (!shader) { continue; } - const auto& entries = shader->GetEntries(); + const ShaderEntries& entries = shader->GetEntries(); SetupGraphicsUniformTexels(entries, stage); SetupGraphicsTextures(entries, stage); SetupGraphicsStorageTexels(entries, stage); SetupGraphicsImages(entries, stage); + + buffer_cache.SetEnabledUniformBuffers(stage, entries.enabled_uniform_buffers); + buffer_cache.UnbindGraphicsStorageBuffers(stage); + u32 ssbo_index = 0; + for (const auto& buffer : entries.global_buffers) { + buffer_cache.BindGraphicsStorageBuffer(stage, ssbo_index, buffer.cbuf_index, + buffer.cbuf_offset, buffer.is_written); + ++ssbo_index; + } } const std::span indices_span(image_view_indices.data(), image_view_indices.size()); + buffer_cache.UpdateGraphicsBuffers(is_indexed); texture_cache.FillGraphicsImageViews(indices_span, image_view_ids); + buffer_cache.BindHostGeometryBuffers(is_indexed); + update_descriptor_queue.Acquire(); ImageViewId* image_view_id_ptr = image_view_ids.data(); @@ -879,11 +736,9 @@ void RasterizerVulkan::SetupShaderDescriptors( if (!shader) { continue; } - const auto& entries = shader->GetEntries(); - SetupGraphicsConstBuffers(entries, stage); - SetupGraphicsGlobalBuffers(entries, stage); - PushImageDescriptors(entries, texture_cache, update_descriptor_queue, image_view_id_ptr, - sampler_ptr); + buffer_cache.BindHostStageBuffers(stage); + PushImageDescriptors(shader->GetEntries(), texture_cache, update_descriptor_queue, + image_view_id_ptr, sampler_ptr); } } @@ -916,27 +771,11 @@ void RasterizerVulkan::BeginTransformFeedback() { LOG_ERROR(Render_Vulkan, "Transform feedbacks used but not supported"); return; } - UNIMPLEMENTED_IF(regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationControl) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::TesselationEval) || regs.IsShaderConfigEnabled(Maxwell::ShaderProgram::Geometry)); - - UNIMPLEMENTED_IF(regs.tfb_bindings[1].buffer_enable); - UNIMPLEMENTED_IF(regs.tfb_bindings[2].buffer_enable); - UNIMPLEMENTED_IF(regs.tfb_bindings[3].buffer_enable); - - const auto& binding = regs.tfb_bindings[0]; - UNIMPLEMENTED_IF(binding.buffer_enable == 0); - UNIMPLEMENTED_IF(binding.buffer_offset != 0); - - const GPUVAddr gpu_addr = binding.Address(); - const VkDeviceSize size = static_cast(binding.buffer_size); - const auto info = buffer_cache.UploadMemory(gpu_addr, size, 4, true); - - scheduler.Record([buffer = info.handle, offset = info.offset, size](vk::CommandBuffer cmdbuf) { - cmdbuf.BindTransformFeedbackBuffersEXT(0, 1, &buffer, &offset, &size); - cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); - }); + scheduler.Record( + [](vk::CommandBuffer cmdbuf) { cmdbuf.BeginTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } void RasterizerVulkan::EndTransformFeedback() { @@ -947,104 +786,11 @@ void RasterizerVulkan::EndTransformFeedback() { if (!device.IsExtTransformFeedbackSupported()) { return; } - scheduler.Record( [](vk::CommandBuffer cmdbuf) { cmdbuf.EndTransformFeedbackEXT(0, 0, nullptr, nullptr); }); } -void RasterizerVulkan::SetupVertexArrays(BufferBindings& buffer_bindings) { - const auto& regs = maxwell3d.regs; - - for (size_t index = 0; index < Maxwell::NumVertexArrays; ++index) { - const auto& vertex_array = regs.vertex_array[index]; - if (!vertex_array.IsEnabled()) { - continue; - } - const GPUVAddr start{vertex_array.StartAddress()}; - const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - - ASSERT(end >= start); - const size_t size = end - start; - if (size == 0) { - buffer_bindings.AddVertexBinding(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE, 0); - continue; - } - const auto info = buffer_cache.UploadMemory(start, size); - buffer_bindings.AddVertexBinding(info.handle, info.offset, size, vertex_array.stride); - } -} - -void RasterizerVulkan::SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, - bool is_indexed) { - if (params.num_vertices == 0) { - return; - } - const auto& regs = maxwell3d.regs; - switch (regs.draw.topology) { - case Maxwell::PrimitiveTopology::Quads: { - if (!params.is_indexed) { - const auto [buffer, offset] = - quad_array_pass.Assemble(params.num_vertices, params.base_vertex); - buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); - params.base_vertex = 0; - params.num_vertices = params.num_vertices * 6 / 4; - params.is_indexed = true; - break; - } - const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); - VkBuffer buffer = info.handle; - u64 offset = info.offset; - std::tie(buffer, offset) = quad_indexed_pass.Assemble( - regs.index_array.format, params.num_vertices, params.base_vertex, buffer, offset); - - buffer_bindings.SetIndexBinding(buffer, offset, VK_INDEX_TYPE_UINT32); - params.num_vertices = (params.num_vertices / 4) * 6; - params.base_vertex = 0; - break; - } - default: { - if (!is_indexed) { - break; - } - const GPUVAddr gpu_addr = regs.index_array.IndexStart(); - const auto info = buffer_cache.UploadMemory(gpu_addr, CalculateIndexBufferSize()); - VkBuffer buffer = info.handle; - u64 offset = info.offset; - - auto format = regs.index_array.format; - const bool is_uint8 = format == Maxwell::IndexFormat::UnsignedByte; - if (is_uint8 && !device.IsExtIndexTypeUint8Supported()) { - std::tie(buffer, offset) = uint8_pass.Assemble(params.num_vertices, buffer, offset); - format = Maxwell::IndexFormat::UnsignedShort; - } - - buffer_bindings.SetIndexBinding(buffer, offset, MaxwellToVK::IndexFormat(device, format)); - break; - } - } -} - -void RasterizerVulkan::SetupGraphicsConstBuffers(const ShaderEntries& entries, size_t stage) { - MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& shader_stage = maxwell3d.state.shader_stages[stage]; - for (const auto& entry : entries.const_buffers) { - SetupConstBuffer(entry, shader_stage.const_buffers[entry.GetIndex()]); - } -} - -void RasterizerVulkan::SetupGraphicsGlobalBuffers(const ShaderEntries& entries, size_t stage) { - MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - const auto& cbufs{maxwell3d.state.shader_stages[stage]}; - - for (const auto& entry : entries.global_buffers) { - const auto addr = cbufs.const_buffers[entry.GetCbufIndex()].address + entry.GetCbufOffset(); - SetupGlobalBuffer(entry, addr); - } -} - void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Textures); const auto& regs = maxwell3d.regs; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.uniform_texels) { @@ -1054,7 +800,6 @@ void RasterizerVulkan::SetupGraphicsUniformTexels(const ShaderEntries& entries, } void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Textures); const auto& regs = maxwell3d.regs; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.samplers) { @@ -1070,7 +815,6 @@ void RasterizerVulkan::SetupGraphicsTextures(const ShaderEntries& entries, size_ } void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Textures); const auto& regs = maxwell3d.regs; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.storage_texels) { @@ -1080,7 +824,6 @@ void RasterizerVulkan::SetupGraphicsStorageTexels(const ShaderEntries& entries, } void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t stage) { - MICROPROFILE_SCOPE(Vulkan_Images); const auto& regs = maxwell3d.regs; const bool via_header_index = regs.sampler_index == Maxwell::SamplerIndex::ViaHeaderIndex; for (const auto& entry : entries.images) { @@ -1089,32 +832,7 @@ void RasterizerVulkan::SetupGraphicsImages(const ShaderEntries& entries, size_t } } -void RasterizerVulkan::SetupComputeConstBuffers(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_ConstBuffers); - const auto& launch_desc = kepler_compute.launch_description; - for (const auto& entry : entries.const_buffers) { - const auto& config = launch_desc.const_buffer_config[entry.GetIndex()]; - const std::bitset<8> mask = launch_desc.const_buffer_enable_mask.Value(); - const Tegra::Engines::ConstBufferInfo info{ - .address = config.Address(), - .size = config.size, - .enabled = mask[entry.GetIndex()], - }; - SetupConstBuffer(entry, info); - } -} - -void RasterizerVulkan::SetupComputeGlobalBuffers(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_GlobalBuffers); - const auto& cbufs{kepler_compute.launch_description.const_buffer_config}; - for (const auto& entry : entries.global_buffers) { - const auto addr{cbufs[entry.GetCbufIndex()].Address() + entry.GetCbufOffset()}; - SetupGlobalBuffer(entry, addr); - } -} - void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Textures); const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.uniform_texels) { const TextureHandle handle = @@ -1124,7 +842,6 @@ void RasterizerVulkan::SetupComputeUniformTexels(const ShaderEntries& entries) { } void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Textures); const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.samplers) { for (size_t index = 0; index < entry.size; ++index) { @@ -1139,7 +856,6 @@ void RasterizerVulkan::SetupComputeTextures(const ShaderEntries& entries) { } void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Textures); const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.storage_texels) { const TextureHandle handle = @@ -1149,7 +865,6 @@ void RasterizerVulkan::SetupComputeStorageTexels(const ShaderEntries& entries) { } void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { - MICROPROFILE_SCOPE(Vulkan_Images); const bool via_header_index = kepler_compute.launch_description.linked_tsc; for (const auto& entry : entries.images) { const TextureHandle handle = @@ -1158,42 +873,6 @@ void RasterizerVulkan::SetupComputeImages(const ShaderEntries& entries) { } } -void RasterizerVulkan::SetupConstBuffer(const ConstBufferEntry& entry, - const Tegra::Engines::ConstBufferInfo& buffer) { - if (!buffer.enabled) { - // Set values to zero to unbind buffers - update_descriptor_queue.AddBuffer(DefaultBuffer(), 0, DEFAULT_BUFFER_SIZE); - return; - } - // Align the size to avoid bad std140 interactions - const size_t size = Common::AlignUp(CalculateConstBufferSize(entry, buffer), 4 * sizeof(float)); - ASSERT(size <= MaxConstbufferSize); - - const u64 alignment = device.GetUniformBufferAlignment(); - const auto info = buffer_cache.UploadMemory(buffer.address, size, alignment); - update_descriptor_queue.AddBuffer(info.handle, info.offset, size); -} - -void RasterizerVulkan::SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address) { - const u64 actual_addr = gpu_memory.Read(address); - const u32 size = gpu_memory.Read(address + 8); - - if (size == 0) { - // Sometimes global memory pointers don't have a proper size. Upload a dummy entry - // because Vulkan doesn't like empty buffers. - // Note: Do *not* use DefaultBuffer() here, storage buffers can be written breaking the - // default buffer. - static constexpr size_t dummy_size = 4; - const auto info = buffer_cache.GetEmptyBuffer(dummy_size); - update_descriptor_queue.AddBuffer(info.handle, info.offset, dummy_size); - return; - } - - const auto info = buffer_cache.UploadMemory( - actual_addr, size, device.GetStorageBufferAlignment(), entry.IsWritten()); - update_descriptor_queue.AddBuffer(info.handle, info.offset, size); -} - void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs) { if (!state_tracker.TouchViewports()) { return; @@ -1206,7 +885,8 @@ void RasterizerVulkan::UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& reg GetViewportState(device, regs, 8), GetViewportState(device, regs, 9), GetViewportState(device, regs, 10), GetViewportState(device, regs, 11), GetViewportState(device, regs, 12), GetViewportState(device, regs, 13), - GetViewportState(device, regs, 14), GetViewportState(device, regs, 15)}; + GetViewportState(device, regs, 14), GetViewportState(device, regs, 15), + }; scheduler.Record([viewports](vk::CommandBuffer cmdbuf) { cmdbuf.SetViewport(0, viewports); }); } @@ -1214,13 +894,14 @@ void RasterizerVulkan::UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs if (!state_tracker.TouchScissors()) { return; } - const std::array scissors = { + const std::array scissors{ GetScissorState(regs, 0), GetScissorState(regs, 1), GetScissorState(regs, 2), GetScissorState(regs, 3), GetScissorState(regs, 4), GetScissorState(regs, 5), GetScissorState(regs, 6), GetScissorState(regs, 7), GetScissorState(regs, 8), GetScissorState(regs, 9), GetScissorState(regs, 10), GetScissorState(regs, 11), GetScissorState(regs, 12), GetScissorState(regs, 13), GetScissorState(regs, 14), - GetScissorState(regs, 15)}; + GetScissorState(regs, 15), + }; scheduler.Record([scissors](vk::CommandBuffer cmdbuf) { cmdbuf.SetScissor(0, scissors); }); } @@ -1385,73 +1066,4 @@ void RasterizerVulkan::UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& }); } -size_t RasterizerVulkan::CalculateGraphicsStreamBufferSize(bool is_indexed) const { - size_t size = CalculateVertexArraysSize(); - if (is_indexed) { - size = Common::AlignUp(size, 4) + CalculateIndexBufferSize(); - } - size += Maxwell::MaxConstBuffers * (MaxConstbufferSize + device.GetUniformBufferAlignment()); - return size; -} - -size_t RasterizerVulkan::CalculateComputeStreamBufferSize() const { - return Tegra::Engines::KeplerCompute::NumConstBuffers * - (Maxwell::MaxConstBufferSize + device.GetUniformBufferAlignment()); -} - -size_t RasterizerVulkan::CalculateVertexArraysSize() const { - const auto& regs = maxwell3d.regs; - - size_t size = 0; - for (u32 index = 0; index < Maxwell::NumVertexArrays; ++index) { - // This implementation assumes that all attributes are used in the shader. - const GPUVAddr start{regs.vertex_array[index].StartAddress()}; - const GPUVAddr end{regs.vertex_array_limit[index].LimitAddress()}; - DEBUG_ASSERT(end >= start); - - size += (end - start) * regs.vertex_array[index].enable; - } - return size; -} - -size_t RasterizerVulkan::CalculateIndexBufferSize() const { - return static_cast(maxwell3d.regs.index_array.count) * - static_cast(maxwell3d.regs.index_array.FormatSizeInBytes()); -} - -size_t RasterizerVulkan::CalculateConstBufferSize( - const ConstBufferEntry& entry, const Tegra::Engines::ConstBufferInfo& buffer) const { - if (entry.IsIndirect()) { - // Buffer is accessed indirectly, so upload the entire thing - return buffer.size; - } else { - // Buffer is accessed directly, upload just what we use - return entry.GetSize(); - } -} - -VkBuffer RasterizerVulkan::DefaultBuffer() { - if (default_buffer) { - return *default_buffer; - } - default_buffer = device.GetLogical().CreateBuffer({ - .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO, - .pNext = nullptr, - .flags = 0, - .size = DEFAULT_BUFFER_SIZE, - .usage = VK_BUFFER_USAGE_TRANSFER_DST_BIT | VK_BUFFER_USAGE_VERTEX_BUFFER_BIT | - VK_BUFFER_USAGE_UNIFORM_BUFFER_BIT, - .sharingMode = VK_SHARING_MODE_EXCLUSIVE, - .queueFamilyIndexCount = 0, - .pQueueFamilyIndices = nullptr, - }); - default_buffer_commit = memory_allocator.Commit(default_buffer, MemoryUsage::DeviceLocal); - - scheduler.RequestOutsideRenderPassOperationContext(); - scheduler.Record([buffer = *default_buffer](vk::CommandBuffer cmdbuf) { - cmdbuf.FillBuffer(buffer, 0, DEFAULT_BUFFER_SIZE, 0); - }); - return *default_buffer; -} - } // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_rasterizer.h b/src/video_core/renderer_vulkan/vk_rasterizer.h index 8e261b9bd..2841d7db6 100755 --- a/src/video_core/renderer_vulkan/vk_rasterizer.h +++ b/src/video_core/renderer_vulkan/vk_rasterizer.h @@ -18,14 +18,12 @@ #include "video_core/renderer_vulkan/blit_image.h" #include "video_core/renderer_vulkan/fixed_pipeline_state.h" #include "video_core/renderer_vulkan/vk_buffer_cache.h" -#include "video_core/renderer_vulkan/vk_compute_pass.h" #include "video_core/renderer_vulkan/vk_descriptor_pool.h" #include "video_core/renderer_vulkan/vk_fence_manager.h" #include "video_core/renderer_vulkan/vk_pipeline_cache.h" #include "video_core/renderer_vulkan/vk_query_cache.h" #include "video_core/renderer_vulkan/vk_scheduler.h" #include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" -#include "video_core/renderer_vulkan/vk_stream_buffer.h" #include "video_core/renderer_vulkan/vk_texture_cache.h" #include "video_core/renderer_vulkan/vk_update_descriptor.h" #include "video_core/shader/async_shaders.h" @@ -49,7 +47,6 @@ namespace Vulkan { struct VKScreenInfo; class StateTracker; -class BufferBindings; class RasterizerVulkan final : public VideoCore::RasterizerAccelerated { public: @@ -65,8 +62,11 @@ public: void DispatchCompute(GPUVAddr code_addr) override; void ResetCounter(VideoCore::QueryType type) override; void Query(GPUVAddr gpu_addr, VideoCore::QueryType type, std::optional timestamp) override; + void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size) override; void FlushAll() override; void FlushRegion(VAddr addr, u64 size) override; + void InvalidateExceptTextureCache(VAddr addr, u64 size) override; + void InvalidateTextureCache(VAddr addr, u64 size) override; bool MustFlushRegion(VAddr addr, u64 size) override; void InvalidateRegion(VAddr addr, u64 size) override; void OnCPUWrite(VAddr addr, u64 size) override; @@ -107,24 +107,11 @@ private: static constexpr VkDeviceSize DEFAULT_BUFFER_SIZE = 4 * sizeof(float); - struct DrawParameters { - void Draw(vk::CommandBuffer cmdbuf) const; - - u32 base_instance = 0; - u32 num_instances = 0; - u32 base_vertex = 0; - u32 num_vertices = 0; - bool is_indexed = 0; - }; - void FlushWork(); - /// Setups geometry buffers and state. - DrawParameters SetupGeometry(FixedPipelineState& fixed_state, BufferBindings& buffer_bindings, - bool is_indexed, bool is_instanced); - /// Setup descriptors in the graphics pipeline. - void SetupShaderDescriptors(const std::array& shaders); + void SetupShaderDescriptors(const std::array& shaders, + bool is_indexed); void UpdateDynamicStates(); @@ -132,16 +119,6 @@ private: void EndTransformFeedback(); - void SetupVertexArrays(BufferBindings& buffer_bindings); - - void SetupIndexBuffer(BufferBindings& buffer_bindings, DrawParameters& params, bool is_indexed); - - /// Setup constant buffers in the graphics pipeline. - void SetupGraphicsConstBuffers(const ShaderEntries& entries, std::size_t stage); - - /// Setup global buffers in the graphics pipeline. - void SetupGraphicsGlobalBuffers(const ShaderEntries& entries, std::size_t stage); - /// Setup uniform texels in the graphics pipeline. void SetupGraphicsUniformTexels(const ShaderEntries& entries, std::size_t stage); @@ -154,12 +131,6 @@ private: /// Setup images in the graphics pipeline. void SetupGraphicsImages(const ShaderEntries& entries, std::size_t stage); - /// Setup constant buffers in the compute pipeline. - void SetupComputeConstBuffers(const ShaderEntries& entries); - - /// Setup global buffers in the compute pipeline. - void SetupComputeGlobalBuffers(const ShaderEntries& entries); - /// Setup texel buffers in the compute pipeline. void SetupComputeUniformTexels(const ShaderEntries& entries); @@ -172,11 +143,6 @@ private: /// Setup images in the compute pipeline. void SetupComputeImages(const ShaderEntries& entries); - void SetupConstBuffer(const ConstBufferEntry& entry, - const Tegra::Engines::ConstBufferInfo& buffer); - - void SetupGlobalBuffer(const GlobalBufferEntry& entry, GPUVAddr address); - void UpdateViewportsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateScissorsState(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateDepthBias(Tegra::Engines::Maxwell3D::Regs& regs); @@ -193,19 +159,6 @@ private: void UpdateStencilOp(Tegra::Engines::Maxwell3D::Regs& regs); void UpdateStencilTestEnable(Tegra::Engines::Maxwell3D::Regs& regs); - size_t CalculateGraphicsStreamBufferSize(bool is_indexed) const; - - size_t CalculateComputeStreamBufferSize() const; - - size_t CalculateVertexArraysSize() const; - - size_t CalculateIndexBufferSize() const; - - size_t CalculateConstBufferSize(const ConstBufferEntry& entry, - const Tegra::Engines::ConstBufferInfo& buffer) const; - - VkBuffer DefaultBuffer(); - Tegra::GPU& gpu; Tegra::MemoryManager& gpu_memory; Tegra::Engines::Maxwell3D& maxwell3d; @@ -217,24 +170,19 @@ private: StateTracker& state_tracker; VKScheduler& scheduler; - VKStreamBuffer stream_buffer; StagingBufferPool staging_pool; VKDescriptorPool descriptor_pool; VKUpdateDescriptorQueue update_descriptor_queue; BlitImageHelper blit_image; - QuadArrayPass quad_array_pass; - QuadIndexedPass quad_indexed_pass; - Uint8Pass uint8_pass; TextureCacheRuntime texture_cache_runtime; TextureCache texture_cache; + BufferCacheRuntime buffer_cache_runtime; + BufferCache buffer_cache; VKPipelineCache pipeline_cache; - VKBufferCache buffer_cache; VKQueryCache query_cache; VKFenceManager fence_manager; - vk::Buffer default_buffer; - MemoryCommit default_buffer_commit; vk::Event wfi_event; VideoCommon::Shader::AsyncShaders async_shaders; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.cpp b/src/video_core/renderer_vulkan/vk_scheduler.cpp index 66004f9c0..f35c120b0 100755 --- a/src/video_core/renderer_vulkan/vk_scheduler.cpp +++ b/src/video_core/renderer_vulkan/vk_scheduler.cpp @@ -52,18 +52,6 @@ VKScheduler::~VKScheduler() { worker_thread.join(); } -u64 VKScheduler::CurrentTick() const noexcept { - return master_semaphore->CurrentTick(); -} - -bool VKScheduler::IsFree(u64 tick) const noexcept { - return master_semaphore->IsFree(tick); -} - -void VKScheduler::Wait(u64 tick) { - master_semaphore->Wait(tick); -} - void VKScheduler::Flush(VkSemaphore semaphore) { SubmitExecution(semaphore); AllocateNewContext(); @@ -269,7 +257,7 @@ void VKScheduler::EndRenderPass() { cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_EARLY_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_LATE_FRAGMENT_TESTS_BIT | VK_PIPELINE_STAGE_COLOR_ATTACHMENT_OUTPUT_BIT, - VK_PIPELINE_STAGE_ALL_GRAPHICS_BIT, 0, nullptr, nullptr, + VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, 0, nullptr, nullptr, vk::Span(barriers.data(), num_images)); }); state.renderpass = nullptr; diff --git a/src/video_core/renderer_vulkan/vk_scheduler.h b/src/video_core/renderer_vulkan/vk_scheduler.h index 4cd43e425..54e192d54 100755 --- a/src/video_core/renderer_vulkan/vk_scheduler.h +++ b/src/video_core/renderer_vulkan/vk_scheduler.h @@ -12,6 +12,7 @@ #include #include "common/common_types.h" #include "common/threadsafe_queue.h" +#include "video_core/renderer_vulkan/vk_master_semaphore.h" #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { @@ -19,7 +20,6 @@ namespace Vulkan { class CommandPool; class Device; class Framebuffer; -class MasterSemaphore; class StateTracker; class VKQueryCache; @@ -30,15 +30,6 @@ public: explicit VKScheduler(const Device& device, StateTracker& state_tracker); ~VKScheduler(); - /// Returns the current command buffer tick. - [[nodiscard]] u64 CurrentTick() const noexcept; - - /// Returns true when a tick has been triggered by the GPU. - [[nodiscard]] bool IsFree(u64 tick) const noexcept; - - /// Waits for the given tick to trigger on the GPU. - void Wait(u64 tick); - /// Sends the current execution context to the GPU. void Flush(VkSemaphore semaphore = nullptr); @@ -80,6 +71,21 @@ public: (void)chunk->Record(command); } + /// Returns the current command buffer tick. + [[nodiscard]] u64 CurrentTick() const noexcept { + return master_semaphore->CurrentTick(); + } + + /// Returns true when a tick has been triggered by the GPU. + [[nodiscard]] bool IsFree(u64 tick) const noexcept { + return master_semaphore->IsFree(tick); + } + + /// Waits for the given tick to trigger on the GPU. + void Wait(u64 tick) { + master_semaphore->Wait(tick); + } + /// Returns the master timeline semaphore. [[nodiscard]] MasterSemaphore& GetMasterSemaphore() const noexcept { return *master_semaphore; diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp index 89cbe01ad..c96255871 100755 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.cpp @@ -3124,6 +3124,9 @@ ShaderEntries GenerateShaderEntries(const VideoCommon::Shader::ShaderIR& ir) { entries.attributes.insert(GetGenericAttributeLocation(attribute)); } } + for (const auto& buffer : entries.const_buffers) { + entries.enabled_uniform_buffers |= 1U << buffer.GetIndex(); + } entries.clip_distances = ir.GetClipDistances(); entries.shader_length = ir.GetLength(); entries.uses_warps = ir.UsesWarps(); diff --git a/src/video_core/renderer_vulkan/vk_shader_decompiler.h b/src/video_core/renderer_vulkan/vk_shader_decompiler.h index 26381e444..5d94132a5 100755 --- a/src/video_core/renderer_vulkan/vk_shader_decompiler.h +++ b/src/video_core/renderer_vulkan/vk_shader_decompiler.h @@ -39,24 +39,7 @@ private: u32 index{}; }; -class GlobalBufferEntry { -public: - constexpr explicit GlobalBufferEntry(u32 cbuf_index_, u32 cbuf_offset_, bool is_written_) - : cbuf_index{cbuf_index_}, cbuf_offset{cbuf_offset_}, is_written{is_written_} {} - - constexpr u32 GetCbufIndex() const { - return cbuf_index; - } - - constexpr u32 GetCbufOffset() const { - return cbuf_offset; - } - - constexpr bool IsWritten() const { - return is_written; - } - -private: +struct GlobalBufferEntry { u32 cbuf_index{}; u32 cbuf_offset{}; bool is_written{}; @@ -78,6 +61,7 @@ struct ShaderEntries { std::set attributes; std::array clip_distances{}; std::size_t shader_length{}; + u32 enabled_uniform_buffers{}; bool uses_warps{}; }; diff --git a/src/video_core/renderer_vulkan/vk_state_tracker.cpp b/src/video_core/renderer_vulkan/vk_state_tracker.cpp index 1779a2e30..e81fad007 100755 --- a/src/video_core/renderer_vulkan/vk_state_tracker.cpp +++ b/src/video_core/renderer_vulkan/vk_state_tracker.cpp @@ -30,15 +30,18 @@ using Table = Maxwell3D::DirtyState::Table; using Flags = Maxwell3D::DirtyState::Flags; Flags MakeInvalidationFlags() { - static constexpr std::array INVALIDATION_FLAGS{ + static constexpr int INVALIDATION_FLAGS[]{ Viewports, Scissors, DepthBias, BlendConstants, DepthBounds, StencilProperties, CullMode, DepthBoundsEnable, DepthTestEnable, DepthWriteEnable, - DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, + DepthCompareOp, FrontFace, StencilOp, StencilTestEnable, VertexBuffers, }; Flags flags{}; for (const int flag : INVALIDATION_FLAGS) { flags[flag] = true; } + for (int index = VertexBuffer0; index <= VertexBuffer31; ++index) { + flags[index] = true; + } return flags; } @@ -130,7 +133,7 @@ void SetupDirtyStencilTestEnable(Tables& tables) { StateTracker::StateTracker(Tegra::GPU& gpu) : flags{gpu.Maxwell3D().dirty.flags}, invalidation_flags{MakeInvalidationFlags()} { auto& tables = gpu.Maxwell3D().dirty.tables; - SetupDirtyRenderTargets(tables); + SetupDirtyFlags(tables); SetupDirtyViewports(tables); SetupDirtyScissors(tables); SetupDirtyDepthBias(tables); diff --git a/src/video_core/renderer_vulkan/vk_swapchain.cpp b/src/video_core/renderer_vulkan/vk_swapchain.cpp index 725a2a05d..0b63bd6c8 100755 --- a/src/video_core/renderer_vulkan/vk_swapchain.cpp +++ b/src/video_core/renderer_vulkan/vk_swapchain.cpp @@ -56,8 +56,11 @@ VkExtent2D ChooseSwapExtent(const VkSurfaceCapabilitiesKHR& capabilities, u32 wi } // Anonymous namespace -VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_) - : surface{surface_}, device{device_}, scheduler{scheduler_} {} +VKSwapchain::VKSwapchain(VkSurfaceKHR surface_, const Device& device_, VKScheduler& scheduler_, + u32 width, u32 height, bool srgb) + : surface{surface_}, device{device_}, scheduler{scheduler_} { + Create(width, height, srgb); +} VKSwapchain::~VKSwapchain() = default; diff --git a/src/video_core/renderer_vulkan/vk_swapchain.h b/src/video_core/renderer_vulkan/vk_swapchain.h index 2eadd62b3..a728511e0 100755 --- a/src/video_core/renderer_vulkan/vk_swapchain.h +++ b/src/video_core/renderer_vulkan/vk_swapchain.h @@ -20,7 +20,8 @@ class VKScheduler; class VKSwapchain { public: - explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler); + explicit VKSwapchain(VkSurfaceKHR surface, const Device& device, VKScheduler& scheduler, + u32 width, u32 height, bool srgb); ~VKSwapchain(); /// Creates (or recreates) the swapchain with a given size. diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.cpp b/src/video_core/renderer_vulkan/vk_texture_cache.cpp index ab14922d7..35b988d9a 100755 --- a/src/video_core/renderer_vulkan/vk_texture_cache.cpp +++ b/src/video_core/renderer_vulkan/vk_texture_cache.cpp @@ -410,46 +410,47 @@ constexpr VkBorderColor ConvertBorderColor(const std::array& color) { void CopyBufferToImage(vk::CommandBuffer cmdbuf, VkBuffer src_buffer, VkImage image, VkImageAspectFlags aspect_mask, bool is_initialized, std::span copies) { - static constexpr VkAccessFlags ACCESS_FLAGS = VK_ACCESS_SHADER_WRITE_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + static constexpr VkAccessFlags WRITE_ACCESS_FLAGS = + VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT; + static constexpr VkAccessFlags READ_ACCESS_FLAGS = VK_ACCESS_SHADER_READ_BIT | + VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | + VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT; const VkImageMemoryBarrier read_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = ACCESS_FLAGS, + .srcAccessMask = WRITE_ACCESS_FLAGS, .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .oldLayout = is_initialized ? VK_IMAGE_LAYOUT_GENERAL : VK_IMAGE_LAYOUT_UNDEFINED, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, - .subresourceRange = - { - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, }; const VkImageMemoryBarrier write_barrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, .srcAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = ACCESS_FLAGS, + .dstAccessMask = WRITE_ACCESS_FLAGS | READ_ACCESS_FLAGS, .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, .newLayout = VK_IMAGE_LAYOUT_GENERAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, .image = image, - .subresourceRange = - { - .aspectMask = aspect_mask, - .baseMipLevel = 0, - .levelCount = VK_REMAINING_MIP_LEVELS, - .baseArrayLayer = 0, - .layerCount = VK_REMAINING_ARRAY_LAYERS, - }, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, }; cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, 0, read_barrier); @@ -553,20 +554,12 @@ void TextureCacheRuntime::Finish() { scheduler.Finish(); } -ImageBufferMap TextureCacheRuntime::MapUploadBuffer(size_t size) { - const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Upload); - return { - .handle = staging_ref.buffer, - .span = staging_ref.mapped_span, - }; +StagingBufferRef TextureCacheRuntime::UploadStagingBuffer(size_t size) { + return staging_buffer_pool.Request(size, MemoryUsage::Upload); } -ImageBufferMap TextureCacheRuntime::MapDownloadBuffer(size_t size) { - const auto staging_ref = staging_buffer_pool.Request(size, MemoryUsage::Download); - return { - .handle = staging_ref.buffer, - .span = staging_ref.mapped_span, - }; +StagingBufferRef TextureCacheRuntime::DownloadStagingBuffer(size_t size) { + return staging_buffer_pool.Request(size, MemoryUsage::Download); } void TextureCacheRuntime::BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, @@ -738,7 +731,7 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, - .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, .oldLayout = VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_GENERAL, .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, @@ -749,12 +742,9 @@ void TextureCacheRuntime::CopyImage(Image& dst, Image& src, VkImageMemoryBarrier{ .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, .pNext = nullptr, - .srcAccessMask = VK_ACCESS_SHADER_READ_BIT | VK_ACCESS_SHADER_WRITE_BIT | - VK_ACCESS_COLOR_ATTACHMENT_READ_BIT | - VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | - VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_READ_BIT | + .srcAccessMask = VK_ACCESS_SHADER_WRITE_BIT | VK_ACCESS_COLOR_ATTACHMENT_WRITE_BIT | VK_ACCESS_DEPTH_STENCIL_ATTACHMENT_WRITE_BIT | - VK_ACCESS_TRANSFER_READ_BIT | VK_ACCESS_TRANSFER_WRITE_BIT, + VK_ACCESS_TRANSFER_WRITE_BIT, .dstAccessMask = VK_ACCESS_TRANSFER_WRITE_BIT, .oldLayout = VK_IMAGE_LAYOUT_GENERAL, .newLayout = VK_IMAGE_LAYOUT_TRANSFER_DST_OPTIMAL, @@ -812,12 +802,12 @@ Image::Image(TextureCacheRuntime& runtime, const ImageInfo& info_, GPUVAddr gpu_ } } -void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, +void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, std::span copies) { // TODO: Move this to another API scheduler->RequestOutsideRenderPassOperationContext(); std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); - const VkBuffer src_buffer = map.handle; + const VkBuffer src_buffer = map.buffer; const VkImage vk_image = *image; const VkImageAspectFlags vk_aspect_mask = aspect_mask; const bool is_initialized = std::exchange(initialized, true); @@ -827,12 +817,12 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, }); } -void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, +void Image::UploadMemory(const StagingBufferRef& map, size_t buffer_offset, std::span copies) { // TODO: Move this to another API scheduler->RequestOutsideRenderPassOperationContext(); std::vector vk_copies = TransformBufferCopies(copies, buffer_offset); - const VkBuffer src_buffer = map.handle; + const VkBuffer src_buffer = map.buffer; const VkBuffer dst_buffer = *buffer; scheduler->Record([src_buffer, dst_buffer, vk_copies](vk::CommandBuffer cmdbuf) { // TODO: Barriers @@ -840,13 +830,58 @@ void Image::UploadMemory(const ImageBufferMap& map, size_t buffer_offset, }); } -void Image::DownloadMemory(const ImageBufferMap& map, size_t buffer_offset, +void Image::DownloadMemory(const StagingBufferRef& map, size_t buffer_offset, std::span copies) { std::vector vk_copies = TransformBufferImageCopies(copies, buffer_offset, aspect_mask); - scheduler->Record([buffer = map.handle, image = *image, aspect_mask = aspect_mask, + scheduler->Record([buffer = map.buffer, image = *image, aspect_mask = aspect_mask, vk_copies](vk::CommandBuffer cmdbuf) { - // TODO: Barriers - cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_GENERAL, buffer, vk_copies); + const VkImageMemoryBarrier read_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_TRANSFER_READ_BIT, + .oldLayout = VK_IMAGE_LAYOUT_GENERAL, + .newLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + const VkImageMemoryBarrier image_write_barrier{ + .sType = VK_STRUCTURE_TYPE_IMAGE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = 0, + .dstAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .oldLayout = VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, + .newLayout = VK_IMAGE_LAYOUT_GENERAL, + .srcQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .dstQueueFamilyIndex = VK_QUEUE_FAMILY_IGNORED, + .image = image, + .subresourceRange{ + .aspectMask = aspect_mask, + .baseMipLevel = 0, + .levelCount = VK_REMAINING_MIP_LEVELS, + .baseArrayLayer = 0, + .layerCount = VK_REMAINING_ARRAY_LAYERS, + }, + }; + const VkMemoryBarrier memory_write_barrier{ + .sType = VK_STRUCTURE_TYPE_MEMORY_BARRIER, + .pNext = nullptr, + .srcAccessMask = VK_ACCESS_MEMORY_WRITE_BIT, + .dstAccessMask = VK_ACCESS_MEMORY_READ_BIT | VK_ACCESS_MEMORY_WRITE_BIT, + }; + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, VK_PIPELINE_STAGE_TRANSFER_BIT, + 0, read_barrier); + cmdbuf.CopyImageToBuffer(image, VK_IMAGE_LAYOUT_TRANSFER_SRC_OPTIMAL, buffer, vk_copies); + cmdbuf.PipelineBarrier(VK_PIPELINE_STAGE_TRANSFER_BIT, VK_PIPELINE_STAGE_ALL_COMMANDS_BIT, + 0, memory_write_barrier, nullptr, image_write_barrier); }); } @@ -1106,7 +1141,7 @@ Framebuffer::Framebuffer(TextureCacheRuntime& runtime, std::span(num_layers), + .layers = static_cast(std::max(num_layers, 1)), }); if (runtime.device.HasDebuggingToolAttached()) { framebuffer.SetObjectNameEXT(VideoCommon::Name(key).c_str()); diff --git a/src/video_core/renderer_vulkan/vk_texture_cache.h b/src/video_core/renderer_vulkan/vk_texture_cache.h index a55d405d1..405021d30 100755 --- a/src/video_core/renderer_vulkan/vk_texture_cache.h +++ b/src/video_core/renderer_vulkan/vk_texture_cache.h @@ -7,6 +7,7 @@ #include #include +#include "video_core/renderer_vulkan/vk_staging_buffer_pool.h" #include "video_core/texture_cache/texture_cache.h" #include "video_core/vulkan_common/vulkan_memory_allocator.h" #include "video_core/vulkan_common/vulkan_wrapper.h" @@ -53,19 +54,6 @@ struct hash { namespace Vulkan { -struct ImageBufferMap { - [[nodiscard]] VkBuffer Handle() const noexcept { - return handle; - } - - [[nodiscard]] std::span Span() const noexcept { - return span; - } - - VkBuffer handle; - std::span span; -}; - struct TextureCacheRuntime { const Device& device; VKScheduler& scheduler; @@ -76,9 +64,9 @@ struct TextureCacheRuntime { void Finish(); - [[nodiscard]] ImageBufferMap MapUploadBuffer(size_t size); + [[nodiscard]] StagingBufferRef UploadStagingBuffer(size_t size); - [[nodiscard]] ImageBufferMap MapDownloadBuffer(size_t size); + [[nodiscard]] StagingBufferRef DownloadStagingBuffer(size_t size); void BlitImage(Framebuffer* dst_framebuffer, ImageView& dst, ImageView& src, const std::array& dst_region, @@ -94,7 +82,7 @@ struct TextureCacheRuntime { return false; } - void AccelerateImageUpload(Image&, const ImageBufferMap&, size_t, + void AccelerateImageUpload(Image&, const StagingBufferRef&, size_t, std::span) { UNREACHABLE(); } @@ -112,13 +100,13 @@ public: explicit Image(TextureCacheRuntime&, const VideoCommon::ImageInfo& info, GPUVAddr gpu_addr, VAddr cpu_addr); - void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + void UploadMemory(const StagingBufferRef& map, size_t buffer_offset, std::span copies); - void UploadMemory(const ImageBufferMap& map, size_t buffer_offset, + void UploadMemory(const StagingBufferRef& map, size_t buffer_offset, std::span copies); - void DownloadMemory(const ImageBufferMap& map, size_t buffer_offset, + void DownloadMemory(const StagingBufferRef& map, size_t buffer_offset, std::span copies); [[nodiscard]] VkImage Handle() const noexcept { diff --git a/src/video_core/shader/async_shaders.h b/src/video_core/shader/async_shaders.h index 0dbb1a31f..7fdff6e56 100755 --- a/src/video_core/shader/async_shaders.h +++ b/src/video_core/shader/async_shaders.h @@ -9,16 +9,7 @@ #include #include -// This header includes both Vulkan and OpenGL headers, this has to be fixed -// Unfortunately, including OpenGL will include Windows.h that defines macros that can cause issues. -// Forcefully include glad early and undefine macros #include -#ifdef CreateEvent -#undef CreateEvent -#endif -#ifdef CreateSemaphore -#undef CreateSemaphore -#endif #include "common/common_types.h" #include "video_core/renderer_opengl/gl_device.h" diff --git a/src/video_core/shader/decode/other.cpp b/src/video_core/shader/decode/other.cpp index d3ea07aac..5f88537bc 100755 --- a/src/video_core/shader/decode/other.cpp +++ b/src/video_core/shader/decode/other.cpp @@ -76,6 +76,7 @@ u32 ShaderIR::DecodeOther(NodeBlock& bb, u32 pc) { case SystemVariable::InvocationId: return Operation(OperationCode::InvocationId); case SystemVariable::Ydirection: + uses_y_negate = true; return Operation(OperationCode::YNegate); case SystemVariable::InvocationInfo: LOG_WARNING(HW_GPU, "S2R instruction with InvocationInfo is incomplete"); diff --git a/src/video_core/shader/shader_ir.h b/src/video_core/shader/shader_ir.h index 0c6ab0f07..1cd7c14d7 100755 --- a/src/video_core/shader/shader_ir.h +++ b/src/video_core/shader/shader_ir.h @@ -139,6 +139,10 @@ public: return uses_legacy_varyings; } + bool UsesYNegate() const { + return uses_y_negate; + } + bool UsesWarps() const { return uses_warps; } @@ -465,6 +469,7 @@ private: bool uses_instance_id{}; bool uses_vertex_id{}; bool uses_legacy_varyings{}; + bool uses_y_negate{}; bool uses_warps{}; bool uses_indexed_samplers{}; diff --git a/src/video_core/texture_cache/texture_cache.h b/src/video_core/texture_cache/texture_cache.h index d1080300f..f336b705f 100755 --- a/src/video_core/texture_cache/texture_cache.h +++ b/src/video_core/texture_cache/texture_cache.h @@ -103,9 +103,6 @@ public: /// Notify the cache that a new frame has been queued void TickFrame(); - /// Return an unique mutually exclusive lock for the cache - [[nodiscard]] std::unique_lock AcquireLock(); - /// Return a constant reference to the given image view id [[nodiscard]] const ImageView& GetImageView(ImageViewId id) const noexcept; @@ -179,6 +176,8 @@ public: /// Return true when a CPU region is modified from the GPU [[nodiscard]] bool IsRegionGpuModified(VAddr addr, size_t size); + std::mutex mutex; + private: /// Iterate over all page indices in a range template @@ -212,8 +211,8 @@ private: void RefreshContents(Image& image); /// Upload data from guest to an image - template - void UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset); + template + void UploadImageContents(Image& image, StagingBuffer& staging_buffer, size_t buffer_offset); /// Find or create an image view from a guest descriptor [[nodiscard]] ImageViewId FindImageView(const TICEntry& config); @@ -325,8 +324,6 @@ private: RenderTargets render_targets; - std::mutex mutex; - std::unordered_map image_views; std::unordered_map samplers; std::unordered_map framebuffers; @@ -385,11 +382,6 @@ void TextureCache

::TickFrame() { ++frame_tick; } -template -std::unique_lock TextureCache

::AcquireLock() { - return std::unique_lock{mutex}; -} - template const typename P::ImageView& TextureCache

::GetImageView(ImageViewId id) const noexcept { return slot_image_views[id]; @@ -598,11 +590,11 @@ void TextureCache

::DownloadMemory(VAddr cpu_addr, size_t size) { }); for (const ImageId image_id : images) { Image& image = slot_images[image_id]; - auto map = runtime.MapDownloadBuffer(image.unswizzled_size_bytes); + auto map = runtime.DownloadStagingBuffer(image.unswizzled_size_bytes); const auto copies = FullDownloadCopies(image.info); image.DownloadMemory(map, 0, copies); runtime.Finish(); - SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.Span()); + SwizzleImage(gpu_memory, image.gpu_addr, image.info, copies, map.mapped_span); } } @@ -757,7 +749,7 @@ void TextureCache

::PopAsyncFlushes() { for (const ImageId image_id : download_ids) { total_size_bytes += slot_images[image_id].unswizzled_size_bytes; } - auto download_map = runtime.MapDownloadBuffer(total_size_bytes); + auto download_map = runtime.DownloadStagingBuffer(total_size_bytes); size_t buffer_offset = 0; for (const ImageId image_id : download_ids) { Image& image = slot_images[image_id]; @@ -769,7 +761,7 @@ void TextureCache

::PopAsyncFlushes() { runtime.Finish(); buffer_offset = 0; - const std::span download_span = download_map.Span(); + const std::span download_span = download_map.mapped_span; for (const ImageId image_id : download_ids) { const ImageBase& image = slot_images[image_id]; const auto copies = FullDownloadCopies(image.info); @@ -806,7 +798,7 @@ void TextureCache

::RefreshContents(Image& image) { LOG_WARNING(HW_GPU, "MSAA image uploads are not implemented"); return; } - auto map = runtime.MapUploadBuffer(MapSizeBytes(image)); + auto map = runtime.UploadStagingBuffer(MapSizeBytes(image)); UploadImageContents(image, map, 0); runtime.InsertUploadMemoryBarrier(); } @@ -814,7 +806,7 @@ void TextureCache

::RefreshContents(Image& image) { template template void TextureCache

::UploadImageContents(Image& image, MapBuffer& map, size_t buffer_offset) { - const std::span mapped_span = map.Span().subspan(buffer_offset); + const std::span mapped_span = map.mapped_span.subspan(buffer_offset); const GPUVAddr gpu_addr = image.gpu_addr; if (True(image.flags & ImageFlagBits::AcceleratedUpload)) { diff --git a/src/video_core/video_core.cpp b/src/video_core/video_core.cpp index 53444e945..e1b38c6ac 100755 --- a/src/video_core/video_core.cpp +++ b/src/video_core/video_core.cpp @@ -38,19 +38,18 @@ namespace VideoCore { std::unique_ptr CreateGPU(Core::Frontend::EmuWindow& emu_window, Core::System& system) { const bool use_nvdec = Settings::values.use_nvdec_emulation.GetValue(); - std::unique_ptr gpu = std::make_unique( - system, Settings::values.use_asynchronous_gpu_emulation.GetValue(), use_nvdec); - + const bool use_async = Settings::values.use_asynchronous_gpu_emulation.GetValue(); + auto gpu = std::make_unique(system, use_async, use_nvdec); auto context = emu_window.CreateSharedContext(); - const auto scope = context->Acquire(); - - auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context)); - if (!renderer->Init()) { + auto scope = context->Acquire(); + try { + auto renderer = CreateRenderer(system, emu_window, *gpu, std::move(context)); + gpu->BindRenderer(std::move(renderer)); + return gpu; + } catch (const std::runtime_error& exception) { + LOG_ERROR(HW_GPU, "Failed to initialize GPU: {}", exception.what()); return nullptr; } - - gpu->BindRenderer(std::move(renderer)); - return gpu; } u16 GetResolutionScaleFactor(const RendererBase& renderer) { diff --git a/src/video_core/vulkan_common/nsight_aftermath_tracker.h b/src/video_core/vulkan_common/nsight_aftermath_tracker.h index cee3847fb..3e4fec4fb 100755 --- a/src/video_core/vulkan_common/nsight_aftermath_tracker.h +++ b/src/video_core/vulkan_common/nsight_aftermath_tracker.h @@ -8,9 +8,6 @@ #include #include -#define VK_NO_PROTOTYPES -#include - #ifdef HAS_NSIGHT_AFTERMATH #include #include @@ -19,6 +16,7 @@ #include "common/common_types.h" #include "common/dynamic_library.h" +#include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { diff --git a/src/video_core/vulkan_common/vulkan_device.cpp b/src/video_core/vulkan_common/vulkan_device.cpp index 37d7b45a3..65086d71f 100755 --- a/src/video_core/vulkan_common/vulkan_device.cpp +++ b/src/video_core/vulkan_common/vulkan_device.cpp @@ -18,27 +18,22 @@ #include "video_core/vulkan_common/vulkan_wrapper.h" namespace Vulkan { - namespace { - namespace Alternatives { - -constexpr std::array Depth24UnormS8_UINT{ +constexpr std::array DEPTH24_UNORM_STENCIL8_UINT{ VK_FORMAT_D32_SFLOAT_S8_UINT, VK_FORMAT_D16_UNORM_S8_UINT, - VkFormat{}, + VK_FORMAT_UNDEFINED, }; -constexpr std::array Depth16UnormS8_UINT{ +constexpr std::array DEPTH16_UNORM_STENCIL8_UINT{ VK_FORMAT_D24_UNORM_S8_UINT, VK_FORMAT_D32_SFLOAT_S8_UINT, - VkFormat{}, + VK_FORMAT_UNDEFINED, }; - } // namespace Alternatives constexpr std::array REQUIRED_EXTENSIONS{ - VK_KHR_SWAPCHAIN_EXTENSION_NAME, VK_KHR_MAINTENANCE1_EXTENSION_NAME, VK_KHR_STORAGE_BUFFER_STORAGE_CLASS_EXTENSION_NAME, VK_KHR_SHADER_DRAW_PARAMETERS_EXTENSION_NAME, @@ -52,6 +47,12 @@ constexpr std::array REQUIRED_EXTENSIONS{ VK_EXT_SHADER_SUBGROUP_BALLOT_EXTENSION_NAME, VK_EXT_SHADER_SUBGROUP_VOTE_EXTENSION_NAME, VK_EXT_HOST_QUERY_RESET_EXTENSION_NAME, +#ifdef _WIN32 + VK_KHR_EXTERNAL_MEMORY_WIN32_EXTENSION_NAME, +#endif +#ifdef __linux__ + VK_KHR_EXTERNAL_MEMORY_FD_EXTENSION_NAME, +#endif }; template @@ -63,9 +64,9 @@ void SetNext(void**& next, T& data) { constexpr const VkFormat* GetFormatAlternatives(VkFormat format) { switch (format) { case VK_FORMAT_D24_UNORM_S8_UINT: - return Alternatives::Depth24UnormS8_UINT.data(); + return Alternatives::DEPTH24_UNORM_STENCIL8_UINT.data(); case VK_FORMAT_D16_UNORM_S8_UINT: - return Alternatives::Depth16UnormS8_UINT.data(); + return Alternatives::DEPTH16_UNORM_STENCIL8_UINT.data(); default: return nullptr; } @@ -210,78 +211,77 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR const vk::InstanceDispatch& dld_) : instance{instance_}, dld{dld_}, physical{physical_}, properties{physical.GetProperties()}, format_properties{GetFormatProperties(physical)} { - CheckSuitability(); + CheckSuitability(surface != nullptr); SetupFamilies(surface); SetupFeatures(); const auto queue_cis = GetDeviceQueueCreateInfos(); - const std::vector extensions = LoadExtensions(); + const std::vector extensions = LoadExtensions(surface != nullptr); VkPhysicalDeviceFeatures2 features2{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_FEATURES_2, .pNext = nullptr, - .features{}, + .features{ + .robustBufferAccess = false, + .fullDrawIndexUint32 = false, + .imageCubeArray = true, + .independentBlend = true, + .geometryShader = true, + .tessellationShader = true, + .sampleRateShading = false, + .dualSrcBlend = false, + .logicOp = false, + .multiDrawIndirect = false, + .drawIndirectFirstInstance = false, + .depthClamp = true, + .depthBiasClamp = true, + .fillModeNonSolid = false, + .depthBounds = false, + .wideLines = false, + .largePoints = true, + .alphaToOne = false, + .multiViewport = true, + .samplerAnisotropy = true, + .textureCompressionETC2 = false, + .textureCompressionASTC_LDR = is_optimal_astc_supported, + .textureCompressionBC = false, + .occlusionQueryPrecise = true, + .pipelineStatisticsQuery = false, + .vertexPipelineStoresAndAtomics = true, + .fragmentStoresAndAtomics = true, + .shaderTessellationAndGeometryPointSize = false, + .shaderImageGatherExtended = true, + .shaderStorageImageExtendedFormats = false, + .shaderStorageImageMultisample = is_shader_storage_image_multisample, + .shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported, + .shaderStorageImageWriteWithoutFormat = true, + .shaderUniformBufferArrayDynamicIndexing = false, + .shaderSampledImageArrayDynamicIndexing = false, + .shaderStorageBufferArrayDynamicIndexing = false, + .shaderStorageImageArrayDynamicIndexing = false, + .shaderClipDistance = false, + .shaderCullDistance = false, + .shaderFloat64 = false, + .shaderInt64 = false, + .shaderInt16 = false, + .shaderResourceResidency = false, + .shaderResourceMinLod = false, + .sparseBinding = false, + .sparseResidencyBuffer = false, + .sparseResidencyImage2D = false, + .sparseResidencyImage3D = false, + .sparseResidency2Samples = false, + .sparseResidency4Samples = false, + .sparseResidency8Samples = false, + .sparseResidency16Samples = false, + .sparseResidencyAliased = false, + .variableMultisampleRate = false, + .inheritedQueries = false, + }, }; const void* first_next = &features2; void** next = &features2.pNext; - features2.features = { - .robustBufferAccess = false, - .fullDrawIndexUint32 = false, - .imageCubeArray = true, - .independentBlend = true, - .geometryShader = true, - .tessellationShader = true, - .sampleRateShading = false, - .dualSrcBlend = false, - .logicOp = false, - .multiDrawIndirect = false, - .drawIndirectFirstInstance = false, - .depthClamp = true, - .depthBiasClamp = true, - .fillModeNonSolid = false, - .depthBounds = false, - .wideLines = false, - .largePoints = true, - .alphaToOne = false, - .multiViewport = true, - .samplerAnisotropy = true, - .textureCompressionETC2 = false, - .textureCompressionASTC_LDR = is_optimal_astc_supported, - .textureCompressionBC = false, - .occlusionQueryPrecise = true, - .pipelineStatisticsQuery = false, - .vertexPipelineStoresAndAtomics = true, - .fragmentStoresAndAtomics = true, - .shaderTessellationAndGeometryPointSize = false, - .shaderImageGatherExtended = true, - .shaderStorageImageExtendedFormats = false, - .shaderStorageImageMultisample = is_shader_storage_image_multisample, - .shaderStorageImageReadWithoutFormat = is_formatless_image_load_supported, - .shaderStorageImageWriteWithoutFormat = true, - .shaderUniformBufferArrayDynamicIndexing = false, - .shaderSampledImageArrayDynamicIndexing = false, - .shaderStorageBufferArrayDynamicIndexing = false, - .shaderStorageImageArrayDynamicIndexing = false, - .shaderClipDistance = false, - .shaderCullDistance = false, - .shaderFloat64 = false, - .shaderInt64 = false, - .shaderInt16 = false, - .shaderResourceResidency = false, - .shaderResourceMinLod = false, - .sparseBinding = false, - .sparseResidencyBuffer = false, - .sparseResidencyImage2D = false, - .sparseResidencyImage3D = false, - .sparseResidency2Samples = false, - .sparseResidency4Samples = false, - .sparseResidency8Samples = false, - .sparseResidency16Samples = false, - .sparseResidencyAliased = false, - .variableMultisampleRate = false, - .inheritedQueries = false, - }; VkPhysicalDeviceTimelineSemaphoreFeaturesKHR timeline_semaphore{ .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_TIMELINE_SEMAPHORE_FEATURES_KHR, .pNext = nullptr, @@ -399,7 +399,7 @@ Device::Device(VkInstance instance_, vk::PhysicalDevice physical_, VkSurfaceKHR robustness2 = { .sType = VK_STRUCTURE_TYPE_PHYSICAL_DEVICE_ROBUSTNESS_2_FEATURES_EXT, .pNext = nullptr, - .robustBufferAccess2 = false, + .robustBufferAccess2 = true, .robustImageAccess2 = true, .nullDescriptor = true, }; @@ -553,16 +553,18 @@ bool Device::IsFormatSupported(VkFormat wanted_format, VkFormatFeatureFlags want return (supported_usage & wanted_usage) == wanted_usage; } -void Device::CheckSuitability() const { +void Device::CheckSuitability(bool requires_swapchain) const { std::bitset available_extensions; + bool has_swapchain = false; for (const VkExtensionProperties& property : physical.EnumerateDeviceExtensionProperties()) { - for (std::size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { + const std::string_view name{property.extensionName}; + for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { if (available_extensions[i]) { continue; } - const std::string_view name{property.extensionName}; available_extensions[i] = name == REQUIRED_EXTENSIONS[i]; } + has_swapchain = has_swapchain || name == VK_KHR_SWAPCHAIN_EXTENSION_NAME; } for (size_t i = 0; i < REQUIRED_EXTENSIONS.size(); ++i) { if (available_extensions[i]) { @@ -571,6 +573,11 @@ void Device::CheckSuitability() const { LOG_ERROR(Render_Vulkan, "Missing required extension: {}", REQUIRED_EXTENSIONS[i]); throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); } + if (requires_swapchain && !has_swapchain) { + LOG_ERROR(Render_Vulkan, "Missing required extension: VK_KHR_swapchain"); + throw vk::Exception(VK_ERROR_EXTENSION_NOT_PRESENT); + } + struct LimitTuple { u32 minimum; u32 value; @@ -593,6 +600,7 @@ void Device::CheckSuitability() const { const VkPhysicalDeviceFeatures features{physical.GetFeatures()}; const std::array feature_report{ std::make_pair(features.vertexPipelineStoresAndAtomics, "vertexPipelineStoresAndAtomics"), + std::make_pair(features.robustBufferAccess, "robustBufferAccess"), std::make_pair(features.imageCubeArray, "imageCubeArray"), std::make_pair(features.independentBlend, "independentBlend"), std::make_pair(features.depthClamp, "depthClamp"), @@ -617,10 +625,13 @@ void Device::CheckSuitability() const { } } -std::vector Device::LoadExtensions() { +std::vector Device::LoadExtensions(bool requires_surface) { std::vector extensions; - extensions.reserve(7 + REQUIRED_EXTENSIONS.size()); + extensions.reserve(8 + REQUIRED_EXTENSIONS.size()); extensions.insert(extensions.begin(), REQUIRED_EXTENSIONS.begin(), REQUIRED_EXTENSIONS.end()); + if (requires_surface) { + extensions.push_back(VK_KHR_SWAPCHAIN_EXTENSION_NAME); + } bool has_khr_shader_float16_int8{}; bool has_ext_subgroup_size_control{}; @@ -761,7 +772,8 @@ std::vector Device::LoadExtensions() { robustness2.pNext = nullptr; features.pNext = &robustness2; physical.GetFeatures2KHR(features); - if (robustness2.nullDescriptor && robustness2.robustImageAccess2) { + if (robustness2.nullDescriptor && robustness2.robustBufferAccess2 && + robustness2.robustImageAccess2) { extensions.push_back(VK_EXT_ROBUSTNESS_2_EXTENSION_NAME); ext_robustness2 = true; } diff --git a/src/video_core/vulkan_common/vulkan_device.h b/src/video_core/vulkan_common/vulkan_device.h index 4b66dba7a..e0711f733 100755 --- a/src/video_core/vulkan_common/vulkan_device.h +++ b/src/video_core/vulkan_common/vulkan_device.h @@ -23,7 +23,7 @@ enum class FormatType { Linear, Optimal, Buffer }; const u32 GuestWarpSize = 32; /// Handles data specific to a physical device. -class Device final { +class Device { public: explicit Device(VkInstance instance, vk::PhysicalDevice physical, VkSurfaceKHR surface, const vk::InstanceDispatch& dld); @@ -227,10 +227,10 @@ public: private: /// Checks if the physical device is suitable. - void CheckSuitability() const; + void CheckSuitability(bool requires_swapchain) const; /// Loads extensions into a vector and stores available ones in this object. - std::vector LoadExtensions(); + std::vector LoadExtensions(bool requires_surface); /// Sets up queue families. void SetupFamilies(VkSurfaceKHR surface); diff --git a/src/video_core/vulkan_common/vulkan_instance.cpp b/src/video_core/vulkan_common/vulkan_instance.cpp index 889ecda0c..bfd6e6add 100755 --- a/src/video_core/vulkan_common/vulkan_instance.cpp +++ b/src/video_core/vulkan_common/vulkan_instance.cpp @@ -3,6 +3,7 @@ // Refer to the license.txt file included. #include +#include #include #include #include @@ -140,7 +141,10 @@ vk::Instance CreateInstance(const Common::DynamicLibrary& library, vk::InstanceD VK_VERSION_MAJOR(required_version), VK_VERSION_MINOR(required_version)); throw vk::Exception(VK_ERROR_INCOMPATIBLE_DRIVER); } - vk::Instance instance = vk::Instance::Create(required_version, layers, extensions, dld); + vk::Instance instance = + std::async([&] { + return vk::Instance::Create(required_version, layers, extensions, dld); + }).get(); if (!vk::Load(*instance, dld)) { LOG_ERROR(Render_Vulkan, "Failed to load Vulkan instance function pointers"); throw vk::Exception(VK_ERROR_INITIALIZATION_FAILED); diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp index d6eb3af31..2a8b7a907 100755 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.cpp +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.cpp @@ -7,6 +7,8 @@ #include #include +#include + #include "common/alignment.h" #include "common/assert.h" #include "common/common_types.h" @@ -55,10 +57,24 @@ struct Range { class MemoryAllocation { public: - explicit MemoryAllocation(const Device& device_, vk::DeviceMemory memory_, - VkMemoryPropertyFlags properties, u64 allocation_size_, u32 type) - : device{device_}, memory{std::move(memory_)}, allocation_size{allocation_size_}, - property_flags{properties}, shifted_memory_type{1U << type} {} + explicit MemoryAllocation(vk::DeviceMemory memory_, VkMemoryPropertyFlags properties, + u64 allocation_size_, u32 type) + : memory{std::move(memory_)}, allocation_size{allocation_size_}, property_flags{properties}, + shifted_memory_type{1U << type} {} + +#if defined(_WIN32) || defined(__linux__) + ~MemoryAllocation() { + if (owning_opengl_handle != 0) { + glDeleteMemoryObjectsEXT(1, &owning_opengl_handle); + } + } +#endif + + MemoryAllocation& operator=(const MemoryAllocation&) = delete; + MemoryAllocation(const MemoryAllocation&) = delete; + + MemoryAllocation& operator=(MemoryAllocation&&) = delete; + MemoryAllocation(MemoryAllocation&&) = delete; [[nodiscard]] std::optional Commit(VkDeviceSize size, VkDeviceSize alignment) { const std::optional alloc = FindFreeRegion(size, alignment); @@ -88,6 +104,31 @@ public: return memory_mapped_span; } +#ifdef _WIN32 + [[nodiscard]] u32 ExportOpenGLHandle() { + if (!owning_opengl_handle) { + glCreateMemoryObjectsEXT(1, &owning_opengl_handle); + glImportMemoryWin32HandleEXT(owning_opengl_handle, allocation_size, + GL_HANDLE_TYPE_OPAQUE_WIN32_EXT, + memory.GetMemoryWin32HandleKHR()); + } + return owning_opengl_handle; + } +#elif __linux__ + [[nodiscard]] u32 ExportOpenGLHandle() { + if (!owning_opengl_handle) { + glCreateMemoryObjectsEXT(1, &owning_opengl_handle); + glImportMemoryFdEXT(owning_opengl_handle, allocation_size, GL_HANDLE_TYPE_OPAQUE_FD_EXT, + memory.GetMemoryFdKHR()); + } + return owning_opengl_handle; + } +#else + [[nodiscard]] u32 ExportOpenGLHandle() { + return 0; + } +#endif + /// Returns whether this allocation is compatible with the arguments. [[nodiscard]] bool IsCompatible(VkMemoryPropertyFlags flags, u32 type_mask) const { return (flags & property_flags) && (type_mask & shifted_memory_type) != 0; @@ -118,13 +159,15 @@ private: return candidate; } - const Device& device; ///< Vulkan device. const vk::DeviceMemory memory; ///< Vulkan memory allocation handler. const u64 allocation_size; ///< Size of this allocation. const VkMemoryPropertyFlags property_flags; ///< Vulkan memory property flags. const u32 shifted_memory_type; ///< Shifted Vulkan memory type. std::vector commits; ///< All commit ranges done from this allocation. std::span memory_mapped_span; ///< Memory mapped span. Empty if not queried before. +#if defined(_WIN32) || defined(__linux__) + u32 owning_opengl_handle{}; ///< Owning OpenGL memory object handle. +#endif }; MemoryCommit::MemoryCommit(MemoryAllocation* allocation_, VkDeviceMemory memory_, u64 begin_, @@ -156,14 +199,19 @@ std::span MemoryCommit::Map() { return span; } +u32 MemoryCommit::ExportOpenGLHandle() const { + return allocation->ExportOpenGLHandle(); +} + void MemoryCommit::Release() { if (allocation) { allocation->Free(begin); } } -MemoryAllocator::MemoryAllocator(const Device& device_) - : device{device_}, properties{device_.GetPhysical().GetMemoryProperties()} {} +MemoryAllocator::MemoryAllocator(const Device& device_, bool export_allocations_) + : device{device_}, properties{device_.GetPhysical().GetMemoryProperties()}, + export_allocations{export_allocations_} {} MemoryAllocator::~MemoryAllocator() = default; @@ -196,14 +244,24 @@ MemoryCommit MemoryAllocator::Commit(const vk::Image& image, MemoryUsage usage) void MemoryAllocator::AllocMemory(VkMemoryPropertyFlags flags, u32 type_mask, u64 size) { const u32 type = FindType(flags, type_mask).value(); + const VkExportMemoryAllocateInfo export_allocate_info{ + .sType = VK_STRUCTURE_TYPE_EXPORT_MEMORY_ALLOCATE_INFO, + .pNext = nullptr, +#ifdef _WIN32 + .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT, +#elif __linux__ + .handleTypes = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT, +#else + .handleTypes = 0, +#endif + }; vk::DeviceMemory memory = device.GetLogical().AllocateMemory({ .sType = VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_INFO, - .pNext = nullptr, + .pNext = export_allocations ? &export_allocate_info : nullptr, .allocationSize = size, .memoryTypeIndex = type, }); - allocations.push_back( - std::make_unique(device, std::move(memory), flags, size, type)); + allocations.push_back(std::make_unique(std::move(memory), flags, size, type)); } std::optional MemoryAllocator::TryCommit(const VkMemoryRequirements& requirements, diff --git a/src/video_core/vulkan_common/vulkan_memory_allocator.h b/src/video_core/vulkan_common/vulkan_memory_allocator.h index 53b3b275a..85cd7ea86 100755 --- a/src/video_core/vulkan_common/vulkan_memory_allocator.h +++ b/src/video_core/vulkan_common/vulkan_memory_allocator.h @@ -43,6 +43,9 @@ public: /// It will map the backing allocation if it hasn't been mapped before. std::span Map(); + /// Returns an non-owning OpenGL handle, creating one if it doesn't exist. + u32 ExportOpenGLHandle() const; + /// Returns the Vulkan memory handler. VkDeviceMemory Memory() const { return memory; @@ -67,7 +70,15 @@ private: /// Allocates and releases memory allocations on demand. class MemoryAllocator { public: - explicit MemoryAllocator(const Device& device_); + /** + * Construct memory allocator + * + * @param device_ Device to allocate from + * @param export_allocations_ True when allocations have to be exported + * + * @throw vk::Exception on failure + */ + explicit MemoryAllocator(const Device& device_, bool export_allocations_); ~MemoryAllocator(); MemoryAllocator& operator=(const MemoryAllocator&) = delete; @@ -107,8 +118,9 @@ private: /// Returns index to the fastest memory type compatible with the passed requirements. std::optional FindType(VkMemoryPropertyFlags flags, u32 type_mask) const; - const Device& device; ///< Device handle. - const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. + const Device& device; ///< Device handle. + const VkPhysicalDeviceMemoryProperties properties; ///< Physical device properties. + const bool export_allocations; ///< True when memory allocations have to be exported. std::vector> allocations; ///< Current allocations. }; diff --git a/src/video_core/vulkan_common/vulkan_wrapper.cpp b/src/video_core/vulkan_common/vulkan_wrapper.cpp index 5e15ad607..d39bbdc70 100755 --- a/src/video_core/vulkan_common/vulkan_wrapper.cpp +++ b/src/video_core/vulkan_common/vulkan_wrapper.cpp @@ -173,6 +173,10 @@ void Load(VkDevice device, DeviceDispatch& dld) noexcept { X(vkGetEventStatus); X(vkGetFenceStatus); X(vkGetImageMemoryRequirements); + X(vkGetMemoryFdKHR); +#ifdef _WIN32 + X(vkGetMemoryWin32HandleKHR); +#endif X(vkGetQueryPoolResults); X(vkGetSemaphoreCounterValueKHR); X(vkMapMemory); @@ -505,6 +509,32 @@ void ImageView::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_IMAGE_VIEW, name); } +int DeviceMemory::GetMemoryFdKHR() const { + const VkMemoryGetFdInfoKHR get_fd_info{ + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_FD_INFO_KHR, + .pNext = nullptr, + .memory = handle, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_FD_BIT_KHR, + }; + int fd; + Check(dld->vkGetMemoryFdKHR(owner, &get_fd_info, &fd)); + return fd; +} + +#ifdef _WIN32 +HANDLE DeviceMemory::GetMemoryWin32HandleKHR() const { + const VkMemoryGetWin32HandleInfoKHR get_win32_handle_info{ + .sType = VK_STRUCTURE_TYPE_MEMORY_GET_WIN32_HANDLE_INFO_KHR, + .pNext = nullptr, + .memory = handle, + .handleType = VK_EXTERNAL_MEMORY_HANDLE_TYPE_OPAQUE_WIN32_BIT_KHR, + }; + HANDLE win32_handle; + Check(dld->vkGetMemoryWin32HandleKHR(owner, &get_win32_handle_info, &win32_handle)); + return win32_handle; +} +#endif + void DeviceMemory::SetObjectNameEXT(const char* name) const { SetObjectName(dld, owner, handle, VK_OBJECT_TYPE_DEVICE_MEMORY, name); } diff --git a/src/video_core/vulkan_common/vulkan_wrapper.h b/src/video_core/vulkan_common/vulkan_wrapper.h index 9689de0cb..7f781b081 100755 --- a/src/video_core/vulkan_common/vulkan_wrapper.h +++ b/src/video_core/vulkan_common/vulkan_wrapper.h @@ -15,8 +15,19 @@ #include #define VK_NO_PROTOTYPES +#ifdef _WIN32 +#define VK_USE_PLATFORM_WIN32_KHR +#endif #include +// Sanitize macros +#ifdef CreateEvent +#undef CreateEvent +#endif +#ifdef CreateSemaphore +#undef CreateSemaphore +#endif + #include "common/common_types.h" #ifdef _MSC_VER @@ -174,7 +185,7 @@ struct InstanceDispatch { }; /// Table holding Vulkan device function pointers. -struct DeviceDispatch : public InstanceDispatch { +struct DeviceDispatch : InstanceDispatch { PFN_vkAcquireNextImageKHR vkAcquireNextImageKHR{}; PFN_vkAllocateCommandBuffers vkAllocateCommandBuffers{}; PFN_vkAllocateDescriptorSets vkAllocateDescriptorSets{}; @@ -277,6 +288,10 @@ struct DeviceDispatch : public InstanceDispatch { PFN_vkGetEventStatus vkGetEventStatus{}; PFN_vkGetFenceStatus vkGetFenceStatus{}; PFN_vkGetImageMemoryRequirements vkGetImageMemoryRequirements{}; + PFN_vkGetMemoryFdKHR vkGetMemoryFdKHR{}; +#ifdef _WIN32 + PFN_vkGetMemoryWin32HandleKHR vkGetMemoryWin32HandleKHR{}; +#endif PFN_vkGetQueryPoolResults vkGetQueryPoolResults{}; PFN_vkGetSemaphoreCounterValueKHR vkGetSemaphoreCounterValueKHR{}; PFN_vkMapMemory vkMapMemory{}; @@ -344,6 +359,9 @@ public: /// Construct an empty handle. Handle() = default; + /// Construct an empty handle. + Handle(std::nullptr_t) {} + /// Copying Vulkan objects is not supported and will never be. Handle(const Handle&) = delete; Handle& operator=(const Handle&) = delete; @@ -659,6 +677,12 @@ class DeviceMemory : public Handle { using Handle::Handle; public: + int GetMemoryFdKHR() const; + +#ifdef _WIN32 + HANDLE GetMemoryWin32HandleKHR() const; +#endif + /// Set object name. void SetObjectNameEXT(const char* name) const; @@ -1031,6 +1055,12 @@ public: PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, {}, {}, {}); } + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, + VkDependencyFlags dependency_flags, + const VkMemoryBarrier& memory_barrier) const noexcept { + PipelineBarrier(src_stage_mask, dst_stage_mask, dependency_flags, memory_barrier, {}, {}); + } + void PipelineBarrier(VkPipelineStageFlags src_stage_mask, VkPipelineStageFlags dst_stage_mask, VkDependencyFlags dependency_flags, const VkBufferMemoryBarrier& buffer_barrier) const noexcept { diff --git a/src/yuzu/CMakeLists.txt b/src/yuzu/CMakeLists.txt index e1bab2112..6802be295 100755 --- a/src/yuzu/CMakeLists.txt +++ b/src/yuzu/CMakeLists.txt @@ -71,6 +71,8 @@ add_executable(yuzu configuration/configure_input_player.cpp configuration/configure_input_player.h configuration/configure_input_player.ui + configuration/configure_input_player_widget.cpp + configuration/configure_input_player_widget.h configuration/configure_input_profile_dialog.cpp configuration/configure_input_profile_dialog.h configuration/configure_input_profile_dialog.ui diff --git a/src/yuzu/bootmanager.cpp b/src/yuzu/bootmanager.cpp index e6c8f18af..5bb55bc45 100755 --- a/src/yuzu/bootmanager.cpp +++ b/src/yuzu/bootmanager.cpp @@ -64,7 +64,7 @@ void EmuThread::run() { emit LoadProgress(VideoCore::LoadCallbackStage::Prepare, 0, 0); - system.Renderer().Rasterizer().LoadDiskResources( + system.Renderer().ReadRasterizer()->LoadDiskResources( system.CurrentProcess()->GetTitleID(), stop_run, [this](VideoCore::LoadCallbackStage stage, std::size_t value, std::size_t total) { emit LoadProgress(stage, value, total); @@ -394,7 +394,7 @@ void GRenderWindow::mousePressEvent(QMouseEvent* event) { input_subsystem->GetMouse()->PressButton(x, y, event->button()); if (event->button() == Qt::LeftButton) { - this->TouchPressed(x, y); + this->TouchPressed(x, y, 0); } emit MouseActivity(); @@ -409,7 +409,7 @@ void GRenderWindow::mouseMoveEvent(QMouseEvent* event) { auto pos = event->pos(); const auto [x, y] = ScaleTouch(pos); input_subsystem->GetMouse()->MouseMove(x, y); - this->TouchMoved(x, y); + this->TouchMoved(x, y, 0); emit MouseActivity(); } @@ -423,36 +423,72 @@ void GRenderWindow::mouseReleaseEvent(QMouseEvent* event) { input_subsystem->GetMouse()->ReleaseButton(event->button()); if (event->button() == Qt::LeftButton) { - this->TouchReleased(); + this->TouchReleased(0); } } void GRenderWindow::TouchBeginEvent(const QTouchEvent* event) { - // TouchBegin always has exactly one touch point, so take the .first() - const auto [x, y] = ScaleTouch(event->touchPoints().first().pos()); - this->TouchPressed(x, y); + QList touch_points = event->touchPoints(); + for (const auto& touch_point : touch_points) { + if (!TouchUpdate(touch_point)) { + TouchStart(touch_point); + } + } } void GRenderWindow::TouchUpdateEvent(const QTouchEvent* event) { - QPointF pos; - int active_points = 0; - - // average all active touch points - for (const auto& tp : event->touchPoints()) { - if (tp.state() & (Qt::TouchPointPressed | Qt::TouchPointMoved | Qt::TouchPointStationary)) { - active_points++; - pos += tp.pos(); + QList touch_points = event->touchPoints(); + for (const auto& touch_point : touch_points) { + if (!TouchUpdate(touch_point)) { + TouchStart(touch_point); + } + } + // Release all inactive points + for (std::size_t id = 0; id < touch_ids.size(); ++id) { + if (!TouchExist(touch_ids[id], touch_points)) { + touch_ids[id] = 0; + this->TouchReleased(id + 1); } } - - pos /= active_points; - - const auto [x, y] = ScaleTouch(pos); - this->TouchMoved(x, y); } void GRenderWindow::TouchEndEvent() { - this->TouchReleased(); + for (std::size_t id = 0; id < touch_ids.size(); ++id) { + if (touch_ids[id] != 0) { + touch_ids[id] = 0; + this->TouchReleased(id + 1); + } + } +} + +bool GRenderWindow::TouchStart(const QTouchEvent::TouchPoint& touch_point) { + for (std::size_t id = 0; id < touch_ids.size(); ++id) { + if (touch_ids[id] == 0) { + touch_ids[id] = touch_point.id() + 1; + const auto [x, y] = ScaleTouch(touch_point.pos()); + this->TouchPressed(x, y, id + 1); + return true; + } + } + return false; +} + +bool GRenderWindow::TouchUpdate(const QTouchEvent::TouchPoint& touch_point) { + for (std::size_t id = 0; id < touch_ids.size(); ++id) { + if (touch_ids[id] == static_cast(touch_point.id() + 1)) { + const auto [x, y] = ScaleTouch(touch_point.pos()); + this->TouchMoved(x, y, id + 1); + return true; + } + } + return false; +} + +bool GRenderWindow::TouchExist(std::size_t id, + const QList& touch_points) const { + return std::any_of(touch_points.begin(), touch_points.end(), [id](const auto& point) { + return id == static_cast(point.id() + 1); + }); } bool GRenderWindow::event(QEvent* event) { diff --git a/src/yuzu/bootmanager.h b/src/yuzu/bootmanager.h index 339095509..b5ec7de07 100755 --- a/src/yuzu/bootmanager.h +++ b/src/yuzu/bootmanager.h @@ -11,6 +11,7 @@ #include #include +#include #include #include @@ -21,7 +22,6 @@ class GRenderWindow; class GMainWindow; class QKeyEvent; -class QTouchEvent; class QStringList; namespace InputCommon { @@ -191,6 +191,10 @@ private: void TouchUpdateEvent(const QTouchEvent* event); void TouchEndEvent(); + bool TouchStart(const QTouchEvent::TouchPoint& touch_point); + bool TouchUpdate(const QTouchEvent::TouchPoint& touch_point); + bool TouchExist(std::size_t id, const QList& touch_points) const; + void OnMinimalClientAreaChangeRequest(std::pair minimal_size) override; bool InitializeOpenGL(); @@ -215,6 +219,8 @@ private: bool first_frame = false; + std::array touch_ids{}; + protected: void showEvent(QShowEvent* event) override; bool eventFilter(QObject* object, QEvent* event) override; diff --git a/src/yuzu/configuration/config.cpp b/src/yuzu/configuration/config.cpp index cda448718..5e4d22299 100755 --- a/src/yuzu/configuration/config.cpp +++ b/src/yuzu/configuration/config.cpp @@ -464,13 +464,7 @@ void Config::ReadMouseValues() { void Config::ReadTouchscreenValues() { Settings::values.touchscreen.enabled = ReadSetting(QStringLiteral("touchscreen_enabled"), true).toBool(); - Settings::values.touchscreen.device = - ReadSetting(QStringLiteral("touchscreen_device"), QStringLiteral("engine:emu_window")) - .toString() - .toStdString(); - Settings::values.touchscreen.finger = - ReadSetting(QStringLiteral("touchscreen_finger"), 0).toUInt(); Settings::values.touchscreen.rotation_angle = ReadSetting(QStringLiteral("touchscreen_angle"), 0).toUInt(); Settings::values.touchscreen.diameter_x = @@ -563,7 +557,8 @@ void Config::ReadMotionTouchValues() { .toString() .toStdString(); Settings::values.touch_device = - ReadSetting(QStringLiteral("touch_device"), QStringLiteral("engine:emu_window")) + ReadSetting(QStringLiteral("touch_device"), + QStringLiteral("min_x:100,min_y:50,max_x:1800,max_y:850")) .toString() .toStdString(); Settings::values.use_touch_from_button = @@ -1087,10 +1082,7 @@ void Config::SaveTouchscreenValues() { const auto& touchscreen = Settings::values.touchscreen; WriteSetting(QStringLiteral("touchscreen_enabled"), touchscreen.enabled, true); - WriteSetting(QStringLiteral("touchscreen_device"), QString::fromStdString(touchscreen.device), - QStringLiteral("engine:emu_window")); - WriteSetting(QStringLiteral("touchscreen_finger"), touchscreen.finger, 0); WriteSetting(QStringLiteral("touchscreen_angle"), touchscreen.rotation_angle, 0); WriteSetting(QStringLiteral("touchscreen_diameter_x"), touchscreen.diameter_x, 15); WriteSetting(QStringLiteral("touchscreen_diameter_y"), touchscreen.diameter_y, 15); diff --git a/src/yuzu/configuration/configure_graphics.cpp b/src/yuzu/configuration/configure_graphics.cpp index b78a5dff0..9ff32aec4 100755 --- a/src/yuzu/configuration/configure_graphics.cpp +++ b/src/yuzu/configuration/configure_graphics.cpp @@ -2,6 +2,9 @@ // Licensed under GPLv2 or any later version // Refer to the license.txt file included. +// Include this early to include Vulkan headers how we want to +#include "video_core/vulkan_common/vulkan_wrapper.h" + #include #include #include @@ -11,7 +14,8 @@ #include "core/core.h" #include "core/settings.h" #include "ui_configure_graphics.h" -#include "video_core/renderer_vulkan/renderer_vulkan.h" +#include "video_core/vulkan_common/vulkan_instance.h" +#include "video_core/vulkan_common/vulkan_library.h" #include "yuzu/configuration/configuration_shared.h" #include "yuzu/configuration/configure_graphics.h" @@ -212,11 +216,23 @@ void ConfigureGraphics::UpdateDeviceComboBox() { ui->device->setEnabled(enabled && !Core::System::GetInstance().IsPoweredOn()); } -void ConfigureGraphics::RetrieveVulkanDevices() { +void ConfigureGraphics::RetrieveVulkanDevices() try { + using namespace Vulkan; + + vk::InstanceDispatch dld; + const Common::DynamicLibrary library = OpenLibrary(); + const vk::Instance instance = CreateInstance(library, dld, VK_API_VERSION_1_0); + const std::vector physical_devices = instance.EnumeratePhysicalDevices(); + vulkan_devices.clear(); - for (const auto& name : Vulkan::RendererVulkan::EnumerateDevices()) { + vulkan_devices.reserve(physical_devices.size()); + for (const VkPhysicalDevice device : physical_devices) { + const char* const name = vk::PhysicalDevice(device, dld).GetProperties().deviceName; vulkan_devices.push_back(QString::fromStdString(name)); } + +} catch (const Vulkan::vk::Exception& exception) { + LOG_ERROR(Frontend, "Failed to enumerate devices with error: {}", exception.what()); } Settings::RendererBackend ConfigureGraphics::GetCurrentGraphicsBackend() const { diff --git a/src/yuzu/configuration/configure_input_player.cpp b/src/yuzu/configuration/configure_input_player.cpp index 46ea026e4..0a79993eb 100755 --- a/src/yuzu/configuration/configure_input_player.cpp +++ b/src/yuzu/configuration/configure_input_player.cpp @@ -23,6 +23,7 @@ #include "ui_configure_input_player.h" #include "yuzu/configuration/config.h" #include "yuzu/configuration/configure_input_player.h" +#include "yuzu/configuration/configure_input_player_widget.h" #include "yuzu/configuration/configure_vibration.h" #include "yuzu/configuration/input_profiles.h" #include "yuzu/util/limitable_input_dialog.h" @@ -254,11 +255,12 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i analog_map_range_groupbox = {ui->buttonLStickRangeGroup, ui->buttonRStickRangeGroup}; analog_map_range_spinbox = {ui->spinboxLStickRange, ui->spinboxRStickRange}; - const auto ConfigureButtonClick = [&](QPushButton* button, Common::ParamPackage* param, - int default_val, InputCommon::Polling::DeviceType type) { + const auto ConfigureButtonClick = [&](QPushButton* button, std::size_t button_id, + Common::ParamPackage* param, int default_val, + InputCommon::Polling::DeviceType type) { connect(button, &QPushButton::clicked, [=, this] { HandleClick( - button, + button, button_id, [=, this](Common::ParamPackage params) { // Workaround for ZL & ZR for analog triggers like on XBOX // controllers. Analog triggers (from controllers like the XBOX @@ -286,12 +288,11 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i continue; } - ConfigureButtonClick(button_map[button_id], &buttons_param[button_id], + ConfigureButtonClick(button_map[button_id], button_id, &buttons_param[button_id], Config::default_buttons[button_id], InputCommon::Polling::DeviceType::Button); button->setContextMenuPolicy(Qt::CustomContextMenu); - connect(button, &QPushButton::customContextMenuRequested, [=, this](const QPoint& menu_location) { QMenu context_menu; @@ -300,6 +301,7 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i button_map[button_id]->setText(tr("[not set]")); }); context_menu.exec(button_map[button_id]->mapToGlobal(menu_location)); + ui->controllerFrame->SetPlayerInput(player_index, buttons_param, analogs_param); }); } @@ -309,7 +311,7 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i continue; } - ConfigureButtonClick(motion_map[motion_id], &motions_param[motion_id], + ConfigureButtonClick(motion_map[motion_id], motion_id, &motions_param[motion_id], Config::default_motions[motion_id], InputCommon::Polling::DeviceType::Motion); @@ -348,7 +350,7 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i } } HandleClick( - analog_map_buttons[analog_id][sub_button_id], + analog_map_buttons[analog_id][sub_button_id], analog_id, [=, this](const Common::ParamPackage& params) { SetAnalogParam(params, analogs_param[analog_id], analog_sub_buttons[sub_button_id]); @@ -358,41 +360,43 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i analog_button->setContextMenuPolicy(Qt::CustomContextMenu); - connect(analog_button, &QPushButton::customContextMenuRequested, - [=, this](const QPoint& menu_location) { - QMenu context_menu; - context_menu.addAction(tr("Clear"), [&] { - analogs_param[analog_id].Clear(); - analog_map_buttons[analog_id][sub_button_id]->setText(tr("[not set]")); - }); - context_menu.addAction(tr("Invert axis"), [&] { - if (sub_button_id == 2 || sub_button_id == 3) { - const bool invert_value = - analogs_param[analog_id].Get("invert_x", "+") == "-"; - const std::string invert_str = invert_value ? "+" : "-"; - analogs_param[analog_id].Set("invert_x", invert_str); - } - if (sub_button_id == 0 || sub_button_id == 1) { - const bool invert_value = - analogs_param[analog_id].Get("invert_y", "+") == "-"; - const std::string invert_str = invert_value ? "+" : "-"; - analogs_param[analog_id].Set("invert_y", invert_str); - } - for (int sub_button_id = 0; sub_button_id < ANALOG_SUB_BUTTONS_NUM; - ++sub_button_id) { - analog_map_buttons[analog_id][sub_button_id]->setText(AnalogToText( - analogs_param[analog_id], analog_sub_buttons[sub_button_id])); - } - }); - context_menu.exec(analog_map_buttons[analog_id][sub_button_id]->mapToGlobal( - menu_location)); + connect( + analog_button, &QPushButton::customContextMenuRequested, + [=, this](const QPoint& menu_location) { + QMenu context_menu; + context_menu.addAction(tr("Clear"), [&] { + analogs_param[analog_id].Clear(); + analog_map_buttons[analog_id][sub_button_id]->setText(tr("[not set]")); }); + context_menu.addAction(tr("Invert axis"), [&] { + if (sub_button_id == 2 || sub_button_id == 3) { + const bool invert_value = + analogs_param[analog_id].Get("invert_x", "+") == "-"; + const std::string invert_str = invert_value ? "+" : "-"; + analogs_param[analog_id].Set("invert_x", invert_str); + } + if (sub_button_id == 0 || sub_button_id == 1) { + const bool invert_value = + analogs_param[analog_id].Get("invert_y", "+") == "-"; + const std::string invert_str = invert_value ? "+" : "-"; + analogs_param[analog_id].Set("invert_y", invert_str); + } + for (int sub_button_id = 0; sub_button_id < ANALOG_SUB_BUTTONS_NUM; + ++sub_button_id) { + analog_map_buttons[analog_id][sub_button_id]->setText(AnalogToText( + analogs_param[analog_id], analog_sub_buttons[sub_button_id])); + } + }); + context_menu.exec( + analog_map_buttons[analog_id][sub_button_id]->mapToGlobal(menu_location)); + ui->controllerFrame->SetPlayerInput(player_index, buttons_param, analogs_param); + }); } // Handle clicks for the modifier buttons as well. connect(analog_map_modifier_button[analog_id], &QPushButton::clicked, [=, this] { HandleClick( - analog_map_modifier_button[analog_id], + analog_map_modifier_button[analog_id], analog_id, [=, this](const Common::ParamPackage& params) { analogs_param[analog_id].Set("modifier", params.Serialize()); }, @@ -416,12 +420,14 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i [=, this] { const auto spinbox_value = analog_map_range_spinbox[analog_id]->value(); analogs_param[analog_id].Set("range", spinbox_value / 100.0f); + ui->controllerFrame->SetPlayerInput(player_index, buttons_param, analogs_param); }); connect(analog_map_deadzone_slider[analog_id], &QSlider::valueChanged, [=, this] { const auto slider_value = analog_map_deadzone_slider[analog_id]->value(); analog_map_deadzone_label[analog_id]->setText(tr("Deadzone: %1%").arg(slider_value)); analogs_param[analog_id].Set("deadzone", slider_value / 100.0f); + ui->controllerFrame->SetPlayerInput(player_index, buttons_param, analogs_param); }); connect(analog_map_modifier_slider[analog_id], &QSlider::valueChanged, [=, this] { @@ -433,8 +439,10 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i } // Player Connected checkbox - connect(ui->groupConnectedController, &QGroupBox::toggled, - [this](bool checked) { emit Connected(checked); }); + connect(ui->groupConnectedController, &QGroupBox::toggled, [this](bool checked) { + emit Connected(checked); + ui->controllerFrame->SetConnectedStatus(checked); + }); if (player_index == 0) { connect(ui->comboControllerType, qOverload(&QComboBox::currentIndexChanged), @@ -553,6 +561,8 @@ ConfigureInputPlayer::ConfigureInputPlayer(QWidget* parent, std::size_t player_i // TODO(wwylele): enable this when we actually emulate it ui->buttonHome->setEnabled(false); + ui->controllerFrame->SetPlayerInput(player_index, buttons_param, analogs_param); + ui->controllerFrame->SetConnectedStatus(ui->groupConnectedController->isChecked()); } ConfigureInputPlayer::~ConfigureInputPlayer() = default; @@ -849,6 +859,7 @@ void ConfigureInputPlayer::UpdateUI() { modifier_label->setVisible(!is_controller); modifier_slider->setVisible(!is_controller); range_groupbox->setVisible(is_controller); + ui->controllerFrame->SetPlayerInput(player_index, buttons_param, analogs_param); } } @@ -965,8 +976,8 @@ void ConfigureInputPlayer::UpdateControllerIcon() { return QString{}; } }(); - - ui->controllerFrame->setStyleSheet(stylesheet.arg(theme)); + ui->controllerFrame->SetControllerType( + GetControllerTypeFromIndex(ui->comboControllerType->currentIndex())); } void ConfigureInputPlayer::UpdateControllerAvailableButtons() { @@ -1103,7 +1114,8 @@ void ConfigureInputPlayer::UpdateMappingWithDefaults() { } void ConfigureInputPlayer::HandleClick( - QPushButton* button, std::function new_input_setter, + QPushButton* button, std::size_t button_id, + std::function new_input_setter, InputCommon::Polling::DeviceType type) { if (button == ui->buttonMotionLeft || button == ui->buttonMotionRight) { button->setText(tr("Shake!")); @@ -1147,6 +1159,12 @@ void ConfigureInputPlayer::HandleClick( input_subsystem->GetMouseTouch()->BeginConfiguration(); } + if (type == InputCommon::Polling::DeviceType::Button) { + ui->controllerFrame->BeginMappingButton(button_id); + } else if (type == InputCommon::Polling::DeviceType::AnalogPreferred) { + ui->controllerFrame->BeginMappingAnalog(button_id); + } + timeout_timer->start(2500); // Cancel after 2.5 seconds poll_timer->start(50); // Check for new inputs every 50ms } @@ -1177,6 +1195,7 @@ void ConfigureInputPlayer::SetPollingResult(const Common::ParamPackage& params, UpdateUI(); UpdateInputDeviceCombobox(); + ui->controllerFrame->EndMapping(); input_setter = std::nullopt; } diff --git a/src/yuzu/configuration/configure_input_player.h b/src/yuzu/configuration/configure_input_player.h index c4ae50de7..da2b89136 100755 --- a/src/yuzu/configuration/configure_input_player.h +++ b/src/yuzu/configuration/configure_input_player.h @@ -106,7 +106,7 @@ private: void LoadConfiguration(); /// Called when the button was pressed. - void HandleClick(QPushButton* button, + void HandleClick(QPushButton* button, std::size_t button_id, std::function new_input_setter, InputCommon::Polling::DeviceType type); diff --git a/src/yuzu/configuration/configure_input_player.ui b/src/yuzu/configuration/configure_input_player.ui index 1e78b4c10..e76aa484f 100755 --- a/src/yuzu/configuration/configure_input_player.ui +++ b/src/yuzu/configuration/configure_input_player.ui @@ -1964,39 +1964,39 @@ - - - - - 0 - 0 - - - - - 75 - true - - - - image: url(:/controller/pro); - - - - 0 - - - 0 - - - 0 - - - 0 - - - - + + + + + 0 + 0 + + + + + 75 + true + + + + image: url(:/controller/pro); + + + + 0 + + + 0 + + + 0 + + + 0 + + + + @@ -3087,6 +3087,14 @@ + + + PlayerControlPreview + QFrame +

yuzu/configuration/configure_input_player_widget.h
+ 1 + + diff --git a/src/yuzu/configuration/configure_motion_touch.cpp b/src/yuzu/configuration/configure_motion_touch.cpp index caaa85930..1f2b792e4 100755 --- a/src/yuzu/configuration/configure_motion_touch.cpp +++ b/src/yuzu/configuration/configure_motion_touch.cpp @@ -81,19 +81,11 @@ void CalibrationConfigurationDialog::UpdateButtonText(const QString& text) { cancel_button->setText(text); } -constexpr std::array, 2> TouchProviders = {{ - {"emu_window", QT_TRANSLATE_NOOP("ConfigureMotionTouch", "Emulator Window")}, - {"cemuhookudp", QT_TRANSLATE_NOOP("ConfigureMotionTouch", "CemuhookUDP")}, -}}; - ConfigureMotionTouch::ConfigureMotionTouch(QWidget* parent, InputCommon::InputSubsystem* input_subsystem_) : QDialog(parent), input_subsystem{input_subsystem_}, ui(std::make_unique()) { ui->setupUi(this); - for (const auto& [provider, name] : TouchProviders) { - ui->touch_provider->addItem(tr(name), QString::fromUtf8(provider)); - } ui->udp_learn_more->setOpenExternalLinks(true); ui->udp_learn_more->setText( @@ -112,10 +104,7 @@ ConfigureMotionTouch::~ConfigureMotionTouch() = default; void ConfigureMotionTouch::SetConfiguration() { const Common::ParamPackage motion_param(Settings::values.motion_device); const Common::ParamPackage touch_param(Settings::values.touch_device); - const std::string touch_engine = touch_param.Get("engine", "emu_window"); - ui->touch_provider->setCurrentIndex( - ui->touch_provider->findData(QString::fromStdString(touch_engine))); ui->touch_from_button_checkbox->setChecked(Settings::values.use_touch_from_button); touch_from_button_maps = Settings::values.touch_from_button_maps; for (const auto& touch_map : touch_from_button_maps) { @@ -148,30 +137,21 @@ void ConfigureMotionTouch::SetConfiguration() { } void ConfigureMotionTouch::UpdateUiDisplay() { - const QString touch_engine = ui->touch_provider->currentData().toString(); const QString cemuhook_udp = QStringLiteral("cemuhookudp"); ui->motion_sensitivity_label->setVisible(true); ui->motion_sensitivity->setVisible(true); - if (touch_engine == cemuhook_udp) { - ui->touch_calibration->setVisible(true); - ui->touch_calibration_config->setVisible(true); - ui->touch_calibration_label->setVisible(true); - ui->touch_calibration->setText( - QStringLiteral("(%1, %2) - (%3, %4)").arg(min_x).arg(min_y).arg(max_x).arg(max_y)); - } else { - ui->touch_calibration->setVisible(false); - ui->touch_calibration_config->setVisible(false); - ui->touch_calibration_label->setVisible(false); - } + ui->touch_calibration->setVisible(true); + ui->touch_calibration_config->setVisible(true); + ui->touch_calibration_label->setVisible(true); + ui->touch_calibration->setText( + QStringLiteral("(%1, %2) - (%3, %4)").arg(min_x).arg(min_y).arg(max_x).arg(max_y)); ui->udp_config_group_box->setVisible(true); } void ConfigureMotionTouch::ConnectEvents() { - connect(ui->touch_provider, qOverload(&QComboBox::currentIndexChanged), this, - [this](int index) { UpdateUiDisplay(); }); connect(ui->udp_test, &QPushButton::clicked, this, &ConfigureMotionTouch::OnCemuhookUDPTest); connect(ui->udp_add, &QPushButton::clicked, this, &ConfigureMotionTouch::OnUDPAddServer); connect(ui->udp_remove, &QPushButton::clicked, this, &ConfigureMotionTouch::OnUDPDeleteServer); @@ -327,16 +307,11 @@ void ConfigureMotionTouch::ApplyConfiguration() { return; } - std::string touch_engine = ui->touch_provider->currentData().toString().toStdString(); - Common::ParamPackage touch_param{}; - if (touch_engine == "cemuhookudp") { - touch_param.Set("min_x", min_x); - touch_param.Set("min_y", min_y); - touch_param.Set("max_x", max_x); - touch_param.Set("max_y", max_y); - } - touch_param.Set("engine", std::move(touch_engine)); + touch_param.Set("min_x", min_x); + touch_param.Set("min_y", min_y); + touch_param.Set("max_x", max_x); + touch_param.Set("max_y", max_y); Settings::values.touch_device = touch_param.Serialize(); Settings::values.use_touch_from_button = ui->touch_from_button_checkbox->isChecked(); diff --git a/src/yuzu/configuration/configure_motion_touch.ui b/src/yuzu/configuration/configure_motion_touch.ui index ebca835ac..1e35ea946 100755 --- a/src/yuzu/configuration/configure_motion_touch.ui +++ b/src/yuzu/configuration/configure_motion_touch.ui @@ -65,26 +65,12 @@ Touch - - - - - - Touch Provider: - - - - - - - - - Calibration: + UDP Calibration: diff --git a/src/yuzu/configuration/configure_touchscreen_advanced.cpp b/src/yuzu/configuration/configure_touchscreen_advanced.cpp index 7d7cc00b7..29c86c7bc 100755 --- a/src/yuzu/configuration/configure_touchscreen_advanced.cpp +++ b/src/yuzu/configuration/configure_touchscreen_advanced.cpp @@ -33,21 +33,18 @@ void ConfigureTouchscreenAdvanced::RetranslateUI() { } void ConfigureTouchscreenAdvanced::ApplyConfiguration() { - Settings::values.touchscreen.finger = ui->finger_box->value(); Settings::values.touchscreen.diameter_x = ui->diameter_x_box->value(); Settings::values.touchscreen.diameter_y = ui->diameter_y_box->value(); Settings::values.touchscreen.rotation_angle = ui->angle_box->value(); } void ConfigureTouchscreenAdvanced::LoadConfiguration() { - ui->finger_box->setValue(Settings::values.touchscreen.finger); ui->diameter_x_box->setValue(Settings::values.touchscreen.diameter_x); ui->diameter_y_box->setValue(Settings::values.touchscreen.diameter_y); ui->angle_box->setValue(Settings::values.touchscreen.rotation_angle); } void ConfigureTouchscreenAdvanced::RestoreDefaults() { - ui->finger_box->setValue(0); ui->diameter_x_box->setValue(15); ui->diameter_y_box->setValue(15); ui->angle_box->setValue(0); diff --git a/src/yuzu/configuration/configure_touchscreen_advanced.ui b/src/yuzu/configuration/configure_touchscreen_advanced.ui index 30ceccddb..88e7cf050 100755 --- a/src/yuzu/configuration/configure_touchscreen_advanced.ui +++ b/src/yuzu/configuration/configure_touchscreen_advanced.ui @@ -65,20 +65,13 @@ - + Touch Diameter Y - - - - Finger - - - @@ -92,37 +85,27 @@ - + Touch Diameter X - - - - - 80 - 0 - - - - - + Rotational Angle - + - + - + diff --git a/src/yuzu/discord_impl.cpp b/src/yuzu/discord_impl.cpp index a93733b26..aad06ac2a 100755 --- a/src/yuzu/discord_impl.cpp +++ b/src/yuzu/discord_impl.cpp @@ -38,7 +38,7 @@ void DiscordImpl::Update() { if (Core::System::GetInstance().IsPoweredOn()) Core::System::GetInstance().GetAppLoader().ReadTitle(title); DiscordRichPresence presence{}; - presence.largeImageKey = "yuzu_logo"; + presence.largeImageKey = "yuzu_logo_ea"; presence.largeImageText = "yuzu is an emulator for the Nintendo Switch"; if (Core::System::GetInstance().IsPoweredOn()) { presence.state = title.c_str(); diff --git a/src/yuzu_cmd/config.cpp b/src/yuzu_cmd/config.cpp index 41ef6f6b8..f76102459 100755 --- a/src/yuzu_cmd/config.cpp +++ b/src/yuzu_cmd/config.cpp @@ -296,10 +296,6 @@ void Config::ReadValues() { sdl2_config->GetBoolean("ControlsGeneral", "motion_enabled", true)); Settings::values.touchscreen.enabled = sdl2_config->GetBoolean("ControlsGeneral", "touch_enabled", true); - Settings::values.touchscreen.device = - sdl2_config->Get("ControlsGeneral", "touch_device", "engine:emu_window"); - Settings::values.touchscreen.finger = - sdl2_config->GetInteger("ControlsGeneral", "touch_finger", 0); Settings::values.touchscreen.rotation_angle = sdl2_config->GetInteger("ControlsGeneral", "touch_angle", 0); Settings::values.touchscreen.diameter_x = diff --git a/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp b/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp index e32bed5e6..7843d5167 100755 --- a/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp +++ b/src/yuzu_cmd/emu_window/emu_window_sdl2.cpp @@ -29,16 +29,16 @@ EmuWindow_SDL2::~EmuWindow_SDL2() { } void EmuWindow_SDL2::OnMouseMotion(s32 x, s32 y) { - TouchMoved((unsigned)std::max(x, 0), (unsigned)std::max(y, 0)); + TouchMoved((unsigned)std::max(x, 0), (unsigned)std::max(y, 0), 0); input_subsystem->GetMouse()->MouseMove(x, y); } void EmuWindow_SDL2::OnMouseButton(u32 button, u8 state, s32 x, s32 y) { if (button == SDL_BUTTON_LEFT) { if (state == SDL_PRESSED) { - TouchPressed((unsigned)std::max(x, 0), (unsigned)std::max(y, 0)); + TouchPressed((unsigned)std::max(x, 0), (unsigned)std::max(y, 0), 0); } else { - TouchReleased(); + TouchReleased(0); } } else if (button == SDL_BUTTON_RIGHT) { if (state == SDL_PRESSED) { @@ -66,16 +66,16 @@ void EmuWindow_SDL2::OnFingerDown(float x, float y) { // 3DS does const auto [px, py] = TouchToPixelPos(x, y); - TouchPressed(px, py); + TouchPressed(px, py, 0); } void EmuWindow_SDL2::OnFingerMotion(float x, float y) { const auto [px, py] = TouchToPixelPos(x, y); - TouchMoved(px, py); + TouchMoved(px, py, 0); } void EmuWindow_SDL2::OnFingerUp() { - TouchReleased(); + TouchReleased(0); } void EmuWindow_SDL2::OnKeyEvent(int key, u8 state) { diff --git a/src/yuzu_cmd/yuzu.cpp b/src/yuzu_cmd/yuzu.cpp index 4faf62ede..3279f0fd8 100755 --- a/src/yuzu_cmd/yuzu.cpp +++ b/src/yuzu_cmd/yuzu.cpp @@ -212,7 +212,7 @@ int main(int argc, char** argv) { // Core is loaded, start the GPU (makes the GPU contexts current to this thread) system.GPU().Start(); - system.Renderer().Rasterizer().LoadDiskResources( + system.Renderer().ReadRasterizer()->LoadDiskResources( system.CurrentProcess()->GetTitleID(), false, [](VideoCore::LoadCallbackStage, size_t value, size_t total) {});