DMA & InlineToMemory Engines Rework.

2025-01-11 02:01:01 +01:00 · 2022-08-14 02:36:36 -07:00 · 2022-08-14 02:36:36 -07:00 · f5fd6b5c86
commit f5fd6b5c86
parent b2099fbdcc
21 changed files with 323 additions and 242 deletions
--- a/src/common/algorithm.h
+++ b/src/common/algorithm.h
@ -24,4 +24,12 @@ template <class ForwardIt, class T, class Compare = std::less<>>
    return first != last && !comp(value, *first) ? first : last;
 }
 /// Folds `func` over `args`, seeded with `initial_value`.
 ///
 /// The arguments are visited in the order they are written, so the result is
 /// `func(...func(func(initial_value, a0), a1)..., aN)`. With an empty pack the
 /// seed is returned unchanged.
 ///
 /// @param initial_value Seed of the accumulation.
 /// @param func          Binary callable invoked as `func(accumulator, element)`; its result
 ///                      must be assignable to `T`.
 /// @param args          Elements to fold in, in order.
 /// @return The final accumulator value.
 template <typename T, typename Func, typename... Args>
 T FoldRight(T initial_value, Func&& func, Args&&... args) {
    T value{initial_value};
    // A generic lambda is used on purpose: spelling the element type as a lambda template
    // parameter named `T` would redeclare this function's own template parameter `T`,
    // which is ill-formed.
    const auto high_func = [&value, &func](auto x) { value = func(value, x); };
    (std::invoke(high_func, std::forward<Args>(args)), ...);
    return value;
 }
 } // namespace Common
--- a/src/video_core/buffer_cache/buffer_cache.h
+++ b/src/video_core/buffer_cache/buffer_cache.h
@ -126,7 +126,7 @@ public:
    void DownloadMemory(VAddr cpu_addr, u64 size);
-    bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<u8> inlined_buffer);
+    bool InlineMemory(VAddr dest_address, size_t copy_size, std::span<const u8> inlined_buffer);
    void BindGraphicsUniformBuffer(size_t stage, u32 index, GPUVAddr gpu_addr, u32 size);
@ -1685,7 +1685,7 @@ void BufferCache<P>::MappedUploadMemory(Buffer& buffer, u64 total_size_bytes,
 template <class P>
 bool BufferCache<P>::InlineMemory(VAddr dest_address, size_t copy_size,
-                                  std::span<u8> inlined_buffer) {
+                                  std::span<const u8> inlined_buffer) {
    const bool is_dirty = IsRegionRegistered(dest_address, copy_size);
    if (!is_dirty) {
        return false;
--- a/src/video_core/engines/engine_upload.cpp
+++ b/src/video_core/engines/engine_upload.cpp
@ -3,6 +3,7 @@
 #include <cstring>
 #include "common/algorithm.h"
 #include "common/assert.h"
 #include "video_core/engines/engine_upload.h"
 #include "video_core/memory_manager.h"
@ -34,21 +35,48 @@ void State::ProcessData(const u32 data, const bool is_last_call) {
    if (!is_last_call) {
        return;
    }
    ProcessData(inner_buffer);
 }
 void State::ProcessData(const u32* data, size_t num_data) {
    std::span<const u8> read_buffer(reinterpret_cast<const u8*>(data), num_data * sizeof(u32));
    ProcessData(read_buffer);
 }
 void State::ProcessData(std::span<const u8> read_buffer) {
    const GPUVAddr address{regs.dest.Address()};
    if (is_linear) {
-        rasterizer->AccelerateInlineToMemory(address, copy_size, inner_buffer);
+        if (regs.line_count == 1) {
            rasterizer->AccelerateInlineToMemory(address, copy_size, read_buffer);
        } else {
-        UNIMPLEMENTED_IF(regs.dest.z != 0);
+            for (u32 line = 0; line < regs.line_count; ++line) {
-        UNIMPLEMENTED_IF(regs.dest.depth != 1);
+                const GPUVAddr dest_line = address + static_cast<size_t>(line) * regs.dest.pitch;
-        UNIMPLEMENTED_IF(regs.dest.BlockWidth() != 0);
+                memory_manager.WriteBlockUnsafe(
-        UNIMPLEMENTED_IF(regs.dest.BlockDepth() != 0);
+                    dest_line, read_buffer.data() + static_cast<size_t>(line) * regs.line_length_in,
                    regs.line_length_in);
            }
            memory_manager.InvalidateRegion(address, regs.dest.pitch * regs.line_count);
        }
    } else {
        u32 width = regs.dest.width;
        u32 x_elements = regs.line_length_in;
        u32 x_offset = regs.dest.x;
        const u32 bpp_shift = Common::FoldRight(
            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
            width, x_elements, x_offset, static_cast<u32>(address));
        width >>= bpp_shift;
        x_elements >>= bpp_shift;
        x_offset >>= bpp_shift;
        const u32 bytes_per_pixel = 1U << bpp_shift;
        const std::size_t dst_size = Tegra::Texture::CalculateSize(
-            true, 1, regs.dest.width, regs.dest.height, 1, regs.dest.BlockHeight(), 0);
+            true, bytes_per_pixel, width, regs.dest.height, regs.dest.depth,
            regs.dest.BlockHeight(), regs.dest.BlockDepth());
        tmp_buffer.resize(dst_size);
        memory_manager.ReadBlock(address, tmp_buffer.data(), dst_size);
-        Tegra::Texture::SwizzleKepler(regs.dest.width, regs.dest.height, regs.dest.x, regs.dest.y,
+        Tegra::Texture::SwizzleSubrect(tmp_buffer, read_buffer, bytes_per_pixel, width,
-                                      regs.dest.BlockHeight(), copy_size, inner_buffer.data(),
+                                       regs.dest.height, regs.dest.depth, x_offset, regs.dest.y,
-                                      tmp_buffer.data());
+                                       x_elements, regs.line_count, regs.dest.BlockHeight(),
                                       regs.dest.BlockDepth(), regs.line_length_in);
        memory_manager.WriteBlock(address, tmp_buffer.data(), dst_size);
    }
 }
--- a/src/video_core/engines/engine_upload.h
+++ b/src/video_core/engines/engine_upload.h
@ -3,6 +3,7 @@
 #pragma once
 #include <span>
 #include <vector>
 #include "common/bit_field.h"
 #include "common/common_types.h"
@ -33,7 +34,7 @@ struct Registers {
        u32 width;
        u32 height;
        u32 depth;
-        u32 z;
+        u32 layer;
        u32 x;
        u32 y;
@ -62,11 +63,14 @@ public:
    void ProcessExec(bool is_linear_);
    void ProcessData(u32 data, bool is_last_call);
    void ProcessData(const u32* data, size_t num_data);
    /// Binds a rasterizer to this engine.
    void BindRasterizer(VideoCore::RasterizerInterface* rasterizer);
 private:
    void ProcessData(std::span<const u8> read_buffer);
    u32 write_offset = 0;
    u32 copy_size = 0;
    std::vector<u8> inner_buffer;
--- a/src/video_core/engines/kepler_compute.cpp
+++ b/src/video_core/engines/kepler_compute.cpp
@ -36,8 +36,6 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
    }
    case KEPLER_COMPUTE_REG_INDEX(data_upload): {
        upload_state.ProcessData(method_argument, is_last_call);
        if (is_last_call) {
        }
        break;
    }
    case KEPLER_COMPUTE_REG_INDEX(launch):
@ -50,9 +48,16 @@ void KeplerCompute::CallMethod(u32 method, u32 method_argument, bool is_last_cal
 /// Handles a run of `amount` arguments that all target the same method.
 ///
 /// Writes to the data_upload register are handed to the upload state as one batch; any
 /// other method is replayed through CallMethod one argument at a time.
 void KeplerCompute::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                                     u32 methods_pending) {
    if (method == KEPLER_COMPUTE_REG_INDEX(data_upload)) {
        upload_state.ProcessData(base_start, static_cast<size_t>(amount));
        return;
    }
    for (u32 index = 0; index < amount; ++index) {
        // The call is flagged as the last one once at most one method remains pending.
        const bool is_last_call = methods_pending - index <= 1;
        CallMethod(method, base_start[index], is_last_call);
    }
 }
 void KeplerCompute::ProcessLaunch() {
--- a/src/video_core/engines/kepler_memory.cpp
+++ b/src/video_core/engines/kepler_memory.cpp
@ -33,8 +33,6 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
    }
    case KEPLERMEMORY_REG_INDEX(data): {
        upload_state.ProcessData(method_argument, is_last_call);
        if (is_last_call) {
        }
        break;
    }
    }
@ -42,9 +40,16 @@ void KeplerMemory::CallMethod(u32 method, u32 method_argument, bool is_last_call
 /// Handles a run of `amount` arguments that all target the same method.
 ///
 /// Writes to the data register are handed to the upload state as one batch; any other
 /// method is replayed through CallMethod one argument at a time.
 void KeplerMemory::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
                                    u32 methods_pending) {
    if (method == KEPLERMEMORY_REG_INDEX(data)) {
        upload_state.ProcessData(base_start, static_cast<size_t>(amount));
        return;
    }
    for (u32 index = 0; index < amount; ++index) {
        // The call is flagged as the last one once at most one method remains pending.
        const bool is_last_call = methods_pending - index <= 1;
        CallMethod(method, base_start[index], is_last_call);
    }
 }
 } // namespace Tegra::Engines
--- a/src/video_core/engines/maxwell_3d.cpp
+++ b/src/video_core/engines/maxwell_3d.cpp
@ -239,8 +239,6 @@ void Maxwell3D::ProcessMethodCall(u32 method, u32 argument, u32 nonshadow_argume
        return upload_state.ProcessExec(regs.exec_upload.linear != 0);
    case MAXWELL3D_REG_INDEX(data_upload):
        upload_state.ProcessData(argument, is_last_call);
        if (is_last_call) {
        }
        return;
    case MAXWELL3D_REG_INDEX(fragment_barrier):
        return rasterizer->FragmentBarrier();
@ -316,6 +314,9 @@ void Maxwell3D::CallMultiMethod(u32 method, const u32* base_start, u32 amount,
    case MAXWELL3D_REG_INDEX(const_buffer.cb_data) + 15:
        ProcessCBMultiData(base_start, amount);
        break;
    case MAXWELL3D_REG_INDEX(data_upload):
        upload_state.ProcessData(base_start, static_cast<size_t>(amount));
        return;
    default:
        for (std::size_t i = 0; i < amount; i++) {
            CallMethod(method, base_start[i], methods_pending - static_cast<u32>(i) <= 1);
--- a/src/video_core/engines/maxwell_dma.cpp
+++ b/src/video_core/engines/maxwell_dma.cpp
@ -1,6 +1,7 @@
 // SPDX-FileCopyrightText: Copyright 2018 yuzu Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 #include "common/algorithm.h"
 #include "common/assert.h"
 #include "common/logging/log.h"
 #include "common/microprofile.h"
@ -54,8 +55,6 @@ void MaxwellDMA::Launch() {
    const LaunchDMA& launch = regs.launch_dma;
    ASSERT(launch.interrupt_type == LaunchDMA::InterruptType::NONE);
    ASSERT(launch.data_transfer_type == LaunchDMA::DataTransferType::NON_PIPELINED);
    ASSERT(regs.dst_params.origin.x == 0);
    ASSERT(regs.dst_params.origin.y == 0);
    const bool is_src_pitch = launch.src_memory_layout == LaunchDMA::MemoryLayout::PITCH;
    const bool is_dst_pitch = launch.dst_memory_layout == LaunchDMA::MemoryLayout::PITCH;
@ -121,12 +120,13 @@ void MaxwellDMA::CopyPitchToPitch() {
 void MaxwellDMA::CopyBlockLinearToPitch() {
    UNIMPLEMENTED_IF(regs.src_params.block_size.width != 0);
    UNIMPLEMENTED_IF(regs.src_params.block_size.depth != 0);
    UNIMPLEMENTED_IF(regs.src_params.layer != 0);
    const bool is_remapping = regs.launch_dma.remap_enable != 0;
    // Optimized path for micro copies.
    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
-    if (dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
+    if (!is_remapping && dst_size < GOB_SIZE && regs.pitch_out <= GOB_SIZE_X &&
        regs.src_params.height > GOB_SIZE_Y) {
        FastCopyBlockLinearToPitch();
        return;
@ -134,10 +134,27 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
    // Deswizzle the input and copy it over.
    UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
    const u32 bytes_per_pixel =
        regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
    const Parameters& src_params = regs.src_params;
-    const u32 width = src_params.width;
+
    const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
    const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
    const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
    u32 width = src_params.width;
    u32 x_elements = regs.line_length_in;
    u32 x_offset = src_params.origin.x;
    u32 bpp_shift = 0U;
    if (!is_remapping) {
        bpp_shift = Common::FoldRight(
            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
            width, x_elements, x_offset, static_cast<u32>(regs.offset_in));
        width >>= bpp_shift;
        x_elements >>= bpp_shift;
        x_offset >>= bpp_shift;
    }
    const u32 bytes_per_pixel = base_bpp << bpp_shift;
    const u32 height = src_params.height;
    const u32 depth = src_params.depth;
    const u32 block_height = src_params.block_size.height;
@ -155,30 +172,46 @@ void MaxwellDMA::CopyBlockLinearToPitch() {
    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
    memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
-    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, width, bytes_per_pixel,
+    UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
-                     block_height, src_params.origin.x, src_params.origin.y, write_buffer.data(),
+                     src_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
-                     read_buffer.data());
+                     regs.pitch_out);
    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
 }
 void MaxwellDMA::CopyPitchToBlockLinear() {
    UNIMPLEMENTED_IF_MSG(regs.dst_params.block_size.width != 0, "Block width is not one");
    UNIMPLEMENTED_IF(regs.dst_params.layer != 0);
    UNIMPLEMENTED_IF(regs.launch_dma.remap_enable != 0);
    const bool is_remapping = regs.launch_dma.remap_enable != 0;
    const u32 num_remap_components = regs.remap_const.num_dst_components_minus_one + 1;
    const u32 remap_components_size = regs.remap_const.component_size_minus_one + 1;
    const auto& dst_params = regs.dst_params;
-    const u32 bytes_per_pixel =
+
-        regs.launch_dma.remap_enable ? regs.pitch_in / regs.line_length_in : 1;
+    const u32 base_bpp = !is_remapping ? 1U : num_remap_components * remap_components_size;
-    const u32 width = dst_params.width;
+
    u32 width = dst_params.width;
    u32 x_elements = regs.line_length_in;
    u32 x_offset = dst_params.origin.x;
    u32 bpp_shift = 0U;
    if (!is_remapping) {
        bpp_shift = Common::FoldRight(
            4U, [](u32 x, u32 y) { return std::min(x, static_cast<u32>(std::countr_zero(y))); },
            width, x_elements, x_offset, static_cast<u32>(regs.offset_out));
        width >>= bpp_shift;
        x_elements >>= bpp_shift;
        x_offset >>= bpp_shift;
    }
    const u32 bytes_per_pixel = base_bpp << bpp_shift;
    const u32 height = dst_params.height;
    const u32 depth = dst_params.depth;
    const u32 block_height = dst_params.block_size.height;
    const u32 block_depth = dst_params.block_size.depth;
    const size_t dst_size =
        CalculateSize(true, bytes_per_pixel, width, height, depth, block_height, block_depth);
    const size_t dst_layer_size =
        CalculateSize(true, bytes_per_pixel, width, height, 1, block_height, block_depth);
    const size_t src_size = static_cast<size_t>(regs.pitch_in) * regs.line_count;
    if (read_buffer.size() < src_size) {
@ -188,32 +221,23 @@ void MaxwellDMA::CopyPitchToBlockLinear() {
        write_buffer.resize(dst_size);
    }
    if (Settings::IsGPULevelExtreme()) {
    memory_manager.ReadBlock(regs.offset_in, read_buffer.data(), src_size);
    if (Settings::IsGPULevelExtreme()) {
        memory_manager.ReadBlock(regs.offset_out, write_buffer.data(), dst_size);
    } else {
        memory_manager.ReadBlockUnsafe(regs.offset_in, read_buffer.data(), src_size);
        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
    }
    // If the input is linear and the output is tiled, swizzle the input and copy it over.
-    if (regs.dst_params.block_size.depth > 0) {
+    SwizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, width, height, depth, x_offset,
-        ASSERT(dst_params.layer == 0);
+                   dst_params.origin.y, x_elements, regs.line_count, block_height, block_depth,
-        SwizzleSliceToVoxel(regs.line_length_in, regs.line_count, regs.pitch_in, width, height,
+                   regs.pitch_in);
                            bytes_per_pixel, block_height, block_depth, dst_params.origin.x,
                            dst_params.origin.y, write_buffer.data(), read_buffer.data());
    } else {
        SwizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_in, width, bytes_per_pixel,
                       write_buffer.data() + dst_layer_size * dst_params.layer, read_buffer.data(),
                       block_height, dst_params.origin.x, dst_params.origin.y);
    }
    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
 }
 void MaxwellDMA::FastCopyBlockLinearToPitch() {
-    const u32 bytes_per_pixel =
+    const u32 bytes_per_pixel = 1U;
        regs.launch_dma.remap_enable ? regs.pitch_out / regs.line_length_in : 1;
    const size_t src_size = GOB_SIZE;
    const size_t dst_size = static_cast<size_t>(regs.pitch_out) * regs.line_count;
    u32 pos_x = regs.src_params.origin.x;
@ -239,9 +263,10 @@ void MaxwellDMA::FastCopyBlockLinearToPitch() {
        memory_manager.ReadBlockUnsafe(regs.offset_out, write_buffer.data(), dst_size);
    }
-    UnswizzleSubrect(regs.line_length_in, regs.line_count, regs.pitch_out, regs.src_params.width,
+    UnswizzleSubrect(write_buffer, read_buffer, bytes_per_pixel, regs.src_params.width,
-                     bytes_per_pixel, regs.src_params.block_size.height, pos_x, pos_y,
+                     regs.src_params.height, 1, pos_x, pos_y, regs.line_length_in, regs.line_count,
-                     write_buffer.data(), read_buffer.data());
+                     regs.src_params.block_size.height, regs.src_params.block_size.depth,
                     regs.pitch_out);
    memory_manager.WriteBlock(regs.offset_out, write_buffer.data(), dst_size);
 }
--- a/src/video_core/engines/maxwell_dma.h
+++ b/src/video_core/engines/maxwell_dma.h
@ -189,10 +189,16 @@ public:
            BitField<4, 3, Swizzle> dst_y;
            BitField<8, 3, Swizzle> dst_z;
            BitField<12, 3, Swizzle> dst_w;
            BitField<0, 12, u32> dst_components_raw;
            BitField<16, 2, u32> component_size_minus_one;
            BitField<20, 2, u32> num_src_components_minus_one;
            BitField<24, 2, u32> num_dst_components_minus_one;
        };
        /// Returns the destination swizzle selector for component `i` (0..3).
        ///
        /// Each selector is 3 bits wide but the fields are laid out on a 4-bit stride
        /// (dst_y at bit 4, dst_z at bit 8, dst_w at bit 12), so component `i` starts at
        /// bit `i * 4`, not `i * 3`.
        ///
        /// @param i Component index; values above 3 yield selector 0.
        Swizzle GetComponent(size_t i) {
            if (i == 3) {
                // dst_w occupies bits 12-14, which lie outside the 12-bit
                // dst_components_raw view, so it is read through its own field.
                return dst_w;
            }
            if (i > 3) {
                // Only four components exist. Return selector 0 and avoid shifting by an
                // out-of-range amount.
                return static_cast<Swizzle>(0);
            }
            const u32 raw = dst_components_raw;
            return static_cast<Swizzle>((raw >> (i * 4)) & 0x7);
        }
    };
    static_assert(sizeof(RemapConst) == 12);
--- a/src/video_core/host1x/vic.cpp
+++ b/src/video_core/host1x/vic.cpp
@ -156,8 +156,9 @@ void Vic::WriteRGBFrame(const AVFrame* frame, const VicConfig& config) {
        const u32 block_height = static_cast<u32>(config.block_linear_height_log2);
        const auto size = Texture::CalculateSize(true, 4, width, height, 1, block_height, 0);
        luma_buffer.resize(size);
-        Texture::SwizzleSubrect(width, height, width * 4, width, 4, luma_buffer.data(),
+        std::span<const u8> frame_buff(converted_frame_buf_addr, 4 * width * height);
-                                converted_frame_buf_addr, block_height, 0, 0);
+        Texture::SwizzleSubrect(luma_buffer, frame_buff, 4, width, height, 1,
                                0, 0, width, height, block_height, 0, width * 4);
        host1x.MemoryManager().WriteBlock(output_surface_luma_address, luma_buffer.data(), size);
    } else {
--- a/src/video_core/memory_manager.cpp
+++ b/src/video_core/memory_manager.cpp
@ -462,6 +462,97 @@ void MemoryManager::FlushRegion(GPUVAddr gpu_addr, size_t size) const {
    MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, flush_short_pages);
 }
 /// Reports whether any mapped part of the GPU range [gpu_addr, gpu_addr + size) is flagged
 /// by the rasterizer's MustFlushRegion query.
 ///
 /// The range is walked with the big-page variant of MemoryOperation. Chunks passed to the
 /// third callback slot are walked again with the small-page variant.
 /// NOTE(review): the callbacks return `result`; presumably a `true` return lets
 /// MemoryOperation stop early - confirm against its implementation.
 bool MemoryManager::IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const {
    bool result = false;
    // Callback for chunks that need no query; always reports "not dirty".
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
                          [[maybe_unused]] std::size_t offset,
                          [[maybe_unused]] std::size_t copy_amount) { return false; };
    // Small-page chunk: build the CPU address from the small page table and query it.
    auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
        result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
        return result;
    };
    // Big-page chunk: same query, with the CPU address taken from the big page table.
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
        result |= rasterizer->MustFlushRegion(cpu_addr_base, copy_amount);
        return result;
    };
    // Re-walks one big-page chunk at small-page granularity.
    auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
                                 std::size_t copy_amount) {
        GPUVAddr base = (page_index << big_page_bits) + offset;
        MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing);
        return result;
    };
    MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, check_short_pages);
    return result;
 }
 /// Measures how many bytes, starting at gpu_addr and limited to `size`, are backed by one
 /// uninterrupted run of CPU addresses.
 ///
 /// Chunks are accepted while each one begins exactly where the previous one ended in CPU
 /// address space. The walk is aborted by returning `true` from a callback.
 /// NOTE(review): this relies on MemoryOperation stopping when a callback returns `true` -
 /// confirm against its implementation.
 ///
 /// @return Number of bytes accumulated before the walk ended.
 size_t MemoryManager::MaxContinousRange(GPUVAddr gpu_addr, size_t size) const {
    // CPU address one past the last accepted chunk; empty until the first chunk is seen.
    std::optional<VAddr> old_page_addr{};
    size_t range_so_far = 0;
    // Set when the small-page walk hit a break, so the enclosing big-page walk stops too.
    bool result{false};
    // Used for the reserved/unmapped callback slots: the run ends here.
    auto fail = [&]([[maybe_unused]] std::size_t page_index, [[maybe_unused]] std::size_t offset,
                    [[maybe_unused]] std::size_t copy_amount) {
        result = true;
        return true;
    };
    // Small-page chunk: accept it only if it continues the current CPU run.
    auto short_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
        if (old_page_addr && *old_page_addr != cpu_addr_base) {
            result = true;
            return true;
        }
        range_so_far += copy_amount;
        old_page_addr = {cpu_addr_base + copy_amount};
        return false;
    };
    // Big-page chunk: same continuity test, with the address from the big page table.
    auto big_check = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
        if (old_page_addr && *old_page_addr != cpu_addr_base) {
            return true;
        }
        range_so_far += copy_amount;
        old_page_addr = {cpu_addr_base + copy_amount};
        return false;
    };
    // Re-walks one big-page chunk at small-page granularity and propagates a break upward.
    auto check_short_pages = [&](std::size_t page_index, std::size_t offset,
                                 std::size_t copy_amount) {
        GPUVAddr base = (page_index << big_page_bits) + offset;
        MemoryOperation<false>(base, copy_amount, short_check, fail, fail);
        return result;
    };
    MemoryOperation<true>(gpu_addr, size, big_check, fail, check_short_pages);
    return range_so_far;
 }
 /// Forwards an invalidation of the GPU range [gpu_addr, gpu_addr + size) to the rasterizer,
 /// one CPU-address chunk at a time.
 ///
 /// The range is walked with the big-page variant of MemoryOperation. Chunks passed to the
 /// third callback slot are walked again with the small-page variant.
 void MemoryManager::InvalidateRegion(GPUVAddr gpu_addr, size_t size) const {
    // Callback for chunks that need no invalidation.
    auto do_nothing = [&]([[maybe_unused]] std::size_t page_index,
                          [[maybe_unused]] std::size_t offset,
                          [[maybe_unused]] std::size_t copy_amount) {};
    // Small-page chunk: build the CPU address from the small page table and invalidate it.
    auto mapped_normal = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(page_table[page_index]) << cpu_page_bits) + offset;
        rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
    };
    // Big-page chunk: same operation, with the CPU address taken from the big page table.
    auto mapped_big = [&](std::size_t page_index, std::size_t offset, std::size_t copy_amount) {
        const VAddr cpu_addr_base =
            (static_cast<VAddr>(big_page_table_cpu[page_index]) << cpu_page_bits) + offset;
        rasterizer->InvalidateRegion(cpu_addr_base, copy_amount);
    };
    // Re-walks one big-page chunk at small-page granularity.
    auto invalidate_short_pages = [&](std::size_t page_index, std::size_t offset,
                                      std::size_t copy_amount) {
        GPUVAddr base = (page_index << big_page_bits) + offset;
        MemoryOperation<false>(base, copy_amount, mapped_normal, do_nothing, do_nothing);
    };
    MemoryOperation<true>(gpu_addr, size, mapped_big, do_nothing, invalidate_short_pages);
 }
 void MemoryManager::CopyBlock(GPUVAddr gpu_dest_addr, GPUVAddr gpu_src_addr, std::size_t size) {
    std::vector<u8> tmp_buffer(size);
    ReadBlock(gpu_src_addr, tmp_buffer.data(), size);
--- a/src/video_core/memory_manager.h
+++ b/src/video_core/memory_manager.h
@ -104,6 +104,12 @@ public:
    void FlushRegion(GPUVAddr gpu_addr, size_t size) const;
    void InvalidateRegion(GPUVAddr gpu_addr, size_t size) const;
    bool IsMemoryDirty(GPUVAddr gpu_addr, size_t size) const;
    size_t MaxContinousRange(GPUVAddr gpu_addr, size_t size) const;
 private:
    template <bool is_big_pages, typename FuncMapped, typename FuncReserved, typename FuncUnmapped>
    inline void MemoryOperation(GPUVAddr gpu_src_addr, std::size_t size, FuncMapped&& func_mapped,
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@ -129,7 +129,7 @@ public:
    [[nodiscard]] virtual Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() = 0;
    virtual void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
-                                          std::span<u8> memory) = 0;
+                                          std::span<const u8> memory) = 0;
    /// Attempt to use a faster method to display the framebuffer to screen
    [[nodiscard]] virtual bool AccelerateDisplay(const Tegra::FramebufferConfig& config,
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@ -476,7 +476,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerOpenGL::AccessAccelerateDMA()
 }
 void RasterizerOpenGL::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
-                                                std::span<u8> memory) {
+                                                std::span<const u8> memory) {
    auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
    if (!cpu_addr) [[unlikely]] {
        gpu_memory->WriteBlock(address, memory.data(), copy_size);
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@ -99,7 +99,7 @@ public:
                               const Tegra::Engines::Fermi2D::Config& copy_config) override;
    Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
    void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
-                                  std::span<u8> memory) override;
+                                  std::span<const u8> memory) override;
    bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                           u32 pixel_stride) override;
    void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
--- a/src/video_core/renderer_vulkan/vk_compute_pass.cpp
+++ b/src/video_core/renderer_vulkan/vk_compute_pass.cpp
@ -26,8 +26,6 @@
 namespace Vulkan {
 using Tegra::Texture::SWIZZLE_TABLE;
 namespace {
 constexpr u32 ASTC_BINDING_INPUT_BUFFER = 0;
--- a/src/video_core/renderer_vulkan/vk_rasterizer.cpp
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.cpp
@ -548,7 +548,7 @@ Tegra::Engines::AccelerateDMAInterface& RasterizerVulkan::AccessAccelerateDMA()
 }
 void RasterizerVulkan::AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
-                                                std::span<u8> memory) {
+                                                std::span<const u8> memory) {
    auto cpu_addr = gpu_memory->GpuToCpuAddress(address);
    if (!cpu_addr) [[unlikely]] {
        gpu_memory->WriteBlock(address, memory.data(), copy_size);
--- a/src/video_core/renderer_vulkan/vk_rasterizer.h
+++ b/src/video_core/renderer_vulkan/vk_rasterizer.h
@ -95,7 +95,7 @@ public:
                               const Tegra::Engines::Fermi2D::Config& copy_config) override;
    Tegra::Engines::AccelerateDMAInterface& AccessAccelerateDMA() override;
    void AccelerateInlineToMemory(GPUVAddr address, size_t copy_size,
-                                  std::span<u8> memory) override;
+                                  std::span<const u8> memory) override;
    bool AccelerateDisplay(const Tegra::FramebufferConfig& config, VAddr framebuffer_addr,
                           u32 pixel_stride) override;
    void LoadDiskResources(u64 title_id, std::stop_token stop_loading,
--- a/src/video_core/texture_cache/util.cpp
+++ b/src/video_core/texture_cache/util.cpp
@ -517,7 +517,6 @@ void SwizzleBlockLinearImage(Tegra::MemoryManager& gpu_memory, GPUVAddr gpu_addr
    const u32 host_bytes_per_layer = num_blocks_per_layer * bytes_per_block;
    UNIMPLEMENTED_IF(info.tile_width_spacing > 0);
    UNIMPLEMENTED_IF(copy.image_offset.x != 0);
    UNIMPLEMENTED_IF(copy.image_offset.y != 0);
    UNIMPLEMENTED_IF(copy.image_offset.z != 0);
--- a/src/video_core/textures/decoders.cpp
+++ b/src/video_core/textures/decoders.cpp
@ -89,6 +89,69 @@ void SwizzleImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32
    }
 }
 /// Copies a sub-rectangle, BYTES_PER_PIXEL bytes at a time, between a block-linear
 /// (swizzled) buffer and a pitch-linear buffer.
 ///
 /// TO_LINEAR selects which offset indexes which buffer: when true, `output` is indexed by
 /// the swizzled offset and `input` by the linear offset; when false it is the other way
 /// round.
 ///
 /// @param output       Destination bytes.
 /// @param input        Source bytes.
 /// @param width        Width of the block-linear surface in pixels (used for the GOB stride).
 /// @param height       Height of the block-linear surface in lines.
 /// @param depth        Number of slices to visit.
 /// @param origin_x     First pixel column of the sub-rectangle on the block-linear side.
 /// @param origin_y     First line of the sub-rectangle on the block-linear side.
 /// @param extent_x     Number of pixels copied per line.
 /// @param num_lines    Total number of lines to copy across all slices.
 /// @param block_height Block height, used as a shift amount (log2).
 /// @param block_depth  Block depth, used as a shift amount (log2).
 /// @param pitch_linear Byte distance between consecutive lines on the linear side.
 template <bool TO_LINEAR, u32 BYTES_PER_PIXEL>
 void SwizzleSubrectImpl(std::span<u8> output, std::span<const u8> input, u32 width, u32 height,
                        u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 num_lines,
                        u32 block_height, u32 block_depth, u32 pitch_linear) {
    // The origin of the transformation can be configured here, leave it as zero as the current API
    // doesn't expose it.
    static constexpr u32 origin_z = 0;
    // The linear side uses the pitch supplied by the caller.
    const u32 pitch = pitch_linear;
    const u32 stride = Common::AlignUpLog2(width * BYTES_PER_PIXEL, GOB_SIZE_X_SHIFT);
    const u32 gobs_in_x = Common::DivCeilLog2(stride, GOB_SIZE_X_SHIFT);
    const u32 block_size = gobs_in_x << (GOB_SIZE_SHIFT + block_height + block_depth);
    const u32 slice_size =
        Common::DivCeilLog2(height, block_height + GOB_SIZE_Y_SHIFT) * block_size;
    const u32 block_height_mask = (1U << block_height) - 1;
    const u32 block_depth_mask = (1U << block_depth) - 1;
    const u32 x_shift = GOB_SIZE_SHIFT + block_height + block_depth;
    // Lines still owed to the caller; decremented after every slice.
    u32 unprocessed_lines = num_lines;
    // Lines available per slice below origin_y.
    // NOTE(review): `height - origin_y` is unsigned and wraps if origin_y > height; callers
    // presumably guarantee origin_y <= height - confirm.
    u32 extent_y = std::min(num_lines, height - origin_y);
    for (u32 slice = 0; slice < depth; ++slice) {
        const u32 z = slice + origin_z;
        const u32 offset_z = (z >> block_depth) * slice_size +
                             ((z & block_depth_mask) << (GOB_SIZE_SHIFT + block_height));
        const u32 lines_in_y = std::min(unprocessed_lines, extent_y);
        for (u32 line = 0; line < lines_in_y; ++line) {
            const u32 y = line + origin_y;
            const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(y);
            const u32 block_y = y >> GOB_SIZE_Y_SHIFT;
            const u32 offset_y = (block_y >> block_height) * block_size +
                                 ((block_y & block_height_mask) << GOB_SIZE_SHIFT);
            // swizzled_x is advanced by incrpdep in the loop header, once per column.
            u32 swizzled_x = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
            for (u32 column = 0; column < extent_x;
                 ++column, incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(swizzled_x)) {
                const u32 x = (column + origin_x) * BYTES_PER_PIXEL;
                const u32 offset_x = (x >> GOB_SIZE_X_SHIFT) << x_shift;
                const u32 base_swizzled_offset = offset_z + offset_y + offset_x;
                const u32 swizzled_offset = base_swizzled_offset + (swizzled_x | swizzled_y);
                // Linear-side offset is relative to the sub-rectangle, not to the surface origin.
                const u32 unswizzled_offset =
                    slice * pitch * height + line * pitch + column * BYTES_PER_PIXEL;
                u8* const dst = &output[TO_LINEAR ? swizzled_offset : unswizzled_offset];
                const u8* const src = &input[TO_LINEAR ? unswizzled_offset : swizzled_offset];
                std::memcpy(dst, src, BYTES_PER_PIXEL);
            }
        }
        unprocessed_lines -= lines_in_y;
        // Stop as soon as every requested line has been copied.
        if (unprocessed_lines == 0) {
            return;
        }
    }
 }
 template <bool TO_LINEAR>
 void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
             u32 height, u32 depth, u32 block_height, u32 block_depth, u32 stride_alignment) {
@ -111,97 +174,6 @@ void Swizzle(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixe
    }
 }
 /// Copies an untiled (pitch-linear) subrectangle into a tiled surface, pixel by pixel.
 ///
 /// Pixel (x, line) of the source, located at byte line * source_pitch + x * BYTES_PER_PIXEL of
 /// unswizzled_data, is written to the tiled position (x + offset_x, line + offset_y) inside
 /// swizzled_data. swizzled_width is the tiled surface width in pixels and block_height_bit is the
 /// log2 of the block height in GOBs (it is used as 1 << block_height_bit).
 template <u32 BYTES_PER_PIXEL>
 void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
                     u8* swizzled_data, const u8* unswizzled_data, u32 block_height_bit,
                     u32 offset_x, u32 offset_y) {
    const u32 gobs_per_block = 1U << block_height_bit;
    const u32 surface_width_in_gobs =
        (swizzled_width * BYTES_PER_PIXEL + (GOB_SIZE_X - 1)) / GOB_SIZE_X;
    // Number of pixel rows covered by one block of GOBs.
    const auto rows_per_block = GOB_SIZE_Y * gobs_per_block;

    for (u32 row = 0; row < subrect_height; ++row) {
        const u32 target_y = row + offset_y;
        // Start of the block row containing target_y, plus the GOB inside that block.
        const u32 row_base =
            (target_y / rows_per_block) * GOB_SIZE * gobs_per_block * surface_width_in_gobs +
            ((target_y % rows_per_block) / GOB_SIZE_Y) * GOB_SIZE;
        const u32 y_bits = pdep<SWIZZLE_Y_BITS>(target_y);

        // The in-GOB X bits are advanced incrementally, one pixel per iteration.
        u32 x_bits = pdep<SWIZZLE_X_BITS>(offset_x * BYTES_PER_PIXEL);
        for (u32 col = 0; col < subrect_width; ++col) {
            const u32 target_x = col + offset_x;
            const u32 gob_base =
                row_base + (target_x * BYTES_PER_PIXEL / GOB_SIZE_X) * GOB_SIZE * gobs_per_block;
            const u32 tiled_offset = gob_base + (x_bits | y_bits);
            const u32 linear_offset = row * source_pitch + col * BYTES_PER_PIXEL;

            std::memcpy(swizzled_data + tiled_offset, unswizzled_data + linear_offset,
                        BYTES_PER_PIXEL);
            incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(x_bits);
        }
    }
 }
 /// Copies a tiled subrectangle into a linear surface, pixel by pixel.
 ///
 /// For every one of the line_count rows and line_length_in pixels per row, BYTES_PER_PIXEL bytes
 /// are read from the tiled position (column + origin_x, line + origin_y) of input and stored at
 /// byte line * pitch + column * BYTES_PER_PIXEL of output. width is the tiled surface width in
 /// pixels; block_height is used as a shift amount.
 template <u32 BYTES_PER_PIXEL>
 void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 block_height,
                       u32 origin_x, u32 origin_y, u8* output, const u8* input) {
    const u32 gob_column_shift = GOB_SIZE_SHIFT + block_height;
    const u32 row_bytes = width * BYTES_PER_PIXEL;
    const u32 row_gobs = (row_bytes + GOB_SIZE_X - 1) / GOB_SIZE_X;
    const u32 block_row_bytes = row_gobs << gob_column_shift;
    const u32 gob_in_block_mask = (1U << block_height) - 1;

    for (u32 row = 0; row < line_count; ++row) {
        const u32 tiled_y = row + origin_y;
        const u32 gob_row = tiled_y >> GOB_SIZE_Y_SHIFT;
        // Block row start plus the GOB selected inside that block.
        const u32 row_base = (gob_row >> block_height) * block_row_bytes +
                             ((gob_row & gob_in_block_mask) << GOB_SIZE_SHIFT);
        const u32 y_bits = pdep<SWIZZLE_Y_BITS>(tiled_y);

        // The in-GOB X bits are advanced incrementally, one pixel per iteration.
        u32 x_bits = pdep<SWIZZLE_X_BITS>(origin_x * BYTES_PER_PIXEL);
        for (u32 col = 0; col < line_length_in; ++col) {
            const u32 tiled_x = (col + origin_x) * BYTES_PER_PIXEL;
            const u32 column_base = (tiled_x >> GOB_SIZE_X_SHIFT) << gob_column_shift;
            const u32 tiled_offset = row_base + column_base + (x_bits | y_bits);
            const u32 linear_offset = row * pitch + col * BYTES_PER_PIXEL;

            std::memcpy(output + linear_offset, input + tiled_offset, BYTES_PER_PIXEL);
            incrpdep<SWIZZLE_X_BITS, BYTES_PER_PIXEL>(x_bits);
        }
    }
 }
 /// Swizzles a 2D array of pixels into a 3D texture.
 ///
 /// Reads line_count rows of line_length_in elements from input (rows are pitch bytes apart) and
 /// writes BYTES_PER_PIXEL bytes per element into output. Non-zero origins are reported through
 /// UNIMPLEMENTED_IF and otherwise ignored. block_height and block_depth are used as shift amounts;
 /// height does not take part in the address computation.
 ///
 /// NOTE(review): the X swizzle step is instantiated with 1 rather than BYTES_PER_PIXEL and the GOB
 /// column is derived from the element index rather than a byte offset - confirm this is intended
 /// for BYTES_PER_PIXEL > 1.
 template <u32 BYTES_PER_PIXEL>
 void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
                          u32 block_height, u32 block_depth, u32 origin_x, u32 origin_y, u8* output,
                          const u8* input) {
    UNIMPLEMENTED_IF(origin_x > 0);
    UNIMPLEMENTED_IF(origin_y > 0);

    const u32 gob_column_shift = static_cast<u32>(GOB_SIZE_SHIFT) + block_height + block_depth;
    const u32 row_bytes = width * BYTES_PER_PIXEL;
    const u32 row_gobs = (row_bytes + GOB_SIZE_X - 1) / GOB_SIZE_X;
    const u32 block_row_bytes = row_gobs << gob_column_shift;
    const u32 gob_in_block_mask = (1U << block_height) - 1;

    for (u32 row = 0; row < line_count; ++row) {
        const u32 gob_row = row / GOB_SIZE_Y;
        // Block row start plus the GOB selected inside that block.
        const u32 row_base = (gob_row >> block_height) * block_row_bytes +
                             (gob_row & gob_in_block_mask) * GOB_SIZE;
        const u32 y_bits = pdep<SWIZZLE_Y_BITS>(row);

        u32 x_bits = 0;
        for (u32 col = 0; col < line_length_in; ++col) {
            const u32 column_base = (col / GOB_SIZE_X) << gob_column_shift;
            const u32 tiled_offset = column_base + row_base + (x_bits | y_bits);
            const u32 linear_offset = col * BYTES_PER_PIXEL + row * pitch;

            std::memcpy(output + tiled_offset, input + linear_offset, BYTES_PER_PIXEL);
            incrpdep<SWIZZLE_X_BITS, 1>(x_bits);
        }
    }
 }
 } // Anonymous namespace
 void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
@ -218,15 +190,15 @@ void SwizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_p
                  stride_alignment);
 }
-void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
-                    u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
+                    u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
-                    u32 block_height_bit, u32 offset_x, u32 offset_y) {
+                    u32 block_height, u32 block_depth, u32 pitch_linear) {
    switch (bytes_per_pixel) {
 #define BPP_CASE(x)                                                                                \
    case x:                                                                                        \
-        return SwizzleSubrect<x>(subrect_width, subrect_height, source_pitch, swizzled_width,      \
+        return SwizzleSubrectImpl<true, x>(output, input, width, height, depth, origin_x,          \
-                                 swizzled_data, unswizzled_data, block_height_bit, offset_x,       \
+                                           origin_y, extent_x, extent_y, block_height,             \
-                                 offset_y);
+                                           block_depth, pitch_linear);
        BPP_CASE(1)
        BPP_CASE(2)
        BPP_CASE(3)
@ -241,13 +213,15 @@ void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32
    }
 }
-void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
+void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
-                      u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input) {
+                      u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
                      u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear) {
    switch (bytes_per_pixel) {
 #define BPP_CASE(x)                                                                                \
    case x:                                                                                        \
-        return UnswizzleSubrect<x>(line_length_in, line_count, pitch, width, block_height,         \
+        return SwizzleSubrectImpl<false, x>(output, input, width, height, depth, origin_x,         \
-                                   origin_x, origin_y, output, input);
+                                            origin_y, extent_x, extent_y, block_height,            \
                                            block_depth, pitch_linear);
        BPP_CASE(1)
        BPP_CASE(2)
        BPP_CASE(3)
@ -262,55 +236,6 @@ void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width,
    }
 }
 /// Runtime entry point for SwizzleSliceToVoxel: selects the SwizzleSliceToVoxel<BYTES_PER_PIXEL>
 /// template instantiation matching bytes_per_pixel and forwards every other argument unchanged.
 /// Handled pixel sizes are 1, 2, 3, 4, 6, 8, 12 and 16 bytes; any other value reaches ASSERT_MSG
 /// and nothing is copied.
 void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
                          u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
                          u32 origin_y, u8* output, const u8* input) {
    switch (bytes_per_pixel) {
 #define BPP_CASE(x)                                                                                \
    case x:                                                                                        \
        return SwizzleSliceToVoxel<x>(line_length_in, line_count, pitch, width, height,            \
                                      block_height, block_depth, origin_x, origin_y, output,       \
                                      input);
        BPP_CASE(1)
        BPP_CASE(2)
        BPP_CASE(3)
        BPP_CASE(4)
        BPP_CASE(6)
        BPP_CASE(8)
        BPP_CASE(12)
        BPP_CASE(16)
 #undef BPP_CASE
    default:
        // No template instantiation exists for this pixel size.
        ASSERT_MSG(false, "Invalid bytes_per_pixel={}", bytes_per_pixel);
    }
 }
 void SwizzleKepler(const u32 width, const u32 height, const u32 dst_x, const u32 dst_y,
                   const u32 block_height_bit, const std::size_t copy_size, const u8* source_data,
                   u8* swizzle_data) {
    const u32 block_height = 1U << block_height_bit;
    const u32 image_width_in_gobs{(width + GOB_SIZE_X - 1) / GOB_SIZE_X};
    std::size_t count = 0;
    for (std::size_t y = dst_y; y < height && count < copy_size; ++y) {
        const std::size_t gob_address_y =
            (y / (GOB_SIZE_Y * block_height)) * GOB_SIZE * block_height * image_width_in_gobs +
            ((y % (GOB_SIZE_Y * block_height)) / GOB_SIZE_Y) * GOB_SIZE;
        const u32 swizzled_y = pdep<SWIZZLE_Y_BITS>(static_cast<u32>(y));
        u32 swizzled_x = pdep<SWIZZLE_X_BITS>(dst_x);
        for (std::size_t x = dst_x; x < width && count < copy_size;
             ++x, incrpdep<SWIZZLE_X_BITS, 1>(swizzled_x)) {
            const std::size_t gob_address =
                gob_address_y + (x / GOB_SIZE_X) * GOB_SIZE * block_height;
            const std::size_t swizzled_offset = gob_address + (swizzled_x | swizzled_y);
            const u8* source_line = source_data + count;
            u8* dest_addr = swizzle_data + swizzled_offset;
            count++;
            *dest_addr = *source_line;
        }
    }
 }
 std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height, u32 depth,
                          u32 block_height, u32 block_depth) {
    if (tiled) {
--- a/src/video_core/textures/decoders.h
+++ b/src/video_core/textures/decoders.h
@ -40,7 +40,6 @@ constexpr SwizzleTable MakeSwizzleTable() {
    }
    return table;
 }
 constexpr SwizzleTable SWIZZLE_TABLE = MakeSwizzleTable();
 /// Unswizzles a block linear texture into linear memory.
 void UnswizzleTexture(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
@ -57,34 +56,14 @@ std::size_t CalculateSize(bool tiled, u32 bytes_per_pixel, u32 width, u32 height
                          u32 block_height, u32 block_depth);
 /// Copies an untiled subrectangle into a tiled surface.
-void SwizzleSubrect(u32 subrect_width, u32 subrect_height, u32 source_pitch, u32 swizzled_width,
+void SwizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel, u32 width,
-                    u32 bytes_per_pixel, u8* swizzled_data, const u8* unswizzled_data,
+                    u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x, u32 extent_y,
-                    u32 block_height_bit, u32 offset_x, u32 offset_y);
+                    u32 block_height, u32 block_depth, u32 pitch_linear);
 /// Copies a tiled subrectangle into a linear surface.
-void UnswizzleSubrect(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 bytes_per_pixel,
+void UnswizzleSubrect(std::span<u8> output, std::span<const u8> input, u32 bytes_per_pixel,
-                      u32 block_height, u32 origin_x, u32 origin_y, u8* output, const u8* input);
+                      u32 width, u32 height, u32 depth, u32 origin_x, u32 origin_y, u32 extent_x,
-
+                      u32 extent_y, u32 block_height, u32 block_depth, u32 pitch_linear);
 /// @brief Swizzles a 2D array of pixels into a 3D texture
 /// @param line_length_in  Number of pixels per line
 /// @param line_count      Number of lines
 /// @param pitch           Number of bytes per line
 /// @param width           Width of the swizzled texture
 /// @param height          Height of the swizzled texture
 /// @param bytes_per_pixel Number of bytes used per pixel
 /// @param block_height    Block height shift
 /// @param block_depth     Block depth shift
 /// @param origin_x        Column offset in pixels of the swizzled texture
 /// @param origin_y        Row offset in pixels of the swizzled texture
 /// @param output          Pointer to the pixels of the swizzled texture
 /// @param input           Pointer to the 2D array of pixels used as input
 /// @pre input and output each point to an array large enough to hold the number of bytes used
 void SwizzleSliceToVoxel(u32 line_length_in, u32 line_count, u32 pitch, u32 width, u32 height,
                          u32 bytes_per_pixel, u32 block_height, u32 block_depth, u32 origin_x,
                          u32 origin_y, u8* output, const u8* input);
 /// Copies up to copy_size bytes from source_data into the tiled buffer swizzle_data, walking rows
 /// from dst_y to height and byte columns from dst_x to width.
 /// Note: the definition treats block_height as a shift amount (it computes 1 << block_height).
 void SwizzleKepler(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,
                    std::size_t copy_size, const u8* source_data, u8* swizzle_data);
 /// Obtains the offset of the gob for positions 'dst_x' & 'dst_y'
 u64 GetGOBOffset(u32 width, u32 height, u32 dst_x, u32 dst_y, u32 block_height,