From 6eb85e846f25ae36a39685d6ac91025deaea306c Mon Sep 17 00:00:00 2001 From: Logan Stromberg Date: Thu, 14 Jul 2022 11:45:56 -0700 Subject: [PATCH] Reduce some unnecessary allocations in DMA handler (#2886) * experimental changes to try and reduce allocations in kernel threading and DMA handler * Simplify the changes in this branch to just 1. Don't make unnecessary copies of data just for texture-texture transfers and 2. Add a fast path for 1bpp linear byte copies * forgot to check src + dst linearity in 1bpp DMA fast path. Fixes the UE4 regression. * removing dev log I left in * Generalizing the DMA linear fast path to cases other than 1bpp copies * revert kernel changes * revert whitespace * remove unneeded references * PR feedback Co-authored-by: Logan Stromberg Co-authored-by: gdk --- Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs | 47 +++++++++++++++------ 1 file changed, 33 insertions(+), 14 deletions(-) diff --git a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs index df7e55a11d..b1f766b830 100644 --- a/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs +++ b/Ryujinx.Graphics.Gpu/Engine/Dma/DmaClass.cs @@ -208,7 +208,6 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma } ReadOnlySpan srcSpan = memoryManager.GetSpan(srcGpuVa + (ulong)srcBaseOffset, srcSize, true); - Span dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray(); bool completeSource = IsTextureCopyComplete(src, srcLinear, srcBpp, srcStride, xCount, yCount); bool completeDest = IsTextureCopyComplete(dst, dstLinear, dstBpp, dstStride, xCount, yCount); @@ -262,43 +261,63 @@ namespace Ryujinx.Graphics.Gpu.Engine.Dma target.SynchronizeMemory(); target.SetData(data); target.SignalModified(); - return; } else if (srcCalculator.LayoutMatches(dstCalculator)) { - srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely. - - memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, dstSpan); - + // No layout conversion has to be performed, just copy the data entirely. + memoryManager.Write(dstGpuVa + (ulong)dstBaseOffset, srcSpan); return; } } unsafe bool Convert(Span dstSpan, ReadOnlySpan srcSpan) where T : unmanaged { - fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan) + if (srcLinear && dstLinear && srcBpp == dstBpp) { - byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset. - byte* srcBase = srcPtr - srcBaseOffset; - + // Optimized path for purely linear copies - we don't need to calculate every single byte offset, + // and we can make use of Span.CopyTo which is very very fast (even compared to pointers) for (int y = 0; y < yCount; y++) { srcCalculator.SetY(srcRegionY + y); dstCalculator.SetY(dstRegionY + y); + int srcOffset = srcCalculator.GetOffset(srcRegionX); + int dstOffset = dstCalculator.GetOffset(dstRegionX); + srcSpan.Slice(srcOffset - srcBaseOffset, xCount * srcBpp) + .CopyTo(dstSpan.Slice(dstOffset - dstBaseOffset, xCount * dstBpp)); + } + } + else + { + fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan) + { + byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset. + byte* srcBase = srcPtr - srcBaseOffset; - for (int x = 0; x < xCount; x++) + for (int y = 0; y < yCount; y++) { - int srcOffset = srcCalculator.GetOffset(srcRegionX + x); - int dstOffset = dstCalculator.GetOffset(dstRegionX + x); + srcCalculator.SetY(srcRegionY + y); + dstCalculator.SetY(dstRegionY + y); - *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset); + for (int x = 0; x < xCount; x++) + { + int srcOffset = srcCalculator.GetOffset(srcRegionX + x); + int dstOffset = dstCalculator.GetOffset(dstRegionX + x); + + *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset); + } } } } + return true; } + // OPT: This allocates a (potentially) huge temporary array and then copies an existing + // region of memory into it, data that might get overwritten entirely anyways. Ideally this should + // all be rewritten to use pooled arrays, but that gets complicated with packed data and strides + Span dstSpan = memoryManager.GetSpan(dstGpuVa + (ulong)dstBaseOffset, dstSize).ToArray(); + bool _ = srcBpp switch { 1 => Convert(dstSpan, srcSpan),