From bea1fc2e8d40ec792964852f57e7b884dfbd8306 Mon Sep 17 00:00:00 2001 From: riperiperi Date: Sat, 13 Jun 2020 23:31:06 +0100 Subject: [PATCH] Optimize texture format conversion, and MethodCopyBuffer (#1274) * Improve performance when converting texture formats. Still more work to do. * Speed up buffer -> texture copies. No longer copies byte by byte. Fast path when formats are identical. * Fix a few things, 64 byte block fast copy. * Spacing cleanup, unrelated change. * Fix base offset calculation for region copies. * Fix Linear -> BlockLinear * Fix some nits. (part 1 of review feedback) * Use a generic version of the Convert* functions rather than lambdas. This is some real monkey's paw shit. * Remove unnecessary span constructor. * Revert "Use a generic version of the Convert* functions rather than lambdas." This reverts commit aa43dcfbe8bba291eea4e10c68569af7a56a5851. * Fix bug with rectangle destination writing, better rectangle calculation for linear textures. --- .../Engine/MethodCopyBuffer.cs | 62 ++++-- Ryujinx.Graphics.Texture/BlockLinearLayout.cs | 94 +++++++++ Ryujinx.Graphics.Texture/Bpp12Pixel.cs | 11 ++ Ryujinx.Graphics.Texture/LayoutConverter.cs | 186 +++++++++++++----- Ryujinx.Graphics.Texture/OffsetCalculator.cs | 64 +++++- 5 files changed, 356 insertions(+), 61 deletions(-) create mode 100644 Ryujinx.Graphics.Texture/Bpp12Pixel.cs diff --git a/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs b/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs index 7244db324..2e6fe0ab1 100644 --- a/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs +++ b/Ryujinx.Graphics.Gpu/Engine/MethodCopyBuffer.cs @@ -1,6 +1,7 @@ using Ryujinx.Graphics.Gpu.State; using Ryujinx.Graphics.Texture; using System; +using System.Runtime.Intrinsics; namespace Ryujinx.Graphics.Gpu.Engine { @@ -56,19 +57,58 @@ namespace Ryujinx.Graphics.Gpu.Engine ulong srcBaseAddress = _context.MemoryManager.Translate(cbp.SrcAddress.Pack()); ulong dstBaseAddress = _context.MemoryManager.Translate(cbp.DstAddress.Pack()); - for (int y = 0; y < cbp.YCount; y++) - for (int x = 0; x < cbp.XCount; x++) + (int srcBaseOffset, int srcSize) = srcCalculator.GetRectangleRange(src.RegionX, src.RegionY, cbp.XCount, cbp.YCount); + (int dstBaseOffset, int dstSize) = dstCalculator.GetRectangleRange(dst.RegionX, dst.RegionY, cbp.XCount, cbp.YCount); + + ReadOnlySpan srcSpan = _context.PhysicalMemory.GetSpan(srcBaseAddress + (ulong)srcBaseOffset, srcSize); + Span dstSpan = _context.PhysicalMemory.GetSpan(dstBaseAddress + (ulong)dstBaseOffset, dstSize).ToArray(); + + bool completeSource = src.RegionX == 0 && src.RegionY == 0 && src.Width == cbp.XCount && src.Height == cbp.YCount; + bool completeDest = dst.RegionX == 0 && dst.RegionY == 0 && dst.Width == cbp.XCount && dst.Height == cbp.YCount; + + if (completeSource && completeDest && srcCalculator.LayoutMatches(dstCalculator)) { - int srcOffset = srcCalculator.GetOffset(src.RegionX + x, src.RegionY + y); - int dstOffset = dstCalculator.GetOffset(dst.RegionX + x, dst.RegionY + y); - - ulong srcAddress = srcBaseAddress + (ulong)srcOffset; - ulong dstAddress = dstBaseAddress + (ulong)dstOffset; - - ReadOnlySpan pixel = _context.PhysicalMemory.GetSpan(srcAddress, srcBpp); - - _context.PhysicalMemory.Write(dstAddress, pixel); + srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely. } + else + { + unsafe bool Convert(Span dstSpan, ReadOnlySpan srcSpan) where T : unmanaged + { + fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan) + { + byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset. + byte* srcBase = srcPtr - srcBaseOffset; + + for (int y = 0; y < cbp.YCount; y++) + { + srcCalculator.SetY(src.RegionY + y); + dstCalculator.SetY(dst.RegionY + y); + + for (int x = 0; x < cbp.XCount; x++) + { + int srcOffset = srcCalculator.GetOffset(src.RegionX + x); + int dstOffset = dstCalculator.GetOffset(dst.RegionX + x); + + *(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset); + } + } + } + return true; + } + + bool _ = srcBpp switch + { + 1 => Convert(dstSpan, srcSpan), + 2 => Convert(dstSpan, srcSpan), + 4 => Convert(dstSpan, srcSpan), + 8 => Convert(dstSpan, srcSpan), + 12 => Convert(dstSpan, srcSpan), + 16 => Convert>(dstSpan, srcSpan), + _ => throw new NotSupportedException($"Unable to copy ${srcBpp} bpp pixel format.") + }; + } + + _context.PhysicalMemory.Write(dstBaseAddress + (ulong)dstBaseOffset, dstSpan); } else { diff --git a/Ryujinx.Graphics.Texture/BlockLinearLayout.cs b/Ryujinx.Graphics.Texture/BlockLinearLayout.cs index b95db7029..0b1122421 100644 --- a/Ryujinx.Graphics.Texture/BlockLinearLayout.cs +++ b/Ryujinx.Graphics.Texture/BlockLinearLayout.cs @@ -33,6 +33,11 @@ namespace Ryujinx.Graphics.Texture private int _robSize; private int _sliceSize; + // Variables for built in iteration. + private int _yPart; + private int _yzPart; + private int _zPart; + public BlockLinearLayout( int width, int height, @@ -97,5 +102,94 @@ namespace Ryujinx.Graphics.Texture return offset; } + + public (int offset, int size) GetRectangleRange(int x, int y, int width, int height) + { + // Justification: + // The 2D offset is a combination of separate x and y parts. + // Both components increase with input and never overlap bits. + // Therefore for each component, the minimum input value is the lowest that component can go. + // Minimum total value is minimum X component + minimum Y component. Similar goes for maximum. + + int start = GetOffset(x, y, 0); + int end = GetOffset(x + width - 1, y + height - 1, 0) + _texBpp; // Cover the last pixel. + return (start, end - start); + } + + public bool LayoutMatches(BlockLinearLayout other) + { + return _robSize == other._robSize && + _sliceSize == other._sliceSize && + _texBpp == other._texBpp && + _bhMask == other._bhMask && + _bdMask == other._bdMask; + } + + // Functions for built in iteration. + // Components of the offset can be updated separately, and combined to save some time. + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void SetY(int y) + { + int yh = y / GobHeight; + int offset = (yh >> _bhShift) * _robSize; + + offset += (yh & _bhMask) * GobSize; + + offset += ((y & 0x07) >> 1) << 6; + offset += ((y & 0x01) >> 0) << 4; + + _yPart = offset; + _yzPart = offset + _zPart; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public void SetZ(int z) + { + int offset = (z >> _bdShift) * _sliceSize; + + offset += ((z & _bdMask) * GobSize) << _bhShift; + + _zPart = offset; + _yzPart = offset + _yPart; + } + + /// + /// Optimized conversion for line offset in bytes to an absolute offset. Input x must be divisible by 16. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffsetWithLineOffset16(int x) + { + int offset = (x / GobStride) << _xShift; + + offset += ((x & 0x3f) >> 5) << 8; + offset += ((x & 0x1f) >> 4) << 5; + + return offset + _yzPart; + } + + /// + /// Optimized conversion for line offset in bytes to an absolute offset. Input x must be divisible by 64. + /// + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffsetWithLineOffset64(int x) + { + int offset = (x / GobStride) << _xShift; + + return offset + _yzPart; + } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffset(int x) + { + x <<= _bppShift; + int offset = (x / GobStride) << _xShift; + + offset += ((x & 0x3f) >> 5) << 8; + offset += ((x & 0x1f) >> 4) << 5; + offset += (x & 0x0f); + + return offset + _yzPart; + } } } \ No newline at end of file diff --git a/Ryujinx.Graphics.Texture/Bpp12Pixel.cs b/Ryujinx.Graphics.Texture/Bpp12Pixel.cs new file mode 100644 index 000000000..5a38259e2 --- /dev/null +++ b/Ryujinx.Graphics.Texture/Bpp12Pixel.cs @@ -0,0 +1,11 @@ +using System.Runtime.InteropServices; + +namespace Ryujinx.Graphics.Texture +{ + [StructLayout(LayoutKind.Sequential, Pack = 1, Size = 12)] + public struct Bpp12Pixel + { + private ulong _elem1; + private uint _elem2; + } +} diff --git a/Ryujinx.Graphics.Texture/LayoutConverter.cs b/Ryujinx.Graphics.Texture/LayoutConverter.cs index ce2b37b54..525271c4c 100644 --- a/Ryujinx.Graphics.Texture/LayoutConverter.cs +++ b/Ryujinx.Graphics.Texture/LayoutConverter.cs @@ -1,6 +1,6 @@ using Ryujinx.Common; using System; - +using System.Runtime.Intrinsics; using static Ryujinx.Graphics.Texture.BlockLinearConstants; namespace Ryujinx.Graphics.Texture @@ -64,11 +64,14 @@ namespace Ryujinx.Graphics.Texture } int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16); + int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64); int xStart = strideTrunc / bytesPerPixel; int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + int outStrideGap = stride - w * bytesPerPixel; + int alignment = gobWidth; if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight) @@ -86,36 +89,74 @@ namespace Ryujinx.Graphics.Texture mipGobBlocksInZ, bytesPerPixel); - for (int layer = 0; layer < layers; layer++) + unsafe bool Convert(Span output, ReadOnlySpan data) where T : unmanaged { - int inBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level); - - for (int z = 0; z < d; z++) - for (int y = 0; y < h; y++) + fixed (byte* outputPtr = output, dataPtr = data) { - for (int x = 0; x < strideTrunc; x += 16) + byte* outPtr = outputPtr + outOffs; + for (int layer = 0; layer < layers; layer++) { - int offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset(x, y, z); + byte* inBaseOffset = dataPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level)); - Span dest = output.Slice(outOffs + x, 16); + for (int z = 0; z < d; z++) + { + layoutConverter.SetZ(z); + for (int y = 0; y < h; y++) + { + layoutConverter.SetY(y); - data.Slice(offset, 16).CopyTo(dest); + for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64) + { + byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x); + byte* offset2 = offset + 0x20; + byte* offset3 = offset + 0x100; + byte* offset4 = offset + 0x120; + + Vector128 value = *(Vector128*)offset; + Vector128 value2 = *(Vector128*)offset2; + Vector128 value3 = *(Vector128*)offset3; + Vector128 value4 = *(Vector128*)offset4; + + *(Vector128*)outPtr = value; + *(Vector128*)(outPtr + 16) = value2; + *(Vector128*)(outPtr + 32) = value3; + *(Vector128*)(outPtr + 48) = value4; + } + + for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16) + { + byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x); + + *(Vector128*)outPtr = *(Vector128*)offset; + } + + for (int x = xStart; x < w; x++, outPtr += bytesPerPixel) + { + byte* offset = inBaseOffset + layoutConverter.GetOffset(x); + + *(T*)outPtr = *(T*)offset; + } + + outPtr += outStrideGap; + } + } } - - for (int x = xStart; x < w; x++) - { - int offset = inBaseOffset + layoutConverter.GetOffset(x, y, z); - - Span dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel); - - data.Slice(offset, bytesPerPixel).CopyTo(dest); - } - - outOffs += stride; + outOffs += stride * h * d * layers; } + return true; } - } + bool _ = bytesPerPixel switch + { + 1 => Convert(output, data), + 2 => Convert(output, data), + 4 => Convert(output, data), + 8 => Convert(output, data), + 12 => Convert(output, data), + 16 => Convert>(output, data), + _ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.") + }; + } return output; } @@ -132,22 +173,18 @@ namespace Ryujinx.Graphics.Texture int h = BitUtils.DivRoundUp(height, blockHeight); int outStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + int lineSize = w * bytesPerPixel; Span output = new byte[h * outStride]; int outOffs = 0; + int inOffs = 0; for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) - { - int offset = y * stride + x * bytesPerPixel; - - Span dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel); - - data.Slice(offset, bytesPerPixel).CopyTo(dest); - } + data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize)); + inOffs += stride; outOffs += outStride; } @@ -198,8 +235,15 @@ namespace Ryujinx.Graphics.Texture mipGobBlocksInZ >>= 1; } + int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16); + int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64); + + int xStart = strideTrunc / bytesPerPixel; + int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + int inStrideGap = stride - w * bytesPerPixel; + int alignment = gobWidth; if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight) @@ -217,25 +261,73 @@ namespace Ryujinx.Graphics.Texture mipGobBlocksInZ, bytesPerPixel); - for (int layer = 0; layer < layers; layer++) + unsafe bool Convert(Span output, ReadOnlySpan data) where T : unmanaged { - int outBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level); - - for (int z = 0; z < d; z++) - for (int y = 0; y < h; y++) + fixed (byte* outputPtr = output, dataPtr = data) { - for (int x = 0; x < w; x++) + byte* inPtr = dataPtr + inOffs; + for (int layer = 0; layer < layers; layer++) { - int offset = outBaseOffset + layoutConverter.GetOffset(x, y, z); + byte* outBaseOffset = outputPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level)); - Span dest = output.Slice(offset, bytesPerPixel); + for (int z = 0; z < d; z++) + { + layoutConverter.SetZ(z); + for (int y = 0; y < h; y++) + { + layoutConverter.SetY(y); - data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest); + for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64) + { + byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x); + byte* offset2 = offset + 0x20; + byte* offset3 = offset + 0x100; + byte* offset4 = offset + 0x120; + + Vector128 value = *(Vector128*)inPtr; + Vector128 value2 = *(Vector128*)(inPtr + 16); + Vector128 value3 = *(Vector128*)(inPtr + 32); + Vector128 value4 = *(Vector128*)(inPtr + 48); + + *(Vector128*)offset = value; + *(Vector128*)offset2 = value2; + *(Vector128*)offset3 = value3; + *(Vector128*)offset4 = value4; + } + + for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16) + { + byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x); + + *(Vector128*)offset = *(Vector128*)inPtr; + } + + for (int x = xStart; x < w; x++, inPtr += bytesPerPixel) + { + byte* offset = outBaseOffset + layoutConverter.GetOffset(x); + + *(T*)offset = *(T*)inPtr; + } + + inPtr += inStrideGap; + } + } } - - inOffs += stride; + inOffs += stride * h * d * layers; } + return true; } + + bool _ = bytesPerPixel switch + { + 1 => Convert(output, data), + 2 => Convert(output, data), + 4 => Convert(output, data), + 8 => Convert(output, data), + 12 => Convert(output, data), + 16 => Convert>(output, data), + _ => throw new NotSupportedException($"Unable to convert ${bytesPerPixel} bpp pixel format.") + }; } return output; @@ -254,23 +346,19 @@ namespace Ryujinx.Graphics.Texture int h = BitUtils.DivRoundUp(height, blockHeight); int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); + int lineSize = width * bytesPerPixel; Span output = new byte[h * stride]; int inOffs = 0; + int outOffs = 0; for (int y = 0; y < h; y++) { - for (int x = 0; x < w; x++) - { - int offset = y * stride + x * bytesPerPixel; - - Span dest = output.Slice(offset, bytesPerPixel); - - data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest); - } + data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize)); inOffs += inStride; + outOffs += stride; } return output; diff --git a/Ryujinx.Graphics.Texture/OffsetCalculator.cs b/Ryujinx.Graphics.Texture/OffsetCalculator.cs index bb5d606ca..1f5d9614a 100644 --- a/Ryujinx.Graphics.Texture/OffsetCalculator.cs +++ b/Ryujinx.Graphics.Texture/OffsetCalculator.cs @@ -1,17 +1,22 @@ using Ryujinx.Common; - +using System.Runtime.CompilerServices; using static Ryujinx.Graphics.Texture.BlockLinearConstants; namespace Ryujinx.Graphics.Texture { public class OffsetCalculator { + private int _width; + private int _height; private int _stride; private bool _isLinear; private int _bytesPerPixel; private BlockLinearLayout _layoutConverter; + // Variables for built in iteration. + private int _yPart; + public OffsetCalculator( int width, int height, @@ -20,6 +25,8 @@ namespace Ryujinx.Graphics.Texture int gobBlocksInY, int bytesPerPixel) { + _width = width; + _height = height; _stride = stride; _isLinear = isLinear; _bytesPerPixel = bytesPerPixel; @@ -40,6 +47,18 @@ namespace Ryujinx.Graphics.Texture } } + public void SetY(int y) + { + if (_isLinear) + { + _yPart = y * _stride; + } + else + { + _layoutConverter.SetY(y); + } + } + public int GetOffset(int x, int y) { if (_isLinear) @@ -51,5 +70,48 @@ namespace Ryujinx.Graphics.Texture return _layoutConverter.GetOffset(x, y, 0); } } + + [MethodImpl(MethodImplOptions.AggressiveInlining)] + public int GetOffset(int x) + { + if (_isLinear) + { + return x * _bytesPerPixel + _yPart; + } + else + { + return _layoutConverter.GetOffset(x); + } + } + + public (int offset, int size) GetRectangleRange(int x, int y, int width, int height) + { + if (_isLinear) + { + int start = y * _stride + x * _bytesPerPixel; + int end = (y + height - 1) * _stride + (x + width) * _bytesPerPixel; + return (start, end - start); + } + else + { + return _layoutConverter.GetRectangleRange(x, y, width, height); + } + } + + public bool LayoutMatches(OffsetCalculator other) + { + if (_isLinear) + { + return other._isLinear && + _width == other._width && + _height == other._height && + _stride == other._stride && + _bytesPerPixel == other._bytesPerPixel; + } + else + { + return !other._isLinear && _layoutConverter.LayoutMatches(other._layoutConverter); + } + } } } \ No newline at end of file