Optimize texture format conversion, and MethodCopyBuffer (#1274)

* Improve performance when converting texture formats.

Still more work to do.

* Speed up buffer -> texture copies.

No longer copies byte by byte. Fast path when formats are identical.

* Fix a few things, 64 byte block fast copy.

* Spacing cleanup, unrelated change.

* Fix base offset calculation for region copies.

* Fix Linear -> BlockLinear

* Fix some nits. (part 1 of review feedback)

* Use a generic version of the Convert* functions rather than lambdas.

This is some real monkey's paw shit.

* Remove unnecessary span constructor.

* Revert "Use a generic version of the Convert* functions rather than lambdas."

This reverts commit aa43dcfbe8.

* Fix bug with rectangle destination writing, better rectangle calculation for linear textures.
This commit is contained in:
riperiperi 2020-06-13 23:31:06 +01:00 committed by GitHub
parent ce983f360b
commit bea1fc2e8d
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
5 changed files with 356 additions and 61 deletions

View file

@ -1,6 +1,7 @@
using Ryujinx.Graphics.Gpu.State; using Ryujinx.Graphics.Gpu.State;
using Ryujinx.Graphics.Texture; using Ryujinx.Graphics.Texture;
using System; using System;
using System.Runtime.Intrinsics;
namespace Ryujinx.Graphics.Gpu.Engine namespace Ryujinx.Graphics.Gpu.Engine
{ {
@ -56,19 +57,58 @@ namespace Ryujinx.Graphics.Gpu.Engine
ulong srcBaseAddress = _context.MemoryManager.Translate(cbp.SrcAddress.Pack()); ulong srcBaseAddress = _context.MemoryManager.Translate(cbp.SrcAddress.Pack());
ulong dstBaseAddress = _context.MemoryManager.Translate(cbp.DstAddress.Pack()); ulong dstBaseAddress = _context.MemoryManager.Translate(cbp.DstAddress.Pack());
for (int y = 0; y < cbp.YCount; y++) (int srcBaseOffset, int srcSize) = srcCalculator.GetRectangleRange(src.RegionX, src.RegionY, cbp.XCount, cbp.YCount);
for (int x = 0; x < cbp.XCount; x++) (int dstBaseOffset, int dstSize) = dstCalculator.GetRectangleRange(dst.RegionX, dst.RegionY, cbp.XCount, cbp.YCount);
ReadOnlySpan<byte> srcSpan = _context.PhysicalMemory.GetSpan(srcBaseAddress + (ulong)srcBaseOffset, srcSize);
Span<byte> dstSpan = _context.PhysicalMemory.GetSpan(dstBaseAddress + (ulong)dstBaseOffset, dstSize).ToArray();
bool completeSource = src.RegionX == 0 && src.RegionY == 0 && src.Width == cbp.XCount && src.Height == cbp.YCount;
bool completeDest = dst.RegionX == 0 && dst.RegionY == 0 && dst.Width == cbp.XCount && dst.Height == cbp.YCount;
if (completeSource && completeDest && srcCalculator.LayoutMatches(dstCalculator))
{ {
int srcOffset = srcCalculator.GetOffset(src.RegionX + x, src.RegionY + y); srcSpan.CopyTo(dstSpan); // No layout conversion has to be performed, just copy the data entirely.
int dstOffset = dstCalculator.GetOffset(dst.RegionX + x, dst.RegionY + y);
ulong srcAddress = srcBaseAddress + (ulong)srcOffset;
ulong dstAddress = dstBaseAddress + (ulong)dstOffset;
ReadOnlySpan<byte> pixel = _context.PhysicalMemory.GetSpan(srcAddress, srcBpp);
_context.PhysicalMemory.Write(dstAddress, pixel);
} }
else
{
// Copies the cbp.XCount x cbp.YCount rectangle one pixel at a time, where each pixel is moved as a single value of type T.
// For every pixel, the source and destination byte offsets are obtained from their respective offset calculators,
// so the two sides may use different layouts.
// Always returns true; the return value only exists so that the call can be used as an arm of the switch expression below.
unsafe bool Convert<T>(Span<byte> dstSpan, ReadOnlySpan<byte> srcSpan) where T : unmanaged
{
// Pin both spans so that raw pointers into them remain valid for the duration of the copy.
fixed (byte* dstPtr = dstSpan, srcPtr = srcSpan)
{
byte* dstBase = dstPtr - dstBaseOffset; // Layout offset is relative to the base, so we need to subtract the span's offset.
byte* srcBase = srcPtr - srcBaseOffset;
for (int y = 0; y < cbp.YCount; y++)
{
// Set the row on both calculators once per line, so the inner loop only passes the X coordinate.
srcCalculator.SetY(src.RegionY + y);
dstCalculator.SetY(dst.RegionY + y);
for (int x = 0; x < cbp.XCount; x++)
{
int srcOffset = srcCalculator.GetOffset(src.RegionX + x);
int dstOffset = dstCalculator.GetOffset(dst.RegionX + x);
// Move one whole pixel (sizeof(T) bytes) with a single load and store.
*(T*)(dstBase + dstOffset) = *(T*)(srcBase + srcOffset);
}
}
}
return true;
}
// Select the Convert<T> instantiation whose element size equals the pixel size in bytes,
// so each pixel is moved with a single typed load/store. The result is discarded; the
// switch expression is only used for dispatch.
// Throws NotSupportedException for any pixel size without a matching element type.
bool _ = srcBpp switch
{
    1 => Convert<byte>(dstSpan, srcSpan),
    2 => Convert<ushort>(dstSpan, srcSpan),
    4 => Convert<uint>(dstSpan, srcSpan),
    8 => Convert<ulong>(dstSpan, srcSpan),
    12 => Convert<Bpp12Pixel>(dstSpan, srcSpan),
    16 => Convert<Vector128<byte>>(dstSpan, srcSpan),
    // C# interpolation holes are written {expr}; the previous "${srcBpp}" emitted a stray '$' before the value.
    _ => throw new NotSupportedException($"Unable to copy {srcBpp} bpp pixel format.")
};
}
_context.PhysicalMemory.Write(dstBaseAddress + (ulong)dstBaseOffset, dstSpan);
} }
else else
{ {

View file

@ -33,6 +33,11 @@ namespace Ryujinx.Graphics.Texture
private int _robSize; private int _robSize;
private int _sliceSize; private int _sliceSize;
// Variables for built in iteration.
private int _yPart;
private int _yzPart;
private int _zPart;
public BlockLinearLayout( public BlockLinearLayout(
int width, int width,
int height, int height,
@ -97,5 +102,94 @@ namespace Ryujinx.Graphics.Texture
return offset; return offset;
} }
/// <summary>
/// Computes the byte range that contains every pixel of a 2D rectangle, at depth 0.
/// </summary>
/// <param name="x">X coordinate of the first pixel of the rectangle</param>
/// <param name="y">Y coordinate of the first pixel of the rectangle</param>
/// <param name="width">Width of the rectangle in pixels</param>
/// <param name="height">Height of the rectangle in pixels</param>
/// <returns>The offset of the first byte of the range, and the size of the range in bytes</returns>
public (int offset, int size) GetRectangleRange(int x, int y, int width, int height)
{
    // Why the two opposite corners bound the whole rectangle:
    // the 2D offset is the sum of an X part and a Y part that never share bits,
    // and each part only grows as its coordinate grows.
    // The smallest offset is therefore found at the smallest X and Y,
    // and the largest offset at the largest X and Y.
    int lastX = x + width - 1;
    int lastY = y + height - 1;

    int firstByte = GetOffset(x, y, 0);
    int pastLastByte = GetOffset(lastX, lastY, 0) + _texBpp; // Include the bytes of the last pixel.

    return (firstByte, pastLastByte - firstByte);
}
/// <summary>
/// Checks whether another layout has the same ROB size, slice size, bytes per pixel
/// and block height/depth masks as this one.
/// </summary>
/// <param name="other">Layout to compare against</param>
/// <returns>True if all of the compared fields are equal, false otherwise</returns>
public bool LayoutMatches(BlockLinearLayout other)
{
    if (_robSize != other._robSize)
    {
        return false;
    }

    if (_sliceSize != other._sliceSize)
    {
        return false;
    }

    if (_texBpp != other._texBpp)
    {
        return false;
    }

    return _bhMask == other._bhMask && _bdMask == other._bdMask;
}
// Functions for built in iteration.
// Components of the offset can be updated separately, and combined to save some time.
/// <summary>
/// Sets the current Y coordinate for the built in iteration.
/// The Y contribution to the offset is calculated once here and cached,
/// so that <see cref="GetOffset(int)"/> only has to calculate the X contribution.
/// </summary>
/// <param name="y">Y coordinate</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetY(int y)
{
    int gobRow = y / GobHeight;

    int robPart = (gobRow >> _bhShift) * _robSize;
    int gobPart = (gobRow & _bhMask) * GobSize;
    int rowPairPart = ((y & 0x07) >> 1) << 6; // Bits 1-2 of y land on bits 6-7 of the offset.
    int rowBitPart = (y & 0x01) << 4;         // Bit 0 of y lands on bit 4 of the offset.

    int yOffset = robPart + gobPart + rowPairPart + rowBitPart;

    _yPart = yOffset;
    _yzPart = yOffset + _zPart;
}
/// <summary>
/// Sets the current Z coordinate for the built in iteration.
/// The Z contribution to the offset is calculated once here and cached,
/// then combined with the Y contribution that was last set.
/// </summary>
/// <param name="z">Z coordinate</param>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public void SetZ(int z)
{
    int slicePart = (z >> _bdShift) * _sliceSize;
    int gobPart = ((z & _bdMask) * GobSize) << _bhShift;

    int zOffset = slicePart + gobPart;

    _zPart = zOffset;
    _yzPart = zOffset + _yPart;
}
/// <summary>
/// Converts a byte offset within a line into an absolute offset, using the Y and Z contributions
/// that were last set. The low 4 bits of the input are not used, so x must be divisible by 16.
/// </summary>
/// <param name="x">Offset in bytes from the start of the line</param>
/// <returns>The absolute offset in bytes</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetOffsetWithLineOffset16(int x)
{
    int gobColumnPart = (x / GobStride) << _xShift;
    int bit5Part = ((x & 0x3f) >> 5) << 8; // Bit 5 of x lands on bit 8 of the offset.
    int bit4Part = ((x & 0x1f) >> 4) << 5; // Bit 4 of x lands on bit 5 of the offset.

    return gobColumnPart + bit5Part + bit4Part + _yzPart;
}
/// <summary>
/// Converts a byte offset within a line into an absolute offset, using the Y and Z contributions
/// that were last set. Only the x / GobStride quotient is used, so x must be divisible by 64.
/// </summary>
/// <param name="x">Offset in bytes from the start of the line</param>
/// <returns>The absolute offset in bytes</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetOffsetWithLineOffset64(int x) => ((x / GobStride) << _xShift) + _yzPart;
/// <summary>
/// Converts a X coordinate in pixels into an absolute offset, using the Y and Z contributions
/// that were last set.
/// </summary>
/// <param name="x">X coordinate in pixels</param>
/// <returns>The absolute offset in bytes</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetOffset(int x)
{
    // Scale the coordinate by the pixel size shift before splitting it into its parts.
    int lineOffset = x << _bppShift;

    int gobColumnPart = (lineOffset / GobStride) << _xShift;
    int bit5Part = ((lineOffset & 0x3f) >> 5) << 8;
    int bit4Part = ((lineOffset & 0x1f) >> 4) << 5;
    int lowPart = lineOffset & 0x0f;

    return gobColumnPart + bit5Part + bit4Part + lowPart + _yzPart;
}
} }
} }

View file

@ -0,0 +1,11 @@
using System.Runtime.InteropServices;
namespace Ryujinx.Graphics.Texture
{
/// <summary>
/// Opaque 12 byte value, used as the element type when copying pixels that are 12 bytes wide.
/// The layout is sequential with a packing of 1 and an explicit size of 12,
/// so the 8 byte and 4 byte fields are stored back to back without any padding.
/// </summary>
[StructLayout(LayoutKind.Sequential, Pack = 1, Size = 12)]
public struct Bpp12Pixel
{
// The fields only provide storage, the struct exposes no members.
private ulong _elem1;
private uint _elem2;
}
}

View file

@ -1,6 +1,6 @@
using Ryujinx.Common; using Ryujinx.Common;
using System; using System;
using System.Runtime.Intrinsics;
using static Ryujinx.Graphics.Texture.BlockLinearConstants; using static Ryujinx.Graphics.Texture.BlockLinearConstants;
namespace Ryujinx.Graphics.Texture namespace Ryujinx.Graphics.Texture
@ -64,11 +64,14 @@ namespace Ryujinx.Graphics.Texture
} }
int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16); int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);
int xStart = strideTrunc / bytesPerPixel; int xStart = strideTrunc / bytesPerPixel;
int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int outStrideGap = stride - w * bytesPerPixel;
int alignment = gobWidth; int alignment = gobWidth;
if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight) if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
@ -86,36 +89,74 @@ namespace Ryujinx.Graphics.Texture
mipGobBlocksInZ, mipGobBlocksInZ,
bytesPerPixel); bytesPerPixel);
for (int layer = 0; layer < layers; layer++) unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
{ {
int inBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level); fixed (byte* outputPtr = output, dataPtr = data)
for (int z = 0; z < d; z++)
for (int y = 0; y < h; y++)
{ {
for (int x = 0; x < strideTrunc; x += 16) byte* outPtr = outputPtr + outOffs;
for (int layer = 0; layer < layers; layer++)
{ {
int offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset(x, y, z); byte* inBaseOffset = dataPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
Span<byte> dest = output.Slice(outOffs + x, 16); for (int z = 0; z < d; z++)
{
layoutConverter.SetZ(z);
for (int y = 0; y < h; y++)
{
layoutConverter.SetY(y);
data.Slice(offset, 16).CopyTo(dest); for (int x = 0; x < strideTrunc64; x += 64, outPtr += 64)
{
byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
byte* offset2 = offset + 0x20;
byte* offset3 = offset + 0x100;
byte* offset4 = offset + 0x120;
Vector128<byte> value = *(Vector128<byte>*)offset;
Vector128<byte> value2 = *(Vector128<byte>*)offset2;
Vector128<byte> value3 = *(Vector128<byte>*)offset3;
Vector128<byte> value4 = *(Vector128<byte>*)offset4;
*(Vector128<byte>*)outPtr = value;
*(Vector128<byte>*)(outPtr + 16) = value2;
*(Vector128<byte>*)(outPtr + 32) = value3;
*(Vector128<byte>*)(outPtr + 48) = value4;
}
for (int x = strideTrunc64; x < strideTrunc; x += 16, outPtr += 16)
{
byte* offset = inBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
*(Vector128<byte>*)outPtr = *(Vector128<byte>*)offset;
}
for (int x = xStart; x < w; x++, outPtr += bytesPerPixel)
{
byte* offset = inBaseOffset + layoutConverter.GetOffset(x);
*(T*)outPtr = *(T*)offset;
}
outPtr += outStrideGap;
}
}
} }
outOffs += stride * h * d * layers;
for (int x = xStart; x < w; x++)
{
int offset = inBaseOffset + layoutConverter.GetOffset(x, y, z);
Span<byte> dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel);
data.Slice(offset, bytesPerPixel).CopyTo(dest);
}
outOffs += stride;
} }
return true;
} }
}
// Select the Convert<T> instantiation whose element size equals the pixel size in bytes.
// The result is discarded; the switch expression is only used for dispatch.
// Throws NotSupportedException for any pixel size without a matching element type.
bool _ = bytesPerPixel switch
{
    1 => Convert<byte>(output, data),
    2 => Convert<ushort>(output, data),
    4 => Convert<uint>(output, data),
    8 => Convert<ulong>(output, data),
    12 => Convert<Bpp12Pixel>(output, data),
    16 => Convert<Vector128<byte>>(output, data),
    // C# interpolation holes are written {expr}; the previous "${bytesPerPixel}" emitted a stray '$' before the value.
    _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
};
}
return output; return output;
} }
@ -132,22 +173,18 @@ namespace Ryujinx.Graphics.Texture
int h = BitUtils.DivRoundUp(height, blockHeight); int h = BitUtils.DivRoundUp(height, blockHeight);
int outStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); int outStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int lineSize = w * bytesPerPixel;
Span<byte> output = new byte[h * outStride]; Span<byte> output = new byte[h * outStride];
int outOffs = 0; int outOffs = 0;
int inOffs = 0;
for (int y = 0; y < h; y++) for (int y = 0; y < h; y++)
{ {
for (int x = 0; x < w; x++) data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));
{
int offset = y * stride + x * bytesPerPixel;
Span<byte> dest = output.Slice(outOffs + x * bytesPerPixel, bytesPerPixel);
data.Slice(offset, bytesPerPixel).CopyTo(dest);
}
inOffs += stride;
outOffs += outStride; outOffs += outStride;
} }
@ -198,8 +235,15 @@ namespace Ryujinx.Graphics.Texture
mipGobBlocksInZ >>= 1; mipGobBlocksInZ >>= 1;
} }
int strideTrunc = BitUtils.AlignDown(w * bytesPerPixel, 16);
int strideTrunc64 = BitUtils.AlignDown(w * bytesPerPixel, 64);
int xStart = strideTrunc / bytesPerPixel;
int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); int stride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int inStrideGap = stride - w * bytesPerPixel;
int alignment = gobWidth; int alignment = gobWidth;
if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight) if (d < gobBlocksInZ || w <= gobWidth || h <= gobHeight)
@ -217,25 +261,73 @@ namespace Ryujinx.Graphics.Texture
mipGobBlocksInZ, mipGobBlocksInZ,
bytesPerPixel); bytesPerPixel);
for (int layer = 0; layer < layers; layer++) unsafe bool Convert<T>(Span<byte> output, ReadOnlySpan<byte> data) where T : unmanaged
{ {
int outBaseOffset = layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level); fixed (byte* outputPtr = output, dataPtr = data)
for (int z = 0; z < d; z++)
for (int y = 0; y < h; y++)
{ {
for (int x = 0; x < w; x++) byte* inPtr = dataPtr + inOffs;
for (int layer = 0; layer < layers; layer++)
{ {
int offset = outBaseOffset + layoutConverter.GetOffset(x, y, z); byte* outBaseOffset = outputPtr + (layer * sizeInfo.LayerSize + sizeInfo.GetMipOffset(level));
Span<byte> dest = output.Slice(offset, bytesPerPixel); for (int z = 0; z < d; z++)
{
layoutConverter.SetZ(z);
for (int y = 0; y < h; y++)
{
layoutConverter.SetY(y);
data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest); for (int x = 0; x < strideTrunc64; x += 64, inPtr += 64)
{
byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset64(x);
byte* offset2 = offset + 0x20;
byte* offset3 = offset + 0x100;
byte* offset4 = offset + 0x120;
Vector128<byte> value = *(Vector128<byte>*)inPtr;
Vector128<byte> value2 = *(Vector128<byte>*)(inPtr + 16);
Vector128<byte> value3 = *(Vector128<byte>*)(inPtr + 32);
Vector128<byte> value4 = *(Vector128<byte>*)(inPtr + 48);
*(Vector128<byte>*)offset = value;
*(Vector128<byte>*)offset2 = value2;
*(Vector128<byte>*)offset3 = value3;
*(Vector128<byte>*)offset4 = value4;
}
for (int x = strideTrunc64; x < strideTrunc; x += 16, inPtr += 16)
{
byte* offset = outBaseOffset + layoutConverter.GetOffsetWithLineOffset16(x);
*(Vector128<byte>*)offset = *(Vector128<byte>*)inPtr;
}
for (int x = xStart; x < w; x++, inPtr += bytesPerPixel)
{
byte* offset = outBaseOffset + layoutConverter.GetOffset(x);
*(T*)offset = *(T*)inPtr;
}
inPtr += inStrideGap;
}
}
} }
inOffs += stride * h * d * layers;
inOffs += stride;
} }
return true;
} }
// Select the Convert<T> instantiation whose element size equals the pixel size in bytes.
// The result is discarded; the switch expression is only used for dispatch.
// Throws NotSupportedException for any pixel size without a matching element type.
bool _ = bytesPerPixel switch
{
    1 => Convert<byte>(output, data),
    2 => Convert<ushort>(output, data),
    4 => Convert<uint>(output, data),
    8 => Convert<ulong>(output, data),
    12 => Convert<Bpp12Pixel>(output, data),
    16 => Convert<Vector128<byte>>(output, data),
    // C# interpolation holes are written {expr}; the previous "${bytesPerPixel}" emitted a stray '$' before the value.
    _ => throw new NotSupportedException($"Unable to convert {bytesPerPixel} bpp pixel format.")
};
} }
return output; return output;
@ -254,23 +346,19 @@ namespace Ryujinx.Graphics.Texture
int h = BitUtils.DivRoundUp(height, blockHeight); int h = BitUtils.DivRoundUp(height, blockHeight);
int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment); int inStride = BitUtils.AlignUp(w * bytesPerPixel, HostStrideAlignment);
int lineSize = width * bytesPerPixel;
Span<byte> output = new byte[h * stride]; Span<byte> output = new byte[h * stride];
int inOffs = 0; int inOffs = 0;
int outOffs = 0;
for (int y = 0; y < h; y++) for (int y = 0; y < h; y++)
{ {
for (int x = 0; x < w; x++) data.Slice(inOffs, lineSize).CopyTo(output.Slice(outOffs, lineSize));
{
int offset = y * stride + x * bytesPerPixel;
Span<byte> dest = output.Slice(offset, bytesPerPixel);
data.Slice(inOffs + x * bytesPerPixel, bytesPerPixel).CopyTo(dest);
}
inOffs += inStride; inOffs += inStride;
outOffs += stride;
} }
return output; return output;

View file

@ -1,17 +1,22 @@
using Ryujinx.Common; using Ryujinx.Common;
using System.Runtime.CompilerServices;
using static Ryujinx.Graphics.Texture.BlockLinearConstants; using static Ryujinx.Graphics.Texture.BlockLinearConstants;
namespace Ryujinx.Graphics.Texture namespace Ryujinx.Graphics.Texture
{ {
public class OffsetCalculator public class OffsetCalculator
{ {
private int _width;
private int _height;
private int _stride; private int _stride;
private bool _isLinear; private bool _isLinear;
private int _bytesPerPixel; private int _bytesPerPixel;
private BlockLinearLayout _layoutConverter; private BlockLinearLayout _layoutConverter;
// Variables for built in iteration.
private int _yPart;
public OffsetCalculator( public OffsetCalculator(
int width, int width,
int height, int height,
@ -20,6 +25,8 @@ namespace Ryujinx.Graphics.Texture
int gobBlocksInY, int gobBlocksInY,
int bytesPerPixel) int bytesPerPixel)
{ {
_width = width;
_height = height;
_stride = stride; _stride = stride;
_isLinear = isLinear; _isLinear = isLinear;
_bytesPerPixel = bytesPerPixel; _bytesPerPixel = bytesPerPixel;
@ -40,6 +47,18 @@ namespace Ryujinx.Graphics.Texture
} }
} }
/// <summary>
/// Sets the current Y coordinate for the built in iteration.
/// For linear layouts the line start offset is cached here,
/// otherwise the coordinate is forwarded to the block linear layout converter.
/// </summary>
/// <param name="y">Y coordinate</param>
public void SetY(int y)
{
    if (!_isLinear)
    {
        _layoutConverter.SetY(y);

        return;
    }

    _yPart = y * _stride;
}
public int GetOffset(int x, int y) public int GetOffset(int x, int y)
{ {
if (_isLinear) if (_isLinear)
@ -51,5 +70,48 @@ namespace Ryujinx.Graphics.Texture
return _layoutConverter.GetOffset(x, y, 0); return _layoutConverter.GetOffset(x, y, 0);
} }
} }
/// <summary>
/// Gets the offset of the pixel at a given X coordinate, on the line that was last set with SetY.
/// </summary>
/// <param name="x">X coordinate in pixels</param>
/// <returns>The offset in bytes</returns>
[MethodImpl(MethodImplOptions.AggressiveInlining)]
public int GetOffset(int x)
{
    return _isLinear
        ? x * _bytesPerPixel + _yPart
        : _layoutConverter.GetOffset(x);
}
/// <summary>
/// Computes the byte range that contains every pixel of a rectangle.
/// </summary>
/// <param name="x">X coordinate of the first pixel of the rectangle</param>
/// <param name="y">Y coordinate of the first pixel of the rectangle</param>
/// <param name="width">Width of the rectangle in pixels</param>
/// <param name="height">Height of the rectangle in pixels</param>
/// <returns>The offset of the first byte of the range, and the size of the range in bytes</returns>
public (int offset, int size) GetRectangleRange(int x, int y, int width, int height)
{
    if (!_isLinear)
    {
        return _layoutConverter.GetRectangleRange(x, y, width, height);
    }

    // The range starts at the first pixel of the first line,
    // and ends right after the last pixel of the last line.
    int lastLine = y + height - 1;

    int firstByte = y * _stride + x * _bytesPerPixel;
    int pastLastByte = lastLine * _stride + (x + width) * _bytesPerPixel;

    return (firstByte, pastLastByte - firstByte);
}
/// <summary>
/// Checks whether another calculator describes the same layout as this one.
/// Two linear layouts match when their width, height, stride and bytes per pixel are equal.
/// Two block linear layouts match when their layout converters match.
/// A linear layout never matches a block linear one.
/// </summary>
/// <param name="other">Calculator to compare against</param>
/// <returns>True if the layouts match, false otherwise</returns>
public bool LayoutMatches(OffsetCalculator other)
{
    if (_isLinear != other._isLinear)
    {
        return false;
    }

    if (!_isLinear)
    {
        return _layoutConverter.LayoutMatches(other._layoutConverter);
    }

    return _width == other._width &&
           _height == other._height &&
           _stride == other._stride &&
           _bytesPerPixel == other._bytesPerPixel;
}
} }
} }