From 6ed613a6e6a66d57d2fdb045d926e42dfcdd3206 Mon Sep 17 00:00:00 2001 From: gdkchan Date: Wed, 16 Aug 2023 21:31:07 -0300 Subject: [PATCH] Fix vote and shuffle shader instructions on AMD GPUs (#5540) * Move shuffle handling out of the backend to a transform pass * Handle subgroup sizes higher than 32 * Stop using the subgroup size control extension * Make GenerateShuffleFunction static * Shader cache version bump --- src/Ryujinx.Graphics.GAL/Capabilities.cs | 3 + .../Shader/DiskCache/DiskCacheHostStorage.cs | 2 +- .../Shader/GpuAccessorBase.cs | 2 + src/Ryujinx.Graphics.OpenGL/Constants.cs | 1 + src/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs | 1 + .../CodeGen/Glsl/Declarations.cs | 21 +-- .../HelperFunctions/HelperFunctionNames.cs | 4 - .../CodeGen/Glsl/HelperFunctions/Shuffle.glsl | 11 -- .../Glsl/HelperFunctions/ShuffleDown.glsl | 11 -- .../Glsl/HelperFunctions/ShuffleUp.glsl | 9 -- .../Glsl/HelperFunctions/ShuffleXor.glsl | 11 -- .../CodeGen/Glsl/Instructions/InstGen.cs | 4 + .../Glsl/Instructions/InstGenBallot.cs | 5 +- .../Glsl/Instructions/InstGenHelper.cs | 8 +- .../Glsl/Instructions/InstGenShuffle.cs | 25 +++ .../CodeGen/Spirv/Instructions.cs | 95 ++---------- .../CodeGen/Spirv/SpirvGenerator.cs | 7 +- .../Decoders/Decoder.cs | 3 + src/Ryujinx.Graphics.Shader/IGpuAccessor.cs | 9 ++ .../Instructions/InstEmitMove.cs | 58 ++++++- .../Instructions/InstEmitWarp.cs | 91 +++++++++-- .../Ryujinx.Graphics.Shader.csproj | 4 - .../StructuredIr/HelperFunctionsMask.cs | 4 - .../StructuredIr/InstructionInfo.cs | 11 +- .../StructuredIr/StructuredProgram.cs | 12 -- .../Translation/EmitterContextInsts.cs | 28 +++- .../Translation/FeatureFlags.cs | 1 + .../Translation/HelperFunctionManager.cs | 145 ++++++++++++++++++ .../Translation/HelperFunctionName.cs | 6 + .../Translation/Transforms/ShufflePass.cs | 52 +++++++ .../Translation/Transforms/TransformPasses.cs | 1 + .../HardwareCapabilities.cs | 15 +- src/Ryujinx.Graphics.Vulkan/PipelineState.cs | 23 --- .../VulkanInitialization.cs | 1 - src/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs | 26 ++-- 35 files changed, 445 insertions(+), 265 deletions(-) delete mode 100644 src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl delete mode 100644 src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl delete mode 100644 src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl delete mode 100644 src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl create mode 100644 src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenShuffle.cs create mode 100644 src/Ryujinx.Graphics.Shader/Translation/Transforms/ShufflePass.cs diff --git a/src/Ryujinx.Graphics.GAL/Capabilities.cs b/src/Ryujinx.Graphics.GAL/Capabilities.cs index f4b1d4d10..d41f8e59f 100644 --- a/src/Ryujinx.Graphics.GAL/Capabilities.cs +++ b/src/Ryujinx.Graphics.GAL/Capabilities.cs @@ -52,6 +52,7 @@ namespace Ryujinx.Graphics.GAL public readonly int MaximumComputeSharedMemorySize; public readonly float MaximumSupportedAnisotropy; + public readonly int ShaderSubgroupSize; public readonly int StorageBufferOffsetAlignment; public readonly int GatherBiasPrecision; @@ -101,6 +102,7 @@ namespace Ryujinx.Graphics.GAL uint maximumImagesPerStage, int maximumComputeSharedMemorySize, float maximumSupportedAnisotropy, + int shaderSubgroupSize, int storageBufferOffsetAlignment, int gatherBiasPrecision) { @@ -148,6 +150,7 @@ namespace Ryujinx.Graphics.GAL MaximumImagesPerStage = maximumImagesPerStage; MaximumComputeSharedMemorySize = maximumComputeSharedMemorySize; MaximumSupportedAnisotropy = maximumSupportedAnisotropy; + ShaderSubgroupSize = shaderSubgroupSize; StorageBufferOffsetAlignment = storageBufferOffsetAlignment; GatherBiasPrecision = gatherBiasPrecision; } diff --git a/src/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs b/src/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs index 9afc5b618..71a738255 100644 --- a/src/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs +++ b/src/Ryujinx.Graphics.Gpu/Shader/DiskCache/DiskCacheHostStorage.cs @@ -22,7 +22,7 @@ namespace Ryujinx.Graphics.Gpu.Shader.DiskCache private const ushort FileFormatVersionMajor = 1; private const ushort FileFormatVersionMinor = 2; private const uint FileFormatVersionPacked = ((uint)FileFormatVersionMajor << 16) | FileFormatVersionMinor; - private const uint CodeGenVersion = 5576; + private const uint CodeGenVersion = 5540; private const string SharedTocFileName = "shared.toc"; private const string SharedDataFileName = "shared.data"; diff --git a/src/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs b/src/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs index e7a2d345f..52193940b 100644 --- a/src/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs +++ b/src/Ryujinx.Graphics.Gpu/Shader/GpuAccessorBase.cs @@ -137,6 +137,8 @@ namespace Ryujinx.Graphics.Gpu.Shader public int QueryHostStorageBufferOffsetAlignment() => _context.Capabilities.StorageBufferOffsetAlignment; + public int QueryHostSubgroupSize() => _context.Capabilities.ShaderSubgroupSize; + public bool QueryHostSupportsBgraFormat() => _context.Capabilities.SupportsBgraFormat; public bool QueryHostSupportsFragmentShaderInterlock() => _context.Capabilities.SupportsFragmentShaderInterlock; diff --git a/src/Ryujinx.Graphics.OpenGL/Constants.cs b/src/Ryujinx.Graphics.OpenGL/Constants.cs index 8817011a9..38fedea0d 100644 --- a/src/Ryujinx.Graphics.OpenGL/Constants.cs +++ b/src/Ryujinx.Graphics.OpenGL/Constants.cs @@ -7,5 +7,6 @@ public const int MaxVertexAttribs = 16; public const int MaxVertexBuffers = 16; public const int MaxTransformFeedbackBuffers = 4; + public const int MaxSubgroupSize = 64; } } diff --git a/src/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs b/src/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs index 8a7ac8559..35d1569fe 100644 --- a/src/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs +++ b/src/Ryujinx.Graphics.OpenGL/OpenGLRenderer.cs @@ -175,6 +175,7 @@ namespace Ryujinx.Graphics.OpenGL maximumImagesPerStage: 8, maximumComputeSharedMemorySize: HwCapabilities.MaximumComputeSharedMemorySize, maximumSupportedAnisotropy: HwCapabilities.MaximumSupportedAnisotropy, + shaderSubgroupSize: Constants.MaxSubgroupSize, storageBufferOffsetAlignment: HwCapabilities.StorageBufferOffsetAlignment, gatherBiasPrecision: intelWindows || amdWindows ? 8 : 0); // Precision is 8 for these vendors on Vulkan. } diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs index e181ae98d..607ff431e 100644 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Declarations.cs @@ -25,6 +25,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl { context.AppendLine("#extension GL_KHR_shader_subgroup_basic : enable"); context.AppendLine("#extension GL_KHR_shader_subgroup_ballot : enable"); + context.AppendLine("#extension GL_KHR_shader_subgroup_shuffle : enable"); } context.AppendLine("#extension GL_ARB_shader_group_vote : enable"); @@ -201,26 +202,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/MultiplyHighU32.glsl"); } - if ((info.HelperFunctionsMask & HelperFunctionsMask.Shuffle) != 0) - { - AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl"); - } - - if ((info.HelperFunctionsMask & HelperFunctionsMask.ShuffleDown) != 0) - { - AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl"); - } - - if ((info.HelperFunctionsMask & HelperFunctionsMask.ShuffleUp) != 0) - { - AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl"); - } - - if ((info.HelperFunctionsMask & HelperFunctionsMask.ShuffleXor) != 0) - { - AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl"); - } - if ((info.HelperFunctionsMask & HelperFunctionsMask.SwizzleAdd) != 0) { AppendHelperFunction(context, "Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/SwizzleAdd.glsl"); diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs index 221802727..0b80ac2b6 100644 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/HelperFunctionNames.cs @@ -5,10 +5,6 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl public static string MultiplyHighS32 = "Helper_MultiplyHighS32"; public static string MultiplyHighU32 = "Helper_MultiplyHighU32"; - public static string Shuffle = "Helper_Shuffle"; - public static string ShuffleDown = "Helper_ShuffleDown"; - public static string ShuffleUp = "Helper_ShuffleUp"; - public static string ShuffleXor = "Helper_ShuffleXor"; public static string SwizzleAdd = "Helper_SwizzleAdd"; } } diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl deleted file mode 100644 index 7cb4764dd..000000000 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/Shuffle.glsl +++ /dev/null @@ -1,11 +0,0 @@ -float Helper_Shuffle(float x, uint index, uint mask, out bool valid) -{ - uint clamp = mask & 0x1fu; - uint segMask = (mask >> 8) & 0x1fu; - uint minThreadId = $SUBGROUP_INVOCATION$ & segMask; - uint maxThreadId = minThreadId | (clamp & ~segMask); - uint srcThreadId = (index & ~segMask) | minThreadId; - valid = srcThreadId <= maxThreadId; - float v = $SUBGROUP_BROADCAST$(x, srcThreadId); - return valid ? v : x; -} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl deleted file mode 100644 index 71d901d5d..000000000 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleDown.glsl +++ /dev/null @@ -1,11 +0,0 @@ -float Helper_ShuffleDown(float x, uint index, uint mask, out bool valid) -{ - uint clamp = mask & 0x1fu; - uint segMask = (mask >> 8) & 0x1fu; - uint minThreadId = $SUBGROUP_INVOCATION$ & segMask; - uint maxThreadId = minThreadId | (clamp & ~segMask); - uint srcThreadId = $SUBGROUP_INVOCATION$ + index; - valid = srcThreadId <= maxThreadId; - float v = $SUBGROUP_BROADCAST$(x, srcThreadId); - return valid ? v : x; -} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl deleted file mode 100644 index ae264d870..000000000 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleUp.glsl +++ /dev/null @@ -1,9 +0,0 @@ -float Helper_ShuffleUp(float x, uint index, uint mask, out bool valid) -{ - uint segMask = (mask >> 8) & 0x1fu; - uint minThreadId = $SUBGROUP_INVOCATION$ & segMask; - uint srcThreadId = $SUBGROUP_INVOCATION$ - index; - valid = int(srcThreadId) >= int(minThreadId); - float v = $SUBGROUP_BROADCAST$(x, srcThreadId); - return valid ? v : x; -} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl deleted file mode 100644 index 789089d69..000000000 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/HelperFunctions/ShuffleXor.glsl +++ /dev/null @@ -1,11 +0,0 @@ -float Helper_ShuffleXor(float x, uint index, uint mask, out bool valid) -{ - uint clamp = mask & 0x1fu; - uint segMask = (mask >> 8) & 0x1fu; - uint minThreadId = $SUBGROUP_INVOCATION$ & segMask; - uint maxThreadId = minThreadId | (clamp & ~segMask); - uint srcThreadId = $SUBGROUP_INVOCATION$ ^ index; - valid = srcThreadId <= maxThreadId; - float v = $SUBGROUP_BROADCAST$(x, srcThreadId); - return valid ? v : x; -} \ No newline at end of file diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs index 9208ceead..796eb4417 100644 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGen.cs @@ -9,6 +9,7 @@ using static Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions.InstGenFSI; using static Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions.InstGenHelper; using static Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions.InstGenMemory; using static Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions.InstGenPacking; +using static Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions.InstGenShuffle; using static Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions.InstGenVector; using static Ryujinx.Graphics.Shader.StructuredIr.InstructionInfo; @@ -174,6 +175,9 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions case Instruction.PackHalf2x16: return PackHalf2x16(context, operation); + case Instruction.Shuffle: + return Shuffle(context, operation); + case Instruction.Store: return Store(context, operation); diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenBallot.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenBallot.cs index b44759c0d..6cc7048bd 100644 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenBallot.cs +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenBallot.cs @@ -13,14 +13,15 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions AggregateType dstType = GetSrcVarType(operation.Inst, 0); string arg = GetSoureExpr(context, operation.GetSource(0), dstType); + char component = "xyzw"[operation.Index]; if (context.HostCapabilities.SupportsShaderBallot) { - return $"unpackUint2x32(ballotARB({arg})).x"; + return $"unpackUint2x32(ballotARB({arg})).{component}"; } else { - return $"subgroupBallot({arg}).x"; + return $"subgroupBallot({arg}).{component}"; } } } diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs index c3d52b2c5..eb194c209 100644 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenHelper.cs @@ -108,10 +108,10 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions Add(Instruction.ShiftLeft, InstType.OpBinary, "<<", 3); Add(Instruction.ShiftRightS32, InstType.OpBinary, ">>", 3); Add(Instruction.ShiftRightU32, InstType.OpBinary, ">>", 3); - Add(Instruction.Shuffle, InstType.CallQuaternary, HelperFunctionNames.Shuffle); - Add(Instruction.ShuffleDown, InstType.CallQuaternary, HelperFunctionNames.ShuffleDown); - Add(Instruction.ShuffleUp, InstType.CallQuaternary, HelperFunctionNames.ShuffleUp); - Add(Instruction.ShuffleXor, InstType.CallQuaternary, HelperFunctionNames.ShuffleXor); + Add(Instruction.Shuffle, InstType.Special); + Add(Instruction.ShuffleDown, InstType.CallBinary, "subgroupShuffleDown"); + Add(Instruction.ShuffleUp, InstType.CallBinary, "subgroupShuffleUp"); + Add(Instruction.ShuffleXor, InstType.CallBinary, "subgroupShuffleXor"); Add(Instruction.Sine, InstType.CallUnary, "sin"); Add(Instruction.SquareRoot, InstType.CallUnary, "sqrt"); Add(Instruction.Store, InstType.Special); diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenShuffle.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenShuffle.cs new file mode 100644 index 000000000..6d3859efd --- /dev/null +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Glsl/Instructions/InstGenShuffle.cs @@ -0,0 +1,25 @@ +using Ryujinx.Graphics.Shader.StructuredIr; +using Ryujinx.Graphics.Shader.Translation; + +using static Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions.InstGenHelper; + +namespace Ryujinx.Graphics.Shader.CodeGen.Glsl.Instructions +{ + static class InstGenShuffle + { + public static string Shuffle(CodeGenContext context, AstOperation operation) + { + string value = GetSoureExpr(context, operation.GetSource(0), AggregateType.FP32); + string index = GetSoureExpr(context, operation.GetSource(1), AggregateType.U32); + + if (context.HostCapabilities.SupportsShaderBallot) + { + return $"readInvocationARB({value}, {index})"; + } + else + { + return $"subgroupShuffle({value}, {index})"; + } + } + } +} diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs index 98c1b9d28..719ccf0cf 100644 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/Instructions.cs @@ -231,7 +231,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv var execution = context.Constant(context.TypeU32(), Scope.Subgroup); var maskVector = context.GroupNonUniformBallot(uvec4Type, execution, context.Get(AggregateType.Bool, source)); - var mask = context.CompositeExtract(context.TypeU32(), maskVector, (SpvLiteralInteger)0); + var mask = context.CompositeExtract(context.TypeU32(), maskVector, (SpvLiteralInteger)operation.Index); return new OperationResult(AggregateType.U32, mask); } @@ -1100,117 +1100,40 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv private static OperationResult GenerateShuffle(CodeGenContext context, AstOperation operation) { - var x = context.GetFP32(operation.GetSource(0)); + var value = context.GetFP32(operation.GetSource(0)); var index = context.GetU32(operation.GetSource(1)); - var mask = context.GetU32(operation.GetSource(2)); - var const31 = context.Constant(context.TypeU32(), 31); - var const8 = context.Constant(context.TypeU32(), 8); - - var clamp = context.BitwiseAnd(context.TypeU32(), mask, const31); - var segMask = context.BitwiseAnd(context.TypeU32(), context.ShiftRightLogical(context.TypeU32(), mask, const8), const31); - var notSegMask = context.Not(context.TypeU32(), segMask); - var clampNotSegMask = context.BitwiseAnd(context.TypeU32(), clamp, notSegMask); - var indexNotSegMask = context.BitwiseAnd(context.TypeU32(), index, notSegMask); - - var threadId = GetScalarInput(context, IoVariable.SubgroupLaneId); - - var minThreadId = context.BitwiseAnd(context.TypeU32(), threadId, segMask); - var maxThreadId = context.BitwiseOr(context.TypeU32(), minThreadId, clampNotSegMask); - var srcThreadId = context.BitwiseOr(context.TypeU32(), indexNotSegMask, minThreadId); - var valid = context.ULessThanEqual(context.TypeBool(), srcThreadId, maxThreadId); - var value = context.GroupNonUniformShuffle(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), x, srcThreadId); - var result = context.Select(context.TypeFP32(), valid, value, x); - - var validLocal = (AstOperand)operation.GetSource(3); - - context.Store(context.GetLocalPointer(validLocal), context.BitcastIfNeeded(validLocal.VarType, AggregateType.Bool, valid)); + var result = context.GroupNonUniformShuffle(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), value, index); return new OperationResult(AggregateType.FP32, result); } private static OperationResult GenerateShuffleDown(CodeGenContext context, AstOperation operation) { - var x = context.GetFP32(operation.GetSource(0)); + var value = context.GetFP32(operation.GetSource(0)); var index = context.GetU32(operation.GetSource(1)); - var mask = context.GetU32(operation.GetSource(2)); - var const31 = context.Constant(context.TypeU32(), 31); - var const8 = context.Constant(context.TypeU32(), 8); - - var clamp = context.BitwiseAnd(context.TypeU32(), mask, const31); - var segMask = context.BitwiseAnd(context.TypeU32(), context.ShiftRightLogical(context.TypeU32(), mask, const8), const31); - var notSegMask = context.Not(context.TypeU32(), segMask); - var clampNotSegMask = context.BitwiseAnd(context.TypeU32(), clamp, notSegMask); - - var threadId = GetScalarInput(context, IoVariable.SubgroupLaneId); - - var minThreadId = context.BitwiseAnd(context.TypeU32(), threadId, segMask); - var maxThreadId = context.BitwiseOr(context.TypeU32(), minThreadId, clampNotSegMask); - var srcThreadId = context.IAdd(context.TypeU32(), threadId, index); - var valid = context.ULessThanEqual(context.TypeBool(), srcThreadId, maxThreadId); - var value = context.GroupNonUniformShuffle(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), x, srcThreadId); - var result = context.Select(context.TypeFP32(), valid, value, x); - - var validLocal = (AstOperand)operation.GetSource(3); - - context.Store(context.GetLocalPointer(validLocal), context.BitcastIfNeeded(validLocal.VarType, AggregateType.Bool, valid)); + var result = context.GroupNonUniformShuffleDown(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), value, index); return new OperationResult(AggregateType.FP32, result); } private static OperationResult GenerateShuffleUp(CodeGenContext context, AstOperation operation) { - var x = context.GetFP32(operation.GetSource(0)); + var value = context.GetFP32(operation.GetSource(0)); var index = context.GetU32(operation.GetSource(1)); - var mask = context.GetU32(operation.GetSource(2)); - var const31 = context.Constant(context.TypeU32(), 31); - var const8 = context.Constant(context.TypeU32(), 8); - - var segMask = context.BitwiseAnd(context.TypeU32(), context.ShiftRightLogical(context.TypeU32(), mask, const8), const31); - - var threadId = GetScalarInput(context, IoVariable.SubgroupLaneId); - - var minThreadId = context.BitwiseAnd(context.TypeU32(), threadId, segMask); - var srcThreadId = context.ISub(context.TypeU32(), threadId, index); - var valid = context.SGreaterThanEqual(context.TypeBool(), srcThreadId, minThreadId); - var value = context.GroupNonUniformShuffle(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), x, srcThreadId); - var result = context.Select(context.TypeFP32(), valid, value, x); - - var validLocal = (AstOperand)operation.GetSource(3); - - context.Store(context.GetLocalPointer(validLocal), context.BitcastIfNeeded(validLocal.VarType, AggregateType.Bool, valid)); + var result = context.GroupNonUniformShuffleUp(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), value, index); return new OperationResult(AggregateType.FP32, result); } private static OperationResult GenerateShuffleXor(CodeGenContext context, AstOperation operation) { - var x = context.GetFP32(operation.GetSource(0)); + var value = context.GetFP32(operation.GetSource(0)); var index = context.GetU32(operation.GetSource(1)); - var mask = context.GetU32(operation.GetSource(2)); - var const31 = context.Constant(context.TypeU32(), 31); - var const8 = context.Constant(context.TypeU32(), 8); - - var clamp = context.BitwiseAnd(context.TypeU32(), mask, const31); - var segMask = context.BitwiseAnd(context.TypeU32(), context.ShiftRightLogical(context.TypeU32(), mask, const8), const31); - var notSegMask = context.Not(context.TypeU32(), segMask); - var clampNotSegMask = context.BitwiseAnd(context.TypeU32(), clamp, notSegMask); - - var threadId = GetScalarInput(context, IoVariable.SubgroupLaneId); - - var minThreadId = context.BitwiseAnd(context.TypeU32(), threadId, segMask); - var maxThreadId = context.BitwiseOr(context.TypeU32(), minThreadId, clampNotSegMask); - var srcThreadId = context.BitwiseXor(context.TypeU32(), threadId, index); - var valid = context.ULessThanEqual(context.TypeBool(), srcThreadId, maxThreadId); - var value = context.GroupNonUniformShuffle(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), x, srcThreadId); - var result = context.Select(context.TypeFP32(), valid, value, x); - - var validLocal = (AstOperand)operation.GetSource(3); - - context.Store(context.GetLocalPointer(validLocal), context.BitcastIfNeeded(validLocal.VarType, AggregateType.Bool, valid)); + var result = context.GroupNonUniformShuffleXor(context.TypeFP32(), context.Constant(context.TypeU32(), (int)Scope.Subgroup), value, index); return new OperationResult(AggregateType.FP32, result); } diff --git a/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs b/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs index 5eee888e4..70f1dd3c4 100644 --- a/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs +++ b/src/Ryujinx.Graphics.Shader/CodeGen/Spirv/SpirvGenerator.cs @@ -28,12 +28,7 @@ namespace Ryujinx.Graphics.Shader.CodeGen.Spirv _poolLock = new object(); } - private const HelperFunctionsMask NeedsInvocationIdMask = - HelperFunctionsMask.Shuffle | - HelperFunctionsMask.ShuffleDown | - HelperFunctionsMask.ShuffleUp | - HelperFunctionsMask.ShuffleXor | - HelperFunctionsMask.SwizzleAdd; + private const HelperFunctionsMask NeedsInvocationIdMask = HelperFunctionsMask.SwizzleAdd; public static byte[] Generate(StructuredProgramInfo info, CodeGenParameters parameters) { diff --git a/src/Ryujinx.Graphics.Shader/Decoders/Decoder.cs b/src/Ryujinx.Graphics.Shader/Decoders/Decoder.cs index d18a9baf8..4266dedca 100644 --- a/src/Ryujinx.Graphics.Shader/Decoders/Decoder.cs +++ b/src/Ryujinx.Graphics.Shader/Decoders/Decoder.cs @@ -307,6 +307,9 @@ namespace Ryujinx.Graphics.Shader.Decoders case InstName.Sts: context.SetUsedFeature(FeatureFlags.SharedMemory); break; + case InstName.Shfl: + context.SetUsedFeature(FeatureFlags.Shuffle); + break; } block.OpCodes.Add(op); diff --git a/src/Ryujinx.Graphics.Shader/IGpuAccessor.cs b/src/Ryujinx.Graphics.Shader/IGpuAccessor.cs index ee31f02d1..ba10f2720 100644 --- a/src/Ryujinx.Graphics.Shader/IGpuAccessor.cs +++ b/src/Ryujinx.Graphics.Shader/IGpuAccessor.cs @@ -194,6 +194,15 @@ namespace Ryujinx.Graphics.Shader return 16; } + /// + /// Queries host shader subgroup size. + /// + /// Host shader subgroup size in invocations + int QueryHostSubgroupSize() + { + return 32; + } + /// /// Queries host support for texture formats with BGRA component order (such as BGRA8). /// diff --git a/src/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs b/src/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs index 9d1c7d087..944039d65 100644 --- a/src/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs +++ b/src/Ryujinx.Graphics.Shader/Instructions/InstEmitMove.cs @@ -76,7 +76,7 @@ namespace Ryujinx.Graphics.Shader.Instructions switch (op.SReg) { case SReg.LaneId: - src = context.Load(StorageKind.Input, IoVariable.SubgroupLaneId); + src = EmitLoadSubgroupLaneId(context); break; case SReg.InvocationId: @@ -146,19 +146,19 @@ namespace Ryujinx.Graphics.Shader.Instructions break; case SReg.EqMask: - src = context.Load(StorageKind.Input, IoVariable.SubgroupEqMask, null, Const(0)); + src = EmitLoadSubgroupMask(context, IoVariable.SubgroupEqMask); break; case SReg.LtMask: - src = context.Load(StorageKind.Input, IoVariable.SubgroupLtMask, null, Const(0)); + src = EmitLoadSubgroupMask(context, IoVariable.SubgroupLtMask); break; case SReg.LeMask: - src = context.Load(StorageKind.Input, IoVariable.SubgroupLeMask, null, Const(0)); + src = EmitLoadSubgroupMask(context, IoVariable.SubgroupLeMask); break; case SReg.GtMask: - src = context.Load(StorageKind.Input, IoVariable.SubgroupGtMask, null, Const(0)); + src = EmitLoadSubgroupMask(context, IoVariable.SubgroupGtMask); break; case SReg.GeMask: - src = context.Load(StorageKind.Input, IoVariable.SubgroupGeMask, null, Const(0)); + src = EmitLoadSubgroupMask(context, IoVariable.SubgroupGeMask); break; default: @@ -169,6 +169,52 @@ namespace Ryujinx.Graphics.Shader.Instructions context.Copy(GetDest(op.Dest), src); } + private static Operand EmitLoadSubgroupLaneId(EmitterContext context) + { + if (context.TranslatorContext.GpuAccessor.QueryHostSubgroupSize() <= 32) + { + return context.Load(StorageKind.Input, IoVariable.SubgroupLaneId); + } + + return context.BitwiseAnd(context.Load(StorageKind.Input, IoVariable.SubgroupLaneId), Const(0x1f)); + } + + private static Operand EmitLoadSubgroupMask(EmitterContext context, IoVariable ioVariable) + { + int subgroupSize = context.TranslatorContext.GpuAccessor.QueryHostSubgroupSize(); + + if (subgroupSize <= 32) + { + return context.Load(StorageKind.Input, ioVariable, null, Const(0)); + } + else if (subgroupSize == 64) + { + Operand laneId = context.Load(StorageKind.Input, IoVariable.SubgroupLaneId); + Operand low = context.Load(StorageKind.Input, ioVariable, null, Const(0)); + Operand high = context.Load(StorageKind.Input, ioVariable, null, Const(1)); + + return context.ConditionalSelect(context.BitwiseAnd(laneId, Const(32)), high, low); + } + else + { + Operand laneId = context.Load(StorageKind.Input, IoVariable.SubgroupLaneId); + Operand element = context.ShiftRightU32(laneId, Const(5)); + + Operand res = context.Load(StorageKind.Input, ioVariable, null, Const(0)); + res = context.ConditionalSelect( + context.ICompareEqual(element, Const(1)), + context.Load(StorageKind.Input, ioVariable, null, Const(1)), res); + res = context.ConditionalSelect( + context.ICompareEqual(element, Const(2)), + context.Load(StorageKind.Input, ioVariable, null, Const(2)), res); + res = context.ConditionalSelect( + context.ICompareEqual(element, Const(3)), + context.Load(StorageKind.Input, ioVariable, null, Const(3)), res); + + return res; + } + } + public static void SelR(EmitterContext context) { InstSelR op = context.GetOp(); diff --git a/src/Ryujinx.Graphics.Shader/Instructions/InstEmitWarp.cs b/src/Ryujinx.Graphics.Shader/Instructions/InstEmitWarp.cs index a84944e43..73eea5c34 100644 --- a/src/Ryujinx.Graphics.Shader/Instructions/InstEmitWarp.cs +++ b/src/Ryujinx.Graphics.Shader/Instructions/InstEmitWarp.cs @@ -50,20 +50,7 @@ namespace Ryujinx.Graphics.Shader.Instructions InstVote op = context.GetOp(); Operand pred = GetPredicate(context, op.SrcPred, op.SrcPredInv); - Operand res = null; - - switch (op.VoteMode) - { - case VoteMode.All: - res = context.VoteAll(pred); - break; - case VoteMode.Any: - res = context.VoteAny(pred); - break; - case VoteMode.Eq: - res = context.VoteAllEqual(pred); - break; - } + Operand res = EmitVote(context, op.VoteMode, pred); if (res != null) { @@ -76,7 +63,81 @@ namespace Ryujinx.Graphics.Shader.Instructions if (op.Dest != RegisterConsts.RegisterZeroIndex) { - context.Copy(GetDest(op.Dest), context.Ballot(pred)); + context.Copy(GetDest(op.Dest), EmitBallot(context, pred)); + } + } + + private static Operand EmitVote(EmitterContext context, VoteMode voteMode, Operand pred) + { + int subgroupSize = context.TranslatorContext.GpuAccessor.QueryHostSubgroupSize(); + + if (subgroupSize <= 32) + { + return voteMode switch + { + VoteMode.All => context.VoteAll(pred), + VoteMode.Any => context.VoteAny(pred), + VoteMode.Eq => context.VoteAllEqual(pred), + _ => null, + }; + } + + // Emulate vote with ballot masks. + // We do that when the GPU thread count is not 32, + // since the shader code assumes it is 32. + // allInvocations => ballot(pred) == ballot(true), + // anyInvocation => ballot(pred) != 0, + // allInvocationsEqual => ballot(pred) == balot(true) || ballot(pred) == 0 + Operand ballotMask = EmitBallot(context, pred); + + Operand AllTrue() => context.ICompareEqual(ballotMask, EmitBallot(context, Const(IrConsts.True))); + + return voteMode switch + { + VoteMode.All => AllTrue(), + VoteMode.Any => context.ICompareNotEqual(ballotMask, Const(0)), + VoteMode.Eq => context.BitwiseOr(AllTrue(), context.ICompareEqual(ballotMask, Const(0))), + _ => null, + }; + } + + private static Operand EmitBallot(EmitterContext context, Operand pred) + { + int subgroupSize = context.TranslatorContext.GpuAccessor.QueryHostSubgroupSize(); + + if (subgroupSize <= 32) + { + return context.Ballot(pred, 0); + } + else if (subgroupSize == 64) + { + // TODO: Add support for vector destination and do that with a single operation. + + Operand laneId = context.Load(StorageKind.Input, IoVariable.SubgroupLaneId); + Operand low = context.Ballot(pred, 0); + Operand high = context.Ballot(pred, 1); + + return context.ConditionalSelect(context.BitwiseAnd(laneId, Const(32)), high, low); + } + else + { + // TODO: Add support for vector destination and do that with a single operation. + + Operand laneId = context.Load(StorageKind.Input, IoVariable.SubgroupLaneId); + Operand element = context.ShiftRightU32(laneId, Const(5)); + + Operand res = context.Ballot(pred, 0); + res = context.ConditionalSelect( + context.ICompareEqual(element, Const(1)), + context.Ballot(pred, 1), res); + res = context.ConditionalSelect( + context.ICompareEqual(element, Const(2)), + context.Ballot(pred, 2), res); + res = context.ConditionalSelect( + context.ICompareEqual(element, Const(3)), + context.Ballot(pred, 3), res); + + return res; } } } diff --git a/src/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj b/src/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj index b1f1fb963..ea9a7821b 100644 --- a/src/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj +++ b/src/Ryujinx.Graphics.Shader/Ryujinx.Graphics.Shader.csproj @@ -12,10 +12,6 @@ - - - - diff --git a/src/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs b/src/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs index 73ce90827..2a3d65e75 100644 --- a/src/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs +++ b/src/Ryujinx.Graphics.Shader/StructuredIr/HelperFunctionsMask.cs @@ -7,10 +7,6 @@ namespace Ryujinx.Graphics.Shader.StructuredIr { MultiplyHighS32 = 1 << 2, MultiplyHighU32 = 1 << 3, - Shuffle = 1 << 4, - ShuffleDown = 1 << 5, - ShuffleUp = 1 << 6, - ShuffleXor = 1 << 7, SwizzleAdd = 1 << 10, FSI = 1 << 11, } diff --git a/src/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs b/src/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs index 6cd0fd086..1169512e9 100644 --- a/src/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs +++ b/src/Ryujinx.Graphics.Shader/StructuredIr/InstructionInfo.cs @@ -109,14 +109,15 @@ namespace Ryujinx.Graphics.Shader.StructuredIr Add(Instruction.PackDouble2x32, AggregateType.FP64, AggregateType.U32, AggregateType.U32); Add(Instruction.PackHalf2x16, AggregateType.U32, AggregateType.FP32, AggregateType.FP32); Add(Instruction.ReciprocalSquareRoot, AggregateType.Scalar, AggregateType.Scalar); + Add(Instruction.Return, AggregateType.Void, AggregateType.U32); Add(Instruction.Round, AggregateType.Scalar, AggregateType.Scalar); Add(Instruction.ShiftLeft, AggregateType.S32, AggregateType.S32, AggregateType.S32); Add(Instruction.ShiftRightS32, AggregateType.S32, AggregateType.S32, AggregateType.S32); Add(Instruction.ShiftRightU32, AggregateType.U32, AggregateType.U32, AggregateType.S32); - Add(Instruction.Shuffle, AggregateType.FP32, AggregateType.FP32, AggregateType.U32, AggregateType.U32, AggregateType.Bool); - Add(Instruction.ShuffleDown, AggregateType.FP32, AggregateType.FP32, AggregateType.U32, AggregateType.U32, AggregateType.Bool); - Add(Instruction.ShuffleUp, AggregateType.FP32, AggregateType.FP32, AggregateType.U32, AggregateType.U32, AggregateType.Bool); - Add(Instruction.ShuffleXor, AggregateType.FP32, AggregateType.FP32, AggregateType.U32, AggregateType.U32, AggregateType.Bool); + Add(Instruction.Shuffle, AggregateType.FP32, AggregateType.FP32, AggregateType.U32); + Add(Instruction.ShuffleDown, AggregateType.FP32, AggregateType.FP32, AggregateType.U32); + Add(Instruction.ShuffleUp, AggregateType.FP32, AggregateType.FP32, AggregateType.U32); + Add(Instruction.ShuffleXor, AggregateType.FP32, AggregateType.FP32, AggregateType.U32); Add(Instruction.Sine, AggregateType.Scalar, AggregateType.Scalar); Add(Instruction.SquareRoot, AggregateType.Scalar, AggregateType.Scalar); Add(Instruction.Store, AggregateType.Void); @@ -131,7 +132,7 @@ namespace Ryujinx.Graphics.Shader.StructuredIr Add(Instruction.VoteAll, AggregateType.Bool, AggregateType.Bool); Add(Instruction.VoteAllEqual, AggregateType.Bool, AggregateType.Bool); Add(Instruction.VoteAny, AggregateType.Bool, AggregateType.Bool); -#pragma warning restore IDE0055v +#pragma warning restore IDE0055 } private static void Add(Instruction inst, AggregateType destType, params AggregateType[] srcTypes) diff --git a/src/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs b/src/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs index 862fef126..b0db0ffb0 100644 --- a/src/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs +++ b/src/Ryujinx.Graphics.Shader/StructuredIr/StructuredProgram.cs @@ -282,18 +282,6 @@ namespace Ryujinx.Graphics.Shader.StructuredIr case Instruction.MultiplyHighU32: context.Info.HelperFunctionsMask |= HelperFunctionsMask.MultiplyHighU32; break; - case Instruction.Shuffle: - context.Info.HelperFunctionsMask |= HelperFunctionsMask.Shuffle; - break; - case Instruction.ShuffleDown: - context.Info.HelperFunctionsMask |= HelperFunctionsMask.ShuffleDown; - break; - case Instruction.ShuffleUp: - context.Info.HelperFunctionsMask |= HelperFunctionsMask.ShuffleUp; - break; - case Instruction.ShuffleXor: - context.Info.HelperFunctionsMask |= HelperFunctionsMask.ShuffleXor; - break; case Instruction.SwizzleAdd: context.Info.HelperFunctionsMask |= HelperFunctionsMask.SwizzleAdd; break; diff --git a/src/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs b/src/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs index 6cb572381..a08c8ea9d 100644 --- a/src/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs +++ b/src/Ryujinx.Graphics.Shader/Translation/EmitterContextInsts.cs @@ -112,9 +112,13 @@ namespace Ryujinx.Graphics.Shader.Translation return context.Add(Instruction.AtomicXor, storageKind, Local(), Const(binding), e0, e1, value); } - public static Operand Ballot(this EmitterContext context, Operand a) + public static Operand Ballot(this EmitterContext context, Operand a, int index) { - return context.Add(Instruction.Ballot, Local(), a); + Operand dest = Local(); + + context.Add(new Operation(Instruction.Ballot, index, dest, a)); + + return dest; } public static Operand Barrier(this EmitterContext context) @@ -782,21 +786,41 @@ namespace Ryujinx.Graphics.Shader.Translation return context.Add(Instruction.ShiftRightU32, Local(), a, b); } + public static Operand Shuffle(this EmitterContext context, Operand a, Operand b) + { + return context.Add(Instruction.Shuffle, Local(), a, b); + } + public static (Operand, Operand) Shuffle(this EmitterContext context, Operand a, Operand b, Operand c) { return context.Add(Instruction.Shuffle, (Local(), Local()), a, b, c); } + public static Operand ShuffleDown(this EmitterContext context, Operand a, Operand b) + { + return context.Add(Instruction.ShuffleDown, Local(), a, b); + } + public static (Operand, Operand) ShuffleDown(this EmitterContext context, Operand a, Operand b, Operand c) { return context.Add(Instruction.ShuffleDown, (Local(), Local()), a, b, c); } + public static Operand ShuffleUp(this EmitterContext context, Operand a, Operand b) + { + return context.Add(Instruction.ShuffleUp, Local(), a, b); + } + public static (Operand, Operand) ShuffleUp(this EmitterContext context, Operand a, Operand b, Operand c) { return context.Add(Instruction.ShuffleUp, (Local(), Local()), a, b, c); } + public static Operand ShuffleXor(this EmitterContext context, Operand a, Operand b) + { + return context.Add(Instruction.ShuffleXor, Local(), a, b); + } + public static (Operand, Operand) ShuffleXor(this EmitterContext context, Operand a, Operand b, Operand c) { return context.Add(Instruction.ShuffleXor, (Local(), Local()), a, b, c); diff --git a/src/Ryujinx.Graphics.Shader/Translation/FeatureFlags.cs b/src/Ryujinx.Graphics.Shader/Translation/FeatureFlags.cs index 5b7226acd..552a3f310 100644 --- a/src/Ryujinx.Graphics.Shader/Translation/FeatureFlags.cs +++ b/src/Ryujinx.Graphics.Shader/Translation/FeatureFlags.cs @@ -18,6 +18,7 @@ namespace Ryujinx.Graphics.Shader.Translation InstanceId = 1 << 3, DrawParameters = 1 << 4, RtLayer = 1 << 5, + Shuffle = 1 << 6, FixedFuncAttr = 1 << 9, LocalMemory = 1 << 10, SharedMemory = 1 << 11, diff --git a/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionManager.cs b/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionManager.cs index 2addff5c0..ef2f8759d 100644 --- a/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionManager.cs +++ b/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionManager.cs @@ -56,6 +56,20 @@ namespace Ryujinx.Graphics.Shader.Translation return functionId; } + public int GetOrCreateShuffleFunctionId(HelperFunctionName functionName, int subgroupSize) + { + if (_functionIds.TryGetValue((int)functionName, out int functionId)) + { + return functionId; + } + + Function function = GenerateShuffleFunction(functionName, subgroupSize); + functionId = AddFunction(function); + _functionIds.Add((int)functionName, functionId); + + return functionId; + } + private Function GenerateFunction(HelperFunctionName functionName) { return functionName switch @@ -216,6 +230,137 @@ namespace Ryujinx.Graphics.Shader.Translation return new Function(ControlFlowGraph.Create(context.GetOperations()).Blocks, $"SharedStore{bitSize}_{id}", false, 2, 0); } + private static Function GenerateShuffleFunction(HelperFunctionName functionName, int subgroupSize) + { + return functionName switch + { + HelperFunctionName.Shuffle => GenerateShuffle(subgroupSize), + HelperFunctionName.ShuffleDown => GenerateShuffleDown(subgroupSize), + HelperFunctionName.ShuffleUp => GenerateShuffleUp(subgroupSize), + HelperFunctionName.ShuffleXor => GenerateShuffleXor(subgroupSize), + _ => throw new ArgumentException($"Invalid function name {functionName}"), + }; + } + + private static Function GenerateShuffle(int subgroupSize) + { + EmitterContext context = new(); + + Operand value = Argument(0); + Operand index = Argument(1); + Operand mask = Argument(2); + + Operand clamp = context.BitwiseAnd(mask, Const(0x1f)); + Operand segMask = context.BitwiseAnd(context.ShiftRightU32(mask, Const(8)), Const(0x1f)); + Operand minThreadId = context.BitwiseAnd(GenerateLoadSubgroupLaneId(context, subgroupSize), segMask); + Operand maxThreadId = context.BitwiseOr(context.BitwiseAnd(clamp, context.BitwiseNot(segMask)), minThreadId); + Operand srcThreadId = context.BitwiseOr(context.BitwiseAnd(index, context.BitwiseNot(segMask)), minThreadId); + Operand valid = context.ICompareLessOrEqualUnsigned(srcThreadId, maxThreadId); + + context.Copy(Argument(3), valid); + + Operand result = context.Shuffle(value, GenerateSubgroupShuffleIndex(context, srcThreadId, subgroupSize)); + + context.Return(context.ConditionalSelect(valid, result, value)); + + return new Function(ControlFlowGraph.Create(context.GetOperations()).Blocks, "Shuffle", true, 3, 1); + } + + private static Function GenerateShuffleDown(int subgroupSize) + { + EmitterContext context = new(); + + Operand value = Argument(0); + Operand index = Argument(1); + Operand mask = Argument(2); + + Operand clamp = context.BitwiseAnd(mask, Const(0x1f)); + Operand segMask = context.BitwiseAnd(context.ShiftRightU32(mask, Const(8)), Const(0x1f)); + Operand laneId = GenerateLoadSubgroupLaneId(context, subgroupSize); + Operand minThreadId = context.BitwiseAnd(laneId, segMask); + Operand maxThreadId = context.BitwiseOr(context.BitwiseAnd(clamp, context.BitwiseNot(segMask)), minThreadId); + Operand srcThreadId = context.IAdd(laneId, index); + Operand valid = context.ICompareLessOrEqualUnsigned(srcThreadId, maxThreadId); + + context.Copy(Argument(3), valid); + + Operand result = context.Shuffle(value, GenerateSubgroupShuffleIndex(context, srcThreadId, subgroupSize)); + + context.Return(context.ConditionalSelect(valid, result, value)); + + return new Function(ControlFlowGraph.Create(context.GetOperations()).Blocks, "ShuffleDown", true, 3, 1); + } + + private static Function GenerateShuffleUp(int subgroupSize) + { + EmitterContext context = new(); + + Operand value = Argument(0); + Operand index = Argument(1); + Operand mask = Argument(2); + + Operand segMask = context.BitwiseAnd(context.ShiftRightU32(mask, Const(8)), Const(0x1f)); + Operand laneId = GenerateLoadSubgroupLaneId(context, subgroupSize); + Operand minThreadId = context.BitwiseAnd(laneId, segMask); + Operand srcThreadId = context.ISubtract(laneId, index); + Operand valid = context.ICompareGreaterOrEqual(srcThreadId, minThreadId); + + context.Copy(Argument(3), valid); + + Operand result = context.Shuffle(value, GenerateSubgroupShuffleIndex(context, srcThreadId, subgroupSize)); + + context.Return(context.ConditionalSelect(valid, result, value)); + + return new Function(ControlFlowGraph.Create(context.GetOperations()).Blocks, "ShuffleUp", true, 3, 1); + } + + private static Function GenerateShuffleXor(int subgroupSize) + { + EmitterContext context = new(); + + Operand value = Argument(0); + Operand index = Argument(1); + Operand mask = Argument(2); + + Operand clamp = context.BitwiseAnd(mask, Const(0x1f)); + Operand segMask = context.BitwiseAnd(context.ShiftRightU32(mask, Const(8)), Const(0x1f)); + Operand laneId = GenerateLoadSubgroupLaneId(context, subgroupSize); + Operand minThreadId = context.BitwiseAnd(laneId, segMask); + Operand maxThreadId = context.BitwiseOr(context.BitwiseAnd(clamp, context.BitwiseNot(segMask)), minThreadId); + Operand srcThreadId = context.BitwiseExclusiveOr(laneId, index); + Operand valid = context.ICompareLessOrEqualUnsigned(srcThreadId, maxThreadId); + + context.Copy(Argument(3), valid); + + Operand result = context.Shuffle(value, GenerateSubgroupShuffleIndex(context, srcThreadId, subgroupSize)); + + context.Return(context.ConditionalSelect(valid, result, value)); + + return new Function(ControlFlowGraph.Create(context.GetOperations()).Blocks, "ShuffleXor", true, 3, 1); + } + + private static Operand GenerateLoadSubgroupLaneId(EmitterContext context, int subgroupSize) + { + if (subgroupSize <= 32) + { + return context.Load(StorageKind.Input, IoVariable.SubgroupLaneId); + } + + return context.BitwiseAnd(context.Load(StorageKind.Input, IoVariable.SubgroupLaneId), Const(0x1f)); + } + + private static Operand GenerateSubgroupShuffleIndex(EmitterContext context, Operand srcThreadId, int subgroupSize) + { + if (subgroupSize <= 32) + { + return srcThreadId; + } + + return context.BitwiseOr( + context.BitwiseAnd(context.Load(StorageKind.Input, IoVariable.SubgroupLaneId), Const(0x60)), + srcThreadId); + } + private Function GenerateTexelFetchScaleFunction() { EmitterContext context = new(); diff --git a/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionName.cs b/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionName.cs index e5af17355..09b17729d 100644 --- a/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionName.cs +++ b/src/Ryujinx.Graphics.Shader/Translation/HelperFunctionName.cs @@ -2,12 +2,18 @@ namespace Ryujinx.Graphics.Shader.Translation { enum HelperFunctionName { + Invalid, + ConvertDoubleToFloat, ConvertFloatToDouble, SharedAtomicMaxS32, SharedAtomicMinS32, SharedStore8, SharedStore16, + Shuffle, + ShuffleDown, + ShuffleUp, + ShuffleXor, TexelFetchScale, TextureSizeUnscale, } diff --git a/src/Ryujinx.Graphics.Shader/Translation/Transforms/ShufflePass.cs b/src/Ryujinx.Graphics.Shader/Translation/Transforms/ShufflePass.cs new file mode 100644 index 000000000..839d4f818 --- /dev/null +++ b/src/Ryujinx.Graphics.Shader/Translation/Transforms/ShufflePass.cs @@ -0,0 +1,52 @@ +using Ryujinx.Graphics.Shader.IntermediateRepresentation; +using Ryujinx.Graphics.Shader.Translation.Optimizations; +using System.Collections.Generic; +using static Ryujinx.Graphics.Shader.IntermediateRepresentation.OperandHelper; + +namespace Ryujinx.Graphics.Shader.Translation.Transforms +{ + class ShufflePass : ITransformPass + { + public static bool IsEnabled(IGpuAccessor gpuAccessor, ShaderStage stage, TargetLanguage targetLanguage, FeatureFlags usedFeatures) + { + return usedFeatures.HasFlag(FeatureFlags.Shuffle); + } + + public static LinkedListNode RunPass(TransformContext context, LinkedListNode node) + { + Operation operation = (Operation)node.Value; + + HelperFunctionName functionName = operation.Inst switch + { + Instruction.Shuffle => HelperFunctionName.Shuffle, + Instruction.ShuffleDown => HelperFunctionName.ShuffleDown, + Instruction.ShuffleUp => HelperFunctionName.ShuffleUp, + Instruction.ShuffleXor => HelperFunctionName.ShuffleXor, + _ => HelperFunctionName.Invalid, + }; + + if (functionName == HelperFunctionName.Invalid || operation.SourcesCount != 3 || operation.DestsCount != 2) + { + return node; + } + + int functionId = context.Hfm.GetOrCreateShuffleFunctionId(functionName, context.GpuAccessor.QueryHostSubgroupSize()); + + Operand result = operation.GetDest(0); + Operand valid = operation.GetDest(1); + Operand value = operation.GetSource(0); + Operand index = operation.GetSource(1); + Operand mask = operation.GetSource(2); + + operation.Dest = null; + + Operand[] callArgs = new Operand[] { Const(functionId), value, index, mask, valid }; + + LinkedListNode newNode = node.List.AddBefore(node, new Operation(Instruction.Call, 0, result, callArgs)); + + Utils.DeleteNode(node, operation); + + return newNode; + } + } +} diff --git a/src/Ryujinx.Graphics.Shader/Translation/Transforms/TransformPasses.cs b/src/Ryujinx.Graphics.Shader/Translation/Transforms/TransformPasses.cs index c3bbe7ddf..293938807 100644 --- a/src/Ryujinx.Graphics.Shader/Translation/Transforms/TransformPasses.cs +++ b/src/Ryujinx.Graphics.Shader/Translation/Transforms/TransformPasses.cs @@ -13,6 +13,7 @@ namespace Ryujinx.Graphics.Shader.Translation.Transforms RunPass(context); RunPass(context); RunPass(context); + RunPass(context); } private static void RunPass(TransformContext context) where T : ITransformPass diff --git a/src/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs b/src/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs index e76a332f4..798de5c90 100644 --- a/src/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs +++ b/src/Ryujinx.Graphics.Vulkan/HardwareCapabilities.cs @@ -25,7 +25,6 @@ namespace Ryujinx.Graphics.Vulkan public readonly bool SupportsIndirectParameters; public readonly bool SupportsFragmentShaderInterlock; public readonly bool SupportsGeometryShaderPassthrough; - public readonly bool SupportsSubgroupSizeControl; public readonly bool SupportsShaderFloat64; public readonly bool SupportsShaderInt8; public readonly bool SupportsShaderStencilExport; @@ -45,9 +44,7 @@ namespace Ryujinx.Graphics.Vulkan public readonly bool SupportsViewportArray2; public readonly bool SupportsHostImportedMemory; public readonly bool SupportsDepthClipControl; - public readonly uint MinSubgroupSize; - public readonly uint MaxSubgroupSize; - public readonly ShaderStageFlags RequiredSubgroupSizeStages; + public readonly uint SubgroupSize; public readonly SampleCountFlags SupportedSampleCounts; public readonly PortabilitySubsetFlags PortabilitySubset; public readonly uint VertexBufferAlignment; @@ -64,7 +61,6 @@ namespace Ryujinx.Graphics.Vulkan bool supportsIndirectParameters, bool supportsFragmentShaderInterlock, bool supportsGeometryShaderPassthrough, - bool supportsSubgroupSizeControl, bool supportsShaderFloat64, bool supportsShaderInt8, bool supportsShaderStencilExport, @@ -84,9 +80,7 @@ namespace Ryujinx.Graphics.Vulkan bool supportsViewportArray2, bool supportsHostImportedMemory, bool supportsDepthClipControl, - uint minSubgroupSize, - uint maxSubgroupSize, - ShaderStageFlags requiredSubgroupSizeStages, + uint subgroupSize, SampleCountFlags supportedSampleCounts, PortabilitySubsetFlags portabilitySubset, uint vertexBufferAlignment, @@ -102,7 +96,6 @@ namespace Ryujinx.Graphics.Vulkan SupportsIndirectParameters = supportsIndirectParameters; SupportsFragmentShaderInterlock = supportsFragmentShaderInterlock; SupportsGeometryShaderPassthrough = supportsGeometryShaderPassthrough; - SupportsSubgroupSizeControl = supportsSubgroupSizeControl; SupportsShaderFloat64 = supportsShaderFloat64; SupportsShaderInt8 = supportsShaderInt8; SupportsShaderStencilExport = supportsShaderStencilExport; @@ -122,9 +115,7 @@ namespace Ryujinx.Graphics.Vulkan SupportsViewportArray2 = supportsViewportArray2; SupportsHostImportedMemory = supportsHostImportedMemory; SupportsDepthClipControl = supportsDepthClipControl; - MinSubgroupSize = minSubgroupSize; - MaxSubgroupSize = maxSubgroupSize; - RequiredSubgroupSizeStages = requiredSubgroupSizeStages; + SubgroupSize = subgroupSize; SupportedSampleCounts = supportedSampleCounts; PortabilitySubset = portabilitySubset; VertexBufferAlignment = vertexBufferAlignment; diff --git a/src/Ryujinx.Graphics.Vulkan/PipelineState.cs b/src/Ryujinx.Graphics.Vulkan/PipelineState.cs index cc9af5b6d..5a30cff8e 100644 --- a/src/Ryujinx.Graphics.Vulkan/PipelineState.cs +++ b/src/Ryujinx.Graphics.Vulkan/PipelineState.cs @@ -352,11 +352,6 @@ namespace Ryujinx.Graphics.Vulkan return pipeline; } - if (gd.Capabilities.SupportsSubgroupSizeControl) - { - UpdateStageRequiredSubgroupSizes(gd, 1); - } - var pipelineCreateInfo = new ComputePipelineCreateInfo { SType = StructureType.ComputePipelineCreateInfo, @@ -616,11 +611,6 @@ namespace Ryujinx.Graphics.Vulkan PDynamicStates = dynamicStates, }; - if (gd.Capabilities.SupportsSubgroupSizeControl) - { - UpdateStageRequiredSubgroupSizes(gd, (int)StagesCount); - } - var pipelineCreateInfo = new GraphicsPipelineCreateInfo { SType = StructureType.GraphicsPipelineCreateInfo, @@ -659,19 +649,6 @@ namespace Ryujinx.Graphics.Vulkan return pipeline; } - private readonly unsafe void UpdateStageRequiredSubgroupSizes(VulkanRenderer gd, int count) - { - for (int index = 0; index < count; index++) - { - bool canUseExplicitSubgroupSize = - (gd.Capabilities.RequiredSubgroupSizeStages & Stages[index].Stage) != 0 && - gd.Capabilities.MinSubgroupSize <= RequiredSubgroupSize && - gd.Capabilities.MaxSubgroupSize >= RequiredSubgroupSize; - - Stages[index].PNext = canUseExplicitSubgroupSize ? StageRequiredSubgroupSizes.Pointer + index : null; - } - } - private void UpdateVertexAttributeDescriptions(VulkanRenderer gd) { // Vertex attributes exceeding the stride are invalid. diff --git a/src/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs b/src/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs index 6f73397b8..973c6d396 100644 --- a/src/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs +++ b/src/Ryujinx.Graphics.Vulkan/VulkanInitialization.cs @@ -37,7 +37,6 @@ namespace Ryujinx.Graphics.Vulkan "VK_EXT_shader_stencil_export", "VK_KHR_shader_float16_int8", "VK_EXT_shader_subgroup_ballot", - "VK_EXT_subgroup_size_control", "VK_NV_geometry_shader_passthrough", "VK_NV_viewport_array2", "VK_EXT_depth_clip_control", diff --git a/src/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs b/src/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs index 7848bc877..675512293 100644 --- a/src/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs +++ b/src/Ryujinx.Graphics.Vulkan/VulkanRenderer.cs @@ -151,6 +151,14 @@ namespace Ryujinx.Graphics.Vulkan SType = StructureType.PhysicalDeviceProperties2, }; + PhysicalDeviceSubgroupProperties propertiesSubgroup = new() + { + SType = StructureType.PhysicalDeviceSubgroupProperties, + PNext = properties2.PNext, + }; + + properties2.PNext = &propertiesSubgroup; + PhysicalDeviceBlendOperationAdvancedPropertiesEXT propertiesBlendOperationAdvanced = new() { SType = StructureType.PhysicalDeviceBlendOperationAdvancedPropertiesExt, @@ -164,18 +172,6 @@ namespace Ryujinx.Graphics.Vulkan properties2.PNext = &propertiesBlendOperationAdvanced; } - PhysicalDeviceSubgroupSizeControlPropertiesEXT propertiesSubgroupSizeControl = new() - { - SType = StructureType.PhysicalDeviceSubgroupSizeControlPropertiesExt, - }; - - bool supportsSubgroupSizeControl = _physicalDevice.IsDeviceExtensionPresent("VK_EXT_subgroup_size_control"); - - if (supportsSubgroupSizeControl) - { - properties2.PNext = &propertiesSubgroupSizeControl; - } - bool supportsTransformFeedback = _physicalDevice.IsDeviceExtensionPresent(ExtTransformFeedback.ExtensionName); PhysicalDeviceTransformFeedbackPropertiesEXT propertiesTransformFeedback = new() @@ -315,7 +311,6 @@ namespace Ryujinx.Graphics.Vulkan _physicalDevice.IsDeviceExtensionPresent(KhrDrawIndirectCount.ExtensionName), _physicalDevice.IsDeviceExtensionPresent("VK_EXT_fragment_shader_interlock"), _physicalDevice.IsDeviceExtensionPresent("VK_NV_geometry_shader_passthrough"), - supportsSubgroupSizeControl, features2.Features.ShaderFloat64, featuresShaderInt8.ShaderInt8, _physicalDevice.IsDeviceExtensionPresent("VK_EXT_shader_stencil_export"), @@ -335,9 +330,7 @@ namespace Ryujinx.Graphics.Vulkan _physicalDevice.IsDeviceExtensionPresent("VK_NV_viewport_array2"), _physicalDevice.IsDeviceExtensionPresent(ExtExternalMemoryHost.ExtensionName), supportsDepthClipControl && featuresDepthClipControl.DepthClipControl, - propertiesSubgroupSizeControl.MinSubgroupSize, - propertiesSubgroupSizeControl.MaxSubgroupSize, - propertiesSubgroupSizeControl.RequiredSubgroupSizeStages, + propertiesSubgroup.SubgroupSize, supportedSampleCounts, portabilityFlags, vertexBufferAlignment, @@ -623,6 +616,7 @@ namespace Ryujinx.Graphics.Vulkan maximumImagesPerStage: Constants.MaxImagesPerStage, maximumComputeSharedMemorySize: (int)limits.MaxComputeSharedMemorySize, maximumSupportedAnisotropy: (int)limits.MaxSamplerAnisotropy, + shaderSubgroupSize: (int)Capabilities.SubgroupSize, storageBufferOffsetAlignment: (int)limits.MinStorageBufferOffsetAlignment, gatherBiasPrecision: IsIntelWindows || IsAmdWindows ? (int)Capabilities.SubTexelPrecisionBits : 0); }