From 9db73f74cf77484c4d8b34af54c563c68cabb41e Mon Sep 17 00:00:00 2001 From: riperiperi Date: Mon, 10 Apr 2023 11:22:58 +0100 Subject: [PATCH] ARMeilleure: Respect FZ/RM flags for all floating point operations (#4618) * ARMeilleure: Respect Fz flag for all floating point operations. This is a change in strategy for emulating the Fz FPCR flag. Before, it was set before instructions that "needed it" and reset after. However, this missed a few hot instructions like the multiplication instruction, and the entirety of A32. The new strategy is to set the Fz flag only in the following circumstances: - Set to match FPCR before translated functions/loops are executed. - Reset when calling SoftFloat methods, set when returning. - Reset when exiting execution. This allows us to remove the code around the existing Fz-aware instructions, and get the accuracy benefits on all floating point instructions executed while in translated code. Single step executions now need to be called with a context wrapper - right now it just contains the Fz flag initialization, and won't actually do anything on ARM. This fixes a bug in Breath of the Wild where some physics interactions could randomly crash the game due to subnormal values not flushing to zero. This is a draft right now because I need to answer the questions: - Does dotnet avoid changing the value of Mxcsr? - Is it a good idea to assume that? Or should the flag set/restore be done on every managed method call, not just softfloat? - If we assume that, do we want a unit test to verify the behaviour? I recommend testing a bunch of games, especially games affected when this was originally added, such as #1611. * Remove unused method * Use FMA for Fmadd, Fmsub, Fnmadd, Fnmsub, Fmla, Fmls ...when available. Similar implementation to A32 * Use FMA for Frecps, Frsqrts * Don't set DAZ. * Add round mode to ARM FP mode * Fix mistakes * Add test for FP state when calling managed methods * Add explanatory comment to test. 
* Cleanup * Add A64 FPCR flags * Vrintx_S A32 fast path on A64 backend * Address feedback 1, re-enable DAZ * Fix FMA instructions By Elem * Address feedback --- ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs | 2 + ARMeilleure/CodeGen/X86/AssemblerTable.cs | 2 + ARMeilleure/CodeGen/X86/CodeGenerator.cs | 25 +- ARMeilleure/CodeGen/X86/IntrinsicTable.cs | 6 +- ARMeilleure/CodeGen/X86/Mxcsr.cs | 15 + ARMeilleure/CodeGen/X86/PreAllocator.cs | 8 +- ARMeilleure/CodeGen/X86/X86Instruction.cs | 2 + .../Instructions/InstEmitSimdArithmetic.cs | 512 ++++++++++++------ ARMeilleure/Instructions/InstEmitSimdCvt32.cs | 17 +- .../Instructions/InstEmitSimdHelper.cs | 105 ++-- .../Instructions/InstEmitSimdHelper32.cs | 2 + ARMeilleure/Instructions/InstEmitSystem.cs | 4 + ARMeilleure/Instructions/InstEmitSystem32.cs | 2 + .../IntermediateRepresentation/Intrinsic.cs | 8 +- ARMeilleure/Translation/ArmEmitterContext.cs | 15 + ARMeilleure/Translation/DispatcherFunction.cs | 1 + ARMeilleure/Translation/PTC/Ptc.cs | 2 +- ARMeilleure/Translation/TranslatedFunction.cs | 5 + ARMeilleure/Translation/Translator.cs | 4 +- ARMeilleure/Translation/TranslatorStubs.cs | 71 +++ .../Translation/TranslatorTestMethods.cs | 148 +++++ Ryujinx.Tests/Cpu/EnvironmentTests.cs | 91 ++++ 22 files changed, 822 insertions(+), 225 deletions(-) create mode 100644 ARMeilleure/CodeGen/X86/Mxcsr.cs create mode 100644 ARMeilleure/Translation/TranslatorTestMethods.cs create mode 100644 Ryujinx.Tests/Cpu/EnvironmentTests.cs diff --git a/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs b/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs index 53ef152e5..a309d56d9 100644 --- a/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/Arm64/IntrinsicTable.cs @@ -226,6 +226,8 @@ namespace ARMeilleure.CodeGen.Arm64 Add(Intrinsic.Arm64MlsVe, new IntrinsicInfo(0x2f004000u, IntrinsicType.VectorTernaryRdByElem)); Add(Intrinsic.Arm64MlsV, new IntrinsicInfo(0x2e209400u, IntrinsicType.VectorTernaryRd)); Add(Intrinsic.Arm64MoviV, new 
IntrinsicInfo(0x0f000400u, IntrinsicType.VectorMovi)); + Add(Intrinsic.Arm64MrsFpcr, new IntrinsicInfo(0xd53b4400u, IntrinsicType.GetRegister)); + Add(Intrinsic.Arm64MsrFpcr, new IntrinsicInfo(0xd51b4400u, IntrinsicType.SetRegister)); Add(Intrinsic.Arm64MrsFpsr, new IntrinsicInfo(0xd53b4420u, IntrinsicType.GetRegister)); Add(Intrinsic.Arm64MsrFpsr, new IntrinsicInfo(0xd51b4420u, IntrinsicType.SetRegister)); Add(Intrinsic.Arm64MulVe, new IntrinsicInfo(0x0f008000u, IntrinsicType.VectorBinaryByElem)); diff --git a/ARMeilleure/CodeGen/X86/AssemblerTable.cs b/ARMeilleure/CodeGen/X86/AssemblerTable.cs index b47b3ecd1..e6a2ff07f 100644 --- a/ARMeilleure/CodeGen/X86/AssemblerTable.cs +++ b/ARMeilleure/CodeGen/X86/AssemblerTable.cs @@ -268,11 +268,13 @@ namespace ARMeilleure.CodeGen.X86 Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vfmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); Add(X86Instruction.Vfmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b8, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vfmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); Add(X86Instruction.Vfmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38b9, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vfmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | 
InstructionFlags.RexW)); Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66)); + Add(X86Instruction.Vfnmadd231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); Add(X86Instruction.Vfnmadd231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bc, InstructionFlags.Vex | InstructionFlags.Prefix66)); Add(X86Instruction.Vfnmadd231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW)); Add(X86Instruction.Vfnmadd231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bd, InstructionFlags.Vex | InstructionFlags.Prefix66)); diff --git a/ARMeilleure/CodeGen/X86/CodeGenerator.cs b/ARMeilleure/CodeGen/X86/CodeGenerator.cs index 8b5a3fc57..e7179b517 100644 --- a/ARMeilleure/CodeGen/X86/CodeGenerator.cs +++ b/ARMeilleure/CodeGen/X86/CodeGenerator.cs @@ -249,10 +249,9 @@ namespace ARMeilleure.CodeGen.X86 case IntrinsicType.Mxcsr: { Operand offset = operation.GetSource(0); - Operand bits = operation.GetSource(1); - Debug.Assert(offset.Kind == OperandKind.Constant && bits.Kind == OperandKind.Constant); - Debug.Assert(offset.Type == OperandType.I32 && bits.Type == OperandType.I32); + Debug.Assert(offset.Kind == OperandKind.Constant); + Debug.Assert(offset.Type == OperandType.I32); int offs = offset.AsInt32() + context.CallArgsRegionSize; @@ -261,21 +260,23 @@ namespace ARMeilleure.CodeGen.X86 Debug.Assert(HardwareCapabilities.SupportsSse || HardwareCapabilities.SupportsVexEncoding); - context.Assembler.Stmxcsr(memOp); - - if (operation.Intrinsic == Intrinsic.X86Mxcsrmb) + if (operation.Intrinsic == Intrinsic.X86Ldmxcsr) { - context.Assembler.Or(memOp, bits, OperandType.I32); + Operand bits = operation.GetSource(1); + Debug.Assert(bits.Type == OperandType.I32); + + context.Assembler.Mov(memOp, bits, 
OperandType.I32); + context.Assembler.Ldmxcsr(memOp); } - else /* if (intrinOp.Intrinsic == Intrinsic.X86Mxcsrub) */ + else if (operation.Intrinsic == Intrinsic.X86Stmxcsr) { - Operand notBits = Const(~bits.AsInt32()); + Operand dest = operation.Destination; + Debug.Assert(dest.Type == OperandType.I32); - context.Assembler.And(memOp, notBits, OperandType.I32); + context.Assembler.Stmxcsr(memOp); + context.Assembler.Mov(dest, memOp, OperandType.I32); } - context.Assembler.Ldmxcsr(memOp); - break; } diff --git a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs index c788fa442..e3d94b7ae 100644 --- a/ARMeilleure/CodeGen/X86/IntrinsicTable.cs +++ b/ARMeilleure/CodeGen/X86/IntrinsicTable.cs @@ -60,6 +60,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Haddpd, new IntrinsicInfo(X86Instruction.Haddpd, IntrinsicType.Binary)); Add(Intrinsic.X86Haddps, new IntrinsicInfo(X86Instruction.Haddps, IntrinsicType.Binary)); Add(Intrinsic.X86Insertps, new IntrinsicInfo(X86Instruction.Insertps, IntrinsicType.TernaryImm)); + Add(Intrinsic.X86Ldmxcsr, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr)); Add(Intrinsic.X86Maxpd, new IntrinsicInfo(X86Instruction.Maxpd, IntrinsicType.Binary)); Add(Intrinsic.X86Maxps, new IntrinsicInfo(X86Instruction.Maxps, IntrinsicType.Binary)); Add(Intrinsic.X86Maxsd, new IntrinsicInfo(X86Instruction.Maxsd, IntrinsicType.Binary)); @@ -75,8 +76,6 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Mulps, new IntrinsicInfo(X86Instruction.Mulps, IntrinsicType.Binary)); Add(Intrinsic.X86Mulsd, new IntrinsicInfo(X86Instruction.Mulsd, IntrinsicType.Binary)); Add(Intrinsic.X86Mulss, new IntrinsicInfo(X86Instruction.Mulss, IntrinsicType.Binary)); - Add(Intrinsic.X86Mxcsrmb, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr)); // Mask bits. - Add(Intrinsic.X86Mxcsrub, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr)); // Unmask bits. 
Add(Intrinsic.X86Paddb, new IntrinsicInfo(X86Instruction.Paddb, IntrinsicType.Binary)); Add(Intrinsic.X86Paddd, new IntrinsicInfo(X86Instruction.Paddd, IntrinsicType.Binary)); Add(Intrinsic.X86Paddq, new IntrinsicInfo(X86Instruction.Paddq, IntrinsicType.Binary)); @@ -160,6 +159,7 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Sqrtps, new IntrinsicInfo(X86Instruction.Sqrtps, IntrinsicType.Unary)); Add(Intrinsic.X86Sqrtsd, new IntrinsicInfo(X86Instruction.Sqrtsd, IntrinsicType.Unary)); Add(Intrinsic.X86Sqrtss, new IntrinsicInfo(X86Instruction.Sqrtss, IntrinsicType.Unary)); + Add(Intrinsic.X86Stmxcsr, new IntrinsicInfo(X86Instruction.None, IntrinsicType.Mxcsr)); Add(Intrinsic.X86Subpd, new IntrinsicInfo(X86Instruction.Subpd, IntrinsicType.Binary)); Add(Intrinsic.X86Subps, new IntrinsicInfo(X86Instruction.Subps, IntrinsicType.Binary)); Add(Intrinsic.X86Subsd, new IntrinsicInfo(X86Instruction.Subsd, IntrinsicType.Binary)); @@ -170,11 +170,13 @@ namespace ARMeilleure.CodeGen.X86 Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary)); Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary)); Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm)); + Add(Intrinsic.X86Vfmadd231pd, new IntrinsicInfo(X86Instruction.Vfmadd231pd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmadd231ps, new IntrinsicInfo(X86Instruction.Vfmadd231ps, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmadd231sd, new IntrinsicInfo(X86Instruction.Vfmadd231sd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmadd231ss, new IntrinsicInfo(X86Instruction.Vfmadd231ss, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmsub231sd, new IntrinsicInfo(X86Instruction.Vfmsub231sd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfmsub231ss, new IntrinsicInfo(X86Instruction.Vfmsub231ss, IntrinsicType.Fma)); + Add(Intrinsic.X86Vfnmadd231pd, new IntrinsicInfo(X86Instruction.Vfnmadd231pd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfnmadd231ps, 
new IntrinsicInfo(X86Instruction.Vfnmadd231ps, IntrinsicType.Fma)); Add(Intrinsic.X86Vfnmadd231sd, new IntrinsicInfo(X86Instruction.Vfnmadd231sd, IntrinsicType.Fma)); Add(Intrinsic.X86Vfnmadd231ss, new IntrinsicInfo(X86Instruction.Vfnmadd231ss, IntrinsicType.Fma)); diff --git a/ARMeilleure/CodeGen/X86/Mxcsr.cs b/ARMeilleure/CodeGen/X86/Mxcsr.cs new file mode 100644 index 000000000..c61eac31a --- /dev/null +++ b/ARMeilleure/CodeGen/X86/Mxcsr.cs @@ -0,0 +1,15 @@ +using System; + +namespace ARMeilleure.CodeGen.X86 +{ + [Flags] + enum Mxcsr + { + Ftz = 1 << 15, // Flush To Zero. + Rhi = 1 << 14, // Round Mode high bit. + Rlo = 1 << 13, // Round Mode low bit. + Um = 1 << 11, // Underflow Mask. + Dm = 1 << 8, // Denormal Mask. + Daz = 1 << 6 // Denormals Are Zero. + } +} diff --git a/ARMeilleure/CodeGen/X86/PreAllocator.cs b/ARMeilleure/CodeGen/X86/PreAllocator.cs index 72f56514f..cb742d67f 100644 --- a/ARMeilleure/CodeGen/X86/PreAllocator.cs +++ b/ARMeilleure/CodeGen/X86/PreAllocator.cs @@ -120,12 +120,18 @@ namespace ARMeilleure.CodeGen.X86 break; case Instruction.Extended: - if (node.Intrinsic == Intrinsic.X86Mxcsrmb || node.Intrinsic == Intrinsic.X86Mxcsrub) + if (node.Intrinsic == Intrinsic.X86Ldmxcsr) { int stackOffset = stackAlloc.Allocate(OperandType.I32); node.SetSources(new Operand[] { Const(stackOffset), node.GetSource(0) }); } + else if (node.Intrinsic == Intrinsic.X86Stmxcsr) + { + int stackOffset = stackAlloc.Allocate(OperandType.I32); + + node.SetSources(new Operand[] { Const(stackOffset) }); + } break; } } diff --git a/ARMeilleure/CodeGen/X86/X86Instruction.cs b/ARMeilleure/CodeGen/X86/X86Instruction.cs index ecfc432d7..9a85c516f 100644 --- a/ARMeilleure/CodeGen/X86/X86Instruction.cs +++ b/ARMeilleure/CodeGen/X86/X86Instruction.cs @@ -208,11 +208,13 @@ namespace ARMeilleure.CodeGen.X86 Vblendvps, Vcvtph2ps, Vcvtps2ph, + Vfmadd231pd, Vfmadd231ps, Vfmadd231sd, Vfmadd231ss, Vfmsub231sd, Vfmsub231ss, + Vfnmadd231pd, Vfnmadd231ps, Vfnmadd231sd, Vfnmadd231ss, 
diff --git a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs index d0bb68e4f..7e7f26b1a 100644 --- a/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs +++ b/ARMeilleure/Instructions/InstEmitSimdArithmetic.cs @@ -615,14 +615,11 @@ namespace ARMeilleure.Instructions { return EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; + IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; - Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd; + Intrinsic addInst = (op.Size & 1) == 0 ? Intrinsic.X86Addps : Intrinsic.X86Addpd; - return context.AddIntrinsic(addInst, op1, op2); - }, scalar: false, op1, op2); + return context.AddIntrinsic(addInst, op1, op2); }, scalar: false, op1, op2); }); } @@ -696,17 +693,33 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); Operand m = GetVec(op.Rm); + Operand res; + if (op.Size == 0) { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); - res = context.AddIntrinsic(Intrinsic.X86Addss, a, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addss, a, res); + } context.Copy(d, context.VectorZeroUpper96(res)); } else /* if (op.Size == 1) */ { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); - res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addsd, a, res); + } context.Copy(d, context.VectorZeroUpper64(res)); } @@ -730,10 +743,7 @@ namespace ARMeilleure.Instructions { EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return 
EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); - }, scalar: true, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); }, scalar: true); } else @@ -755,10 +765,7 @@ namespace ARMeilleure.Instructions { EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); - }, scalar: false, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); }, scalar: false); } else @@ -886,10 +893,7 @@ namespace ARMeilleure.Instructions { return EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); - }, scalar: false, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); }, scalar: false, op1, op2); }); } @@ -914,10 +918,7 @@ namespace ARMeilleure.Instructions { return EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); - }, scalar: false, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: true); }, scalar: false, op1, op2); }); } @@ -940,10 +941,7 @@ namespace ARMeilleure.Instructions { EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); - }, scalar: true, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); }, scalar: true); } else @@ -965,10 +963,7 @@ namespace ARMeilleure.Instructions { EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); - }, scalar: false, op1, op2); + return 
EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); }, scalar: false); } else @@ -1096,10 +1091,7 @@ namespace ARMeilleure.Instructions { return EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); - }, scalar: false, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); }, scalar: false, op1, op2); }); } @@ -1124,10 +1116,7 @@ namespace ARMeilleure.Instructions { return EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); - }, scalar: false, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: false); }, scalar: false, op1, op2); }); } @@ -1146,6 +1135,37 @@ namespace ARMeilleure.Instructions { InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlaSe); } + else if (Optimizations.UseFma) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ss, d, n, res); + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (sizeF == 1) */ + { + int shuffleMask = op.Index | op.Index << 1; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231sd, d, n, res); + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } else { EmitScalarTernaryOpByElemF(context, (op1, op2, op3) => @@ -1171,11 +1191,19 @@ namespace ARMeilleure.Instructions int sizeF = op.Size & 1; + Operand res; + if (sizeF == 0) 
{ - Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); - - res = context.AddIntrinsic(Intrinsic.X86Addps, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addps, d, res); + } if (op.RegisterSize == RegisterSize.Simd64) { @@ -1186,9 +1214,15 @@ namespace ARMeilleure.Instructions } else /* if (sizeF == 1) */ { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); - - res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res); + } context.Copy(d, res); } @@ -1224,8 +1258,15 @@ namespace ARMeilleure.Instructions Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); - res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res); - res = context.AddIntrinsic(Intrinsic.X86Addps, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231ps, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res); + res = context.AddIntrinsic(Intrinsic.X86Addps, d, res); + } if (op.RegisterSize == RegisterSize.Simd64) { @@ -1240,8 +1281,15 @@ namespace ARMeilleure.Instructions Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); - res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res); - res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmadd231pd, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res); + res = context.AddIntrinsic(Intrinsic.X86Addpd, d, res); + } context.Copy(d, res); } @@ -1261,6 +1309,37 @@ namespace ARMeilleure.Instructions { 
InstEmitSimdHelperArm64.EmitScalarTernaryOpFRdByElem(context, Intrinsic.Arm64FmlsSe); } + else if (Optimizations.UseFma) + { + OpCodeSimdRegElemF op = (OpCodeSimdRegElemF)context.CurrOp; + + Operand d = GetVec(op.Rd); + Operand n = GetVec(op.Rn); + Operand m = GetVec(op.Rm); + + int sizeF = op.Size & 1; + + if (sizeF == 0) + { + int shuffleMask = op.Index | op.Index << 2 | op.Index << 4 | op.Index << 6; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, d, n, res); + + context.Copy(d, context.VectorZeroUpper96(res)); + } + else /* if (sizeF == 1) */ + { + int shuffleMask = op.Index | op.Index << 1; + + Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); + + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, d, n, res); + + context.Copy(d, context.VectorZeroUpper64(res)); + } + } else { EmitScalarTernaryOpByElemF(context, (op1, op2, op3) => @@ -1286,11 +1365,19 @@ namespace ARMeilleure.Instructions int sizeF = op.Size & 1; + Operand res; + if (sizeF == 0) { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); - - res = context.AddIntrinsic(Intrinsic.X86Subps, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subps, d, res); + } if (op.RegisterSize == RegisterSize.Simd64) { @@ -1301,9 +1388,15 @@ namespace ARMeilleure.Instructions } else /* if (sizeF == 1) */ { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); - - res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res); + } context.Copy(d, res); } @@ -1339,8 +1432,15 @@ 
namespace ARMeilleure.Instructions Operand res = context.AddIntrinsic(Intrinsic.X86Shufps, m, m, Const(shuffleMask)); - res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res); - res = context.AddIntrinsic(Intrinsic.X86Subps, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, res); + res = context.AddIntrinsic(Intrinsic.X86Subps, d, res); + } if (op.RegisterSize == RegisterSize.Simd64) { @@ -1355,8 +1455,15 @@ namespace ARMeilleure.Instructions Operand res = context.AddIntrinsic(Intrinsic.X86Shufpd, m, m, Const(shuffleMask)); - res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res); - res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, d, n, res); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, res); + res = context.AddIntrinsic(Intrinsic.X86Subpd, d, res); + } context.Copy(d, res); } @@ -1385,17 +1492,33 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); Operand m = GetVec(op.Rm); + Operand res; + if (op.Size == 0) { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); - res = context.AddIntrinsic(Intrinsic.X86Subss, a, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, a, res); + } context.Copy(d, context.VectorZeroUpper96(res)); } else /* if (op.Size == 1) */ { - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); - res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, a, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subsd, a, res); + } context.Copy(d, 
context.VectorZeroUpper64(res)); } @@ -1669,25 +1792,39 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); Operand m = GetVec(op.Rm); + Operand res; + if (op.Size == 0) { - Operand mask = X86GetScalar(context, -0f); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231ss, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0f); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a); - Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a); - - Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); - res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res); + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, aNeg, res); + } context.Copy(d, context.VectorZeroUpper96(res)); } else /* if (op.Size == 1) */ { - Operand mask = X86GetScalar(context, -0d); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmsub231sd, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a); - Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a); - - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); - res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res); + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subsd, aNeg, res); + } context.Copy(d, context.VectorZeroUpper64(res)); } @@ -1716,25 +1853,39 @@ namespace ARMeilleure.Instructions Operand n = GetVec(op.Rn); Operand m = GetVec(op.Rm); + Operand res; + if (op.Size == 0) { - Operand mask = X86GetScalar(context, -0f); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmsub231ss, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0f); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a); - Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorps, mask, a); - - Operand res = 
context.AddIntrinsic(Intrinsic.X86Mulss, n, m); - res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res); + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addss, aNeg, res); + } context.Copy(d, context.VectorZeroUpper96(res)); } else /* if (op.Size == 1) */ { - Operand mask = X86GetScalar(context, -0d); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfmsub231sd, a, n, m); + } + else + { + Operand mask = X86GetScalar(context, -0d); + Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a); - Operand aNeg = context.AddIntrinsic(Intrinsic.X86Xorpd, mask, a); - - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); - res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res); + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Addsd, aNeg, res); + } context.Copy(d, context.VectorZeroUpper64(res)); } @@ -1830,13 +1981,22 @@ namespace ARMeilleure.Instructions int sizeF = op.Size & 1; + Operand res; + if (sizeF == 0) { Operand mask = X86GetScalar(context, 2f); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subss, mask, res); res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF); context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); @@ -1845,9 +2005,16 @@ namespace ARMeilleure.Instructions { Operand mask = X86GetScalar(context, 2d); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = 
context.AddIntrinsic(Intrinsic.X86Subsd, mask, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subsd, mask, res); res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: true, sizeF); context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); @@ -1877,14 +2044,23 @@ namespace ARMeilleure.Instructions int sizeF = op.Size & 1; + Operand res; + if (sizeF == 0) { Operand mask = X86GetAllElements(context, 2f); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); - res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subps, mask, res); + res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF); if (op.RegisterSize == RegisterSize.Simd64) { @@ -1897,10 +2073,17 @@ namespace ARMeilleure.Instructions { Operand mask = X86GetAllElements(context, 2d); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); - res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, mask, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subpd, mask, res); + res = EmitSse41RecipStepSelectOpF(context, n, m, res, mask, scalar: false, sizeF); context.Copy(GetVec(op.Rd), res); } @@ -2113,20 +2296,32 @@ namespace ARMeilleure.Instructions public static void Frintx_S(ArmEmitterContext context) { - // TODO Arm64: Fast path. Should we set host FPCR? 
- EmitScalarUnaryOpF(context, (op1) => + if (Optimizations.UseAdvSimd) { - return EmitRoundByRMode(context, op1); - }); + InstEmitSimdHelperArm64.EmitScalarUnaryOpF(context, Intrinsic.Arm64FrintxS); + } + else + { + EmitScalarUnaryOpF(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } } public static void Frintx_V(ArmEmitterContext context) { - // TODO Arm64: Fast path. Should we set host FPCR? - EmitVectorUnaryOpF(context, (op1) => + if (Optimizations.UseAdvSimd) { - return EmitRoundByRMode(context, op1); - }); + InstEmitSimdHelperArm64.EmitVectorUnaryOpF(context, Intrinsic.Arm64FrintxV); + } + else + { + EmitVectorUnaryOpF(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } } public static void Frintz_S(ArmEmitterContext context) @@ -2237,16 +2432,25 @@ namespace ARMeilleure.Instructions int sizeF = op.Size & 1; + Operand res; + if (sizeF == 0) { Operand maskHalf = X86GetScalar(context, 0.5f); Operand maskThree = X86GetScalar(context, 3f); Operand maskOneHalf = X86GetScalar(context, 1.5f); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ss, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulss, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subss, maskThree, res); - res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res); + res = context.AddIntrinsic(Intrinsic.X86Mulss, maskHalf, res); res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF); context.Copy(GetVec(op.Rd), context.VectorZeroUpper96(res)); @@ -2257,10 +2461,17 @@ namespace ARMeilleure.Instructions Operand maskThree = X86GetScalar(context, 3d); Operand maskOneHalf = X86GetScalar(context, 1.5d); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + if (Optimizations.UseFma) + { + res = 
context.AddIntrinsic(Intrinsic.X86Vfnmadd231sd, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulsd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subsd, maskThree, res); - res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res); + res = context.AddIntrinsic(Intrinsic.X86Mulsd, maskHalf, res); res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: true, sizeF); context.Copy(GetVec(op.Rd), context.VectorZeroUpper64(res)); @@ -2290,15 +2501,24 @@ namespace ARMeilleure.Instructions int sizeF = op.Size & 1; + Operand res; + if (sizeF == 0) { Operand maskHalf = X86GetAllElements(context, 0.5f); Operand maskThree = X86GetAllElements(context, 3f); Operand maskOneHalf = X86GetAllElements(context, 1.5f); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231ps, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulps, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subps, maskThree, res); res = context.AddIntrinsic(Intrinsic.X86Mulps, maskHalf, res); res = EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF); @@ -2315,9 +2535,16 @@ namespace ARMeilleure.Instructions Operand maskThree = X86GetAllElements(context, 3d); Operand maskOneHalf = X86GetAllElements(context, 1.5d); - Operand res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + if (Optimizations.UseFma) + { + res = context.AddIntrinsic(Intrinsic.X86Vfnmadd231pd, maskThree, n, m); + } + else + { + res = context.AddIntrinsic(Intrinsic.X86Mulpd, n, m); + res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res); + } - res = context.AddIntrinsic(Intrinsic.X86Subpd, maskThree, res); res = context.AddIntrinsic(Intrinsic.X86Mulpd, maskHalf, res); res = 
EmitSse41RecipStepSelectOpF(context, n, m, res, maskOneHalf, scalar: false, sizeF); @@ -4728,53 +4955,6 @@ namespace ARMeilleure.Instructions } } - public static Operand EmitSseOrAvxHandleFzModeOpF( - ArmEmitterContext context, - Func2I emit, - bool scalar, - Operand n = default, - Operand m = default) - { - Operand nCopy = n == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rn)) : n; - Operand mCopy = m == default ? context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rm)) : m; - - EmitSseOrAvxEnterFtzAndDazModesOpF(context, out Operand isTrue); - - Operand res = emit(nCopy, mCopy); - - EmitSseOrAvxExitFtzAndDazModesOpF(context, isTrue); - - if (n != default || m != default) - { - return res; - } - - int sizeF = ((IOpCodeSimd)context.CurrOp).Size & 1; - - if (sizeF == 0) - { - if (scalar) - { - res = context.VectorZeroUpper96(res); - } - else if (((OpCodeSimdReg)context.CurrOp).RegisterSize == RegisterSize.Simd64) - { - res = context.VectorZeroUpper64(res); - } - } - else /* if (sizeF == 1) */ - { - if (scalar) - { - res = context.VectorZeroUpper64(res); - } - } - - context.Copy(GetVec(((OpCodeSimdReg)context.CurrOp).Rd), res); - - return default; - } - private static Operand EmitSse2VectorMaxMinOpF(ArmEmitterContext context, Operand n, Operand m, bool isMax) { IOpCodeSimd op = (IOpCodeSimd)context.CurrOp; @@ -4834,10 +5014,7 @@ namespace ARMeilleure.Instructions Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum); - }, scalar: scalar, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum); }, scalar: scalar, nCopy, mCopy); if (n != default || m != default) @@ -4872,10 +5049,7 @@ namespace ARMeilleure.Instructions Operand res = EmitSse41ProcessNaNsOpF(context, (op1, op2) => { - return EmitSseOrAvxHandleFzModeOpF(context, (op1, op2) => - { - return EmitSse2VectorMaxMinOpF(context, op1, 
op2, isMax: isMaxNum); - }, scalar: scalar, op1, op2); + return EmitSse2VectorMaxMinOpF(context, op1, op2, isMax: isMaxNum); }, scalar: scalar, nCopy, mCopy); if (n != default || m != default) diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs index 5fdc3b5ad..33ae83df6 100644 --- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs @@ -356,9 +356,11 @@ namespace ARMeilleure.Instructions ? typeof(SoftFloat64_16).GetMethod(nameof(SoftFloat64_16.FPConvert)) : typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert)); + context.ExitArmFpMode(); context.StoreToContext(); Operand res = context.Call(method, src); context.LoadFromContext(); + context.EnterArmFpMode(); InsertScalar16(context, op.Vd, op.T, res); } @@ -372,9 +374,11 @@ namespace ARMeilleure.Instructions ? typeof(SoftFloat16_64).GetMethod(nameof(SoftFloat16_64.FPConvert)) : typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert)); + context.ExitArmFpMode(); context.StoreToContext(); Operand res = context.Call(method, src); context.LoadFromContext(); + context.EnterArmFpMode(); InsertScalar(context, op.Vd, res); } @@ -542,10 +546,17 @@ namespace ARMeilleure.Instructions // VRINTX (floating-point). 
public static void Vrintx_S(ArmEmitterContext context) { - EmitScalarUnaryOpF32(context, (op1) => + if (Optimizations.UseAdvSimd) { - return EmitRoundByRMode(context, op1); - }); + InstEmitSimdHelper32Arm64.EmitScalarUnaryOpF32(context, Intrinsic.Arm64FrintxS); + } + else + { + EmitScalarUnaryOpF32(context, (op1) => + { + return EmitRoundByRMode(context, op1); + }); + } } private static Operand EmitFPConvert(ArmEmitterContext context, Operand value, OperandType type, bool signed) diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper.cs b/ARMeilleure/Instructions/InstEmitSimdHelper.cs index 0e7af794a..c44c9b4d9 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper.cs @@ -1,3 +1,4 @@ +using ARMeilleure.CodeGen.X86; using ARMeilleure.Decoders; using ARMeilleure.IntermediateRepresentation; using ARMeilleure.State; @@ -158,6 +159,75 @@ namespace ARMeilleure.Instructions }; #endregion + public static void EnterArmFpMode(EmitterContext context, Func getFpFlag) + { + if (Optimizations.UseSse2) + { + Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr); + + Operand fzTrue = getFpFlag(FPState.FzFlag); + Operand r0True = getFpFlag(FPState.RMode0Flag); + Operand r1True = getFpFlag(FPState.RMode1Flag); + + mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo))); + + mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(fzTrue, Const((int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Um | Mxcsr.Dm)), Const(0))); + + // X86 round modes in order: nearest, negative, positive, zero + // ARM round modes in order: nearest, positive, negative, zero + // Read the bits backwards to correct this. 
+ + mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r0True, Const((int)Mxcsr.Rhi), Const(0))); + mxcsr = context.BitwiseOr(mxcsr, context.ConditionalSelect(r1True, Const((int)Mxcsr.Rlo), Const(0))); + + context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr); + } + else if (Optimizations.UseAdvSimd) + { + Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr); + + Operand fzTrue = getFpFlag(FPState.FzFlag); + Operand r0True = getFpFlag(FPState.RMode0Flag); + Operand r1True = getFpFlag(FPState.RMode1Flag); + + fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1))); + + fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(fzTrue, Const((int)FPCR.Fz), Const(0))); + fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r0True, Const((int)FPCR.RMode0), Const(0))); + fpcr = context.BitwiseOr(fpcr, context.ConditionalSelect(r1True, Const((int)FPCR.RMode1), Const(0))); + + context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr); + + // TODO: Restore FPSR + } + } + + public static void ExitArmFpMode(EmitterContext context, Action setFpFlag) + { + if (Optimizations.UseSse2) + { + Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr); + + // Unset round mode (to nearest) and ftz. + mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)(Mxcsr.Ftz | Mxcsr.Daz | Mxcsr.Rhi | Mxcsr.Rlo))); + + context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr); + + // Status flags would be stored here if they were used. + } + else if (Optimizations.UseAdvSimd) + { + Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr); + + // Unset round mode (to nearest) and fz. + fpcr = context.BitwiseAnd(fpcr, Const(~(int)(FPCR.Fz | FPCR.RMode0 | FPCR.RMode1))); + + context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr); + + // TODO: Store FPSR + } + } + public static int GetImmShl(OpCodeSimdShImm op) { return op.Imm - (8 << op.Size); @@ -465,9 +535,11 @@ namespace ARMeilleure.Instructions ? 
typeof(SoftFloat32).GetMethod(name) : typeof(SoftFloat64).GetMethod(name); + context.ExitArmFpMode(); context.StoreToContext(); Operand res = context.Call(info, callArgs); context.LoadFromContext(); + context.EnterArmFpMode(); return res; } @@ -1358,39 +1430,6 @@ namespace ARMeilleure.Instructions } } - [Flags] - public enum Mxcsr - { - Ftz = 1 << 15, // Flush To Zero. - Um = 1 << 11, // Underflow Mask. - Dm = 1 << 8, // Denormal Mask. - Daz = 1 << 6 // Denormals Are Zero. - } - - public static void EmitSseOrAvxEnterFtzAndDazModesOpF(ArmEmitterContext context, out Operand isTrue) - { - isTrue = GetFpFlag(FPState.FzFlag); - - Operand lblTrue = Label(); - context.BranchIfFalse(lblTrue, isTrue); - - context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrmb, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm | Mxcsr.Daz))); - - context.MarkLabel(lblTrue); - } - - public static void EmitSseOrAvxExitFtzAndDazModesOpF(ArmEmitterContext context, Operand isTrue = default) - { - isTrue = isTrue == default ? GetFpFlag(FPState.FzFlag) : isTrue; - - Operand lblTrue = Label(); - context.BranchIfFalse(lblTrue, isTrue); - - context.AddIntrinsicNoRet(Intrinsic.X86Mxcsrub, Const((int)(Mxcsr.Ftz | Mxcsr.Daz))); - - context.MarkLabel(lblTrue); - } - public enum CmpCondition { // Legacy Sse. 
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs index 84b01d05c..36d27d425 100644 --- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs +++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs @@ -1197,9 +1197,11 @@ namespace ARMeilleure.Instructions Array.Resize(ref callArgs, callArgs.Length + 1); callArgs[callArgs.Length - 1] = Const(1); + context.ExitArmFpMode(); context.StoreToContext(); Operand res = context.Call(info, callArgs); context.LoadFromContext(); + context.EnterArmFpMode(); return res; } diff --git a/ARMeilleure/Instructions/InstEmitSystem.cs b/ARMeilleure/Instructions/InstEmitSystem.cs index 1345bbf10..f668b83b6 100644 --- a/ARMeilleure/Instructions/InstEmitSystem.cs +++ b/ARMeilleure/Instructions/InstEmitSystem.cs @@ -192,6 +192,8 @@ namespace ARMeilleure.Instructions SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpcr, Const(flag)), Const(1))); } } + + context.UpdateArmFpMode(); } private static void EmitSetFpsr(ArmEmitterContext context) @@ -210,6 +212,8 @@ namespace ARMeilleure.Instructions SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpsr, Const(flag)), Const(1))); } } + + context.UpdateArmFpMode(); } } } diff --git a/ARMeilleure/Instructions/InstEmitSystem32.cs b/ARMeilleure/Instructions/InstEmitSystem32.cs index e07db4121..2f6cf19d6 100644 --- a/ARMeilleure/Instructions/InstEmitSystem32.cs +++ b/ARMeilleure/Instructions/InstEmitSystem32.cs @@ -321,6 +321,8 @@ namespace ARMeilleure.Instructions SetFpFlag(context, (FPState)flag, context.BitwiseAnd(context.ShiftRightUI(fpscr, Const(flag)), Const(1))); } } + + context.UpdateArmFpMode(); } } } diff --git a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs index b629345ee..f5a776fa2 100644 --- a/ARMeilleure/IntermediateRepresentation/Intrinsic.cs +++ b/ARMeilleure/IntermediateRepresentation/Intrinsic.cs @@ -53,6 +53,7 @@ 
namespace ARMeilleure.IntermediateRepresentation X86Haddpd, X86Haddps, X86Insertps, + X86Ldmxcsr, X86Maxpd, X86Maxps, X86Maxsd, @@ -68,8 +69,6 @@ namespace ARMeilleure.IntermediateRepresentation X86Mulps, X86Mulsd, X86Mulss, - X86Mxcsrmb, - X86Mxcsrub, X86Paddb, X86Paddd, X86Paddq, @@ -153,6 +152,7 @@ namespace ARMeilleure.IntermediateRepresentation X86Sqrtps, X86Sqrtsd, X86Sqrtss, + X86Stmxcsr, X86Subpd, X86Subps, X86Subsd, @@ -163,11 +163,13 @@ namespace ARMeilleure.IntermediateRepresentation X86Unpcklps, X86Vcvtph2ps, X86Vcvtps2ph, + X86Vfmadd231pd, X86Vfmadd231ps, X86Vfmadd231sd, X86Vfmadd231ss, X86Vfmsub231sd, X86Vfmsub231ss, + X86Vfnmadd231pd, X86Vfnmadd231ps, X86Vfnmadd231sd, X86Vfnmadd231ss, @@ -394,6 +396,8 @@ namespace ARMeilleure.IntermediateRepresentation Arm64MlsVe, Arm64MlsV, Arm64MoviV, + Arm64MrsFpcr, + Arm64MsrFpcr, Arm64MrsFpsr, Arm64MsrFpsr, Arm64MulVe, diff --git a/ARMeilleure/Translation/ArmEmitterContext.cs b/ARMeilleure/Translation/ArmEmitterContext.cs index 238f85082..565d2aada 100644 --- a/ARMeilleure/Translation/ArmEmitterContext.cs +++ b/ARMeilleure/Translation/ArmEmitterContext.cs @@ -188,6 +188,21 @@ namespace ARMeilleure.Translation } } + public void EnterArmFpMode() + { + InstEmitSimdHelper.EnterArmFpMode(this, InstEmitHelper.GetFpFlag); + } + + public void UpdateArmFpMode() + { + EnterArmFpMode(); + } + + public void ExitArmFpMode() + { + InstEmitSimdHelper.ExitArmFpMode(this, (flag, value) => InstEmitHelper.SetFpFlag(this, flag, value)); + } + public Operand TryGetComparisonResult(Condition condition) { if (_optOpLastCompare == null || _optOpLastCompare != _optOpLastFlagSet) diff --git a/ARMeilleure/Translation/DispatcherFunction.cs b/ARMeilleure/Translation/DispatcherFunction.cs index e3ea21f67..7d5a3388e 100644 --- a/ARMeilleure/Translation/DispatcherFunction.cs +++ b/ARMeilleure/Translation/DispatcherFunction.cs @@ -3,4 +3,5 @@ namespace ARMeilleure.Translation { delegate void DispatcherFunction(IntPtr nativeContext, ulong 
startAddress); + delegate ulong WrapperFunction(IntPtr nativeContext, ulong startAddress); } diff --git a/ARMeilleure/Translation/PTC/Ptc.cs b/ARMeilleure/Translation/PTC/Ptc.cs index 17f687062..5970c4ff9 100644 --- a/ARMeilleure/Translation/PTC/Ptc.cs +++ b/ARMeilleure/Translation/PTC/Ptc.cs @@ -30,7 +30,7 @@ namespace ARMeilleure.Translation.PTC private const string OuterHeaderMagicString = "PTCohd\0\0"; private const string InnerHeaderMagicString = "PTCihd\0\0"; - private const uint InternalVersion = 4485; //! To be incremented manually for each change to the ARMeilleure project. + private const uint InternalVersion = 4626; //! To be incremented manually for each change to the ARMeilleure project. private const string ActualDir = "0"; private const string BackupDir = "1"; diff --git a/ARMeilleure/Translation/TranslatedFunction.cs b/ARMeilleure/Translation/TranslatedFunction.cs index 71eec08ac..f007883ef 100644 --- a/ARMeilleure/Translation/TranslatedFunction.cs +++ b/ARMeilleure/Translation/TranslatedFunction.cs @@ -25,5 +25,10 @@ namespace ARMeilleure.Translation { return _func(context.NativeContextPtr); } + + public ulong Execute(WrapperFunction dispatcher, State.ExecutionContext context) + { + return dispatcher(context.NativeContextPtr, (ulong)FuncPointer); + } } } \ No newline at end of file diff --git a/ARMeilleure/Translation/Translator.cs b/ARMeilleure/Translation/Translator.cs index 0c05b2b49..f349c5ebf 100644 --- a/ARMeilleure/Translation/Translator.cs +++ b/ARMeilleure/Translation/Translator.cs @@ -183,7 +183,7 @@ namespace ARMeilleure.Translation Statistics.StartTimer(); - ulong nextAddr = func.Execute(context); + ulong nextAddr = func.Execute(Stubs.ContextWrapper, context); Statistics.StopTimer(address); @@ -194,7 +194,7 @@ namespace ARMeilleure.Translation { TranslatedFunction func = Translate(address, context.ExecutionMode, highCq: false, singleStep: true); - address = func.Execute(context); + address = func.Execute(Stubs.ContextWrapper, context); 
EnqueueForDeletion(address, func); diff --git a/ARMeilleure/Translation/TranslatorStubs.cs b/ARMeilleure/Translation/TranslatorStubs.cs index 6ed84de80..69648df44 100644 --- a/ARMeilleure/Translation/TranslatorStubs.cs +++ b/ARMeilleure/Translation/TranslatorStubs.cs @@ -21,6 +21,7 @@ namespace ARMeilleure.Translation private readonly Translator _translator; private readonly Lazy _dispatchStub; private readonly Lazy _dispatchLoop; + private readonly Lazy _contextWrapper; /// /// Gets the dispatch stub. @@ -64,6 +65,20 @@ namespace ARMeilleure.Translation } } + /// + /// Gets the context wrapper function. + /// + /// instance was disposed + public WrapperFunction ContextWrapper + { + get + { + ObjectDisposedException.ThrowIf(_disposed, this); + + return _contextWrapper.Value; + } + } + /// /// Initializes a new instance of the class with the specified /// instance. @@ -77,6 +92,7 @@ namespace ARMeilleure.Translation _translator = translator; _dispatchStub = new(GenerateDispatchStub, isThreadSafe: true); _dispatchLoop = new(GenerateDispatchLoop, isThreadSafe: true); + _contextWrapper = new(GenerateContextWrapper, isThreadSafe: true); } /// @@ -202,6 +218,32 @@ namespace ARMeilleure.Translation return Marshal.GetFunctionPointerForDelegate(func); } + /// + /// Emits code that syncs FP state before executing guest code, or returns it to normal. 
+ /// + /// Emitter context for the method + /// Pointer to the native context + /// True if entering guest code, false otherwise + private void EmitSyncFpContext(EmitterContext context, Operand nativeContext, bool enter) + { + if (enter) + { + InstEmitSimdHelper.EnterArmFpMode(context, (flag) => + { + Operand flagAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetRegisterOffset(new Register((int)flag, RegisterType.FpFlag)))); + return context.Load(OperandType.I32, flagAddress); + }); + } + else + { + InstEmitSimdHelper.ExitArmFpMode(context, (flag, value) => + { + Operand flagAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetRegisterOffset(new Register((int)flag, RegisterType.FpFlag)))); + context.Store(flagAddress, value); + }); + } + } + /// /// Generates a function. /// @@ -221,6 +263,8 @@ namespace ARMeilleure.Translation Operand runningAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetRunningOffset())); Operand dispatchAddress = context.Add(nativeContext, Const((ulong)NativeContext.GetDispatchAddressOffset())); + EmitSyncFpContext(context, nativeContext, true); + context.MarkLabel(beginLbl); context.Store(dispatchAddress, guestAddress); context.Copy(guestAddress, context.Call(Const((ulong)DispatchStub), OperandType.I64, nativeContext)); @@ -229,6 +273,9 @@ namespace ARMeilleure.Translation context.Branch(beginLbl); context.MarkLabel(endLbl); + + EmitSyncFpContext(context, nativeContext, false); + context.Return(); var cfg = context.GetControlFlowGraph(); @@ -237,5 +284,29 @@ namespace ARMeilleure.Translation return Compiler.Compile(cfg, argTypes, retType, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture).Map(); } + + /// + /// Generates a function. 
+ /// + /// function + private WrapperFunction GenerateContextWrapper() + { + var context = new EmitterContext(); + + Operand nativeContext = context.LoadArgument(OperandType.I64, 0); + Operand guestMethod = context.LoadArgument(OperandType.I64, 1); + + EmitSyncFpContext(context, nativeContext, true); + Operand returnValue = context.Call(guestMethod, OperandType.I64, nativeContext); + EmitSyncFpContext(context, nativeContext, false); + + context.Return(returnValue); + + var cfg = context.GetControlFlowGraph(); + var retType = OperandType.I64; + var argTypes = new[] { OperandType.I64, OperandType.I64 }; + + return Compiler.Compile(cfg, argTypes, retType, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture).Map(); + } } } diff --git a/ARMeilleure/Translation/TranslatorTestMethods.cs b/ARMeilleure/Translation/TranslatorTestMethods.cs new file mode 100644 index 000000000..ab96019a6 --- /dev/null +++ b/ARMeilleure/Translation/TranslatorTestMethods.cs @@ -0,0 +1,148 @@ +using ARMeilleure.CodeGen.X86; +using ARMeilleure.IntermediateRepresentation; +using ARMeilleure.State; +using ARMeilleure.Translation; +using System; +using System.Runtime.InteropServices; +using static ARMeilleure.IntermediateRepresentation.Operand.Factory; + +namespace ARMeilleure.Translation +{ + public static class TranslatorTestMethods + { + public delegate int FpFlagsPInvokeTest(IntPtr managedMethod); + + private static bool SetPlatformFtz(EmitterContext context, bool ftz) + { + if (Optimizations.UseSse2) + { + Operand mxcsr = context.AddIntrinsicInt(Intrinsic.X86Stmxcsr); + + if (ftz) + { + mxcsr = context.BitwiseOr(mxcsr, Const((int)(Mxcsr.Ftz | Mxcsr.Um | Mxcsr.Dm))); + } + else + { + mxcsr = context.BitwiseAnd(mxcsr, Const(~(int)Mxcsr.Ftz)); + } + + context.AddIntrinsicNoRet(Intrinsic.X86Ldmxcsr, mxcsr); + + return true; + } + else if (Optimizations.UseAdvSimd) + { + Operand fpcr = context.AddIntrinsicInt(Intrinsic.Arm64MrsFpcr); + + if (ftz) + { + fpcr = context.BitwiseOr(fpcr, 
Const((int)FPCR.Fz)); + } + else + { + fpcr = context.BitwiseAnd(fpcr, Const(~(int)FPCR.Fz)); + } + + context.AddIntrinsicNoRet(Intrinsic.Arm64MsrFpcr, fpcr); + + return true; + } + else + { + return false; + } + } + + private static Operand FpBitsToInt(EmitterContext context, Operand fp) + { + Operand vec = context.VectorInsert(context.VectorZero(), fp, 0); + return context.VectorExtract(OperandType.I32, vec, 0); + } + + public static FpFlagsPInvokeTest GenerateFpFlagsPInvokeTest() + { + EmitterContext context = new EmitterContext(); + + Operand methodAddress = context.Copy(context.LoadArgument(OperandType.I64, 0)); + + // Verify that default dotnet fp state does not flush to zero. + // This is required for SoftFloat to function. + + // Denormal + zero != 0 + + Operand denormal = ConstF(BitConverter.Int32BitsToSingle(1)); // 1.40129846432e-45 + Operand zeroF = ConstF(0f); + Operand zero = Const(0); + + Operand result = context.Add(zeroF, denormal); + + // Must not be zero. + + Operand correct1Label = Label(); + + context.BranchIfFalse(correct1Label, context.ICompareEqual(FpBitsToInt(context, result), zero)); + + context.Return(Const(1)); + + context.MarkLabel(correct1Label); + + // Set flush to zero flag. If unsupported by the backend, just return 0 (success). + + if (!SetPlatformFtz(context, true)) + { + context.Return(Const(0)); + } + + // Denormal + zero == 0 + + Operand resultFz = context.Add(zeroF, denormal); + + // Must equal zero. + + Operand correct2Label = Label(); + + context.BranchIfTrue(correct2Label, context.ICompareEqual(FpBitsToInt(context, resultFz), zero)); + + SetPlatformFtz(context, false); + + context.Return(Const(2)); + + context.MarkLabel(correct2Label); + + // Call a managed method. This method should not change Fz state. + + context.Call(methodAddress, OperandType.None); + + // Denormal + zero == 0 + + Operand resultFz2 = context.Add(zeroF, denormal); + + // Must equal zero.
+ + Operand correct3Label = Label(); + + context.BranchIfTrue(correct3Label, context.ICompareEqual(FpBitsToInt(context, resultFz2), zero)); + + SetPlatformFtz(context, false); + + context.Return(Const(3)); + + context.MarkLabel(correct3Label); + + // Success. + + SetPlatformFtz(context, false); + + context.Return(Const(0)); + + // Compile and return the function. + + ControlFlowGraph cfg = context.GetControlFlowGraph(); + + OperandType[] argTypes = new OperandType[] { OperandType.I64 }; + + return Compiler.Compile(cfg, argTypes, OperandType.I32, CompilerOptions.HighCq, RuntimeInformation.ProcessArchitecture).Map(); + } + } +} diff --git a/Ryujinx.Tests/Cpu/EnvironmentTests.cs b/Ryujinx.Tests/Cpu/EnvironmentTests.cs new file mode 100644 index 000000000..d374c08a5 --- /dev/null +++ b/Ryujinx.Tests/Cpu/EnvironmentTests.cs @@ -0,0 +1,91 @@ +using ARMeilleure.Translation; +using NUnit.Framework; +using Ryujinx.Cpu.Jit; +using Ryujinx.Tests.Memory; +using System; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +namespace Ryujinx.Tests.Cpu +{ + internal class EnvironmentTests + { + private static Translator _translator; + + private void EnsureTranslator() + { + // Create a translator, as one is needed to register the signal handler or emit methods. + _translator ??= new Translator(new JitMemoryAllocator(), new MockMemoryManager(), true); + } + + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)] + private float GetDenormal() + { + return BitConverter.Int32BitsToSingle(1); + } + + [MethodImpl(MethodImplOptions.NoInlining | MethodImplOptions.NoOptimization)] + private float GetZero() + { + return BitConverter.Int32BitsToSingle(0); + } + + /// + /// This test ensures that managed methods do not reset floating point control flags. + /// This is used to avoid changing control flags when running methods that don't require it, such as SVC calls, software memory... 
+ /// + [Test] + public void FpFlagsPInvoke() + { + EnsureTranslator(); + + // Subnormal results are not flushed to zero by default. + // This operation should not be allowed to do constant propagation, hence the methods that explicitly disallow inlining. + Assert.AreNotEqual(GetDenormal() + GetZero(), 0f); + + bool methodCalled = false; + bool isFz = false; + + var managedMethod = () => + { + // Floating point math should not modify fp flags. + float test = 2f * 3.5f; + + if (test < 4f) + { + throw new System.Exception("Sanity check."); + } + + isFz = GetDenormal() + GetZero() == 0f; + + try + { + if (test >= 4f) + { + throw new System.Exception("Always throws."); + } + } + catch + { + // Exception handling should not modify fp flags. + + methodCalled = true; + } + }; + + var method = TranslatorTestMethods.GenerateFpFlagsPInvokeTest(); + + // This method sets flush-to-zero and then calls the managed method. + // Before and after setting the flags, it ensures subnormal addition works as expected. + // It returns a positive result if any tests fail, and 0 on success (or if the platform cannot change FP flags). + int result = method(Marshal.GetFunctionPointerForDelegate(managedMethod)); + + // Subnormal results are not flushed to zero by default; that default state should have been restored on exiting the method. + Assert.AreNotEqual(GetDenormal() + GetZero(), 0f); + + Assert.True(result == 0); + Assert.True(methodCalled); + Assert.True(isFz); + } + } +}