CPU: Implement VFNMS.F32/64 (#1758)

* Add necessary methods / op-code

* Enable Support for FMA Instruction Set

* Add Intrinsics / Assembly Opcodes for VFMSUB231XX.

* Add X86 Instructions for VFMSUB231XX

* Implement VFNMS

* Implement VFNMS Tests

* Add special cases for FMA instructions.

* Update PPTC Version

* Remove unused Op

* Move Check into Assert / Cleanup

* Rename and cleanup

* Whitespace

* Whitespace / Rename

* Re-sort

* Address final requests

* Implement VFMA.F64

* Simplify switch

* Simplify FMA Instructions into their own IntrinsicType.

* Remove whitespace

* Fix indentation

* Change tests for Vfnms -- disable inf / nan

* Move args up, not description ;)

* Undo vfma

* Completely remove vfms code.,

* Fix order of instruction in assembler
This commit is contained in:
sharmander 2020-12-03 14:20:02 -05:00 committed by GitHub
parent c00d39b675
commit b479a43939
No known key found for this signature in database
GPG key ID: 4AEE18F83AFDEB23
14 changed files with 462 additions and 363 deletions

View file

@ -271,6 +271,10 @@ namespace ARMeilleure.CodeGen.X86
Add(X86Instruction.Vblendvps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4a, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vcvtph2ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3813, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vcvtps2ph, new InstructionInfo(0x000f3a1d, BadOp, BadOp, BadOp, BadOp, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vfmsub231ps, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vfmsub231pd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38ba, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
Add(X86Instruction.Vfmsub231ss, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Vfmsub231sd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f38bb, InstructionFlags.Vex | InstructionFlags.Prefix66 | InstructionFlags.RexW));
Add(X86Instruction.Vpblendvb, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x000f3a4c, InstructionFlags.Vex | InstructionFlags.Prefix66));
Add(X86Instruction.Xor, new InstructionInfo(0x00000031, 0x06000083, 0x06000081, BadOp, 0x00000033, InstructionFlags.None));
Add(X86Instruction.Xorpd, new InstructionInfo(BadOp, BadOp, BadOp, BadOp, 0x00000f57, InstructionFlags.Vex | InstructionFlags.Prefix66));

View file

@ -406,12 +406,9 @@ namespace ARMeilleure.CodeGen.X86
else
{
EnsureSameReg(dest, src1);
Debug.Assert(src3.GetRegister().Index == 0);
context.Assembler.WriteInstruction(info.Inst, dest, src1, src2);
}
break;
}
@ -435,6 +432,23 @@ namespace ARMeilleure.CodeGen.X86
break;
}
case IntrinsicType.Fma:
{
Operand dest = operation.Destination;
Operand src1 = operation.GetSource(0);
Operand src2 = operation.GetSource(1);
Operand src3 = operation.GetSource(2);
EnsureSameType(dest, src1, src2, src3);
EnsureSameReg(dest, src1);
Debug.Assert(!dest.Type.IsInteger());
Debug.Assert(HardwareCapabilities.SupportsVexEncoding);
context.Assembler.WriteInstruction(info.Inst, dest, src2, src3);
break;
}
}
}
else

View file

@ -164,6 +164,10 @@ namespace ARMeilleure.CodeGen.X86
Add(Intrinsic.X86Unpcklps, new IntrinsicInfo(X86Instruction.Unpcklps, IntrinsicType.Binary));
Add(Intrinsic.X86Vcvtph2ps, new IntrinsicInfo(X86Instruction.Vcvtph2ps, IntrinsicType.Unary));
Add(Intrinsic.X86Vcvtps2ph, new IntrinsicInfo(X86Instruction.Vcvtps2ph, IntrinsicType.BinaryImm));
Add(Intrinsic.X86Vfmsub231pd, new IntrinsicInfo(X86Instruction.Vfmsub231pd, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmsub231ps, new IntrinsicInfo(X86Instruction.Vfmsub231ps, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmsub231sd, new IntrinsicInfo(X86Instruction.Vfmsub231sd, IntrinsicType.Fma));
Add(Intrinsic.X86Vfmsub231ss, new IntrinsicInfo(X86Instruction.Vfmsub231ss, IntrinsicType.Fma));
Add(Intrinsic.X86Xorpd, new IntrinsicInfo(X86Instruction.Xorpd, IntrinsicType.Binary));
Add(Intrinsic.X86Xorps, new IntrinsicInfo(X86Instruction.Xorps, IntrinsicType.Binary));
}

View file

@ -11,6 +11,7 @@ namespace ARMeilleure.CodeGen.X86
BinaryImm,
Crc32,
Ternary,
TernaryImm
TernaryImm,
Fma
}
}

View file

@ -1309,7 +1309,7 @@ namespace ARMeilleure.CodeGen.X86
IntrinsicOperation intrinOp = (IntrinsicOperation)operation;
IntrinsicInfo info = IntrinsicTable.GetInfo(intrinOp.Intrinsic);
return info.Type == IntrinsicType.Crc32 || IsVexSameOperandDestSrc1(operation);
return info.Type == IntrinsicType.Crc32 || info.Type == IntrinsicType.Fma || IsVexSameOperandDestSrc1(operation);
}
private static bool IsVexSameOperandDestSrc1(Operation operation)

View file

@ -200,6 +200,10 @@ namespace ARMeilleure.CodeGen.X86
Vblendvps,
Vcvtph2ps,
Vcvtps2ph,
Vfmsub231ps,
Vfmsub231pd,
Vfmsub231ss,
Vfmsub231sd,
Vpblendvb,
Xor,
Xorpd,

View file

@ -821,6 +821,7 @@ namespace ARMeilleure.Decoders
SetA32("111100101x11xxxxxxxxxxxxxxx0xxxx", InstName.Vext, InstEmit32.Vext, OpCode32SimdExt.Create);
SetA32("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma, InstEmit32.Vfma_S, OpCode32SimdRegS.Create);
SetA32("<<<<11101x10xxxxxxxx101xx1x0xxxx", InstName.Vfms, InstEmit32.Vfms_S, OpCode32SimdRegS.Create);
SetA32("<<<<11101x01xxxxxxxx101xx0x0xxxx", InstName.Vfnms, InstEmit32.Vfnms_S, OpCode32SimdRegS.Create);
SetA32("1111001x0x<<xxxxxxxx0000xxx0xxxx", InstName.Vhadd, InstEmit32.Vhadd, OpCode32SimdReg.Create);
SetA32("111101001x10xxxxxxxxxx00xxxxxxxx", InstName.Vld1, InstEmit32.Vld1, OpCode32SimdMemSingle.Create);
SetA32("111101000x10xxxxxxxx0111xxxxxxxx", InstName.Vld1, InstEmit32.Vld1, OpCode32SimdMemPair.Create); // Regs = 1.

View file

@ -284,6 +284,21 @@ namespace ARMeilleure.Instructions
}
}
public static void Vfnms_S(ArmEmitterContext context) // Fused.
{
if (Optimizations.FastFP && Optimizations.UseFma)
{
EmitScalarTernaryOpF32(context, Intrinsic.X86Vfmsub231ss, Intrinsic.X86Vfmsub231sd);
}
else
{
EmitScalarTernaryOpF32(context, (op1, op2, op3) =>
{
return EmitSoftFloatCall(context, nameof(SoftFloat32.FPMulAdd), context.Negate(op1), op2, op3);
});
}
}
public static void Vhadd(ArmEmitterContext context)
{
OpCode32SimdReg op = (OpCode32SimdReg)context.CurrOp;

View file

@ -901,6 +901,20 @@ namespace ARMeilleure.Instructions
context.Copy(initialD, res);
}
public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32, Intrinsic inst64)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;
bool doubleSize = (op.Size & 1) != 0;
Intrinsic inst = doubleSize ? inst64 : inst32;
EmitScalarTernaryOpSimd32(context, (d, n, m) =>
{
return context.AddIntrinsic(inst, d, n, m);
});
}
public static void EmitScalarTernaryOpF32(ArmEmitterContext context, Intrinsic inst32pt1, Intrinsic inst64pt1, Intrinsic inst32pt2, Intrinsic inst64pt2)
{
OpCode32SimdRegS op = (OpCode32SimdRegS)context.CurrOp;

View file

@ -571,6 +571,7 @@ namespace ARMeilleure.Instructions
Vext,
Vfma,
Vfms,
Vfnms,
Vhadd,
Vld1,
Vld2,

View file

@ -153,6 +153,10 @@ namespace ARMeilleure.IntermediateRepresentation
X86Unpcklps,
X86Vcvtph2ps,
X86Vcvtps2ph,
X86Vfmsub231pd,
X86Vfmsub231ps,
X86Vfmsub231sd,
X86Vfmsub231ss,
X86Xorpd,
X86Xorps
}

View file

@ -15,6 +15,7 @@ namespace ARMeilleure
public static bool UsePopCntIfAvailable { get; set; } = true;
public static bool UseAvxIfAvailable { get; set; } = true;
public static bool UseF16cIfAvailable { get; set; } = true;
public static bool UseFmaIfAvailable { get; set; } = true;
public static bool UseAesniIfAvailable { get; set; } = true;
public static bool UsePclmulqdqIfAvailable { get; set; } = true;
@ -33,6 +34,7 @@ namespace ARMeilleure
internal static bool UsePopCnt => UsePopCntIfAvailable && HardwareCapabilities.SupportsPopcnt;
internal static bool UseAvx => UseAvxIfAvailable && HardwareCapabilities.SupportsAvx && !ForceLegacySse;
internal static bool UseF16c => UseF16cIfAvailable && HardwareCapabilities.SupportsF16c;
internal static bool UseFma => UseFmaIfAvailable && HardwareCapabilities.SupportsFma;
internal static bool UseAesni => UseAesniIfAvailable && HardwareCapabilities.SupportsAesni;
internal static bool UsePclmulqdq => UsePclmulqdqIfAvailable && HardwareCapabilities.SupportsPclmulqdq;
}

View file

@ -21,7 +21,7 @@ namespace ARMeilleure.Translation.PTC
{
private const string HeaderMagic = "PTChd";
private const int InternalVersion = 1650; //! To be incremented manually for each change to the ARMeilleure project.
private const int InternalVersion = 1758; //! To be incremented manually for each change to the ARMeilleure project.
private const string ActualDir = "0";
private const string BackupDir = "1";

View file

@ -184,8 +184,8 @@ namespace Ryujinx.Tests.Cpu
private const int RndCnt = 2;
private static readonly bool NoZeros = false;
private static readonly bool NoInfs = false;
private static readonly bool NoNaNs = false;
private static readonly bool NoInfs = true;
private static readonly bool NoNaNs = true;
[Explicit]
[Test, Pairwise, Description("VADD.f32 V0, V0, V0")]
@ -293,6 +293,41 @@ namespace Ryujinx.Tests.Cpu
CompareAgainstUnicorn(fpsrMask: Fpsr.Nzcv);
}
[Test, Pairwise, Description("VFNMS.F<size> <Vd>, <Vn>, <Vm>")]
public void Vfnms([Values(0u, 1u)] uint rd,
[Values(0u, 1u)] uint rn,
[Values(0u, 1u)] uint rm,
[Values(2u, 3u)] uint size,
[ValueSource("_2S_F_")] ulong z,
[ValueSource("_2S_F_")] ulong a,
[ValueSource("_2S_F_")] ulong b)
{
uint opcode = 0xee900a00;
if (size == 2)
{
opcode |= (((rm & 0x1) << 5) | (rm & 0x1e) >> 1);
opcode |= (((rd & 0x1) << 22) | (rd & 0x1e) << 11);
opcode |= (((rn & 0x1) << 7) | (rn & 0x1e) >> 15);
}
else
{
opcode |= (((rm & 0x10) << 1) | (rm & 0xf) << 0);
opcode |= (((rd & 0x10) << 18) | (rd & 0xf) << 12);
opcode |= (((rn & 0x10) << 3) | (rn & 0xf) << 16);
}
opcode |= ((size & 3) << 8);
V128 v0 = MakeVectorE0E1(z, z);
V128 v1 = MakeVectorE0E1(a, z);
V128 v2 = MakeVectorE0E1(b, z);
SingleOpcode(opcode, v0: v0, v1: v1, v2: v2);
CompareAgainstUnicorn();
}
[Test, Pairwise, Description("VMLSL.<type><size> <Vd>, <Vn>, <Vm>")]
public void Vmlsl_I([Values(0u)] uint rd,
[Values(1u, 0u)] uint rn,