From 8d41402fa603a2f00ccd08239d3b938fd60715a3 Mon Sep 17 00:00:00 2001
From: merry <git@mary.rs>
Date: Wed, 19 Oct 2022 01:36:04 +0100
Subject: [PATCH] A32: Implement VCVTT, VCVTB (#3710)

* A32: Implement VCVTT, VCVTB

* A32: F16C implementation of VCVTT/VCVTB
---
 ARMeilleure/Decoders/OpCode32SimdCvtTB.cs     | 44 ++++++++++
 ARMeilleure/Decoders/OpCodeTable.cs           |  1 +
 ARMeilleure/Instructions/InstEmitSimdCvt32.cs | 62 +++++++++++++
 .../Instructions/InstEmitSimdHelper32.cs      | 16 ++++
 Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs         | 87 +++++++++++++++++++
 5 files changed, 210 insertions(+)
 create mode 100644 ARMeilleure/Decoders/OpCode32SimdCvtTB.cs

diff --git a/ARMeilleure/Decoders/OpCode32SimdCvtTB.cs b/ARMeilleure/Decoders/OpCode32SimdCvtTB.cs
new file mode 100644
index 0000000000..a95b32ab07
--- /dev/null
+++ b/ARMeilleure/Decoders/OpCode32SimdCvtTB.cs
@@ -0,0 +1,44 @@
+namespace ARMeilleure.Decoders
+{
+    class OpCode32SimdCvtTB : OpCode32, IOpCode32Simd // Decoded operands of the VCVTB/VCVTT half-precision conversions.
+    {
+        public int Vd { get; } // Destination register index.
+        public int Vm { get; } // Source register index.
+        public bool Op { get; } // True: convert to half; false: convert from half (opcode bit 16).
+        public bool T { get; } // True: top half; false: bottom half (opcode bit 7).
+        public int Size { get; } // 1: double; 0: single (opcode bit 8).
+
+        public new static OpCode Create(InstDescriptor inst, ulong address, int opCode) => new OpCode32SimdCvtTB(inst, address, opCode, false); // Non-Thumb variant (isThumb = false).
+        public static OpCode CreateT32(InstDescriptor inst, ulong address, int opCode) => new OpCode32SimdCvtTB(inst, address, opCode, true); // Thumb variant (isThumb = true).
+
+        public OpCode32SimdCvtTB(InstDescriptor inst, ulong address, int opCode, bool isThumb) : base(inst, address, opCode)
+        {
+            IsThumb = isThumb;
+
+            Op   = ((opCode >> 16) & 0x1) != 0;
+            T    = ((opCode >> 7) & 0x1) != 0;
+            Size = ((opCode >> 8) & 0x1);
+
+            RegisterSize = Size == 1 ? RegisterSize.Int64 : RegisterSize.Int32; // Int64 for the double form, Int32 for the single form.
+
+            if (Size == 1) // Double form: the non-half operand uses the 5-bit D-register layout (extra bit on top).
+            {
+                if (Op) // To half: the source is the double operand.
+                {
+                    Vm = ((opCode >> 1) & 0x10) | ((opCode >> 0) & 0xf); // Index = bit 5 : bits 3..0.
+                    Vd = ((opCode >> 22) & 0x1) | ((opCode >> 11) & 0x1e); // Index = bits 15..12 : bit 22.
+                }
+                else // From half: the destination is the double operand.
+                {
+                    Vm = ((opCode >> 5) & 0x1) | ((opCode << 1) & 0x1e); // Index = bits 3..0 : bit 5.
+                    Vd = ((opCode >> 18) & 0x10) | ((opCode >> 12) & 0xf); // Index = bit 22 : bits 15..12.
+                }
+            }
+            else // Single form: both operands use the S-register layout (extra bit at the bottom).
+            {
+                Vm = ((opCode >> 5) & 0x1) | ((opCode << 1) & 0x1e); // Index = bits 3..0 : bit 5.
+                Vd = ((opCode >> 22) & 0x1) | ((opCode >> 11) & 0x1e); // Index = bits 15..12 : bit 22.
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/ARMeilleure/Decoders/OpCodeTable.cs b/ARMeilleure/Decoders/OpCodeTable.cs
index b4f4b17966..f44c154006 100644
--- a/ARMeilleure/Decoders/OpCodeTable.cs
+++ b/ARMeilleure/Decoders/OpCodeTable.cs
@@ -828,6 +828,7 @@ namespace ARMeilleure.Decoders
             SetVfp("<<<<11101x11110xxxxx101x11x0xxxx", InstName.Vcvt,   InstEmit32.Vcvt_FI,  OpCode32SimdCvtFI.Create,       OpCode32SimdCvtFI.CreateT32); // FP32 to int.
             SetVfp("<<<<11101x111000xxxx101xx1x0xxxx", InstName.Vcvt,   InstEmit32.Vcvt_FI,  OpCode32SimdCvtFI.Create,       OpCode32SimdCvtFI.CreateT32); // Int to FP32.
             SetVfp("111111101x1111xxxxxx101xx1x0xxxx", InstName.Vcvt,   InstEmit32.Vcvt_RM,  OpCode32SimdCvtFI.Create,       OpCode32SimdCvtFI.CreateT32); // The many FP32 to int encodings (fp).
+            SetVfp("<<<<11101x11001xxxxx101xx1x0xxxx", InstName.Vcvt,   InstEmit32.Vcvt_TB,  OpCode32SimdCvtTB.Create,       OpCode32SimdCvtTB.CreateT32); // FP16 <-> FP32/FP64 (VCVTB, VCVTT).
             SetVfp("<<<<11101x00xxxxxxxx101xx0x0xxxx", InstName.Vdiv,   InstEmit32.Vdiv_S,   OpCode32SimdRegS.Create,        OpCode32SimdRegS.CreateT32);
             SetVfp("<<<<11101xx0xxxxxxxx1011x0x10000", InstName.Vdup,   InstEmit32.Vdup,     OpCode32SimdDupGP.Create,       OpCode32SimdDupGP.CreateT32);
             SetVfp("<<<<11101x10xxxxxxxx101xx0x0xxxx", InstName.Vfma,   InstEmit32.Vfma_S,   OpCode32SimdRegS.Create,        OpCode32SimdRegS.CreateT32);
diff --git a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
index c76634ebf9..ba713feb7a 100644
--- a/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdCvt32.cs
@@ -261,6 +261,68 @@ namespace ARMeilleure.Instructions
             }
         }
 
+        public static void Vcvt_TB(ArmEmitterContext context) // Emits VCVTB/VCVTT: scalar conversion between half and single/double precision.
+        {
+            OpCode32SimdCvtTB op = (OpCode32SimdCvtTB)context.CurrOp;
+
+            if (Optimizations.UseF16c) // Fast path built on the x86 F16C conversion intrinsics.
+            {
+                Debug.Assert(!Optimizations.ForceLegacySse);
+
+                if (op.Op) // Convert to half.
+                {
+                    Operand res = ExtractScalar(context, op.Size == 1 ? OperandType.FP64 : OperandType.FP32, op.Vm);
+                    if (op.Size == 1) // Double source: narrow to single before the half conversion.
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Cvtsd2ss, context.VectorZero(), res);
+                    }
+                    res = context.AddIntrinsic(Intrinsic.X86Vcvtps2ph, res, Const(X86GetRoundControl(FPRoundingMode.ToNearest))); // NOTE(review): rounding is fixed to ToNearest here; confirm the guest rounding mode need not be honored.
+                    res = context.VectorExtract16(res, 0); // Take 16-bit lane 0, which holds the converted half.
+                    InsertScalar16(context, op.Vd, op.T, res); // Write it to the top or bottom half selected by T.
+                }
+                else // Convert from half.
+                {
+                    Operand res = context.VectorCreateScalar(ExtractScalar16(context, op.Vm, op.T)); // Read the half selected by T.
+                    res = context.AddIntrinsic(Intrinsic.X86Vcvtph2ps, res);
+                    if (op.Size == 1) // Double destination: widen the single result.
+                    {
+                        res = context.AddIntrinsic(Intrinsic.X86Cvtss2sd, context.VectorZero(), res);
+                    }
+                    res = context.VectorExtract(op.Size == 1 ? OperandType.I64 : OperandType.I32, res, 0); // Extract element 0 as I64 (double) or I32 (single).
+                    InsertScalar(context, op.Vd, res);
+                }
+            }
+            else // Fallback: call the managed SoftFloat conversion routines.
+            {
+                if (op.Op)
+                {
+                    // Convert to half: FP32/FP64 source in Vm, result stored in the half of Vd selected by T.
+
+                    Operand src = ExtractScalar(context, op.Size == 1 ? OperandType.FP64 : OperandType.FP32, op.Vm);
+
+                    MethodInfo method = op.Size == 1
+                        ? typeof(SoftFloat64_16).GetMethod(nameof(SoftFloat64_16.FPConvert))
+                        : typeof(SoftFloat32_16).GetMethod(nameof(SoftFloat32_16.FPConvert));
+                    Operand res = context.Call(method, src);
+
+                    InsertScalar16(context, op.Vd, op.T, res);
+                }
+                else
+                {
+                    // Convert from half: source is the half of Vm selected by T, FP32/FP64 result stored in Vd.
+
+                    Operand src = ExtractScalar16(context, op.Vm, op.T);
+
+                    MethodInfo method = op.Size == 1
+                        ? typeof(SoftFloat16_64).GetMethod(nameof(SoftFloat16_64.FPConvert))
+                        : typeof(SoftFloat16_32).GetMethod(nameof(SoftFloat16_32.FPConvert));
+                    Operand res = context.Call(method, src);
+
+                    InsertScalar(context, op.Vd, res);
+                }
+            }
+        }
+
         // VRINTA/M/N/P (floating-point).
         public static void Vrint_RM(ArmEmitterContext context)
         {
diff --git a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
index 0620ea3320..84b01d05ca 100644
--- a/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
+++ b/ARMeilleure/Instructions/InstEmitSimdHelper32.cs
@@ -70,6 +70,22 @@ namespace ARMeilleure.Instructions
             context.Copy(vec, insert);
         }
 
+        public static Operand ExtractScalar16(ArmEmitterContext context, int reg, bool top) // Reads the top or bottom 16 bits of S register `reg`.
+        {
+            return context.VectorExtract16(GetVecA32(reg >> 2), ((reg & 3) << 1) | (top ? 1 : 0)); // Vector = reg / 4; 16-bit lane = 2 * (reg % 4), plus 1 for the top half.
+        }
+
+        public static void InsertScalar16(ArmEmitterContext context, int reg, bool top, Operand value)
+        {
+            // Stores a 16-bit value into the top or bottom half of S register `reg`; only 32-bit operands are accepted.
+            Debug.Assert(value.Type == OperandType.FP32 || value.Type == OperandType.I32);
+
+            int halfIndex = ((reg & 3) << 1) | (top ? 1 : 0);
+            Operand vec = GetVecA32(reg >> 2);
+
+            context.Copy(vec, context.VectorInsert16(vec, value, halfIndex));
+        }
+
         public static Operand ExtractElement(ArmEmitterContext context, int reg, int size, bool signed)
         {
             return EmitVectorExtract32(context, reg >> (4 - size), reg & ((16 >> size) - 1), size, signed);
diff --git a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
index 78d5c3cc2a..0c90d0bad5 100644
--- a/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
+++ b/Ryujinx.Tests/Cpu/CpuTestSimdCvt32.cs
@@ -339,6 +339,93 @@ namespace Ryujinx.Tests.Cpu
 
             CompareAgainstUnicorn();
         }
+
+        [Explicit]
+        [Test, Pairwise, Description("VCVT<top>.F16.F32 <Sd>, <Sm>")]
+        public void Vcvt_F32_F16([Values(0u, 1u, 2u, 3u)] uint rd,
+                                 [Values(0u, 1u, 2u, 3u)] uint rm,
+                                 [ValueSource(nameof(_1S_))] [Random(RndCnt)] uint s0,
+                                 [ValueSource(nameof(_1S_))] [Random(RndCnt)] uint s1,
+                                 [ValueSource(nameof(_1S_))] [Random(RndCnt)] uint s2,
+                                 [ValueSource(nameof(_1S_))] [Random(RndCnt)] uint s3,
+                                 [Values] bool top)
+        {
+            uint opcode = 0xeeb30a40; // VCVTB.F16.F32 S0, S0
+
+            if (top)
+            {
+                opcode |= 1 << 7; // T bit: selects VCVTT (top half) instead of VCVTB.
+            }
+
+            opcode |= ((rd & 0x1e) << 11) | ((rd & 0x1) << 22); // Sd: index bits 4..1 -> opcode bits 15..12, index bit 0 -> opcode bit 22.
+            opcode |= ((rm & 0x1e) >> 1) | ((rm & 0x1) << 5); // Sm: index bits 4..1 -> opcode bits 3..0, index bit 0 -> opcode bit 5.
+
+            V128 v0 = MakeVectorE0E1E2E3(s0, s1, s2, s3);
+
+            SingleOpcode(opcode, v0: v0);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Explicit]
+        [Test, Pairwise, Description("VCVT<top>.F16.F64 <Sd>, <Dm>")]
+        public void Vcvt_F64_F16([Values(0u, 1u, 2u, 3u)] uint rd,
+                                 [Values(0u, 1u)] uint rm,
+                                 [ValueSource(nameof(_1D_F_))] ulong d0,
+                                 [ValueSource(nameof(_1D_F_))] ulong d1,
+                                 [Values] bool top)
+        {
+            // Base encoding is VCVTB.F16.F64 S0, D0; opcode bit 7 switches to the top-half form.
+            const uint baseOpcode = 0xeeb30b40;
+            uint topBit = top ? 1u << 7 : 0u;
+
+            // Sd index: bits 4..1 -> opcode bits 15..12, bit 0 -> bit 22. Dm index: bits 3..0 -> opcode bits 3..0, bit 4 -> bit 5.
+            uint sdBits = ((rd & 0x1e) << 11) | ((rd & 0x1) << 22);
+            uint dmBits = ((rm & 0xf) << 0) | ((rm & 0x10) << 1);
+
+            // The two 64-bit test values form the input vector passed as v0.
+            V128 input = MakeVectorE0E1(d0, d1);
+
+            // Run the assembled instruction, then check the result against Unicorn.
+            SingleOpcode(baseOpcode | topBit | sdBits | dmBits, v0: input);
+
+            CompareAgainstUnicorn();
+        }
+
+        [Explicit]
+        [Test, Pairwise, Description("VCVT<top>.F<size>.F16 <Vd>, <Sm>")]
+        public void Vcvt_F16_Fx([Values(0u, 1u, 2u, 3u)] uint rd,
+                                [Values(0u, 1u, 2u, 3u)] uint rm,
+                                [ValueSource(nameof(_1D_F_))] ulong d0,
+                                [ValueSource(nameof(_1D_F_))] ulong d1,
+                                [Values] bool top,
+                                [Values] bool sz)
+        {
+            uint opcode = 0xeeb20a40; // VCVTB.F32.F16 S0, S0
+
+            if (top)
+            {
+                opcode |= 1 << 7; // T bit: selects VCVTT (top half) instead of VCVTB.
+            }
+
+            if (sz)
+            {
+                opcode |= 1 << 8; // Double-precision destination.
+                opcode |= ((rd & 0xf) << 12) | ((rd & 0x10) << 18); // Dd: index bits 3..0 -> opcode bits 15..12, index bit 4 -> opcode bit 22.
+            }
+            else
+            {
+                opcode |= ((rd & 0x1e) << 11) | ((rd & 0x1) << 22); // Sd: index bits 4..1 -> opcode bits 15..12, index bit 0 -> opcode bit 22.
+            }
+
+            opcode |= ((rm & 0x1e) >> 1) | ((rm & 0x1) << 5); // Sm is always an S register: index bits 4..1 -> opcode bits 3..0, index bit 0 -> opcode bit 5.
+
+            V128 v0 = MakeVectorE0E1(d0, d1);
+
+            SingleOpcode(opcode, v0: v0);
+
+            CompareAgainstUnicorn();
+        }
 #endif
     }
 }