Merge pull request #1065 from yuriks/shader-fp

Shader FP compliance fixes
This commit is contained in:
Yuri Kunde Schlesner 2015-08-27 16:34:13 -07:00
commit c5a4025b65
4 changed files with 102 additions and 59 deletions

View file

@ -1021,12 +1021,20 @@ struct float24 {
return ret; return ret;
} }
/// Convenience factory: builds a float24 from the 32-bit float constant zero.
static float24 Zero() {
    return float24::FromFloat32(0.0f);
}
// Not recommended for anything but logging // Not recommended for anything but logging
float ToFloat32() const { float ToFloat32() const {
return value; return value;
} }
/**
 * Multiplies two float24 values.
 *
 * PICA gives 0 instead of NaN when multiplying zero by inf. The product is computed first and
 * only replaced by zero when it is NaN although neither operand is NaN; NaN operands still
 * propagate, and an ordinary `0 * x` keeps the sign produced by the multiplication.
 */
float24 operator * (const float24& flt) const {
    const float product = ToFloat32() * flt.ToFloat32();
    // A NaN product from two non-NaN operands can only come from zero times infinity.
    if (std::isnan(product) && !std::isnan(value) && !std::isnan(flt.value)) {
        return Zero();
    }
    return float24::FromFloat32(product);
}
@ -1043,7 +1051,11 @@ struct float24 {
} }
/**
 * In-place multiplication.
 *
 * PICA gives 0 instead of NaN when multiplying zero by inf, so a NaN product obtained from two
 * non-NaN operands is replaced by zero. The multiplication is performed exactly once.
 */
float24& operator *= (const float24& flt) {
    const float product = value * flt.ToFloat32();
    if (std::isnan(product) && !std::isnan(value) && !std::isnan(flt.value)) {
        // Zero times infinity: hardware yields zero rather than NaN.
        *this = Zero();
    } else {
        value = product;
    }
    return *this;
}

View file

@ -177,7 +177,10 @@ void RunInterpreter(UnitState<Debug>& state) {
if (!swizzle.DestComponentEnabled(i)) if (!swizzle.DestComponentEnabled(i))
continue; continue;
dest[i] = std::max(src1[i], src2[i]); // NOTE: Exact form required to match NaN semantics to hardware:
// max(0, NaN) -> NaN
// max(NaN, 0) -> 0
dest[i] = (src1[i] > src2[i]) ? src1[i] : src2[i];
} }
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break; break;
@ -190,7 +193,10 @@ void RunInterpreter(UnitState<Debug>& state) {
if (!swizzle.DestComponentEnabled(i)) if (!swizzle.DestComponentEnabled(i))
continue; continue;
dest[i] = std::min(src1[i], src2[i]); // NOTE: Exact form required to match NaN semantics to hardware:
// min(0, NaN) -> NaN
// min(NaN, 0) -> 0
dest[i] = (src1[i] < src2[i]) ? src1[i] : src2[i];
} }
Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest); Record<DebugDataRecord::DEST_OUT>(state.debug, iteration, dest);
break; break;

View file

@ -115,6 +115,8 @@ static const X64Reg SRC1 = XMM1;
static const X64Reg SRC2 = XMM2; static const X64Reg SRC2 = XMM2;
/// Loaded with the third swizzled source register, otherwise can be used as a scratch register /// Loaded with the third swizzled source register, otherwise can be used as a scratch register
static const X64Reg SRC3 = XMM3; static const X64Reg SRC3 = XMM3;
/// Additional scratch register
static const X64Reg SCRATCH2 = XMM4;
/// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one /// Constant vector of [1.0f, 1.0f, 1.0f, 1.0f], used to efficiently set a vector to one
static const X64Reg ONE = XMM14; static const X64Reg ONE = XMM14;
/// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR /// Constant vector of [-0.f, -0.f, -0.f, -0.f], used to efficiently negate a vector with XOR
@ -227,8 +229,8 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1); u8 mask = ((swiz.dest_mask & 1) << 3) | ((swiz.dest_mask & 8) >> 3) | ((swiz.dest_mask & 2) << 1) | ((swiz.dest_mask & 4) >> 1);
BLENDPS(SCRATCH, R(src), mask); BLENDPS(SCRATCH, R(src), mask);
} else { } else {
MOVAPS(XMM4, R(src)); MOVAPS(SCRATCH2, R(src));
UNPCKHPS(XMM4, R(SCRATCH)); // Unpack X/Y components of source and destination UNPCKHPS(SCRATCH2, R(SCRATCH)); // Unpack X/Y components of source and destination
UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination UNPCKLPS(SCRATCH, R(src)); // Unpack Z/W components of source and destination
// Compute selector to selectively copy source components to destination for SHUFPS instruction // Compute selector to selectively copy source components to destination for SHUFPS instruction
@ -236,7 +238,7 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) | ((swiz.DestComponentEnabled(1) ? 3 : 2) << 2) |
((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) | ((swiz.DestComponentEnabled(2) ? 0 : 1) << 4) |
((swiz.DestComponentEnabled(3) ? 2 : 3) << 6); ((swiz.DestComponentEnabled(3) ? 2 : 3) << 6);
SHUFPS(SCRATCH, R(XMM4), sel); SHUFPS(SCRATCH, R(SCRATCH2), sel);
} }
// Store dest back to memory // Store dest back to memory
@ -244,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
} }
} }
/**
 * Emits a packed multiply `src1 = src1 * src2` in which a NaN product that did not come from a
 * NaN input (i.e. zero times infinity) is replaced by zero. NaN inputs still produce NaN.
 * `src2` and `scratch` are overwritten; the three registers are expected to be distinct.
 */
void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
// scratch = per-lane mask, all ones where neither input is NaN (ordered compare)
MOVAPS(scratch, R(src1));
CMPPS(scratch, R(src2), CMP_ORD);
// src1 = raw product
MULPS(src1, R(src2));
// src2 = per-lane mask, all ones where the product is NaN (self-compare is unordered only for NaN)
MOVAPS(src2, R(src1));
CMPPS(src2, R(src2), CMP_UNORD);
// XOR of the two masks is zero only where the inputs were ordered but the product is NaN;
// it stays all ones for ordinary products and for NaN that came from a NaN input.
XORPS(scratch, R(src2));
// Clear the lanes selected above to zero and keep every other lane unchanged
ANDPS(src1, R(scratch));
}
void JitCompiler::Compile_EvaluateCondition(Instruction instr) { void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality // Note: NXOR is used below to check for equality
switch (instr.flow_control.op) { switch (instr.flow_control.op) {
@ -307,10 +322,7 @@ void JitCompiler::Compile_DP3(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
if (Common::GetCPUCaps().sse4_1) { Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
DPPS(SRC1, R(SRC2), 0x7f);
} else {
MULPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1)); SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
@ -321,7 +333,6 @@ void JitCompiler::Compile_DP3(Instruction instr) {
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0)); SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
ADDPS(SRC1, R(SRC2)); ADDPS(SRC1, R(SRC2));
ADDPS(SRC1, R(SRC3)); ADDPS(SRC1, R(SRC3));
}
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -330,10 +341,7 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
if (Common::GetCPUCaps().sse4_1) { Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
DPPS(SRC1, R(SRC2), 0xff);
} else {
MULPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
@ -342,7 +350,6 @@ void JitCompiler::Compile_DP4(Instruction instr) {
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2)); ADDPS(SRC1, R(SRC2));
}
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -359,14 +366,14 @@ void JitCompiler::Compile_DPH(Instruction instr) {
if (Common::GetCPUCaps().sse4_1) { if (Common::GetCPUCaps().sse4_1) {
// Set 4th component to 1.0 // Set 4th component to 1.0
BLENDPS(SRC1, R(ONE), 0x8); // 0b1000 BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
DPPS(SRC1, R(SRC2), 0xff);
} else { } else {
// Reverse to set the 4th component to 1.0 // Set 4th component to 1.0
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); MOVAPS(SCRATCH, R(SRC1));
MOVSS(SRC1, R(ONE)); UNPCKHPS(SCRATCH, R(ONE)); // XYZW, 1111 -> Z1__
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); UNPCKLPD(SRC1, R(SCRATCH)); // XYZW, Z1__ -> XYZ1
}
MULPS(SRC1, R(SRC2)); Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
@ -375,7 +382,6 @@ void JitCompiler::Compile_DPH(Instruction instr) {
MOVAPS(SRC2, R(SRC1)); MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2)); ADDPS(SRC1, R(SRC2));
}
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }
@ -415,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
/// Compiles the MUL instruction: dest = src1 * src2, component-wise.
void JitCompiler::Compile_MUL(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    // Multiply exactly once, through the sanitized helper, so that zero times infinity does not
    // produce NaN. SRC2 and SCRATCH are clobbered; the result is left in SRC1.
    Compile_SanitizedMul(SRC1, SRC2, SCRATCH);

    Compile_DestEnable(instr, SRC1);
}
@ -465,6 +471,7 @@ void JitCompiler::Compile_FLR(Instruction instr) {
/// Compiles the MAX instruction: dest = max(src1, src2), component-wise.
void JitCompiler::Compile_MAX(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
    // The operand order is therefore significant and must not be swapped.
    MAXPS(SRC1, R(SRC2));

    Compile_DestEnable(instr, SRC1);
}
@ -472,6 +479,7 @@ void JitCompiler::Compile_MAX(Instruction instr) {
/// Compiles the MIN instruction: dest = min(src1, src2), component-wise.
void JitCompiler::Compile_MIN(Instruction instr) {
    Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
    Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);

    // SSE semantics match PICA200 ones: In case of NaN, SRC2 is returned.
    // The operand order is therefore significant and must not be swapped.
    MINPS(SRC1, R(SRC2));

    Compile_DestEnable(instr, SRC1);
}
@ -578,27 +586,42 @@ void JitCompiler::Compile_CALLU(Instruction instr) {
} }
void JitCompiler::Compile_CMP(Instruction instr) { void JitCompiler::Compile_CMP(Instruction instr) {
using Op = Instruction::Common::CompareOpType::Op;
Op op_x = instr.common.compare_op.x;
Op op_y = instr.common.compare_op.y;
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1); Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2); Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_NLE, CMP_NLT }; // SSE doesn't have greater-than (GT) or greater-equal (GE) comparison operators. You need to
// emulate them by swapping the lhs and rhs and using LT and LE. NLT and NLE can't be used here
// because they don't match when used with NaNs.
static const u8 cmp[] = { CMP_EQ, CMP_NEQ, CMP_LT, CMP_LE, CMP_LT, CMP_LE };
if (instr.common.compare_op.x == instr.common.compare_op.y) { bool invert_op_x = (op_x == Op::GreaterThan || op_x == Op::GreaterEqual);
Gen::X64Reg lhs_x = invert_op_x ? SRC2 : SRC1;
Gen::X64Reg rhs_x = invert_op_x ? SRC1 : SRC2;
if (op_x == op_y) {
// Compare X-component and Y-component together // Compare X-component and Y-component together
CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.x]); CMPPS(lhs_x, R(rhs_x), cmp[op_x]);
MOVQ_xmm(R(COND0), lhs_x);
MOVQ_xmm(R(COND0), SRC1);
MOV(64, R(COND1), R(COND0)); MOV(64, R(COND1), R(COND0));
} else { } else {
bool invert_op_y = (op_y == Op::GreaterThan || op_y == Op::GreaterEqual);
Gen::X64Reg lhs_y = invert_op_y ? SRC2 : SRC1;
Gen::X64Reg rhs_y = invert_op_y ? SRC1 : SRC2;
// Compare X-component // Compare X-component
MOVAPS(SCRATCH, R(SRC1)); MOVAPS(SCRATCH, R(lhs_x));
CMPSS(SCRATCH, R(SRC2), cmp[instr.common.compare_op.x]); CMPSS(SCRATCH, R(rhs_x), cmp[op_x]);
// Compare Y-component // Compare Y-component
CMPPS(SRC1, R(SRC2), cmp[instr.common.compare_op.y]); CMPPS(lhs_y, R(rhs_y), cmp[op_y]);
MOVQ_xmm(R(COND0), SCRATCH); MOVQ_xmm(R(COND0), SCRATCH);
MOVQ_xmm(R(COND1), SRC1); MOVQ_xmm(R(COND1), lhs_y);
} }
SHR(32, R(COND0), Imm8(31)); SHR(32, R(COND0), Imm8(31));
@ -616,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3); Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
} }
if (Common::GetCPUCaps().fma) { Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
VFMADD213PS(SRC1, SRC2, R(SRC3));
} else {
MULPS(SRC1, R(SRC2));
ADDPS(SRC1, R(SRC3)); ADDPS(SRC1, R(SRC3));
}
Compile_DestEnable(instr, SRC1); Compile_DestEnable(instr, SRC1);
} }

View file

@ -68,6 +68,12 @@ private:
void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest); void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
void Compile_DestEnable(Instruction instr, Gen::X64Reg dest); void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
/**
 * Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
 * zero by inf (the result is zero instead of NaN). The product is left in `src1`.
 * Clobbers `src2` and `scratch`; all three registers are expected to be distinct.
 */
void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
void Compile_EvaluateCondition(Instruction instr); void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr); void Compile_UniformCondition(Instruction instr);