Shaders: Fix multiplications between 0.0 and inf

The PICA200 semantics for multiplication are such that multiplying inf
by exactly 0.0 yields 0.0, rather than the NaN that IEEE 754 specifies.
Games rely on this behavior.

Fixes #1024 (missing OoT interface items)
This commit is contained in:
Yuri Kunde Schlesner 2015-08-24 01:48:15 -03:00
parent 082b74fa24
commit 630a850d4d
3 changed files with 60 additions and 42 deletions

View file

@ -1021,12 +1021,20 @@ struct float24 {
return ret;
}
/// Convenience constructor: the float24 produced by converting the literal 0.f.
static float24 Zero() {
    return float24::FromFloat32(0.f);
}
// Not recommended for anything but logging
float ToFloat32() const {
return value;
}
/// Multiplies two float24 values using PICA multiplication semantics:
/// zero times any non-NaN operand (including inf) yields zero rather than NaN.
float24 operator * (const float24& flt) const {
    // A value compares unequal to itself only when it is NaN.
    const bool lhs_is_nan = this->value != this->value;
    const bool rhs_is_nan = flt.value != flt.value;
    const bool lhs_is_zero = this->value == 0.f;
    const bool rhs_is_zero = flt.value == 0.f;

    // PICA gives 0 instead of NaN when multiplying by inf
    if ((lhs_is_zero && !rhs_is_nan) || (rhs_is_zero && !lhs_is_nan)) {
        return Zero();
    }

    return float24::FromFloat32(ToFloat32() * flt.ToFloat32());
}
@ -1043,7 +1051,11 @@ struct float24 {
}
// In-place multiplication by `flt`; returns a reference to this object.
// NOTE(review): this listing appears to be a rendered diff with the +/- markers stripped.
// The unconditional multiply on the first body line is presumably the removed pre-change
// statement; as literally written it runs before the zero check and the `else` branch
// multiplies again. Confirm against the actual file before relying on this text.
float24& operator *= (const float24& flt) {
value *= flt.ToFloat32();
// `x == x` is false only for NaN, so each clause reads
// "one operand is exactly zero and the other operand is not NaN".
if ((this->value == 0.f && flt.value == flt.value) ||
(flt.value == 0.f && this->value == this->value))
// PICA gives 0 instead of NaN when multiplying by inf
*this = Zero();
else value *= flt.ToFloat32();
return *this;
}

View file

@ -246,6 +246,19 @@ void JitCompiler::Compile_DestEnable(Instruction instr,X64Reg src) {
}
}
// Emits a packed multiply of src1 by src2 that leaves the product in src1, forcing to zero
// any lane whose product is NaN although neither input lane was NaN (the 0 * inf case).
// NaNs that were already present in an input are left untouched. src2 and scratch are
// overwritten with comparison masks along the way.
void JitCompiler::Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch) {
// scratch = per-lane mask: all ones where both inputs are ordered (neither is NaN)
MOVAPS(scratch, R(src1));
CMPPS(scratch, R(src2), CMP_ORD);
// src1 = plain IEEE product
MULPS(src1, R(src2));
// src2 = per-lane mask: all ones where the product is NaN (unordered with itself)
MOVAPS(src2, R(src1));
CMPPS(src2, R(src2), CMP_UNORD);
// scratch = inputs-ordered XOR product-is-NaN. This is zero only when the inputs were
// ordered but the product is NaN, i.e. the multiply itself generated the NaN.
XORPS(scratch, R(src2));
// Clear exactly those lanes to +0.0; every other lane keeps its product bits.
ANDPS(src1, R(scratch));
}
void JitCompiler::Compile_EvaluateCondition(Instruction instr) {
// Note: NXOR is used below to check for equality
switch (instr.flow_control.op) {
@ -309,21 +322,17 @@ void JitCompiler::Compile_DP3(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
if (Common::GetCPUCaps().sse4_1) {
DPPS(SRC1, R(SRC2), 0x7f);
} else {
MULPS(SRC1, R(SRC2));
Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC2, R(SRC2), _MM_SHUFFLE(1, 1, 1, 1));
MOVAPS(SRC3, R(SRC1));
SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
MOVAPS(SRC3, R(SRC1));
SHUFPS(SRC3, R(SRC3), _MM_SHUFFLE(2, 2, 2, 2));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
ADDPS(SRC1, R(SRC2));
ADDPS(SRC1, R(SRC3));
}
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 0, 0, 0));
ADDPS(SRC1, R(SRC2));
ADDPS(SRC1, R(SRC3));
Compile_DestEnable(instr, SRC1);
}
@ -332,19 +341,15 @@ void JitCompiler::Compile_DP4(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
if (Common::GetCPUCaps().sse4_1) {
DPPS(SRC1, R(SRC2), 0xff);
} else {
MULPS(SRC1, R(SRC2));
Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
ADDPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
ADDPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2));
}
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@ -361,24 +366,23 @@ void JitCompiler::Compile_DPH(Instruction instr) {
if (Common::GetCPUCaps().sse4_1) {
// Set 4th component to 1.0
BLENDPS(SRC1, R(ONE), 0x8); // 0b1000
DPPS(SRC1, R(SRC2), 0xff);
} else {
// Reverse to set the 4th component to 1.0
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
MOVSS(SRC1, R(ONE));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3));
MULPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
ADDPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2));
}
Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(2, 3, 0, 1)); // XYZW -> ZWXY
ADDPS(SRC1, R(SRC2));
MOVAPS(SRC2, R(SRC1));
SHUFPS(SRC1, R(SRC1), _MM_SHUFFLE(0, 1, 2, 3)); // XYZW -> WZYX
ADDPS(SRC1, R(SRC2));
Compile_DestEnable(instr, SRC1);
}
@ -417,7 +421,7 @@ void JitCompiler::Compile_LG2(Instruction instr) {
// Compiles a MUL instruction: loads both swizzled sources into SRC1/SRC2, multiplies them,
// and writes SRC1 out through Compile_DestEnable.
// NOTE(review): this listing appears to be a rendered diff with the +/- markers stripped.
// The bare MULPS below is presumably the removed pre-change line and Compile_SanitizedMul
// its replacement; as literally written the multiply would be applied twice. Confirm
// against the actual file.
void JitCompiler::Compile_MUL(Instruction instr) {
Compile_SwizzleSrc(instr, 1, instr.common.src1, SRC1);
Compile_SwizzleSrc(instr, 2, instr.common.src2, SRC2);
MULPS(SRC1, R(SRC2));
// Multiply with PICA semantics (0 * inf yields 0 instead of NaN); clobbers SRC2 and SCRATCH
Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
Compile_DestEnable(instr, SRC1);
}
@ -635,12 +639,8 @@ void JitCompiler::Compile_MAD(Instruction instr) {
Compile_SwizzleSrc(instr, 3, instr.mad.src3, SRC3);
}
if (Common::GetCPUCaps().fma) {
VFMADD213PS(SRC1, SRC2, R(SRC3));
} else {
MULPS(SRC1, R(SRC2));
ADDPS(SRC1, R(SRC3));
}
Compile_SanitizedMul(SRC1, SRC2, SCRATCH);
ADDPS(SRC1, R(SRC3));
Compile_DestEnable(instr, SRC1);
}

View file

@ -68,6 +68,12 @@ private:
void Compile_SwizzleSrc(Instruction instr, unsigned src_num, SourceRegister src_reg, Gen::X64Reg dest);
void Compile_DestEnable(Instruction instr, Gen::X64Reg dest);
/**
* Compiles a `MUL src1, src2` operation, properly handling the PICA semantics when multiplying
* zero by inf. Clobbers `src2` and `scratch`.
*
* @param src1 Register holding the first operand; the sanitized product is left here.
* @param src2 Register holding the second operand; overwritten during the sequence.
* @param scratch Temporary register used to hold a comparison mask; overwritten.
*/
void Compile_SanitizedMul(Gen::X64Reg src1, Gen::X64Reg src2, Gen::X64Reg scratch);
void Compile_EvaluateCondition(Instruction instr);
void Compile_UniformCondition(Instruction instr);