Pica/VertexShader: Fix a bug caused due to incorrect assumptions of consecutive output register tables.

We now write create a temporary buffer for output registers and copy all of them to the actual output vertex structure after the shader has run. This is technically not necessary, but it's easier to vectorize in the future.
This commit is contained in:
Tony Wasserka 2015-03-12 14:18:46 +01:00
parent ed5b275d21
commit e4f5ec6272

View file

@ -72,7 +72,7 @@ struct VertexShaderState {
u32* program_counter; u32* program_counter;
const float24* input_register_table[16]; const float24* input_register_table[16];
float24* output_register_table[7*4]; Math::Vec4<float24> output_registers[16];
Math::Vec4<float24> temporary_registers[16]; Math::Vec4<float24> temporary_registers[16];
bool conditional_code[2]; bool conditional_code[2];
@ -198,8 +198,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
src2[3] = src2[3] * float24::FromFloat32(-1); src2[3] = src2[3] * float24::FromFloat32(-1);
} }
float24* dest = (instr.common.dest.Value() < 0x08) ? state.output_register_table[4*instr.common.dest.Value().GetIndex()] float24* dest = (instr.common.dest.Value() < 0x10) ? &state.output_registers[instr.common.dest.Value().GetIndex()][0]
: (instr.common.dest.Value() < 0x10) ? dummy_vec4_float24
: (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0] : (instr.common.dest.Value() < 0x20) ? &state.temporary_registers[instr.common.dest.Value().GetIndex()][0]
: dummy_vec4_float24; : dummy_vec4_float24;
@ -409,8 +408,7 @@ static void ProcessShaderCode(VertexShaderState& state) {
src3[3] = src3[3] * float24::FromFloat32(-1); src3[3] = src3[3] * float24::FromFloat32(-1);
} }
float24* dest = (instr.mad.dest.Value() < 0x08) ? state.output_register_table[4*instr.mad.dest.Value().GetIndex()] float24* dest = (instr.mad.dest.Value() < 0x10) ? &state.output_registers[instr.mad.dest.Value().GetIndex()][0]
: (instr.mad.dest.Value() < 0x10) ? dummy_vec4_float24
: (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0] : (instr.mad.dest.Value() < 0x20) ? &state.temporary_registers[instr.mad.dest.Value().GetIndex()][0]
: dummy_vec4_float24; : dummy_vec4_float24;
@ -587,12 +585,18 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x; if(num_attributes > 14) state.input_register_table[attribute_register_map.attribute14_register] = &input.attr[14].x;
if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x; if(num_attributes > 15) state.input_register_table[attribute_register_map.attribute15_register] = &input.attr[15].x;
// Setup output register table state.conditional_code[0] = false;
OutputVertex ret; state.conditional_code[1] = false;
// Zero output so that attributes which aren't output won't have denormals in them, which will
// slow us down later.
memset(&ret, 0, sizeof(ret));
ProcessShaderCode(state);
DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
state.debug.max_opdesc_id, registers.vs_main_offset,
registers.vs_output_attributes);
// Setup output data
OutputVertex ret;
// TODO(neobrain): Under some circumstances, up to 16 attributes may be output. We need to
// figure out what those circumstances are and enable the remaining outputs then.
for (int i = 0; i < 7; ++i) { for (int i = 0; i < 7; ++i) {
const auto& output_register_map = registers.vs_output_attributes[i]; const auto& output_register_map = registers.vs_output_attributes[i];
@ -601,18 +605,18 @@ OutputVertex RunShader(const InputVertex& input, int num_attributes) {
output_register_map.map_z, output_register_map.map_w output_register_map.map_z, output_register_map.map_w
}; };
for (int comp = 0; comp < 4; ++comp) for (int comp = 0; comp < 4; ++comp) {
state.output_register_table[4*i+comp] = ((float24*)&ret) + semantics[comp]; float24* out = ((float24*)&ret) + semantics[comp];
if (semantics[comp] != Regs::VSOutputAttributes::INVALID) {
*out = state.output_registers[i][comp];
} else {
// Zero output so that attributes which aren't output won't have denormals in them,
// which would slow us down later.
memset(out, 0, sizeof(*out));
}
}
} }
state.conditional_code[0] = false;
state.conditional_code[1] = false;
ProcessShaderCode(state);
DebugUtils::DumpShader(shader_memory.data(), state.debug.max_offset, swizzle_data.data(),
state.debug.max_opdesc_id, registers.vs_main_offset,
registers.vs_output_attributes);
LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)", LOG_TRACE(Render_Software, "Output vertex: pos (%.2f, %.2f, %.2f, %.2f), col(%.2f, %.2f, %.2f, %.2f), tc0(%.2f, %.2f)",
ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(), ret.pos.x.ToFloat32(), ret.pos.y.ToFloat32(), ret.pos.z.ToFloat32(), ret.pos.w.ToFloat32(),
ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(), ret.color.x.ToFloat32(), ret.color.y.ToFloat32(), ret.color.z.ToFloat32(), ret.color.w.ToFloat32(),