renderer_software: Multi-thread processing

* Rasterizes each scanline of a triangle as a separate job on a worker thread pool, which doubles the performance in most cases
This commit is contained in:
GPUCode 2023-07-16 03:02:55 +03:00
parent d702915624
commit 531d280461
2 changed files with 165 additions and 151 deletions

View file

@ -95,8 +95,14 @@ private:
} // Anonymous namespace } // Anonymous namespace
// Kirby Blowout Blast relies on the combiner output of a previous draw
// in order to render the sky correctly, so this value has static storage
// and persists across draws instead of being re-created per pixel.
//
// It is declared thread_local so that scanline jobs queued on the software
// renderer's worker threads each access their own copy rather than sharing one.
// NOTE(review): with per-thread storage, the "previous" value a scanline job
// observes is whatever that particular thread last wrote -- confirm this still
// satisfies the Kirby Blowout Blast case described above.
static thread_local Common::Vec4<u8> combiner_output{};
RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_) RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_)
: memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {} : memory{memory_}, state{Pica::g_state}, regs{state.regs},
num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)},
sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {}
void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0, void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0,
const Pica::Shader::OutputVertex& v1, const Pica::Shader::OutputVertex& v1,
@ -295,9 +301,10 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
// Enter rasterization loop, starting at the center of the topleft bounding box corner. // Enter rasterization loop, starting at the center of the topleft bounding box corner.
// TODO: Not sure if looping through x first might be faster // TODO: Not sure if looping through x first might be faster
for (u16 y = min_y + 8; y < max_y; y += 0x10) { for (u16 y = min_y + 8; y < max_y; y += 0x10) {
const auto process_scanline = [&, y] {
for (u16 x = min_x + 8; x < max_x; x += 0x10) { for (u16 x = min_x + 8; x < max_x; x += 0x10) {
// Do not process the pixel if it's inside the scissor box and the scissor mode is set // Do not process the pixel if it's inside the scissor box and the scissor mode is
// to Exclude. // set to Exclude.
if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) { if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) { if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
continue; continue;
@ -360,11 +367,13 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
* one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1) * one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
* u = u_over_w / one_over_w * u = u_over_w / one_over_w
* *
* The generalization to three vertices is straightforward in baricentric coordinates. * The generalization to three vertices is straightforward in baricentric
*coordinates.
**/ **/
const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) { const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
auto attr_over_w = Common::MakeVec(attr0, attr1, attr2); auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates); f24 interpolated_attr_over_w =
Common::Dot(attr_over_w, baricentric_coordinates);
return interpolated_attr_over_w * interpolated_w_inverse; return interpolated_attr_over_w * interpolated_w_inverse;
}; };
@ -405,9 +414,12 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
if (!regs.lighting.disable) { if (!regs.lighting.disable) {
const auto normquat = const auto normquat =
Common::Quaternion<f32>{ Common::Quaternion<f32>{
{get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(), {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x)
get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(), .ToFloat32(),
get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()}, get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y)
.ToFloat32(),
get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z)
.ToFloat32()},
get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(), get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
} }
.Normalized(); .Normalized();
@ -417,8 +429,9 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(), get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(), get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
}; };
std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors( std::tie(primary_fragment_color, secondary_fragment_color) =
regs.lighting, state.lighting, normquat, view, texture_color); ComputeFragmentsColors(regs.lighting, state.lighting, normquat, view,
texture_color);
} }
// Write the TEV stages. // Write the TEV stages.
@ -440,16 +453,19 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
if (!DoAlphaTest(combiner_output.a())) { if (!DoAlphaTest(combiner_output.a())) {
continue; continue;
} }
WriteFog(combiner_output, depth); WriteFog(depth);
if (!DoDepthStencilTest(x, y, depth)) { if (!DoDepthStencilTest(x, y, depth)) {
continue; continue;
} }
const auto result = PixelColor(x, y, combiner_output); const auto result = PixelColor(x, y);
if (regs.framebuffer.framebuffer.allow_color_write != 0) { if (regs.framebuffer.framebuffer.allow_color_write != 0) {
fb.DrawPixel(x >> 4, y >> 4, result); fb.DrawPixel(x >> 4, y >> 4, result);
} }
} }
};
sw_workers.QueueWork(std::move(process_scanline));
} }
sw_workers.WaitForRequests();
} }
std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor( std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
@ -572,8 +588,7 @@ std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
return texture_color; return texture_color;
} }
Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y, Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y) const {
Common::Vec4<u8>& combiner_output) const {
const auto dest = fb.GetPixel(x >> 4, y >> 4); const auto dest = fb.GetPixel(x >> 4, y >> 4);
Common::Vec4<u8> blend_output = combiner_output; Common::Vec4<u8> blend_output = combiner_output;
@ -768,7 +783,7 @@ void RasterizerSoftware::WriteTevConfig(
} }
} }
void RasterizerSoftware::WriteFog(Common::Vec4<u8>& combiner_output, float depth) const { void RasterizerSoftware::WriteFog(float depth) const {
/** /**
* Apply fog combiner. Not fully accurate. We'd have to know what data type is used to * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to
* store the depth etc. Using float for now until we know more about Pica datatypes. * store the depth etc. Using float for now until we know more about Pica datatypes.

View file

@ -5,7 +5,7 @@
#pragma once #pragma once
#include <span> #include <span>
#include "common/thread_worker.h"
#include "video_core/rasterizer_interface.h" #include "video_core/rasterizer_interface.h"
#include "video_core/regs_texturing.h" #include "video_core/regs_texturing.h"
#include "video_core/renderer_software/sw_clipper.h" #include "video_core/renderer_software/sw_clipper.h"
@ -52,7 +52,7 @@ private:
std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const; std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const;
/// Returns the final pixel color with blending or logic ops applied. /// Returns the final pixel color with blending or logic ops applied.
Common::Vec4<u8> PixelColor(u16 x, u16 y, Common::Vec4<u8>& combiner_output) const; Common::Vec4<u8> PixelColor(u16 x, u16 y) const;
/// Emulates the TEV configuration and returns the combiner output. /// Emulates the TEV configuration and returns the combiner output.
void WriteTevConfig(std::span<const Common::Vec4<u8>, 4> texture_color, void WriteTevConfig(std::span<const Common::Vec4<u8>, 4> texture_color,
@ -61,7 +61,7 @@ private:
Common::Vec4<u8> secondary_fragment_color); Common::Vec4<u8> secondary_fragment_color);
/// Blends fog to the combiner output if enabled. /// Blends fog to the combiner output if enabled.
void WriteFog(Common::Vec4<u8>& combiner_output, float depth) const; void WriteFog(float depth) const;
/// Performs the alpha test. Returns false if the test failed. /// Performs the alpha test. Returns false if the test failed.
bool DoAlphaTest(u8 alpha) const; bool DoAlphaTest(u8 alpha) const;
@ -73,10 +73,9 @@ private:
Memory::MemorySystem& memory; Memory::MemorySystem& memory;
Pica::State& state; Pica::State& state;
const Pica::Regs& regs; const Pica::Regs& regs;
size_t num_sw_threads;
Common::ThreadWorker sw_workers;
Framebuffer fb; Framebuffer fb;
// Kirby Blowout Blast relies on the combiner output of a previous draw
// in order to render the sky correctly.
Common::Vec4<u8> combiner_output{};
}; };
} // namespace SwRenderer } // namespace SwRenderer