From 531d2804619f3456c141976ed5bbe841492a3dc5 Mon Sep 17 00:00:00 2001
From: GPUCode <geoster3d@gmail.com>
Date: Sun, 16 Jul 2023 03:02:55 +0300
Subject: [PATCH] renderer_software: Multi-thread processing

* Doubles the performance in most cases
---
 .../renderer_software/sw_rasterizer.cpp       | 305 +++++++++---------
 .../renderer_software/sw_rasterizer.h         |  11 +-
 2 files changed, 165 insertions(+), 151 deletions(-)
diff --git a/src/video_core/renderer_software/sw_rasterizer.cpp b/src/video_core/renderer_software/sw_rasterizer.cpp
index 52d172e6e..62bdc7f1d 100644
--- a/src/video_core/renderer_software/sw_rasterizer.cpp
+++ b/src/video_core/renderer_software/sw_rasterizer.cpp
@@ -95,8 +95,14 @@ private:
 
 } // Anonymous namespace
 
+// Kirby Blowout Blast relies on the combiner output of a previous draw in order to render
+// the sky correctly. NOTE(review): thread_local keeps one copy per thread - confirm this holds.
+static thread_local Common::Vec4<u8> combiner_output{};
+
 RasterizerSoftware::RasterizerSoftware(Memory::MemorySystem& memory_)
-    : memory{memory_}, state{Pica::g_state}, regs{state.regs}, fb{memory, regs.framebuffer} {}
+    : memory{memory_}, state{Pica::g_state}, regs{state.regs},
+      num_sw_threads{std::max(std::thread::hardware_concurrency(), 2U)},
+      sw_workers{num_sw_threads, "SwRenderer workers"}, fb{memory, regs.framebuffer} {}
 
 void RasterizerSoftware::AddTriangle(const Pica::Shader::OutputVertex& v0,
                                      const Pica::Shader::OutputVertex& v1,
@@ -295,161 +301,171 @@ void RasterizerSoftware::ProcessTriangle(const Vertex& v0, const Vertex& v1, con
     // Enter rasterization loop, starting at the center of the topleft bounding box corner.
     // TODO: Not sure if looping through x first might be faster
     for (u16 y = min_y + 8; y < max_y; y += 0x10) {
-        for (u16 x = min_x + 8; x < max_x; x += 0x10) {
-            // Do not process the pixel if it's inside the scissor box and the scissor mode is set
-            // to Exclude.
-            if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
-                if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
+        const auto process_scanline = [&, y] {
+            for (u16 x = min_x + 8; x < max_x; x += 0x10) {
+                // Do not process the pixel if it's inside the scissor box and the scissor mode is
+                // set to Exclude.
+                if (regs.rasterizer.scissor_test.mode == RasterizerRegs::ScissorMode::Exclude) {
+                    if (x >= scissor_x1 && x < scissor_x2 && y >= scissor_y1 && y < scissor_y2) {
+                        continue;
+                    }
+                }
+
+                // Calculate the barycentric coordinates w0, w1 and w2
+                const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
+                const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
+                const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
+                const s32 wsum = w0 + w1 + w2;
+
+                // If current pixel is not covered by the current primitive
+                if (w0 < 0 || w1 < 0 || w2 < 0) {
                     continue;
                 }
-            }
 
-            // Calculate the barycentric coordinates w0, w1 and w2
-            const s32 w0 = bias0 + SignedArea(vtxpos[1].xy(), vtxpos[2].xy(), {x, y});
-            const s32 w1 = bias1 + SignedArea(vtxpos[2].xy(), vtxpos[0].xy(), {x, y});
-            const s32 w2 = bias2 + SignedArea(vtxpos[0].xy(), vtxpos[1].xy(), {x, y});
-            const s32 wsum = w0 + w1 + w2;
+                const auto baricentric_coordinates = Common::MakeVec(
+                    f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)),
+                    f24::FromFloat32(static_cast<f32>(w2)));
+                const f24 interpolated_w_inverse =
+                    f24::One() / Common::Dot(w_inverse, baricentric_coordinates);
 
-            // If current pixel is not covered by the current primitive
-            if (w0 < 0 || w1 < 0 || w2 < 0) {
-                continue;
-            }
+                // interpolated_z = z / w
+                const float interpolated_z_over_w =
+                    (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 +
+                     v2.screenpos[2].ToFloat32() * w2) /
+                    wsum;
 
-            const auto baricentric_coordinates = Common::MakeVec(
-                f24::FromFloat32(static_cast<f32>(w0)), f24::FromFloat32(static_cast<f32>(w1)),
-                f24::FromFloat32(static_cast<f32>(w2)));
-            const f24 interpolated_w_inverse =
-                f24::One() / Common::Dot(w_inverse, baricentric_coordinates);
+                // Not fully accurate. About 3 bits in precision are missing.
+                // Z-Buffer (z / w * scale + offset)
+                const float depth_scale =
+                    f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
+                const float depth_offset =
+                    f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
+                float depth = interpolated_z_over_w * depth_scale + depth_offset;
 
-            // interpolated_z = z / w
-            const float interpolated_z_over_w =
-                (v0.screenpos[2].ToFloat32() * w0 + v1.screenpos[2].ToFloat32() * w1 +
-                 v2.screenpos[2].ToFloat32() * w2) /
-                wsum;
+                // Potentially switch to W-Buffer
+                if (regs.rasterizer.depthmap_enable ==
+                    Pica::RasterizerRegs::DepthBuffering::WBuffering) {
+                    // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
+                    depth *= interpolated_w_inverse.ToFloat32() * wsum;
+                }
 
-            // Not fully accurate. About 3 bits in precision are missing.
-            // Z-Buffer (z / w * scale + offset)
-            const float depth_scale =
-                f24::FromRaw(regs.rasterizer.viewport_depth_range).ToFloat32();
-            const float depth_offset =
-                f24::FromRaw(regs.rasterizer.viewport_depth_near_plane).ToFloat32();
-            float depth = interpolated_z_over_w * depth_scale + depth_offset;
+                // Clamp the result
+                depth = std::clamp(depth, 0.0f, 1.0f);
 
-            // Potentially switch to W-Buffer
-            if (regs.rasterizer.depthmap_enable ==
-                Pica::RasterizerRegs::DepthBuffering::WBuffering) {
-                // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
-                depth *= interpolated_w_inverse.ToFloat32() * wsum;
-            }
-
-            // Clamp the result
-            depth = std::clamp(depth, 0.0f, 1.0f);
-
-            /**
-             * Perspective correct attribute interpolation:
-             * Attribute values cannot be calculated by simple linear interpolation since
-             * they are not linear in screen space. For example, when interpolating a
-             * texture coordinate across two vertices, something simple like
-             *     u = (u0*w0 + u1*w1)/(w0+w1)
-             * will not work. However, the attribute value divided by the
-             * clipspace w-coordinate (u/w) and and the inverse w-coordinate (1/w) are linear
-             * in screenspace. Hence, we can linearly interpolate these two independently and
-             * calculate the interpolated attribute by dividing the results.
-             * I.e.
-             *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
-             *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
-             *     u = u_over_w / one_over_w
-             *
-             * The generalization to three vertices is straightforward in baricentric coordinates.
-             **/
-            const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
-                auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
-                f24 interpolated_attr_over_w = Common::Dot(attr_over_w, baricentric_coordinates);
-                return interpolated_attr_over_w * interpolated_w_inverse;
-            };
-
-            const Common::Vec4<u8> primary_color{
-                static_cast<u8>(
-                    round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
-                              .ToFloat32() *
-                          255)),
-                static_cast<u8>(
-                    round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
-                              .ToFloat32() *
-                          255)),
-                static_cast<u8>(
-                    round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
-                              .ToFloat32() *
-                          255)),
-                static_cast<u8>(
-                    round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
-                              .ToFloat32() *
-                          255)),
-            };
-
-            std::array<Common::Vec2<f24>, 3> uv;
-            uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
-            uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
-            uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
-            uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
-            uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
-            uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());
-
-            // Sample bound texture units.
-            const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
-            const auto texture_color = TextureColor(uv, textures, tc0_w);
-
-            Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
-            Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
-
-            if (!regs.lighting.disable) {
-                const auto normquat =
-                    Common::Quaternion<f32>{
-                        {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x).ToFloat32(),
-                         get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y).ToFloat32(),
-                         get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z).ToFloat32()},
-                        get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
-                    }
-                        .Normalized();
-
-                const Common::Vec3f view{
-                    get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
-                    get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
-                    get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
+                /**
+                 * Perspective correct attribute interpolation:
+                 * Attribute values cannot be calculated by simple linear interpolation since
+                 * they are not linear in screen space. For example, when interpolating a
+                 * texture coordinate across two vertices, something simple like
+                 *     u = (u0*w0 + u1*w1)/(w0+w1)
+                 * will not work. However, the attribute value divided by the
+                 * clipspace w-coordinate (u/w) and the inverse w-coordinate (1/w) are linear
+                 * in screenspace. Hence, we can linearly interpolate these two independently and
+                 * calculate the interpolated attribute by dividing the results.
+                 * I.e.
+                 *     u_over_w   = ((u0/v0.pos.w)*w0 + (u1/v1.pos.w)*w1)/(w0+w1)
+                 *     one_over_w = (( 1/v0.pos.w)*w0 + ( 1/v1.pos.w)*w1)/(w0+w1)
+                 *     u = u_over_w / one_over_w
+                 *
+                 * The generalization to three vertices is straightforward in barycentric
+                 * coordinates.
+                 **/
+                const auto get_interpolated_attribute = [&](f24 attr0, f24 attr1, f24 attr2) {
+                    auto attr_over_w = Common::MakeVec(attr0, attr1, attr2);
+                    f24 interpolated_attr_over_w =
+                        Common::Dot(attr_over_w, baricentric_coordinates);
+                    return interpolated_attr_over_w * interpolated_w_inverse;
                 };
-                std::tie(primary_fragment_color, secondary_fragment_color) = ComputeFragmentsColors(
-                    regs.lighting, state.lighting, normquat, view, texture_color);
-            }
 
-            // Write the TEV stages.
-            WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color,
-                           secondary_fragment_color);
+                const Common::Vec4<u8> primary_color{
+                    static_cast<u8>(
+                        round(get_interpolated_attribute(v0.color.r(), v1.color.r(), v2.color.r())
+                                  .ToFloat32() *
+                              255)),
+                    static_cast<u8>(
+                        round(get_interpolated_attribute(v0.color.g(), v1.color.g(), v2.color.g())
+                                  .ToFloat32() *
+                              255)),
+                    static_cast<u8>(
+                        round(get_interpolated_attribute(v0.color.b(), v1.color.b(), v2.color.b())
+                                  .ToFloat32() *
+                              255)),
+                    static_cast<u8>(
+                        round(get_interpolated_attribute(v0.color.a(), v1.color.a(), v2.color.a())
+                                  .ToFloat32() *
+                              255)),
+                };
 
-            const auto& output_merger = regs.framebuffer.output_merger;
-            if (output_merger.fragment_operation_mode ==
-                FramebufferRegs::FragmentOperationMode::Shadow) {
-                u32 depth_int = static_cast<u32>(depth * 0xFFFFFF);
-                // Use green color as the shadow intensity
-                u8 stencil = combiner_output.y;
-                fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil);
-                // Skip the normal output merger pipeline if it is in shadow mode
-                continue;
-            }
+                std::array<Common::Vec2<f24>, 3> uv;
+                uv[0].u() = get_interpolated_attribute(v0.tc0.u(), v1.tc0.u(), v2.tc0.u());
+                uv[0].v() = get_interpolated_attribute(v0.tc0.v(), v1.tc0.v(), v2.tc0.v());
+                uv[1].u() = get_interpolated_attribute(v0.tc1.u(), v1.tc1.u(), v2.tc1.u());
+                uv[1].v() = get_interpolated_attribute(v0.tc1.v(), v1.tc1.v(), v2.tc1.v());
+                uv[2].u() = get_interpolated_attribute(v0.tc2.u(), v1.tc2.u(), v2.tc2.u());
+                uv[2].v() = get_interpolated_attribute(v0.tc2.v(), v1.tc2.v(), v2.tc2.v());
 
-            // Does alpha testing happen before or after stencil?
-            if (!DoAlphaTest(combiner_output.a())) {
-                continue;
+                // Sample bound texture units.
+                const f24 tc0_w = get_interpolated_attribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
+                const auto texture_color = TextureColor(uv, textures, tc0_w);
+
+                Common::Vec4<u8> primary_fragment_color = {0, 0, 0, 0};
+                Common::Vec4<u8> secondary_fragment_color = {0, 0, 0, 0};
+
+                if (!regs.lighting.disable) {
+                    const auto normquat =
+                        Common::Quaternion<f32>{
+                            {get_interpolated_attribute(v0.quat.x, v1.quat.x, v2.quat.x)
+                                 .ToFloat32(),
+                             get_interpolated_attribute(v0.quat.y, v1.quat.y, v2.quat.y)
+                                 .ToFloat32(),
+                             get_interpolated_attribute(v0.quat.z, v1.quat.z, v2.quat.z)
+                                 .ToFloat32()},
+                            get_interpolated_attribute(v0.quat.w, v1.quat.w, v2.quat.w).ToFloat32(),
+                        }
+                            .Normalized();
+
+                    const Common::Vec3f view{
+                        get_interpolated_attribute(v0.view.x, v1.view.x, v2.view.x).ToFloat32(),
+                        get_interpolated_attribute(v0.view.y, v1.view.y, v2.view.y).ToFloat32(),
+                        get_interpolated_attribute(v0.view.z, v1.view.z, v2.view.z).ToFloat32(),
+                    };
+                    std::tie(primary_fragment_color, secondary_fragment_color) =
+                        ComputeFragmentsColors(regs.lighting, state.lighting, normquat, view,
+                                               texture_color);
+                }
+
+                // Write the TEV stages.
+                WriteTevConfig(texture_color, tev_stages, primary_color, primary_fragment_color,
+                               secondary_fragment_color);
+
+                const auto& output_merger = regs.framebuffer.output_merger;
+                if (output_merger.fragment_operation_mode ==
+                    FramebufferRegs::FragmentOperationMode::Shadow) {
+                    u32 depth_int = static_cast<u32>(depth * 0xFFFFFF);
+                    // Use green color as the shadow intensity
+                    u8 stencil = combiner_output.y;
+                    fb.DrawShadowMapPixel(x >> 4, y >> 4, depth_int, stencil);
+                    // Skip the normal output merger pipeline if it is in shadow mode
+                    continue;
+                }
+
+                // Does alpha testing happen before or after stencil?
+                if (!DoAlphaTest(combiner_output.a())) {
+                    continue;
+                }
+                WriteFog(depth);
+                if (!DoDepthStencilTest(x, y, depth)) {
+                    continue;
+                }
+                const auto result = PixelColor(x, y);
+                if (regs.framebuffer.framebuffer.allow_color_write != 0) {
+                    fb.DrawPixel(x >> 4, y >> 4, result);
+                }
             }
-            WriteFog(combiner_output, depth);
-            if (!DoDepthStencilTest(x, y, depth)) {
-                continue;
-            }
-            const auto result = PixelColor(x, y, combiner_output);
-            if (regs.framebuffer.framebuffer.allow_color_write != 0) {
-                fb.DrawPixel(x >> 4, y >> 4, result);
-            }
-        }
+        };
+        sw_workers.QueueWork(std::move(process_scanline));
     }
+    sw_workers.WaitForRequests();
 }
 
 std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
@@ -572,8 +588,7 @@ std::array<Common::Vec4<u8>, 4> RasterizerSoftware::TextureColor(
     return texture_color;
 }
 
-Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y,
-                                                Common::Vec4<u8>& combiner_output) const {
+Common::Vec4<u8> RasterizerSoftware::PixelColor(u16 x, u16 y) const {
     const auto dest = fb.GetPixel(x >> 4, y >> 4);
     Common::Vec4<u8> blend_output = combiner_output;
 
@@ -768,7 +783,7 @@ void RasterizerSoftware::WriteTevConfig(
     }
 }
 
-void RasterizerSoftware::WriteFog(Common::Vec4<u8>& combiner_output, float depth) const {
+void RasterizerSoftware::WriteFog(float depth) const {
     /**
      * Apply fog combiner. Not fully accurate. We'd have to know what data type is used to
      * store the depth etc. Using float for now until we know more about Pica datatypes.
diff --git a/src/video_core/renderer_software/sw_rasterizer.h b/src/video_core/renderer_software/sw_rasterizer.h
index 919d862fc..28b68263d 100644
--- a/src/video_core/renderer_software/sw_rasterizer.h
+++ b/src/video_core/renderer_software/sw_rasterizer.h
@@ -5,7 +5,7 @@
 #pragma once
 
 #include <span>
-
+#include "common/thread_worker.h"
 #include "video_core/rasterizer_interface.h"
 #include "video_core/regs_texturing.h"
 #include "video_core/renderer_software/sw_clipper.h"
@@ -52,7 +52,7 @@ private:
         std::span<const Pica::TexturingRegs::FullTextureConfig, 3> textures, f24 tc0_w) const;
 
     /// Returns the final pixel color with blending or logic ops applied.
-    Common::Vec4<u8> PixelColor(u16 x, u16 y, Common::Vec4<u8>& combiner_output) const;
+    Common::Vec4<u8> PixelColor(u16 x, u16 y) const;
 
     /// Emulates the TEV configuration and returns the combiner output.
     void WriteTevConfig(std::span<const Common::Vec4<u8>, 4> texture_color,
@@ -61,7 +61,7 @@ private:
                         Common::Vec4<u8> secondary_fragment_color);
 
     /// Blends fog to the combiner output if enabled.
-    void WriteFog(Common::Vec4<u8>& combiner_output, float depth) const;
+    void WriteFog(float depth) const;
 
     /// Performs the alpha test. Returns false if the test failed.
     bool DoAlphaTest(u8 alpha) const;
@@ -73,10 +73,9 @@ private:
     Memory::MemorySystem& memory;
     Pica::State& state;
     const Pica::Regs& regs;
+    size_t num_sw_threads;
+    Common::ThreadWorker sw_workers;
     Framebuffer fb;
-    // Kirby Blowout Blast relies on the combiner output of a previous draw
-    // in order to render the sky correctly.
-    Common::Vec4<u8> combiner_output{};
 };
 
 } // namespace SwRenderer