Merge pull request #3741 from wwylele/glvtx-last

gl_rasterizer: implement AccelerateDrawBatch to emulate PICA shader on hardware
2018-05-18 13:03:40 -06:00 · 2018-05-18 13:03:40 -06:00 · 64a8c8ee07
commit 64a8c8ee07
parent 7861be67bb 129b893509
15 changed files with 492 additions and 44 deletions
--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@ -95,6 +95,11 @@ void Config::ReadValues() {
    // Renderer
    Settings::values.use_hw_renderer = sdl2_config->GetBoolean("Renderer", "use_hw_renderer", true);
    Settings::values.use_hw_shader = sdl2_config->GetBoolean("Renderer", "use_hw_shader", true);
    Settings::values.shaders_accurate_gs =
        sdl2_config->GetBoolean("Renderer", "shaders_accurate_gs", true);
    Settings::values.shaders_accurate_mul =
        sdl2_config->GetBoolean("Renderer", "shaders_accurate_mul", false);
    Settings::values.use_shader_jit = sdl2_config->GetBoolean("Renderer", "use_shader_jit", true);
    Settings::values.resolution_factor =
        static_cast<u16>(sdl2_config->GetInteger("Renderer", "resolution_factor", 1));
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@ -77,6 +77,18 @@ use_cpu_jit =
 # 0: Software, 1 (default): Hardware
 use_hw_renderer =
 # Whether to use hardware shaders to emulate 3DS shaders
 # 0: Software, 1 (default): Hardware
 use_hw_shader =
 # Whether to use accurate multiplication in hardware shaders
 # 0: Off (Default. Faster, but causes issues in some games) 1: On (Slower, but correct)
 shaders_accurate_mul =
 # Whether to fallback to software for geometry shaders
 # 0: Off (Faster, but causes issues in some games) 1: On (Default. Slower, but correct)
 shaders_accurate_gs =
 # Whether to use the Just-In-Time (JIT) compiler for shader emulation
 # 0: Interpreter (slow), 1 (default): JIT (fast)
 use_shader_jit =
--- a/src/citra_qt/configuration/config.cpp
+++ b/src/citra_qt/configuration/config.cpp
@ -83,6 +83,10 @@ void Config::ReadValues() {
    qt_config->beginGroup("Renderer");
    Settings::values.use_hw_renderer = qt_config->value("use_hw_renderer", true).toBool();
    Settings::values.use_hw_shader = qt_config->value("use_hw_shader", true).toBool();
    Settings::values.shaders_accurate_gs = qt_config->value("shaders_accurate_gs", true).toBool();
    Settings::values.shaders_accurate_mul =
        qt_config->value("shaders_accurate_mul", false).toBool();
    Settings::values.use_shader_jit = qt_config->value("use_shader_jit", true).toBool();
    Settings::values.resolution_factor =
        static_cast<u16>(qt_config->value("resolution_factor", 1).toInt());
@ -272,6 +276,9 @@ void Config::SaveValues() {
    qt_config->beginGroup("Renderer");
    qt_config->setValue("use_hw_renderer", Settings::values.use_hw_renderer);
    qt_config->setValue("use_hw_shader", Settings::values.use_hw_shader);
    qt_config->setValue("shaders_accurate_gs", Settings::values.shaders_accurate_gs);
    qt_config->setValue("shaders_accurate_mul", Settings::values.shaders_accurate_mul);
    qt_config->setValue("use_shader_jit", Settings::values.use_shader_jit);
    qt_config->setValue("resolution_factor", Settings::values.resolution_factor);
    qt_config->setValue("use_vsync", Settings::values.use_vsync);
--- a/src/core/settings.cpp
+++ b/src/core/settings.cpp
@ -22,6 +22,9 @@ void Apply() {
    VideoCore::g_hw_renderer_enabled = values.use_hw_renderer;
    VideoCore::g_shader_jit_enabled = values.use_shader_jit;
    VideoCore::g_hw_shader_enabled = values.use_hw_shader;
    VideoCore::g_hw_shader_accurate_gs = values.shaders_accurate_gs;
    VideoCore::g_hw_shader_accurate_mul = values.shaders_accurate_mul;
    if (VideoCore::g_emu_window) {
        auto layout = VideoCore::g_emu_window->GetFramebufferLayout();
--- a/src/core/settings.h
+++ b/src/core/settings.h
@ -107,6 +107,9 @@ struct Values {
    // Renderer
    bool use_hw_renderer;
    bool use_hw_shader;
    bool shaders_accurate_gs;
    bool shaders_accurate_mul;
    bool use_shader_jit;
    u16 resolution_factor;
    bool use_vsync;
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@ -286,6 +286,38 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
        if (g_debug_context)
            g_debug_context->OnEvent(DebugContext::Event::IncomingPrimitiveBatch, nullptr);
        PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
        bool accelerate_draw = VideoCore::g_hw_shader_enabled && primitive_assembler.IsEmpty();
        if (regs.pipeline.use_gs == PipelineRegs::UseGS::No) {
            auto topology = primitive_assembler.GetTopology();
            if (topology == PipelineRegs::TriangleTopology::Shader ||
                topology == PipelineRegs::TriangleTopology::List) {
                accelerate_draw = accelerate_draw && (regs.pipeline.num_vertices % 3) == 0;
            }
            // TODO (wwylele): for Strip/Fan topology, if the primitive assembler is not restarted
            // after this draw call, the buffered vertex from this draw should "leak" to the next
            // draw, in which case we should buffer the vertex into the software primitive
            // assembler, or disable accelerate draw completely. However, no game has been found
            // yet that does this, so this is left unimplemented for now. Revisit this when an
            // issue is found in games.
        } else {
            if (VideoCore::g_hw_shader_accurate_gs) {
                accelerate_draw = false;
            }
        }
        bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
        if (accelerate_draw &&
            VideoCore::g_renderer->Rasterizer()->AccelerateDrawBatch(is_indexed)) {
            if (g_debug_context) {
                g_debug_context->OnEvent(DebugContext::Event::FinishedPrimitiveBatch, nullptr);
            }
            break;
        }
        // Processes information about internal vertex attributes to figure out how a vertex is
        // loaded.
        // Later, these can be compiled and cached.
@ -294,15 +326,11 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
        Shader::OutputVertex::ValidateSemantics(regs.rasterizer);
        // Load vertices
        bool is_indexed = (id == PICA_REG_INDEX(pipeline.trigger_draw_indexed));
        const auto& index_info = regs.pipeline.index_array;
        const u8* index_address_8 = Memory::GetPhysicalPointer(base_address + index_info.offset);
        const u16* index_address_16 = reinterpret_cast<const u16*>(index_address_8);
        bool index_u16 = index_info.format != 0;
        PrimitiveAssembler<Shader::OutputVertex>& primitive_assembler = g_state.primitive_assembler;
        if (g_debug_context && g_debug_context->recorder) {
            for (int i = 0; i < 3; ++i) {
                const auto texture = regs.texturing.GetTextures()[i];
--- a/src/video_core/primitive_assembly.cpp
+++ b/src/video_core/primitive_assembly.cpp
@ -71,6 +71,16 @@ void PrimitiveAssembler<VertexType>::Reconfigure(PipelineRegs::TriangleTopology
    this->topology = topology;
 }
 /// Returns true when the internal buffer holds nothing: the buffer index is zero and no strip
 /// is marked as ready.
 template <typename VertexType>
 bool PrimitiveAssembler<VertexType>::IsEmpty() const {
    const bool has_buffered_vertices = buffer_index != 0;
    const bool strip_pending = strip_ready != false;
    return !has_buffered_vertices && !strip_pending;
 }
 /// Returns the triangle topology currently stored on this assembler.
 template <typename VertexType>
 PipelineRegs::TriangleTopology PrimitiveAssembler<VertexType>::GetTopology() const {
    return this->topology;
 }
 // explicitly instantiate use cases
 template struct PrimitiveAssembler<Shader::OutputVertex>;
--- a/src/video_core/primitive_assembly.h
+++ b/src/video_core/primitive_assembly.h
@ -45,6 +45,16 @@ struct PrimitiveAssembler {
     */
    void Reconfigure(PipelineRegs::TriangleTopology topology);
    /**
     * Returns whether the PrimitiveAssembler has an empty internal buffer.
     */
    bool IsEmpty() const;
    /**
     * Returns the current topology.
     */
    PipelineRegs::TriangleTopology GetTopology() const;
 private:
    PipelineRegs::TriangleTopology topology;
--- a/src/video_core/rasterizer_interface.h
+++ b/src/video_core/rasterizer_interface.h
@ -66,5 +66,10 @@ public:
                                   ScreenInfo& screen_info) {
        return false;
    }
    /// Attempt to draw the current batch using hardware shaders.
    /// @param is_indexed whether the draw was triggered as an indexed draw
    /// @return true if the batch was drawn; this default implementation draws nothing and always
    ///         returns false
    virtual bool AccelerateDrawBatch(bool is_indexed) {
        return false;
    }
 };
 } // namespace VideoCore
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@ -12,6 +12,7 @@
 #include "common/logging/log.h"
 #include "common/math_util.h"
 #include "common/microprofile.h"
 #include "common/scope_exit.h"
 #include "common/vector_math.h"
 #include "core/hw/gpu.h"
 #include "video_core/pica_state.h"
@ -26,13 +27,17 @@
 using PixelFormat = SurfaceParams::PixelFormat;
 using SurfaceType = SurfaceParams::SurfaceType;
 MICROPROFILE_DEFINE(OpenGL_VAO, "OpenGL", "Vertex Array Setup", MP_RGB(255, 128, 0));
 MICROPROFILE_DEFINE(OpenGL_VS, "OpenGL", "Vertex Shader Setup", MP_RGB(192, 128, 128));
 MICROPROFILE_DEFINE(OpenGL_GS, "OpenGL", "Geometry Shader Setup", MP_RGB(128, 192, 128));
 MICROPROFILE_DEFINE(OpenGL_Drawing, "OpenGL", "Drawing", MP_RGB(128, 128, 192));
 MICROPROFILE_DEFINE(OpenGL_Blits, "OpenGL", "Blits", MP_RGB(100, 100, 255));
 MICROPROFILE_DEFINE(OpenGL_CacheManagement, "OpenGL", "Cache Mgmt", MP_RGB(100, 255, 100));
 RasterizerOpenGL::RasterizerOpenGL()
    : shader_dirty(true), vertex_buffer(GL_ARRAY_BUFFER, VERTEX_BUFFER_SIZE),
-      uniform_buffer(GL_UNIFORM_BUFFER, UNIFORM_BUFFER_SIZE) {
+      uniform_buffer(GL_UNIFORM_BUFFER, UNIFORM_BUFFER_SIZE),
      index_buffer(GL_ELEMENT_ARRAY_BUFFER, INDEX_BUFFER_SIZE) {
    // Clipping plane 0 is always enabled for PICA fixed clip plane z <= 0
    state.clip_distance[0] = true;
@ -46,13 +51,9 @@ RasterizerOpenGL::RasterizerOpenGL()
    texture_cube_sampler.Create();
    state.texture_cube_unit.sampler = texture_cube_sampler.sampler.handle;
-    // Generate VBO, VAO and UBO
+    // Generate VAO
-    vertex_array.Create();
+    sw_vao.Create();
-
+    hw_vao.Create();
    state.draw.vertex_array = vertex_array.handle;
    state.draw.vertex_buffer = vertex_buffer.GetHandle();
    state.draw.uniform_buffer = uniform_buffer.GetHandle();
    state.Apply();
    uniform_block_data.dirty = true;
@ -67,10 +68,18 @@ RasterizerOpenGL::RasterizerOpenGL()
    uniform_block_data.proctex_diff_lut_dirty = true;
    glGetIntegerv(GL_UNIFORM_BUFFER_OFFSET_ALIGNMENT, &uniform_buffer_alignment);
    uniform_size_aligned_vs =
        Common::AlignUp<size_t>(sizeof(VSUniformData), uniform_buffer_alignment);
    uniform_size_aligned_gs =
        Common::AlignUp<size_t>(sizeof(GSUniformData), uniform_buffer_alignment);
    uniform_size_aligned_fs =
        Common::AlignUp<size_t>(sizeof(UniformData), uniform_buffer_alignment);
-    // Set vertex attributes
+    // Set vertex attributes for software shader path
    state.draw.vertex_array = sw_vao.handle;
    state.draw.vertex_buffer = vertex_buffer.GetHandle();
    state.Apply();
    glVertexAttribPointer(GLShader::ATTRIBUTE_POSITION, 4, GL_FLOAT, GL_FALSE,
                          sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, position));
    glEnableVertexAttribArray(GLShader::ATTRIBUTE_POSITION);
@ -176,6 +185,11 @@ RasterizerOpenGL::RasterizerOpenGL()
    glActiveTexture(TextureUnits::ProcTexDiffLUT.Enum());
    glTexBuffer(GL_TEXTURE_BUFFER, GL_RGBA32F, proctex_diff_lut_buffer.handle);
    // Bind index buffer for hardware shader path
    state.draw.vertex_array = hw_vao.handle;
    state.Apply();
    glBindBuffer(GL_ELEMENT_ARRAY_BUFFER, index_buffer.GetHandle());
    shader_program_manager =
        std::make_unique<ShaderProgramManager>(GLAD_GL_ARB_separate_shader_objects);
@ -258,10 +272,264 @@ void RasterizerOpenGL::AddTriangle(const Pica::Shader::OutputVertex& v0,
    vertex_batch.emplace_back(v2, AreQuaternionsOpposite(v0.quat, v2.quat));
 }
 /// OpenGL component types used when describing hardware vertex attributes. The table is indexed
 /// by the integer value of the attribute's format, in the order annotated on each entry.
 static constexpr std::array<GLenum, 4> vs_attrib_types{
    GL_BYTE,          // VertexAttributeFormat::BYTE
    GL_UNSIGNED_BYTE, // VertexAttributeFormat::UBYTE
    GL_SHORT,         // VertexAttributeFormat::SHORT
    GL_FLOAT          // VertexAttributeFormat::FLOAT
 };
 /// Summary of the vertex input required by one draw call.
 /// NOTE(review): this appears to duplicate RasterizerOpenGL::VertexArrayInfo declared in
 /// gl_rasterizer.h; AnalyzeVertexArray below returns the nested type. Confirm whether this
 /// file-scope copy is still needed.
 struct VertexArrayInfo {
    u32 vs_input_index_min; // smallest vertex index referenced by the draw
    u32 vs_input_index_max; // largest vertex index referenced by the draw
    u32 vs_input_size;      // total bytes of vertex input covering [min, max] over all loaders
 };
 /// Determines the range of vertex indices referenced by the current draw and the number of bytes
 /// of vertex input needed to cover that range across all attribute loaders that have components.
 /// @param is_indexed whether the draw reads its vertex indices from the index array
 /// @return the lowest index, the highest index and the total vertex input size in bytes
 RasterizerOpenGL::VertexArrayInfo RasterizerOpenGL::AnalyzeVertexArray(bool is_indexed) {
    const auto& regs = Pica::g_state.regs;
    const auto& vertex_attributes = regs.pipeline.vertex_attributes;
    u32 lowest_vertex;
    u32 highest_vertex;
    if (!is_indexed) {
        // Sequential draw: the range follows directly from the offset and the vertex count.
        lowest_vertex = regs.pipeline.vertex_offset;
        highest_vertex = regs.pipeline.vertex_offset + regs.pipeline.num_vertices - 1;
    } else {
        const auto& index_info = regs.pipeline.index_array;
        const PAddr index_addr = vertex_attributes.GetPhysicalBaseAddress() + index_info.offset;
        const u8* indices_u8 = Memory::GetPhysicalPointer(index_addr);
        const u16* indices_u16 = reinterpret_cast<const u16*>(indices_u8);
        const bool is_u16 = index_info.format != 0;
        // Flush the index region through the resource cache before scanning it.
        const std::size_t index_bytes = regs.pipeline.num_vertices * (is_u16 ? 2 : 1);
        res_cache.FlushRegion(index_addr, index_bytes, nullptr);
        // Start from an inverted range and narrow it with every index that is read.
        lowest_vertex = 0xFFFF;
        highest_vertex = 0;
        for (u32 i = 0; i < regs.pipeline.num_vertices; ++i) {
            const u32 vertex = is_u16 ? indices_u16[i] : indices_u8[i];
            lowest_vertex = std::min(lowest_vertex, vertex);
            highest_vertex = std::max(highest_vertex, vertex);
        }
    }
    const u32 vertex_count = highest_vertex - lowest_vertex + 1;
    u32 total_bytes = 0;
    for (const auto& loader : vertex_attributes.attribute_loaders) {
        if (loader.component_count == 0) {
            continue;
        }
        total_bytes += loader.byte_count * vertex_count;
    }
    return {lowest_vertex, highest_vertex, total_bytes};
 }
 /// Configures the hardware-shader VAO for the current draw and copies the vertex data of each
 /// usable attribute loader into the mapped vertex buffer.
 /// @param array_ptr destination pointer inside the mapped vertex buffer
 /// @param buffer_offset offset within the vertex buffer that corresponds to array_ptr
 /// @param vs_input_index_min lowest vertex index whose data is copied
 /// @param vs_input_index_max highest vertex index whose data is copied
 void RasterizerOpenGL::SetupVertexArray(u8* array_ptr, GLintptr buffer_offset,
                                         GLuint vs_input_index_min, GLuint vs_input_index_max) {
    MICROPROFILE_SCOPE(OpenGL_VAO);
    const auto& regs = Pica::g_state.regs;
    const auto& vertex_attributes = regs.pipeline.vertex_attributes;
    const PAddr base_address = vertex_attributes.GetPhysicalBaseAddress();
    state.draw.vertex_array = hw_vao.handle;
    state.draw.vertex_buffer = vertex_buffer.GetHandle();
    state.Apply();
    // Every loader covers the same inclusive index range.
    const u32 vertex_num = vs_input_index_max - vs_input_index_min + 1;
    std::array<bool, 16> enable_attributes{};
    for (const auto& loader : vertex_attributes.attribute_loaders) {
        if (loader.component_count == 0 || loader.byte_count == 0) {
            continue;
        }
        // Running byte offset of the current component within one vertex of this loader.
        u32 offset = 0;
        for (u32 comp = 0; comp < loader.component_count && comp < 12; ++comp) {
            const u32 attribute_index = loader.GetComponent(comp);
            if (attribute_index >= 12) {
                // Attribute ids 12, 13, 14 and 15 signify 4, 8, 12 and 16-byte paddings,
                // respectively
                offset = Common::AlignUp(offset, 4);
                offset += (attribute_index - 11) * 4;
                continue;
            }
            if (vertex_attributes.GetNumElements(attribute_index) == 0) {
                continue;
            }
            offset =
                Common::AlignUp(offset, vertex_attributes.GetElementSizeInBytes(attribute_index));
            const u32 input_reg = regs.vs.GetRegisterForAttribute(attribute_index);
            const GLint size = vertex_attributes.GetNumElements(attribute_index);
            const GLenum type =
                vs_attrib_types[static_cast<u32>(vertex_attributes.GetFormat(attribute_index))];
            const GLsizei stride = loader.byte_count;
            glVertexAttribPointer(input_reg, size, type, GL_FALSE, stride,
                                  reinterpret_cast<GLvoid*>(buffer_offset + offset));
            enable_attributes[input_reg] = true;
            offset += vertex_attributes.GetStride(attribute_index);
        }
        // Copy this loader's data for the referenced index range, then advance both the write
        // pointer and the matching buffer offset for the next loader.
        const PAddr data_addr =
            base_address + loader.data_offset + (vs_input_index_min * loader.byte_count);
        const u32 data_size = loader.byte_count * vertex_num;
        res_cache.FlushRegion(data_addr, data_size, nullptr);
        std::memcpy(array_ptr, Memory::GetPhysicalPointer(data_addr), data_size);
        array_ptr += data_size;
        buffer_offset += data_size;
    }
    for (std::size_t i = 0; i < enable_attributes.size(); ++i) {
        // Only touch the GL enable state when it differs from what the VAO already has.
        const bool should_enable = enable_attributes[i];
        if (should_enable != hw_vao_enabled_attributes[i]) {
            if (should_enable) {
                glEnableVertexAttribArray(i);
            } else {
                glDisableVertexAttribArray(i);
            }
            hw_vao_enabled_attributes[i] = should_enable;
        }
        if (!vertex_attributes.IsDefaultAttribute(i)) {
            continue;
        }
        // A default attribute supplies a constant value unless an array already feeds the
        // register it maps to.
        const u32 reg = regs.vs.GetRegisterForAttribute(i);
        if (enable_attributes[reg]) {
            continue;
        }
        const auto& attr = Pica::g_state.input_default_attributes.attr[i];
        glVertexAttrib4f(reg, attr.x.ToFloat32(), attr.y.ToFloat32(), attr.z.ToFloat32(),
                         attr.w.ToFloat32());
    }
 }
 bool RasterizerOpenGL::SetupVertexShader() {
    MICROPROFILE_SCOPE(OpenGL_VS);
    GLShader::PicaVSConfig vs_config(Pica::g_state.regs, Pica::g_state.vs);
    return shader_program_manager->UseProgrammableVertexShader(vs_config, Pica::g_state.vs);
 }
 bool RasterizerOpenGL::SetupGeometryShader() {
    MICROPROFILE_SCOPE(OpenGL_GS);
    const auto& regs = Pica::g_state.regs;
    if (regs.pipeline.use_gs == Pica::PipelineRegs::UseGS::No) {
        GLShader::PicaFixedGSConfig gs_config(regs);
        shader_program_manager->UseFixedGeometryShader(gs_config);
        return true;
    } else {
        GLShader::PicaGSConfig gs_config(regs, Pica::g_state.gs);
        return shader_program_manager->UseProgrammableGeometryShader(gs_config, Pica::g_state.gs);
    }
 }
 /// Tries to draw the current batch with hardware shaders.
 /// @param is_indexed whether the draw reads its vertex indices from the index array
 /// @return false if the geometry shader configuration is not handled here, if either shader
 ///         stage could not be set up, or if the draw itself reports failure; true otherwise
 bool RasterizerOpenGL::AccelerateDrawBatch(bool is_indexed) {
    const auto& pipeline = Pica::g_state.regs.pipeline;
    // With a geometry shader in use, only point mode with the shader topology is accepted.
    const bool gs_in_use = pipeline.use_gs != Pica::PipelineRegs::UseGS::No;
    if (gs_in_use &&
        (pipeline.gs_config.mode != Pica::PipelineRegs::GSMode::Point ||
         pipeline.triangle_topology != Pica::PipelineRegs::TriangleTopology::Shader)) {
        return false;
    }
    // Each step runs only if the previous one succeeded.
    return SetupVertexShader() && SetupGeometryShader() && Draw(true, is_indexed);
 }
 /// Maps the current PICA pipeline state to the OpenGL primitive mode used for the draw call.
 /// @param use_gs whether a geometry shader is used for this draw
 static GLenum GetCurrentPrimitiveMode(bool use_gs) {
    const auto& regs = Pica::g_state.regs;
    if (!use_gs) {
        // Without a geometry shader the mode follows the configured triangle topology.
        switch (regs.pipeline.triangle_topology) {
        case Pica::PipelineRegs::TriangleTopology::Strip:
            return GL_TRIANGLE_STRIP;
        case Pica::PipelineRegs::TriangleTopology::Fan:
            return GL_TRIANGLE_FAN;
        case Pica::PipelineRegs::TriangleTopology::List:
        case Pica::PipelineRegs::TriangleTopology::Shader:
            return GL_TRIANGLES;
        default:
            UNREACHABLE();
        }
    } else {
        // Ratio of geometry shader input attributes to vertex shader outputs; each supported
        // ratio selects the OpenGL primitive with that many vertices.
        const auto vertices_per_primitive = (regs.gs.max_input_attribute_index + 1) /
                                            (regs.pipeline.vs_outmap_total_minus_1_a + 1);
        switch (vertices_per_primitive) {
        case 1:
            return GL_POINTS;
        case 2:
            return GL_LINES;
        case 3:
            return GL_TRIANGLES;
        case 4:
            return GL_LINES_ADJACENCY;
        case 6:
            return GL_TRIANGLES_ADJACENCY;
        default:
            UNREACHABLE();
        }
    }
 }
 /// Uploads the vertex (and, for indexed draws, index) data of the current batch into the stream
 /// buffers and issues the OpenGL draw call.
 /// @param is_indexed whether the draw reads its vertex indices from the index array
 /// @param use_gs whether a geometry shader is used; affects the chosen primitive mode
 /// @return false, after logging a warning, if the vertex input or the index data is larger than
 ///         its stream buffer; true once the draw call has been issued
 bool RasterizerOpenGL::AccelerateDrawBatchInternal(bool is_indexed, bool use_gs) {
    const auto& regs = Pica::g_state.regs;
    GLenum primitive_mode = GetCurrentPrimitiveMode(use_gs);
    auto [vs_input_index_min, vs_input_index_max, vs_input_size] = AnalyzeVertexArray(is_indexed);
    if (vs_input_size > VERTEX_BUFFER_SIZE) {
        NGLOG_WARNING(Render_OpenGL, "Too large vertex input size {}", vs_input_size);
        return false;
    }
    // Bind the vertex stream buffer before mapping it.
    state.draw.vertex_buffer = vertex_buffer.GetHandle();
    state.Apply();
    u8* buffer_ptr;
    GLintptr buffer_offset;
    // Map room for the vertex input, let SetupVertexArray fill it and set up the attribute
    // pointers, then unmap the bytes that were written.
    std::tie(buffer_ptr, buffer_offset, std::ignore) = vertex_buffer.Map(vs_input_size, 4);
    SetupVertexArray(buffer_ptr, buffer_offset, vs_input_index_min, vs_input_index_max);
    vertex_buffer.Unmap(vs_input_size);
    shader_program_manager->ApplyTo(state);
    state.Apply();
    if (is_indexed) {
        bool index_u16 = regs.pipeline.index_array.format != 0;
        std::size_t index_buffer_size = regs.pipeline.num_vertices * (index_u16 ? 2 : 1);
        // Note: this check runs after the vertex data above has already been uploaded.
        if (index_buffer_size > INDEX_BUFFER_SIZE) {
            NGLOG_WARNING(Render_OpenGL, "Too large index input size {}", index_buffer_size);
            return false;
        }
        const u8* index_data =
            Memory::GetPhysicalPointer(regs.pipeline.vertex_attributes.GetPhysicalBaseAddress() +
                                       regs.pipeline.index_array.offset);
        // Copy the index data unchanged into the index stream buffer.
        std::tie(buffer_ptr, buffer_offset, std::ignore) = index_buffer.Map(index_buffer_size, 4);
        std::memcpy(buffer_ptr, index_data, index_buffer_size);
        index_buffer.Unmap(index_buffer_size);
        // The uploaded vertex data starts at vs_input_index_min, so the base vertex is the
        // negated minimum index to make the unmodified indices address it correctly.
        glDrawRangeElementsBaseVertex(
            primitive_mode, vs_input_index_min, vs_input_index_max, regs.pipeline.num_vertices,
            index_u16 ? GL_UNSIGNED_SHORT : GL_UNSIGNED_BYTE,
            reinterpret_cast<const void*>(buffer_offset), -static_cast<GLint>(vs_input_index_min));
    } else {
        // The attribute pointers already include the buffer offset, so drawing starts at 0.
        glDrawArrays(primitive_mode, 0, regs.pipeline.num_vertices);
    }
    return true;
 }
 /// Draws the accumulated software vertex batch through the non-accelerated path; does nothing
 /// when the batch is empty.
 void RasterizerOpenGL::DrawTriangles() {
    if (!vertex_batch.empty()) {
        Draw(false, false);
    }
 }
 bool RasterizerOpenGL::Draw(bool accelerate, bool is_indexed) {
    MICROPROFILE_SCOPE(OpenGL_Drawing);
    const auto& regs = Pica::g_state.regs;
@ -474,7 +742,8 @@ void RasterizerOpenGL::DrawTriangles() {
    }
    // Sync the uniform data
-    UploadUniforms();
+    const bool use_gs = regs.pipeline.use_gs == Pica::PipelineRegs::UseGS::Yes;
    UploadUniforms(accelerate, use_gs);
    // Viewport can have negative offsets or larger
    // dimensions than our framebuffer sub-rect.
@ -487,22 +756,31 @@ void RasterizerOpenGL::DrawTriangles() {
    state.scissor.height = draw_rect.GetHeight();
    state.Apply();
    shader_program_manager->UseTrivialVertexShader();
    shader_program_manager->UseTrivialGeometryShader();
    shader_program_manager->ApplyTo(state);
    state.Apply();
    // Draw the vertex batch
-    size_t max_vertices = 3 * (vertex_buffer.GetSize() / (3 * sizeof(HardwareVertex)));
+    bool succeeded = true;
-    for (size_t base_vertex = 0; base_vertex < vertex_batch.size(); base_vertex += max_vertices) {
+    if (accelerate) {
-        size_t vertices = std::min(max_vertices, vertex_batch.size() - base_vertex);
+        succeeded = AccelerateDrawBatchInternal(is_indexed, use_gs);
-        size_t vertex_size = vertices * sizeof(HardwareVertex);
+    } else {
-        u8* vbo;
+        state.draw.vertex_array = sw_vao.handle;
-        GLintptr offset;
+        state.draw.vertex_buffer = vertex_buffer.GetHandle();
-        std::tie(vbo, offset, std::ignore) = vertex_buffer.Map(vertex_size, sizeof(HardwareVertex));
+        shader_program_manager->UseTrivialVertexShader();
-        memcpy(vbo, vertex_batch.data() + base_vertex, vertex_size);
+        shader_program_manager->UseTrivialGeometryShader();
-        vertex_buffer.Unmap(vertex_size);
+        shader_program_manager->ApplyTo(state);
-        glDrawArrays(GL_TRIANGLES, offset / sizeof(HardwareVertex), (GLsizei)vertices);
+        state.Apply();
        std::size_t max_vertices = 3 * (VERTEX_BUFFER_SIZE / (3 * sizeof(HardwareVertex)));
        for (std::size_t base_vertex = 0; base_vertex < vertex_batch.size();
             base_vertex += max_vertices) {
            std::size_t vertices = std::min(max_vertices, vertex_batch.size() - base_vertex);
            std::size_t vertex_size = vertices * sizeof(HardwareVertex);
            u8* vbo;
            GLintptr offset;
            std::tie(vbo, offset, std::ignore) =
                vertex_buffer.Map(vertex_size, sizeof(HardwareVertex));
            std::memcpy(vbo, vertex_batch.data() + base_vertex, vertex_size);
            vertex_buffer.Unmap(vertex_size);
            glDrawArrays(GL_TRIANGLES, offset / sizeof(HardwareVertex), (GLsizei)vertices);
        }
    }
    // Disable scissor test
@ -532,6 +810,8 @@ void RasterizerOpenGL::DrawTriangles() {
        res_cache.InvalidateRegion(boost::icl::first(interval), boost::icl::length(interval),
                                   depth_surface);
    }
    return succeeded;
 }
 void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
@ -1648,18 +1928,53 @@ void RasterizerOpenGL::SyncLightDistanceAttenuationScale(int light_index) {
    }
 }
-void RasterizerOpenGL::UploadUniforms() {
+void RasterizerOpenGL::UploadUniforms(bool accelerate_draw, bool use_gs) {
-    if (!uniform_block_data.dirty)
+    // glBindBufferRange below also changes the generic buffer binding point, so we sync the state
    // first
    state.draw.uniform_buffer = uniform_buffer.GetHandle();
    state.Apply();
    bool sync_vs = accelerate_draw;
    bool sync_gs = accelerate_draw && use_gs;
    bool sync_fs = uniform_block_data.dirty;
    if (!sync_vs && !sync_gs && !sync_fs)
        return;
-    size_t uniform_size = uniform_size_aligned_fs;
+    size_t uniform_size =
        uniform_size_aligned_vs + uniform_size_aligned_gs + uniform_size_aligned_fs;
    size_t used_bytes = 0;
    u8* uniforms;
    GLintptr offset;
-    std::tie(uniforms, offset, std::ignore) =
+    bool invalidate;
    std::tie(uniforms, offset, invalidate) =
        uniform_buffer.Map(uniform_size, uniform_buffer_alignment);
-    std::memcpy(uniforms, &uniform_block_data.data, sizeof(UniformData));
+
-    uniform_buffer.Unmap(uniform_size);
+    if (sync_vs) {
-    glBindBufferRange(GL_UNIFORM_BUFFER, 0, uniform_buffer.GetHandle(), offset,
+        VSUniformData vs_uniforms;
-                      sizeof(UniformData));
+        vs_uniforms.uniforms.SetFromRegs(Pica::g_state.regs.vs, Pica::g_state.vs);
-    uniform_block_data.dirty = false;
+        std::memcpy(uniforms + used_bytes, &vs_uniforms, sizeof(vs_uniforms));
        glBindBufferRange(GL_UNIFORM_BUFFER, static_cast<GLuint>(UniformBindings::VS),
                          uniform_buffer.GetHandle(), offset + used_bytes, sizeof(VSUniformData));
        used_bytes += uniform_size_aligned_vs;
    }
    if (sync_gs) {
        GSUniformData gs_uniforms;
        gs_uniforms.uniforms.SetFromRegs(Pica::g_state.regs.gs, Pica::g_state.gs);
        std::memcpy(uniforms + used_bytes, &gs_uniforms, sizeof(gs_uniforms));
        glBindBufferRange(GL_UNIFORM_BUFFER, static_cast<GLuint>(UniformBindings::GS),
                          uniform_buffer.GetHandle(), offset + used_bytes, sizeof(GSUniformData));
        used_bytes += uniform_size_aligned_gs;
    }
    if (sync_fs || invalidate) {
        std::memcpy(uniforms + used_bytes, &uniform_block_data.data, sizeof(UniformData));
        glBindBufferRange(GL_UNIFORM_BUFFER, static_cast<GLuint>(UniformBindings::Common),
                          uniform_buffer.GetHandle(), offset + used_bytes, sizeof(UniformData));
        uniform_block_data.dirty = false;
        used_bytes += uniform_size_aligned_fs;
    }
    uniform_buffer.Unmap(used_bytes);
 }
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@ -50,6 +50,7 @@ public:
    bool AccelerateFill(const GPU::Regs::MemoryFillConfig& config) override;
    bool AccelerateDisplay(const GPU::Regs::FramebufferConfig& config, PAddr framebuffer_addr,
                           u32 pixel_stride, ScreenInfo& screen_info) override;
    bool AccelerateDrawBatch(bool is_indexed) override;
 private:
    struct SamplerInfo {
@ -73,6 +74,7 @@ private:
    /// Structure that the hardware rendered vertices are composed of
    struct HardwareVertex {
        HardwareVertex() = default;
        HardwareVertex(const Pica::Shader::OutputVertex& v, bool flip_quaternion) {
            position[0] = v.pos.x.ToFloat32();
            position[1] = v.pos.y.ToFloat32();
@ -216,7 +218,32 @@ private:
    void SyncLightDistanceAttenuationScale(int light_index);
    /// Upload the uniform blocks to the uniform buffer object
-    void UploadUniforms();
+    void UploadUniforms(bool accelerate_draw, bool use_gs);
    /// Generic draw function for DrawTriangles and AccelerateDrawBatch
    bool Draw(bool accelerate, bool is_indexed);
    /// Internal implementation for AccelerateDrawBatch
    bool AccelerateDrawBatchInternal(bool is_indexed, bool use_gs);
    /// Result of AnalyzeVertexArray: the vertex index range and input size of one draw call
    struct VertexArrayInfo {
        u32 vs_input_index_min; ///< Lowest vertex index referenced by the draw
        u32 vs_input_index_max; ///< Highest vertex index referenced by the draw
        u32 vs_input_size;      ///< Total size in bytes of the vertex input for that range
    };
    /// Retrieve the range and the size of the input vertex
    VertexArrayInfo AnalyzeVertexArray(bool is_indexed);
    /// Setup vertex array for AccelerateDrawBatch
    void SetupVertexArray(u8* array_ptr, GLintptr buffer_offset, GLuint vs_input_index_min,
                          GLuint vs_input_index_max);
    /// Setup vertex shader for AccelerateDrawBatch
    bool SetupVertexShader();
    /// Setup geometry shader for AccelerateDrawBatch
    bool SetupGeometryShader();
    OpenGLState state;
@ -242,14 +269,21 @@ private:
    // They shall be big enough for about one frame.
    static constexpr size_t VERTEX_BUFFER_SIZE = 32 * 1024 * 1024;
    static constexpr size_t INDEX_BUFFER_SIZE = 1 * 1024 * 1024;
    static constexpr size_t UNIFORM_BUFFER_SIZE = 2 * 1024 * 1024;
    OGLVertexArray sw_vao; // VAO for software shader draw
    OGLVertexArray hw_vao; // VAO for hardware shader / accelerate draw
    std::array<bool, 16> hw_vao_enabled_attributes{};
    std::array<SamplerInfo, 3> texture_samplers;
    OGLVertexArray vertex_array;
    OGLStreamBuffer vertex_buffer;
    OGLStreamBuffer uniform_buffer;
    OGLStreamBuffer index_buffer;
    OGLFramebuffer framebuffer;
    GLint uniform_buffer_alignment;
    size_t uniform_size_aligned_vs;
    size_t uniform_size_aligned_gs;
    size_t uniform_size_aligned_fs;
    SamplerInfo texture_cube_sampler;
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@ -18,6 +18,7 @@
 #include "video_core/renderer_opengl/gl_shader_decompiler.h"
 #include "video_core/renderer_opengl/gl_shader_gen.h"
 #include "video_core/renderer_opengl/gl_shader_util.h"
 #include "video_core/video_core.h"
 using Pica::FramebufferRegs;
 using Pica::LightingRegs;
@ -226,7 +227,7 @@ void PicaShaderConfigCommon::Init(const Pica::ShaderRegs& regs, Pica::Shader::Sh
    program_hash = setup.GetProgramCodeHash();
    swizzle_hash = setup.GetSwizzleDataHash();
    main_offset = regs.main_offset;
-    sanitize_mul = false; // TODO (wwylele): stubbed now. Should sync with user settings
+    sanitize_mul = VideoCore::g_hw_shader_accurate_mul;
    num_outputs = 0;
    output_map.fill(16);
--- a/src/video_core/renderer_opengl/gl_stream_buffer.cpp
+++ b/src/video_core/renderer_opengl/gl_stream_buffer.cpp
@ -14,16 +14,26 @@ OGLStreamBuffer::OGLStreamBuffer(GLenum target, GLsizeiptr size, bool prefer_coh
    gl_buffer.Create();
    glBindBuffer(gl_target, gl_buffer.handle);
    GLsizeiptr allocate_size = size;
    if (target == GL_ARRAY_BUFFER) {
        // On AMD GPU there is a strange crash in indexed drawing. The crash happens when the buffer
        // read position is near the end and is an out-of-bound access to the vertex buffer. This is
        // probably a bug in the driver and is related to the usage of vec3<byte> attributes in the
        // vertex array. Doubling the allocation size for the vertex buffer seems to avoid the
        // crash.
        allocate_size *= 2;
    }
    if (GLAD_GL_ARB_buffer_storage) {
        persistent = true;
        coherent = prefer_coherent;
        GLbitfield flags =
            GL_MAP_WRITE_BIT | GL_MAP_PERSISTENT_BIT | (coherent ? GL_MAP_COHERENT_BIT : 0);
-        glBufferStorage(gl_target, buffer_size, nullptr, flags);
+        glBufferStorage(gl_target, allocate_size, nullptr, flags);
        mapped_ptr = static_cast<u8*>(glMapBufferRange(
            gl_target, 0, buffer_size, flags | (coherent ? 0 : GL_MAP_FLUSH_EXPLICIT_BIT)));
    } else {
-        glBufferData(gl_target, buffer_size, nullptr, GL_STREAM_DRAW);
+        glBufferData(gl_target, allocate_size, nullptr, GL_STREAM_DRAW);
    }
 }
--- a/src/video_core/video_core.cpp
+++ b/src/video_core/video_core.cpp
@ -19,7 +19,9 @@ std::unique_ptr<RendererBase> g_renderer; ///< Renderer plugin
 std::atomic<bool> g_hw_renderer_enabled;
 std::atomic<bool> g_shader_jit_enabled;
-std::atomic<bool> g_vsync_enabled;
+std::atomic<bool> g_hw_shader_enabled;
 std::atomic<bool> g_hw_shader_accurate_gs;
 std::atomic<bool> g_hw_shader_accurate_mul;
 /// Initialize the video core
 bool Init(EmuWindow* emu_window) {
--- a/src/video_core/video_core.h
+++ b/src/video_core/video_core.h
@ -22,6 +22,9 @@ extern EmuWindow* g_emu_window;                  ///< Emu window
 // qt ui)
 extern std::atomic<bool> g_hw_renderer_enabled;
 extern std::atomic<bool> g_shader_jit_enabled;
 extern std::atomic<bool> g_hw_shader_enabled;
 extern std::atomic<bool> g_hw_shader_accurate_gs;
 extern std::atomic<bool> g_hw_shader_accurate_mul;
 /// Start the video core
 void Start();