diff --git a/src/video_core/CMakeLists.txt b/src/video_core/CMakeLists.txt index af9d929d6..581bbba46 100644 --- a/src/video_core/CMakeLists.txt +++ b/src/video_core/CMakeLists.txt @@ -104,6 +104,12 @@ add_library(video_core STATIC renderer_vulkan/vk_instance.h renderer_vulkan/vk_platform.cpp renderer_vulkan/vk_platform.h + renderer_vulkan/vk_shader_gen.cpp + renderer_vulkan/vk_shader_gen.h + renderer_vulkan/vk_shader_gen_spv.cpp + renderer_vulkan/vk_shader_gen_spv.h + renderer_vulkan/vk_shader_util.cpp + renderer_vulkan/vk_shader_util.h shader/debug_data.h shader/shader.cpp shader/shader.h @@ -133,7 +139,7 @@ create_target_directory_groups(video_core) target_link_libraries(video_core PUBLIC citra_common citra_core) target_link_libraries(video_core PRIVATE Boost::serialization dds-ktx json-headers nihstro-headers tsl::robin_map) -target_link_libraries(video_core PRIVATE vulkan-headers vma glad) +target_link_libraries(video_core PRIVATE vulkan-headers vma glad sirit SPIRV glslang) set_target_properties(video_core PROPERTIES INTERPROCEDURAL_OPTIMIZATION ${ENABLE_LTO}) if ("x86_64" IN_LIST ARCHITECTURE) diff --git a/src/video_core/renderer_vulkan/vk_shader_gen.cpp b/src/video_core/renderer_vulkan/vk_shader_gen.cpp new file mode 100644 index 000000000..648aa5d88 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_gen.cpp @@ -0,0 +1,1880 @@ +// Copyright 2023 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include +#include +#include "common/bit_set.h" +#include "common/logging/log.h" +#include "core/core.h" +#include "video_core/pica_state.h" +#include "video_core/regs_framebuffer.h" +#include "video_core/renderer_opengl/gl_shader_decompiler.h" +#include "video_core/renderer_vulkan/vk_instance.h" +#include "video_core/renderer_vulkan/vk_shader_gen.h" +#include "video_core/shader/shader_uniforms.h" +#include "video_core/video_core.h" + +using Pica::FramebufferRegs; +using Pica::LightingRegs; +using Pica::RasterizerRegs; +using Pica::TexturingRegs; +using TevStageConfig = TexturingRegs::TevStageConfig; +using VSOutputAttributes = RasterizerRegs::VSOutputAttributes; + +namespace Vulkan { + +const std::string UniformBlockDef = Pica::Shader::BuildShaderUniformDefinitions("binding = 1,"); + +static std::string GetVertexInterfaceDeclaration(bool is_output, bool use_clip_planes = false) { + std::string out; + + const auto append_variable = [&](std::string_view var, int location) { + out += fmt::format("layout (location={}) ", location); + out += fmt::format("{}{};\n", is_output ? 
"out " : "in ", var); + }; + + append_variable("vec4 primary_color", ATTRIBUTE_COLOR); + append_variable("vec2 texcoord0", ATTRIBUTE_TEXCOORD0); + append_variable("vec2 texcoord1", ATTRIBUTE_TEXCOORD1); + append_variable("vec2 texcoord2", ATTRIBUTE_TEXCOORD2); + append_variable("float texcoord0_w", ATTRIBUTE_TEXCOORD0_W); + append_variable("vec4 normquat", ATTRIBUTE_NORMQUAT); + append_variable("vec3 view", ATTRIBUTE_VIEW); + + if (is_output) { + // gl_PerVertex redeclaration is required for separate shader object + out += "out gl_PerVertex {\n"; + out += " vec4 gl_Position;\n"; + if (use_clip_planes) { + out += " float gl_ClipDistance[2];\n"; + } + out += "};\n"; + } + + return out; +} + +PicaFSConfig::PicaFSConfig(const Pica::Regs& regs, const Instance& instance) { + state.scissor_test_mode.Assign(regs.rasterizer.scissor_test.mode); + + state.depthmap_enable.Assign(regs.rasterizer.depthmap_enable); + + state.alpha_test_func.Assign(regs.framebuffer.output_merger.alpha_test.enable + ? regs.framebuffer.output_merger.alpha_test.func.Value() + : FramebufferRegs::CompareFunc::Always); + + state.texture0_type.Assign(regs.texturing.texture0.type); + + state.texture2_use_coord1.Assign(regs.texturing.main_config.texture2_use_coord1 != 0); + + // Emulate logic op in the shader if not supported. This is mostly for mobile GPUs + const bool emulate_logic_op = instance.NeedsLogicOpEmulation() && + !Pica::g_state.regs.framebuffer.output_merger.alphablend_enable; + + state.emulate_logic_op.Assign(emulate_logic_op); + if (emulate_logic_op) { + state.logic_op.Assign(regs.framebuffer.output_merger.logic_op); + } else { + state.logic_op.Assign(Pica::FramebufferRegs::LogicOp::NoOp); + } + + // Copy relevant tev stages fields. + // We don't sync const_color here because of the high variance, it is a + // shader uniform instead. 
+ const auto& tev_stages = regs.texturing.GetTevStages(); + DEBUG_ASSERT(state.tev_stages.size() == tev_stages.size()); + for (std::size_t i = 0; i < tev_stages.size(); i++) { + const auto& tev_stage = tev_stages[i]; + state.tev_stages[i].sources_raw = tev_stage.sources_raw; + state.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw; + state.tev_stages[i].ops_raw = tev_stage.ops_raw; + state.tev_stages[i].scales_raw = tev_stage.scales_raw; + } + + state.fog_mode.Assign(regs.texturing.fog_mode); + state.fog_flip.Assign(regs.texturing.fog_flip != 0); + + state.combiner_buffer_input.Assign( + regs.texturing.tev_combiner_buffer_input.update_mask_rgb.Value() | + regs.texturing.tev_combiner_buffer_input.update_mask_a.Value() << 4); + + // Fragment lighting + + state.lighting.enable.Assign(!regs.lighting.disable); + state.lighting.src_num.Assign(regs.lighting.max_light_index + 1); + + for (u32 light_index = 0; light_index < state.lighting.src_num; ++light_index) { + const u32 num = regs.lighting.light_enable.GetNum(light_index); + const auto& light = regs.lighting.light[num]; + state.lighting.light[light_index].num.Assign(num); + state.lighting.light[light_index].directional.Assign(light.config.directional != 0); + state.lighting.light[light_index].two_sided_diffuse.Assign(light.config.two_sided_diffuse != + 0); + state.lighting.light[light_index].geometric_factor_0.Assign( + light.config.geometric_factor_0 != 0); + state.lighting.light[light_index].geometric_factor_1.Assign( + light.config.geometric_factor_1 != 0); + state.lighting.light[light_index].dist_atten_enable.Assign( + !regs.lighting.IsDistAttenDisabled(num)); + state.lighting.light[light_index].spot_atten_enable.Assign( + !regs.lighting.IsSpotAttenDisabled(num)); + state.lighting.light[light_index].shadow_enable.Assign( + !regs.lighting.IsShadowDisabled(num)); + } + + state.lighting.lut_d0.enable.Assign(regs.lighting.config1.disable_lut_d0 == 0); + 
state.lighting.lut_d0.abs_input.Assign(regs.lighting.abs_lut_input.disable_d0 == 0); + state.lighting.lut_d0.type.Assign(regs.lighting.lut_input.d0.Value()); + state.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0); + + state.lighting.lut_d1.enable.Assign(regs.lighting.config1.disable_lut_d1 == 0); + state.lighting.lut_d1.abs_input.Assign(regs.lighting.abs_lut_input.disable_d1 == 0); + state.lighting.lut_d1.type.Assign(regs.lighting.lut_input.d1.Value()); + state.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1); + + // this is a dummy field due to lack of the corresponding register + state.lighting.lut_sp.enable.Assign(1); + state.lighting.lut_sp.abs_input.Assign(regs.lighting.abs_lut_input.disable_sp == 0); + state.lighting.lut_sp.type.Assign(regs.lighting.lut_input.sp.Value()); + state.lighting.lut_sp.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.sp); + + state.lighting.lut_fr.enable.Assign(regs.lighting.config1.disable_lut_fr == 0); + state.lighting.lut_fr.abs_input.Assign(regs.lighting.abs_lut_input.disable_fr == 0); + state.lighting.lut_fr.type.Assign(regs.lighting.lut_input.fr.Value()); + state.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr); + + state.lighting.lut_rr.enable.Assign(regs.lighting.config1.disable_lut_rr == 0); + state.lighting.lut_rr.abs_input.Assign(regs.lighting.abs_lut_input.disable_rr == 0); + state.lighting.lut_rr.type.Assign(regs.lighting.lut_input.rr.Value()); + state.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr); + + state.lighting.lut_rg.enable.Assign(regs.lighting.config1.disable_lut_rg == 0); + state.lighting.lut_rg.abs_input.Assign(regs.lighting.abs_lut_input.disable_rg == 0); + state.lighting.lut_rg.type.Assign(regs.lighting.lut_input.rg.Value()); + state.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg); + + 
state.lighting.lut_rb.enable.Assign(regs.lighting.config1.disable_lut_rb == 0); + state.lighting.lut_rb.abs_input.Assign(regs.lighting.abs_lut_input.disable_rb == 0); + state.lighting.lut_rb.type.Assign(regs.lighting.lut_input.rb.Value()); + state.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb); + + state.lighting.config.Assign(regs.lighting.config0.config); + state.lighting.enable_primary_alpha.Assign(regs.lighting.config0.enable_primary_alpha); + state.lighting.enable_secondary_alpha.Assign(regs.lighting.config0.enable_secondary_alpha); + state.lighting.bump_mode.Assign(regs.lighting.config0.bump_mode); + state.lighting.bump_selector.Assign(regs.lighting.config0.bump_selector); + state.lighting.bump_renorm.Assign(regs.lighting.config0.disable_bump_renorm == 0); + state.lighting.clamp_highlights.Assign(regs.lighting.config0.clamp_highlights != 0); + + state.lighting.enable_shadow.Assign(regs.lighting.config0.enable_shadow != 0); + state.lighting.shadow_primary.Assign(regs.lighting.config0.shadow_primary != 0); + state.lighting.shadow_secondary.Assign(regs.lighting.config0.shadow_secondary != 0); + state.lighting.shadow_invert.Assign(regs.lighting.config0.shadow_invert != 0); + state.lighting.shadow_alpha.Assign(regs.lighting.config0.shadow_alpha != 0); + state.lighting.shadow_selector.Assign(regs.lighting.config0.shadow_selector); + + state.proctex.enable.Assign(regs.texturing.main_config.texture3_enable); + if (state.proctex.enable) { + state.proctex.coord.Assign(regs.texturing.main_config.texture3_coordinates); + state.proctex.u_clamp.Assign(regs.texturing.proctex.u_clamp); + state.proctex.v_clamp.Assign(regs.texturing.proctex.v_clamp); + state.proctex.color_combiner.Assign(regs.texturing.proctex.color_combiner); + state.proctex.alpha_combiner.Assign(regs.texturing.proctex.alpha_combiner); + state.proctex.separate_alpha.Assign(regs.texturing.proctex.separate_alpha); + 
state.proctex.noise_enable.Assign(regs.texturing.proctex.noise_enable); + state.proctex.u_shift.Assign(regs.texturing.proctex.u_shift); + state.proctex.v_shift.Assign(regs.texturing.proctex.v_shift); + state.proctex.lut_width = regs.texturing.proctex_lut.width; + state.proctex.lut_offset0 = regs.texturing.proctex_lut_offset.level0; + state.proctex.lut_offset1 = regs.texturing.proctex_lut_offset.level1; + state.proctex.lut_offset2 = regs.texturing.proctex_lut_offset.level2; + state.proctex.lut_offset3 = regs.texturing.proctex_lut_offset.level3; + state.proctex.lod_min = regs.texturing.proctex_lut.lod_min; + state.proctex.lod_max = regs.texturing.proctex_lut.lod_max; + state.proctex.lut_filter.Assign(regs.texturing.proctex_lut.filter); + } + + state.shadow_rendering.Assign(regs.framebuffer.output_merger.fragment_operation_mode == + FramebufferRegs::FragmentOperationMode::Shadow); + + state.shadow_texture_orthographic.Assign(regs.texturing.shadow.orthographic != 0); +} + +void PicaShaderConfigCommon::Init(const Pica::RasterizerRegs& rasterizer, + const Pica::ShaderRegs& regs, Pica::Shader::ShaderSetup& setup) { + program_hash = setup.GetProgramCodeHash(); + swizzle_hash = setup.GetSwizzleDataHash(); + main_offset = regs.main_offset; + sanitize_mul = VideoCore::g_hw_shader_accurate_mul; + + num_outputs = 0; + load_flags.fill(AttribLoadFlags::Float); + output_map.fill(16); + + for (int reg : Common::BitSet(regs.output_mask)) { + output_map[reg] = num_outputs++; + } + + vs_output_attributes = Common::BitSet(regs.output_mask).Count(); + gs_output_attributes = vs_output_attributes; + + semantic_maps.fill({16, 0}); + for (u32 attrib = 0; attrib < rasterizer.vs_output_total; ++attrib) { + const std::array semantics{ + rasterizer.vs_output_attributes[attrib].map_x.Value(), + rasterizer.vs_output_attributes[attrib].map_y.Value(), + rasterizer.vs_output_attributes[attrib].map_z.Value(), + rasterizer.vs_output_attributes[attrib].map_w.Value(), + }; + for (u32 comp = 0; comp < 4; 
++comp) { + const auto semantic = semantics[comp]; + if (static_cast(semantic) < 24) { + semantic_maps[static_cast(semantic)] = {attrib, comp}; + } else if (semantic != VSOutputAttributes::INVALID) { + LOG_ERROR(Render_OpenGL, "Invalid/unknown semantic id: {}", semantic); + } + } + } +} + +PicaVSConfig::PicaVSConfig(const Pica::RasterizerRegs& rasterizer, const Pica::ShaderRegs& regs, + Pica::Shader::ShaderSetup& setup, const Instance& instance) { + state.Init(rasterizer, regs, setup); + use_clip_planes = instance.IsShaderClipDistanceSupported(); +} + +void PicaGSConfigCommonRaw::Init(const Pica::Regs& regs) { + vs_output_attributes = Common::BitSet(regs.vs.output_mask).Count(); + gs_output_attributes = vs_output_attributes; + + semantic_maps.fill({16, 0}); + for (u32 attrib = 0; attrib < regs.rasterizer.vs_output_total; ++attrib) { + const std::array semantics{ + regs.rasterizer.vs_output_attributes[attrib].map_x.Value(), + regs.rasterizer.vs_output_attributes[attrib].map_y.Value(), + regs.rasterizer.vs_output_attributes[attrib].map_z.Value(), + regs.rasterizer.vs_output_attributes[attrib].map_w.Value(), + }; + for (u32 comp = 0; comp < 4; ++comp) { + const auto semantic = semantics[comp]; + if (static_cast(semantic) < 24) { + semantic_maps[static_cast(semantic)] = {attrib, comp}; + } else if (semantic != VSOutputAttributes::INVALID) { + LOG_ERROR(Render_OpenGL, "Invalid/unknown semantic id: {}", semantic); + } + } + } +} + +PicaFixedGSConfig::PicaFixedGSConfig(const Pica::Regs& regs, const Instance& instance) { + state.Init(regs); + use_clip_planes = instance.IsShaderClipDistanceSupported(); +} + +/// Detects if a TEV stage is configured to be skipped (to avoid generating unnecessary code) +static bool IsPassThroughTevStage(const TevStageConfig& stage) { + return (stage.color_op == TevStageConfig::Operation::Replace && + stage.alpha_op == TevStageConfig::Operation::Replace && + stage.color_source1 == TevStageConfig::Source::Previous && + stage.alpha_source1 == 
TevStageConfig::Source::Previous && + stage.color_modifier1 == TevStageConfig::ColorModifier::SourceColor && + stage.alpha_modifier1 == TevStageConfig::AlphaModifier::SourceAlpha && + stage.GetColorMultiplier() == 1 && stage.GetAlphaMultiplier() == 1); +} + +static std::string SampleTexture(const PicaFSConfig& config, unsigned texture_unit) { + const auto& state = config.state; + switch (texture_unit) { + case 0: + // Only unit 0 respects the texturing type + switch (state.texture0_type) { + case TexturingRegs::TextureConfig::Texture2D: + return "textureLod(tex0, texcoord0, getLod(texcoord0 * " + "vec2(textureSize(tex0, 0))) + tex_lod_bias[0])"; + case TexturingRegs::TextureConfig::Projection2D: + // TODO (wwylele): find the exact LOD formula for projection texture + return "textureProj(tex0, vec3(texcoord0, texcoord0_w))"; + case TexturingRegs::TextureConfig::TextureCube: + return "texture(tex_cube, vec3(texcoord0, texcoord0_w))"; + case TexturingRegs::TextureConfig::Shadow2D: + return "shadowTexture(texcoord0, texcoord0_w)"; + case TexturingRegs::TextureConfig::ShadowCube: + return "shadowTextureCube(texcoord0, texcoord0_w)"; + case TexturingRegs::TextureConfig::Disabled: + return "vec4(0.0)"; + default: + LOG_CRITICAL(HW_GPU, "Unhandled texture type {:x}", state.texture0_type); + UNIMPLEMENTED(); + return "texture(tex0, texcoord0)"; + } + case 1: + return "textureLod(tex1, texcoord1, getLod(texcoord1 * " + "vec2(textureSize(tex1, 0))) + tex_lod_bias[1])"; + case 2: + if (state.texture2_use_coord1) + return "textureLod(tex2, texcoord1, getLod(texcoord1 * " + "vec2(textureSize(tex2, 0))) + tex_lod_bias[2])"; + else + return "textureLod(tex2, texcoord2, getLod(texcoord2 * " + "vec2(textureSize(tex2, 0))) + tex_lod_bias[2])"; + case 3: + if (state.proctex.enable) { + return "ProcTex()"; + } else { + LOG_DEBUG(Render_OpenGL, "Using Texture3 without enabling it"); + return "vec4(0.0)"; + } + default: + UNREACHABLE(); + return ""; + } +} + +/// Writes the specified TEV 
stage source component(s) +static void AppendSource(std::string& out, const PicaFSConfig& config, + TevStageConfig::Source source, std::string_view index_name) { + using Source = TevStageConfig::Source; + switch (source) { + case Source::PrimaryColor: + out += "rounded_primary_color"; + break; + case Source::PrimaryFragmentColor: + out += "primary_fragment_color"; + break; + case Source::SecondaryFragmentColor: + out += "secondary_fragment_color"; + break; + case Source::Texture0: + out += SampleTexture(config, 0); + break; + case Source::Texture1: + out += SampleTexture(config, 1); + break; + case Source::Texture2: + out += SampleTexture(config, 2); + break; + case Source::Texture3: + out += SampleTexture(config, 3); + break; + case Source::PreviousBuffer: + out += "combiner_buffer"; + break; + case Source::Constant: + out += fmt::format("const_color[{}]", index_name); + break; + case Source::Previous: + out += "last_tex_env_out"; + break; + default: + out += "vec4(0.0)"; + LOG_CRITICAL(Render_OpenGL, "Unknown source op {}", source); + break; + } +} + +/// Writes the color components to use for the specified TEV stage color modifier +static void AppendColorModifier(std::string& out, const PicaFSConfig& config, + TevStageConfig::ColorModifier modifier, + TevStageConfig::Source source, std::string_view index_name) { + using ColorModifier = TevStageConfig::ColorModifier; + switch (modifier) { + case ColorModifier::SourceColor: + AppendSource(out, config, source, index_name); + out += ".rgb"; + break; + case ColorModifier::OneMinusSourceColor: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".rgb"; + break; + case ColorModifier::SourceAlpha: + AppendSource(out, config, source, index_name); + out += ".aaa"; + break; + case ColorModifier::OneMinusSourceAlpha: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".aaa"; + break; + case ColorModifier::SourceRed: + AppendSource(out, config, source, 
index_name); + out += ".rrr"; + break; + case ColorModifier::OneMinusSourceRed: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".rrr"; + break; + case ColorModifier::SourceGreen: + AppendSource(out, config, source, index_name); + out += ".ggg"; + break; + case ColorModifier::OneMinusSourceGreen: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".ggg"; + break; + case ColorModifier::SourceBlue: + AppendSource(out, config, source, index_name); + out += ".bbb"; + break; + case ColorModifier::OneMinusSourceBlue: + out += "vec3(1.0) - "; + AppendSource(out, config, source, index_name); + out += ".bbb"; + break; + default: + out += "vec3(0.0)"; + LOG_CRITICAL(Render_OpenGL, "Unknown color modifier op {}", modifier); + break; + } +} + +/// Writes the alpha component to use for the specified TEV stage alpha modifier +static void AppendAlphaModifier(std::string& out, const PicaFSConfig& config, + TevStageConfig::AlphaModifier modifier, + TevStageConfig::Source source, const std::string& index_name) { + using AlphaModifier = TevStageConfig::AlphaModifier; + switch (modifier) { + case AlphaModifier::SourceAlpha: + AppendSource(out, config, source, index_name); + out += ".a"; + break; + case AlphaModifier::OneMinusSourceAlpha: + out += "1.0 - "; + AppendSource(out, config, source, index_name); + out += ".a"; + break; + case AlphaModifier::SourceRed: + AppendSource(out, config, source, index_name); + out += ".r"; + break; + case AlphaModifier::OneMinusSourceRed: + out += "1.0 - "; + AppendSource(out, config, source, index_name); + out += ".r"; + break; + case AlphaModifier::SourceGreen: + AppendSource(out, config, source, index_name); + out += ".g"; + break; + case AlphaModifier::OneMinusSourceGreen: + out += "1.0 - "; + AppendSource(out, config, source, index_name); + out += ".g"; + break; + case AlphaModifier::SourceBlue: + AppendSource(out, config, source, index_name); + out += ".b"; + break; + case 
AlphaModifier::OneMinusSourceBlue: + out += "1.0 - "; + AppendSource(out, config, source, index_name); + out += ".b"; + break; + default: + out += "0.0"; + LOG_CRITICAL(Render_OpenGL, "Unknown alpha modifier op {}", modifier); + break; + } +} + +/// Writes the combiner function for the color components for the specified TEV stage operation +static void AppendColorCombiner(std::string& out, TevStageConfig::Operation operation, + std::string_view variable_name) { + const auto get_combiner = [operation] { + using Operation = TevStageConfig::Operation; + switch (operation) { + case Operation::Replace: + return "color_results_1"; + case Operation::Modulate: + return "color_results_1 * color_results_2"; + case Operation::Add: + return "color_results_1 + color_results_2"; + case Operation::AddSigned: + return "color_results_1 + color_results_2 - vec3(0.5)"; + case Operation::Lerp: + return "color_results_1 * color_results_3 + color_results_2 * (vec3(1.0) - " + "color_results_3)"; + case Operation::Subtract: + return "color_results_1 - color_results_2"; + case Operation::MultiplyThenAdd: + return "color_results_1 * color_results_2 + color_results_3"; + case Operation::AddThenMultiply: + return "min(color_results_1 + color_results_2, vec3(1.0)) * color_results_3"; + case Operation::Dot3_RGB: + case Operation::Dot3_RGBA: + return "vec3(dot(color_results_1 - vec3(0.5), color_results_2 - vec3(0.5)) * 4.0)"; + default: + LOG_CRITICAL(Render_OpenGL, "Unknown color combiner operation: {}", operation); + return "vec3(0.0)"; + } + }; + + // Clamp result to 0.0, 1.0 + out += fmt::format("clamp({}, vec3(0.0), vec3(1.0))", get_combiner()); +} + +/// Writes the combiner function for the alpha component for the specified TEV stage operation +static void AppendAlphaCombiner(std::string& out, TevStageConfig::Operation operation, + std::string_view variable_name) { + out += "clamp("; + using Operation = TevStageConfig::Operation; + switch (operation) { + case Operation::Replace: + out += 
"alpha_results_1"; + break; + case Operation::Modulate: + out += "alpha_results_1 * alpha_results_2"; + break; + case Operation::Add: + out += "alpha_results_1 + alpha_results_2"; + break; + case Operation::AddSigned: + out += "alpha_results_1 + alpha_results_2 - 0.5"; + break; + case Operation::Lerp: + out += "alpha_results_1 * alpha_results_3 + alpha_results_2 * (1.0 - alpha_results_3)"; + break; + case Operation::Subtract: + out += "alpha_results_1 - alpha_results_2"; + break; + case Operation::MultiplyThenAdd: + out += "alpha_results_1 * alpha_results_2 + alpha_results_3"; + break; + case Operation::AddThenMultiply: + out += "min(alpha_results_1 + alpha_results_2, 1.0) * alpha_results_3"; + break; + default: + out += "0.0"; + LOG_CRITICAL(Render_OpenGL, "Unknown alpha combiner operation: {}", operation); + break; + } + out += ", 0.0, 1.0)"; +} + +/// Writes the if-statement condition used to evaluate alpha testing +static void AppendAlphaTestCondition(std::string& out, FramebufferRegs::CompareFunc func) { + using CompareFunc = FramebufferRegs::CompareFunc; + switch (func) { + case CompareFunc::Never: + out += "true"; + break; + case CompareFunc::Always: + out += "false"; + break; + case CompareFunc::Equal: + case CompareFunc::NotEqual: + case CompareFunc::LessThan: + case CompareFunc::LessThanOrEqual: + case CompareFunc::GreaterThan: + case CompareFunc::GreaterThanOrEqual: { + static constexpr std::array op{"!=", "==", ">=", ">", "<=", "<"}; + const auto index = static_cast(func) - static_cast(CompareFunc::Equal); + out += fmt::format("int(last_tex_env_out.a * 255.0) {} alphatest_ref", op[index]); + break; + } + + default: + out += "false"; + LOG_CRITICAL(Render_OpenGL, "Unknown alpha test condition {}", func); + break; + } +} + +/// Writes the code to emulate the specified TEV stage +static void WriteTevStage(std::string& out, const PicaFSConfig& config, unsigned index) { + const auto stage = + static_cast(config.state.tev_stages[index]); + if 
(!IsPassThroughTevStage(stage)) { + const std::string index_name = std::to_string(index); + + out += fmt::format("color_results_1 = ", index_name); + AppendColorModifier(out, config, stage.color_modifier1, stage.color_source1, index_name); + out += fmt::format(";\ncolor_results_2 = ", index_name); + AppendColorModifier(out, config, stage.color_modifier2, stage.color_source2, index_name); + out += fmt::format(";\ncolor_results_3 = ", index_name); + AppendColorModifier(out, config, stage.color_modifier3, stage.color_source3, index_name); + + // Round the output of each TEV stage to maintain the PICA's 8 bits of precision + out += fmt::format(";\nvec3 color_output_{} = byteround(", index_name); + AppendColorCombiner(out, stage.color_op, "color_results"); + out += ");\n"; + + if (stage.color_op == TevStageConfig::Operation::Dot3_RGBA) { + // result of Dot3_RGBA operation is also placed to the alpha component + out += fmt::format("float alpha_output_{0} = color_output_{0}[0];\n", index_name); + } else { + out += fmt::format("alpha_results_1 = ", index_name); + AppendAlphaModifier(out, config, stage.alpha_modifier1, stage.alpha_source1, + index_name); + out += fmt::format(";\nalpha_results_2 = ", index_name); + AppendAlphaModifier(out, config, stage.alpha_modifier2, stage.alpha_source2, + index_name); + out += fmt::format(";\nalpha_results_3 = ", index_name); + AppendAlphaModifier(out, config, stage.alpha_modifier3, stage.alpha_source3, + index_name); + + out += fmt::format(";\nfloat alpha_output_{} = byteround(", index_name); + AppendAlphaCombiner(out, stage.alpha_op, "alpha_results"); + out += ");\n"; + } + + out += fmt::format("last_tex_env_out = vec4(" + "clamp(color_output_{} * {}.0, vec3(0.0), vec3(1.0)), " + "clamp(alpha_output_{} * {}.0, 0.0, 1.0));\n", + index_name, stage.GetColorMultiplier(), index_name, + stage.GetAlphaMultiplier()); + } + + out += "combiner_buffer = next_combiner_buffer;\n"; + + if (config.TevStageUpdatesCombinerBufferColor(index)) + out += 
"next_combiner_buffer.rgb = last_tex_env_out.rgb;\n"; + + if (config.TevStageUpdatesCombinerBufferAlpha(index)) + out += "next_combiner_buffer.a = last_tex_env_out.a;\n"; +} + +/// Writes the code to emulate fragment lighting +static void WriteLighting(std::string& out, const PicaFSConfig& config) { + const auto& lighting = config.state.lighting; + + // Define lighting globals + out += "vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);\n" + "vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);\n" + "vec3 light_vector = vec3(0.0);\n" + "float light_distance = 0.0;\n" + "vec3 refl_value = vec3(0.0);\n" + "vec3 spot_dir = vec3(0.0);\n" + "vec3 half_vector = vec3(0.0);\n" + "float dot_product = 0.0;\n" + "float clamp_highlights = 1.0;\n" + "float geo_factor = 1.0;\n"; + + // Compute fragment normals and tangents + const auto perturbation = [&] { + return fmt::format("2.0 * ({}).rgb - 1.0", SampleTexture(config, lighting.bump_selector)); + }; + + switch (lighting.bump_mode) { + case LightingRegs::LightingBumpMode::NormalMap: { + // Bump mapping is enabled using a normal map + out += fmt::format("vec3 surface_normal = {};\n", perturbation()); + + // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher + // precision result + if (lighting.bump_renorm) { + constexpr std::string_view val = + "(1.0 - (surface_normal.x*surface_normal.x + surface_normal.y*surface_normal.y))"; + out += fmt::format("surface_normal.z = sqrt(max({}, 0.0));\n", val); + } + + // The tangent vector is not perturbed by the normal map and is just a unit vector. + out += "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n"; + break; + } + case LightingRegs::LightingBumpMode::TangentMap: { + // Bump mapping is enabled using a tangent map + out += fmt::format("vec3 surface_tangent = {};\n", perturbation()); + // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant + // computation below, which is also confirmed on 3DS. 
So we don't bother recomputing here + // even if 'renorm' is enabled. + + // The normal vector is not perturbed by the tangent map and is just a unit vector. + out += "vec3 surface_normal = vec3(0.0, 0.0, 1.0);\n"; + break; + } + default: + // No bump mapping - surface local normal and tangent are just unit vectors + out += "vec3 surface_normal = vec3(0.0, 0.0, 1.0);\n" + "vec3 surface_tangent = vec3(1.0, 0.0, 0.0);\n"; + } + + // Rotate the surface-local normal by the interpolated normal quaternion to convert it to + // eyespace. + out += "vec4 normalized_normquat = normalize(normquat);\n" + "vec3 normal = quaternion_rotate(normalized_normquat, surface_normal);\n" + "vec3 tangent = quaternion_rotate(normalized_normquat, surface_tangent);\n"; + + if (lighting.enable_shadow) { + std::string shadow_texture = SampleTexture(config, lighting.shadow_selector); + if (lighting.shadow_invert) { + out += fmt::format("vec4 shadow = vec4(1.0) - {};\n", shadow_texture); + } else { + out += fmt::format("vec4 shadow = {};\n", shadow_texture); + } + } else { + out += "vec4 shadow = vec4(1.0);\n"; + } + + // Samples the specified lookup table for specular lighting + auto get_lut_value = [&lighting](LightingRegs::LightingSampler sampler, unsigned light_num, + LightingRegs::LightingLutInput input, bool abs) { + std::string index; + switch (input) { + case LightingRegs::LightingLutInput::NH: + index = "dot(normal, normalize(half_vector))"; + break; + + case LightingRegs::LightingLutInput::VH: + index = "dot(normalize(view), normalize(half_vector))"; + break; + + case LightingRegs::LightingLutInput::NV: + index = "dot(normal, normalize(view))"; + break; + + case LightingRegs::LightingLutInput::LN: + index = "dot(light_vector, normal)"; + break; + + case LightingRegs::LightingLutInput::SP: + index = "dot(light_vector, spot_dir)"; + break; + + case LightingRegs::LightingLutInput::CP: + // CP input is only available with configuration 7 + if (lighting.config == 
LightingRegs::LightingConfig::Config7) { + // Note: even if the normal vector is modified by normal map, which is not the + // normal of the tangent plane anymore, the half angle vector is still projected + // using the modified normal vector. + constexpr std::string_view half_angle_proj = + "normalize(half_vector) - normal * dot(normal, normalize(half_vector))"; + // Note: the half angle vector projection is confirmed not normalized before the dot + // product. The result is in fact not cos(phi) as the name suggested. + index = fmt::format("dot({}, tangent)", half_angle_proj); + } else { + index = "0.0"; + } + break; + + default: + LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input {}", static_cast(input)); + UNIMPLEMENTED(); + index = "0.0"; + break; + } + + const auto sampler_index = static_cast(sampler); + + if (abs) { + // LUT index is in the range of (0.0, 1.0) + index = lighting.light[light_num].two_sided_diffuse + ? fmt::format("abs({})", index) + : fmt::format("max({}, 0.0)", index); + return fmt::format("LookupLightingLUTUnsigned({}, {})", sampler_index, index); + } else { + // LUT index is in the range of (-1.0, 1.0) + return fmt::format("LookupLightingLUTSigned({}, {})", sampler_index, index); + } + }; + + // Write the code to emulate each enabled light + for (unsigned light_index = 0; light_index < lighting.src_num; ++light_index) { + const auto& light_config = lighting.light[light_index]; + const std::string light_src = fmt::format("light_src[{}]", light_config.num); + + // Compute light vector (directional or positional) + if (light_config.directional) { + out += fmt::format("light_vector = {}.position;\n", light_src); + } else { + out += fmt::format("light_vector = {}.position + view;\n", light_src); + } + out += fmt::format("light_distance = length(light_vector);\n", light_src); + out += fmt::format("light_vector = normalize(light_vector);\n", light_src); + + out += fmt::format("spot_dir = {}.spot_direction;\n", light_src); + out += "half_vector = 
normalize(view) + light_vector;\n"; + + // Compute dot product of light_vector and normal, adjust if lighting is one-sided or + // two-sided + out += std::string("dot_product = ") + (light_config.two_sided_diffuse + ? "abs(dot(light_vector, normal));\n" + : "max(dot(light_vector, normal), 0.0);\n"); + + // If enabled, clamp specular component if lighting result is zero + if (lighting.clamp_highlights) { + out += "clamp_highlights = sign(dot_product);\n"; + } + + // If enabled, compute spot light attenuation value + std::string spot_atten = "1.0"; + if (light_config.spot_atten_enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::SpotlightAttenuation)) { + const std::string value = + get_lut_value(LightingRegs::SpotlightAttenuationSampler(light_config.num), + light_config.num, lighting.lut_sp.type, lighting.lut_sp.abs_input); + spot_atten = fmt::format("({:#} * {})", lighting.lut_sp.scale, value); + } + + // If enabled, compute distance attenuation value + std::string dist_atten = "1.0"; + if (light_config.dist_atten_enable) { + const std::string index = fmt::format("clamp({}.dist_atten_scale * light_distance " + "+ {}.dist_atten_bias, 0.0, 1.0)", + light_src, light_src, light_src); + const auto sampler = LightingRegs::DistanceAttenuationSampler(light_config.num); + dist_atten = fmt::format("LookupLightingLUTUnsigned({}, {})", sampler, index); + } + + if (light_config.geometric_factor_0 || light_config.geometric_factor_1) { + out += "geo_factor = dot(half_vector, half_vector);\n" + "geo_factor = geo_factor == 0.0 ? 
0.0 : min(" + "dot_product / geo_factor, 1.0);\n"; + } + + // Specular 0 component + std::string d0_lut_value = "1.0"; + if (lighting.lut_d0.enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::Distribution0)) { + // Lookup specular "distribution 0" LUT value + const std::string value = + get_lut_value(LightingRegs::LightingSampler::Distribution0, light_config.num, + lighting.lut_d0.type, lighting.lut_d0.abs_input); + d0_lut_value = fmt::format("({:#} * {})", lighting.lut_d0.scale, value); + } + std::string specular_0 = fmt::format("({} * {}.specular_0)", d0_lut_value, light_src); + if (light_config.geometric_factor_0) { + specular_0 = fmt::format("({} * geo_factor)", specular_0); + } + + // If enabled, lookup ReflectRed value, otherwise, 1.0 is used + if (lighting.lut_rr.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectRed)) { + std::string value = + get_lut_value(LightingRegs::LightingSampler::ReflectRed, light_config.num, + lighting.lut_rr.type, lighting.lut_rr.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_rr.scale, value); + out += fmt::format("refl_value.r = {};\n", value); + } else { + out += "refl_value.r = 1.0;\n"; + } + + // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used + if (lighting.lut_rg.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectGreen)) { + std::string value = + get_lut_value(LightingRegs::LightingSampler::ReflectGreen, light_config.num, + lighting.lut_rg.type, lighting.lut_rg.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_rg.scale, value); + out += fmt::format("refl_value.g = {};\n", value); + } else { + out += "refl_value.g = refl_value.r;\n"; + } + + // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used + if (lighting.lut_rb.enable && + 
LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectBlue)) { + std::string value = + get_lut_value(LightingRegs::LightingSampler::ReflectBlue, light_config.num, + lighting.lut_rb.type, lighting.lut_rb.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_rb.scale, value); + out += fmt::format("refl_value.b = {};\n", value); + } else { + out += "refl_value.b = refl_value.r;\n"; + } + + // Specular 1 component + std::string d1_lut_value = "1.0"; + if (lighting.lut_d1.enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::Distribution1)) { + // Lookup specular "distribution 1" LUT value + const std::string value = + get_lut_value(LightingRegs::LightingSampler::Distribution1, light_config.num, + lighting.lut_d1.type, lighting.lut_d1.abs_input); + d1_lut_value = fmt::format("({:#} * {})", lighting.lut_d1.scale, value); + } + std::string specular_1 = + fmt::format("({} * refl_value * {}.specular_1)", d1_lut_value, light_src); + if (light_config.geometric_factor_1) { + specular_1 = fmt::format("({} * geo_factor)", specular_1); + } + + // Fresnel + // Note: only the last entry in the light slots applies the Fresnel factor + if (light_index == lighting.src_num - 1 && lighting.lut_fr.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::Fresnel)) { + // Lookup fresnel LUT value + std::string value = + get_lut_value(LightingRegs::LightingSampler::Fresnel, light_config.num, + lighting.lut_fr.type, lighting.lut_fr.abs_input); + value = fmt::format("({:#} * {})", lighting.lut_fr.scale, value); + + // Enabled for diffuse lighting alpha component + if (lighting.enable_primary_alpha) { + out += fmt::format("diffuse_sum.a = {};\n", value); + } + + // Enabled for the specular lighting alpha component + if (lighting.enable_secondary_alpha) { + out += fmt::format("specular_sum.a = {};\n", value); + } + } + + bool shadow_primary_enable = 
lighting.shadow_primary && light_config.shadow_enable; + bool shadow_secondary_enable = lighting.shadow_secondary && light_config.shadow_enable; + std::string shadow_primary = shadow_primary_enable ? " * shadow.rgb" : ""; + std::string shadow_secondary = shadow_secondary_enable ? " * shadow.rgb" : ""; + + // Compute primary fragment color (diffuse lighting) function + out += fmt::format( + "diffuse_sum.rgb += (({}.diffuse * dot_product) + {}.ambient) * {} * {}{};\n", + light_src, light_src, dist_atten, spot_atten, shadow_primary); + + // Compute secondary fragment color (specular lighting) function + out += fmt::format("specular_sum.rgb += ({} + {}) * clamp_highlights * {} * {}{};\n", + specular_0, specular_1, dist_atten, spot_atten, shadow_secondary); + } + + // Apply shadow attenuation to alpha components if enabled + if (lighting.shadow_alpha) { + if (lighting.enable_primary_alpha) { + out += "diffuse_sum.a *= shadow.a;\n"; + } + if (lighting.enable_secondary_alpha) { + out += "specular_sum.a *= shadow.a;\n"; + } + } + + // Sum final lighting result + out += "diffuse_sum.rgb += lighting_global_ambient;\n" + "primary_fragment_color = clamp(diffuse_sum, vec4(0.0), vec4(1.0));\n" + "secondary_fragment_color = clamp(specular_sum, vec4(0.0), vec4(1.0));\n"; +} + +using ProcTexClamp = TexturingRegs::ProcTexClamp; +using ProcTexShift = TexturingRegs::ProcTexShift; +using ProcTexCombiner = TexturingRegs::ProcTexCombiner; +using ProcTexFilter = TexturingRegs::ProcTexFilter; + +static void AppendProcTexShiftOffset(std::string& out, std::string_view v, ProcTexShift mode, + ProcTexClamp clamp_mode) { + const std::string_view offset = (clamp_mode == ProcTexClamp::MirroredRepeat) ? 
"1.0" : "0.5"; + switch (mode) { + case ProcTexShift::None: + out += "0.0"; + break; + case ProcTexShift::Odd: + out += fmt::format("{} * float((int({}) / 2) % 2)", offset, v); + break; + case ProcTexShift::Even: + out += fmt::format("{} * float(((int({}) + 1) / 2) % 2)", offset, v); + break; + default: + LOG_CRITICAL(HW_GPU, "Unknown shift mode {}", mode); + out += "0.0"; + break; + } +} + +static void AppendProcTexClamp(std::string& out, std::string_view var, ProcTexClamp mode) { + switch (mode) { + case ProcTexClamp::ToZero: + out += fmt::format("{0} = {0} > 1.0 ? 0 : {0};\n", var); + break; + case ProcTexClamp::ToEdge: + out += fmt::format("{0} = min({0}, 1.0);\n", var); + break; + case ProcTexClamp::SymmetricalRepeat: + out += fmt::format("{0} = fract({0});\n", var); + break; + case ProcTexClamp::MirroredRepeat: { + out += fmt::format("{0} = int({0}) % 2 == 0 ? fract({0}) : 1.0 - fract({0});\n", var); + break; + } + case ProcTexClamp::Pulse: + out += fmt::format("{0} = {0} > 0.5 ? 1.0 : 0.0;\n", var); + break; + default: + LOG_CRITICAL(HW_GPU, "Unknown clamp mode {}", mode); + out += fmt::format("{0} = min({0}, 1.0);\n", var); + break; + } +} + +static void AppendProcTexCombineAndMap(std::string& out, ProcTexCombiner combiner, + std::string_view offset) { + const auto combined = [combiner]() -> std::string_view { + switch (combiner) { + case ProcTexCombiner::U: + return "u"; + case ProcTexCombiner::U2: + return "(u * u)"; + case TexturingRegs::ProcTexCombiner::V: + return "v"; + case TexturingRegs::ProcTexCombiner::V2: + return "(v * v)"; + case TexturingRegs::ProcTexCombiner::Add: + return "((u + v) * 0.5)"; + case TexturingRegs::ProcTexCombiner::Add2: + return "((u * u + v * v) * 0.5)"; + case TexturingRegs::ProcTexCombiner::SqrtAdd2: + return "min(sqrt(u * u + v * v), 1.0)"; + case TexturingRegs::ProcTexCombiner::Min: + return "min(u, v)"; + case TexturingRegs::ProcTexCombiner::Max: + return "max(u, v)"; + case TexturingRegs::ProcTexCombiner::RMax: + return 
"min(((u + v) * 0.5 + sqrt(u * u + v * v)) * 0.5, 1.0)"; + default: + LOG_CRITICAL(HW_GPU, "Unknown combiner {}", combiner); + return "0.0"; + } + }(); + + out += fmt::format("ProcTexLookupLUT({}, {})", offset, combined); +} + +static void AppendProcTexSampler(std::string& out, const PicaFSConfig& config) { + // LUT sampling uitlity + // For NoiseLUT/ColorMap/AlphaMap, coord=0.0 is lut[0], coord=127.0/128.0 is lut[127] and + // coord=1.0 is lut[127]+lut_diff[127]. For other indices, the result is interpolated using + // value entries and difference entries. + out += R"( +float ProcTexLookupLUT(int offset, float coord) { + coord *= 128.0; + float index_i = clamp(floor(coord), 0.0, 127.0); + float index_f = coord - index_i; // fract() cannot be used here because 128.0 needs to be + // extracted as index_i = 127.0 and index_f = 1.0 + vec2 entry = texelFetch(texture_buffer_lut_rg, int(index_i) + offset).rg; + return clamp(entry.r + entry.g * index_f, 0.0, 1.0); +} + )"; + + // Noise utility + if (config.state.proctex.noise_enable) { + // See swrasterizer/proctex.cpp for more information about these functions + out += R"( +int ProcTexNoiseRand1D(int v) { + const int table[] = int[](0,4,10,8,4,9,7,12,5,15,13,14,11,15,2,11); + return ((v % 9 + 2) * 3 & 0xF) ^ table[(v / 9) & 0xF]; +} + +float ProcTexNoiseRand2D(vec2 point) { + const int table[] = int[](10,2,15,8,0,7,4,5,5,13,2,6,13,9,3,14); + int u2 = ProcTexNoiseRand1D(int(point.x)); + int v2 = ProcTexNoiseRand1D(int(point.y)); + v2 += ((u2 & 3) == 1) ? 
4 : 0; + v2 ^= (u2 & 1) * 6; + v2 += 10 + u2; + v2 &= 0xF; + v2 ^= table[u2]; + return -1.0 + float(v2) * (2.0/15.0); +} + +float ProcTexNoiseCoef(vec2 x) { + vec2 grid = 9.0 * proctex_noise_f * abs(x + proctex_noise_p); + vec2 point = floor(grid); + vec2 frac = grid - point; + + float g0 = ProcTexNoiseRand2D(point) * (frac.x + frac.y); + float g1 = ProcTexNoiseRand2D(point + vec2(1.0, 0.0)) * (frac.x + frac.y - 1.0); + float g2 = ProcTexNoiseRand2D(point + vec2(0.0, 1.0)) * (frac.x + frac.y - 1.0); + float g3 = ProcTexNoiseRand2D(point + vec2(1.0, 1.0)) * (frac.x + frac.y - 2.0); + + float x_noise = ProcTexLookupLUT(proctex_noise_lut_offset, frac.x); + float y_noise = ProcTexLookupLUT(proctex_noise_lut_offset, frac.y); + float x0 = mix(g0, g1, x_noise); + float x1 = mix(g2, g3, x_noise); + return mix(x0, x1, y_noise); +} + )"; + } + + out += "vec4 SampleProcTexColor(float lut_coord, int level) {\n"; + out += fmt::format("int lut_width = {} >> level;\n", config.state.proctex.lut_width); + // Offsets for level 4-7 seem to be hardcoded + out += fmt::format("int lut_offsets[8] = int[]({}, {}, {}, {}, 0xF0, 0xF8, 0xFC, 0xFE);\n", + config.state.proctex.lut_offset0, config.state.proctex.lut_offset1, + config.state.proctex.lut_offset2, config.state.proctex.lut_offset3); + out += "int lut_offset = lut_offsets[level];\n"; + // For the color lut, coord=0.0 is lut[offset] and coord=1.0 is lut[offset+width-1] + out += "lut_coord *= float(lut_width - 1);\n"; + + switch (config.state.proctex.lut_filter) { + case ProcTexFilter::Linear: + case ProcTexFilter::LinearMipmapLinear: + case ProcTexFilter::LinearMipmapNearest: + out += "int lut_index_i = int(lut_coord) + lut_offset;\n"; + out += "float lut_index_f = fract(lut_coord);\n"; + out += "return texelFetch(texture_buffer_lut_rgba, lut_index_i + " + "proctex_lut_offset) + " + "lut_index_f * " + "texelFetch(texture_buffer_lut_rgba, lut_index_i + proctex_diff_lut_offset);\n"; + break; + case ProcTexFilter::Nearest: + case 
ProcTexFilter::NearestMipmapLinear: + case ProcTexFilter::NearestMipmapNearest: + out += "lut_coord += float(lut_offset);\n"; + out += "return texelFetch(texture_buffer_lut_rgba, int(round(lut_coord)) + " + "proctex_lut_offset);\n"; + break; + } + + out += "}\n"; + + out += "vec4 ProcTex() {\n"; + if (config.state.proctex.coord < 3) { + out += fmt::format("vec2 uv = abs(texcoord{});\n", config.state.proctex.coord); + } else { + LOG_CRITICAL(Render_OpenGL, "Unexpected proctex.coord >= 3"); + out += "vec2 uv = abs(texcoord0);\n"; + } + + // This LOD formula is the same as the LOD upper limit defined in OpenGL. + // f(x, y) <= m_u + m_v + m_w + // (See OpenGL 4.6 spec, 8.14.1 - Scale Factor and Level-of-Detail) + // Note: this is different from the one normal 2D textures use. + out += "vec2 duv = max(abs(dFdx(uv)), abs(dFdy(uv)));\n"; + // unlike normal texture, the bias is inside the log2 + out += fmt::format("float lod = log2(abs(float({}) * proctex_bias) * (duv.x + duv.y));\n", + config.state.proctex.lut_width); + out += "if (proctex_bias == 0.0) lod = 0.0;\n"; + out += fmt::format("lod = clamp(lod, {:#}, {:#});\n", + std::max(0.0f, static_cast(config.state.proctex.lod_min)), + std::min(7.0f, static_cast(config.state.proctex.lod_max))); + // Get shift offset before noise generation + out += "float u_shift = "; + AppendProcTexShiftOffset(out, "uv.y", config.state.proctex.u_shift, + config.state.proctex.u_clamp); + out += ";\n"; + out += "float v_shift = "; + AppendProcTexShiftOffset(out, "uv.x", config.state.proctex.v_shift, + config.state.proctex.v_clamp); + out += ";\n"; + + // Generate noise + if (config.state.proctex.noise_enable) { + out += "uv += proctex_noise_a * ProcTexNoiseCoef(uv);\n" + "uv = abs(uv);\n"; + } + + // Shift + out += "float u = uv.x + u_shift;\n" + "float v = uv.y + v_shift;\n"; + + // Clamp + AppendProcTexClamp(out, "u", config.state.proctex.u_clamp); + AppendProcTexClamp(out, "v", config.state.proctex.v_clamp); + + // Combine and map + out 
+= "float lut_coord = "; + AppendProcTexCombineAndMap(out, config.state.proctex.color_combiner, + "proctex_color_map_offset"); + out += ";\n"; + + switch (config.state.proctex.lut_filter) { + case ProcTexFilter::Linear: + case ProcTexFilter::Nearest: + out += "vec4 final_color = SampleProcTexColor(lut_coord, 0);\n"; + break; + case ProcTexFilter::NearestMipmapNearest: + case ProcTexFilter::LinearMipmapNearest: + out += "vec4 final_color = SampleProcTexColor(lut_coord, int(round(lod)));\n"; + break; + case ProcTexFilter::NearestMipmapLinear: + case ProcTexFilter::LinearMipmapLinear: + out += "int lod_i = int(lod);\n" + "float lod_f = fract(lod);\n" + "vec4 final_color = mix(SampleProcTexColor(lut_coord, lod_i), " + "SampleProcTexColor(lut_coord, lod_i + 1), lod_f);\n"; + break; + } + + if (config.state.proctex.separate_alpha) { + // Note: in separate alpha mode, the alpha channel skips the color LUT look up stage. It + // uses the output of CombineAndMap directly instead. + out += "float final_alpha = "; + AppendProcTexCombineAndMap(out, config.state.proctex.alpha_combiner, + "proctex_alpha_map_offset"); + out += ";\n"; + out += "return vec4(final_color.xyz, final_alpha);\n}\n"; + } else { + out += "return final_color;\n}\n"; + } +} + +std::string GenerateFragmentShader(const PicaFSConfig& config) { + const auto& state = config.state; + std::string out = "#version 450 core\n" + "#extension GL_ARB_separate_shader_objects : enable\n\n"; + out += GetVertexInterfaceDeclaration(false); + + out += R"( +in vec4 gl_FragCoord; + +layout (location = 0) out vec4 color; + +layout(set = 0, binding = 2) uniform samplerBuffer texture_buffer_lut_lf; +layout(set = 0, binding = 3) uniform samplerBuffer texture_buffer_lut_rg; +layout(set = 0, binding = 4) uniform samplerBuffer texture_buffer_lut_rgba; + +layout(set = 1, binding = 0) uniform sampler2D tex0; +layout(set = 1, binding = 1) uniform sampler2D tex1; +layout(set = 1, binding = 2) uniform sampler2D tex2; +layout(set = 1, 
binding = 3) uniform samplerCube tex_cube; + +layout(set = 2, binding = 0, r32ui) uniform readonly uimage2D shadow_texture_px; +layout(set = 2, binding = 1, r32ui) uniform readonly uimage2D shadow_texture_nx; +layout(set = 2, binding = 2, r32ui) uniform readonly uimage2D shadow_texture_py; +layout(set = 2, binding = 3, r32ui) uniform readonly uimage2D shadow_texture_ny; +layout(set = 2, binding = 4, r32ui) uniform readonly uimage2D shadow_texture_pz; +layout(set = 2, binding = 5, r32ui) uniform readonly uimage2D shadow_texture_nz; +layout(set = 2, binding = 6, r32ui) uniform uimage2D shadow_buffer; +)"; + + out += UniformBlockDef; + + out += R"( +// Rotate the vector v by the quaternion q +vec3 quaternion_rotate(vec4 q, vec3 v) { + return v + 2.0 * cross(q.xyz, cross(q.xyz, v) + q.w * v); +} + +float LookupLightingLUT(int lut_index, int index, float delta) { + vec2 entry = texelFetch(texture_buffer_lut_lf, lighting_lut_offset[lut_index >> 2][lut_index & 3] + index).rg; + return entry.r + entry.g * delta; +} + +float LookupLightingLUTUnsigned(int lut_index, float pos) { + int index = clamp(int(pos * 256.0), 0, 255); + float delta = pos * 256.0 - float(index); + return LookupLightingLUT(lut_index, index, delta); +} + +float LookupLightingLUTSigned(int lut_index, float pos) { + int index = clamp(int(pos * 128.0), -128, 127); + float delta = pos * 128.0 - float(index); + if (index < 0) index += 256; + return LookupLightingLUT(lut_index, index, delta); +} + +float byteround(float x) { + return round(x * 255.0) * (1.0 / 255.0); +} + +vec2 byteround(vec2 x) { + return round(x * 255.0) * (1.0 / 255.0); +} + +vec3 byteround(vec3 x) { + return round(x * 255.0) * (1.0 / 255.0); +} + +vec4 byteround(vec4 x) { + return round(x * 255.0) * (1.0 / 255.0); +} + +// PICA's LOD formula for 2D textures. +// This LOD formula is the same as the LOD lower limit defined in OpenGL. 
+// f(x, y) >= max{m_u, m_v, m_w} +// (See OpenGL 4.6 spec, 8.14.1 - Scale Factor and Level-of-Detail) +float getLod(vec2 coord) { + vec2 d = max(abs(dFdx(coord)), abs(dFdy(coord))); + return log2(max(d.x, d.y)); +} + +uvec2 DecodeShadow(uint pixel) { + return uvec2(pixel >> 8, pixel & 0xFFu); +} + +uint EncodeShadow(uvec2 pixel) { + return (pixel.x << 8) | pixel.y; +} + +float CompareShadow(uint pixel, uint z) { + uvec2 p = DecodeShadow(pixel); + return mix(float(p.y) * (1.0 / 255.0), 0.0, p.x <= z); +} + +float SampleShadow2D(ivec2 uv, uint z) { + if (any(bvec4( lessThan(uv, ivec2(0)), greaterThanEqual(uv, imageSize(shadow_texture_px)) ))) + return 1.0; + return CompareShadow(imageLoad(shadow_texture_px, uv).x, z); +} + +float mix2(vec4 s, vec2 a) { + vec2 t = mix(s.xy, s.zw, a.yy); + return mix(t.x, t.y, a.x); +} + +vec4 shadowTexture(vec2 uv, float w) { +)"; + if (!config.state.shadow_texture_orthographic) { + out += "uv /= w;"; + } + out += "uint z = uint(max(0, int(min(abs(w), 1.0) * float(0xFFFFFF)) - shadow_texture_bias));"; + out += R"( + vec2 coord = vec2(imageSize(shadow_texture_px)) * uv - vec2(0.5); + vec2 coord_floor = floor(coord); + vec2 f = coord - coord_floor; + ivec2 i = ivec2(coord_floor); + vec4 s = vec4( + SampleShadow2D(i , z), + SampleShadow2D(i + ivec2(1, 0), z), + SampleShadow2D(i + ivec2(0, 1), z), + SampleShadow2D(i + ivec2(1, 1), z)); + return vec4(mix2(s, f)); +} + +vec4 shadowTextureCube(vec2 uv, float w) { + ivec2 size = imageSize(shadow_texture_px); + vec3 c = vec3(uv, w); + vec3 a = abs(c); + if (a.x > a.y && a.x > a.z) { + w = a.x; + uv = -c.zy; + if (c.x < 0.0) uv.x = -uv.x; + } else if (a.y > a.z) { + w = a.y; + uv = c.xz; + if (c.y < 0.0) uv.y = -uv.y; + } else { + w = a.z; + uv = -c.xy; + if (c.z > 0.0) uv.x = -uv.x; + } +)"; + out += "uint z = uint(max(0, int(min(w, 1.0) * float(0xFFFFFF)) - shadow_texture_bias));"; + out += R"( + vec2 coord = vec2(size) * (uv / w * vec2(0.5) + vec2(0.5)) - vec2(0.5); + vec2 coord_floor = 
floor(coord); + vec2 f = coord - coord_floor; + ivec2 i00 = ivec2(coord_floor); + ivec2 i10 = i00 + ivec2(1, 0); + ivec2 i01 = i00 + ivec2(0, 1); + ivec2 i11 = i00 + ivec2(1, 1); + ivec2 cmin = ivec2(0), cmax = size - ivec2(1, 1); + i00 = clamp(i00, cmin, cmax); + i10 = clamp(i10, cmin, cmax); + i01 = clamp(i01, cmin, cmax); + i11 = clamp(i11, cmin, cmax); + uvec4 pixels; + // This part should have been refactored into functions, + // but many drivers don't like passing uimage2D as parameters + if (a.x > a.y && a.x > a.z) { + if (c.x > 0.0) + pixels = uvec4( + imageLoad(shadow_texture_px, i00).r, + imageLoad(shadow_texture_px, i10).r, + imageLoad(shadow_texture_px, i01).r, + imageLoad(shadow_texture_px, i11).r); + else + pixels = uvec4( + imageLoad(shadow_texture_nx, i00).r, + imageLoad(shadow_texture_nx, i10).r, + imageLoad(shadow_texture_nx, i01).r, + imageLoad(shadow_texture_nx, i11).r); + } else if (a.y > a.z) { + if (c.y > 0.0) + pixels = uvec4( + imageLoad(shadow_texture_py, i00).r, + imageLoad(shadow_texture_py, i10).r, + imageLoad(shadow_texture_py, i01).r, + imageLoad(shadow_texture_py, i11).r); + else + pixels = uvec4( + imageLoad(shadow_texture_ny, i00).r, + imageLoad(shadow_texture_ny, i10).r, + imageLoad(shadow_texture_ny, i01).r, + imageLoad(shadow_texture_ny, i11).r); + } else { + if (c.z > 0.0) + pixels = uvec4( + imageLoad(shadow_texture_pz, i00).r, + imageLoad(shadow_texture_pz, i10).r, + imageLoad(shadow_texture_pz, i01).r, + imageLoad(shadow_texture_pz, i11).r); + else + pixels = uvec4( + imageLoad(shadow_texture_nz, i00).r, + imageLoad(shadow_texture_nz, i10).r, + imageLoad(shadow_texture_nz, i01).r, + imageLoad(shadow_texture_nz, i11).r); + } + vec4 s = vec4( + CompareShadow(pixels.x, z), + CompareShadow(pixels.y, z), + CompareShadow(pixels.z, z), + CompareShadow(pixels.w, z)); + return vec4(mix2(s, f)); +} +)"; + + if (config.state.proctex.enable) + AppendProcTexSampler(out, config); + + // We round the interpolated primary color to the 
nearest 1/255th + // This maintains the PICA's 8 bits of precision + out += R"( +void main() { +vec4 rounded_primary_color = byteround(primary_color); +vec4 primary_fragment_color = vec4(0.0); +vec4 secondary_fragment_color = vec4(0.0); +)"; + + // Do not do any sort of processing if it's obvious we're not going to pass the alpha test + if (state.alpha_test_func == FramebufferRegs::CompareFunc::Never) { + out += "discard; }"; + return out; + } + + // Append the scissor test + if (state.scissor_test_mode != RasterizerRegs::ScissorMode::Disabled) { + out += "if ("; + // Negate the condition if we have to keep only the pixels outside the scissor box + if (state.scissor_test_mode == RasterizerRegs::ScissorMode::Include) { + out += '!'; + } + out += "(gl_FragCoord.x >= float(scissor_x1) && " + "gl_FragCoord.y >= float(scissor_y1) && " + "gl_FragCoord.x < float(scissor_x2) && " + "gl_FragCoord.y < float(scissor_y2))) discard;\n"; + } + + // After perspective divide, OpenGL transform z_over_w from [-1, 1] to [near, far]. Here we use + // default near = 0 and far = 1, and undo the transformation to get the original z_over_w, then + // do our own transformation according to PICA specification. 
+ out += "float z_over_w = 2.0 * gl_FragCoord.z - 1.0;\n" + "float depth = z_over_w * depth_scale + depth_offset;\n"; + if (state.depthmap_enable == RasterizerRegs::DepthBuffering::WBuffering) { + out += "depth /= gl_FragCoord.w;\n"; + } + + if (state.lighting.enable) + WriteLighting(out, config); + + out += "vec4 combiner_buffer = vec4(0.0);\n" + "vec4 next_combiner_buffer = tev_combiner_buffer_color;\n" + "vec4 last_tex_env_out = vec4(0.0);\n"; + + out += "vec3 color_results_1 = vec3(0.0);\n" + "vec3 color_results_2 = vec3(0.0);\n" + "vec3 color_results_3 = vec3(0.0);\n"; + + out += "float alpha_results_1 = 0.0;\n" + "float alpha_results_2 = 0.0;\n" + "float alpha_results_3 = 0.0;\n"; + + for (std::size_t index = 0; index < state.tev_stages.size(); ++index) { + WriteTevStage(out, config, static_cast(index)); + } + + if (state.alpha_test_func != FramebufferRegs::CompareFunc::Always) { + out += "if ("; + AppendAlphaTestCondition(out, state.alpha_test_func); + out += ") discard;\n"; + } + + // Append fog combiner + if (state.fog_mode == TexturingRegs::FogMode::Fog) { + // Get index into fog LUT + if (state.fog_flip) { + out += "float fog_index = (1.0 - float(depth)) * 128.0;\n"; + } else { + out += "float fog_index = depth * 128.0;\n"; + } + + // Generate clamped fog factor from LUT for given fog index + out += "float fog_i = clamp(floor(fog_index), 0.0, 127.0);\n" + "float fog_f = fog_index - fog_i;\n" + "vec2 fog_lut_entry = texelFetch(texture_buffer_lut_lf, int(fog_i) + " + "fog_lut_offset).rg;\n" + "float fog_factor = fog_lut_entry.r + fog_lut_entry.g * fog_f;\n" + "fog_factor = clamp(fog_factor, 0.0, 1.0);\n"; + + // Blend the fog + out += "last_tex_env_out.rgb = mix(fog_color.rgb, last_tex_env_out.rgb, fog_factor);\n"; + } else if (state.fog_mode == TexturingRegs::FogMode::Gas) { + Core::System::GetInstance().TelemetrySession().AddField( + Common::Telemetry::FieldType::Session, "VideoCore_Pica_UseGasMode", true); + LOG_CRITICAL(Render_OpenGL, "Unimplemented 
gas mode"); + out += "discard; }"; + return out; + } + + if (state.shadow_rendering) { + out += R"( +uint d = uint(clamp(depth, 0.0, 1.0) * float(0xFFFFFF)); +uint s = uint(last_tex_env_out.g * float(0xFF)); +ivec2 image_coord = ivec2(gl_FragCoord.xy); + +uint old = imageLoad(shadow_buffer, image_coord).x; +uint new1; +uint old2; +do { + old2 = old; + + uvec2 ref = DecodeShadow(old); + if (d < ref.x) { + if (s == 0u) { + ref.x = d; + } else { + s = uint(float(s) / (shadow_bias_constant + shadow_bias_linear * float(d) / float(ref.x))); + ref.y = min(s, ref.y); + } + } + new1 = EncodeShadow(ref); + +} while ((old = imageAtomicCompSwap(shadow_buffer, image_coord, old, new1)) != old2); +)"; + } else { + out += "gl_FragDepth = depth;\n"; + // Round the final fragment color to maintain the PICA's 8 bits of precision + out += "color = byteround(last_tex_env_out);\n"; + } + + if (state.emulate_logic_op) { + switch (state.logic_op) { + case FramebufferRegs::LogicOp::Clear: + out += "color = vec4(0);\n"; + break; + case FramebufferRegs::LogicOp::Set: + out += "color = vec4(1);\n"; + break; + case FramebufferRegs::LogicOp::Copy: + // Take the color output as-is + break; + case FramebufferRegs::LogicOp::CopyInverted: + out += "color = ~color;\n"; + break; + case FramebufferRegs::LogicOp::NoOp: + // We need to discard the color, but not necessarily the depth. This is not possible + // with fragment shader alone, so we emulate this behavior with the color mask. 
+ break; + default: + LOG_CRITICAL(HW_GPU, "Unhandled logic_op {:x}", + static_cast(state.logic_op.Value())); + UNIMPLEMENTED(); + } + } + + out += '}'; + return out; +} + +std::string GenerateTrivialVertexShader(bool use_clip_planes) { + std::string out = "#version 450 core\n" + "#extension GL_ARB_separate_shader_objects : enable\n\n"; + out += + fmt::format("layout(location = {}) in vec4 vert_position;\n" + "layout(location = {}) in vec4 vert_color;\n" + "layout(location = {}) in vec2 vert_texcoord0;\n" + "layout(location = {}) in vec2 vert_texcoord1;\n" + "layout(location = {}) in vec2 vert_texcoord2;\n" + "layout(location = {}) in float vert_texcoord0_w;\n" + "layout(location = {}) in vec4 vert_normquat;\n" + "layout(location = {}) in vec3 vert_view;\n", + ATTRIBUTE_POSITION, ATTRIBUTE_COLOR, ATTRIBUTE_TEXCOORD0, ATTRIBUTE_TEXCOORD1, + ATTRIBUTE_TEXCOORD2, ATTRIBUTE_TEXCOORD0_W, ATTRIBUTE_NORMQUAT, ATTRIBUTE_VIEW); + + out += GetVertexInterfaceDeclaration(true, use_clip_planes); + + out += UniformBlockDef; + + out += R"( + +void main() { + primary_color = vert_color; + texcoord0 = vert_texcoord0; + texcoord1 = vert_texcoord1; + texcoord2 = vert_texcoord2; + texcoord0_w = vert_texcoord0_w; + normquat = vert_normquat; + view = vert_view; + gl_Position = vert_position; + gl_Position.z = (gl_Position.z + gl_Position.w) / 2.0; +)"; + if (use_clip_planes) { + out += R"( + gl_ClipDistance[0] = -vert_position.z; // fixed PICA clipping plane z <= 0 + if (enable_clip1) { + gl_ClipDistance[1] = dot(clip_coef, vert_position); + } else { + gl_ClipDistance[1] = 0; + } + )"; + } + + out += "}\n"; + + return out; +} + +std::string_view MakeLoadPrefix(AttribLoadFlags flag) { + if (True(flag & AttribLoadFlags::Float)) { + return ""; + } else if (True(flag & AttribLoadFlags::Sint)) { + return "i"; + } else if (True(flag & AttribLoadFlags::Uint)) { + return "u"; + } + return ""; +} + +std::optional GenerateVertexShader(const Pica::Shader::ShaderSetup& setup, + const PicaVSConfig& 
config) { + std::string out = "#extension GL_ARB_separate_shader_objects : enable\n"; + out += UniformBlockDef; + out += OpenGL::ShaderDecompiler::GetCommonDeclarations(); + + std::array used_regs{}; + const auto get_input_reg = [&used_regs](u32 reg) { + ASSERT(reg < 16); + used_regs[reg] = true; + return fmt::format("vs_in_reg{}", reg); + }; + + const auto get_output_reg = [&](u32 reg) -> std::string { + ASSERT(reg < 16); + if (config.state.output_map[reg] < config.state.num_outputs) { + return fmt::format("vs_out_attr{}", config.state.output_map[reg]); + } + return ""; + }; + + auto program_source_opt = OpenGL::ShaderDecompiler::DecompileProgram( + setup.program_code, setup.swizzle_data, config.state.main_offset, get_input_reg, + get_output_reg, config.state.sanitize_mul); + + if (!program_source_opt) { + return std::nullopt; + } + + std::string& program_source = program_source_opt->code; + + out += R"( +#define uniforms vs_uniforms +layout (set = 0, binding = 0, std140) uniform vs_config { + pica_uniforms uniforms; +}; + +)"; + if (!config.state.use_geometry_shader) { + out += GetVertexInterfaceDeclaration(true, config.use_clip_planes); + } + + // input attributes declaration + for (std::size_t i = 0; i < used_regs.size(); ++i) { + if (used_regs[i]) { + const auto flags = config.state.load_flags[i]; + const std::string_view prefix = MakeLoadPrefix(flags); + out += + fmt::format("layout(location = {0}) in {1}vec4 vs_in_typed_reg{0};\n", i, prefix); + out += fmt::format("vec4 vs_in_reg{0} = vec4(vs_in_typed_reg{0});\n", i); + } + } + out += '\n'; + + if (config.state.use_geometry_shader) { + // output attributes declaration + for (u32 i = 0; i < config.state.num_outputs; ++i) { + out += fmt::format("layout(location = {0}) out vec4 vs_out_attr{0};\n", i); + } + out += "void EmitVtx() {}\n"; + } else { + // output attributes declaration + for (u32 i = 0; i < config.state.num_outputs; ++i) { + out += fmt::format("vec4 vs_out_attr{};\n", i); + } + + const auto 
semantic = + [&config = config.state](VSOutputAttributes::Semantic slot_semantic) -> std::string { + const u32 slot = static_cast(slot_semantic); + const u32 attrib = config.semantic_maps[slot].attribute_index; + const u32 comp = config.semantic_maps[slot].component_index; + if (attrib < config.gs_output_attributes) { + return fmt::format("vs_out_attr{}.{}", attrib, "xyzw"[comp]); + } + return "0.0"; + }; + + out += "vec4 GetVertexQuaternion() {\n"; + out += " return vec4(" + semantic(VSOutputAttributes::QUATERNION_X) + ", " + + semantic(VSOutputAttributes::QUATERNION_Y) + ", " + + semantic(VSOutputAttributes::QUATERNION_Z) + ", " + + semantic(VSOutputAttributes::QUATERNION_W) + ");\n"; + out += "}\n\n"; + + out += "void EmitVtx() {\n"; + out += " vec4 vtx_pos = vec4(" + semantic(VSOutputAttributes::POSITION_X) + ", " + + semantic(VSOutputAttributes::POSITION_Y) + ", " + + semantic(VSOutputAttributes::POSITION_Z) + ", " + + semantic(VSOutputAttributes::POSITION_W) + ");\n"; + out += " gl_Position = vtx_pos;\n"; + out += " gl_Position.z = (gl_Position.z + gl_Position.w) / 2.0;\n"; + if (config.use_clip_planes) { + out += " gl_ClipDistance[0] = -vtx_pos.z;\n"; // fixed PICA clipping plane z <= 0 + out += " if (enable_clip1) {\n"; + out += " gl_ClipDistance[1] = dot(clip_coef, vtx_pos);\n"; + out += " } else {\n"; + out += " gl_ClipDistance[1] = 0;\n"; + out += " }\n\n"; + } + + out += " normquat = GetVertexQuaternion();\n"; + out += " vec4 vtx_color = vec4(" + semantic(VSOutputAttributes::COLOR_R) + ", " + + semantic(VSOutputAttributes::COLOR_G) + ", " + + semantic(VSOutputAttributes::COLOR_B) + ", " + + semantic(VSOutputAttributes::COLOR_A) + ");\n"; + out += " primary_color = min(abs(vtx_color), vec4(1.0));\n\n"; + + out += " texcoord0 = vec2(" + semantic(VSOutputAttributes::TEXCOORD0_U) + ", " + + semantic(VSOutputAttributes::TEXCOORD0_V) + ");\n"; + out += " texcoord1 = vec2(" + semantic(VSOutputAttributes::TEXCOORD1_U) + ", " + + 
semantic(VSOutputAttributes::TEXCOORD1_V) + ");\n\n";
+
+        out += " texcoord0_w = " + semantic(VSOutputAttributes::TEXCOORD0_W) + ";\n";
+        out += " view = vec3(" + semantic(VSOutputAttributes::VIEW_X) + ", " +
+               semantic(VSOutputAttributes::VIEW_Y) + ", " + semantic(VSOutputAttributes::VIEW_Z) +
+               ");\n\n";
+
+        out += " texcoord2 = vec2(" + semantic(VSOutputAttributes::TEXCOORD2_U) + ", " +
+               semantic(VSOutputAttributes::TEXCOORD2_V) + ");\n\n";
+        out += "}\n";
+    }
+
+    out += "\nvoid main() {\n";
+    for (u32 i = 0; i < config.state.num_outputs; ++i) {
+        out += fmt::format(" vs_out_attr{} = vec4(0.0, 0.0, 0.0, 1.0);\n", i);
+    }
+    for (std::size_t i = 0; i < used_regs.size(); ++i) {
+        if (used_regs[i] && True(config.state.load_flags[i] & AttribLoadFlags::ZeroW)) {
+            out += fmt::format("vs_in_reg{0}.w = 0;\n", i);
+        }
+    }
+    out += "\n exec_shader();\nEmitVtx();\n}\n\n";
+
+    out += program_source;
+
+    return out;
+}
+
+/**
+ * Builds the GLSL text shared by the geometry shader: the vertex interface declaration (as
+ * outputs), the uniform block, the common decompiler declarations, one `vs_out_attr<i>[]` input
+ * per VS output attribute, the `Vertex` struct, and the helper functions GetVertexQuaternion,
+ * EmitVtx, AreQuaternionsOpposite and EmitPrim.
+ * @param config Attribute counts and the semantic -> (attribute, component) map
+ * @param use_clip_planes When true, the generated EmitVtx also writes gl_ClipDistance[0] and [1]
+ * @returns String with the generated GLSL declarations and helpers
+ */
+static std::string GetGSCommonSource(const PicaGSConfigCommonRaw& config, bool use_clip_planes) {
+    std::string out = GetVertexInterfaceDeclaration(true, use_clip_planes);
+    out += UniformBlockDef;
+    out += OpenGL::ShaderDecompiler::GetCommonDeclarations();
+
+    out += '\n';
+    // One unsized input array per attribute written by the vertex stage
+    for (u32 i = 0; i < config.vs_output_attributes; ++i) {
+        out += fmt::format("layout(location = {}) in vec4 vs_out_attr{}[];\n", i, i);
+    }
+
+    out += R"(
+struct Vertex {
+)";
+    out += fmt::format(" vec4 attributes[{}];\n", config.gs_output_attributes);
+    out += "};\n\n";
+
+    // Returns the GLSL expression that reads one output semantic from `vtx`. A semantic whose
+    // attribute index is not below gs_output_attributes reads as the literal "0.0".
+    const auto semantic = [&config](VSOutputAttributes::Semantic slot_semantic) -> std::string {
+        // NOTE(review): the cast's target type is absent in this text (presumably u32) -- confirm
+        const u32 slot = static_cast(slot_semantic);
+        const u32 attrib = config.semantic_maps[slot].attribute_index;
+        const u32 comp = config.semantic_maps[slot].component_index;
+        if (attrib < config.gs_output_attributes) {
+            return fmt::format("vtx.attributes[{}].{}", attrib, "xyzw"[comp]);
+        }
+        return "0.0";
+    };
+
+    out += "vec4 GetVertexQuaternion(Vertex vtx) {\n";
+    out += " return vec4(" + semantic(VSOutputAttributes::QUATERNION_X) + ", " +
+           semantic(VSOutputAttributes::QUATERNION_Y) + ", " +
+           semantic(VSOutputAttributes::QUATERNION_Z) + ", " +
+           semantic(VSOutputAttributes::QUATERNION_W) + ");\n";
+    out += "}\n\n";
+
+    out += "void EmitVtx(Vertex vtx, bool quats_opposite) {\n";
+    out += " vec4 vtx_pos = vec4(" + semantic(VSOutputAttributes::POSITION_X) + ", " +
+           semantic(VSOutputAttributes::POSITION_Y) + ", " +
+           semantic(VSOutputAttributes::POSITION_Z) + ", " +
+           semantic(VSOutputAttributes::POSITION_W) + ");\n";
+    out += " gl_Position = vtx_pos;\n";
+    // z' = (z + w) / 2 -- presumably remaps a [-w, w] clip-space depth to [0, w]; confirm
+    out += " gl_Position.z = (gl_Position.z + gl_Position.w) / 2.0;\n";
+    if (use_clip_planes) {
+        out += " gl_ClipDistance[0] = -vtx_pos.z;\n"; // fixed PICA clipping plane z <= 0
+        out += " if (enable_clip1) {\n";
+        out += " gl_ClipDistance[1] = dot(clip_coef, vtx_pos);\n";
+        out += " } else {\n";
+        out += " gl_ClipDistance[1] = 0;\n";
+        out += " }\n\n";
+    }
+
+    // The quaternion is negated when the caller flags it as opposite (see EmitPrim below, which
+    // compares each vertex against the first vertex of the primitive)
+    out += " vec4 vtx_quat = GetVertexQuaternion(vtx);\n";
+    out += " normquat = mix(vtx_quat, -vtx_quat, bvec4(quats_opposite));\n\n";
+
+    out += " vec4 vtx_color = vec4(" + semantic(VSOutputAttributes::COLOR_R) + ", " +
+           semantic(VSOutputAttributes::COLOR_G) + ", " + semantic(VSOutputAttributes::COLOR_B) +
+           ", " + semantic(VSOutputAttributes::COLOR_A) + ");\n";
+    out += " primary_color = min(abs(vtx_color), vec4(1.0));\n\n";
+
+    out += " texcoord0 = vec2(" + semantic(VSOutputAttributes::TEXCOORD0_U) + ", " +
+           semantic(VSOutputAttributes::TEXCOORD0_V) + ");\n";
+    out += " texcoord1 = vec2(" + semantic(VSOutputAttributes::TEXCOORD1_U) + ", " +
+           semantic(VSOutputAttributes::TEXCOORD1_V) + ");\n\n";
+
+    out += " texcoord0_w = " + semantic(VSOutputAttributes::TEXCOORD0_W) + ";\n";
+    out += " view = vec3(" + semantic(VSOutputAttributes::VIEW_X) + ", " +
+           semantic(VSOutputAttributes::VIEW_Y) + ", " + semantic(VSOutputAttributes::VIEW_Z) +
+           ");\n\n";
+
+    out += " texcoord2 = vec2(" + semantic(VSOutputAttributes::TEXCOORD2_U) + ", " +
+           semantic(VSOutputAttributes::TEXCOORD2_V) + ");\n\n";
+
+    out += " EmitVertex();\n";
+    out += "}\n";
+
+    // EmitPrim emits the three vertices of a triangle and ends the primitive
+    out += R"(
+bool AreQuaternionsOpposite(vec4 qa, vec4 qb) {
+ return (dot(qa, qb) < 0.0);
+}
+
+void EmitPrim(Vertex vtx0, Vertex vtx1, Vertex vtx2) {
+ EmitVtx(vtx0, false);
+ EmitVtx(vtx1, AreQuaternionsOpposite(GetVertexQuaternion(vtx0), GetVertexQuaternion(vtx1)));
+ EmitVtx(vtx2, AreQuaternionsOpposite(GetVertexQuaternion(vtx0), GetVertexQuaternion(vtx2)));
+ EndPrimitive();
+}
+)";
+
+    return out;
+}; // NOTE(review): the ';' after the closing brace is a redundant empty declaration
+
+/**
+ * Generates the GLSL source of the fixed geometry shader: it takes triangles in, copies the
+ * `vs_out_attr<i>` inputs of each of the three vertices into a `Vertex`, and emits them as a
+ * triangle strip through EmitPrim.
+ * @param config Geometry shader configuration (attribute layout and clip plane usage)
+ * @returns String of the shader source code
+ */
+std::string GenerateFixedGeometryShader(const PicaFixedGSConfig& config) {
+    std::string out = "#version 450 core\n"
+                      "#extension GL_ARB_separate_shader_objects : enable\n\n";
+
+    out += R"(
+layout(triangles) in;
+layout(triangle_strip, max_vertices = 3) out;
+
+)";
+
+    out += GetGSCommonSource(config.state, config.use_clip_planes);
+
+    out += R"(
+void main() {
+ Vertex prim_buffer[3];
+)";
+    for (u32 vtx = 0; vtx < 3; ++vtx) {
+        // NOTE(review): the array constructor is sized with gs_output_attributes but receives
+        // vs_output_attributes elements; presumably the two counts are equal here -- confirm
+        out += fmt::format(" prim_buffer[{}].attributes = vec4[{}](", vtx,
+                           config.state.gs_output_attributes);
+        for (u32 i = 0; i < config.state.vs_output_attributes; ++i) {
+            out += fmt::format("{}vs_out_attr{}[{}]", i == 0 ? "" : ", ", i, vtx);
+        }
+        out += ");\n";
+    }
+    out += " EmitPrim(prim_buffer[0], prim_buffer[1], prim_buffer[2]);\n";
+    out += "}\n";
+
+    return out;
+}
+} // namespace Vulkan
diff --git a/src/video_core/renderer_vulkan/vk_shader_gen.h b/src/video_core/renderer_vulkan/vk_shader_gen.h
new file mode 100644
index 000000000..bd901b006
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_shader_gen.h
@@ -0,0 +1,274 @@
+// Copyright 2023 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include
+#include "common/hash.h"
+#include "video_core/regs.h"
+#include "video_core/shader/shader.h"
+
+namespace Vulkan {
+
+class Instance;
+
+// Vertex attribute identifiers.
+// NOTE(review): presumably these are the location indices of the shader interface -- confirm
+enum Attributes {
+    ATTRIBUTE_POSITION,
+    ATTRIBUTE_COLOR,
+    ATTRIBUTE_TEXCOORD0,
+    ATTRIBUTE_TEXCOORD1,
+    ATTRIBUTE_TEXCOORD2,
+    ATTRIBUTE_TEXCOORD0_W,
+    ATTRIBUTE_NORMQUAT,
+    ATTRIBUTE_VIEW,
+};
+
+// Raw TEV stage state used in the shader key. Doesn't include const_color because we don't sync
+// it. NOTE(review): the original text pointed to a comment in BuildFromRegs(), which is not
+// declared in this header; presumably the PicaFSConfig constructor is meant -- confirm
+struct TevStageConfigRaw {
+    u32 sources_raw;
+    u32 modifiers_raw;
+    u32 ops_raw;
+    u32 scales_raw;
+    // Rebuilds a full TevStageConfig from the raw words; const_color is always set to 0
+    explicit operator Pica::TexturingRegs::TevStageConfig() const noexcept {
+        Pica::TexturingRegs::TevStageConfig stage;
+        stage.sources_raw = sources_raw;
+        stage.modifiers_raw = modifiers_raw;
+        stage.ops_raw = ops_raw;
+        stage.const_color = 0;
+        stage.scales_raw = scales_raw;
+        return stage;
+    }
+};
+
+// Plain state that a fragment shader is generated from (wrapped by PicaFSConfig below)
+struct PicaFSConfigState {
+    // Fixed-function state packed into one word; each BitField gives <bit offset, width, type>
+    union {
+        BitField<0, 3, Pica::FramebufferRegs::CompareFunc> alpha_test_func;
+        BitField<3, 2, Pica::RasterizerRegs::ScissorMode> scissor_test_mode;
+        BitField<5, 3, Pica::TexturingRegs::TextureConfig::TextureType> texture0_type;
+        BitField<8, 1, u32> texture2_use_coord1;
+        BitField<9, 8, u32> combiner_buffer_input;
+        BitField<17, 1, Pica::RasterizerRegs::DepthBuffering> depthmap_enable;
+        BitField<18, 3, Pica::TexturingRegs::FogMode> fog_mode;
+        BitField<21, 1, u32> fog_flip;
+        BitField<22, 1, u32> emulate_logic_op;
+        BitField<23, 4, Pica::FramebufferRegs::LogicOp> logic_op;
+        BitField<27, 1, u32> shadow_rendering;
+        BitField<28, 1, u32> shadow_texture_orthographic;
+    };
+
+    // NOTE(review): the template arguments of this std::array are absent in this text -- confirm
+    std::array tev_stages;
+
+    struct {
+        // Per-slot light configuration; `num` is the light number of the source in that slot
+        union {
+            BitField<0, 3, u16> num;
+            BitField<3, 1, u16> directional;
+            BitField<4, 1, u16> two_sided_diffuse;
+            BitField<5, 1, u16> dist_atten_enable;
+            BitField<6, 1, u16> spot_atten_enable;
+            BitField<7, 1, u16> geometric_factor_0;
+            BitField<8, 1, u16> geometric_factor_1;
+            BitField<9, 1, u16> shadow_enable;
+        } light[8];
+
+        // Global lighting switches; src_num is the number of light slots in use
+        union {
+            BitField<0, 1, u32> enable;
+            BitField<1, 4, u32> src_num;
+            BitField<5, 2, Pica::LightingRegs::LightingBumpMode> bump_mode;
+            BitField<7, 2, u32> bump_selector;
+            BitField<9, 1, u32> bump_renorm;
+            BitField<10, 1, u32> clamp_highlights;
+            BitField<11, 4, Pica::LightingRegs::LightingConfig> config;
+            BitField<15, 1, u32> enable_primary_alpha;
+            BitField<16, 1, u32> enable_secondary_alpha;
+            BitField<17, 1, u32> enable_shadow;
+            BitField<18, 1, u32> shadow_primary;
+            BitField<19, 1, u32> shadow_secondary;
+            BitField<20, 1, u32> shadow_invert;
+            BitField<21, 1, u32> shadow_alpha;
+            BitField<22, 2, u32> shadow_selector;
+        };
+
+        // One entry per lighting lookup table: enable bit, input selection and output scale
+        struct {
+            union {
+                BitField<0, 1, u32> enable;
+                BitField<1, 1, u32> abs_input;
+                BitField<2, 3, Pica::LightingRegs::LightingLutInput> type;
+            };
+            float scale;
+        } lut_d0, lut_d1, lut_sp, lut_fr, lut_rr, lut_rg, lut_rb;
+    } lighting;
+
+    // Procedural texture state
+    struct {
+        union {
+            BitField<0, 1, u32> enable;
+            BitField<1, 2, u32> coord;
+            BitField<3, 3, Pica::TexturingRegs::ProcTexClamp> u_clamp;
+            BitField<6, 3, Pica::TexturingRegs::ProcTexClamp> v_clamp;
+            BitField<9, 4, Pica::TexturingRegs::ProcTexCombiner> color_combiner;
+            BitField<13, 4, Pica::TexturingRegs::ProcTexCombiner> alpha_combiner;
+            BitField<17, 3, Pica::TexturingRegs::ProcTexFilter> lut_filter;
+            BitField<20, 1, u32> separate_alpha;
+            BitField<21, 1, u32> noise_enable;
+            BitField<22, 2, Pica::TexturingRegs::ProcTexShift> u_shift;
+            BitField<24, 2, Pica::TexturingRegs::ProcTexShift> v_shift;
+        };
+        s32 lut_width;
+        s32 lut_offset0;
+        s32 lut_offset1;
+        s32 lut_offset2;
+        s32 lut_offset3;
+        u8 lod_min;
+        u8 lod_max;
+    } proctex;
+};
+
+/**
+ * This struct contains all state used to generate the fragment shader that emulates the
+ * current Pica register configuration. This struct is used as a cache key for generated shader
+ * programs. The shader generation code should retrieve state from this struct only, not by
+ * directly accessing Pica registers. This should reduce the risk of bugs in shader generation where
+ * Pica state is not being captured in the shader cache key, thereby resulting in (what should be)
+ * two separate shaders sharing the same key.
+ */
+struct PicaFSConfig : Common::HashableStruct {
+    PicaFSConfig(const Pica::Regs& regs, const Instance& instance);
+
+    // True when stage_index < 4 and bit `stage_index` of combiner_buffer_input (bits 0-3) is set
+    bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const {
+        return (stage_index < 4) && (state.combiner_buffer_input & (1 << stage_index));
+    }
+
+    // Same test as above against the upper nibble (bits 4-7) of combiner_buffer_input
+    bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const {
+        return (stage_index < 4) && ((state.combiner_buffer_input >> 4) & (1 << stage_index));
+    }
+};
+
+// Flags describing how a vertex input attribute is loaded. When ZeroW is set, the vertex shader
+// generator writes 0 to the w component of the input register.
+// NOTE(review): Float/Sint/Uint presumably select the type of the GLSL input -- confirm
+enum class AttribLoadFlags {
+    Float = 1 << 0,
+    Sint = 1 << 1,
+    Uint = 1 << 2,
+    ZeroW = 1 << 3,
+};
+DECLARE_ENUM_FLAG_OPERATORS(AttribLoadFlags)
+
+/**
+ * This struct contains common information to identify a vertex/geometry shader generated from
+ * PICA vertex/geometry shader.
+ */
+struct PicaShaderConfigCommon {
+    void Init(const Pica::RasterizerRegs& rasterizer, const Pica::ShaderRegs& regs,
+              Pica::Shader::ShaderSetup& setup);
+
+    u64 program_hash;
+    u64 swizzle_hash;
+    u32 main_offset;
+    bool sanitize_mul;
+
+    u32 num_outputs;
+    // Load operations to apply to the input vertex data
+    std::array load_flags;
+
+    // output_map[output register index] -> output attribute index
+    std::array output_map;
+
+    bool use_geometry_shader;
+    u32 vs_output_attributes;
+    u32 gs_output_attributes;
+
+    struct SemanticMap {
+        u32 attribute_index;
+        u32 component_index;
+    };
+
+    // semantic_maps[semantic name] -> GS output attribute index + component index
+    std::array semantic_maps;
+};
+
+/**
+ * This struct contains information to identify a vertex shader generated from PICA vertex
+ * shader.
+ * NOTE(review): use_clip_planes is a member outside the wrapped state; whether Hash() and
+ * equality cover it depends on Common::HashableStruct -- confirm
+ */
+struct PicaVSConfig : Common::HashableStruct {
+    explicit PicaVSConfig(const Pica::RasterizerRegs& rasterizer, const Pica::ShaderRegs& regs,
+                          Pica::Shader::ShaderSetup& setup, const Instance& instance);
+    bool use_clip_planes;
+};
+
+// Geometry shader key state: attribute counts plus the semantic -> attribute/component map
+struct PicaGSConfigCommonRaw {
+    void Init(const Pica::Regs& regs);
+
+    u32 vs_output_attributes;
+    u32 gs_output_attributes;
+
+    struct SemanticMap {
+        u32 attribute_index;
+        u32 component_index;
+    };
+
+    // semantic_maps[semantic name] -> GS output attribute index + component index
+    std::array semantic_maps;
+};
+
+/**
+ * This struct contains information to identify a geometry shader generated from PICA no-geometry
+ * shader pipeline
+ */
+struct PicaFixedGSConfig : Common::HashableStruct {
+    explicit PicaFixedGSConfig(const Pica::Regs& regs, const Instance& instance);
+    bool use_clip_planes;
+};
+
+/**
+ * Generates the GLSL vertex shader program source code that accepts vertices from software shader
+ * and directly passes them to the fragment shader.
+ * @param use_clip_planes whether the generated shader should use clip planes (TODO confirm)
+ * @returns String of the shader source code
+ */
+std::string GenerateTrivialVertexShader(bool use_clip_planes);
+
+/**
+ * Generates the GLSL vertex shader program source code for the given VS program
+ * @param setup shader setup providing the program code and swizzle data to decompile
+ * @param config vertex shader configuration the source is generated for
+ * @returns String of the shader source code; std::nullopt on failure
+ */
+std::optional GenerateVertexShader(const Pica::Shader::ShaderSetup& setup,
+                                   const PicaVSConfig& config);
+
+/**
+ * Generates the GLSL fixed geometry shader program source code for non-GS PICA pipeline
+ * @param config geometry shader configuration the source is generated for
+ * @returns String of the shader source code
+ */
+std::string GenerateFixedGeometryShader(const PicaFixedGSConfig& config);
+
+/**
+ * Generates the GLSL fragment shader program source code for the current Pica state
+ * @param config PicaFSConfig object generated for the current Pica state, used for the shader
+ * configuration (NOTE: Use state in this struct only, not the Pica registers!)
+ * @returns String of the shader source code
+ */
+std::string GenerateFragmentShader(const PicaFSConfig& config);
+
+} // namespace Vulkan
+
+// std::hash specializations that forward to the configuration's own Hash()
+namespace std {
+template <>
+struct hash {
+    std::size_t operator()(const Vulkan::PicaFSConfig& k) const noexcept {
+        return k.Hash();
+    }
+};
+
+template <>
+struct hash {
+    std::size_t operator()(const Vulkan::PicaVSConfig& k) const noexcept {
+        return k.Hash();
+    }
+};
+
+template <>
+struct hash {
+    std::size_t operator()(const Vulkan::PicaFixedGSConfig& k) const noexcept {
+        return k.Hash();
+    }
+};
+} // namespace std
diff --git a/src/video_core/renderer_vulkan/vk_shader_gen_spv.cpp b/src/video_core/renderer_vulkan/vk_shader_gen_spv.cpp
new file mode 100644
index 000000000..a064a7ce0
--- /dev/null
+++ b/src/video_core/renderer_vulkan/vk_shader_gen_spv.cpp
@@ -0,0 +1,1516 @@
+// Copyright 2023 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include "core/core.h"
+#include "video_core/renderer_vulkan/vk_shader_gen_spv.h"
+
+using Pica::FramebufferRegs;
+using Pica::LightingRegs;
+using Pica::RasterizerRegs;
+using Pica::TexturingRegs;
+using TevStageConfig = TexturingRegs::TevStageConfig;
+
+namespace Vulkan {
+
+// Version value handed to the Sirit::Module base class (named for SPIR-V 1.3)
+constexpr u32 SPIRV_VERSION_1_3 = 0x00010300;
+
+/**
+ * Prepares the module for one fragment shader configuration by running the Define* setup helpers
+ * in order: arithmetic types, uniform structs, interface, the proctex sampler (only when proctex
+ * is enabled) and the entry point. The function body itself is emitted by Generate().
+ * @param telemetry_ telemetry session; WriteGas() records the use of gas mode through it
+ * @param config_ fragment shader configuration the module is generated from
+ */
+FragmentModule::FragmentModule(Core::TelemetrySession& telemetry_, const PicaFSConfig& config_)
+    : Sirit::Module{SPIRV_VERSION_1_3}, telemetry{telemetry_}, config{config_} {
+    DefineArithmeticTypes();
+    DefineUniformStructs();
+    DefineInterface();
+    if (config.state.proctex.enable) {
+        DefineProcTexSampler();
+    }
+    DefineEntryPoint();
+}
+
+FragmentModule::~FragmentModule() = default;
+
+/**
+ * Emits the body of the fragment shader function: scissor test, lighting, the TEV stages, alpha
+ * test, depth write, fog and the final color store. Every exit path (alpha test `Never`, gas mode
+ * and the regular path) closes the function with OpFunctionEnd().
+ */
+void FragmentModule::Generate() {
+    AddLabel(OpLabel());
+
+    rounded_primary_color = Byteround(OpLoad(vec_ids.Get(4), primary_color_id), 4);
+    primary_fragment_color = ConstF32(0.f, 0.f, 0.f, 0.f);
+    secondary_fragment_color = ConstF32(0.f, 0.f, 0.f, 0.f);
+
+    // Do not do any sort of processing if it's obvious we're not going to pass the alpha test
+    if (config.state.alpha_test_func == Pica::FramebufferRegs::CompareFunc::Never) {
+        OpKill();
+        OpFunctionEnd();
+        return;
+    }
+
+    // Check if the fragment is outside scissor rectangle
+    WriteScissor();
+
+    // Write shader bytecode to emulate all enabled PICA lights
+    if (config.state.lighting.enable) {
+        WriteLighting();
+    }
+
+    combiner_buffer = ConstF32(0.f, 0.f, 0.f, 0.f);
+    next_combiner_buffer = GetShaderDataMember(vec_ids.Get(4), ConstS32(27));
+    last_tex_env_out = ConstF32(0.f, 0.f, 0.f, 0.f);
+
+    // Write shader bytecode to emulate PICA TEV stages
+    for (std::size_t index = 0; index < config.state.tev_stages.size(); ++index) {
+        WriteTevStage(static_cast(index));
+    }
+
+    WriteAlphaTestCondition(config.state.alpha_test_func);
+
+    // After perspective divide, OpenGL transforms z_over_w from [-1, 1] to [near, far]. Here we use
+    // default near = 0 and far = 1, and undo the transformation to get the original z_over_w, then
+    // do our own transformation according to PICA specification.
+    WriteDepth();
+
+    // Emulate the fog
+    switch (config.state.fog_mode) {
+    case TexturingRegs::FogMode::Fog:
+        WriteFog();
+        break;
+    case TexturingRegs::FogMode::Gas:
+        // WriteGas() kills the fragment and ends the function itself, so stop here
+        WriteGas();
+        return;
+    default:
+        break;
+    }
+
+    Id color{Byteround(last_tex_env_out, 4)};
+    if (config.state.emulate_logic_op) {
+        switch (config.state.logic_op) {
+        case FramebufferRegs::LogicOp::Clear:
+            color = ConstF32(0.f, 0.f, 0.f, 0.f);
+            break;
+        case FramebufferRegs::LogicOp::Set:
+            color = ConstF32(1.f, 1.f, 1.f, 1.f);
+            break;
+        case FramebufferRegs::LogicOp::Copy:
+            // Take the color output as-is
+            break;
+        case FramebufferRegs::LogicOp::CopyInverted:
+            // TODO: not emulated here, the color is stored unchanged (GLSL form: color = ~color)
+            break;
+        case FramebufferRegs::LogicOp::NoOp:
+            // We need to discard the color, but not necessarily the depth. This is not possible
+            // with fragment shader alone, so we emulate this behavior with the color mask.
+            break;
+        default:
+            LOG_CRITICAL(HW_GPU, "Unhandled logic_op {:x}",
+                         static_cast(config.state.logic_op.Value()));
+            UNIMPLEMENTED();
+        }
+    }
+
+    // Write output color
+    OpStore(color_id, color);
+    OpReturn();
+    OpFunctionEnd();
+}
+
+/**
+ * Emits the depth computation and stores it to the fragment depth output:
+ * z_over_w = 2 * gl_FragCoord.z - 1, depth = z_over_w * depth_scale + depth_offset, and with
+ * W-buffering the result is additionally divided by gl_FragCoord.w. The value is also kept in the
+ * `depth` member, which WriteFog() reads.
+ */
+void FragmentModule::WriteDepth() {
+    const Id input_pointer_id{TypePointer(spv::StorageClass::Input, f32_id)};
+    const Id gl_frag_coord_z{
+        OpLoad(f32_id, OpAccessChain(input_pointer_id, gl_frag_coord_id, ConstU32(2u)))};
+    const Id z_over_w{OpFma(f32_id, ConstF32(2.f), gl_frag_coord_z, ConstF32(-1.f))};
+    const Id depth_scale{GetShaderDataMember(f32_id, ConstS32(2))};
+    const Id depth_offset{GetShaderDataMember(f32_id, ConstS32(3))};
+    depth = OpFma(f32_id, z_over_w, depth_scale, depth_offset);
+    if (config.state.depthmap_enable == Pica::RasterizerRegs::DepthBuffering::WBuffering) {
+        const Id gl_frag_coord_w{
+            OpLoad(f32_id, OpAccessChain(input_pointer_id, gl_frag_coord_id, ConstU32(3u)))};
+        depth = OpFDiv(f32_id, depth, gl_frag_coord_w);
+    }
+    OpStore(gl_frag_depth_id, depth);
+}
+
+/**
+ * Emits the scissor test unless it is disabled. The fragment counts as inside when
+ * scissor_1 <= gl_FragCoord.xy < scissor_2 holds for both components. In Include mode fragments
+ * that are not inside are killed; in the remaining mode fragments that are inside are killed.
+ */
+void FragmentModule::WriteScissor() {
+    if (config.state.scissor_test_mode == RasterizerRegs::ScissorMode::Disabled) {
+        return;
+    }
+
+    const Id gl_frag_coord{OpLoad(vec_ids.Get(4), gl_frag_coord_id)};
+    const Id gl_frag_coord_xy{OpVectorShuffle(vec_ids.Get(2), gl_frag_coord, gl_frag_coord, 0, 1)};
+
+    const Id scissor_x1{GetShaderDataMember(i32_id, ConstS32(6))};
+    const Id scissor_y1{GetShaderDataMember(i32_id, ConstS32(7))};
+    const Id scissor_1{OpCompositeConstruct(vec_ids.Get(2), OpConvertSToF(f32_id, scissor_x1),
+                                            OpConvertSToF(f32_id, scissor_y1))};
+
+    const Id scissor_x2{GetShaderDataMember(i32_id, ConstS32(8))};
+    const Id scissor_y2{GetShaderDataMember(i32_id, ConstS32(9))};
+    const Id scissor_2{OpCompositeConstruct(vec_ids.Get(2), OpConvertSToF(f32_id, scissor_x2),
+                                            OpConvertSToF(f32_id, scissor_y2))};
+
+    const Id cond1{OpFOrdGreaterThanEqual(bvec_ids.Get(2), gl_frag_coord_xy, scissor_1)};
+    const Id cond2{OpFOrdLessThan(bvec_ids.Get(2), gl_frag_coord_xy, scissor_2)};
+
+    // `result` is the kill condition: "inside" for the excluding mode, negated for Include
+    Id result{OpAll(bool_id, OpCompositeConstruct(bvec_ids.Get(4), cond1, cond2))};
+    if (config.state.scissor_test_mode == RasterizerRegs::ScissorMode::Include) {
+        result = OpLogicalNot(bool_id, result);
+    }
+
+    // Structured selection: branch to the kill block when `result` is true, otherwise continue
+    // in the merge block, which stays open for the code emitted after this function
+    const Id merge_block{OpLabel()};
+    const Id kill_label{OpLabel()};
+    OpSelectionMerge(merge_block, spv::SelectionControlMask::MaskNone);
+    OpBranchConditional(result, kill_label, merge_block);
+
+    AddLabel(kill_label);
+    OpKill();
+
+    AddLabel(merge_block);
+}
+
+/**
+ * Emits the fog blend. The fog factor is fetched from the fog lookup table at an index derived
+ * from `depth` (or 1 - depth when fog_flip is set) and the RGB part of last_tex_env_out is
+ * replaced by mix(fog_color, last_tex_env_out.rgb, fog_factor); alpha is left unchanged.
+ */
+void FragmentModule::WriteFog() {
+    // Get index into fog LUT
+    Id fog_index{};
+    if (config.state.fog_flip) {
+        fog_index = OpFMul(f32_id, OpFSub(f32_id, ConstF32(1.f), depth), ConstF32(128.f));
+    } else {
+        fog_index = OpFMul(f32_id, depth, ConstF32(128.f));
+    }
+
+    // Generate clamped fog factor from LUT for given fog index
+    const Id fog_i{OpFClamp(f32_id, OpFloor(f32_id, fog_index), ConstF32(0.f), ConstF32(127.f))};
+    const Id fog_f{OpFSub(f32_id, fog_index, fog_i)};
+    const Id fog_lut_offset{GetShaderDataMember(i32_id, ConstS32(10))};
+    const Id coord{OpIAdd(i32_id, OpConvertFToS(i32_id, fog_i), fog_lut_offset)};
+    // Load the LUT texture buffer only once; the loaded id is reused afterwards
+    if (!Sirit::ValidId(texture_buffer_lut_lf)) {
+        const Id sampled_image{TypeSampledImage(image_buffer_id)};
+        texture_buffer_lut_lf = OpLoad(sampled_image, texture_buffer_lut_lf_id);
+    }
+    const Id fog_lut_entry_rgba{
+        OpImageFetch(vec_ids.Get(4), OpImage(image_buffer_id, texture_buffer_lut_lf), coord)};
+    // factor = r + fraction * g, clamped to [0, 1]
+    const Id fog_lut_r{OpCompositeExtract(f32_id, fog_lut_entry_rgba, 0)};
+    const Id fog_lut_g{OpCompositeExtract(f32_id, fog_lut_entry_rgba, 1)};
+    Id fog_factor{OpFma(f32_id, fog_f, fog_lut_g, fog_lut_r)};
+    fog_factor = OpFClamp(f32_id, fog_factor, ConstF32(0.f), ConstF32(1.f));
+
+    // Blend the fog
+    const Id tex_env_rgb{
+        OpVectorShuffle(vec_ids.Get(3), last_tex_env_out, last_tex_env_out, 0, 1, 2)};
+    const Id fog_color{GetShaderDataMember(vec_ids.Get(3), ConstS32(20))};
+    const Id fog_factor_rgb{
+        OpCompositeConstruct(vec_ids.Get(3), fog_factor, fog_factor, fog_factor)};
+    const Id fog_result{OpFMix(vec_ids.Get(3), fog_color, tex_env_rgb, fog_factor_rgb)};
+    // Shuffle indices 0-2 pick fog_result.xyz; index 6 picks the w of last_tex_env_out
+    last_tex_env_out = OpVectorShuffle(vec_ids.Get(4), fog_result, last_tex_env_out, 0, 1, 2, 6);
+}
+
+/**
+ * Gas mode is not implemented: records its use in telemetry, logs a critical message, then kills
+ * the fragment and ends the function. Generate() returns right after calling this.
+ */
+void FragmentModule::WriteGas() {
+    // TODO: Implement me
+    telemetry.AddField(Common::Telemetry::FieldType::Session, "VideoCore_Pica_UseGasMode", true);
+    LOG_CRITICAL(Render_Vulkan, "Unimplemented gas mode");
+    OpKill();
+    OpFunctionEnd();
+}
+
+void FragmentModule::WriteLighting() {
+    const auto& lighting = config.state.lighting;
+
+    // Define lighting globals
+    Id diffuse_sum{ConstF32(0.f, 0.f, 0.f, 1.f)};
+    Id specular_sum{ConstF32(0.f, 0.f, 0.f, 1.f)};
+    Id light_vector{ConstF32(0.f, 0.f, 0.f)};
+    Id light_distance{ConstF32(0.f)};
+    Id spot_dir{ConstF32(0.f, 0.f, 0.f)};
+    Id half_vector{ConstF32(0.f, 0.f, 0.f)};
+    Id dot_product{ConstF32(0.f)};
+    Id clamp_highlights{ConstF32(1.f)};
+    Id geo_factor{ConstF32(1.f)};
+    Id surface_normal{};
+    Id surface_tangent{};
+
+    // Compute fragment normals and tangents
+    const auto perturbation = [&]() -> Id {
+        const Id texel{SampleTexture(lighting.bump_selector)};
+        const Id texel_rgb{OpVectorShuffle(vec_ids.Get(3), texel, texel, 0, 1, 2)};
+        const Id rgb_mul_two{OpVectorTimesScalar(vec_ids.Get(3), texel_rgb, ConstF32(2.f))};
+        return OpFSub(vec_ids.Get(3), rgb_mul_two, ConstF32(1.f, 1.f, 1.f));
+    };
+
+    if (lighting.bump_mode == LightingRegs::LightingBumpMode::NormalMap) {
+        // Bump mapping is enabled using a normal map
+        surface_normal = perturbation();
+
+        // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher
+        // precision result
+        if (lighting.bump_renorm) {
+            const Id normal_x{OpCompositeExtract(f32_id, surface_normal, 0)};
+            const Id normal_y{OpCompositeExtract(f32_id, surface_normal, 1)};
+            const Id y_mul_y{OpFMul(f32_id, normal_y, normal_y)};
+            const Id val{OpFSub(f32_id, ConstF32(1.f), OpFma(f32_id, normal_x, normal_x,
y_mul_y))}; + const Id normal_z{OpSqrt(f32_id, OpFMax(f32_id, val, ConstF32(0.f)))}; + surface_normal = OpCompositeConstruct(vec_ids.Get(3), normal_x, normal_y, normal_z); + } + + // The tangent vector is not perturbed by the normal map and is just a unit vector. + surface_tangent = ConstF32(1.f, 0.f, 0.f); + } else if (lighting.bump_mode == LightingRegs::LightingBumpMode::TangentMap) { + // Bump mapping is enabled using a tangent map + surface_tangent = perturbation(); + + // Mathematically, recomputing Z-component of the tangent vector won't affect the relevant + // computation below, which is also confirmed on 3DS. So we don't bother recomputing here + // even if 'renorm' is enabled. + + // The normal vector is not perturbed by the tangent map and is just a unit vector. + surface_normal = ConstF32(0.f, 0.f, 1.f); + } else { + // No bump mapping - surface local normal and tangent are just unit vectors + surface_normal = ConstF32(0.f, 0.f, 1.f); + surface_tangent = ConstF32(1.f, 0.f, 0.f); + } + + // Rotate the vector v by the quaternion q + const auto quaternion_rotate = [this](Id q, Id v) -> Id { + const Id q_xyz{OpVectorShuffle(vec_ids.Get(3), q, q, 0, 1, 2)}; + const Id q_xyz_cross_v{OpCross(vec_ids.Get(3), q_xyz, v)}; + const Id q_w{OpCompositeExtract(f32_id, q, 3)}; + const Id val1{ + OpFAdd(vec_ids.Get(3), q_xyz_cross_v, OpVectorTimesScalar(vec_ids.Get(3), v, q_w))}; + const Id val2{OpVectorTimesScalar(vec_ids.Get(3), OpCross(vec_ids.Get(3), q_xyz, val1), + ConstF32(2.f))}; + return OpFAdd(vec_ids.Get(3), v, val2); + }; + + // Rotate the surface-local normal by the interpolated normal quaternion to convert it to + // eyespace. 
+ const Id normalized_normquat{OpNormalize(vec_ids.Get(4), OpLoad(vec_ids.Get(4), normquat_id))}; + const Id normal{quaternion_rotate(normalized_normquat, surface_normal)}; + const Id tangent{quaternion_rotate(normalized_normquat, surface_tangent)}; + + Id shadow{ConstF32(1.f, 1.f, 1.f, 1.f)}; + if (lighting.enable_shadow) { + shadow = SampleTexture(lighting.shadow_selector); + if (lighting.shadow_invert) { + shadow = OpFSub(vec_ids.Get(4), ConstF32(1.f, 1.f, 1.f, 1.f), shadow); + } + } + + const auto lookup_lighting_lut_unsigned = [this](Id lut_index, Id pos) -> Id { + const Id pos_int{OpConvertFToS(i32_id, OpFMul(f32_id, pos, ConstF32(256.f)))}; + const Id index{OpSClamp(i32_id, pos_int, ConstS32(0), ConstS32(255))}; + const Id neg_index{OpFNegate(f32_id, OpConvertSToF(f32_id, index))}; + const Id delta{OpFma(f32_id, pos, ConstF32(256.f), neg_index)}; + return LookupLightingLUT(lut_index, index, delta); + }; + + const auto lookup_lighting_lut_signed = [this](Id lut_index, Id pos) -> Id { + const Id pos_int{OpConvertFToS(i32_id, OpFMul(f32_id, pos, ConstF32(128.f)))}; + const Id index{OpSClamp(i32_id, pos_int, ConstS32(-128), ConstS32(127))}; + const Id neg_index{OpFNegate(f32_id, OpConvertSToF(f32_id, index))}; + const Id delta{OpFma(f32_id, pos, ConstF32(128.f), neg_index)}; + const Id increment{ + OpSelect(i32_id, OpSLessThan(bool_id, index, ConstS32(0)), ConstS32(256), ConstS32(0))}; + return LookupLightingLUT(lut_index, OpIAdd(i32_id, index, increment), delta); + }; + + // Samples the specified lookup table for specular lighting + const Id view{OpLoad(vec_ids.Get(3), view_id)}; + const auto get_lut_value = [&](LightingRegs::LightingSampler sampler, u32 light_num, + LightingRegs::LightingLutInput input, bool abs) -> Id { + Id index{}; + switch (input) { + case LightingRegs::LightingLutInput::NH: + index = OpDot(f32_id, normal, OpNormalize(vec_ids.Get(3), half_vector)); + break; + case LightingRegs::LightingLutInput::VH: + index = OpDot(f32_id, 
OpNormalize(vec_ids.Get(3), view), + OpNormalize(vec_ids.Get(3), half_vector)); + break; + case LightingRegs::LightingLutInput::NV: + index = OpDot(f32_id, normal, OpNormalize(vec_ids.Get(3), view)); + break; + case LightingRegs::LightingLutInput::LN: + index = OpDot(f32_id, light_vector, normal); + break; + case LightingRegs::LightingLutInput::SP: + index = OpDot(f32_id, light_vector, spot_dir); + break; + case LightingRegs::LightingLutInput::CP: + // CP input is only available with configuration 7 + if (lighting.config == LightingRegs::LightingConfig::Config7) { + // Note: even if the normal vector is modified by normal map, which is not the + // normal of the tangent plane anymore, the half angle vector is still projected + // using the modified normal vector. + const Id normalized_half_vector{OpNormalize(vec_ids.Get(3), half_vector)}; + const Id normal_dot_half_vector{OpDot(f32_id, normal, normalized_half_vector)}; + const Id normal_mul_dot{ + OpVectorTimesScalar(vec_ids.Get(3), normal, normal_dot_half_vector)}; + const Id half_angle_proj{ + OpFSub(vec_ids.Get(3), normalized_half_vector, normal_mul_dot)}; + + // Note: the half angle vector projection is confirmed not normalized before the dot + // product. The result is in fact not cos(phi) as the name suggested. + index = OpDot(f32_id, half_angle_proj, tangent); + } else { + index = ConstF32(0.f); + } + break; + default: + LOG_CRITICAL(HW_GPU, "Unknown lighting LUT input {}", (int)input); + UNIMPLEMENTED(); + index = ConstF32(0.f); + break; + } + + const Id sampler_index{ConstU32(static_cast(sampler))}; + if (abs) { + // LUT index is in the range of (0.0, 1.0) + index = lighting.light[light_num].two_sided_diffuse + ? 
OpFAbs(f32_id, index) + : OpFMax(f32_id, index, ConstF32(0.f)); + return lookup_lighting_lut_unsigned(sampler_index, index); + } else { + // LUT index is in the range of (-1.0, 1.0) + return lookup_lighting_lut_signed(sampler_index, index); + } + }; + + // Write the code to emulate each enabled light + for (u32 light_index = 0; light_index < lighting.src_num; ++light_index) { + const auto& light_config = lighting.light[light_index]; + + const auto GetLightMember = [&](s32 member) -> Id { + const Id member_type = member < 6 ? vec_ids.Get(3) : f32_id; + const Id light_num{ConstS32(static_cast(lighting.light[light_index].num.Value()))}; + return GetShaderDataMember(member_type, ConstS32(25), light_num, ConstS32(member)); + }; + + // Compute light vector (directional or positional) + const Id light_position{GetLightMember(4)}; + if (light_config.directional) { + light_vector = light_position; + } else { + light_vector = OpFAdd(vec_ids.Get(3), light_position, view); + } + + light_distance = OpLength(f32_id, light_vector); + light_vector = OpNormalize(vec_ids.Get(3), light_vector); + + spot_dir = GetLightMember(5); + half_vector = OpFAdd(vec_ids.Get(3), OpNormalize(vec_ids.Get(3), view), light_vector); + + // Compute dot product of light_vector and normal, adjust if lighting is one-sided or + // two-sided + if (light_config.two_sided_diffuse) { + dot_product = OpFAbs(f32_id, OpDot(f32_id, light_vector, normal)); + } else { + dot_product = OpFMax(f32_id, OpDot(f32_id, light_vector, normal), ConstF32(0.f)); + } + + // If enabled, clamp specular component if lighting result is zero + if (lighting.clamp_highlights) { + clamp_highlights = OpFSign(f32_id, dot_product); + } + + // If enabled, compute spot light attenuation value + Id spot_atten{ConstF32(1.f)}; + if (light_config.spot_atten_enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::SpotlightAttenuation)) { + const Id value{ + 
get_lut_value(LightingRegs::SpotlightAttenuationSampler(light_config.num), + light_config.num, lighting.lut_sp.type, lighting.lut_sp.abs_input)}; + spot_atten = OpFMul(f32_id, ConstF32(lighting.lut_sp.scale), value); + } + + // If enabled, compute distance attenuation value + Id dist_atten{ConstF32(1.f)}; + if (light_config.dist_atten_enable) { + const Id dist_atten_scale{GetLightMember(7)}; + const Id dist_atten_bias{GetLightMember(6)}; + const Id index{OpFma(f32_id, dist_atten_scale, light_distance, dist_atten_bias)}; + const Id clamped_index{OpFClamp(f32_id, index, ConstF32(0.f), ConstF32(1.f))}; + const Id sampler{ConstS32( + static_cast(LightingRegs::DistanceAttenuationSampler(light_config.num)))}; + dist_atten = lookup_lighting_lut_unsigned(sampler, clamped_index); + } + + if (light_config.geometric_factor_0 || light_config.geometric_factor_1) { + geo_factor = OpDot(f32_id, half_vector, half_vector); + const Id dot_div_geo{ + OpFMin(f32_id, OpFDiv(f32_id, dot_product, geo_factor), ConstF32(1.f))}; + const Id is_geo_factor_zero{OpFOrdEqual(bool_id, geo_factor, ConstF32(0.f))}; + geo_factor = OpSelect(f32_id, is_geo_factor_zero, ConstF32(0.f), dot_div_geo); + } + + // Specular 0 component + Id d0_lut_value{ConstF32(1.f)}; + if (lighting.lut_d0.enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::Distribution0)) { + // Lookup specular "distribution 0" LUT value + const Id value{get_lut_value(LightingRegs::LightingSampler::Distribution0, + light_config.num, lighting.lut_d0.type, + lighting.lut_d0.abs_input)}; + d0_lut_value = OpFMul(f32_id, ConstF32(lighting.lut_d0.scale), value); + } + + Id specular_0{OpVectorTimesScalar(vec_ids.Get(3), GetLightMember(0), d0_lut_value)}; + if (light_config.geometric_factor_0) { + specular_0 = OpVectorTimesScalar(vec_ids.Get(3), specular_0, geo_factor); + } + + // If enabled, lookup ReflectRed value, otherwise, 1.0 is used + Id refl_value_r{ConstF32(1.f)}; + if 
(lighting.lut_rr.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectRed)) { + const Id value{get_lut_value(LightingRegs::LightingSampler::ReflectRed, + light_config.num, lighting.lut_rr.type, + lighting.lut_rr.abs_input)}; + + refl_value_r = OpFMul(f32_id, ConstF32(lighting.lut_rr.scale), value); + } + + // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used + Id refl_value_g{refl_value_r}; + if (lighting.lut_rg.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectGreen)) { + const Id value{get_lut_value(LightingRegs::LightingSampler::ReflectGreen, + light_config.num, lighting.lut_rg.type, + lighting.lut_rg.abs_input)}; + + refl_value_g = OpFMul(f32_id, ConstF32(lighting.lut_rg.scale), value); + } + + // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used + Id refl_value_b{refl_value_r}; + if (lighting.lut_rb.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::ReflectBlue)) { + const Id value{get_lut_value(LightingRegs::LightingSampler::ReflectBlue, + light_config.num, lighting.lut_rb.type, + lighting.lut_rb.abs_input)}; + refl_value_b = OpFMul(f32_id, ConstF32(lighting.lut_rb.scale), value); + } + + // Specular 1 component + Id d1_lut_value{ConstF32(1.f)}; + if (lighting.lut_d1.enable && + LightingRegs::IsLightingSamplerSupported( + lighting.config, LightingRegs::LightingSampler::Distribution1)) { + // Lookup specular "distribution 1" LUT value + const Id value{get_lut_value(LightingRegs::LightingSampler::Distribution1, + light_config.num, lighting.lut_d1.type, + lighting.lut_d1.abs_input)}; + d1_lut_value = OpFMul(f32_id, ConstF32(lighting.lut_d1.scale), value); + } + + const Id refl_value{ + OpCompositeConstruct(vec_ids.Get(3), refl_value_r, refl_value_g, refl_value_b)}; + const Id light_specular_1{GetLightMember(1)}; + Id 
specular_1{OpFMul(vec_ids.Get(3), + OpVectorTimesScalar(vec_ids.Get(3), refl_value, d1_lut_value), + light_specular_1)}; + if (light_config.geometric_factor_1) { + specular_1 = OpVectorTimesScalar(vec_ids.Get(3), specular_1, geo_factor); + } + + // Fresnel + // Note: only the last entry in the light slots applies the Fresnel factor + if (light_index == lighting.src_num - 1 && lighting.lut_fr.enable && + LightingRegs::IsLightingSamplerSupported(lighting.config, + LightingRegs::LightingSampler::Fresnel)) { + // Lookup fresnel LUT value + Id value{get_lut_value(LightingRegs::LightingSampler::Fresnel, light_config.num, + lighting.lut_fr.type, lighting.lut_fr.abs_input)}; + value = OpFMul(f32_id, ConstF32(lighting.lut_fr.scale), value); + + // Enabled for diffuse lighting alpha component + if (lighting.enable_primary_alpha) { + diffuse_sum = OpCompositeInsert(vec_ids.Get(4), value, diffuse_sum, 3); + } + + // Enabled for the specular lighting alpha component + if (lighting.enable_secondary_alpha) { + specular_sum = OpCompositeInsert(vec_ids.Get(4), value, specular_sum, 3); + } + } + + const bool shadow_primary_enable = lighting.shadow_primary && light_config.shadow_enable; + const bool shadow_secondary_enable = + lighting.shadow_secondary && light_config.shadow_enable; + const Id shadow_rgb{OpVectorShuffle(vec_ids.Get(3), shadow, shadow, 0, 1, 2)}; + + const Id light_diffuse{GetLightMember(2)}; + const Id light_ambient{GetLightMember(3)}; + const Id diffuse_mul_dot{OpVectorTimesScalar(vec_ids.Get(3), light_diffuse, dot_product)}; + + // Compute primary fragment color (diffuse lighting) function + Id diffuse_sum_rgb{OpFAdd(vec_ids.Get(3), diffuse_mul_dot, light_ambient)}; + diffuse_sum_rgb = OpVectorTimesScalar(vec_ids.Get(3), diffuse_sum_rgb, dist_atten); + diffuse_sum_rgb = OpVectorTimesScalar(vec_ids.Get(3), diffuse_sum_rgb, spot_atten); + if (shadow_primary_enable) { + diffuse_sum_rgb = OpFMul(vec_ids.Get(3), diffuse_sum_rgb, shadow_rgb); + } + + // Compute secondary 
fragment color (specular lighting) function + const Id specular_01{OpFAdd(vec_ids.Get(3), specular_0, specular_1)}; + Id specular_sum_rgb{OpVectorTimesScalar(vec_ids.Get(3), specular_01, clamp_highlights)}; + specular_sum_rgb = OpVectorTimesScalar(vec_ids.Get(3), specular_sum_rgb, dist_atten); + specular_sum_rgb = OpVectorTimesScalar(vec_ids.Get(3), specular_sum_rgb, spot_atten); + if (shadow_secondary_enable) { + specular_sum_rgb = OpFMul(vec_ids.Get(3), specular_sum_rgb, shadow_rgb); + } + + // Accumulate the fragment colors + const Id diffuse_sum_rgba{PadVectorF32(diffuse_sum_rgb, vec_ids.Get(4), 0.f)}; + const Id specular_sum_rgba{PadVectorF32(specular_sum_rgb, vec_ids.Get(4), 0.f)}; + diffuse_sum = OpFAdd(vec_ids.Get(4), diffuse_sum, diffuse_sum_rgba); + specular_sum = OpFAdd(vec_ids.Get(4), specular_sum, specular_sum_rgba); + } + + // Apply shadow attenuation to alpha components if enabled + if (lighting.shadow_alpha) { + const Id shadow_a{OpCompositeExtract(f32_id, shadow, 3)}; + const Id shadow_a_vec{ + OpCompositeConstruct(vec_ids.Get(4), ConstF32(1.f, 1.f, 1.f), shadow_a)}; + if (lighting.enable_primary_alpha) { + diffuse_sum = OpFMul(vec_ids.Get(4), diffuse_sum, shadow_a_vec); + } + if (lighting.enable_secondary_alpha) { + specular_sum = OpFMul(vec_ids.Get(4), specular_sum, shadow_a_vec); + } + } + + // Sum final lighting result + const Id lighting_global_ambient{GetShaderDataMember(vec_ids.Get(3), ConstS32(24))}; + const Id lighting_global_ambient_rgba{ + PadVectorF32(lighting_global_ambient, vec_ids.Get(4), 0.f)}; + const Id zero_vec{ConstF32(0.f, 0.f, 0.f, 0.f)}; + const Id one_vec{ConstF32(1.f, 1.f, 1.f, 1.f)}; + diffuse_sum = OpFAdd(vec_ids.Get(4), diffuse_sum, lighting_global_ambient_rgba); + primary_fragment_color = OpFClamp(vec_ids.Get(4), diffuse_sum, zero_vec, one_vec); + secondary_fragment_color = OpFClamp(vec_ids.Get(4), specular_sum, zero_vec, one_vec); +} + +void FragmentModule::WriteTevStage(s32 index) { + const 
TexturingRegs::TevStageConfig stage = + static_cast(config.state.tev_stages[index]); + + // Detects if a TEV stage is configured to be skipped (to avoid generating unnecessary code) + const auto is_passthrough_tev_stage = [](const TevStageConfig& stage) { + return (stage.color_op == TevStageConfig::Operation::Replace && + stage.alpha_op == TevStageConfig::Operation::Replace && + stage.color_source1 == TevStageConfig::Source::Previous && + stage.alpha_source1 == TevStageConfig::Source::Previous && + stage.color_modifier1 == TevStageConfig::ColorModifier::SourceColor && + stage.alpha_modifier1 == TevStageConfig::AlphaModifier::SourceAlpha && + stage.GetColorMultiplier() == 1 && stage.GetAlphaMultiplier() == 1); + }; + + if (!is_passthrough_tev_stage(stage)) { + color_results_1 = AppendColorModifier(stage.color_modifier1, stage.color_source1, index); + color_results_2 = AppendColorModifier(stage.color_modifier2, stage.color_source2, index); + color_results_3 = AppendColorModifier(stage.color_modifier3, stage.color_source3, index); + + // Round the output of each TEV stage to maintain the PICA's 8 bits of precision + Id color_output{Byteround(AppendColorCombiner(stage.color_op), 3)}; + Id alpha_output{}; + + if (stage.color_op == TevStageConfig::Operation::Dot3_RGBA) { + // result of Dot3_RGBA operation is also placed to the alpha component + alpha_output = OpCompositeExtract(f32_id, color_output, 0); + } else { + alpha_results_1 = + AppendAlphaModifier(stage.alpha_modifier1, stage.alpha_source1, index); + alpha_results_2 = + AppendAlphaModifier(stage.alpha_modifier2, stage.alpha_source2, index); + alpha_results_3 = + AppendAlphaModifier(stage.alpha_modifier3, stage.alpha_source3, index); + + alpha_output = Byteround(AppendAlphaCombiner(stage.alpha_op)); + } + + color_output = OpVectorTimesScalar( + vec_ids.Get(3), color_output, ConstF32(static_cast(stage.GetColorMultiplier()))); + color_output = OpFClamp(vec_ids.Get(3), color_output, ConstF32(0.f, 0.f, 0.f), + 
ConstF32(1.f, 1.f, 1.f)); + alpha_output = + OpFMul(f32_id, alpha_output, ConstF32(static_cast(stage.GetAlphaMultiplier()))); + alpha_output = OpFClamp(f32_id, alpha_output, ConstF32(0.f), ConstF32(1.f)); + last_tex_env_out = OpCompositeConstruct(vec_ids.Get(4), color_output, alpha_output); + } + + combiner_buffer = next_combiner_buffer; + if (config.TevStageUpdatesCombinerBufferColor(index)) { + next_combiner_buffer = + OpVectorShuffle(vec_ids.Get(4), last_tex_env_out, next_combiner_buffer, 0, 1, 2, 7); + } + + if (config.TevStageUpdatesCombinerBufferAlpha(index)) { + next_combiner_buffer = + OpVectorShuffle(vec_ids.Get(4), next_combiner_buffer, last_tex_env_out, 0, 1, 2, 7); + } +} + +using ProcTexClamp = TexturingRegs::ProcTexClamp; +using ProcTexShift = TexturingRegs::ProcTexShift; +using ProcTexCombiner = TexturingRegs::ProcTexCombiner; +using ProcTexFilter = TexturingRegs::ProcTexFilter; + +void FragmentModule::WriteAlphaTestCondition(FramebufferRegs::CompareFunc func) { + using CompareFunc = FramebufferRegs::CompareFunc; + + // The compare func is to keep the fragment so we invert it to discard it + const auto compare = [this, func](Id alpha, Id alphatest_ref) { + switch (func) { + case CompareFunc::Equal: + return OpINotEqual(bool_id, alpha, alphatest_ref); + case CompareFunc::NotEqual: + return OpIEqual(bool_id, alpha, alphatest_ref); + case CompareFunc::LessThan: + return OpSGreaterThanEqual(bool_id, alpha, alphatest_ref); + case CompareFunc::LessThanOrEqual: + return OpSGreaterThan(bool_id, alpha, alphatest_ref); + case CompareFunc::GreaterThan: + return OpSLessThanEqual(bool_id, alpha, alphatest_ref); + case CompareFunc::GreaterThanOrEqual: + return OpSLessThan(bool_id, alpha, alphatest_ref); + default: + return Id{}; + } + }; + + // Don't check for kill, this is done earlier + switch (func) { + case CompareFunc::Always: // Do nothing + break; + case CompareFunc::Equal: + case CompareFunc::NotEqual: + case CompareFunc::LessThan: + case 
CompareFunc::LessThanOrEqual: + case CompareFunc::GreaterThan: + case CompareFunc::GreaterThanOrEqual: { + const Id alpha_scaled{ + OpFMul(f32_id, OpCompositeExtract(f32_id, last_tex_env_out, 3), ConstF32(255.f))}; + const Id alpha_int{OpConvertFToS(i32_id, alpha_scaled)}; + const Id alphatest_ref{GetShaderDataMember(i32_id, ConstS32(1))}; + const Id alpha_comp_ref{compare(alpha_int, alphatest_ref)}; + const Id kill_label{OpLabel()}; + const Id keep_label{OpLabel()}; + OpSelectionMerge(keep_label, spv::SelectionControlMask::MaskNone); + OpBranchConditional(alpha_comp_ref, kill_label, keep_label); + AddLabel(kill_label); + OpKill(); + AddLabel(keep_label); + break; + } + default: + LOG_CRITICAL(Render_Vulkan, "Unknown alpha test condition {}", func); + break; + } +} + +Id FragmentModule::SampleTexture(u32 texture_unit) { + const PicaFSConfigState& state = config.state; + const Id zero_vec{ConstF32(0.f, 0.f, 0.f, 0.f)}; + + // PICA's LOD formula for 2D textures. + // This LOD formula is the same as the LOD lower limit defined in OpenGL. 
+ // f(x, y) >= max{m_u, m_v, m_w} + // (See OpenGL 4.6 spec, 8.14.1 - Scale Factor and Level-of-Detail) + const auto sample_lod = [this, texture_unit](Id tex_id, Id texcoord_id) { + const Id sampled_image{OpLoad(TypeSampledImage(image2d_id), tex_id)}; + const Id tex_image{OpImage(image2d_id, sampled_image)}; + const Id tex_size{OpImageQuerySizeLod(ivec_ids.Get(2), tex_image, ConstS32(0))}; + const Id texcoord{OpLoad(vec_ids.Get(2), texcoord_id)}; + const Id coord{OpFMul(vec_ids.Get(2), texcoord, OpConvertSToF(vec_ids.Get(2), tex_size))}; + const Id abs_dfdx_coord{OpFAbs(vec_ids.Get(2), OpDPdx(vec_ids.Get(2), coord))}; + const Id abs_dfdy_coord{OpFAbs(vec_ids.Get(2), OpDPdy(vec_ids.Get(2), coord))}; + const Id d{OpFMax(vec_ids.Get(2), abs_dfdx_coord, abs_dfdy_coord)}; + const Id dx_dy_max{ + OpFMax(f32_id, OpCompositeExtract(f32_id, d, 0), OpCompositeExtract(f32_id, d, 1))}; + const Id lod{OpLog2(f32_id, dx_dy_max)}; + const Id lod_bias{GetShaderDataMember(f32_id, ConstS32(28), ConstU32(texture_unit))}; + const Id biased_lod{OpFAdd(f32_id, lod, lod_bias)}; + return OpImageSampleExplicitLod(vec_ids.Get(4), sampled_image, texcoord, + spv::ImageOperandsMask::Lod, biased_lod); + }; + + const auto sample = [this](Id tex_id, bool projection) { + const Id image_type = tex_id.value == tex_cube_id.value ? 
image_cube_id : image2d_id; + const Id sampled_image{OpLoad(TypeSampledImage(image_type), tex_id)}; + const Id texcoord0{OpLoad(vec_ids.Get(2), texcoord0_id)}; + const Id texcoord0_w{OpLoad(f32_id, texcoord0_w_id)}; + const Id coord{OpCompositeConstruct(vec_ids.Get(3), + OpCompositeExtract(f32_id, texcoord0, 0), + OpCompositeExtract(f32_id, texcoord0, 1), texcoord0_w)}; + if (projection) { + return OpImageSampleProjImplicitLod(vec_ids.Get(4), sampled_image, coord); + } else { + return OpImageSampleImplicitLod(vec_ids.Get(4), sampled_image, coord); + } + }; + + switch (texture_unit) { + case 0: + // Only unit 0 respects the texturing type + switch (state.texture0_type) { + case Pica::TexturingRegs::TextureConfig::Texture2D: + return sample_lod(tex0_id, texcoord0_id); + case Pica::TexturingRegs::TextureConfig::Projection2D: + return sample(tex0_id, true); + case Pica::TexturingRegs::TextureConfig::TextureCube: + return sample(tex_cube_id, false); + case Pica::TexturingRegs::TextureConfig::Shadow2D: + return SampleShadow(); + // case Pica::TexturingRegs::TextureConfig::ShadowCube: + // return "shadowTextureCube(texcoord0, texcoord0_w)"; + case Pica::TexturingRegs::TextureConfig::Disabled: + return zero_vec; + default: + LOG_CRITICAL(Render_Vulkan, "Unhandled texture type {:x}", state.texture0_type); + UNIMPLEMENTED(); + return zero_vec; + } + case 1: + return sample_lod(tex1_id, texcoord1_id); + case 2: + if (state.texture2_use_coord1) { + return sample_lod(tex2_id, texcoord1_id); + } else { + return sample_lod(tex2_id, texcoord2_id); + } + case 3: + if (state.proctex.enable) { + return OpFunctionCall(vec_ids.Get(4), proctex_func); + } else { + LOG_DEBUG(Render_Vulkan, "Using Texture3 without enabling it"); + return zero_vec; + } + default: + UNREACHABLE(); + return void_id; + } +} + +Id FragmentModule::CompareShadow(Id pixel, Id z) { + const Id pixel_d24{OpShiftRightLogical(u32_id, pixel, ConstS32(8))}; + const Id pixel_s8{OpConvertUToF(f32_id, OpBitwiseAnd(u32_id, 
pixel, ConstU32(255u)))}; + const Id s8_f32{OpFMul(f32_id, pixel_s8, ConstF32(1.f / 255.f))}; + const Id d24_leq_z{OpULessThanEqual(bool_id, pixel_d24, z)}; + return OpSelect(f32_id, d24_leq_z, ConstF32(0.f), s8_f32); +} + +Id FragmentModule::SampleShadow() { + const Id texcoord0{OpLoad(vec_ids.Get(2), texcoord0_id)}; + const Id texcoord0_w{OpLoad(f32_id, texcoord0_w_id)}; + const Id abs_min_w{OpFMul(f32_id, OpFMin(f32_id, OpFAbs(f32_id, texcoord0_w), ConstF32(1.f)), + ConstF32(16777215.f))}; + const Id shadow_texture_bias{GetShaderDataMember(i32_id, ConstS32(17))}; + const Id z_i32{OpSMax(i32_id, ConstS32(0), + OpISub(i32_id, OpConvertFToS(i32_id, abs_min_w), shadow_texture_bias))}; + const Id z{OpBitcast(u32_id, z_i32)}; + const Id shadow_texture_px{OpLoad(image_r32_id, shadow_texture_px_id)}; + const Id px_size{OpImageQuerySize(ivec_ids.Get(2), shadow_texture_px)}; + const Id coord{OpFma(vec_ids.Get(2), OpConvertSToF(vec_ids.Get(2), px_size), texcoord0, + ConstF32(-0.5f, -0.5f))}; + const Id coord_floor{OpFloor(vec_ids.Get(2), coord)}; + const Id f{OpFSub(vec_ids.Get(2), coord, coord_floor)}; + const Id i{OpConvertFToS(ivec_ids.Get(2), coord_floor)}; + + const auto sample_shadow2D = [&](Id uv) -> Id { + const Id true_label{OpLabel()}; + const Id false_label{OpLabel()}; + const Id end_label{OpLabel()}; + const Id uv_le_zero{OpSLessThan(bvec_ids.Get(2), uv, ConstS32(0, 0))}; + const Id uv_geq_size{OpSGreaterThanEqual(bvec_ids.Get(2), uv, px_size)}; + const Id cond{ + OpAny(bool_id, OpCompositeConstruct(bvec_ids.Get(4), uv_le_zero, uv_geq_size))}; + OpSelectionMerge(end_label, spv::SelectionControlMask::MaskNone); + OpBranchConditional(cond, true_label, false_label); + AddLabel(true_label); + OpBranch(end_label); + AddLabel(false_label); + const Id px_texel{OpImageRead(uvec_ids.Get(4), shadow_texture_px, uv)}; + const Id px_texel_x{OpCompositeExtract(u32_id, px_texel, 0)}; + const Id result{CompareShadow(px_texel_x, z)}; + OpBranch(end_label); + 
AddLabel(end_label); + return OpPhi(f32_id, ConstF32(1.f), true_label, result, false_label); + }; + + const Id s_xy{ + OpCompositeConstruct(vec_ids.Get(2), sample_shadow2D(i), + sample_shadow2D(OpIAdd(ivec_ids.Get(2), i, ConstS32(1, 0))))}; + const Id s_zw{OpCompositeConstruct( + vec_ids.Get(2), sample_shadow2D(OpIAdd(ivec_ids.Get(2), i, ConstS32(0, 1))), + sample_shadow2D(OpIAdd(ivec_ids.Get(2), i, ConstS32(1, 1))))}; + const Id f_yy{OpVectorShuffle(vec_ids.Get(2), f, f, 1, 1)}; + const Id t{OpFMix(vec_ids.Get(2), s_xy, s_zw, f_yy)}; + const Id t_x{OpCompositeExtract(f32_id, t, 0)}; + const Id t_y{OpCompositeExtract(f32_id, t, 1)}; + const Id a_x{OpCompositeExtract(f32_id, f, 0)}; + const Id val{OpFMix(f32_id, t_x, t_y, a_x)}; + return OpCompositeConstruct(vec_ids.Get(4), val, val, val, val); +} + +Id FragmentModule::AppendProcTexShiftOffset(Id v, ProcTexShift mode, ProcTexClamp clamp_mode) { + const Id offset{clamp_mode == ProcTexClamp::MirroredRepeat ? ConstF32(1.f) : ConstF32(0.5f)}; + const Id v_i32{OpConvertFToS(i32_id, v)}; + + const auto shift = [&](bool even) -> Id { + const Id temp1{ + OpSDiv(i32_id, even ? 
OpIAdd(i32_id, v_i32, ConstS32(1)) : v_i32, ConstS32(2))}; + const Id temp2{OpConvertSToF(f32_id, OpSMod(i32_id, temp1, ConstS32(2)))}; + return OpFMul(f32_id, offset, temp2); + }; + + switch (mode) { + case ProcTexShift::None: + return ConstF32(0.f); + case ProcTexShift::Odd: + return shift(false); + case ProcTexShift::Even: + return shift(true); + default: + LOG_CRITICAL(Render_Vulkan, "Unknown shift mode {}", mode); + return ConstF32(0.f); + } +}
+
+// Emits the instructions that apply the procedural-texture clamp `mode` to the scalar
+// coordinate `var` and returns the id of the clamped f32 value.
+Id FragmentModule::AppendProcTexClamp(Id var, ProcTexClamp mode) {
+    const Id f_zero{ConstF32(0.f)};
+    const Id f_one{ConstF32(1.f)};
+
+    switch (mode) {
+    case ProcTexClamp::ToZero: {
+        // var > 1 ? 0 : var
+        const Id above_one{OpFOrdGreaterThan(bool_id, var, f_one)};
+        return OpSelect(f32_id, above_one, f_zero, var);
+    }
+    case ProcTexClamp::ToEdge:
+        // min(var, 1)
+        return OpFMin(f32_id, var, f_one);
+    case ProcTexClamp::SymmetricalRepeat:
+        // fract(var)
+        return OpFract(f32_id, var);
+    case ProcTexClamp::MirroredRepeat: {
+        // int(var) % 2 == 0 ? fract(var) : 1 - fract(var)
+        const Id frac_part{OpFract(f32_id, var)};
+        const Id int_part{OpConvertFToS(i32_id, var)};
+        const Id parity{OpSMod(i32_id, int_part, ConstS32(2))};
+        const Id is_even{OpIEqual(bool_id, parity, ConstS32(0))};
+        const Id mirrored{OpFSub(f32_id, f_one, frac_part)};
+        return OpSelect(f32_id, is_even, frac_part, mirrored);
+    }
+    case ProcTexClamp::Pulse: {
+        // var > 0.5 ? 1 : 0
+        const Id above_half{OpFOrdGreaterThan(bool_id, var, ConstF32(0.5f))};
+        return OpSelect(f32_id, above_half, f_one, f_zero);
+    }
+    default:
+        // Unknown modes are reported and then handled like ToEdge
+        LOG_CRITICAL(Render_Vulkan, "Unknown clamp mode {}", mode);
+        return OpFMin(f32_id, var, f_one);
+    }
+}
+
+Id FragmentModule::AppendProcTexCombineAndMap(ProcTexCombiner combiner, Id u, Id v, Id offset) { + const auto combined = [&]() -> Id { + const Id u2v2{OpFma(f32_id, u, u, OpFMul(f32_id, v, v))}; + switch (combiner) { + case ProcTexCombiner::U: + return u; + case ProcTexCombiner::U2: + return OpFMul(f32_id, u, u); + case TexturingRegs::ProcTexCombiner::V: + return v; + case TexturingRegs::ProcTexCombiner::V2: + return OpFMul(f32_id, v, v); + case TexturingRegs::ProcTexCombiner::Add: + return OpFMul(f32_id, OpFAdd(f32_id, u, v), ConstF32(0.5f)); + case
TexturingRegs::ProcTexCombiner::Add2: + return OpFMul(f32_id, u2v2, ConstF32(0.5f)); + case TexturingRegs::ProcTexCombiner::SqrtAdd2: + return OpFMin(f32_id, OpSqrt(f32_id, u2v2), ConstF32(1.f)); + case TexturingRegs::ProcTexCombiner::Min: + return OpFMin(f32_id, u, v); + case TexturingRegs::ProcTexCombiner::Max: + return OpFMax(f32_id, u, v); + case TexturingRegs::ProcTexCombiner::RMax: { + const Id r{OpFma(f32_id, OpFAdd(f32_id, u, v), ConstF32(0.5f), OpSqrt(f32_id, u2v2))}; + return OpFMin(f32_id, OpFMul(f32_id, r, ConstF32(0.5f)), ConstF32(1.f)); + } + default: + LOG_CRITICAL(Render_Vulkan, "Unknown combiner {}", combiner); + return ConstF32(0.f); + } + }(); + + return ProcTexLookupLUT(offset, combined); +}
+
+// Emits the parameterless SPIR-V function stored in `proctex_func`, which evaluates the
+// procedural texture and returns a 4-component color.
+//
+// Steps, in emission order: pick the texture coordinate set, derive the LOD, compute the u/v
+// shift offsets, optionally perturb the coordinate with noise, shift and clamp u/v, map them
+// through the color combiner LUT, sample the color LUT according to the filter mode and, when
+// separate alpha is enabled, overwrite the alpha component with its own LUT lookup.
+//
+// NOTE(review): the template arguments of DefineVar and static_cast are missing in this copy
+// of the patch; those calls are left exactly as found - confirm against the real file.
+void FragmentModule::DefineProcTexSampler() {
+    const Id func_type{TypeFunction(vec_ids.Get(4))};
+    proctex_func = OpFunction(vec_ids.Get(4), spv::FunctionControlMask::MaskNone, func_type);
+    AddLabel(OpLabel());
+
+    // Define noise tables at the beginning of the function
+    if (config.state.proctex.noise_enable) {
+        noise1d_table =
+            DefineVar(TypeArray(i32_id, ConstU32(16u)), spv::StorageClass::Function);
+        noise2d_table =
+            DefineVar(TypeArray(i32_id, ConstU32(16u)), spv::StorageClass::Function);
+    }
+    lut_offsets = DefineVar(TypeArray(i32_id, ConstU32(8u)), spv::StorageClass::Function);
+
+    // uv = abs(texcoordN), where N is selected by proctex.coord
+    Id uv{};
+    if (config.state.proctex.coord < 3) {
+        Id texcoord_id{};
+        switch (config.state.proctex.coord.Value()) {
+        case 0:
+            texcoord_id = texcoord0_id;
+            break;
+        case 1:
+            texcoord_id = texcoord1_id;
+            break;
+        case 2:
+            texcoord_id = texcoord2_id;
+            break;
+        }
+
+        const Id texcoord{OpLoad(vec_ids.Get(2), texcoord_id)};
+        uv = OpFAbs(vec_ids.Get(2), texcoord);
+    } else {
+        // Out-of-range selector: report it and fall back to texcoord0
+        LOG_CRITICAL(Render_Vulkan, "Unexpected proctex.coord >= 3");
+        uv = OpFAbs(vec_ids.Get(2), OpLoad(vec_ids.Get(2), texcoord0_id));
+    }
+
+    // This LOD formula is the same as the LOD upper limit defined in OpenGL.
+    // f(x, y) <= m_u + m_v + m_w
+    // (See OpenGL 4.6 spec, 8.14.1 - Scale Factor and Level-of-Detail)
+    // Note: this is different from the one normal 2D textures use.
+    const Id uv_1{OpFAbs(vec_ids.Get(2), OpDPdx(vec_ids.Get(2), uv))};
+    const Id uv_2{OpFAbs(vec_ids.Get(2), OpDPdy(vec_ids.Get(2), uv))};
+    const Id duv{OpFMax(vec_ids.Get(2), uv_1, uv_2)};
+
+    // unlike normal texture, the bias is inside the log2
+    const Id proctex_bias{GetShaderDataMember(f32_id, ConstS32(16))};
+    const Id bias{
+        OpFMul(f32_id, ConstF32(static_cast(config.state.proctex.lut_width)), proctex_bias)};
+    const Id duv_xy{
+        OpFAdd(f32_id, OpCompositeExtract(f32_id, duv, 0), OpCompositeExtract(f32_id, duv, 1))};
+
+    // lod = log2(abs(bias) * (duv.x + duv.y)), forced to 0 when the bias uniform is 0, then
+    // clamped to [max(0, lod_min), min(7, lod_max)]
+    Id lod{OpLog2(f32_id, OpFMul(f32_id, OpFAbs(f32_id, bias), duv_xy))};
+    lod = OpSelect(f32_id, OpFOrdEqual(bool_id, proctex_bias, ConstF32(0.f)), ConstF32(0.f), lod);
+    lod = OpFClamp(f32_id, lod,
+                   ConstF32(std::max(0.0f, static_cast(config.state.proctex.lod_min))),
+                   ConstF32(std::min(7.0f, static_cast(config.state.proctex.lod_max))));
+
+    // Get shift offset before noise generation
+    // (the u shift is derived from uv component 1 and the v shift from component 0)
+    const Id u_shift{AppendProcTexShiftOffset(OpCompositeExtract(f32_id, uv, 1),
+                                              config.state.proctex.u_shift,
+                                              config.state.proctex.u_clamp)};
+    const Id v_shift{AppendProcTexShiftOffset(OpCompositeExtract(f32_id, uv, 0),
+                                              config.state.proctex.v_shift,
+                                              config.state.proctex.v_clamp)};
+
+    // Generate noise
+    if (config.state.proctex.noise_enable) {
+        const Id proctex_noise_a{GetShaderDataMember(vec_ids.Get(2), ConstS32(22))};
+        const Id noise_coef{ProcTexNoiseCoef(uv)};
+        uv = OpFAdd(vec_ids.Get(2), uv,
+                    OpVectorTimesScalar(vec_ids.Get(2), proctex_noise_a, noise_coef));
+        uv = OpFAbs(vec_ids.Get(2), uv);
+    }
+
+    // Shift
+    Id u{OpFAdd(f32_id, OpCompositeExtract(f32_id, uv, 0), u_shift)};
+    Id v{OpFAdd(f32_id, OpCompositeExtract(f32_id, uv, 1), v_shift)};
+
+    // Clamp
+    u = AppendProcTexClamp(u, config.state.proctex.u_clamp);
+    v = AppendProcTexClamp(v, config.state.proctex.v_clamp);
+
+    // Combine and map
+    const Id proctex_color_map_offset{GetShaderDataMember(i32_id, ConstS32(12))};
+    const Id lut_coord{AppendProcTexCombineAndMap(config.state.proctex.color_combiner, u, v,
+                                                  proctex_color_map_offset)};
+
+    Id final_color{};
+    switch (config.state.proctex.lut_filter) {
+    case ProcTexFilter::Linear:
+    case ProcTexFilter::Nearest: {
+        // No mipmapping: always sample level 0
+        final_color = SampleProcTexColor(lut_coord, ConstS32(0));
+        break;
+    }
+    case ProcTexFilter::NearestMipmapNearest:
+    case ProcTexFilter::LinearMipmapNearest: {
+        // Sample the single level nearest to the computed LOD
+        final_color = SampleProcTexColor(lut_coord, OpConvertFToS(i32_id, OpRound(f32_id, lod)));
+        break;
+    }
+    case ProcTexFilter::NearestMipmapLinear:
+    case ProcTexFilter::LinearMipmapLinear: {
+        // Blend the two levels surrounding the computed LOD
+        // NOTE(review): lod is clamped to at most 7 and lut_offsets has 8 entries, so the
+        // second sample can request level 8 - confirm that this is intended.
+        const Id lod_i{OpConvertFToS(i32_id, lod)};
+        const Id lod_f{OpFract(f32_id, lod)};
+        const Id color1{SampleProcTexColor(lut_coord, lod_i)};
+        const Id color2{SampleProcTexColor(lut_coord, OpIAdd(i32_id, lod_i, ConstS32(1)))};
+        // FMix requires the result type and all three operands to share one type. Both colors
+        // are 4-component vectors, so splat the scalar blend factor and mix as a vector.
+        const Id lod_f_vec{OpCompositeConstruct(vec_ids.Get(4), lod_f, lod_f, lod_f, lod_f)};
+        final_color = OpFMix(vec_ids.Get(4), color1, color2, lod_f_vec);
+        break;
+    }
+    }
+
+    if (config.state.proctex.separate_alpha) {
+        // Alpha comes from its own combiner/LUT and replaces component 3 of the color
+        const Id proctex_alpha_map_offset{GetShaderDataMember(i32_id, ConstS32(13))};
+        const Id final_alpha{AppendProcTexCombineAndMap(config.state.proctex.alpha_combiner, u, v,
+                                                        proctex_alpha_map_offset)};
+        final_color = OpCompositeInsert(vec_ids.Get(4), final_alpha, final_color, 3);
+    }
+
+    OpReturnValue(final_color);
+    OpFunctionEnd();
+}
+
+// Quantizes `variable_id` to multiples of 1/255 by emitting round(x * 255) * (1 / 255).
+// `size` is the component count of the value: sizes above 1 use vector-times-scalar
+// instructions on vec_ids.Get(size), otherwise the scalar f32 path is used.
+Id FragmentModule::Byteround(Id variable_id, u32 size) {
+    if (size > 1) {
+        const Id scaled_vec_id{
+            OpVectorTimesScalar(vec_ids.Get(size), variable_id, ConstF32(255.f))};
+        const Id rounded_id{OpRound(vec_ids.Get(size), scaled_vec_id)};
+        return OpVectorTimesScalar(vec_ids.Get(size), rounded_id, ConstF32(1.f / 255.f));
+    } else {
+        const Id rounded_id{OpRound(f32_id, OpFMul(f32_id, variable_id, ConstF32(255.f)))};
+        return OpFMul(f32_id, rounded_id, ConstF32(1.f / 255.f));
+    }
+}
+
+Id FragmentModule::ProcTexLookupLUT(Id offset, Id coord) { + coord = OpFMul(f32_id, coord, ConstF32(128.f)); +
const Id index_i{OpFClamp(f32_id, OpFloor(f32_id, coord), ConstF32(0.f), ConstF32(127.0f))}; + const Id index_f{OpFSub(f32_id, coord, index_i)}; + const Id p{OpIAdd(i32_id, OpConvertFToS(i32_id, index_i), offset)}; + if (!Sirit::ValidId(texture_buffer_lut_rg)) { + const Id sampled_image{TypeSampledImage(image_buffer_id)}; + texture_buffer_lut_rg = OpLoad(sampled_image, texture_buffer_lut_rg_id); + } + const Id entry{ + OpImageFetch(vec_ids.Get(4), OpImage(image_buffer_id, texture_buffer_lut_rg), p)}; + const Id entry_r{OpCompositeExtract(f32_id, entry, 0)}; + const Id entry_g{OpCompositeExtract(f32_id, entry, 1)}; + return OpFClamp(f32_id, OpFma(f32_id, entry_g, index_f, entry_r), ConstF32(0.f), ConstF32(1.f)); +}; + +Id FragmentModule::ProcTexNoiseCoef(Id x) { + // Noise utility + const auto proctex_noise_rand1D = [&](Id v) -> Id { + InitTableS32(noise1d_table, 0, 4, 10, 8, 4, 9, 7, 12, 5, 15, 13, 14, 11, 15, 2, 11); + const Id table_ptr{TypePointer(spv::StorageClass::Function, i32_id)}; + const Id left_tmp{OpIAdd(i32_id, OpSMod(i32_id, v, ConstS32(9)), ConstS32(2))}; + const Id left{OpBitwiseAnd(i32_id, OpIMul(i32_id, left_tmp, ConstS32(3)), ConstS32(0xF))}; + const Id table_index{OpBitwiseAnd(i32_id, OpSDiv(i32_id, v, ConstS32(9)), ConstS32(0xF))}; + const Id table_value{OpLoad(i32_id, OpAccessChain(table_ptr, noise1d_table, table_index))}; + return OpBitwiseXor(i32_id, left, table_value); + }; + + const auto proctex_noise_rand2D = [&](Id point) -> Id { + InitTableS32(noise2d_table, 10, 2, 15, 8, 0, 7, 4, 5, 5, 13, 2, 6, 13, 9, 3, 14); + const Id table_ptr{TypePointer(spv::StorageClass::Function, i32_id)}; + const Id point_x{OpConvertFToS(i32_id, OpCompositeExtract(f32_id, point, 0))}; + const Id point_y{OpConvertFToS(i32_id, OpCompositeExtract(f32_id, point, 1))}; + const Id u2{proctex_noise_rand1D(point_x)}; + const Id cond{OpIEqual(bool_id, OpBitwiseAnd(i32_id, u2, ConstS32(3)), ConstS32(1))}; + const Id table_value{OpLoad(i32_id, OpAccessChain(table_ptr, 
noise2d_table, u2))}; + Id v2{proctex_noise_rand1D(point_y)}; + v2 = OpIAdd(i32_id, v2, OpSelect(i32_id, cond, ConstS32(4), ConstS32(0))); + v2 = OpBitwiseXor(i32_id, v2, + OpIMul(i32_id, OpBitwiseAnd(i32_id, u2, ConstS32(1)), ConstS32(6))); + v2 = OpIAdd(i32_id, v2, OpIAdd(i32_id, u2, ConstS32(10))); + v2 = OpBitwiseAnd(i32_id, v2, ConstS32(0xF)); + v2 = OpBitwiseXor(i32_id, v2, table_value); + return OpFma(f32_id, OpConvertSToF(f32_id, v2), ConstF32(2.f / 15.f), ConstF32(-1.f)); + }; + + const Id proctex_noise_f{GetShaderDataMember(vec_ids.Get(2), ConstS32(21))}; + const Id proctex_noise_p{GetShaderDataMember(vec_ids.Get(2), ConstS32(23))}; + const Id grid{OpFMul(vec_ids.Get(2), + OpVectorTimesScalar(vec_ids.Get(2), proctex_noise_f, ConstF32(9.f)), + OpFAbs(vec_ids.Get(2), OpFAdd(vec_ids.Get(2), x, proctex_noise_p)))}; + const Id point{OpFloor(vec_ids.Get(2), grid)}; + const Id frac{OpFSub(vec_ids.Get(2), grid, point)}; + const Id frac_x{OpCompositeExtract(f32_id, frac, 0)}; + const Id frac_y{OpCompositeExtract(f32_id, frac, 1)}; + const Id frac_x_y{OpFAdd(f32_id, frac_x, frac_y)}; + const Id g0{OpFMul(f32_id, proctex_noise_rand2D(point), frac_x_y)}; + const Id frac_x_y_min_one{OpFSub(f32_id, frac_x_y, ConstF32(1.f))}; + const Id g1{OpFMul(f32_id, + proctex_noise_rand2D(OpFAdd(vec_ids.Get(2), point, ConstF32(1.f, 0.f))), + frac_x_y_min_one)}; + const Id g2{OpFMul(f32_id, + proctex_noise_rand2D(OpFAdd(vec_ids.Get(2), point, ConstF32(0.f, 1.f))), + frac_x_y_min_one)}; + const Id frac_x_y_min_two{OpFSub(f32_id, frac_x_y, ConstF32(2.f))}; + const Id g3{OpFMul(f32_id, + proctex_noise_rand2D(OpFAdd(vec_ids.Get(2), point, ConstF32(1.f, 1.f))), + frac_x_y_min_two)}; + const Id proctex_noise_lut_offset{GetShaderDataMember(i32_id, ConstS32(11))}; + const Id x_noise{ProcTexLookupLUT(proctex_noise_lut_offset, frac_x)}; + const Id y_noise{ProcTexLookupLUT(proctex_noise_lut_offset, frac_y)}; + const Id x0{OpFMix(f32_id, g0, g1, x_noise)}; + const Id x1{OpFMix(f32_id, g2, g3, 
x_noise)}; + return OpFMix(f32_id, x0, x1, y_noise); +}
+
+// Emits the lookup of the procedural-texture color LUT at normalized coordinate `lut_coord`
+// for mip `level` and returns the id of the fetched 4-component value. Linear filter modes
+// return color + fract(coord) * difference, nearest modes return the texel at the rounded
+// coordinate. Filter values not listed in the switch yield a default-constructed Id.
+Id FragmentModule::SampleProcTexColor(Id lut_coord, Id level) {
+    // Width of the LUT at this level: lut_width >> level
+    const Id level_width{
+        OpShiftRightArithmetic(i32_id, ConstS32(config.state.proctex.lut_width), level)};
+    const Id i32_func_ptr{TypePointer(spv::StorageClass::Function, i32_id)};
+    // Offsets for level 4-7 seem to be hardcoded
+    InitTableS32(lut_offsets, config.state.proctex.lut_offset0, config.state.proctex.lut_offset1,
+                 config.state.proctex.lut_offset2, config.state.proctex.lut_offset3, 0xF0, 0xF8,
+                 0xFC, 0xFE);
+    const Id level_offset_ptr{OpAccessChain(i32_func_ptr, lut_offsets, level)};
+    const Id level_offset{OpLoad(i32_id, level_offset_ptr)};
+    // For the color lut, coord=0.0 is lut[offset] and coord=1.0 is lut[offset+width-1]
+    const Id last_index{OpISub(i32_id, level_width, ConstS32(1))};
+    const Id scaled_coord{OpFMul(f32_id, lut_coord, OpConvertSToF(f32_id, last_index))};
+
+    // The sampled-image load is emitted once and reused by later calls
+    if (!Sirit::ValidId(texture_buffer_lut_rgba)) {
+        texture_buffer_lut_rgba =
+            OpLoad(TypeSampledImage(image_buffer_id), texture_buffer_lut_rgba_id);
+    }
+
+    const Id color_lut_base{GetShaderDataMember(i32_id, ConstS32(14))};
+    const Id lut_image{OpImage(image_buffer_id, texture_buffer_lut_rgba)};
+
+    // color[i] + fract(coord) * diff[i], with i = int(coord) + level offset
+    const auto fetch_interpolated = [&]() -> Id {
+        const Id truncated{OpConvertFToS(i32_id, scaled_coord)};
+        const Id entry_index{OpIAdd(i32_id, truncated, level_offset)};
+        const Id blend{OpFract(f32_id, scaled_coord)};
+        const Id diff_lut_base{GetShaderDataMember(i32_id, ConstS32(15))};
+        const Id color_pos{OpIAdd(i32_id, entry_index, color_lut_base)};
+        const Id diff_pos{OpIAdd(i32_id, entry_index, diff_lut_base)};
+        const Id color_texel{OpImageFetch(vec_ids.Get(4), lut_image, color_pos)};
+        const Id diff_texel{OpImageFetch(vec_ids.Get(4), lut_image, diff_pos)};
+        const Id weighted_diff{OpVectorTimesScalar(vec_ids.Get(4), diff_texel, blend)};
+        return OpFAdd(vec_ids.Get(4), color_texel, weighted_diff);
+    };
+
+    // color[round(coord + level offset)]
+    const auto fetch_nearest = [&]() -> Id {
+        const Id level_offset_f{OpConvertSToF(f32_id, level_offset)};
+        const Id absolute_coord{OpFAdd(f32_id, scaled_coord, level_offset_f)};
+        const Id nearest_index{OpConvertFToS(i32_id, OpRound(f32_id, absolute_coord))};
+        const Id texel_pos{OpIAdd(i32_id, nearest_index, color_lut_base)};
+        return OpImageFetch(vec_ids.Get(4), lut_image, texel_pos);
+    };
+
+    switch (config.state.proctex.lut_filter) {
+    case ProcTexFilter::Linear:
+    case ProcTexFilter::LinearMipmapLinear:
+    case ProcTexFilter::LinearMipmapNearest:
+        return fetch_interpolated();
+    case ProcTexFilter::Nearest:
+    case ProcTexFilter::NearestMipmapLinear:
+    case ProcTexFilter::NearestMipmapNearest:
+        return fetch_nearest();
+    }
+
+    return Id{};
+}
+
+// Emits a lighting LUT lookup and returns the id of entry.g * delta + entry.r, where entry
+// is the texel fetched at (per-LUT offset + index). The per-LUT offset is read from shader
+// data member 19, indexed by lut_index >> 2 and lut_index & 3.
+Id FragmentModule::LookupLightingLUT(Id lut_index, Id index, Id delta) {
+    // Only load the texture buffer lut once
+    const bool lut_loaded = Sirit::ValidId(texture_buffer_lut_lf);
+    if (!lut_loaded) {
+        texture_buffer_lut_lf = OpLoad(TypeSampledImage(image_buffer_id), texture_buffer_lut_lf_id);
+    }
+
+    const Id outer_idx{OpShiftRightArithmetic(i32_id, lut_index, ConstS32(2))};
+    const Id inner_idx{OpBitwiseAnd(i32_id, lut_index, ConstS32(3))};
+    const Id base_offset{GetShaderDataMember(i32_id, ConstS32(19), outer_idx, inner_idx)};
+    const Id texel_pos{OpIAdd(i32_id, base_offset, index)};
+    const Id lut_image{OpImage(image_buffer_id, texture_buffer_lut_lf)};
+    const Id texel{OpImageFetch(vec_ids.Get(4), lut_image, texel_pos)};
+    const Id texel_r{OpCompositeExtract(f32_id, texel, 0)};
+    const Id texel_g{OpCompositeExtract(f32_id, texel, 1)};
+    return OpFma(f32_id, texel_g, delta, texel_r);
+}
+
+Id FragmentModule::AppendSource(TevStageConfig::Source source, s32 index) { + using Source = TevStageConfig::Source; + switch (source) { + case Source::PrimaryColor: + return rounded_primary_color; + case Source::PrimaryFragmentColor: + return primary_fragment_color; + case Source::SecondaryFragmentColor: + return secondary_fragment_color; + case Source::Texture0: + return SampleTexture(0); + case Source::Texture1: + return SampleTexture(1); + case Source::Texture2: + return SampleTexture(2); + case Source::Texture3: + return SampleTexture(3); + case Source::PreviousBuffer: + return combiner_buffer; + case Source::Constant: + return GetShaderDataMember(vec_ids.Get(4), ConstS32(26), ConstS32(index)); + case Source::Previous: + return last_tex_env_out; + default:
+ LOG_CRITICAL(Render_Vulkan, "Unknown source op {}", source); + return ConstF32(0.f, 0.f, 0.f, 0.f); + } +} + +Id FragmentModule::AppendColorModifier(TevStageConfig::ColorModifier modifier, + TevStageConfig::Source source, s32 index) { + using ColorModifier = TevStageConfig::ColorModifier; + const Id source_color{AppendSource(source, index)}; + const Id one_vec{ConstF32(1.f, 1.f, 1.f)}; + + const auto shuffle = [&](s32 r, s32 g, s32 b) -> Id { + return OpVectorShuffle(vec_ids.Get(3), source_color, source_color, r, g, b); + }; + + switch (modifier) { + case ColorModifier::SourceColor: + return shuffle(0, 1, 2); + case ColorModifier::OneMinusSourceColor: + return OpFSub(vec_ids.Get(3), one_vec, shuffle(0, 1, 2)); + case ColorModifier::SourceRed: + return shuffle(0, 0, 0); + case ColorModifier::OneMinusSourceRed: + return OpFSub(vec_ids.Get(3), one_vec, shuffle(0, 0, 0)); + case ColorModifier::SourceGreen: + return shuffle(1, 1, 1); + case ColorModifier::OneMinusSourceGreen: + return OpFSub(vec_ids.Get(3), one_vec, shuffle(1, 1, 1)); + case ColorModifier::SourceBlue: + return shuffle(2, 2, 2); + case ColorModifier::OneMinusSourceBlue: + return OpFSub(vec_ids.Get(3), one_vec, shuffle(2, 2, 2)); + case ColorModifier::SourceAlpha: + return shuffle(3, 3, 3); + case ColorModifier::OneMinusSourceAlpha: + return OpFSub(vec_ids.Get(3), one_vec, shuffle(3, 3, 3)); + default: + LOG_CRITICAL(Render_Vulkan, "Unknown color modifier op {}", modifier); + return one_vec; + } +} + +Id FragmentModule::AppendAlphaModifier(TevStageConfig::AlphaModifier modifier, + TevStageConfig::Source source, s32 index) { + using AlphaModifier = TevStageConfig::AlphaModifier; + const Id source_color{AppendSource(source, index)}; + const Id one_f32{ConstF32(1.f)}; + + const auto component = [&](s32 c) -> Id { return OpCompositeExtract(f32_id, source_color, c); }; + + switch (modifier) { + case AlphaModifier::SourceAlpha: + return component(3); + case AlphaModifier::OneMinusSourceAlpha: + return 
OpFSub(f32_id, one_f32, component(3)); + case AlphaModifier::SourceRed: + return component(0); + case AlphaModifier::OneMinusSourceRed: + return OpFSub(f32_id, one_f32, component(0)); + case AlphaModifier::SourceGreen: + return component(1); + case AlphaModifier::OneMinusSourceGreen: + return OpFSub(f32_id, one_f32, component(1)); + case AlphaModifier::SourceBlue: + return component(2); + case AlphaModifier::OneMinusSourceBlue: + return OpFSub(f32_id, one_f32, component(2)); + default: + LOG_CRITICAL(Render_Vulkan, "Unknown alpha modifier op {}", modifier); + return one_f32; + } +} + +Id FragmentModule::AppendColorCombiner(Pica::TexturingRegs::TevStageConfig::Operation operation) { + using Operation = TevStageConfig::Operation; + const Id half_vec{ConstF32(0.5f, 0.5f, 0.5f)}; + const Id one_vec{ConstF32(1.f, 1.f, 1.f)}; + const Id zero_vec{ConstF32(0.f, 0.f, 0.f)}; + Id color{}; + + switch (operation) { + case Operation::Replace: + color = color_results_1; + break; + case Operation::Modulate: + color = OpFMul(vec_ids.Get(3), color_results_1, color_results_2); + break; + case Operation::Add: + color = OpFAdd(vec_ids.Get(3), color_results_1, color_results_2); + break; + case Operation::AddSigned: + color = OpFSub(vec_ids.Get(3), OpFAdd(vec_ids.Get(3), color_results_1, color_results_2), + half_vec); + break; + case Operation::Lerp: + color = OpFMix(vec_ids.Get(3), color_results_2, color_results_1, color_results_3); + break; + case Operation::Subtract: + color = OpFSub(vec_ids.Get(3), color_results_1, color_results_2); + break; + case Operation::MultiplyThenAdd: + color = OpFma(vec_ids.Get(3), color_results_1, color_results_2, color_results_3); + break; + case Operation::AddThenMultiply: + color = OpFMin(vec_ids.Get(3), OpFAdd(vec_ids.Get(3), color_results_1, color_results_2), + one_vec); + color = OpFMul(vec_ids.Get(3), color, color_results_3); + break; + case Operation::Dot3_RGB: + case Operation::Dot3_RGBA: + color = OpDot(f32_id, OpFSub(vec_ids.Get(3), 
color_results_1, half_vec), + OpFSub(vec_ids.Get(3), color_results_2, half_vec)); + color = OpFMul(f32_id, color, ConstF32(4.f)); + color = OpCompositeConstruct(vec_ids.Get(3), color, color, color); + break; + default: + color = zero_vec; + LOG_CRITICAL(Render_Vulkan, "Unknown color combiner operation: {}", operation); + break; + } + + // Clamp result to 0.0, 1.0 + return OpFClamp(vec_ids.Get(3), color, zero_vec, one_vec); +} + +Id FragmentModule::AppendAlphaCombiner(TevStageConfig::Operation operation) { + using Operation = TevStageConfig::Operation; + Id color{}; + + switch (operation) { + case Operation::Replace: + color = alpha_results_1; + break; + case Operation::Modulate: + color = OpFMul(f32_id, alpha_results_1, alpha_results_2); + break; + case Operation::Add: + color = OpFAdd(f32_id, alpha_results_1, alpha_results_2); + break; + case Operation::AddSigned: + color = OpFSub(f32_id, OpFAdd(f32_id, alpha_results_1, alpha_results_2), ConstF32(0.5f)); + break; + case Operation::Lerp: + color = OpFMix(f32_id, alpha_results_2, alpha_results_1, alpha_results_3); + break; + case Operation::Subtract: + color = OpFSub(f32_id, alpha_results_1, alpha_results_2); + break; + case Operation::MultiplyThenAdd: + color = OpFma(f32_id, alpha_results_1, alpha_results_2, alpha_results_3); + break; + case Operation::AddThenMultiply: + color = OpFMin(f32_id, OpFAdd(f32_id, alpha_results_1, alpha_results_2), ConstF32(1.f)); + color = OpFMul(f32_id, color, alpha_results_3); + break; + default: + color = ConstF32(0.f); + LOG_CRITICAL(Render_Vulkan, "Unknown alpha combiner operation: {}", operation); + break; + } + + return OpFClamp(f32_id, color, ConstF32(0.f), ConstF32(1.f)); +} + +void FragmentModule::DefineArithmeticTypes() { + void_id = Name(TypeVoid(), "void_id"); + bool_id = Name(TypeBool(), "bool_id"); + f32_id = Name(TypeFloat(32), "f32_id"); + i32_id = Name(TypeSInt(32), "i32_id"); + u32_id = Name(TypeUInt(32), "u32_id"); + + for (u32 size = 2; size <= 4; size++) { + const 
u32 i = size - 2; + vec_ids.ids[i] = Name(TypeVector(f32_id, size), fmt::format("vec{}_id", size)); + ivec_ids.ids[i] = Name(TypeVector(i32_id, size), fmt::format("ivec{}_id", size)); + uvec_ids.ids[i] = Name(TypeVector(u32_id, size), fmt::format("uvec{}_id", size)); + bvec_ids.ids[i] = Name(TypeVector(bool_id, size), fmt::format("bvec{}_id", size)); + } +} + +void FragmentModule::DefineEntryPoint() { + AddCapability(spv::Capability::Shader); + AddCapability(spv::Capability::SampledBuffer); + AddCapability(spv::Capability::ImageQuery); + SetMemoryModel(spv::AddressingModel::Logical, spv::MemoryModel::GLSL450); + + const Id main_type{TypeFunction(TypeVoid())}; + const Id main_func{OpFunction(TypeVoid(), spv::FunctionControlMask::MaskNone, main_type)}; + AddEntryPoint(spv::ExecutionModel::Fragment, main_func, "main", primary_color_id, texcoord0_id, + texcoord1_id, texcoord2_id, texcoord0_w_id, normquat_id, view_id, color_id, + gl_frag_coord_id, gl_frag_depth_id); + AddExecutionMode(main_func, spv::ExecutionMode::OriginUpperLeft); + AddExecutionMode(main_func, spv::ExecutionMode::DepthReplacing); +} + +void FragmentModule::DefineUniformStructs() { + const Id light_src_struct_id{TypeStruct(vec_ids.Get(3), vec_ids.Get(3), vec_ids.Get(3), + vec_ids.Get(3), vec_ids.Get(3), vec_ids.Get(3), f32_id, + f32_id)}; + + const Id light_src_array_id{TypeArray(light_src_struct_id, ConstU32(NUM_LIGHTS))}; + const Id lighting_lut_array_id{TypeArray(ivec_ids.Get(4), ConstU32(NUM_LIGHTING_SAMPLERS / 4))}; + const Id const_color_array_id{TypeArray(vec_ids.Get(4), ConstU32(NUM_TEV_STAGES))}; + + const Id shader_data_struct_id{TypeStruct( + i32_id, i32_id, f32_id, f32_id, f32_id, f32_id, i32_id, i32_id, i32_id, i32_id, i32_id, + i32_id, i32_id, i32_id, i32_id, i32_id, f32_id, i32_id, u32_id, lighting_lut_array_id, + vec_ids.Get(3), vec_ids.Get(2), vec_ids.Get(2), vec_ids.Get(2), vec_ids.Get(3), + light_src_array_id, const_color_array_id, vec_ids.Get(4), vec_ids.Get(3), vec_ids.Get(4))}; + 
+ constexpr std::array light_src_offsets{0u, 16u, 32u, 48u, 64u, 80u, 92u, 96u}; + constexpr std::array shader_data_offsets{ + 0u, 4u, 8u, 12u, 16u, 20u, 24u, 28u, 32u, 36u, 40u, 44u, 48u, 52u, 56u, + 60u, 64u, 68u, 72u, 80u, 176u, 192u, 200u, 208u, 224u, 240u, 1136u, 1232u, 1248u, 1264u}; + + Decorate(lighting_lut_array_id, spv::Decoration::ArrayStride, 16u); + Decorate(light_src_array_id, spv::Decoration::ArrayStride, 112u); + Decorate(const_color_array_id, spv::Decoration::ArrayStride, 16u); + for (u32 i = 0; i < static_cast(light_src_offsets.size()); i++) { + MemberDecorate(light_src_struct_id, i, spv::Decoration::Offset, light_src_offsets[i]); + } + for (u32 i = 0; i < static_cast(shader_data_offsets.size()); i++) { + MemberDecorate(shader_data_struct_id, i, spv::Decoration::Offset, shader_data_offsets[i]); + } + Decorate(shader_data_struct_id, spv::Decoration::Block); + + shader_data_id = AddGlobalVariable( + TypePointer(spv::StorageClass::Uniform, shader_data_struct_id), spv::StorageClass::Uniform); + Decorate(shader_data_id, spv::Decoration::DescriptorSet, 0); + Decorate(shader_data_id, spv::Decoration::Binding, 1); +} + +void FragmentModule::DefineInterface() { + // Define interface block + primary_color_id = DefineInput(vec_ids.Get(4), 1); + texcoord0_id = DefineInput(vec_ids.Get(2), 2); + texcoord1_id = DefineInput(vec_ids.Get(2), 3); + texcoord2_id = DefineInput(vec_ids.Get(2), 4); + texcoord0_w_id = DefineInput(f32_id, 5); + normquat_id = DefineInput(vec_ids.Get(4), 6); + view_id = DefineInput(vec_ids.Get(3), 7); + color_id = DefineOutput(vec_ids.Get(4), 0); + + // Define the texture unit samplers/uniforms + image_buffer_id = TypeImage(f32_id, spv::Dim::Buffer, 0, 0, 0, 1, spv::ImageFormat::Unknown); + image2d_id = TypeImage(f32_id, spv::Dim::Dim2D, 0, 0, 0, 1, spv::ImageFormat::Unknown); + image_cube_id = TypeImage(f32_id, spv::Dim::Cube, 0, 0, 0, 1, spv::ImageFormat::Unknown); + image_r32_id = TypeImage(u32_id, spv::Dim::Dim2D, 0, 0, 0, 2, 
spv::ImageFormat::R32ui); + sampler_id = TypeSampler(); + + texture_buffer_lut_lf_id = DefineUniformConst(TypeSampledImage(image_buffer_id), 0, 2); + texture_buffer_lut_rg_id = DefineUniformConst(TypeSampledImage(image_buffer_id), 0, 3); + texture_buffer_lut_rgba_id = DefineUniformConst(TypeSampledImage(image_buffer_id), 0, 4); + tex0_id = DefineUniformConst(TypeSampledImage(image2d_id), 1, 0); + tex1_id = DefineUniformConst(TypeSampledImage(image2d_id), 1, 1); + tex2_id = DefineUniformConst(TypeSampledImage(image2d_id), 1, 2); + tex_cube_id = DefineUniformConst(TypeSampledImage(image_cube_id), 1, 3); + + // Define shadow textures + shadow_texture_px_id = DefineUniformConst(image_r32_id, 2, 0, true); + shadow_buffer_id = DefineUniformConst(image_r32_id, 2, 6); + + // Define built-ins + gl_frag_coord_id = DefineVar(vec_ids.Get(4), spv::StorageClass::Input); + gl_frag_depth_id = DefineVar(f32_id, spv::StorageClass::Output); + Decorate(gl_frag_coord_id, spv::Decoration::BuiltIn, spv::BuiltIn::FragCoord); + Decorate(gl_frag_depth_id, spv::Decoration::BuiltIn, spv::BuiltIn::FragDepth); +} + +std::vector GenerateFragmentShaderSPV(const PicaFSConfig& config) { + auto& telemetry = Core::System::GetInstance().TelemetrySession(); + FragmentModule module{telemetry, config}; + module.Generate(); + return module.Assemble(); +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_gen_spv.h b/src/video_core/renderer_vulkan/vk_shader_gen_spv.h new file mode 100644 index 000000000..c23a0a9ed --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_gen_spv.h @@ -0,0 +1,300 @@ +// Copyright 2023 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#pragma once + +#include +#include + +#include "video_core/renderer_vulkan/vk_shader_gen.h" + +namespace Core { +class TelemetrySession; +} + +namespace Vulkan { + +using Sirit::Id; + +struct VectorIds { + /// Returns the type id of the vector with the provided size + [[nodiscard]] constexpr Id Get(u32 size) const { + return ids[size - 2]; + } + + std::array ids; +}; + +class FragmentModule : public Sirit::Module { + static constexpr u32 NUM_TEV_STAGES = 6; + static constexpr u32 NUM_LIGHTS = 8; + static constexpr u32 NUM_LIGHTING_SAMPLERS = 24; + +public: + explicit FragmentModule(Core::TelemetrySession& telemetry, const PicaFSConfig& config); + ~FragmentModule(); + + /// Emits SPIR-V bytecode corresponding to the provided pica fragment configuration + void Generate(); + +private: + /// Undos the vulkan perspective transformation and applies the PICA one + void WriteDepth(); + + /// Emits code to emulate the scissor rectangle + void WriteScissor(); + + /// Writes the code to emulate fragment lighting + void WriteLighting(); + + /// Writes the code to emulate fog + void WriteFog(); + + /// Writes the code to emulate gas rendering + void WriteGas(); + + /// Writes the code to emulate the specified TEV stage + void WriteTevStage(s32 index); + + /// Defines the tex3 proctex sampling function + void DefineProcTexSampler(); + + /// Writes the if-statement condition used to evaluate alpha testing. 
+ void WriteAlphaTestCondition(Pica::FramebufferRegs::CompareFunc func); + + /// Samples the current fragment texel from the provided texture unit + [[nodiscard]] Id SampleTexture(u32 texture_unit); + + /// Samples the current fragment texel from shadow plane + [[nodiscard]] Id SampleShadow(); + + [[nodiscard]] Id AppendProcTexShiftOffset(Id v, Pica::TexturingRegs::ProcTexShift mode, + Pica::TexturingRegs::ProcTexClamp clamp_mode); + + [[nodiscard]] Id AppendProcTexClamp(Id var, Pica::TexturingRegs::ProcTexClamp mode); + + [[nodiscard]] Id AppendProcTexCombineAndMap(Pica::TexturingRegs::ProcTexCombiner combiner, Id u, + Id v, Id offset); + + /// Rounds the provided variable to the nearest 1/255th + [[nodiscard]] Id Byteround(Id variable_id, u32 size = 1); + + /// LUT sampling uitlity + /// For NoiseLUT/ColorMap/AlphaMap, coord=0.0 is lut[0], coord=127.0/128.0 is lut[127] and + /// coord=1.0 is lut[127]+lut_diff[127]. For other indices, the result is interpolated using + /// value entries and difference entries. 
+ [[nodiscard]] Id ProcTexLookupLUT(Id offset, Id coord); + + /// Generates random noise with proctex + [[nodiscard]] Id ProcTexNoiseCoef(Id x); + + /// Samples a color value from the rgba texture lut + [[nodiscard]] Id SampleProcTexColor(Id lut_coord, Id level); + + /// Lookups the lighting LUT at the provided lut_index + [[nodiscard]] Id LookupLightingLUT(Id lut_index, Id index, Id delta); + + /// Writes the specified TEV stage source component(s) + [[nodiscard]] Id AppendSource(Pica::TexturingRegs::TevStageConfig::Source source, s32 index); + + /// Writes the color components to use for the specified TEV stage color modifier + [[nodiscard]] Id AppendColorModifier( + Pica::TexturingRegs::TevStageConfig::ColorModifier modifier, + Pica::TexturingRegs::TevStageConfig::Source source, s32 index); + + /// Writes the alpha component to use for the specified TEV stage alpha modifier + [[nodiscard]] Id AppendAlphaModifier( + Pica::TexturingRegs::TevStageConfig::AlphaModifier modifier, + Pica::TexturingRegs::TevStageConfig::Source source, s32 index); + + /// Writes the combiner function for the color components for the specified TEV stage operation + [[nodiscard]] Id AppendColorCombiner(Pica::TexturingRegs::TevStageConfig::Operation operation); + + /// Writes the combiner function for the alpha component for the specified TEV stage operation + [[nodiscard]] Id AppendAlphaCombiner(Pica::TexturingRegs::TevStageConfig::Operation operation); + +private: + /// Creates a constant array of integers + template + void InitTableS32(Id table, T... elems) { + const Id table_const{ConstS32(elems...)}; + OpStore(table, table_const); + }; + + /// Loads the member specified from the shader_data uniform struct + template + [[nodiscard]] Id GetShaderDataMember(Id type, Ids... 
ids) { + const Id uniform_ptr{TypePointer(spv::StorageClass::Uniform, type)}; + return OpLoad(type, OpAccessChain(uniform_ptr, shader_data_id, ids...)); + } + + /// Pads the provided vector by inserting args at the end + template + [[nodiscard]] Id PadVectorF32(Id vector, Id pad_type_id, Args&&... args) { + return OpCompositeConstruct(pad_type_id, vector, ConstF32(args...)); + } + + /// Defines a input variable + [[nodiscard]] Id DefineInput(Id type, u32 location) { + const Id input_id{DefineVar(type, spv::StorageClass::Input)}; + Decorate(input_id, spv::Decoration::Location, location); + return input_id; + } + + /// Defines a input variable + [[nodiscard]] Id DefineOutput(Id type, u32 location) { + const Id output_id{DefineVar(type, spv::StorageClass::Output)}; + Decorate(output_id, spv::Decoration::Location, location); + return output_id; + } + + /// Defines a uniform constant variable + [[nodiscard]] Id DefineUniformConst(Id type, u32 set, u32 binding, bool readonly = false) { + const Id uniform_id{DefineVar(type, spv::StorageClass::UniformConstant)}; + Decorate(uniform_id, spv::Decoration::DescriptorSet, set); + Decorate(uniform_id, spv::Decoration::Binding, binding); + if (readonly) { + Decorate(uniform_id, spv::Decoration::NonWritable); + } + return uniform_id; + } + + template + [[nodiscard]] Id DefineVar(Id type, spv::StorageClass storage_class) { + const Id pointer_type_id{TypePointer(storage_class, type)}; + return global ? AddGlobalVariable(pointer_type_id, storage_class) + : AddLocalVariable(pointer_type_id, storage_class); + } + + /// Returns the id of a signed integer constant of value + [[nodiscard]] Id ConstU32(u32 value) { + return Constant(u32_id, value); + } + + template + [[nodiscard]] Id ConstU32(Args&&... values) { + constexpr u32 size = static_cast(sizeof...(values)); + static_assert(size >= 2); + const std::array constituents{Constant(u32_id, values)...}; + const Id type = size <= 4 ? 
uvec_ids.Get(size) : TypeArray(u32_id, ConstU32(size)); + return ConstantComposite(type, constituents); + } + + /// Returns the id of a signed integer constant of value + [[nodiscard]] Id ConstS32(s32 value) { + return Constant(i32_id, value); + } + + template + [[nodiscard]] Id ConstS32(Args&&... values) { + constexpr u32 size = static_cast(sizeof...(values)); + static_assert(size >= 2); + const std::array constituents{Constant(i32_id, values)...}; + const Id type = size <= 4 ? ivec_ids.Get(size) : TypeArray(i32_id, ConstU32(size)); + return ConstantComposite(type, constituents); + } + + /// Returns the id of a float constant of value + [[nodiscard]] Id ConstF32(f32 value) { + return Constant(f32_id, value); + } + + template + [[nodiscard]] Id ConstF32(Args... values) { + constexpr u32 size = static_cast(sizeof...(values)); + static_assert(size >= 2); + const std::array constituents{Constant(f32_id, values)...}; + const Id type = size <= 4 ? vec_ids.Get(size) : TypeArray(f32_id, ConstU32(size)); + return ConstantComposite(type, constituents); + } + + void DefineArithmeticTypes(); + void DefineEntryPoint(); + void DefineUniformStructs(); + void DefineInterface(); + Id CompareShadow(Id pixel, Id z); + +private: + Core::TelemetrySession& telemetry; + PicaFSConfig config; + Id void_id{}; + Id bool_id{}; + Id f32_id{}; + Id i32_id{}; + Id u32_id{}; + + VectorIds vec_ids{}; + VectorIds ivec_ids{}; + VectorIds uvec_ids{}; + VectorIds bvec_ids{}; + + Id image2d_id{}; + Id image_cube_id{}; + Id image_buffer_id{}; + Id image_r32_id{}; + Id sampler_id{}; + Id shader_data_id{}; + + Id primary_color_id{}; + Id texcoord0_id{}; + Id texcoord1_id{}; + Id texcoord2_id{}; + Id texcoord0_w_id{}; + Id normquat_id{}; + Id view_id{}; + Id color_id{}; + + Id gl_frag_coord_id{}; + Id gl_frag_depth_id{}; + Id depth{}; + + Id tex0_id{}; + Id tex1_id{}; + Id tex2_id{}; + Id tex_cube_id{}; + Id texture_buffer_lut_lf_id{}; + Id texture_buffer_lut_rg_id{}; + Id texture_buffer_lut_rgba_id{}; + 
Id shadow_buffer_id{}; + Id shadow_texture_px_id{}; + Id shadow_texture_nx_id{}; + Id shadow_texture_py_id{}; + Id shadow_texture_ny_id{}; + Id shadow_texture_pz_id{}; + Id shadow_texture_nz_id{}; + + Id texture_buffer_lut_lf{}; + Id texture_buffer_lut_rg{}; + Id texture_buffer_lut_rgba{}; + + Id rounded_primary_color{}; + Id primary_fragment_color{}; + Id secondary_fragment_color{}; + Id combiner_buffer{}; + Id next_combiner_buffer{}; + Id last_tex_env_out{}; + + Id color_results_1{}; + Id color_results_2{}; + Id color_results_3{}; + Id alpha_results_1{}; + Id alpha_results_2{}; + Id alpha_results_3{}; + + Id proctex_func{}; + Id noise1d_table{}; + Id noise2d_table{}; + Id lut_offsets{}; +}; + +/** + * Generates the SPIR-V fragment shader program source code for the current Pica state + * @param config ShaderCacheKey object generated for the current Pica state, used for the shader + * configuration (NOTE: Use state in this struct only, not the Pica registers!) + * @param separable_shader generates shader that can be used for separate shader object + * @returns String of the shader source code + */ +std::vector GenerateFragmentShaderSPV(const PicaFSConfig& config); + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_util.cpp b/src/video_core/renderer_vulkan/vk_shader_util.cpp new file mode 100644 index 000000000..2d84a76b9 --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_util.cpp @@ -0,0 +1,233 @@ +// Copyright 2023 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. 
+ +#include +#include +#include +#include "common/assert.h" +#include "common/literals.h" +#include "common/logging/log.h" +#include "video_core/renderer_vulkan/vk_shader_util.h" + +namespace Vulkan { + +using namespace Common::Literals; + +namespace { +constexpr TBuiltInResource DefaultTBuiltInResource = { + .maxLights = 32, + .maxClipPlanes = 6, + .maxTextureUnits = 32, + .maxTextureCoords = 32, + .maxVertexAttribs = 64, + .maxVertexUniformComponents = 4096, + .maxVaryingFloats = 64, + .maxVertexTextureImageUnits = 32, + .maxCombinedTextureImageUnits = 80, + .maxTextureImageUnits = 32, + .maxFragmentUniformComponents = 4096, + .maxDrawBuffers = 32, + .maxVertexUniformVectors = 128, + .maxVaryingVectors = 8, + .maxFragmentUniformVectors = 16, + .maxVertexOutputVectors = 16, + .maxFragmentInputVectors = 15, + .minProgramTexelOffset = -8, + .maxProgramTexelOffset = 7, + .maxClipDistances = 8, + .maxComputeWorkGroupCountX = 65535, + .maxComputeWorkGroupCountY = 65535, + .maxComputeWorkGroupCountZ = 65535, + .maxComputeWorkGroupSizeX = 1024, + .maxComputeWorkGroupSizeY = 1024, + .maxComputeWorkGroupSizeZ = 64, + .maxComputeUniformComponents = 1024, + .maxComputeTextureImageUnits = 16, + .maxComputeImageUniforms = 8, + .maxComputeAtomicCounters = 8, + .maxComputeAtomicCounterBuffers = 1, + .maxVaryingComponents = 60, + .maxVertexOutputComponents = 64, + .maxGeometryInputComponents = 64, + .maxGeometryOutputComponents = 128, + .maxFragmentInputComponents = 128, + .maxImageUnits = 8, + .maxCombinedImageUnitsAndFragmentOutputs = 8, + .maxCombinedShaderOutputResources = 8, + .maxImageSamples = 0, + .maxVertexImageUniforms = 0, + .maxTessControlImageUniforms = 0, + .maxTessEvaluationImageUniforms = 0, + .maxGeometryImageUniforms = 0, + .maxFragmentImageUniforms = 8, + .maxCombinedImageUniforms = 8, + .maxGeometryTextureImageUnits = 16, + .maxGeometryOutputVertices = 256, + .maxGeometryTotalOutputComponents = 1024, + .maxGeometryUniformComponents = 1024, + 
.maxGeometryVaryingComponents = 64, + .maxTessControlInputComponents = 128, + .maxTessControlOutputComponents = 128, + .maxTessControlTextureImageUnits = 16, + .maxTessControlUniformComponents = 1024, + .maxTessControlTotalOutputComponents = 4096, + .maxTessEvaluationInputComponents = 128, + .maxTessEvaluationOutputComponents = 128, + .maxTessEvaluationTextureImageUnits = 16, + .maxTessEvaluationUniformComponents = 1024, + .maxTessPatchComponents = 120, + .maxPatchVertices = 32, + .maxTessGenLevel = 64, + .maxViewports = 16, + .maxVertexAtomicCounters = 0, + .maxTessControlAtomicCounters = 0, + .maxTessEvaluationAtomicCounters = 0, + .maxGeometryAtomicCounters = 0, + .maxFragmentAtomicCounters = 8, + .maxCombinedAtomicCounters = 8, + .maxAtomicCounterBindings = 1, + .maxVertexAtomicCounterBuffers = 0, + .maxTessControlAtomicCounterBuffers = 0, + .maxTessEvaluationAtomicCounterBuffers = 0, + .maxGeometryAtomicCounterBuffers = 0, + .maxFragmentAtomicCounterBuffers = 1, + .maxCombinedAtomicCounterBuffers = 1, + .maxAtomicCounterBufferSize = 16384, + .maxTransformFeedbackBuffers = 4, + .maxTransformFeedbackInterleavedComponents = 64, + .maxCullDistances = 8, + .maxCombinedClipAndCullDistances = 8, + .maxSamples = 4, + .maxMeshOutputVerticesNV = 256, + .maxMeshOutputPrimitivesNV = 512, + .maxMeshWorkGroupSizeX_NV = 32, + .maxMeshWorkGroupSizeY_NV = 1, + .maxMeshWorkGroupSizeZ_NV = 1, + .maxTaskWorkGroupSizeX_NV = 32, + .maxTaskWorkGroupSizeY_NV = 1, + .maxTaskWorkGroupSizeZ_NV = 1, + .maxMeshViewCountNV = 4, + .maxDualSourceDrawBuffersEXT = 1, + .limits = + TLimits{ + .nonInductiveForLoops = 1, + .whileLoops = 1, + .doWhileLoops = 1, + .generalUniformIndexing = 1, + .generalAttributeMatrixVectorIndexing = 1, + .generalVaryingIndexing = 1, + .generalSamplerIndexing = 1, + .generalVariableIndexing = 1, + .generalConstantMatrixVectorIndexing = 1, + }, +}; + +EShLanguage ToEshShaderStage(vk::ShaderStageFlagBits stage) { + switch (stage) { + case 
vk::ShaderStageFlagBits::eVertex: + return EShLanguage::EShLangVertex; + case vk::ShaderStageFlagBits::eGeometry: + return EShLanguage::EShLangGeometry; + case vk::ShaderStageFlagBits::eFragment: + return EShLanguage::EShLangFragment; + case vk::ShaderStageFlagBits::eCompute: + return EShLanguage::EShLangCompute; + default: + UNREACHABLE_MSG("Unkown shader stage {}", stage); + } + return EShLanguage::EShLangVertex; +} + +bool InitializeCompiler() { + static bool glslang_initialized = false; + + if (glslang_initialized) { + return true; + } + + if (!glslang::InitializeProcess()) { + LOG_CRITICAL(Render_Vulkan, "Failed to initialize glslang shader compiler"); + return false; + } + + std::atexit([]() { glslang::FinalizeProcess(); }); + + glslang_initialized = true; + return true; +} +} // Anonymous namespace + +vk::ShaderModule Compile(std::string_view code, vk::ShaderStageFlagBits stage, vk::Device device) { + if (!InitializeCompiler()) { + return VK_NULL_HANDLE; + } + + EProfile profile = ECoreProfile; + EShMessages messages = + static_cast(EShMsgDefault | EShMsgSpvRules | EShMsgVulkanRules); + EShLanguage lang = ToEshShaderStage(stage); + + const int default_version = 450; + const char* pass_source_code = code.data(); + int pass_source_code_length = static_cast(code.size()); + + auto shader = std::make_unique(lang); + shader->setEnvTarget(glslang::EShTargetSpv, + glslang::EShTargetLanguageVersion::EShTargetSpv_1_3); + shader->setStringsWithLengths(&pass_source_code, &pass_source_code_length, 1); + + glslang::TShader::ForbidIncluder includer; + if (!shader->parse(&DefaultTBuiltInResource, default_version, profile, false, true, messages, + includer)) [[unlikely]] { + LOG_INFO(Render_Vulkan, "Shader Info Log:\n{}\n{}", shader->getInfoLog(), + shader->getInfoDebugLog()); + return VK_NULL_HANDLE; + } + + // Even though there's only a single shader, we still need to link it to generate SPV + auto program = std::make_unique(); + program->addShader(shader.get()); + if 
(!program->link(messages)) { + LOG_INFO(Render_Vulkan, "Program Info Log:\n{}\n{}", program->getInfoLog(), + program->getInfoDebugLog()); + return VK_NULL_HANDLE; + } + + glslang::TIntermediate* intermediate = program->getIntermediate(lang); + std::vector out_code; + spv::SpvBuildLogger logger; + glslang::SpvOptions options; + + // Enable optimizations on the generated SPIR-V code. + options.disableOptimizer = false; + options.validate = false; + options.optimizeSize = true; + + out_code.reserve(8_KiB); + glslang::GlslangToSpv(*intermediate, out_code, &logger, &options); + + const std::string spv_messages = logger.getAllMessages(); + if (!spv_messages.empty()) { + LOG_INFO(Render_Vulkan, "SPIR-V conversion messages: {}", spv_messages); + } + + return CompileSPV(out_code, device); +} + +vk::ShaderModule CompileSPV(std::span code, vk::Device device) { + const vk::ShaderModuleCreateInfo shader_info = { + .codeSize = code.size() * sizeof(u32), + .pCode = code.data(), + }; + + try { + return device.createShaderModule(shader_info); + } catch (vk::SystemError& err) { + UNREACHABLE_MSG("{}", err.what()); + } + + return VK_NULL_HANDLE; +} + +} // namespace Vulkan diff --git a/src/video_core/renderer_vulkan/vk_shader_util.h b/src/video_core/renderer_vulkan/vk_shader_util.h new file mode 100644 index 000000000..eb932e07d --- /dev/null +++ b/src/video_core/renderer_vulkan/vk_shader_util.h @@ -0,0 +1,28 @@ +// Copyright 2023 Citra Emulator Project +// Licensed under GPLv2 or any later version +// Refer to the license.txt file included. + +#pragma once + +#include + +#include "video_core/renderer_vulkan/vk_common.h" + +namespace Vulkan { + +/** + * @brief Creates a vulkan shader module from GLSL by converting it to SPIR-V using glslang. + * @param code The string containing GLSL code. + * @param stage The pipeline stage the shader will be used in. + * @param device The vulkan device handle. 
+ */ +vk::ShaderModule Compile(std::string_view code, vk::ShaderStageFlagBits stage, vk::Device device); + +/** + * @brief Creates a vulkan shader module from SPIR-V bytecode. + * @param code The SPIR-V bytecode data. + * @param device The vulkan device handle + */ +vk::ShaderModule CompileSPV(std::span code, vk::Device device); + +} // namespace Vulkan