Merge branch 'master' into language-selector

2016-05-12 22:09:50 +02:00 · 2016-05-12 22:09:50 +02:00 · e1a19506b5
commit e1a19506b5
parent 9a9bcd5661 f6eb62d062
66 changed files with 1563 additions and 419 deletions
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@ -152,12 +152,15 @@ if (ENABLE_SDL2)
            download_bundled_external("sdl2/" ${SDL2_VER} SDL2_PREFIX)
        endif()

+        set(SDL2_FOUND YES)
        set(SDL2_INCLUDE_DIR "${SDL2_PREFIX}/include" CACHE PATH "Path to SDL2 headers")
        set(SDL2_LIBRARY "${SDL2_PREFIX}/lib/x64/SDL2.lib" CACHE PATH "Path to SDL2 library")
        set(SDL2_DLL_DIR "${SDL2_PREFIX}/lib/x64/" CACHE PATH "Path to SDL2.dll")
    else()
        find_package(SDL2 REQUIRED)
    endif()
+else()
+    set(SDL2_FOUND NO)
 endif()

 IF (APPLE)
--- a/src/audio_core/CMakeLists.txt
+++ b/src/audio_core/CMakeLists.txt
@ -4,6 +4,7 @@ set(SRCS
            hle/dsp.cpp
            hle/filter.cpp
            hle/pipe.cpp
+            hle/source.cpp
            interpolate.cpp
            sink_details.cpp
            )
@ -15,6 +16,7 @@ set(HEADERS
            hle/dsp.h
            hle/filter.h
            hle/pipe.h
+            hle/source.h
            interpolate.h
            null_sink.h
            sink.h
@ -23,7 +25,18 @@ set(HEADERS

 include_directories(../../externals/soundtouch/include)

+if(SDL2_FOUND)
+    set(SRCS ${SRCS} sdl2_sink.cpp)
+    set(HEADERS ${HEADERS} sdl2_sink.h)
+    include_directories(${SDL2_INCLUDE_DIR})
+endif()
+
 create_directory_groups(${SRCS} ${HEADERS})

 add_library(audio_core STATIC ${SRCS} ${HEADERS})
 target_link_libraries(audio_core SoundTouch)
+
+if(SDL2_FOUND)
+    target_link_libraries(audio_core ${SDL2_LIBRARY})
+    set_property(TARGET audio_core APPEND PROPERTY COMPILE_DEFINITIONS HAVE_SDL2)
+endif()
--- a/src/audio_core/hle/common.h
+++ b/src/audio_core/hle/common.h
@ -27,7 +27,7 @@ using QuadFrame32   = std::array<std::array<s32, 4>, samples_per_frame>;
 */
 template<typename FrameT, typename FilterT>
 void FilterFrame(FrameT& frame, FilterT& filter) {
-    std::transform(frame.begin(), frame.end(), frame.begin(), [&filter](const typename FrameT::value_type& sample) {
+    std::transform(frame.begin(), frame.end(), frame.begin(), [&filter](const auto& sample) {
        return filter.ProcessSample(sample);
    });
 }
--- a/src/audio_core/hle/dsp.cpp
+++ b/src/audio_core/hle/dsp.cpp
@ -2,10 +2,12 @@
 // Licensed under GPLv2 or any later version
 // Refer to the license.txt file included.

+#include <array>
 #include <memory>

 #include "audio_core/hle/dsp.h"
 #include "audio_core/hle/pipe.h"
+#include "audio_core/hle/source.h"
 #include "audio_core/sink.h"

 namespace DSP {
@ -38,16 +40,38 @@ static SharedMemory& WriteRegion() {
    return g_regions[1 - CurrentRegionIndex()];
 }

+static std::array<Source, num_sources> sources = {
+    Source(0), Source(1), Source(2), Source(3), Source(4), Source(5),
+    Source(6), Source(7), Source(8), Source(9), Source(10), Source(11),
+    Source(12), Source(13), Source(14), Source(15), Source(16), Source(17),
+    Source(18), Source(19), Source(20), Source(21), Source(22), Source(23)
+};
+
 static std::unique_ptr<AudioCore::Sink> sink;

 void Init() {
    DSP::HLE::ResetPipes();
+    for (auto& source : sources) {
+        source.Reset();
+    }
 }

 void Shutdown() {
 }

 bool Tick() {
+    // Source configuration is taken from the read region; per-source status is
+    // reported back through the write region.
+    SharedMemory& read = ReadRegion();
+    SharedMemory& write = WriteRegion();
+
+    // One zero-initialised quadraphonic accumulator per intermediate mix.
+    // NOTE(review): these mixes are accumulated here but not consumed anywhere in this function.
+    std::array<QuadFrame32, 3> intermediate_mixes = {};
+
+    for (size_t i = 0; i < num_sources; i++) {
+        // Source::Tick takes the configuration by non-const reference, so the entry in the
+        // read region may be modified by the call.
+        write.source_statuses.status[i] = sources[i].Tick(read.source_configurations.config[i], read.adpcm_coefficients.coeff[i]);
+        // Add this source's frame into each of the three intermediate mixes.
+        for (size_t mix = 0; mix < 3; mix++) {
+            sources[i].MixInto(intermediate_mixes[mix], mix);
+        }
+    }
+
    return true;
 }

--- a/src/audio_core/hle/dsp.h
+++ b/src/audio_core/hle/dsp.h
@ -33,13 +33,9 @@ namespace HLE {
 // double-buffer. The frame counter is located as the very last u16 of each region and is incremented
 // each audio tick.

-struct SharedMemory;
-
 constexpr VAddr region0_base = 0x1FF50000;
 constexpr VAddr region1_base = 0x1FF70000;

-extern std::array<SharedMemory, 2> g_regions;
-
 /**
 * The DSP is native 16-bit. The DSP also appears to be big-endian. When reading 32-bit numbers from
 * its memory regions, the higher and lower 16-bit halves are swapped compared to the little-endian
@ -169,9 +165,9 @@ struct SourceConfiguration {
        float_le rate_multiplier;

        enum class InterpolationMode : u8 {
-            None = 0,
+            Polyphase = 0,
            Linear = 1,
-            Polyphase = 2
+            None = 2
        };

        InterpolationMode interpolation_mode;
@ -318,10 +314,10 @@ ASSERT_DSP_STRUCT(SourceConfiguration::Configuration::Buffer, 20);
 struct SourceStatus {
    struct Status {
        u8 is_enabled;               ///< Is this channel enabled? (Doesn't have to be playing anything.)
-        u8 previous_buffer_id_dirty; ///< Non-zero when previous_buffer_id changes
+        u8 current_buffer_id_dirty;  ///< Non-zero when current_buffer_id changes
        u16_le sync;                 ///< Is set by the DSP to the value of SourceConfiguration::sync
        u32_dsp buffer_position;     ///< Number of samples into the current buffer
-        u16_le previous_buffer_id;   ///< Updated when a buffer finishes playing
+        u16_le current_buffer_id;    ///< Updated when a buffer finishes playing
        INSERT_PADDING_DSPWORDS(1);
    };

@ -507,6 +503,8 @@ struct SharedMemory {
 };
 ASSERT_DSP_STRUCT(SharedMemory, 0x8000);

+extern std::array<SharedMemory, 2> g_regions;
+
 // Structures must have an offset that is a multiple of two.
 static_assert(offsetof(SharedMemory, frame_counter) % 2 == 0, "Structures in DSP::HLE::SharedMemory must be 2-byte aligned");
 static_assert(offsetof(SharedMemory, source_configurations) % 2 == 0, "Structures in DSP::HLE::SharedMemory must be 2-byte aligned");
--- a/src/audio_core/hle/filter.h
+++ b/src/audio_core/hle/filter.h
@ -16,6 +16,7 @@ namespace HLE {

 /// Preprocessing filters. There is an independent set of filters for each Source.
 class SourceFilters final {
+public:
    SourceFilters() { Reset(); }

    /// Reset internal state.
--- a/src/audio_core/hle/pipe.cpp
+++ b/src/audio_core/hle/pipe.cpp
@ -36,12 +36,17 @@ std::vector<u8> PipeRead(DspPipe pipe_number, u32 length) {
        return {};
    }

+    if (length > UINT16_MAX) { // Can only read at most UINT16_MAX from the pipe
+        LOG_ERROR(Audio_DSP, "length of %u greater than max of %u", length, UINT16_MAX);
+        return {};
+    }
+
    std::vector<u8>& data = pipe_data[pipe_index];

    if (length > data.size()) {
        LOG_WARNING(Audio_DSP, "pipe_number = %zu is out of data, application requested read of %u but %zu remain",
                    pipe_index, length, data.size());
-        length = data.size();
+        length = static_cast<u32>(data.size());
    }

    if (length == 0)
@ -94,7 +99,7 @@ static void AudioPipeWriteStructAddresses() {
    };

    // Begin with a u16 denoting the number of structs.
-    WriteU16(DspPipe::Audio, struct_addresses.size());
+    WriteU16(DspPipe::Audio, static_cast<u16>(struct_addresses.size()));
    // Then write the struct addresses.
    for (u16 addr : struct_addresses) {
        WriteU16(DspPipe::Audio, addr);
--- a/src/audio_core/hle/pipe.h
+++ b/src/audio_core/hle/pipe.h
@ -24,10 +24,14 @@ enum class DspPipe {
 constexpr size_t NUM_DSP_PIPE = 8;

 /**
- * Read a DSP pipe.
- * @param pipe_number The Pipe ID
- * @param length How much data to request.
- * @return The data read from the pipe. The size of this vector can be less than the length requested.
+ * Reads `length` bytes from the DSP pipe identified with `pipe_number`.
+ * @note Can read up to the maximum value of a u16 in bytes (65,535).
+ * @note IF an error is encountered with either an invalid `pipe_number` or `length` value, an empty vector will be returned.
+ * @note IF `length` is set to 0, an empty vector will be returned.
+ * @note IF `length` is greater than the amount of data available, this function will only read the available amount.
+ * @param pipe_number a `DspPipe`
+ * @param length the number of bytes to read. The max is 65,535 (max of u16).
+ * @returns a vector of bytes from the specified pipe. On error, will be empty.
 */
 std::vector<u8> PipeRead(DspPipe pipe_number, u32 length);

--- a/src/audio_core/hle/source.cpp
+++ b/src/audio_core/hle/source.cpp
@ -0,0 +1,320 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <algorithm>
+#include <array>
+
+#include "audio_core/codec.h"
+#include "audio_core/hle/common.h"
+#include "audio_core/hle/source.h"
+#include "audio_core/interpolate.h"
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+
+#include "core/memory.h"
+
+namespace DSP {
+namespace HLE {
+
+SourceStatus::Status Source::Tick(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]) {
+    // Apply whatever the application changed since the previous frame.
+    ParseConfig(config, adpcm_coeffs);
+
+    // The enabled flag is read after ParseConfig so that a change made this frame takes effect immediately.
+    const bool produce_audio = state.enabled;
+    if (produce_audio)
+        GenerateFrame();
+
+    // Report the resulting state back to the caller.
+    return GetCurrentStatus();
+}
+
+void Source::MixInto(QuadFrame32& dest, size_t intermediate_mix_id) const {
+    // A disabled source contributes nothing (and the mix id is not even looked up).
+    if (state.enabled) {
+        const auto& mix_gains = state.gain.at(intermediate_mix_id);
+
+        for (size_t index = 0; index < samples_per_frame; ++index) {
+            const auto& stereo_in = current_frame[index];
+            auto& quad_out = dest[index];
+
+            // Stereo to quadraphonic: input channel 0 feeds output channels 0 and 2,
+            // input channel 1 feeds output channels 1 and 3, each with its own gain.
+            for (size_t channel = 0; channel < 4; ++channel) {
+                quad_out[channel] += static_cast<s32>(mix_gains[channel] * stereo_in[channel % 2]);
+            }
+        }
+    }
+}
+
+void Source::Reset() {
+    // Return every state field to its default member initialiser...
+    state = {};
+    // ...and silence the output frame.
+    current_frame.fill({});
+}
+
+/**
+ * Copies every field of `config` that is flagged dirty into this source's internal state.
+ * Each handled dirty flag is cleared in `config` as it is processed; any dirty bits that remain
+ * afterwards are logged and then cleared as well.
+ * Does nothing if no dirty bit is set.
+ * @param config Configuration for this source; modified in place (flags are cleared).
+ * @param adpcm_coeffs ADPCM coefficients, only read when `adpcm_coefficients_dirty` is set.
+ */
+void Source::ParseConfig(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]) {
+    if (!config.dirty_raw) {
+        return;
+    }
+
+    // A full reset is handled first, so every setting below is applied on top of a fresh state.
+    if (config.reset_flag) {
+        config.reset_flag.Assign(0);
+        Reset();
+        LOG_TRACE(Audio_DSP, "source_id=%zu reset", source_id);
+    }
+
+    // A partial reset only discards the buffers still waiting in the input queue.
+    if (config.partial_reset_flag) {
+        config.partial_reset_flag.Assign(0);
+        state.input_queue = std::priority_queue<Buffer, std::vector<Buffer>, BufferOrder>{};
+        LOG_TRACE(Audio_DSP, "source_id=%zu partial_reset", source_id);
+    }
+
+    if (config.enable_dirty) {
+        config.enable_dirty.Assign(0);
+        state.enabled = config.enable != 0;
+        LOG_TRACE(Audio_DSP, "source_id=%zu enable=%d", source_id, state.enabled);
+    }
+
+    if (config.sync_dirty) {
+        config.sync_dirty.Assign(0);
+        state.sync = config.sync;
+        LOG_TRACE(Audio_DSP, "source_id=%zu sync=%u", source_id, state.sync);
+    }
+
+    if (config.rate_multiplier_dirty) {
+        config.rate_multiplier_dirty.Assign(0);
+        state.rate_multiplier = config.rate_multiplier;
+        LOG_TRACE(Audio_DSP, "source_id=%zu rate=%f", source_id, state.rate_multiplier);
+
+        // Non-positive rates are rejected and replaced with 1.0.
+        if (state.rate_multiplier <= 0) {
+            LOG_ERROR(Audio_DSP, "Was given an invalid rate multiplier: source_id=%zu rate=%f", source_id, state.rate_multiplier);
+            state.rate_multiplier = 1.0f;
+            // Note: Actual firmware starts producing garbage if this occurs.
+        }
+    }
+
+    if (config.adpcm_coefficients_dirty) {
+        config.adpcm_coefficients_dirty.Assign(0);
+        std::transform(adpcm_coeffs, adpcm_coeffs + state.adpcm_coeffs.size(), state.adpcm_coeffs.begin(),
+            [](const auto& coeff) { return static_cast<s16>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu adpcm update", source_id);
+    }
+
+    // The three gain tables (one per intermediate mix) each have their own dirty flag.
+    if (config.gain_0_dirty) {
+        config.gain_0_dirty.Assign(0);
+        std::transform(config.gain[0], config.gain[0] + state.gain[0].size(), state.gain[0].begin(),
+            [](const auto& coeff) { return static_cast<float>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu gain 0 update", source_id);
+    }
+
+    if (config.gain_1_dirty) {
+        config.gain_1_dirty.Assign(0);
+        std::transform(config.gain[1], config.gain[1] + state.gain[1].size(), state.gain[1].begin(),
+            [](const auto& coeff) { return static_cast<float>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu gain 1 update", source_id);
+    }
+
+    if (config.gain_2_dirty) {
+        config.gain_2_dirty.Assign(0);
+        std::transform(config.gain[2], config.gain[2] + state.gain[2].size(), state.gain[2].begin(),
+            [](const auto& coeff) { return static_cast<float>(coeff); });
+        LOG_TRACE(Audio_DSP, "source_id=%zu gain 2 update", source_id);
+    }
+
+    if (config.filters_enabled_dirty) {
+        config.filters_enabled_dirty.Assign(0);
+        state.filters.Enable(config.simple_filter_enabled.ToBool(), config.biquad_filter_enabled.ToBool());
+        LOG_TRACE(Audio_DSP, "source_id=%zu enable_simple=%hu enable_biquad=%hu",
+                  source_id, config.simple_filter_enabled.Value(), config.biquad_filter_enabled.Value());
+    }
+
+    if (config.simple_filter_dirty) {
+        config.simple_filter_dirty.Assign(0);
+        state.filters.Configure(config.simple_filter);
+        LOG_TRACE(Audio_DSP, "source_id=%zu simple filter update", source_id);
+    }
+
+    if (config.biquad_filter_dirty) {
+        config.biquad_filter_dirty.Assign(0);
+        state.filters.Configure(config.biquad_filter);
+        LOG_TRACE(Audio_DSP, "source_id=%zu biquad filter update", source_id);
+    }
+
+    if (config.interpolation_dirty) {
+        config.interpolation_dirty.Assign(0);
+        state.interpolation_mode = config.interpolation_mode;
+        LOG_TRACE(Audio_DSP, "source_id=%zu interpolation_mode=%zu", source_id, static_cast<size_t>(state.interpolation_mode));
+    }
+
+    // Format and channel count are also refreshed whenever an embedded buffer is pending, and this
+    // happens before that buffer is enqueued below, so the enqueued Buffer captures the new values.
+    // embedded_buffer_dirty itself is only cleared by the embedded-buffer block further down.
+    if (config.format_dirty || config.embedded_buffer_dirty) {
+        config.format_dirty.Assign(0);
+        state.format = config.format;
+        LOG_TRACE(Audio_DSP, "source_id=%zu format=%zu", source_id, static_cast<size_t>(state.format));
+    }
+
+    if (config.mono_or_stereo_dirty || config.embedded_buffer_dirty) {
+        config.mono_or_stereo_dirty.Assign(0);
+        state.mono_or_stereo = config.mono_or_stereo;
+        LOG_TRACE(Audio_DSP, "source_id=%zu mono_or_stereo=%zu", source_id, static_cast<size_t>(state.mono_or_stereo));
+    }
+
+    // The buffer embedded directly in the configuration is enqueued with from_queue = false.
+    if (config.embedded_buffer_dirty) {
+        config.embedded_buffer_dirty.Assign(0);
+        state.input_queue.emplace(Buffer{
+            config.physical_address,
+            config.length,
+            static_cast<u8>(config.adpcm_ps),
+            { config.adpcm_yn[0], config.adpcm_yn[1] },
+            config.adpcm_dirty.ToBool(),
+            config.is_looping.ToBool(),
+            config.buffer_id,
+            state.mono_or_stereo,
+            state.format,
+            false
+        });
+        LOG_TRACE(Audio_DSP, "enqueuing embedded addr=0x%08x len=%u id=%hu", config.physical_address, config.length, config.buffer_id);
+    }
+
+    // Up to four queued buffers; bit i of buffers_dirty marks config.buffers[i] as new.
+    // These are enqueued with from_queue = true.
+    if (config.buffer_queue_dirty) {
+        config.buffer_queue_dirty.Assign(0);
+        for (size_t i = 0; i < 4; i++) {
+            if (config.buffers_dirty & (1 << i)) {
+                const auto& b = config.buffers[i];
+                state.input_queue.emplace(Buffer{
+                    b.physical_address,
+                    b.length,
+                    static_cast<u8>(b.adpcm_ps),
+                    { b.adpcm_yn[0], b.adpcm_yn[1] },
+                    b.adpcm_dirty != 0,
+                    b.is_looping != 0,
+                    b.buffer_id,
+                    state.mono_or_stereo,
+                    state.format,
+                    true
+                });
+                LOG_TRACE(Audio_DSP, "enqueuing queued %zu addr=0x%08x len=%u id=%hu", i, b.physical_address, b.length, b.buffer_id);
+            }
+        }
+        config.buffers_dirty = 0;
+    }
+
+    // Anything still set at this point is a dirty bit that no branch above handled.
+    if (config.dirty_raw) {
+        LOG_DEBUG(Audio_DSP, "source_id=%zu remaining_dirty=%x", source_id, config.dirty_raw);
+    }
+
+    config.dirty_raw = 0;
+}
+
+void Source::GenerateFrame() {
+    // Start from silence; anything we cannot fill below stays zero.
+    current_frame.fill({});
+
+    // With no decoded data left and nothing queued, the source switches itself off
+    // and flags a buffer update with buffer id 0.
+    if (state.current_buffer.empty()) {
+        const bool got_buffer = DequeueBuffer();
+        if (!got_buffer) {
+            state.enabled = false;
+            state.buffer_update = true;
+            state.current_buffer_id = 0;
+            return;
+        }
+    }
+
+    // The position reported for this frame is where the previous frame stopped
+    // (or zero if a buffer was just dequeued above).
+    state.current_sample_number = state.next_sample_number;
+
+    auto& pending = state.current_buffer;
+    size_t filled = 0;
+    while (filled < current_frame.size()) {
+        // Pull in the next buffer when the current one runs dry; stop if there is none.
+        if (pending.empty()) {
+            if (!DequeueBuffer())
+                break;
+        }
+
+        const size_t chunk = std::min(pending.size(), current_frame.size() - filled);
+        const auto chunk_end = pending.begin() + chunk;
+
+        // Move `chunk` samples from the front of the decoded buffer into the frame.
+        std::copy(pending.begin(), chunk_end, current_frame.begin() + filled);
+        pending.erase(pending.begin(), chunk_end);
+
+        filled += chunk;
+        state.next_sample_number += static_cast<u32>(chunk);
+    }
+
+    state.filters.ProcessFrame(current_frame);
+}
+
+
+/**
+ * Pops the next buffer from input_queue, decodes and resamples it, and stores the result in
+ * state.current_buffer. Must only be called while state.current_buffer is empty.
+ * @return false if input_queue was empty; true if a buffer was popped. Note that true is also
+ *         returned when the buffer's physical address is invalid, in which case current_buffer
+ *         is left empty and the sample counters / buffer id are not updated.
+ */
+bool Source::DequeueBuffer() {
+    ASSERT_MSG(state.current_buffer.empty(), "Shouldn't dequeue; we still have data in current_buffer");
+
+    if (state.input_queue.empty())
+        return false;
+
+    // Copy the entry before popping it, since pop() destroys the queue's element.
+    const Buffer buf = state.input_queue.top();
+    state.input_queue.pop();
+
+    // Only overwrite the ADPCM history when the buffer carries fresh yn values.
+    if (buf.adpcm_dirty) {
+        state.adpcm_state.yn1 = buf.adpcm_yn[0];
+        state.adpcm_state.yn2 = buf.adpcm_yn[1];
+    }
+
+    // Looping is only reported; the buffer is still played once like any other.
+    if (buf.is_looping) {
+        LOG_ERROR(Audio_DSP, "Looped buffers are unimplemented at the moment");
+    }
+
+    const u8* const memory = Memory::GetPhysicalPointer(buf.physical_address);
+    if (memory) {
+        // Decode using the format/channel count captured when the buffer was enqueued.
+        const unsigned num_channels = buf.mono_or_stereo == MonoOrStereo::Stereo ? 2 : 1;
+        switch (buf.format) {
+        case Format::PCM8:
+            state.current_buffer = Codec::DecodePCM8(num_channels, memory, buf.length);
+            break;
+        case Format::PCM16:
+            state.current_buffer = Codec::DecodePCM16(num_channels, memory, buf.length);
+            break;
+        case Format::ADPCM:
+            DEBUG_ASSERT(num_channels == 1);
+            state.current_buffer = Codec::DecodeADPCM(memory, buf.length, state.adpcm_coeffs, state.adpcm_state);
+            break;
+        default:
+            UNIMPLEMENTED();
+            break;
+        }
+    } else {
+        // The buffer is consumed but produces no samples; the caller may dequeue the next one.
+        LOG_WARNING(Audio_DSP, "source_id=%zu buffer_id=%hu length=%u: Invalid physical address 0x%08X",
+                               source_id, buf.buffer_id, buf.length, buf.physical_address);
+        state.current_buffer.clear();
+        return true;
+    }
+
+    // Resample the decoded data using the source's current interpolation mode and rate.
+    switch (state.interpolation_mode) {
+    case InterpolationMode::None:
+        state.current_buffer = AudioInterp::None(state.interp_state, state.current_buffer, state.rate_multiplier);
+        break;
+    case InterpolationMode::Linear:
+        state.current_buffer = AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier);
+        break;
+    case InterpolationMode::Polyphase:
+        // TODO(merry): Implement polyphase interpolation
+        state.current_buffer = AudioInterp::Linear(state.interp_state, state.current_buffer, state.rate_multiplier);
+        break;
+    default:
+        UNIMPLEMENTED();
+        break;
+    }
+
+    // A new buffer starts at sample 0; buffer_update is only raised for buffers that came
+    // from the buffer queue (from_queue), not for the embedded buffer.
+    state.current_sample_number = 0;
+    state.next_sample_number = 0;
+    state.current_buffer_id = buf.buffer_id;
+    state.buffer_update = buf.from_queue;
+
+    LOG_TRACE(Audio_DSP, "source_id=%zu buffer_id=%hu from_queue=%s current_buffer.size()=%zu",
+                         source_id, buf.buffer_id, buf.from_queue ? "true" : "false", state.current_buffer.size());
+    return true;
+}
+
+/**
+ * Builds the status block that is reported back to the emulated application for this source.
+ * Reading the status consumes the pending "buffer id changed" notification: buffer_update is
+ * cleared once it has been copied into the result.
+ * @return A fully initialised SourceStatus::Status reflecting the current internal state.
+ */
+SourceStatus::Status Source::GetCurrentStatus() {
+    // Value-initialise so that the padding word in Status is zero rather than whatever
+    // happened to be on the stack; the whole struct is copied into emulated shared memory.
+    SourceStatus::Status ret = {};
+
+    // Applications depend on the correct emulation of
+    // current_buffer_id_dirty and current_buffer_id to synchronise
+    // audio with video.
+    ret.is_enabled = state.enabled;
+    ret.current_buffer_id_dirty = state.buffer_update ? 1 : 0;
+    state.buffer_update = false;
+    // Buffer ids are 16-bit in the DSP structures; make the narrowing explicit.
+    ret.current_buffer_id = static_cast<u16>(state.current_buffer_id);
+    ret.buffer_position = state.current_sample_number;
+    ret.sync = state.sync;
+
+    return ret;
+}
+
+} // namespace HLE
+} // namespace DSP
--- a/src/audio_core/hle/source.h
+++ b/src/audio_core/hle/source.h
@ -0,0 +1,144 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <array>
+#include <queue>
+#include <vector>
+
+#include "audio_core/codec.h"
+#include "audio_core/hle/common.h"
+#include "audio_core/hle/dsp.h"
+#include "audio_core/hle/filter.h"
+#include "audio_core/interpolate.h"
+
+#include "common/common_types.h"
+
+namespace DSP {
+namespace HLE {
+
+/**
+ * This module performs:
+ * - Buffer management
+ * - Decoding of buffers
+ * - Buffer resampling and interpolation
+ * - Per-source filtering (SimpleFilter, BiquadFilter)
+ * - Per-source gain
+ * - Other per-source processing
+ */
+class Source final {
+public:
+    /// Creates a source in its reset state. `source_id_` is only used to label log messages.
+    explicit Source(size_t source_id_) : source_id(source_id_) {
+        Reset();
+    }
+
+    /// Resets internal state.
+    void Reset();
+
+    /**
+     * This is called once every audio frame. This performs per-source processing every frame.
+     * @param config The new configuration we've got for this Source from the application.
+     *               Dirty flags in it are cleared as they are processed.
+     * @param adpcm_coeffs ADPCM coefficients to use if config tells us to use them (may contain invalid values otherwise).
+     * @return The current status of this Source. This is given back to the emulated application via SharedMemory.
+     */
+    SourceStatus::Status Tick(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]);
+
+    /**
+     * Mix this source's output into dest, using the gains for the `intermediate_mix_id`-th intermediate mixer.
+     * Does nothing while the source is disabled.
+     * @param dest The QuadFrame32 to mix into.
+     * @param intermediate_mix_id The id of the intermediate mix whose gains we are using.
+     */
+    void MixInto(QuadFrame32& dest, size_t intermediate_mix_id) const;
+
+private:
+    /// Index of this source; used in log output.
+    const size_t source_id;
+    /// Output of the most recent GenerateFrame call; this is what MixInto reads.
+    StereoFrame16 current_frame;
+
+    using Format = SourceConfiguration::Configuration::Format;
+    using InterpolationMode = SourceConfiguration::Configuration::InterpolationMode;
+    using MonoOrStereo = SourceConfiguration::Configuration::MonoOrStereo;
+
+    /// Internal representation of a buffer for our buffer queue
+    struct Buffer {
+        PAddr physical_address;      ///< Physical address of the encoded sample data
+        u32 length;                  ///< Length passed to the decoder
+        u8 adpcm_ps;
+        std::array<u16, 2> adpcm_yn; ///< ADPCM history values, applied only if adpcm_dirty is set
+        bool adpcm_dirty;
+        bool is_looping;             ///< Looping is currently unimplemented (only logged on dequeue)
+        u16 buffer_id;               ///< Ordering key within the input queue
+
+        MonoOrStereo mono_or_stereo; ///< Channel layout captured at enqueue time
+        Format format;               ///< Sample format captured at enqueue time
+
+        bool from_queue;             ///< true for buffers from the buffer queue, false for the embedded buffer
+    };
+
+    /// Comparator for the input priority queue.
+    struct BufferOrder {
+        bool operator() (const Buffer& a, const Buffer& b) const {
+            // Lower buffer_id comes first.
+            return a.buffer_id > b.buffer_id;
+        }
+    };
+
+    struct {
+
+        // State variables
+
+        bool enabled = false;
+        u16 sync = 0;
+
+        // Mixing
+
+        /// gain[mix][channel]: one set of four channel gains per intermediate mix.
+        std::array<std::array<float, 4>, 3> gain = {};
+
+        // Buffer queue
+
+        std::priority_queue<Buffer, std::vector<Buffer>, BufferOrder> input_queue;
+        MonoOrStereo mono_or_stereo = MonoOrStereo::Mono;
+        Format format = Format::ADPCM;
+
+        // Current buffer
+
+        u32 current_sample_number = 0;
+        u32 next_sample_number = 0;
+        /// Decoded and resampled samples of the current buffer that have not been output yet.
+        std::vector<std::array<s16, 2>> current_buffer;
+
+        // buffer_id state
+
+        bool buffer_update = false;
+        /// 16-bit to match Buffer::buffer_id and the id field reported in SourceStatus::Status.
+        u16 current_buffer_id = 0;
+
+        // Decoding state
+
+        std::array<s16, 16> adpcm_coeffs = {};
+        Codec::ADPCMState adpcm_state = {};
+
+        // Resampling state
+
+        float rate_multiplier = 1.0f;
+        InterpolationMode interpolation_mode = InterpolationMode::Polyphase;
+        AudioInterp::State interp_state = {};
+
+        // Filter state
+
+        SourceFilters filters;
+
+    } state;
+
+    // Internal functions
+
+    /// INTERNAL: Update our internal state based on the current config.
+    void ParseConfig(SourceConfiguration::Configuration& config, const s16_le (&adpcm_coeffs)[16]);
+    /// INTERNAL: Generate the current audio output for this frame based on our internal state.
+    void GenerateFrame();
+    /// INTERNAL: Dequeues a buffer and does preprocessing on it (decoding, resampling). Puts it into current_buffer.
+    bool DequeueBuffer();
+    /// INTERNAL: Generates a SourceStatus::Status based on our internal state.
+    SourceStatus::Status GetCurrentStatus();
+};
+
+} // namespace HLE
+} // namespace DSP
--- a/src/audio_core/sdl2_sink.cpp
+++ b/src/audio_core/sdl2_sink.cpp
@ -0,0 +1,126 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#include <list>
+#include <vector>
+
+#include <SDL.h>
+
+#include "audio_core/audio_core.h"
+#include "audio_core/sdl2_sink.h"
+
+#include "common/assert.h"
+#include "common/logging/log.h"
+#include <numeric>
+
+namespace AudioCore {
+
+/// Private state of SDL2Sink; also the object handed to SDL as callback userdata.
+struct SDL2Sink::Impl {
+    /// Sample rate of the opened device, taken from the audiospec SDL reports on open.
+    unsigned int sample_rate = 0;
+
+    /// SDL device handle; 0 means no device is open.
+    SDL_AudioDeviceID audio_device_id = 0;
+
+    /// Pending buffers of interleaved s16 samples. Appended by EnqueueSamples and consumed from
+    /// the front by Callback; the sink methods access it while holding the SDL audio device lock.
+    std::list<std::vector<s16>> queue;
+
+    /// Audio callback registered with SDL. `impl_` is the Impl pointer supplied as userdata.
+    static void Callback(void* impl_, u8* buffer, int buffer_size_in_bytes);
+};
+
+/**
+ * Initialises the SDL audio subsystem and opens the default output device for
+ * interleaved stereo signed 16-bit audio at native_sample_rate.
+ * On any failure the reason reported by SDL is logged and audio_device_id is left at 0,
+ * which makes the other member functions fall back to their "no device" behaviour.
+ */
+SDL2Sink::SDL2Sink() : impl(std::make_unique<Impl>()) {
+    if (SDL_Init(SDL_INIT_AUDIO) < 0) {
+        LOG_CRITICAL(Audio_Sink, "SDL_Init(SDL_INIT_AUDIO) failed: %s", SDL_GetError());
+        impl->audio_device_id = 0;
+        return;
+    }
+
+    SDL_AudioSpec desired_audiospec;
+    SDL_zero(desired_audiospec);
+    desired_audiospec.format = AUDIO_S16;
+    desired_audiospec.channels = 2;
+    desired_audiospec.freq = native_sample_rate;
+    desired_audiospec.samples = 1024;
+    desired_audiospec.userdata = impl.get();
+    desired_audiospec.callback = &Impl::Callback;
+
+    SDL_AudioSpec obtained_audiospec;
+    SDL_zero(obtained_audiospec);
+
+    // allowed_changes = 0: SDL must present the device with exactly the requested spec.
+    impl->audio_device_id = SDL_OpenAudioDevice(nullptr, false, &desired_audiospec, &obtained_audiospec, 0);
+    // SDL_AudioDeviceID is unsigned; 0 is the failure value.
+    if (impl->audio_device_id == 0) {
+        LOG_CRITICAL(Audio_Sink, "SDL_OpenAudioDevice failed: %s", SDL_GetError());
+        return;
+    }
+
+    impl->sample_rate = obtained_audiospec.freq;
+
+    // SDL2 audio devices start out paused, unpause it:
+    SDL_PauseAudioDevice(impl->audio_device_id, 0);
+}
+
+SDL2Sink::~SDL2Sink() {
+    // Only a successfully opened device (non-zero id) needs closing.
+    if (impl->audio_device_id > 0) {
+        SDL_CloseAudioDevice(impl->audio_device_id);
+    }
+}
+
+unsigned int SDL2Sink::GetNativeSampleRate() const {
+    // With an open device, report the rate SDL gave us.
+    if (impl->audio_device_id > 0) {
+        return impl->sample_rate;
+    }
+
+    // No device: fall back to the core's native rate.
+    return native_sample_rate;
+}
+
+void SDL2Sink::EnqueueSamples(const std::vector<s16>& samples) {
+    const SDL_AudioDeviceID device = impl->audio_device_id;
+    if (device <= 0)
+        return; // No device open: samples are dropped.
+
+    ASSERT_MSG(samples.size() % 2 == 0, "Samples must be in interleaved stereo PCM16 format (size must be a multiple of two)");
+
+    // The queue is also touched by the audio callback, so append a copy while holding the device lock.
+    SDL_LockAudioDevice(device);
+    impl->queue.push_back(samples);
+    SDL_UnlockAudioDevice(device);
+}
+
+size_t SDL2Sink::SamplesInQueue() const {
+    const SDL_AudioDeviceID device = impl->audio_device_id;
+    if (device <= 0)
+        return 0;
+
+    size_t stereo_samples = 0;
+
+    // Walk the queue under the device lock so the audio callback cannot modify it meanwhile.
+    SDL_LockAudioDevice(device);
+    for (const std::vector<s16>& chunk : impl->queue) {
+        // Each stereo sample occupies two s16 values, hence the division by two.
+        stereo_samples += chunk.size() / 2;
+    }
+    SDL_UnlockAudioDevice(device);
+
+    return stereo_samples;
+}
+
+void SDL2Sink::Impl::Callback(void* impl_, u8* buffer, int buffer_size_in_bytes) {
+    Impl* const self = reinterpret_cast<Impl*>(impl_);
+
+    // Everything below is counted in s16 units rather than bytes.
+    size_t samples_left = static_cast<size_t>(buffer_size_in_bytes) / sizeof(s16);
+
+    while (samples_left > 0 && !self->queue.empty()) {
+        std::vector<s16>& head = self->queue.front();
+
+        // Either the whole front chunk fits, or we take just enough of it to fill the buffer.
+        const bool consume_whole_chunk = head.size() <= samples_left;
+        const size_t take = consume_whole_chunk ? head.size() : samples_left;
+
+        memcpy(buffer, head.data(), take * sizeof(s16));
+        buffer += take * sizeof(s16);
+        samples_left -= take;
+
+        if (consume_whole_chunk) {
+            self->queue.pop_front();
+        } else {
+            // Keep the unplayed tail of the chunk for the next callback.
+            head.erase(head.begin(), head.begin() + take);
+        }
+    }
+
+    // Ran out of queued audio: pad the remainder of the buffer with silence.
+    if (samples_left > 0) {
+        memset(buffer, 0, samples_left * sizeof(s16));
+    }
+}
+
+} // namespace AudioCore
--- a/src/audio_core/sdl2_sink.h
+++ b/src/audio_core/sdl2_sink.h
@ -0,0 +1,30 @@
+// Copyright 2016 Citra Emulator Project
+// Licensed under GPLv2 or any later version
+// Refer to the license.txt file included.
+
+#pragma once
+
+#include <cstddef>
+#include <memory>
+
+#include "audio_core/sink.h"
+
+namespace AudioCore {
+
+/// Sink implementation that outputs audio through SDL2.
+class SDL2Sink final : public Sink {
+public:
+    /// Opens the SDL2 audio device; on failure the sink stays usable but has no device.
+    SDL2Sink();
+    /// Closes the SDL2 audio device if one was opened.
+    ~SDL2Sink() override;
+
+    /// Sample rate of the opened device, or the core's native sample rate if no device is open.
+    unsigned int GetNativeSampleRate() const override;
+
+    /// Queues interleaved stereo s16 samples for playback (dropped if no device is open).
+    void EnqueueSamples(const std::vector<s16>& samples) override;
+
+    /// Number of stereo samples currently waiting in the queue (0 if no device is open).
+    size_t SamplesInQueue() const override;
+
+private:
+    /// Implementation details are defined in sdl2_sink.cpp.
+    struct Impl;
+    std::unique_ptr<Impl> impl;
+};
+
+} // namespace AudioCore
--- a/src/audio_core/sink.h
+++ b/src/audio_core/sink.h
@ -19,7 +19,7 @@ public:
    virtual ~Sink() = default;

    /// The native rate of this sink. The sink expects to be fed samples that respect this. (Units: samples/sec)
-    virtual unsigned GetNativeSampleRate() const = 0;
+    virtual unsigned int GetNativeSampleRate() const = 0;

    /**
     * Feed stereo samples to sink.
--- a/src/audio_core/sink_details.cpp
+++ b/src/audio_core/sink_details.cpp
@ -8,10 +8,17 @@
 #include "audio_core/null_sink.h"
 #include "audio_core/sink_details.h"

+#ifdef HAVE_SDL2
+#include "audio_core/sdl2_sink.h"
+#endif
+
 namespace AudioCore {

 // g_sink_details is ordered in terms of desirability, with the best choice at the top.
 const std::vector<SinkDetails> g_sink_details = {
+#ifdef HAVE_SDL2
+    { "sdl2", []() { return std::make_unique<SDL2Sink>(); } },
+#endif
    { "null", []() { return std::make_unique<NullSink>(); } },
 };

--- a/src/citra/config.cpp
+++ b/src/citra/config.cpp
@ -88,7 +88,7 @@ void Config::ReadValues() {

    // Debugging
    Settings::values.use_gdbstub = sdl2_config->GetBoolean("Debugging", "use_gdbstub", false);
-    Settings::values.gdbstub_port = sdl2_config->GetInteger("Debugging", "gdbstub_port", 24689);
+    Settings::values.gdbstub_port = static_cast<u16>(sdl2_config->GetInteger("Debugging", "gdbstub_port", 24689));
 }

 void Config::Reload() {
--- a/src/citra/default_ini.h
+++ b/src/citra/default_ini.h
@ -58,7 +58,7 @@ bg_green =

 [Audio]
 # Which audio output engine to use.
-# auto (default): Auto-select, null: No audio output
+# auto (default): Auto-select, null: No audio output, sdl2: SDL2 (if available)
 output_engine =

 [Data Storage]
--- a/src/citra/emu_window/emu_window_sdl2.cpp
+++ b/src/citra/emu_window/emu_window_sdl2.cpp
@ -9,6 +9,8 @@
 #define SDL_MAIN_HANDLED
 #include <SDL.h>

+#include <glad/glad.h>
+
 #include "common/key_map.h"
 #include "common/logging/log.h"
 #include "common/scm_rev.h"
@ -98,6 +100,11 @@ EmuWindow_SDL2::EmuWindow_SDL2() {
        exit(1);
    }

+    if (!gladLoadGLLoader(static_cast<GLADloadproc>(SDL_GL_GetProcAddress))) {
+        LOG_CRITICAL(Frontend, "Failed to initialize GL functions! Exiting...");
+        exit(1);
+    }
+
    OnResize();
    OnMinimalClientAreaChangeRequest(GetActiveConfig().min_client_area_size);
    SDL_PumpEvents();
--- a/src/citra_qt/CMakeLists.txt
+++ b/src/citra_qt/CMakeLists.txt
@ -55,6 +55,7 @@ set(HEADERS
            configure_dialog.h
            configure_general.h
            game_list.h
+            game_list_p.h
            hotkeys.h
            main.h
            ui_settings.h
--- a/src/citra_qt/debugger/graphics_breakpoints.cpp
+++ b/src/citra_qt/debugger/graphics_breakpoints.cpp
@ -44,7 +44,7 @@ QVariant BreakPointModel::data(const QModelIndex& index, int role) const
                { Pica::DebugContext::Event::PicaCommandProcessed, tr("Pica command processed") },
                { Pica::DebugContext::Event::IncomingPrimitiveBatch, tr("Incoming primitive batch") },
                { Pica::DebugContext::Event::FinishedPrimitiveBatch, tr("Finished primitive batch") },
-                { Pica::DebugContext::Event::VertexLoaded, tr("Vertex loaded") },
+                { Pica::DebugContext::Event::VertexShaderInvocation, tr("Vertex shader invocation") },
                { Pica::DebugContext::Event::IncomingDisplayTransfer, tr("Incoming display transfer") },
                { Pica::DebugContext::Event::GSPCommandProcessed, tr("GSP command processed") },
                { Pica::DebugContext::Event::BufferSwapped, tr("Buffers swapped") }
--- a/src/citra_qt/debugger/graphics_vertex_shader.cpp
+++ b/src/citra_qt/debugger/graphics_vertex_shader.cpp
@ -365,7 +365,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De
        input_data[i]->setValidator(new QDoubleValidator(input_data[i]));
    }

-    breakpoint_warning = new QLabel(tr("(data only available at VertexLoaded breakpoints)"));
+    breakpoint_warning = new QLabel(tr("(data only available at vertex shader invocation breakpoints)"));

    // TODO: Add some button for jumping to the shader entry point

@ -454,7 +454,7 @@ GraphicsVertexShaderWidget::GraphicsVertexShaderWidget(std::shared_ptr< Pica::De

 void GraphicsVertexShaderWidget::OnBreakPointHit(Pica::DebugContext::Event event, void* data) {
    auto input = static_cast<Pica::Shader::InputVertex*>(data);
-    if (event == Pica::DebugContext::Event::VertexLoaded) {
+    if (event == Pica::DebugContext::Event::VertexShaderInvocation) {
        Reload(true, data);
    } else {
        // No vertex data is retrievable => invalidate currently stored vertex data
@ -501,7 +501,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
    info.labels.insert({ entry_point, "main" });

    // Generate debug information
-    debug_data = Pica::Shader::ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);
+    debug_data = Pica::g_state.vs.ProduceDebugInfo(input_vertex, num_attributes, shader_config, shader_setup);

    // Reload widget state
    for (int attr = 0; attr < num_attributes; ++attr) {
@ -515,7 +515,7 @@ void GraphicsVertexShaderWidget::Reload(bool replace_vertex_data, void* vertex_d
    }

    // Initialize debug info text for current cycle count
-    cycle_index->setMaximum(debug_data.records.size() - 1);
+    cycle_index->setMaximum(static_cast<int>(debug_data.records.size() - 1));
    OnCycleIndexChanged(cycle_index->value());

    model->endResetModel();
--- a/src/citra_qt/game_list.cpp
+++ b/src/citra_qt/game_list.cpp
@ -34,8 +34,8 @@ GameList::GameList(QWidget* parent)
    tree_view->setUniformRowHeights(true);

    item_model->insertColumns(0, COLUMN_COUNT);
-    item_model->setHeaderData(COLUMN_FILE_TYPE, Qt::Horizontal, "File type");
    item_model->setHeaderData(COLUMN_NAME, Qt::Horizontal, "Name");
+    item_model->setHeaderData(COLUMN_FILE_TYPE, Qt::Horizontal, "File type");
    item_model->setHeaderData(COLUMN_SIZE, Qt::Horizontal, "Size");

    connect(tree_view, SIGNAL(activated(const QModelIndex&)), this, SLOT(ValidateEntry(const QModelIndex&)));
@ -109,7 +109,11 @@ void GameList::SaveInterfaceLayout()
 void GameList::LoadInterfaceLayout()
 {
    auto header = tree_view->header();
-    header->restoreState(UISettings::values.gamelist_header_state);
+    if (!header->restoreState(UISettings::values.gamelist_header_state)) {
+        // We are using the name column to display icons and titles
+        // so make it as large as possible as default.
+        header->resizeSection(COLUMN_NAME, header->width());
+    }

    item_model->sort(header->sortIndicatorSection(), header->sortIndicatorOrder());
 }
@ -143,9 +147,15 @@ void GameListWorker::AddFstEntriesToGameList(const std::string& dir_path, bool d
                LOG_WARNING(Frontend, "Filetype and extension of file %s do not match.", physical_name.c_str());
            }

+            std::vector<u8> smdh;
+            std::unique_ptr<Loader::AppLoader> loader = Loader::GetLoader(FileUtil::IOFile(physical_name, "rb"), filetype, filename_filename, physical_name);
+
+            if (loader)
+                loader->ReadIcon(smdh);
+
            emit EntryReady({
+                new GameListItemPath(QString::fromStdString(physical_name), smdh),
                new GameListItem(QString::fromStdString(Loader::GetFileTypeString(filetype))),
-                new GameListItemPath(QString::fromStdString(physical_name)),
                new GameListItemSize(FileUtil::GetSize(physical_name)),
            });
        }
--- a/src/citra_qt/game_list.h
+++ b/src/citra_qt/game_list.h
@ -20,8 +20,8 @@ class GameList : public QWidget {

 public:
    enum {
-        COLUMN_FILE_TYPE,
        COLUMN_NAME,
+        COLUMN_FILE_TYPE,
        COLUMN_SIZE,
        COLUMN_COUNT, // Number of columns
    };
--- a/src/citra_qt/game_list_p.h
+++ b/src/citra_qt/game_list_p.h
@ -6,13 +6,85 @@

 #include <atomic>

+#include <QImage>
 #include <QRunnable>
 #include <QStandardItem>
 #include <QString>

 #include "citra_qt/util/util.h"
 #include "common/string_util.h"
+#include "common/color.h"

+#include "core/loader/loader.h"
+
+#include "video_core/utils.h"
+
+/**
+ * Checks whether a buffer looks like an SMDH: it must be at least as large as
+ * Loader::SMDH and begin with the 'SMDH' magic number.
+ * @param smdh_data data buffer to test
+ * @return true if both the length check and the magic number check pass, false otherwise
+ */
+static bool IsValidSMDH(const std::vector<u8>& smdh_data) {
+    const bool has_full_header = smdh_data.size() >= sizeof(Loader::SMDH);
+    if (!has_full_header) {
+        return false;
+    }
+
+    // Copy the leading bytes out with memcpy rather than reading through a casted pointer.
+    u32 leading_word;
+    memcpy(&leading_word, smdh_data.data(), sizeof(leading_word));
+
+    const auto expected_magic = Loader::MakeMagic('S', 'M', 'D', 'H');
+    return expected_magic == leading_word;
+}
+
+/**
+ * Decodes one of the two icons stored in an SMDH into a pixmap.
+ * @param smdh SMDH data
+ * @param large If true, returns large icon (48x48), otherwise returns small icon (24x24)
+ * @return QPixmap game icon
+ */
+static QPixmap GetIconFromSMDH(const Loader::SMDH& smdh, bool large) {
+    const u32 edge = large ? 48 : 24;
+    const u8* pixels = large ? smdh.large_icon.data() : smdh.small_icon.data();
+
+    QImage image(edge, edge, QImage::Format_RGB888);
+    for (u32 row = 0; row < edge; ++row) {
+        // Byte offset of the group of 8 rows this row falls in (row rounded down to a multiple of 8).
+        const u32 strip_offset = (row & ~7) * edge * 2;
+        for (u32 col = 0; col < edge; ++col) {
+            const u8* texel = pixels + VideoCore::GetMortonOffset(col, row, 2) + strip_offset;
+            const auto rgb = Color::DecodeRGB565(texel);
+            image.setPixel(col, row, qRgb(rgb.r(), rgb.g(), rgb.b()));
+        }
+    }
+    return QPixmap::fromImage(image);
+}
+
+/**
+ * Builds the placeholder icon used for games without valid SMDH: a fully transparent square.
+ * @param large If true, returns large icon (48x48), otherwise returns small icon (24x24)
+ * @return QPixmap default icon
+ */
+static QPixmap GetDefaultIcon(bool large) {
+    int edge = 24;
+    if (large) {
+        edge = 48;
+    }
+
+    QPixmap placeholder(edge, edge);
+    placeholder.fill(Qt::transparent);
+    return placeholder;
+}
+
+/**
+ * Gets the short game title from SMDH
+ * @param smdh SMDH data
+ * @param language title language
+ * @return QString short title (empty if the field starts with a zero code unit)
+ */
+static QString GetShortTitleFromSMDH(const Loader::SMDH& smdh, Loader::SMDH::TitleLanguage language) {
+    // NOTE(review): assumes short_title is a std::array-like container (it already exposes data()).
+    const auto& short_title = smdh.titles[static_cast<int>(language)].short_title;
+
+    // The title lives in a fixed-size field and a malformed SMDH may fill it without a
+    // terminating zero, so measure the string inside the field instead of letting
+    // QString::fromUtf16 scan for a terminator beyond the end of the field.
+    const int capacity = static_cast<int>(short_title.size());
+    int length = 0;
+    while (length < capacity && short_title[length] != 0) {
+        ++length;
+    }
+
+    return QString::fromUtf16(short_title.data(), length);
+}

 class GameListItem : public QStandardItem {

@ -27,29 +99,43 @@ public:
 * A specialization of GameListItem for path values.
 * This class ensures that for every full path value it holds, a correct string representation
 * of just the filename (with no extension) will be displayed to the user.
+ * If this class receives valid SMDH data, it will also display game icons and titles.
 */
 class GameListItemPath : public GameListItem {

 public:
    static const int FullPathRole = Qt::UserRole + 1;
+    static const int TitleRole = Qt::UserRole + 2; // Short title read from the SMDH; only set when the SMDH is valid

    GameListItemPath(): GameListItem() {}
-    GameListItemPath(const QString& game_path): GameListItem()
+    GameListItemPath(const QString& game_path, const std::vector<u8>& smdh_data): GameListItem()
    {
        setData(game_path, FullPathRole);
+
+        if (!IsValidSMDH(smdh_data)) {
+            // SMDH is not valid: set a default icon and leave TitleRole unset
+            setData(GetDefaultIcon(true), Qt::DecorationRole);
+            return;
+        }
+
+        Loader::SMDH smdh;
+        memcpy(&smdh, smdh_data.data(), sizeof(Loader::SMDH)); // IsValidSMDH above guarantees the buffer is large enough
+
+        // Get icon from SMDH
+        setData(GetIconFromSMDH(smdh, true), Qt::DecorationRole);
+
+        // Get title from SMDH
+        setData(GetShortTitleFromSMDH(smdh, Loader::SMDH::TitleLanguage::English), TitleRole);
    }

-    void setData(const QVariant& value, int role) override
-    {
-        // By specializing setData for FullPathRole, we can ensure that the two string
-        // representations of the data are always accurate and in the correct format.
-        if (role == FullPathRole) {
+    QVariant data(int role) const override {
+        if (role == Qt::DisplayRole) { // Display text is built on demand from FullPathRole and TitleRole
            std::string filename;
-            Common::SplitPath(value.toString().toStdString(), nullptr, &filename, nullptr);
-            GameListItem::setData(QString::fromStdString(filename), Qt::DisplayRole);
-            GameListItem::setData(value, FullPathRole);
+            Common::SplitPath(data(FullPathRole).toString().toStdString(), nullptr, &filename, nullptr);
+            QString title = data(TitleRole).toString(); // Empty when the constructor did not set a title
+            return QString::fromStdString(filename) + (title.isEmpty() ? "" : "\n    " + title);
        } else {
-            GameListItem::setData(value, role);
+            return GameListItem::data(role);
        }
    }
 };
--- a/src/citra_qt/main.cpp
+++ b/src/citra_qt/main.cpp
@ -6,6 +6,9 @@
 #include <memory>
 #include <thread>

+#include <glad/glad.h>
+
+#define QT_NO_OPENGL
 #include <QDesktopWidget>
 #include <QtGui>
 #include <QFileDialog>
@ -240,6 +243,14 @@ bool GMainWindow::InitializeSystem() {
    if (emu_thread != nullptr)
        ShutdownGame();

+    render_window->MakeCurrent();
+    if (!gladLoadGL()) {
+        QMessageBox::critical(this, tr("Error while starting Citra!"),
+                              tr("Failed to initialize the video core!\n\n"
+                                 "Please ensure that your GPU supports OpenGL 3.3 and that you have the latest graphics driver."));
+        return false;
+    }
+
    // Initialize the core emulation
    System::Result system_result = System::Init(render_window);
    if (System::Result::Success != system_result) {
--- a/src/citra_qt/util/util.cpp
+++ b/src/citra_qt/util/util.cpp
@ -19,7 +19,7 @@ QString ReadableByteSize(qulonglong size) {
    static const std::array<const char*, 6> units = { "B", "KiB", "MiB", "GiB", "TiB", "PiB" };
    if (size == 0)
        return "0";
-    int digit_groups = std::min<int>((int)(std::log10(size) / std::log10(1024)), units.size());
+    // Clamp to the last valid index of units: sizes of 1024^6 bytes or more would otherwise index one past "PiB".
+    int digit_groups = std::min<int>(static_cast<int>(std::log10(size) / std::log10(1024)), static_cast<int>(units.size()) - 1);
    return QString("%L1 %2").arg(size / std::pow(1024, digit_groups), 0, 'f', 1)
                            .arg(units[digit_groups]);
 }
--- a/src/common/logging/backend.cpp
+++ b/src/common/logging/backend.cpp
@ -65,6 +65,7 @@ namespace Log {
        SUB(Render, OpenGL) \
        CLS(Audio) \
        SUB(Audio, DSP) \
+        SUB(Audio, Sink) \
        CLS(Loader)

 // GetClassName is a macro defined by Windows.h, grrr...
--- a/src/common/logging/log.h
+++ b/src/common/logging/log.h
@ -78,8 +78,9 @@ enum class Class : ClassType {
    Render,                     ///< Emulator video output and hardware acceleration
    Render_Software,            ///< Software renderer backend
    Render_OpenGL,              ///< OpenGL backend
-    Audio,                      ///< Emulator audio output
+    Audio,                      ///< Audio emulation
    Audio_DSP,                  ///< The HLE implementation of the DSP
+    Audio_Sink,                 ///< Emulator audio output backend
    Loader,                     ///< ROM loader

    Count ///< Total number of logging classes
--- a/src/common/swap.h
+++ b/src/common/swap.h
@ -25,6 +25,8 @@
    #include <sys/endian.h>
 #endif

+#include <cstring>
+
 #include "common/common_types.h"

 // GCC 4.6+
@ -58,9 +60,6 @@

 namespace Common {

-inline u8 swap8(u8 _data) {return _data;}
-inline u32 swap24(const u8* _data) {return (_data[0] << 16) | (_data[1] << 8) | _data[2];}
-
 #ifdef _MSC_VER
 inline u16 swap16(u16 _data) {return _byteswap_ushort(_data);}
 inline u32 swap32(u32 _data) {return _byteswap_ulong (_data);}
@ -92,52 +91,29 @@ inline u64 swap64(u64 data) {return ((u64)swap32(data) << 32) | swap32(data >> 3
 #endif

 inline float swapf(float f) {
-    union {
-        float f;
-        unsigned int u32;
-    } dat1, dat2;
+    static_assert(sizeof(u32) == sizeof(float),
+                  "float must be the same size as uint32_t.");

-    dat1.f = f;
-    dat2.u32 = swap32(dat1.u32);
+    u32 value;
+    std::memcpy(&value, &f, sizeof(u32));

-    return dat2.f;
+    value = swap32(value);
+    std::memcpy(&f, &value, sizeof(u32));
+
+    return f;
 }

 inline double swapd(double f) {
-    union  {
-        double f;
-        unsigned long long u64;
-    } dat1, dat2;
+    static_assert(sizeof(u64) == sizeof(double),
+                  "double must be the same size as uint64_t.");

-    dat1.f = f;
-    dat2.u64 = swap64(dat1.u64);
+    u64 value;
+    std::memcpy(&value, &f, sizeof(u64));

-    return dat2.f;
-}
+    value = swap64(value);
+    std::memcpy(&f, &value, sizeof(u64));

-inline u16 swap16(const u8* _pData) {return swap16(*(const u16*)_pData);}
-inline u32 swap32(const u8* _pData) {return swap32(*(const u32*)_pData);}
-inline u64 swap64(const u8* _pData) {return swap64(*(const u64*)_pData);}
-
-template <int count>
-void swap(u8*);
-
-template <>
-inline void swap<1>(u8* data) { }
-
-template <>
-inline void swap<2>(u8* data) {
-    *reinterpret_cast<u16*>(data) = swap16(data);
-}
-
-template <>
-inline void swap<4>(u8* data) {
-    *reinterpret_cast<u32*>(data) = swap32(data);
-}
-
-template <>
-inline void swap<8>(u8* data) {
-    *reinterpret_cast<u64*>(data) = swap64(data);
+    return f;
 }

 }  // Namespace Common
@ -534,35 +510,35 @@ bool operator==(const S &p, const swap_struct_t<T, F> v) {
 template <typename T>
 struct swap_64_t {
    static T swap(T x) {
-        return (T)Common::swap64(*(u64 *)&x);
+        return static_cast<T>(Common::swap64(x));
    }
 };

 template <typename T>
 struct swap_32_t {
    static T swap(T x) {
-        return (T)Common::swap32(*(u32 *)&x);
+        return static_cast<T>(Common::swap32(x));
    }
 };

 template <typename T>
 struct swap_16_t {
    static T swap(T x) {
-        return (T)Common::swap16(*(u16 *)&x);
+        return static_cast<T>(Common::swap16(x));
    }
 };

 template <typename T>
 struct swap_float_t {
    static T swap(T x) {
-        return (T)Common::swapf(*(float *)&x);
+        return static_cast<T>(Common::swapf(x));
    }
 };

 template <typename T>
 struct swap_double_t {
    static T swap(T x) {
-        return (T)Common::swapd(*(double *)&x);
+        return static_cast<T>(Common::swapd(x));
    }
 };

--- a/src/core/arm/dyncom/arm_dyncom.cpp
+++ b/src/core/arm/dyncom/arm_dyncom.cpp
@ -93,7 +93,7 @@ void ARM_DynCom::ResetContext(Core::ThreadContext& context, u32 stack_top, u32 e
    context.cpu_registers[0] = arg;
    context.pc = entry_point;
    context.sp = stack_top;
-    context.cpsr = 0x1F | ((entry_point & 1) << 5); // Usermode and THUMB mode
+    context.cpsr = USER32MODE | ((entry_point & 1) << 5); // Usermode and THUMB mode
 }

 void ARM_DynCom::SaveContext(Core::ThreadContext& ctx) {
--- a/src/core/core.cpp
+++ b/src/core/core.cpp
@ -51,7 +51,7 @@ void RunLoop(int tight_loop) {
    }

    HW::Update();
-    if (HLE::g_reschedule) {
+    if (HLE::IsReschedulePending()) {
        Kernel::Reschedule();
    }
 }
--- a/src/core/gdbstub/gdbstub.cpp
+++ b/src/core/gdbstub/gdbstub.cpp
@ -374,7 +374,7 @@ static void SendReply(const char* reply) {

    memset(command_buffer, 0, sizeof(command_buffer));

-    command_length = strlen(reply);
+    command_length = static_cast<u32>(strlen(reply));
    if (command_length + 4 > sizeof(command_buffer)) {
        LOG_ERROR(Debug_GDBStub, "command_buffer overflow in SendReply");
        return;
@ -437,7 +437,7 @@ static void HandleSetThread() {
 *
 * @param signal Signal to be sent to client.
 */
-void SendSignal(u32 signal) {
+static void SendSignal(u32 signal) {
    if (gdbserver_socket == -1) {
        return;
    }
@ -515,7 +515,7 @@ static bool IsDataAvailable() {
        return false;
    }

-    return FD_ISSET(gdbserver_socket, &fd_socket);
+    return FD_ISSET(gdbserver_socket, &fd_socket) != 0;
 }

 /// Send requested register to gdb client.
@ -633,10 +633,10 @@ static void ReadMemory() {

    auto start_offset = command_buffer+1;
    auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));

    start_offset = addr_pos+1;
-    u32 len = HexToInt(start_offset, (command_buffer + command_length) - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>((command_buffer + command_length) - start_offset));

    LOG_DEBUG(Debug_GDBStub, "gdb: addr: %08x len: %08x\n", addr, len);

@ -658,11 +658,11 @@ static void ReadMemory() {
 static void WriteMemory() {
    auto start_offset = command_buffer+1;
    auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));

    start_offset = addr_pos+1;
    auto len_pos = std::find(start_offset, command_buffer+command_length, ':');
-    u32 len = HexToInt(start_offset, len_pos - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>(len_pos - start_offset));

    u8* dst = Memory::GetPointer(addr);
    if (!dst) {
@ -713,7 +713,7 @@ static void Continue() {
 * @param addr Address of breakpoint.
 * @param len Length of breakpoint.
 */
-bool CommitBreakpoint(BreakpointType type, PAddr addr, u32 len) {
+static bool CommitBreakpoint(BreakpointType type, PAddr addr, u32 len) {
    std::map<u32, Breakpoint>& p = GetBreakpointList(type);

    Breakpoint breakpoint;
@ -752,10 +752,10 @@ static void AddBreakpoint() {

    auto start_offset = command_buffer+3;
    auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));

    start_offset = addr_pos+1;
-    u32 len = HexToInt(start_offset, (command_buffer + command_length) - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>((command_buffer + command_length) - start_offset));

    if (type == BreakpointType::Access) {
        // Access is made up of Read and Write types, so add both breakpoints
@ -800,10 +800,10 @@ static void RemoveBreakpoint() {

    auto start_offset = command_buffer+3;
    auto addr_pos = std::find(start_offset, command_buffer+command_length, ',');
-    PAddr addr = HexToInt(start_offset, addr_pos - start_offset);
+    PAddr addr = HexToInt(start_offset, static_cast<u32>(addr_pos - start_offset));

    start_offset = addr_pos+1;
-    u32 len = HexToInt(start_offset, (command_buffer + command_length) - start_offset);
+    u32 len = HexToInt(start_offset, static_cast<u32>((command_buffer + command_length) - start_offset));

    if (type == BreakpointType::Access) {
        // Access is made up of Read and Write types, so add both breakpoints
@ -907,7 +907,7 @@ void ToggleServer(bool status) {
    }
 }

-void Init(u16 port) {
+static void Init(u16 port) {
    if (!g_server_enabled) {
        // Set the halt loop to false in case the user enabled the gdbstub mid-execution.
        // This way the CPU can still execute normally.
--- a/src/core/hle/applets/mii_selector.cpp
+++ b/src/core/hle/applets/mii_selector.cpp
@ -21,13 +21,6 @@
 namespace HLE {
 namespace Applets {

-MiiSelector::MiiSelector(Service::APT::AppletId id) : Applet(id), started(false) {
-    // Create the SharedMemory that will hold the framebuffer data
-    // TODO(Subv): What size should we use here?
-    using Kernel::MemoryPermission;
-    framebuffer_memory = Kernel::SharedMemory::Create(0x1000, MemoryPermission::ReadWrite, MemoryPermission::ReadWrite, "MiiSelector Memory");
-}
-
 ResultCode MiiSelector::ReceiveParameter(const Service::APT::MessageParameter& parameter) {
    if (parameter.signal != static_cast<u32>(Service::APT::SignalType::LibAppJustStarted)) {
        LOG_ERROR(Service_APT, "unsupported signal %u", parameter.signal);
@ -36,8 +29,18 @@ ResultCode MiiSelector::ReceiveParameter(const Service::APT::MessageParameter& p
        return ResultCode(-1);
    }

+    // The LibAppJustStarted message contains a buffer with the size of the framebuffer shared memory.
+    // Create the SharedMemory that will hold the framebuffer data
+    Service::APT::CaptureBufferInfo capture_info;
+    ASSERT(sizeof(capture_info) == parameter.buffer_size);
+
+    memcpy(&capture_info, parameter.data, sizeof(capture_info));
+    using Kernel::MemoryPermission;
+    framebuffer_memory = Kernel::SharedMemory::Create(capture_info.size, MemoryPermission::ReadWrite,
+                                                      MemoryPermission::ReadWrite, "MiiSelector Memory");
+
+    // Send the response message with the newly created SharedMemory
    Service::APT::MessageParameter result;
-    // The buffer passed in parameter contains the data returned by GSPGPU::ImportDisplayCaptureInfo
    result.signal = static_cast<u32>(Service::APT::SignalType::LibAppFinished);
    result.data = nullptr;
    result.buffer_size = 0;
@ -55,6 +58,11 @@ ResultCode MiiSelector::StartImpl(const Service::APT::AppletStartupParameter& pa
    // TODO(Subv): Set the expected fields in the response buffer before resending it to the application.
    // TODO(Subv): Reverse the parameter format for the Mii Selector

+    if(parameter.buffer_size >= sizeof(u32)) {
+        // TODO: defaults return no error, but garbage in other unknown fields
+        memset(parameter.data, 0, sizeof(u32));
+    }
+
    // Let the application know that we're closing
    Service::APT::MessageParameter message;
    message.buffer_size = parameter.buffer_size;
--- a/src/core/hle/applets/mii_selector.h
+++ b/src/core/hle/applets/mii_selector.h
@ -16,17 +16,61 @@
 namespace HLE {
 namespace Applets {

+struct MiiConfig { // Fields are named unk_<byte offset>; their meaning is apparently not yet known
+    u8  unk_000;
+    u8  unk_001;
+    u8  unk_002;
+    u8  unk_003;
+    u8  unk_004;
+    INSERT_PADDING_BYTES(3);
+    u16 unk_008;
+    INSERT_PADDING_BYTES(0x8C - 0xA); // Pads from offset 0x0A up to offset 0x8C
+    u8  unk_08C;
+    INSERT_PADDING_BYTES(3);
+    u16 unk_090;
+    INSERT_PADDING_BYTES(2);
+    u32 unk_094;
+    u16 unk_098;
+    u8  unk_09A[0x64];
+    u8  unk_0FE;
+    u8  unk_0FF;
+    u32 unk_100;
+};
+
+static_assert(sizeof(MiiConfig) == 0x104, "MiiConfig structure has incorrect size"); // Compile-time check of the total layout size
+#define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(MiiConfig, field_name) == position, "Field "#field_name" has invalid position")
+ASSERT_REG_POSITION(unk_008, 0x08);
+ASSERT_REG_POSITION(unk_08C, 0x8C);
+ASSERT_REG_POSITION(unk_090, 0x90);
+ASSERT_REG_POSITION(unk_094, 0x94);
+ASSERT_REG_POSITION(unk_0FE, 0xFE);
+#undef ASSERT_REG_POSITION
+
+struct MiiResult { // Apart from result_code, fields are named unk_<byte offset>
+    u32 result_code;
+    u8 unk_04;
+    INSERT_PADDING_BYTES(7); // Pads from offset 0x05 up to offset 0x0C
+    u8 unk_0C[0x60];
+    u8 unk_6C[0x16];
+    INSERT_PADDING_BYTES(2);
+};
+static_assert(sizeof(MiiResult) == 0x84, "MiiResult structure has incorrect size"); // Compile-time check of the total layout size
+#define ASSERT_REG_POSITION(field_name, position) static_assert(offsetof(MiiResult, field_name) == position, "Field "#field_name" has invalid position")
+ASSERT_REG_POSITION(unk_0C, 0x0C);
+ASSERT_REG_POSITION(unk_6C, 0x6C);
+#undef ASSERT_REG_POSITION
+
 class MiiSelector final : public Applet {
 public:
-    MiiSelector(Service::APT::AppletId id);
+    MiiSelector(Service::APT::AppletId id) : Applet(id), started(false) { }

    ResultCode ReceiveParameter(const Service::APT::MessageParameter& parameter) override;
    ResultCode StartImpl(const Service::APT::AppletStartupParameter& parameter) override;
    void Update() override;
    bool IsRunning() const override { return started; }

-    /// TODO(Subv): Find out what this is actually used for.
-    /// It is believed that the application stores the current screen image here.
+    /// This SharedMemory will be created when we receive the LibAppJustStarted message.
+    /// It holds the framebuffer info retrieved by the application with GSPGPU::ImportDisplayCaptureInfo
    Kernel::SharedPtr<Kernel::SharedMemory> framebuffer_memory;

    /// Whether this applet is currently running instead of the host application or not.
--- a/src/core/hle/applets/swkbd.cpp
+++ b/src/core/hle/applets/swkbd.cpp
@ -24,13 +24,6 @@
 namespace HLE {
 namespace Applets {

-SoftwareKeyboard::SoftwareKeyboard(Service::APT::AppletId id) : Applet(id), started(false) {
-    // Create the SharedMemory that will hold the framebuffer data
-    // TODO(Subv): What size should we use here?
-    using Kernel::MemoryPermission;
-    framebuffer_memory = Kernel::SharedMemory::Create(0x1000, MemoryPermission::ReadWrite, MemoryPermission::ReadWrite, "SoftwareKeyboard Memory");
-}
-
 ResultCode SoftwareKeyboard::ReceiveParameter(Service::APT::MessageParameter const& parameter) {
    if (parameter.signal != static_cast<u32>(Service::APT::SignalType::LibAppJustStarted)) {
        LOG_ERROR(Service_APT, "unsupported signal %u", parameter.signal);
@ -39,8 +32,19 @@ ResultCode SoftwareKeyboard::ReceiveParameter(Service::APT::MessageParameter con
        return ResultCode(-1);
    }

+    // The LibAppJustStarted message contains a buffer with the size of the framebuffer shared memory.
+    // Create the SharedMemory that will hold the framebuffer data
+    Service::APT::CaptureBufferInfo capture_info;
+    ASSERT(sizeof(capture_info) == parameter.buffer_size);
+
+    memcpy(&capture_info, parameter.data, sizeof(capture_info));
+
+    using Kernel::MemoryPermission;
+    framebuffer_memory = Kernel::SharedMemory::Create(capture_info.size, MemoryPermission::ReadWrite,
+                                                      MemoryPermission::ReadWrite, "SoftwareKeyboard Memory");
+
+    // Send the response message with the newly created SharedMemory
    Service::APT::MessageParameter result;
-    // The buffer passed in parameter contains the data returned by GSPGPU::ImportDisplayCaptureInfo
    result.signal = static_cast<u32>(Service::APT::SignalType::LibAppFinished);
    result.data = nullptr;
    result.buffer_size = 0;
--- a/src/core/hle/applets/swkbd.h
+++ b/src/core/hle/applets/swkbd.h
@ -53,8 +53,7 @@ static_assert(sizeof(SoftwareKeyboardConfig) == 0x400, "Software Keyboard Config

 class SoftwareKeyboard final : public Applet {
 public:
-    SoftwareKeyboard(Service::APT::AppletId id);
-    ~SoftwareKeyboard() {}
+    SoftwareKeyboard(Service::APT::AppletId id) : Applet(id), started(false) { }

    ResultCode ReceiveParameter(const Service::APT::MessageParameter& parameter) override;
    ResultCode StartImpl(const Service::APT::AppletStartupParameter& parameter) override;
@ -72,8 +71,8 @@ public:
     */
    void Finalize();

-    /// TODO(Subv): Find out what this is actually used for.
-    /// It is believed that the application stores the current screen image here.
+    /// This SharedMemory will be created when we receive the LibAppJustStarted message.
+    /// It holds the framebuffer info retrieved by the application with GSPGPU::ImportDisplayCaptureInfo
    Kernel::SharedPtr<Kernel::SharedMemory> framebuffer_memory;

    /// SharedMemory where the output text will be stored
--- a/src/core/hle/hle.cpp
+++ b/src/core/hle/hle.cpp
@ -12,9 +12,13 @@

 ////////////////////////////////////////////////////////////////////////////////////////////////////

-namespace HLE {
+namespace {

-bool g_reschedule; ///< If true, immediately reschedules the CPU to a new thread
+bool reschedule; ///< If true, immediately reschedules the CPU to a new thread
+
+}
+
+namespace HLE {

 void Reschedule(const char *reason) {
    DEBUG_ASSERT_MSG(reason != nullptr && strlen(reason) < 256, "Reschedule: Invalid or too long reason.");
@ -27,13 +31,21 @@ void Reschedule(const char *reason) {

    Core::g_app_core->PrepareReschedule();

-    g_reschedule = true;
+    reschedule = true;
+}
+
+bool IsReschedulePending() { // True after Reschedule() set the flag and before DoneRescheduling() or Init() cleared it
+    return reschedule;
+}
+
+void DoneRescheduling() { // Clears the pending-reschedule flag; called from Kernel::Reschedule()
+    reschedule = false;
+}

 void Init() {
    Service::Init();

-    g_reschedule = false;
+    reschedule = false;

    LOG_DEBUG(Kernel, "initialized OK");
 }
--- a/src/core/hle/hle.h
+++ b/src/core/hle/hle.h
@ -13,9 +13,9 @@ const Handle INVALID_HANDLE = 0;

 namespace HLE {

-extern bool g_reschedule;   ///< If true, immediately reschedules the CPU to a new thread
-
 void Reschedule(const char *reason);
+bool IsReschedulePending();
+void DoneRescheduling();

 void Init();
 void Shutdown();
--- a/src/core/hle/kernel/process.h
+++ b/src/core/hle/kernel/process.h
@ -107,6 +107,8 @@ public:
    ProcessFlags flags;
    /// Kernel compatibility version for this process
    u16 kernel_version = 0;
+    /// The default CPU for this process, threads are scheduled on this cpu by default.
+    u8 ideal_processor = 0;

    /// The id of this process
    u32 process_id = next_process_id++;
--- a/src/core/hle/kernel/thread.cpp
+++ b/src/core/hle/kernel/thread.cpp
@ -483,7 +483,8 @@ void Reschedule() {

    Thread* cur = GetCurrentThread();
    Thread* next = PopNextReadyThread();
-    HLE::g_reschedule = false;
+
+    HLE::DoneRescheduling();

    // Don't bother switching to the same thread
    if (next == cur)
--- a/src/core/hle/service/apt/apt.h
+++ b/src/core/hle/service/apt/apt.h
@ -5,6 +5,7 @@
 #pragma once

 #include "common/common_types.h"
+#include "common/swap.h"

 #include "core/hle/kernel/kernel.h"

@ -31,6 +32,20 @@ struct AppletStartupParameter {
    u8* data = nullptr;
 };

+/// Used by the application to pass information about the current framebuffer to applets.
+/// The applets copy it out of the parameter buffer of the LibAppJustStarted message.
+struct CaptureBufferInfo {
+    u32_le size; // The applets create their framebuffer SharedMemory with this size
+    u8 is_3d;
+    INSERT_PADDING_BYTES(0x3); // Padding for alignment
+    u32_le top_screen_left_offset;
+    u32_le top_screen_right_offset;
+    u32_le top_screen_format;
+    u32_le bottom_screen_left_offset;
+    u32_le bottom_screen_right_offset;
+    u32_le bottom_screen_format;
+};
+static_assert(sizeof(CaptureBufferInfo) == 0x20, "CaptureBufferInfo struct has incorrect size"); // The applets assert that the incoming buffer is exactly this size
+
 /// Signals used by APT functions
 enum class SignalType : u32 {
    None              = 0x0,
--- a/src/core/hle/service/dsp_dsp.cpp
+++ b/src/core/hle/service/dsp_dsp.cpp
@ -288,7 +288,7 @@ static void WriteProcessPipe(Service::Interface* self) {
    ASSERT_MSG(Memory::GetPointer(buffer) != nullptr, "Invalid Buffer: pipe=%u, size=0x%X, buffer=0x%08X", pipe_index, size, buffer);

    std::vector<u8> message(size);
-    for (size_t i = 0; i < size; i++) {
+    for (u32 i = 0; i < size; i++) {
        message[i] = Memory::Read8(buffer + i);
    }

@ -403,7 +403,7 @@ static void GetPipeReadableSize(Service::Interface* self) {

    cmd_buff[0] = IPC::MakeHeader(0xF, 2, 0);
    cmd_buff[1] = RESULT_SUCCESS.raw; // No error
-    cmd_buff[2] = DSP::HLE::GetPipeReadableSize(pipe);
+    cmd_buff[2] = static_cast<u32>(DSP::HLE::GetPipeReadableSize(pipe));

    LOG_DEBUG(Service_DSP, "pipe=%u, unknown=0x%08X, return cmd_buff[2]=0x%08X", pipe_index, unknown, cmd_buff[2]);
 }
--- a/src/core/hle/svc.cpp
+++ b/src/core/hle/svc.cpp
@ -496,6 +496,11 @@ static ResultCode CreateThread(Handle* out_handle, s32 priority, u32 entry_point
        break;
    }

+    if (processor_id == THREADPROCESSORID_1 || processor_id == THREADPROCESSORID_ALL ||
+        (processor_id == THREADPROCESSORID_DEFAULT && Kernel::g_current_process->ideal_processor == THREADPROCESSORID_1)) {
+        LOG_WARNING(Kernel_SVC, "Newly created thread is allowed to be run in the SysCore, unimplemented.");
+    }
+
    CASCADE_RESULT(SharedPtr<Thread> thread, Kernel::Thread::Create(
            name, entry_point, priority, arg, processor_id, stack_top));
    CASCADE_RESULT(*out_handle, Kernel::g_handle_table.Create(std::move(thread)));
--- a/src/core/hw/gpu.cpp
+++ b/src/core/hw/gpu.cpp
@ -188,10 +188,10 @@ inline void Write(u32 addr, const T data) {
                    u32 output_gap = config.texture_copy.output_gap * 16;

                    size_t contiguous_input_size = config.texture_copy.size / input_width * (input_width + input_gap);
-                    Memory::RasterizerFlushRegion(config.GetPhysicalInputAddress(), contiguous_input_size);
+                    Memory::RasterizerFlushRegion(config.GetPhysicalInputAddress(), static_cast<u32>(contiguous_input_size));

                    size_t contiguous_output_size = config.texture_copy.size / output_width * (output_width + output_gap);
-                    Memory::RasterizerFlushAndInvalidateRegion(config.GetPhysicalOutputAddress(), contiguous_output_size);
+                    Memory::RasterizerFlushAndInvalidateRegion(config.GetPhysicalOutputAddress(), static_cast<u32>(contiguous_output_size));

                    u32 remaining_size = config.texture_copy.size;
                    u32 remaining_input = input_width;
--- a/src/core/loader/3dsx.cpp
+++ b/src/core/loader/3dsx.cpp
@ -178,11 +178,11 @@ static THREEDSX_Error Load3DSXFile(FileUtil::IOFile& file, u32 base_addr, Shared
                for (unsigned current_inprogress = 0; current_inprogress < remaining && pos < end_pos; current_inprogress++) {
                    const auto& table = reloc_table[current_inprogress];
                    LOG_TRACE(Loader, "(t=%d,skip=%u,patch=%u)", current_segment_reloc_table,
-                              (u32)table.skip, (u32)table.patch);
+                              static_cast<u32>(table.skip), static_cast<u32>(table.patch));
                    pos += table.skip;
                    s32 num_patches = table.patch;
                    while (0 < num_patches && pos < end_pos) {
-                        u32 in_addr = (u8*)pos - program_image.data();
+                        u32 in_addr = static_cast<u32>(reinterpret_cast<u8*>(pos) - program_image.data());
                        u32 addr = TranslateAddr(*pos, &loadinfo, offsets);
                        LOG_TRACE(Loader, "Patching %08X <-- rel(%08X,%d) (%08X)",
                                  base_addr + in_addr, addr, current_segment_reloc_table, *pos);
@ -284,7 +284,7 @@ ResultStatus AppLoader_THREEDSX::ReadRomFS(std::shared_ptr<FileUtil::IOFile>& ro
    // Check if the 3DSX has a RomFS...
    if (hdr.fs_offset != 0) {
        u32 romfs_offset = hdr.fs_offset;
-        u32 romfs_size = file.GetSize() - hdr.fs_offset;
+        u32 romfs_size = static_cast<u32>(file.GetSize()) - hdr.fs_offset;

        LOG_DEBUG(Loader, "RomFS offset:           0x%08X", romfs_offset);
        LOG_DEBUG(Loader, "RomFS size:             0x%08X", romfs_size);
@ -303,4 +303,31 @@ ResultStatus AppLoader_THREEDSX::ReadRomFS(std::shared_ptr<FileUtil::IOFile>& ro
    return ResultStatus::ErrorNotUsed;
 }

+/**
+ * Reads the SMDH section of the 3DSX file into `buffer`.
+ * @param buffer Resized to the header's smdh_size and filled with the section bytes
+ * @return Success when the section was read completely; ErrorNotUsed when the header's
+ *         smdh_offset is 0; Error when the file is not open, the header cannot be read in
+ *         full, header_size does not match THREEDSX_Header, or the section read comes up short
+ */
+ResultStatus AppLoader_THREEDSX::ReadIcon(std::vector<u8>& buffer) {
+    if (!file.IsOpen())
+        return ResultStatus::Error;
+
+    // Rewind first: the header is read from offset 0 and the file may have been read before.
+    file.Seek(0, SEEK_SET);
+
+    THREEDSX_Header hdr;
+    if (file.ReadBytes(&hdr, sizeof(THREEDSX_Header)) != sizeof(THREEDSX_Header))
+        return ResultStatus::Error;
+
+    if (hdr.header_size != sizeof(THREEDSX_Header))
+        return ResultStatus::Error;
+
+    // Check if the 3DSX has a SMDH (a zero offset means there is none)...
+    if (hdr.smdh_offset != 0) {
+        file.Seek(hdr.smdh_offset, SEEK_SET);
+        buffer.resize(hdr.smdh_size);
+
+        // Use data() rather than &buffer[0]: the size comes from the file, and indexing an
+        // empty vector (smdh_size == 0) is undefined behaviour.
+        if (file.ReadBytes(buffer.data(), hdr.smdh_size) != hdr.smdh_size)
+            return ResultStatus::Error;
+
+        return ResultStatus::Success;
+    }
+    return ResultStatus::ErrorNotUsed;
+}
+
 } // namespace Loader
--- a/src/core/loader/3dsx.h
+++ b/src/core/loader/3dsx.h
@ -17,7 +17,7 @@ namespace Loader {
 /// Loads an 3DSX file
 class AppLoader_THREEDSX final : public AppLoader {
 public:
-    AppLoader_THREEDSX(FileUtil::IOFile&& file, std::string filename, const std::string& filepath)
+    AppLoader_THREEDSX(FileUtil::IOFile&& file, const std::string& filename, const std::string& filepath)
        : AppLoader(std::move(file)), filename(std::move(filename)), filepath(filepath) {}

    /**
@ -33,6 +33,13 @@ public:
     */
    ResultStatus Load() override;

+    /**
+     * Get the icon (typically icon section) of the application
+     * @param buffer Reference to buffer to store data
+     * @return ResultStatus result of function
+     */
+    ResultStatus ReadIcon(std::vector<u8>& buffer) override;
+
    /**
     * Get the RomFS of the application
     * @param romfs_file Reference to buffer to store data
--- a/src/core/loader/loader.cpp
+++ b/src/core/loader/loader.cpp
@ -90,6 +90,28 @@ const char* GetFileTypeString(FileType type) {
    return "unknown";
 }

+std::unique_ptr<AppLoader> GetLoader(FileUtil::IOFile&& file, FileType type,
+    const std::string& filename, const std::string& filepath) {
+    // 3DSX: the loader receives both the bare file name and the full path.
+    if (type == FileType::THREEDSX)
+        return std::make_unique<AppLoader_THREEDSX>(std::move(file), filename, filepath);
+
+    // Standard ELF: the loader only receives the bare file name.
+    if (type == FileType::ELF)
+        return std::make_unique<AppLoader_ELF>(std::move(file), filename);
+
+    // CXI and CCI (NCCH/NCSD containers) share one loader, which receives the full path.
+    if (type == FileType::CXI || type == FileType::CCI)
+        return std::make_unique<AppLoader_NCCH>(std::move(file), filepath);
+
+    // No loader exists for any other type; `file` has not been moved from on this path.
+    return nullptr;
+}
+
 ResultStatus LoadFile(const std::string& filename) {
    FileUtil::IOFile file(filename, "rb");
    if (!file.IsOpen()) {
@ -111,37 +133,28 @@ ResultStatus LoadFile(const std::string& filename) {

    LOG_INFO(Loader, "Loading file %s as %s...", filename.c_str(), GetFileTypeString(type));

+    std::unique_ptr<AppLoader> app_loader = GetLoader(std::move(file), type, filename_filename, filename);
+
    switch (type) {

-    //3DSX file format...
+    // 3DSX file format...
+    // or NCCH/NCSD container formats...
    case FileType::THREEDSX:
+    case FileType::CXI:
+    case FileType::CCI:
    {
-        AppLoader_THREEDSX app_loader(std::move(file), filename_filename, filename);
        // Load application and RomFS
-        if (ResultStatus::Success == app_loader.Load()) {
-            Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_RomFS>(app_loader), Service::FS::ArchiveIdCode::RomFS);
+        ResultStatus result = app_loader->Load();
+        if (ResultStatus::Success == result) {
+            Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_RomFS>(*app_loader), Service::FS::ArchiveIdCode::RomFS);
            return ResultStatus::Success;
        }
-        break;
+        return result;
    }

    // Standard ELF file format...
    case FileType::ELF:
-        return AppLoader_ELF(std::move(file), filename_filename).Load();
-
-    // NCCH/NCSD container formats...
-    case FileType::CXI:
-    case FileType::CCI:
-    {
-        AppLoader_NCCH app_loader(std::move(file), filename);
-
-        // Load application and RomFS
-        ResultStatus result = app_loader.Load();
-        if (ResultStatus::Success == result) {
-            Service::FS::RegisterArchiveType(std::make_unique<FileSys::ArchiveFactory_RomFS>(app_loader), Service::FS::ArchiveIdCode::RomFS);
-        }
-        return result;
-    }
+        return app_loader->Load();

    // CIA file format...
    case FileType::CIA:
--- a/src/core/loader/loader.h
+++ b/src/core/loader/loader.h
@ -10,8 +10,10 @@
 #include <string>
 #include <vector>

+#include "common/common_funcs.h"
 #include "common/common_types.h"
 #include "common/file_util.h"
+#include "common/swap.h"

 namespace Kernel {
 struct AddressMapping;
@ -78,6 +80,51 @@ constexpr u32 MakeMagic(char a, char b, char c, char d) {
    return a | b << 8 | c << 16 | d << 24;
 }

+/// SMDH data structure that contains titles, icons etc. See https://www.3dbrew.org/wiki/SMDH
+struct SMDH {
+    u32_le magic; ///< NOTE(review): presumably the "SMDH" file magic — confirm against the linked wiki
+    u16_le version;
+    INSERT_PADDING_BYTES(2); // keeps the titles array at offset 8
+
+    struct Title { // 0x200 bytes per entry: (0x40 + 0x80 + 0x40) 16-bit units
+        std::array<u16, 0x40> short_title; ///< 0x40 16-bit code units (presumably UTF-16 — confirm)
+        std::array<u16, 0x80> long_title; ///< 0x80 16-bit code units
+        std::array<u16, 0x40> publisher; ///< 0x40 16-bit code units
+    };
+    std::array<Title, 16> titles; ///< 16 slots; TitleLanguage below names the language of slots 0-11
+
+    std::array<u8, 16> ratings;
+    u32_le region_lockout;
+    u32_le match_maker_id;
+    u64_le match_maker_bit_id;
+    u32_le flags;
+    u16_le eula_version;
+    INSERT_PADDING_BYTES(2);
+    float_le banner_animation_frame;
+    u32_le cec_id;
+    INSERT_PADDING_BYTES(8);
+
+    std::array<u8, 0x480> small_icon; ///< 0x480 = 24 * 24 * 2 bytes; NOTE(review): presumably a 24x24 16bpp image — confirm
+    std::array<u8, 0x1200> large_icon; ///< 0x1200 = 48 * 48 * 2 bytes; NOTE(review): presumably a 48x48 16bpp image — confirm
+
+    /// Indicates the language used for each title entry (the value is the index into `titles`)
+    enum class TitleLanguage {
+        Japanese = 0,
+        English = 1,
+        French = 2,
+        German = 3,
+        Italian = 4,
+        Spanish = 5,
+        SimplifiedChinese = 6,
+        Korean= 7,
+        Dutch = 8,
+        Portuguese = 9,
+        Russian = 10,
+        TraditionalChinese = 11
+    };
+};
+static_assert(sizeof(SMDH) == 0x36C0, "SMDH structure size is wrong"); // 8 + 16 * 0x200 + 0x38 + 0x480 + 0x1200 bytes
+
 /// Interface for loading an application
 class AppLoader : NonCopyable {
 public:
@ -149,6 +196,16 @@ protected:
 */
 extern const std::initializer_list<Kernel::AddressMapping> default_address_mappings;

+/**
+ * Get a loader for a file with a specific type
+ * @param file The file to load
+ * @param type The type of the file
+ * @param filename the file name (without path)
+ * @param filepath the file full path (with name)
+ * @return A pointer to the loader object, or nullptr if the file type is unsupported
+ */
+std::unique_ptr<AppLoader> GetLoader(FileUtil::IOFile&& file, FileType type, const std::string& filename, const std::string& filepath);
+
 /**
 * Identifies and loads a bootable file
 * @param filename String filename of bootable file
--- a/src/core/loader/ncch.cpp
+++ b/src/core/loader/ncch.cpp
@ -156,6 +156,9 @@ ResultStatus AppLoader_NCCH::LoadExec() {
        Kernel::g_current_process->resource_limit = Kernel::ResourceLimit::GetForCategory(
            static_cast<Kernel::ResourceLimitCategory>(exheader_header.arm11_system_local_caps.resource_limit_category));

+        // Set the default CPU core for this process
+        Kernel::g_current_process->ideal_processor = exheader_header.arm11_system_local_caps.ideal_processor;
+
        // Copy data while converting endianess
        std::array<u32, ARRAY_SIZE(exheader_header.arm11_kernel_caps.descriptors)> kernel_caps;
        std::copy_n(exheader_header.arm11_kernel_caps.descriptors, kernel_caps.size(), begin(kernel_caps));
@ -173,6 +176,10 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>&
    if (!file.IsOpen())
        return ResultStatus::Error;

+    ResultStatus result = LoadExeFS();
+    if (result != ResultStatus::Success)
+        return result;
+
    LOG_DEBUG(Loader, "%d sections:", kMaxSections);
    // Iterate through the ExeFs archive until we find a section with the specified name...
    for (unsigned section_number = 0; section_number < kMaxSections; section_number++) {
@ -215,9 +222,9 @@ ResultStatus AppLoader_NCCH::LoadSectionExeFS(const char* name, std::vector<u8>&
    return ResultStatus::ErrorNotUsed;
 }

-ResultStatus AppLoader_NCCH::Load() {
-    if (is_loaded)
-        return ResultStatus::ErrorAlreadyLoaded;
+ResultStatus AppLoader_NCCH::LoadExeFS() {
+    if (is_exefs_loaded)
+        return ResultStatus::Success;

    if (!file.IsOpen())
        return ResultStatus::Error;
@ -282,6 +289,18 @@ ResultStatus AppLoader_NCCH::Load() {
    if (file.ReadBytes(&exefs_header, sizeof(ExeFs_Header)) != sizeof(ExeFs_Header))
        return ResultStatus::Error;

+    is_exefs_loaded = true;
+    return ResultStatus::Success;
+}
+
+ResultStatus AppLoader_NCCH::Load() {
+    if (is_loaded)
+        return ResultStatus::ErrorAlreadyLoaded;
+
+    ResultStatus result = LoadExeFS();
+    if (result != ResultStatus::Success)
+        return result;
+
    is_loaded = true; // Set state to loaded

    return LoadExec(); // Load the executable into memory for booting
--- a/src/core/loader/ncch.h
+++ b/src/core/loader/ncch.h
@ -232,6 +232,13 @@ private:
     */
    ResultStatus LoadExec();

+    /**
+     * Ensure ExeFS is loaded and ready for reading sections
+     * @return ResultStatus result of function
+     */
+    ResultStatus LoadExeFS();
+
+    bool            is_exefs_loaded = false;
    bool            is_compressed = false;

    u32             entry_point = 0;
--- a/src/core/tracer/recorder.cpp
+++ b/src/core/tracer/recorder.cpp
@ -26,17 +26,17 @@ void Recorder::Finish(const std::string& filename) {
    // Calculate file offsets
    auto& initial = header.initial_state_offsets;

-    initial.gpu_registers_size      = initial_state.gpu_registers.size();
-    initial.lcd_registers_size      = initial_state.lcd_registers.size();
-    initial.pica_registers_size     = initial_state.pica_registers.size();
-    initial.default_attributes_size = initial_state.default_attributes.size();
-    initial.vs_program_binary_size  = initial_state.vs_program_binary.size();
-    initial.vs_swizzle_data_size    = initial_state.vs_swizzle_data.size();
-    initial.vs_float_uniforms_size  = initial_state.vs_float_uniforms.size();
-    initial.gs_program_binary_size  = initial_state.gs_program_binary.size();
-    initial.gs_swizzle_data_size    = initial_state.gs_swizzle_data.size();
-    initial.gs_float_uniforms_size  = initial_state.gs_float_uniforms.size();
-    header.stream_size              = stream.size();
+    initial.gpu_registers_size      = static_cast<u32>(initial_state.gpu_registers.size());
+    initial.lcd_registers_size      = static_cast<u32>(initial_state.lcd_registers.size());
+    initial.pica_registers_size     = static_cast<u32>(initial_state.pica_registers.size());
+    initial.default_attributes_size = static_cast<u32>(initial_state.default_attributes.size());
+    initial.vs_program_binary_size  = static_cast<u32>(initial_state.vs_program_binary.size());
+    initial.vs_swizzle_data_size    = static_cast<u32>(initial_state.vs_swizzle_data.size());
+    initial.vs_float_uniforms_size  = static_cast<u32>(initial_state.vs_float_uniforms.size());
+    initial.gs_program_binary_size  = static_cast<u32>(initial_state.gs_program_binary.size());
+    initial.gs_swizzle_data_size    = static_cast<u32>(initial_state.gs_swizzle_data.size());
+    initial.gs_float_uniforms_size  = static_cast<u32>(initial_state.gs_float_uniforms.size());
+    header.stream_size              = static_cast<u32>(stream.size());

    initial.gpu_registers      = sizeof(header);
    initial.lcd_registers      = initial.gpu_registers      + initial.gpu_registers_size * sizeof(u32);
@ -68,7 +68,7 @@ void Recorder::Finish(const std::string& filename) {
            DEBUG_ASSERT(stream_element.extra_data.size() == 0);
            break;
        }
-        header.stream_offset += stream_element.extra_data.size();
+        header.stream_offset += static_cast<u32>(stream_element.extra_data.size());
    }

    try {
--- a/src/video_core/clipper.cpp
+++ b/src/video_core/clipper.cpp
@ -75,8 +75,6 @@ static void InitScreenCoordinates(OutputVertex& vtx)
    viewport.halfsize_y = float24::FromRaw(regs.viewport_size_y);
    viewport.offset_x   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.x));
    viewport.offset_y   = float24::FromFloat32(static_cast<float>(regs.viewport_corner.y));
-    viewport.zscale     = float24::FromRaw(regs.viewport_depth_range);
-    viewport.offset_z   = float24::FromRaw(regs.viewport_depth_far_plane);

    float24 inv_w = float24::FromFloat32(1.f) / vtx.pos.w;
    vtx.color *= inv_w;
@ -89,7 +87,7 @@ static void InitScreenCoordinates(OutputVertex& vtx)

    vtx.screenpos[0] = (vtx.pos.x * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_x + viewport.offset_x;
    vtx.screenpos[1] = (vtx.pos.y * inv_w + float24::FromFloat32(1.0)) * viewport.halfsize_y + viewport.offset_y;
-    vtx.screenpos[2] = viewport.offset_z + vtx.pos.z * inv_w * viewport.zscale;
+    vtx.screenpos[2] = vtx.pos.z * inv_w;
 }

 void ProcessTriangle(const OutputVertex &v0, const OutputVertex &v1, const OutputVertex &v2) {
--- a/src/video_core/command_processor.cpp
+++ b/src/video_core/command_processor.cpp
@ -144,13 +144,12 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                        immediate_attribute_id = 0;

                        Shader::UnitState<false> shader_unit;
-                        Shader::Setup();
-
-                        if (g_debug_context)
-                            g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, static_cast<void*>(&immediate_input));
+                        g_state.vs.Setup();

                        // Send to vertex shader
-                        Shader::OutputVertex output = Shader::Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);
+                        if (g_debug_context)
+                            g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, static_cast<void*>(&immediate_input));
+                        Shader::OutputVertex output = g_state.vs.Run(shader_unit, immediate_input, regs.vs.num_input_attributes+1);

                        // Send to renderer
                        using Pica::Shader::OutputVertex;
@ -238,7 +237,7 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
            vertex_cache_ids.fill(-1);

            Shader::UnitState<false> shader_unit;
-            Shader::Setup();
+            g_state.vs.Setup();

            for (unsigned int index = 0; index < regs.num_vertices; ++index)
            {
@ -272,11 +271,10 @@ static void WritePicaReg(u32 id, u32 value, u32 mask) {
                    Shader::InputVertex input;
                    loader.LoadVertex(base_address, index, vertex, input, memory_accesses);

-                    if (g_debug_context)
-                        g_debug_context->OnEvent(DebugContext::Event::VertexLoaded, (void*)&input);
-
                    // Send to vertex shader
-                    output = Shader::Run(shader_unit, input, loader.GetNumTotalAttributes());
+                    if (g_debug_context)
+                        g_debug_context->OnEvent(DebugContext::Event::VertexShaderInvocation, (void*)&input);
+                    output = g_state.vs.Run(shader_unit, input, loader.GetNumTotalAttributes());

                    if (is_indexed) {
                        vertex_cache[vertex_cache_pos] = output;
--- a/src/video_core/debug_utils/debug_utils.cpp
+++ b/src/video_core/debug_utils/debug_utils.cpp
@ -208,11 +208,12 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c

    // TODO: Reduce the amount of binary code written to relevant portions
    dvlp.binary_offset = write_offset - dvlp_offset;
-    dvlp.binary_size_words = setup.program_code.size();
-    QueueForWriting(reinterpret_cast<const u8*>(setup.program_code.data()), setup.program_code.size() * sizeof(u32));
+    dvlp.binary_size_words = static_cast<uint32_t>(setup.program_code.size());
+    QueueForWriting(reinterpret_cast<const u8*>(setup.program_code.data()),
+                    static_cast<u32>(setup.program_code.size()) * sizeof(u32));

    dvlp.swizzle_info_offset = write_offset - dvlp_offset;
-    dvlp.swizzle_info_num_entries = setup.swizzle_data.size();
+    dvlp.swizzle_info_num_entries = static_cast<uint32_t>(setup.swizzle_data.size());
    u32 dummy = 0;
    for (unsigned int i = 0; i < setup.swizzle_data.size(); ++i) {
        QueueForWriting(reinterpret_cast<const u8*>(&setup.swizzle_data[i]), sizeof(setup.swizzle_data[i]));
@ -264,7 +265,7 @@ void DumpShader(const std::string& filename, const Regs::ShaderConfig& config, c
            constant_table.emplace_back(constant);
    }
    dvle.constant_table_offset = write_offset - dvlb.dvle_offset;
-    dvle.constant_table_size = constant_table.size();
+    dvle.constant_table_size = static_cast<uint32_t>(constant_table.size());
    for (const auto& constant : constant_table) {
        QueueForWriting(reinterpret_cast<const u8*>(&constant), sizeof(constant));
    }
--- a/src/video_core/debug_utils/debug_utils.h
+++ b/src/video_core/debug_utils/debug_utils.h
@ -40,7 +40,7 @@ public:
        PicaCommandProcessed,
        IncomingPrimitiveBatch,
        FinishedPrimitiveBatch,
-        VertexLoaded,
+        VertexShaderInvocation,
        IncomingDisplayTransfer,
        GSPCommandProcessed,
        BufferSwapped,
--- a/src/video_core/pica.cpp
+++ b/src/video_core/pica.cpp
@ -500,7 +500,7 @@ void Init() {
 }

 void Shutdown() {
-    Shader::Shutdown();
+    Shader::ClearCache();
 }

 template <typename T>
--- a/src/video_core/pica.h
+++ b/src/video_core/pica.h
@ -70,7 +70,7 @@ struct Regs {
    INSERT_PADDING_WORDS(0x9);

    BitField<0, 24, u32> viewport_depth_range; // float24
-    BitField<0, 24, u32> viewport_depth_far_plane; // float24
+    BitField<0, 24, u32> viewport_depth_near_plane; // float24

    BitField<0, 3, u32> vs_output_total;

@ -122,9 +122,31 @@ struct Regs {
        BitField<16, 10, s32> y;
    } viewport_corner;

-    INSERT_PADDING_WORDS(0x17);
+    INSERT_PADDING_WORDS(0x1);
+
+    //TODO: early depth
+    INSERT_PADDING_WORDS(0x1);
+
+    INSERT_PADDING_WORDS(0x2);
+
+    enum DepthBuffering : u32 { // Values of the 1-bit depthmap_enable field below
+        WBuffering  = 0, // field cleared
+        ZBuffering  = 1, // field set
+    };
+    BitField< 0, 1, DepthBuffering> depthmap_enable; ///< Register 0x6D (see ASSERT_REG_POSITION); despite the name, 0 selects W-buffering and 1 Z-buffering
+
+    INSERT_PADDING_WORDS(0x12);

    struct TextureConfig {
+        enum TextureType : u32 { // Values run 0-5, so a field holding one needs at least 3 bits
+            Texture2D    = 0,
+            TextureCube  = 1,
+            Shadow2D     = 2,
+            Projection2D = 3,
+            ShadowCube   = 4,
+            Disabled     = 5,
+        };
+
        enum WrapMode : u32 {
            ClampToEdge    = 0,
            ClampToBorder  = 1,
@ -155,6 +177,7 @@ struct Regs {
            BitField< 2, 1, TextureFilter> min_filter;
            BitField< 8, 2, WrapMode> wrap_t;
            BitField<12, 2, WrapMode> wrap_s;
+            BitField<28, 3, TextureType> type; ///< 3 bits wide so ShadowCube (4) and Disabled (5) are representable. @note Only valid for texture 0 according to 3DBrew.
        };

        INSERT_PADDING_WORDS(0x1);
@ -1279,10 +1302,11 @@ ASSERT_REG_POSITION(cull_mode, 0x40);
 ASSERT_REG_POSITION(viewport_size_x, 0x41);
 ASSERT_REG_POSITION(viewport_size_y, 0x43);
 ASSERT_REG_POSITION(viewport_depth_range, 0x4d);
-ASSERT_REG_POSITION(viewport_depth_far_plane, 0x4e);
+ASSERT_REG_POSITION(viewport_depth_near_plane, 0x4e);
 ASSERT_REG_POSITION(vs_output_attributes[0], 0x50);
 ASSERT_REG_POSITION(vs_output_attributes[1], 0x51);
 ASSERT_REG_POSITION(viewport_corner, 0x68);
+ASSERT_REG_POSITION(depthmap_enable, 0x6D);
 ASSERT_REG_POSITION(texture0_enable, 0x80);
 ASSERT_REG_POSITION(texture0, 0x81);
 ASSERT_REG_POSITION(texture0_format, 0x8e);
--- a/src/video_core/pica_state.h
+++ b/src/video_core/pica_state.h
@ -56,7 +56,7 @@ struct State {
        // Used to buffer partial vertices for immediate-mode rendering.
        Shader::InputVertex input_vertex;
        // Index of the next attribute to be loaded into `input_vertex`.
-        int current_attribute = 0;
+        u32 current_attribute = 0;
    } immediate;

    // This is constructed with a dummy triangle topology
--- a/src/video_core/rasterizer.cpp
+++ b/src/video_core/rasterizer.cpp
@ -442,8 +442,33 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,

                DEBUG_ASSERT(0 != texture.config.address);

-                int s = (int)(uv[i].u() * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
-                int t = (int)(uv[i].v() * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+                float24 u = uv[i].u();
+                float24 v = uv[i].v();
+
+                // Only unit 0 respects the texturing type (according to 3DBrew)
+                // TODO: Refactor so cubemaps and shadowmaps can be handled
+                if (i == 0) {
+                    switch(texture.config.type) {
+                    case Regs::TextureConfig::Texture2D:
+                        break;
+                    case Regs::TextureConfig::Projection2D: {
+                        auto tc0_w = GetInterpolatedAttribute(v0.tc0_w, v1.tc0_w, v2.tc0_w);
+                        u /= tc0_w;
+                        v /= tc0_w;
+                        break;
+                    }
+                    default:
+                        // TODO: Change to LOG_ERROR when more types are handled.
+                        LOG_DEBUG(HW_GPU, "Unhandled texture type %x", (int)texture.config.type);
+                        UNIMPLEMENTED();
+                        break;
+                    }
+                }
+
+                int s = (int)(u * float24::FromFloat32(static_cast<float>(texture.config.width))).ToFloat32();
+                int t = (int)(v * float24::FromFloat32(static_cast<float>(texture.config.height))).ToFloat32();
+
+
                static auto GetWrappedTexCoord = [](Regs::TextureConfig::WrapMode mode, int val, unsigned size) {
                    switch (mode) {
                        case Regs::TextureConfig::ClampToEdge:
@ -862,10 +887,30 @@ static void ProcessTriangleInternal(const Shader::OutputVertex& v0,
                }
            }

+            // interpolated_z = z / w
+            float interpolated_z_over_w = (v0.screenpos[2].ToFloat32() * w0 +
+                                           v1.screenpos[2].ToFloat32() * w1 +
+                                           v2.screenpos[2].ToFloat32() * w2) / wsum;
+
+            // Not fully accurate. About 3 bits in precision are missing.
+            // Z-Buffer (z / w * scale + offset)
+            float depth_scale = float24::FromRaw(regs.viewport_depth_range).ToFloat32();
+            float depth_offset = float24::FromRaw(regs.viewport_depth_near_plane).ToFloat32();
+            float depth = interpolated_z_over_w * depth_scale + depth_offset;
+
+            // Potentially switch to W-Buffer
+            if (regs.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+
+                // W-Buffer (z * scale + w * offset = (z / w * scale + offset) * w)
+                depth *= interpolated_w_inverse.ToFloat32() * wsum;
+            }
+
+            // Clamp the result
+            depth = MathUtil::Clamp(depth, 0.0f, 1.0f);
+
+            // Convert float to integer
            unsigned num_bits = Regs::DepthBitsPerPixel(regs.framebuffer.depth_format);
-            u32 z = (u32)((v0.screenpos[2].ToFloat32() * w0 +
-                           v1.screenpos[2].ToFloat32() * w1 +
-                           v2.screenpos[2].ToFloat32() * w2) * ((1 << num_bits) - 1) / wsum);
+            u32 z = (u32)(depth * ((1 << num_bits) - 1));

            if (output_merger.depth_test_enable) {
                u32 ref_z = GetDepth(x >> 4, y >> 4);
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@ -76,6 +76,9 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD1);
    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD2);

+    glVertexAttribPointer(GLShader::ATTRIBUTE_TEXCOORD0_W, 1, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, tex_coord0_w));
+    glEnableVertexAttribArray(GLShader::ATTRIBUTE_TEXCOORD0_W);
+
    glVertexAttribPointer(GLShader::ATTRIBUTE_NORMQUAT, 4, GL_FLOAT, GL_FALSE, sizeof(HardwareVertex), (GLvoid*)offsetof(HardwareVertex, normquat));
    glEnableVertexAttribArray(GLShader::ATTRIBUTE_NORMQUAT);

@ -93,7 +96,7 @@ RasterizerOpenGL::RasterizerOpenGL() : shader_dirty(true) {
    state.Apply();

    for (size_t i = 0; i < lighting_luts.size(); ++i) {
-        glActiveTexture(GL_TEXTURE3 + i);
+        glActiveTexture(static_cast<GLenum>(GL_TEXTURE3 + i));
        glTexImage1D(GL_TEXTURE_1D, 0, GL_RGBA32F, 256, 0, GL_RGBA, GL_FLOAT, nullptr);
        glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MIN_FILTER, GL_LINEAR);
        glTexParameteri(GL_TEXTURE_1D, GL_TEXTURE_MAG_FILTER, GL_LINEAR);
@ -256,10 +259,15 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {

    // Depth modifiers
    case PICA_REG_INDEX(viewport_depth_range):
-    case PICA_REG_INDEX(viewport_depth_far_plane):
+    case PICA_REG_INDEX(viewport_depth_near_plane):
        SyncDepthModifiers();
        break;

+    // Depth buffering
+    case PICA_REG_INDEX(depthmap_enable):
+        shader_dirty = true;
+        break;
+
    // Blending
    case PICA_REG_INDEX(output_merger.alphablend_enable):
        SyncBlendEnabled();
@ -314,6 +322,11 @@ void RasterizerOpenGL::NotifyPicaRegisterChanged(u32 id) {
        SyncLogicOp();
        break;

+    // Texture 0 type
+    case PICA_REG_INDEX(texture0.type):
+        shader_dirty = true;
+        break;
+
    // TEV stages
    case PICA_REG_INDEX(tev_stage0.color_source1):
    case PICA_REG_INDEX(tev_stage0.color_modifier1):
@ -910,10 +923,10 @@ void RasterizerOpenGL::SyncCullMode() {
 }

 void RasterizerOpenGL::SyncDepthModifiers() {
-    float depth_scale = -Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
-    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_far_plane).ToFloat32() / 2.0f;
+    float depth_scale = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_range).ToFloat32();
+    float depth_offset = Pica::float24::FromRaw(Pica::g_state.regs.viewport_depth_near_plane).ToFloat32();

-    // TODO: Implement scale modifier
+    uniform_block_data.data.depth_scale = depth_scale;
    uniform_block_data.data.depth_offset = depth_offset;
    uniform_block_data.dirty = true;
 }
--- a/src/video_core/renderer_opengl/gl_rasterizer.h
+++ b/src/video_core/renderer_opengl/gl_rasterizer.h
@ -39,139 +39,185 @@ struct ScreenInfo;
 * directly accessing Pica registers. This should reduce the risk of bugs in shader generation where
 * Pica state is not being captured in the shader cache key, thereby resulting in (what should be)
 * two separate shaders sharing the same key.
+ *
+ * We use a union because the "implicitly-defined copy/move constructor for a union X copies the object
+ * representation of X" and the "implicitly-defined copy assignment operator for a union X copies the object
+ * representation (3.9) of X", i.e. copies are bytewise rather than memberwise.
+ * This matters because the padding bytes are included in the hash and in comparisons between objects.
 */
-struct PicaShaderConfig {
+union PicaShaderConfig {
+
    /// Construct a PicaShaderConfig with the current Pica register configuration.
    static PicaShaderConfig CurrentConfig() {
        PicaShaderConfig res;
+
+        auto& state = res.state;
+        std::memset(&state, 0, sizeof(PicaShaderConfig::State)); // zero every byte of State (padding included) before filling it in; State is compared with memcmp
+
        const auto& regs = Pica::g_state.regs;

-        res.alpha_test_func = regs.output_merger.alpha_test.enable ?
+        state.depthmap_enable = regs.depthmap_enable;
+
+        state.alpha_test_func = regs.output_merger.alpha_test.enable ?
            regs.output_merger.alpha_test.func.Value() : Pica::Regs::CompareFunc::Always;

-        // Copy tev stages
+        state.texture0_type = regs.texture0.type;
+
+        // Copy the relevant TEV stage fields.
+        // const_color is deliberately not synced here: it varies too much, so it is
+        // passed to the shader as a uniform instead.
        const auto& tev_stages = regs.GetTevStages();
-        DEBUG_ASSERT(res.tev_stages.size() == tev_stages.size());
+        DEBUG_ASSERT(state.tev_stages.size() == tev_stages.size());
        for (size_t i = 0; i < tev_stages.size(); i++) {
            const auto& tev_stage = tev_stages[i];
-            res.tev_stages[i].sources_raw = tev_stage.sources_raw;
-            res.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
-            res.tev_stages[i].ops_raw = tev_stage.ops_raw;
-            res.tev_stages[i].const_color = tev_stage.const_color;
-            res.tev_stages[i].scales_raw = tev_stage.scales_raw;
+            state.tev_stages[i].sources_raw = tev_stage.sources_raw;
+            state.tev_stages[i].modifiers_raw = tev_stage.modifiers_raw;
+            state.tev_stages[i].ops_raw = tev_stage.ops_raw;
+            state.tev_stages[i].scales_raw = tev_stage.scales_raw;
        }

-        res.combiner_buffer_input =
+        state.combiner_buffer_input =
            regs.tev_combiner_buffer_input.update_mask_rgb.Value() |
            regs.tev_combiner_buffer_input.update_mask_a.Value() << 4;

        // Fragment lighting

-        res.lighting.enable = !regs.lighting.disable;
-        res.lighting.src_num = regs.lighting.num_lights + 1;
+        state.lighting.enable = !regs.lighting.disable;
+        state.lighting.src_num = regs.lighting.num_lights + 1;

-        for (unsigned light_index = 0; light_index < res.lighting.src_num; ++light_index) {
+        for (unsigned light_index = 0; light_index < state.lighting.src_num; ++light_index) {
            unsigned num = regs.lighting.light_enable.GetNum(light_index);
            const auto& light = regs.lighting.light[num];
-            res.lighting.light[light_index].num = num;
-            res.lighting.light[light_index].directional = light.directional != 0;
-            res.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
-            res.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
-            res.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
-            res.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
+            state.lighting.light[light_index].num = num;
+            state.lighting.light[light_index].directional = light.directional != 0;
+            state.lighting.light[light_index].two_sided_diffuse = light.two_sided_diffuse != 0;
+            state.lighting.light[light_index].dist_atten_enable = !regs.lighting.IsDistAttenDisabled(num);
+            state.lighting.light[light_index].dist_atten_bias = Pica::float20::FromRaw(light.dist_atten_bias).ToFloat32();
+            state.lighting.light[light_index].dist_atten_scale = Pica::float20::FromRaw(light.dist_atten_scale).ToFloat32();
        }

-        res.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
-        res.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
-        res.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
-        res.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);
+        state.lighting.lut_d0.enable = regs.lighting.disable_lut_d0 == 0;
+        state.lighting.lut_d0.abs_input = regs.lighting.abs_lut_input.disable_d0 == 0;
+        state.lighting.lut_d0.type = regs.lighting.lut_input.d0.Value();
+        state.lighting.lut_d0.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d0);

-        res.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
-        res.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
-        res.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
-        res.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);
+        state.lighting.lut_d1.enable = regs.lighting.disable_lut_d1 == 0;
+        state.lighting.lut_d1.abs_input = regs.lighting.abs_lut_input.disable_d1 == 0;
+        state.lighting.lut_d1.type = regs.lighting.lut_input.d1.Value();
+        state.lighting.lut_d1.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.d1);

-        res.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
-        res.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
-        res.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
-        res.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);
+        state.lighting.lut_fr.enable = regs.lighting.disable_lut_fr == 0;
+        state.lighting.lut_fr.abs_input = regs.lighting.abs_lut_input.disable_fr == 0;
+        state.lighting.lut_fr.type = regs.lighting.lut_input.fr.Value();
+        state.lighting.lut_fr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.fr);

-        res.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
-        res.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
-        res.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
-        res.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);
+        state.lighting.lut_rr.enable = regs.lighting.disable_lut_rr == 0;
+        state.lighting.lut_rr.abs_input = regs.lighting.abs_lut_input.disable_rr == 0;
+        state.lighting.lut_rr.type = regs.lighting.lut_input.rr.Value();
+        state.lighting.lut_rr.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rr);

-        res.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
-        res.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
-        res.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
-        res.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);
+        state.lighting.lut_rg.enable = regs.lighting.disable_lut_rg == 0;
+        state.lighting.lut_rg.abs_input = regs.lighting.abs_lut_input.disable_rg == 0;
+        state.lighting.lut_rg.type = regs.lighting.lut_input.rg.Value();
+        state.lighting.lut_rg.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rg);

-        res.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
-        res.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
-        res.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
-        res.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);
+        state.lighting.lut_rb.enable = regs.lighting.disable_lut_rb == 0;
+        state.lighting.lut_rb.abs_input = regs.lighting.abs_lut_input.disable_rb == 0;
+        state.lighting.lut_rb.type = regs.lighting.lut_input.rb.Value();
+        state.lighting.lut_rb.scale = regs.lighting.lut_scale.GetScale(regs.lighting.lut_scale.rb);

-        res.lighting.config = regs.lighting.config;
-        res.lighting.fresnel_selector = regs.lighting.fresnel_selector;
-        res.lighting.bump_mode = regs.lighting.bump_mode;
-        res.lighting.bump_selector = regs.lighting.bump_selector;
-        res.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
-        res.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;
+        state.lighting.config = regs.lighting.config;
+        state.lighting.fresnel_selector = regs.lighting.fresnel_selector;
+        state.lighting.bump_mode = regs.lighting.bump_mode;
+        state.lighting.bump_selector = regs.lighting.bump_selector;
+        state.lighting.bump_renorm = regs.lighting.disable_bump_renorm == 0;
+        state.lighting.clamp_highlights = regs.lighting.clamp_highlights != 0;

        return res;
    }

    bool TevStageUpdatesCombinerBufferColor(unsigned stage_index) const {
-        return (stage_index < 4) && (combiner_buffer_input & (1 << stage_index));
+        return (stage_index < 4) && (state.combiner_buffer_input & (1 << stage_index));
    }

    bool TevStageUpdatesCombinerBufferAlpha(unsigned stage_index) const {
-        return (stage_index < 4) && ((combiner_buffer_input >> 4) & (1 << stage_index));
+        return (stage_index < 4) && ((state.combiner_buffer_input >> 4) & (1 << stage_index));
    }

    bool operator ==(const PicaShaderConfig& o) const {
-        return std::memcmp(this, &o, sizeof(PicaShaderConfig)) == 0;
+        return std::memcmp(&state, &o.state, sizeof(PicaShaderConfig::State)) == 0; // bytewise comparison of State, padding bytes included
    };

-    Pica::Regs::CompareFunc alpha_test_func = Pica::Regs::CompareFunc::Never;
-    std::array<Pica::Regs::TevStageConfig, 6> tev_stages = {};
-    u8 combiner_buffer_input = 0;
+    // NOTE: MSVC15 (Update 2) doesn't consider types with `delete`'d constructors and operators
+    //       to be trivially copyable (TC). This makes BitField non-TC when used in a union or
+    //       struct, so we have to resort to this ugly hack of storing the raw u32 fields.
+    //       Once that bug is fixed we can use Pica::Regs::TevStageConfig here.
+    //       const_color is not included because we don't sync it; see the comment in CurrentConfig().
+    struct TevStageConfigRaw {
+        u32 sources_raw;
+        u32 modifiers_raw;
+        u32 ops_raw;
+        u32 scales_raw;
+        explicit operator Pica::Regs::TevStageConfig() const noexcept { // builds a TevStageConfig from the stored raw fields
+            Pica::Regs::TevStageConfig stage;
+            stage.sources_raw = sources_raw;
+            stage.modifiers_raw = modifiers_raw;
+            stage.ops_raw = ops_raw;
+            stage.const_color = 0; // not stored in this struct, so a fixed 0 is used
+            stage.scales_raw = scales_raw;
+            return stage;
+        }
+    };

-    struct {
-        struct {
-            unsigned num = 0;
-            bool directional = false;
-            bool two_sided_diffuse = false;
-            bool dist_atten_enable = false;
-            GLfloat dist_atten_scale = 0.0f;
-            GLfloat dist_atten_bias = 0.0f;
-        } light[8];
+    struct State {

-        bool enable = false;
-        unsigned src_num = 0;
-        Pica::Regs::LightingBumpMode bump_mode = Pica::Regs::LightingBumpMode::None;
-        unsigned bump_selector = 0;
-        bool bump_renorm = false;
-        bool clamp_highlights = false;
+        Pica::Regs::CompareFunc alpha_test_func;
+        Pica::Regs::TextureConfig::TextureType texture0_type;
+        std::array<TevStageConfigRaw, 6> tev_stages;
+        u8 combiner_buffer_input; // update_mask_rgb in the low bits, update_mask_a shifted left by 4

-        Pica::Regs::LightingConfig config = Pica::Regs::LightingConfig::Config0;
-        Pica::Regs::LightingFresnelSelector fresnel_selector = Pica::Regs::LightingFresnelSelector::None;
+        Pica::Regs::DepthBuffering depthmap_enable;

        struct {
-            bool enable = false;
-            bool abs_input = false;
-            Pica::Regs::LightingLutInput type = Pica::Regs::LightingLutInput::NH;
-            float scale = 1.0f;
-        } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
-    } lighting;
+            struct {
+                unsigned num; // value returned by regs.lighting.light_enable.GetNum() for this slot
+                bool directional;
+                bool two_sided_diffuse;
+                bool dist_atten_enable;
+                GLfloat dist_atten_scale;
+                GLfloat dist_atten_bias;
+            } light[8];
+
+            bool enable;
+            unsigned src_num; // regs.lighting.num_lights + 1; number of light[] entries filled by CurrentConfig()
+            Pica::Regs::LightingBumpMode bump_mode;
+            unsigned bump_selector;
+            bool bump_renorm;
+            bool clamp_highlights;
+
+            Pica::Regs::LightingConfig config;
+            Pica::Regs::LightingFresnelSelector fresnel_selector;
+
+            struct {
+                bool enable;
+                bool abs_input;
+                Pica::Regs::LightingLutInput type;
+                float scale;
+            } lut_d0, lut_d1, lut_fr, lut_rr, lut_rg, lut_rb;
+        } lighting;
+
+    } state;
 };
+#if (__GNUC__ >= 5) || defined(__clang__) || defined(_MSC_VER)
+static_assert(std::is_trivially_copyable<PicaShaderConfig::State>::value, "PicaShaderConfig::State must be trivially copyable");
+#endif

 namespace std {

 template <>
 struct hash<PicaShaderConfig> {
    size_t operator()(const PicaShaderConfig& k) const {
-        return Common::ComputeHash64(&k, sizeof(PicaShaderConfig));
+        return Common::ComputeHash64(&k.state, sizeof(PicaShaderConfig::State)); // hashes the same State bytes that operator== compares
    }
 };

@ -238,6 +284,7 @@ private:
            tex_coord1[1] = v.tc1.y.ToFloat32();
            tex_coord2[0] = v.tc2.x.ToFloat32();
            tex_coord2[1] = v.tc2.y.ToFloat32();
+            tex_coord0_w = v.tc0_w.ToFloat32();
            normquat[0] = v.quat.x.ToFloat32();
            normquat[1] = v.quat.y.ToFloat32();
            normquat[2] = v.quat.z.ToFloat32();
@ -258,6 +305,7 @@ private:
        GLfloat tex_coord0[2];
        GLfloat tex_coord1[2];
        GLfloat tex_coord2[2];
+        GLfloat tex_coord0_w;
        GLfloat normquat[4];
        GLfloat view[3];
    };
@ -276,6 +324,7 @@ private:
        GLvec4 const_color[6];
        GLvec4 tev_combiner_buffer_color;
        GLint alphatest_ref;
+        GLfloat depth_scale;
        GLfloat depth_offset;
        alignas(16) GLvec3 lighting_global_ambient;
        LightSrc light_src[8];
--- a/src/video_core/renderer_opengl/gl_shader_gen.cpp
+++ b/src/video_core/renderer_opengl/gl_shader_gen.cpp
@ -32,8 +32,9 @@ static bool IsPassThroughTevStage(const TevStageConfig& stage) {
 }

 /// Writes the specified TEV stage source component(s)
-static void AppendSource(std::string& out, TevStageConfig::Source source,
+static void AppendSource(std::string& out, const PicaShaderConfig& config, TevStageConfig::Source source,
        const std::string& index_name) {
+    const auto& state = config.state;
    using Source = TevStageConfig::Source;
    switch (source) {
    case Source::PrimaryColor:
@ -46,7 +47,20 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
        out += "secondary_fragment_color";
        break;
    case Source::Texture0:
-        out += "texture(tex[0], texcoord[0])";
+        // Only unit 0 respects the texturing type (according to 3DBrew)
+        switch(state.texture0_type) {
+        case Pica::Regs::TextureConfig::Texture2D:
+            out += "texture(tex[0], texcoord[0])";
+            break;
+        case Pica::Regs::TextureConfig::Projection2D:
+            out += "textureProj(tex[0], vec3(texcoord[0], texcoord0_w))";
+            break;
+        default:
+            out += "texture(tex[0], texcoord[0])";
+            LOG_CRITICAL(HW_GPU, "Unhandled texture type %x", static_cast<int>(state.texture0_type));
+            UNIMPLEMENTED();
+            break;
+        }
        break;
    case Source::Texture1:
        out += "texture(tex[1], texcoord[1])";
@ -71,53 +85,53 @@ static void AppendSource(std::string& out, TevStageConfig::Source source,
 }

 /// Writes the color components to use for the specified TEV stage color modifier
-static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier modifier,
+static void AppendColorModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::ColorModifier modifier,
        TevStageConfig::Source source, const std::string& index_name) {
    using ColorModifier = TevStageConfig::ColorModifier;
    switch (modifier) {
    case ColorModifier::SourceColor:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".rgb";
        break;
    case ColorModifier::OneMinusSourceColor:
        out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".rgb";
        break;
    case ColorModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".aaa";
        break;
    case ColorModifier::OneMinusSourceAlpha:
        out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".aaa";
        break;
    case ColorModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".rrr";
        break;
    case ColorModifier::OneMinusSourceRed:
        out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".rrr";
        break;
    case ColorModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".ggg";
        break;
    case ColorModifier::OneMinusSourceGreen:
        out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".ggg";
        break;
    case ColorModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".bbb";
        break;
    case ColorModifier::OneMinusSourceBlue:
        out += "vec3(1.0) - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".bbb";
        break;
    default:
@ -128,44 +142,44 @@ static void AppendColorModifier(std::string& out, TevStageConfig::ColorModifier
 }

 /// Writes the alpha component to use for the specified TEV stage alpha modifier
-static void AppendAlphaModifier(std::string& out, TevStageConfig::AlphaModifier modifier,
+static void AppendAlphaModifier(std::string& out, const PicaShaderConfig& config, TevStageConfig::AlphaModifier modifier,
        TevStageConfig::Source source, const std::string& index_name) {
    using AlphaModifier = TevStageConfig::AlphaModifier;
    switch (modifier) {
    case AlphaModifier::SourceAlpha:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".a";
        break;
    case AlphaModifier::OneMinusSourceAlpha:
        out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".a";
        break;
    case AlphaModifier::SourceRed:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".r";
        break;
    case AlphaModifier::OneMinusSourceRed:
        out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".r";
        break;
    case AlphaModifier::SourceGreen:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".g";
        break;
    case AlphaModifier::OneMinusSourceGreen:
        out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".g";
        break;
    case AlphaModifier::SourceBlue:
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".b";
        break;
    case AlphaModifier::OneMinusSourceBlue:
        out += "1.0 - ";
-        AppendSource(out, source, index_name);
+        AppendSource(out, config, source, index_name);
        out += ".b";
        break;
    default:
@ -287,16 +301,16 @@ static void AppendAlphaTestCondition(std::string& out, Regs::CompareFunc func) {

 /// Writes the code to emulate the specified TEV stage
 static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsigned index) {
-    auto& stage = config.tev_stages[index];
+    const auto stage = static_cast<const Pica::Regs::TevStageConfig>(config.state.tev_stages[index]);
    if (!IsPassThroughTevStage(stage)) {
        std::string index_name = std::to_string(index);

        out += "vec3 color_results_" + index_name + "[3] = vec3[3](";
-        AppendColorModifier(out, stage.color_modifier1, stage.color_source1, index_name);
+        AppendColorModifier(out, config, stage.color_modifier1, stage.color_source1, index_name);
        out += ", ";
-        AppendColorModifier(out, stage.color_modifier2, stage.color_source2, index_name);
+        AppendColorModifier(out, config, stage.color_modifier2, stage.color_source2, index_name);
        out += ", ";
-        AppendColorModifier(out, stage.color_modifier3, stage.color_source3, index_name);
+        AppendColorModifier(out, config, stage.color_modifier3, stage.color_source3, index_name);
        out += ");\n";

        out += "vec3 color_output_" + index_name + " = ";
@ -304,11 +318,11 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi
        out += ";\n";

        out += "float alpha_results_" + index_name + "[3] = float[3](";
-        AppendAlphaModifier(out, stage.alpha_modifier1, stage.alpha_source1, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier1, stage.alpha_source1, index_name);
        out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier2, stage.alpha_source2, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier2, stage.alpha_source2, index_name);
        out += ", ";
-        AppendAlphaModifier(out, stage.alpha_modifier3, stage.alpha_source3, index_name);
+        AppendAlphaModifier(out, config, stage.alpha_modifier3, stage.alpha_source3, index_name);
        out += ");\n";

        out += "float alpha_output_" + index_name + " = ";
@ -331,6 +345,8 @@ static void WriteTevStage(std::string& out, const PicaShaderConfig& config, unsi

 /// Writes the code to emulate fragment lighting
 static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
+    const auto& lighting = config.state.lighting;
+
    // Define lighting globals
    out += "vec4 diffuse_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
           "vec4 specular_sum = vec4(0.0, 0.0, 0.0, 1.0);\n"
@ -338,17 +354,17 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
           "vec3 refl_value = vec3(0.0);\n";

    // Compute fragment normals
-    if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
+    if (lighting.bump_mode == Pica::Regs::LightingBumpMode::NormalMap) {
        // Bump mapping is enabled using a normal map, read perturbation vector from the selected texture
-        std::string bump_selector = std::to_string(config.lighting.bump_selector);
+        std::string bump_selector = std::to_string(lighting.bump_selector);
        out += "vec3 surface_normal = 2.0 * texture(tex[" + bump_selector + "], texcoord[" + bump_selector + "]).rgb - 1.0;\n";

        // Recompute Z-component of perturbation if 'renorm' is enabled, this provides a higher precision result
-        if (config.lighting.bump_renorm) {
+        if (lighting.bump_renorm) {
            std::string val = "(1.0 - (surface_normal.x*surface_normal.x + surface_normal.y*surface_normal.y))";
            out += "surface_normal.z = sqrt(max(" + val + ", 0.0));\n";
        }
-    } else if (config.lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
+    } else if (lighting.bump_mode == Pica::Regs::LightingBumpMode::TangentMap) {
        // Bump mapping is enabled using a tangent map
        LOG_CRITICAL(HW_GPU, "unimplemented bump mapping mode (tangent mapping)");
        UNIMPLEMENTED();
@ -361,7 +377,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
    out += "vec3 normal = normalize(quaternion_rotate(normquat, surface_normal));\n";

    // Gets the index into the specified lookup table for specular lighting
-    auto GetLutIndex = [config](unsigned light_num, Regs::LightingLutInput input, bool abs) {
+    auto GetLutIndex = [&lighting](unsigned light_num, Regs::LightingLutInput input, bool abs) {
        const std::string half_angle = "normalize(normalize(view) + light_vector)";
        std::string index;
        switch (input) {
@ -389,7 +405,7 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {

        if (abs) {
            // LUT index is in the range of (0.0, 1.0)
-            index = config.lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
+            index = lighting.light[light_num].two_sided_diffuse ? "abs(" + index + ")" : "max(" + index + ", 0.f)";
            return "(FLOAT_255 * clamp(" + index + ", 0.0, 1.0))";
        } else {
            // LUT index is in the range of (-1.0, 1.0)
@ -407,8 +423,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
    };

    // Write the code to emulate each enabled light
-    for (unsigned light_index = 0; light_index < config.lighting.src_num; ++light_index) {
-        const auto& light_config = config.lighting.light[light_index];
+    for (unsigned light_index = 0; light_index < lighting.src_num; ++light_index) {
+        const auto& light_config = lighting.light[light_index];
        std::string light_src = "light_src[" + std::to_string(light_config.num) + "]";

        // Compute light vector (directional or positional)
@ -432,39 +448,39 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
        }

        // If enabled, clamp specular component if lighting result is negative
-        std::string clamp_highlights = config.lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";
+        std::string clamp_highlights = lighting.clamp_highlights ? "(dot(light_vector, normal) <= 0.0 ? 0.0 : 1.0)" : "1.0";

        // Specular 0 component
        std::string d0_lut_value = "1.0";
-        if (config.lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
+        if (lighting.lut_d0.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution0)) {
            // Lookup specular "distribution 0" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d0.type, config.lighting.lut_d0.abs_input);
-            d0_lut_value = "(" + std::to_string(config.lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d0.type, lighting.lut_d0.abs_input);
+            d0_lut_value = "(" + std::to_string(lighting.lut_d0.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution0, index) + ")";
        }
        std::string specular_0 = "(" + d0_lut_value + " * " + light_src + ".specular_0)";

        // If enabled, lookup ReflectRed value, otherwise, 1.0 is used
-        if (config.lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rr.type, config.lighting.lut_rr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
+        if (lighting.lut_rr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectRed)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rr.type, lighting.lut_rr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rr.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectRed, index) + ")";
            out += "refl_value.r = " + value + ";\n";
        } else {
            out += "refl_value.r = 1.0;\n";
        }

        // If enabled, lookup ReflectGreen value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rg.type, config.lighting.lut_rg.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
+        if (lighting.lut_rg.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectGreen)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rg.type, lighting.lut_rg.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rg.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectGreen, index) + ")";
            out += "refl_value.g = " + value + ";\n";
        } else {
            out += "refl_value.g = refl_value.r;\n";
        }

        // If enabled, lookup ReflectBlue value, otherwise, ReflectRed value is used
-        if (config.lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_rb.type, config.lighting.lut_rb.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
+        if (lighting.lut_rb.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::ReflectBlue)) {
+            std::string index = GetLutIndex(light_config.num, lighting.lut_rb.type, lighting.lut_rb.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_rb.scale) + " * " + GetLutValue(Regs::LightingSampler::ReflectBlue, index) + ")";
            out += "refl_value.b = " + value + ";\n";
        } else {
            out += "refl_value.b = refl_value.r;\n";
@ -472,27 +488,27 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {

        // Specular 1 component
        std::string d1_lut_value = "1.0";
-        if (config.lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
+        if (lighting.lut_d1.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Distribution1)) {
            // Lookup specular "distribution 1" LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_d1.type, config.lighting.lut_d1.abs_input);
-            d1_lut_value = "(" + std::to_string(config.lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_d1.type, lighting.lut_d1.abs_input);
+            d1_lut_value = "(" + std::to_string(lighting.lut_d1.scale) + " * " + GetLutValue(Regs::LightingSampler::Distribution1, index) + ")";
        }
        std::string specular_1 = "(" + d1_lut_value + " * refl_value * " + light_src + ".specular_1)";

        // Fresnel
-        if (config.lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(config.lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
+        if (lighting.lut_fr.enable && Pica::Regs::IsLightingSamplerSupported(lighting.config, Pica::Regs::LightingSampler::Fresnel)) {
            // Lookup fresnel LUT value
-            std::string index = GetLutIndex(light_config.num, config.lighting.lut_fr.type, config.lighting.lut_fr.abs_input);
-            std::string value = "(" + std::to_string(config.lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";
+            std::string index = GetLutIndex(light_config.num, lighting.lut_fr.type, lighting.lut_fr.abs_input);
+            std::string value = "(" + std::to_string(lighting.lut_fr.scale) + " * " + GetLutValue(Regs::LightingSampler::Fresnel, index) + ")";

            // Enabled for diffuse lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::PrimaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                out += "diffuse_sum.a  *= " + value + ";\n";

            // Enabled for the specular lighting alpha component
-            if (config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
-                config.lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
+            if (lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::SecondaryAlpha ||
+                lighting.fresnel_selector == Pica::Regs::LightingFresnelSelector::Both)
                out += "specular_sum.a *= " + value + ";\n";
        }

@ -510,6 +526,8 @@ static void WriteLighting(std::string& out, const PicaShaderConfig& config) {
 }

 std::string GenerateFragmentShader(const PicaShaderConfig& config) {
+    const auto& state = config.state;
+
    std::string out = R"(
 #version 330 core
 #define NUM_TEV_STAGES 6
@ -519,6 +537,7 @@ std::string GenerateFragmentShader(const PicaShaderConfig& config) {

 in vec4 primary_color;
 in vec2 texcoord[3];
+in float texcoord0_w;
 in vec4 normquat;
 in vec3 view;

@ -536,6 +555,7 @@ layout (std140) uniform shader_data {
    vec4 const_color[NUM_TEV_STAGES];
    vec4 tev_combiner_buffer_color;
    int alphatest_ref;
+    float depth_scale;
    float depth_offset;
    vec3 lighting_global_ambient;
    LightSrc light_src[NUM_LIGHTS];
@ -555,29 +575,37 @@ vec4 secondary_fragment_color = vec4(0.0);
 )";

    // Do not do any sort of processing if it's obvious we're not going to pass the alpha test
-    if (config.alpha_test_func == Regs::CompareFunc::Never) {
+    if (state.alpha_test_func == Regs::CompareFunc::Never) {
        out += "discard; }";
        return out;
    }

-    if (config.lighting.enable)
+    if (state.lighting.enable)
        WriteLighting(out, config);

    out += "vec4 combiner_buffer = vec4(0.0);\n";
    out += "vec4 next_combiner_buffer = tev_combiner_buffer_color;\n";
    out += "vec4 last_tex_env_out = vec4(0.0);\n";

-    for (size_t index = 0; index < config.tev_stages.size(); ++index)
+    for (size_t index = 0; index < state.tev_stages.size(); ++index)
        WriteTevStage(out, config, (unsigned)index);

-    if (config.alpha_test_func != Regs::CompareFunc::Always) {
+    if (state.alpha_test_func != Regs::CompareFunc::Always) {
        out += "if (";
-        AppendAlphaTestCondition(out, config.alpha_test_func);
+        AppendAlphaTestCondition(out, state.alpha_test_func);
        out += ") discard;\n";
    }

    out += "color = last_tex_env_out;\n";
-    out += "gl_FragDepth = gl_FragCoord.z + depth_offset;\n}";
+
+    out += "float z_over_w = 1.0 - gl_FragCoord.z * 2.0;\n";
+    out += "float depth = z_over_w * depth_scale + depth_offset;\n";
+    if (state.depthmap_enable == Pica::Regs::DepthBuffering::WBuffering) {
+        out += "depth /= gl_FragCoord.w;\n";
+    }
+    out += "gl_FragDepth = depth;\n";
+
+    out += "}";

    return out;
 }
@ -585,17 +613,19 @@ vec4 secondary_fragment_color = vec4(0.0);
 std::string GenerateVertexShader() {
    std::string out = "#version 330 core\n";

-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)  + ") in vec4 vert_position;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)     + ") in vec4 vert_color;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0) + ") in vec2 vert_texcoord0;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1) + ") in vec2 vert_texcoord1;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2) + ") in vec2 vert_texcoord2;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)  + ") in vec4 vert_normquat;\n";
-    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)      + ") in vec3 vert_view;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_POSITION)    + ") in vec4 vert_position;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_COLOR)       + ") in vec4 vert_color;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0)   + ") in vec2 vert_texcoord0;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD1)   + ") in vec2 vert_texcoord1;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD2)   + ") in vec2 vert_texcoord2;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_TEXCOORD0_W) + ") in float vert_texcoord0_w;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_NORMQUAT)    + ") in vec4 vert_normquat;\n";
+    out += "layout(location = " + std::to_string((int)ATTRIBUTE_VIEW)        + ") in vec3 vert_view;\n";

    out += R"(
 out vec4 primary_color;
 out vec2 texcoord[3];
+out float texcoord0_w;
 out vec4 normquat;
 out vec3 view;

@ -604,6 +634,7 @@ void main() {
    texcoord[0] = vert_texcoord0;
    texcoord[1] = vert_texcoord1;
    texcoord[2] = vert_texcoord2;
+    texcoord0_w = vert_texcoord0_w;
    normquat = vert_normquat;
    view = vert_view;
    gl_Position = vec4(vert_position.x, vert_position.y, -vert_position.z, vert_position.w);
--- a/src/video_core/renderer_opengl/gl_shader_gen.h
+++ b/src/video_core/renderer_opengl/gl_shader_gen.h
@ -6,7 +6,7 @@

 #include <string>

-struct PicaShaderConfig;
+union PicaShaderConfig;

 namespace GLShader {

--- a/src/video_core/renderer_opengl/gl_shader_util.h
+++ b/src/video_core/renderer_opengl/gl_shader_util.h
@ -14,6 +14,7 @@ enum Attributes {
    ATTRIBUTE_TEXCOORD0,
    ATTRIBUTE_TEXCOORD1,
    ATTRIBUTE_TEXCOORD2,
+    ATTRIBUTE_TEXCOORD0_W,
    ATTRIBUTE_NORMQUAT,
    ATTRIBUTE_VIEW,
 };
--- a/src/video_core/renderer_opengl/renderer_opengl.cpp
+++ b/src/video_core/renderer_opengl/renderer_opengl.cpp
@ -192,7 +192,7 @@ void RendererOpenGL::LoadFBToScreenInfo(const GPU::Regs::FramebufferConfig& fram
    // only allows rows to have a memory alignment of 4.
    ASSERT(pixel_stride % 4 == 0);

-    if (!Rasterizer()->AccelerateDisplay(framebuffer, framebuffer_addr, pixel_stride, screen_info)) {
+    if (!Rasterizer()->AccelerateDisplay(framebuffer, framebuffer_addr, static_cast<u32>(pixel_stride), screen_info)) {
        // Reset the screen info's display texture to its own permanent texture
        screen_info.display_texture = screen_info.texture.resource.handle;
        screen_info.display_texcoords = MathUtil::Rectangle<float>(0.f, 0.f, 1.f, 1.f);
@ -473,12 +473,6 @@ static void DebugHandler(GLenum source, GLenum type, GLuint id, GLenum severity,
 bool RendererOpenGL::Init() {
    render_window->MakeCurrent();

-    // TODO: Make frontends initialize this, so they can use gladLoadGLLoader with their own loaders
-    if (!gladLoadGL()) {
-        LOG_CRITICAL(Render_OpenGL, "Failed to initialize GL functions! Exiting...");
-        exit(-1);
-    }
-
    if (GLAD_GL_KHR_debug) {
        glEnable(GL_DEBUG_OUTPUT);
        glDebugMessageCallback(DebugHandler, nullptr);
--- a/src/video_core/shader/shader.cpp
+++ b/src/video_core/shader/shader.cpp
@ -35,7 +35,13 @@ static std::unordered_map<u64, std::unique_ptr<JitShader>> shader_map;
 static const JitShader* jit_shader;
 #endif // ARCHITECTURE_x86_64

-void Setup() {
+void ClearCache() {
+#ifdef ARCHITECTURE_x86_64
+    shader_map.clear();
+#endif // ARCHITECTURE_x86_64
+}
+
+void ShaderSetup::Setup() {
 #ifdef ARCHITECTURE_x86_64
    if (VideoCore::g_shader_jit_enabled) {
        u64 cache_key = (Common::ComputeHash64(&g_state.vs.program_code, sizeof(g_state.vs.program_code)) ^
@ -54,18 +60,12 @@ void Setup() {
 #endif // ARCHITECTURE_x86_64
 }

-void Shutdown() {
-#ifdef ARCHITECTURE_x86_64
-    shader_map.clear();
-#endif // ARCHITECTURE_x86_64
-}
+MICROPROFILE_DEFINE(GPU_Shader, "GPU", "Shader", MP_RGB(50, 50, 240));

-MICROPROFILE_DEFINE(GPU_VertexShader, "GPU", "Vertex Shader", MP_RGB(50, 50, 240));
-
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
+OutputVertex ShaderSetup::Run(UnitState<false>& state, const InputVertex& input, int num_attributes) {
    auto& config = g_state.regs.vs;

-    MICROPROFILE_SCOPE(GPU_VertexShader);
+    MICROPROFILE_SCOPE(GPU_Shader);

    state.program_counter = config.main_offset;
    state.debug.max_offset = 0;
@ -140,7 +140,7 @@ OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attr
    return ret;
 }

-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
+DebugData<true> ShaderSetup::ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup) {
    UnitState<true> state;

    state.program_counter = config.main_offset;
--- a/src/video_core/shader/shader.h
+++ b/src/video_core/shader/shader.h
@ -43,7 +43,8 @@ struct OutputVertex {
    Math::Vec4<float24> color;
    Math::Vec2<float24> tc0;
    Math::Vec2<float24> tc1;
-    INSERT_PADDING_WORDS(2);
+    float24 tc0_w;
+    INSERT_PADDING_WORDS(1);
    Math::Vec3<float24> view;
    INSERT_PADDING_WORDS(1);
    Math::Vec2<float24> tc2;
@ -83,23 +84,6 @@ struct OutputVertex {
 static_assert(std::is_pod<OutputVertex>::value, "Structure is not POD");
 static_assert(sizeof(OutputVertex) == 32 * sizeof(float), "OutputVertex has invalid size");

-/// Vertex shader memory
-struct ShaderSetup {
-    struct {
-        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
-        // therefore required to be 16-byte aligned.
-        alignas(16) Math::Vec4<float24> f[96];
-
-        std::array<bool, 16> b;
-        std::array<Math::Vec4<u8>, 4> i;
-    } uniforms;
-
-    Math::Vec4<float24> default_attributes[16];
-
-    std::array<u32, 1024> program_code;
-    std::array<u32, 1024> swizzle_data;
-};
-
 // Helper structure used to keep track of data useful for inspection of shader emulation
 template<bool full_debugging>
 struct DebugData;
@ -342,33 +326,51 @@ struct UnitState {
    }
 };

-/**
- * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
- * vertex, which would happen within the `Run` function).
- */
-void Setup();
+/// Clears the shader cache
+void ClearCache();

-/// Performs any cleanup when the emulator is shutdown
-void Shutdown();
+struct ShaderSetup {

-/**
- * Runs the currently setup shader
- * @param state Shader unit state, must be setup per shader and per shader unit
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @return The output vertex, after having been processed by the vertex shader
- */
-OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+    struct {
+        // The float uniforms are accessed by the shader JIT using SSE instructions, and are
+        // therefore required to be 16-byte aligned.
+        alignas(16) Math::Vec4<float24> f[96];

-/**
- * Produce debug information based on the given shader and input vertex
- * @param input Input vertex into the shader
- * @param num_attributes The number of vertex shader attributes
- * @param config Configuration object for the shader pipeline
- * @param setup Setup object for the shader pipeline
- * @return Debug information for this shader with regards to the given vertex
- */
-DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+        std::array<bool, 16> b;
+        std::array<Math::Vec4<u8>, 4> i;
+    } uniforms;
+
+    Math::Vec4<float24> default_attributes[16];
+
+    std::array<u32, 1024> program_code;
+    std::array<u32, 1024> swizzle_data;
+
+    /**
+     * Performs any shader unit setup that only needs to happen once per shader (as opposed to once per
+     * vertex, which would happen within the `Run` function).
+     */
+    void Setup();
+
+    /**
+     * Runs the currently setup shader
+     * @param state Shader unit state, must be set up per shader and per shader unit
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     * @return The output vertex, after having been processed by the vertex shader
+     */
+    OutputVertex Run(UnitState<false>& state, const InputVertex& input, int num_attributes);
+
+    /**
+     * Produce debug information based on the given shader and input vertex
+     * @param input Input vertex into the shader
+     * @param num_attributes The number of vertex shader attributes
+     * @param config Configuration object for the shader pipeline
+     * @param setup Setup object for the shader pipeline
+     * @return Debug information for this shader with regards to the given vertex
+     */
+    DebugData<true> ProduceDebugInfo(const InputVertex& input, int num_attributes, const Regs::ShaderConfig& config, const ShaderSetup& setup);
+
+};

 } // namespace Shader