From 9d0f22d24fdbefb3e0505cb7d502431591647b8e Mon Sep 17 00:00:00 2001 From: Subv Date: Sun, 16 Aug 2015 15:11:58 -0500 Subject: [PATCH] Rasterizer/HW: Improved the speed of the swizzling done during each framebuffer readback / reload. This reduces readback and reload time by a few milliseconds. Original function by @yuriks --- src/common/common_types.h | 5 ++ .../renderer_opengl/gl_rasterizer.cpp | 53 ++++++++++++------- src/video_core/utils.h | 53 +++++++++++++++---- 3 files changed, 82 insertions(+), 29 deletions(-) diff --git a/src/common/common_types.h b/src/common/common_types.h index fa3e0b8d6..1d933bd6f 100644 --- a/src/common/common_types.h +++ b/src/common/common_types.h @@ -50,6 +50,11 @@ typedef double f64; ///< 64-bit floating point typedef u32 VAddr; ///< Represents a pointer in the userspace virtual address space. typedef u32 PAddr; ///< Represents a pointer in the ARM11 physical address space. +// A 24-bit storage datatype to make working with data of this size (e.g. RGB8 textures) easier. Not intended for usage in arithmetic. 
+struct u24_be { + u8 components[3]; +}; + // An inheritable class to disallow the copy constructor and operator= functions class NonCopyable { protected: diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp index a613fe136..d55ceab78 100644 --- a/src/video_core/renderer_opengl/gl_rasterizer.cpp +++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp @@ -102,9 +102,10 @@ void RasterizerOpenGL::InitObjects() { glEnableVertexAttribArray(attrib_texcoords + 1); glEnableVertexAttribArray(attrib_texcoords + 2); - // Create textures for OGL framebuffer that will be rendered to, initially 1x1 to succeed in framebuffer creation + // Create textures for OGL framebuffer that will be rendered to, initially 8x8 to succeed in framebuffer creation + // 3DS textures' width and height must be multiples of 8 fb_color_texture.texture.Create(); - ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1); + ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 8, 8); state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.Apply(); @@ -848,15 +849,21 @@ void RasterizerOpenGL::ReloadColorBuffer() { std::unique_ptr<u8[]> temp_fb_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]); // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary. 
- for (int y = 0; y < fb_color_texture.height; ++y) { - for (int x = 0; x < fb_color_texture.width; ++x) { - const u32 coarse_y = y & ~7; - u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel; - u32 gl_pixel_index = (x + y * fb_color_texture.width) * bytes_per_pixel; - - u8* pixel = color_buffer + dst_offset; - memcpy(&temp_fb_color_buffer[gl_pixel_index], pixel, bytes_per_pixel); - } + switch (bytes_per_pixel) { + case 4: + VideoCore::CopyTextureAndTile((u32*)temp_fb_color_buffer.get(), (u32*)color_buffer, fb_color_texture.width, fb_color_texture.height); + break; + case 3: + VideoCore::CopyTextureAndTile((u24_be*)temp_fb_color_buffer.get(), (u24_be*)color_buffer, fb_color_texture.width, fb_color_texture.height); + break; + case 2: + VideoCore::CopyTextureAndTile((u16*)temp_fb_color_buffer.get(), (u16*)color_buffer, fb_color_texture.width, fb_color_texture.height); + break; + case 1: + VideoCore::CopyTextureAndTile(temp_fb_color_buffer.get(), color_buffer, fb_color_texture.width, fb_color_texture.height); + break; + default: + LOG_ERROR(Render_OpenGL, "Unimplemented pixel size %u bytes per pixel", bytes_per_pixel); } state.texture_units[0].texture_2d = fb_color_texture.texture.handle; @@ -961,15 +968,21 @@ void RasterizerOpenGL::CommitColorBuffer() { state.Apply(); // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary. 
- for (int y = 0; y < fb_color_texture.height; ++y) { - for (int x = 0; x < fb_color_texture.width; ++x) { - const u32 coarse_y = y & ~7; - u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel; - u32 gl_pixel_index = x * bytes_per_pixel + y * fb_color_texture.width * bytes_per_pixel; - - u8* pixel = color_buffer + dst_offset; - memcpy(pixel, &temp_gl_color_buffer[gl_pixel_index], bytes_per_pixel); - } + switch (bytes_per_pixel) { + case 4: + VideoCore::CopyTextureAndTile((u32*)color_buffer, (u32*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height); + break; + case 3: + VideoCore::CopyTextureAndTile((u24_be*)color_buffer, (u24_be*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height); + break; + case 2: + VideoCore::CopyTextureAndTile((u16*)color_buffer, (u16*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height); + break; + case 1: + VideoCore::CopyTextureAndTile(color_buffer, temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height); + break; + default: + LOG_ERROR(Render_OpenGL, "Unimplemented pixel size %u bytes per pixel", bytes_per_pixel); } } } diff --git a/src/video_core/utils.h b/src/video_core/utils.h index 4fa60a10e..ba61b2a70 100644 --- a/src/video_core/utils.h +++ b/src/video_core/utils.h @@ -35,17 +35,52 @@ struct TGAHeader { */ void DumpTGA(std::string filename, short width, short height, u8* raw_data); +/// Lookup table for the offsets used to convert an image to Morton order. 
+static const u8 morton_lut[64] = { + 0, 1, 4, 5, 16, 17, 20, 21, + 2, 3, 6, 7, 18, 19, 22, 23, + 8, 9, 12, 13, 24, 25, 28, 29, + 10, 11, 14, 15, 26, 27, 30, 31, + 32, 33, 36, 37, 48, 49, 52, 53, + 34, 35, 38, 39, 50, 51, 54, 55, + 40, 41, 44, 45, 56, 57, 60, 61, + 42, 43, 46, 47, 58, 59, 62, 63, +}; + /** - * Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are - * arranged in a Z-order curve. More details on the bit manipulation at: - * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/ + * Look up the intra-block offset for the specified coordinates in the Morton order (Z-order) lookup table. + * @param x X coordinate, must be in [0, 7] + * @param y Y coordinate, must be in [0, 7] */ static inline u32 MortonInterleave(u32 x, u32 y) { - u32 i = (x & 7) | ((y & 7) << 8); // ---- -210 - i = (i ^ (i << 2)) & 0x1313; // ---2 --10 - i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0 - i = (i | (i >> 7)) & 0x3F; - return i; + return morton_lut[y * 8 + x]; +} + +/** + * Copies the texture data from the source address to the destination address, + * applying a Morton-order transformation while copying. + * @param dst Pointer to which the texture will be copied. + * @param src Pointer to the source texture data. + * @param width Width of the texture, should be a multiple of 8. + * @param height Height of the texture, should be a multiple of 8. + * @param T Type of the source and destination pointers; the swizzling process depends on the size of this parameter. 
+ */ +template <typename T> +static inline void CopyTextureAndTile(T* dst, const T* src, unsigned int width, unsigned int height) { + for (unsigned int y = 0; y + 8 <= height; y += 8) { + for (unsigned int x = 0; x + 8 <= width; x += 8) { + const T* line = &src[y * width + x]; + + for (unsigned int yy = 0; yy < 8; ++yy) { + for (unsigned int xx = 0; xx < 8; ++xx) { + dst[morton_lut[yy * 8 + xx]] = line[xx]; + } + line += width; + } + + dst += 8 * 8; + } + } } /** @@ -75,7 +110,7 @@ static inline u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) { const unsigned int block_height = 8; const unsigned int coarse_x = x & ~7; - u32 i = VideoCore::MortonInterleave(x, y); + u32 i = VideoCore::MortonInterleave(x & 7, y & 7); const unsigned int offset = coarse_x * block_height;