Rasterizer/HW: Improved the speed of the swizzling done during each framebuffer readback / reload. This reduces readback and reload time by a few milliseconds.

Original function by @yuriks
This commit is contained in:
Subv 2015-08-16 15:11:58 -05:00
parent 7411aed5c7
commit 9d0f22d24f
3 changed files with 82 additions and 29 deletions

View file

@ -50,6 +50,11 @@ typedef double f64; ///< 64-bit floating point
typedef u32 VAddr; ///< Represents a pointer in the userspace virtual address space. typedef u32 VAddr; ///< Represents a pointer in the userspace virtual address space.
typedef u32 PAddr; ///< Represents a pointer in the ARM11 physical address space. typedef u32 PAddr; ///< Represents a pointer in the ARM11 physical address space.
// A 24-bit storage datatype to make working with data of this size (e.g. RGB8 textures) easier.
// It only wraps three raw bytes so that a 3-byte pixel can be passed around and assigned as a
// single value (for example as the element type of a templated copy). Not intended for usage
// in arithmetic.
// NOTE(review): the "_be" suffix presumably stands for big-endian, but nothing in this struct
// enforces or interprets a byte order - confirm before relying on it.
struct u24_be {
u8 components[3]; // The three bytes of the value, kept in memory order
};
// An inheritable class to disallow the copy constructor and operator= functions // An inheritable class to disallow the copy constructor and operator= functions
class NonCopyable { class NonCopyable {
protected: protected:

View file

@ -102,9 +102,10 @@ void RasterizerOpenGL::InitObjects() {
glEnableVertexAttribArray(attrib_texcoords + 1); glEnableVertexAttribArray(attrib_texcoords + 1);
glEnableVertexAttribArray(attrib_texcoords + 2); glEnableVertexAttribArray(attrib_texcoords + 2);
// Create textures for OGL framebuffer that will be rendered to, initially 1x1 to succeed in framebuffer creation // Create textures for OGL framebuffer that will be rendered to, initially 8x8 to succeed in framebuffer creation
// 3DS textures' width and height must be multiples of 8
fb_color_texture.texture.Create(); fb_color_texture.texture.Create();
ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1); ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 8, 8);
state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
state.Apply(); state.Apply();
@ -848,15 +849,21 @@ void RasterizerOpenGL::ReloadColorBuffer() {
std::unique_ptr<u8[]> temp_fb_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]); std::unique_ptr<u8[]> temp_fb_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]);
// Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary. // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
for (int y = 0; y < fb_color_texture.height; ++y) { switch (bytes_per_pixel) {
for (int x = 0; x < fb_color_texture.width; ++x) { case 4:
const u32 coarse_y = y & ~7; VideoCore::CopyTextureAndTile<u32>((u32*)temp_fb_color_buffer.get(), (u32*)color_buffer, fb_color_texture.width, fb_color_texture.height);
u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel; break;
u32 gl_pixel_index = (x + y * fb_color_texture.width) * bytes_per_pixel; case 3:
VideoCore::CopyTextureAndTile<u24_be>((u24_be*)temp_fb_color_buffer.get(), (u24_be*)color_buffer, fb_color_texture.width, fb_color_texture.height);
u8* pixel = color_buffer + dst_offset; break;
memcpy(&temp_fb_color_buffer[gl_pixel_index], pixel, bytes_per_pixel); case 2:
} VideoCore::CopyTextureAndTile<u16>((u16*)temp_fb_color_buffer.get(), (u16*)color_buffer, fb_color_texture.width, fb_color_texture.height);
break;
case 1:
VideoCore::CopyTextureAndTile<u8>(temp_fb_color_buffer.get(), color_buffer, fb_color_texture.width, fb_color_texture.height);
break;
default:
LOG_ERROR(Render_OpenGL, "Unimplemented pixel size %u bytes per pixel", bytes_per_pixel);
} }
state.texture_units[0].texture_2d = fb_color_texture.texture.handle; state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
@ -961,15 +968,21 @@ void RasterizerOpenGL::CommitColorBuffer() {
state.Apply(); state.Apply();
// Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary. // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
for (int y = 0; y < fb_color_texture.height; ++y) { switch (bytes_per_pixel) {
for (int x = 0; x < fb_color_texture.width; ++x) { case 4:
const u32 coarse_y = y & ~7; VideoCore::CopyTextureAndTile<u32>((u32*)color_buffer, (u32*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel; break;
u32 gl_pixel_index = x * bytes_per_pixel + y * fb_color_texture.width * bytes_per_pixel; case 3:
VideoCore::CopyTextureAndTile<u24_be>((u24_be*)color_buffer, (u24_be*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
u8* pixel = color_buffer + dst_offset; break;
memcpy(pixel, &temp_gl_color_buffer[gl_pixel_index], bytes_per_pixel); case 2:
} VideoCore::CopyTextureAndTile<u16>((u16*)color_buffer, (u16*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
break;
case 1:
VideoCore::CopyTextureAndTile<u8>(color_buffer, temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
break;
default:
LOG_ERROR(Render_OpenGL, "Unimplemented pixel size %u bytes per pixel", bytes_per_pixel);
} }
} }
} }

View file

@ -35,17 +35,52 @@ struct TGAHeader {
*/ */
void DumpTGA(std::string filename, short width, short height, u8* raw_data); void DumpTGA(std::string filename, short width, short height, u8* raw_data);
/// Lookup table for the offsets used to convert an image to Morton order.
/// The table is indexed as morton_lut[y * 8 + x], where x and y are coordinates inside a single
/// 8x8 tile (each in [0, 7]). Every entry is the Z-order offset, in elements, of that pixel
/// within the tile: the three bits of x end up in bits 0, 2 and 4 of the offset and the three
/// bits of y end up in bits 1, 3 and 5.
static const u8 morton_lut[64] = {
0, 1, 4, 5, 16, 17, 20, 21, // y = 0
2, 3, 6, 7, 18, 19, 22, 23, // y = 1
8, 9, 12, 13, 24, 25, 28, 29, // y = 2
10, 11, 14, 15, 26, 27, 30, 31, // y = 3
32, 33, 36, 37, 48, 49, 52, 53, // y = 4
34, 35, 38, 39, 50, 51, 54, 55, // y = 5
40, 41, 44, 45, 56, 57, 60, 61, // y = 6
42, 43, 46, 47, 58, 59, 62, 63, // y = 7
};
/** /**
* Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are * Lookup the intra-block offset for the specified coordinates in the Morton order (Z-order) lookup table.
* arranged in a Z-order curve. More details on the bit manipulation at: * @param x X coordinate, must be [0, 7]
* https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/ * @param y Y coordinate, must be [0, 7]
*/ */
static inline u32 MortonInterleave(u32 x, u32 y) { static inline u32 MortonInterleave(u32 x, u32 y) {
u32 i = (x & 7) | ((y & 7) << 8); // ---- -210 return morton_lut[y * 8 + x];
i = (i ^ (i << 2)) & 0x1313; // ---2 --10 }
i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
i = (i | (i >> 7)) & 0x3F; /**
return i; * Copies the texture data from the source address to the destination address,
* applying a Morton-order transformation while copying.
* @param dst Pointer to which the texture will be copied.
* @param src Pointer to the source texture data.
* @param width Width of the texture, should be a multiple of 8.
* @param height Height of the texture, should be a multiple of 8.
* @param T Type of the source and destination pointers, the swizzling process depends on the size of this parameter.
*/
// Reads `src` as a row-major image of `width` x `height` elements of type T and writes it to
// `dst` as a sequence of 8x8 tiles, each tile stored in Morton order via morton_lut.
template<typename T>
static inline void CopyTextureAndTile(T* dst, const T* src, unsigned int width, unsigned int height) {
// Visit the source one 8x8 tile at a time, row of tiles by row of tiles. The loop conditions
// only admit complete tiles, so trailing rows/columns are skipped when a dimension is not a
// multiple of 8.
for (unsigned int y = 0; y + 8 <= height; y += 8) {
for (unsigned int x = 0; x + 8 <= width; x += 8) {
// First pixel (top-left corner) of the current tile in the row-major source.
const T* line = &src[y * width + x];
for (unsigned int yy = 0; yy < 8; ++yy) {
for (unsigned int xx = 0; xx < 8; ++xx) {
// Scatter the pixel to its Morton-order slot inside the current destination tile.
dst[morton_lut[yy * 8 + xx]] = line[xx];
}
// Step down to the next source row of the same tile.
line += width;
}
// Each finished tile occupies 64 consecutive elements in the destination.
dst += 8 * 8;
}
}
} }
/** /**
@ -75,7 +110,7 @@ static inline u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) {
const unsigned int block_height = 8; const unsigned int block_height = 8;
const unsigned int coarse_x = x & ~7; const unsigned int coarse_x = x & ~7;
u32 i = VideoCore::MortonInterleave(x, y); u32 i = VideoCore::MortonInterleave(x & 7, y & 7);
const unsigned int offset = coarse_x * block_height; const unsigned int offset = coarse_x * block_height;