Rasterizer/HW: Improved the speed of the swizzling done during each framebuffer readback / reload. This reduces readback and reload time by a few milliseconds.
Original function by @yuriks
This commit is contained in:
parent
7411aed5c7
commit
9d0f22d24f
3 changed files with 82 additions and 29 deletions
|
@ -50,6 +50,11 @@ typedef double f64; ///< 64-bit floating point
|
||||||
typedef u32 VAddr; ///< Represents a pointer in the userspace virtual address space.
|
typedef u32 VAddr; ///< Represents a pointer in the userspace virtual address space.
|
||||||
typedef u32 PAddr; ///< Represents a pointer in the ARM11 physical address space.
|
typedef u32 PAddr; ///< Represents a pointer in the ARM11 physical address space.
|
||||||
|
|
||||||
|
// A 24-bit storage datatype to make working with data of this size (e.g. RGB8 textures) easier. Not intended for usage in arithmetic.
|
||||||
|
struct u24_be {
|
||||||
|
u8 components[3];
|
||||||
|
};
|
||||||
|
|
||||||
// An inheritable class to disallow the copy constructor and operator= functions
|
// An inheritable class to disallow the copy constructor and operator= functions
|
||||||
class NonCopyable {
|
class NonCopyable {
|
||||||
protected:
|
protected:
|
||||||
|
|
|
@ -102,9 +102,10 @@ void RasterizerOpenGL::InitObjects() {
|
||||||
glEnableVertexAttribArray(attrib_texcoords + 1);
|
glEnableVertexAttribArray(attrib_texcoords + 1);
|
||||||
glEnableVertexAttribArray(attrib_texcoords + 2);
|
glEnableVertexAttribArray(attrib_texcoords + 2);
|
||||||
|
|
||||||
// Create textures for OGL framebuffer that will be rendered to, initially 1x1 to succeed in framebuffer creation
|
// Create textures for OGL framebuffer that will be rendered to, initially 8x8 to succeed in framebuffer creation
|
||||||
|
// 3DS textures' width and height must be multiples of 8
|
||||||
fb_color_texture.texture.Create();
|
fb_color_texture.texture.Create();
|
||||||
ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1);
|
ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 8, 8);
|
||||||
|
|
||||||
state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
|
state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
|
||||||
state.Apply();
|
state.Apply();
|
||||||
|
@ -848,15 +849,21 @@ void RasterizerOpenGL::ReloadColorBuffer() {
|
||||||
std::unique_ptr<u8[]> temp_fb_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]);
|
std::unique_ptr<u8[]> temp_fb_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]);
|
||||||
|
|
||||||
// Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
|
// Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
|
||||||
for (int y = 0; y < fb_color_texture.height; ++y) {
|
switch (bytes_per_pixel) {
|
||||||
for (int x = 0; x < fb_color_texture.width; ++x) {
|
case 4:
|
||||||
const u32 coarse_y = y & ~7;
|
VideoCore::CopyTextureAndTile<u32>((u32*)temp_fb_color_buffer.get(), (u32*)color_buffer, fb_color_texture.width, fb_color_texture.height);
|
||||||
u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel;
|
break;
|
||||||
u32 gl_pixel_index = (x + y * fb_color_texture.width) * bytes_per_pixel;
|
case 3:
|
||||||
|
VideoCore::CopyTextureAndTile<u24_be>((u24_be*)temp_fb_color_buffer.get(), (u24_be*)color_buffer, fb_color_texture.width, fb_color_texture.height);
|
||||||
u8* pixel = color_buffer + dst_offset;
|
break;
|
||||||
memcpy(&temp_fb_color_buffer[gl_pixel_index], pixel, bytes_per_pixel);
|
case 2:
|
||||||
}
|
VideoCore::CopyTextureAndTile<u16>((u16*)temp_fb_color_buffer.get(), (u16*)color_buffer, fb_color_texture.width, fb_color_texture.height);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
VideoCore::CopyTextureAndTile<u8>(temp_fb_color_buffer.get(), color_buffer, fb_color_texture.width, fb_color_texture.height);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOG_ERROR(Render_OpenGL, "Unimplemented pixel size %u bytes per pixel", bytes_per_pixel);
|
||||||
}
|
}
|
||||||
|
|
||||||
state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
|
state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
|
||||||
|
@ -961,15 +968,21 @@ void RasterizerOpenGL::CommitColorBuffer() {
|
||||||
state.Apply();
|
state.Apply();
|
||||||
|
|
||||||
// Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
|
// Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
|
||||||
for (int y = 0; y < fb_color_texture.height; ++y) {
|
switch (bytes_per_pixel) {
|
||||||
for (int x = 0; x < fb_color_texture.width; ++x) {
|
case 4:
|
||||||
const u32 coarse_y = y & ~7;
|
VideoCore::CopyTextureAndTile<u32>((u32*)color_buffer, (u32*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
|
||||||
u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel;
|
break;
|
||||||
u32 gl_pixel_index = x * bytes_per_pixel + y * fb_color_texture.width * bytes_per_pixel;
|
case 3:
|
||||||
|
VideoCore::CopyTextureAndTile<u24_be>((u24_be*)color_buffer, (u24_be*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
|
||||||
u8* pixel = color_buffer + dst_offset;
|
break;
|
||||||
memcpy(pixel, &temp_gl_color_buffer[gl_pixel_index], bytes_per_pixel);
|
case 2:
|
||||||
}
|
VideoCore::CopyTextureAndTile<u16>((u16*)color_buffer, (u16*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
|
||||||
|
break;
|
||||||
|
case 1:
|
||||||
|
VideoCore::CopyTextureAndTile<u8>(color_buffer, temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
|
||||||
|
break;
|
||||||
|
default:
|
||||||
|
LOG_ERROR(Render_OpenGL, "Unimplemented pixel size %u bytes per pixel", bytes_per_pixel);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
|
@ -35,17 +35,52 @@ struct TGAHeader {
|
||||||
*/
|
*/
|
||||||
void DumpTGA(std::string filename, short width, short height, u8* raw_data);
|
void DumpTGA(std::string filename, short width, short height, u8* raw_data);
|
||||||
|
|
||||||
|
/// Lookup table for the offsets used to convert an image to Morton order.
|
||||||
|
static const u8 morton_lut[64] = {
|
||||||
|
0, 1, 4, 5, 16, 17, 20, 21,
|
||||||
|
2, 3, 6, 7, 18, 19, 22, 23,
|
||||||
|
8, 9, 12, 13, 24, 25, 28, 29,
|
||||||
|
10, 11, 14, 15, 26, 27, 30, 31,
|
||||||
|
32, 33, 36, 37, 48, 49, 52, 53,
|
||||||
|
34, 35, 38, 39, 50, 51, 54, 55,
|
||||||
|
40, 41, 44, 45, 56, 57, 60, 61,
|
||||||
|
42, 43, 46, 47, 58, 59, 62, 63,
|
||||||
|
};
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
|
* Look up the intra-block offset for the specified coordinates in the Morton order (Z-order) lookup table.
|
||||||
* arranged in a Z-order curve. More details on the bit manipulation at:
|
* @param x X coordinate, must be in the range [0, 7]
|
||||||
* https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
|
* @param y Y coordinate, must be in the range [0, 7]
|
||||||
*/
|
*/
|
||||||
static inline u32 MortonInterleave(u32 x, u32 y) {
|
static inline u32 MortonInterleave(u32 x, u32 y) {
|
||||||
u32 i = (x & 7) | ((y & 7) << 8); // ---- -210
|
return morton_lut[y * 8 + x];
|
||||||
i = (i ^ (i << 2)) & 0x1313; // ---2 --10
|
}
|
||||||
i = (i ^ (i << 1)) & 0x1515; // ---2 -1-0
|
|
||||||
i = (i | (i >> 7)) & 0x3F;
|
/**
|
||||||
return i;
|
* Copies the texture data from the source address to the destination address,
|
||||||
|
* applying a Morton-order transformation while copying.
|
||||||
|
* @param dst Pointer to which the texture will be copied.
|
||||||
|
* @param src Pointer to the source texture data.
|
||||||
|
* @param width Width of the texture, should be a multiple of 8.
|
||||||
|
* @param height Height of the texture, should be a multiple of 8.
|
||||||
|
* @tparam T Type of the source and destination pointers; the swizzling process depends on the size of this type.
|
||||||
|
*/
|
||||||
|
template<typename T>
|
||||||
|
static inline void CopyTextureAndTile(T* dst, const T* src, unsigned int width, unsigned int height) {
|
||||||
|
for (unsigned int y = 0; y + 8 <= height; y += 8) {
|
||||||
|
for (unsigned int x = 0; x + 8 <= width; x += 8) {
|
||||||
|
const T* line = &src[y * width + x];
|
||||||
|
|
||||||
|
for (unsigned int yy = 0; yy < 8; ++yy) {
|
||||||
|
for (unsigned int xx = 0; xx < 8; ++xx) {
|
||||||
|
dst[morton_lut[yy * 8 + xx]] = line[xx];
|
||||||
|
}
|
||||||
|
line += width;
|
||||||
|
}
|
||||||
|
|
||||||
|
dst += 8 * 8;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/**
|
/**
|
||||||
|
@ -75,7 +110,7 @@ static inline u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) {
|
||||||
const unsigned int block_height = 8;
|
const unsigned int block_height = 8;
|
||||||
const unsigned int coarse_x = x & ~7;
|
const unsigned int coarse_x = x & ~7;
|
||||||
|
|
||||||
u32 i = VideoCore::MortonInterleave(x, y);
|
u32 i = VideoCore::MortonInterleave(x & 7, y & 7);
|
||||||
|
|
||||||
const unsigned int offset = coarse_x * block_height;
|
const unsigned int offset = coarse_x * block_height;
|
||||||
|
|
||||||
|
|
Loading…
Reference in a new issue