diff --git a/src/common/common_types.h b/src/common/common_types.h
index fa3e0b8d6..1d933bd6f 100644
--- a/src/common/common_types.h
+++ b/src/common/common_types.h
@@ -50,6 +50,13 @@ typedef double  f64; ///< 64-bit floating point
 typedef u32 VAddr; ///< Represents a pointer in the userspace virtual address space.
 typedef u32 PAddr; ///< Represents a pointer in the ARM11 physical address space.
 
+// A 24-bit storage datatype to make working with data of this size (e.g. RGB8 textures) easier.
+// Not intended for usage in arithmetic. Byte buffers get reinterpreted as arrays of it, so it must stay unpadded.
+struct u24_be {
+    u8 components[3];
+};
+static_assert(sizeof(u24_be) == 3, "u24_be must be exactly 3 bytes so pixel buffers can be viewed as arrays of it");
+
 // An inheritable class to disallow the copy constructor and operator= functions
 class NonCopyable {
 protected:
diff --git a/src/video_core/renderer_opengl/gl_rasterizer.cpp b/src/video_core/renderer_opengl/gl_rasterizer.cpp
index a613fe136..d55ceab78 100644
--- a/src/video_core/renderer_opengl/gl_rasterizer.cpp
+++ b/src/video_core/renderer_opengl/gl_rasterizer.cpp
@@ -102,9 +102,10 @@ void RasterizerOpenGL::InitObjects() {
     glEnableVertexAttribArray(attrib_texcoords + 1);
     glEnableVertexAttribArray(attrib_texcoords + 2);
 
-    // Create textures for OGL framebuffer that will be rendered to, initially 1x1 to succeed in framebuffer creation
+    // Create the textures for the OGL framebuffer that will be rendered to. They are given an initial size so that
+    // framebuffer creation succeeds; it is 8x8 because 3DS texture widths and heights must be multiples of 8.
     fb_color_texture.texture.Create();
-    ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 1, 1);
+    ReconfigureColorTexture(fb_color_texture, Pica::Regs::ColorFormat::RGBA8, 8, 8);
 
     state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
     state.Apply();
@@ -848,15 +849,21 @@ void RasterizerOpenGL::ReloadColorBuffer() {
     std::unique_ptr<u8[]> temp_fb_color_buffer(new u8[fb_color_texture.width * fb_color_texture.height * bytes_per_pixel]);
 
     // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
-    for (int y = 0; y < fb_color_texture.height; ++y) {
-        for (int x = 0; x < fb_color_texture.width; ++x) {
-            const u32 coarse_y = y & ~7;
-            u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel;
-            u32 gl_pixel_index = (x + y * fb_color_texture.width) * bytes_per_pixel;
-
-            u8* pixel = color_buffer + dst_offset;
-            memcpy(&temp_fb_color_buffer[gl_pixel_index], pixel, bytes_per_pixel);
-        }
+    // color_buffer holds the emulated framebuffer in 8x8 Morton-order tiles, whereas the buffer uploaded to
+    // OpenGL is linear. This direction therefore has to untile: walk the source tile by tile and scatter each
+    // pixel to its linear position (CopyTextureAndTile converts the other way, from linear to tiled).
+    const u8* src_tile = color_buffer;
+    for (u32 tile_y = 0; tile_y + 8 <= fb_color_texture.height; tile_y += 8) {
+        for (u32 tile_x = 0; tile_x + 8 <= fb_color_texture.width; tile_x += 8) {
+            for (u32 yy = 0; yy < 8; ++yy) {
+                for (u32 xx = 0; xx < 8; ++xx) {
+                    const u32 gl_pixel_index = ((tile_y + yy) * fb_color_texture.width + tile_x + xx) * bytes_per_pixel;
+                    const u8* pixel = src_tile + VideoCore::MortonInterleave(xx, yy) * bytes_per_pixel;
+                    memcpy(&temp_fb_color_buffer[gl_pixel_index], pixel, bytes_per_pixel);
+                }
+            }
+            src_tile += 8 * 8 * bytes_per_pixel;
+        }
     }
 
     state.texture_units[0].texture_2d = fb_color_texture.texture.handle;
@@ -961,15 +968,21 @@ void RasterizerOpenGL::CommitColorBuffer() {
             state.Apply();
 
             // Directly copy pixels. Internal OpenGL color formats are consistent so no conversion is necessary.
-            for (int y = 0; y < fb_color_texture.height; ++y) {
-                for (int x = 0; x < fb_color_texture.width; ++x) {
-                    const u32 coarse_y = y & ~7;
-                    u32 dst_offset = VideoCore::GetMortonOffset(x, y, bytes_per_pixel) + coarse_y * fb_color_texture.width * bytes_per_pixel;
-                    u32 gl_pixel_index = x * bytes_per_pixel + y * fb_color_texture.width * bytes_per_pixel;
-
-                    u8* pixel = color_buffer + dst_offset;
-                    memcpy(pixel, &temp_gl_color_buffer[gl_pixel_index], bytes_per_pixel);
-                }
+            switch (bytes_per_pixel) { // Copy one T per pixel: src is the linear GL readback, dst is the tiled color_buffer
+            case 4:
+                VideoCore::CopyTextureAndTile<u32>((u32*)color_buffer, (u32*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
+                break;
+            case 3: // u24_be wraps three u8 components, so one element corresponds to one 3-byte pixel
+                VideoCore::CopyTextureAndTile<u24_be>((u24_be*)color_buffer, (u24_be*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
+                break;
+            case 2:
+                VideoCore::CopyTextureAndTile<u16>((u16*)color_buffer, (u16*)temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
+                break;
+            case 1: // Buffers are already byte pointers here, so no cast is needed
+                VideoCore::CopyTextureAndTile<u8>(color_buffer, temp_gl_color_buffer.get(), fb_color_texture.width, fb_color_texture.height);
+                break;
+            default: // Any other pixel size is only logged; nothing is written to color_buffer by this switch
+                LOG_ERROR(Render_OpenGL, "Unimplemented pixel size %u bytes per pixel", bytes_per_pixel);
             }
         }
     }
diff --git a/src/video_core/utils.h b/src/video_core/utils.h
index 4fa60a10e..ba61b2a70 100644
--- a/src/video_core/utils.h
+++ b/src/video_core/utils.h
@@ -35,17 +35,52 @@ struct TGAHeader {
  */
 void DumpTGA(std::string filename, short width, short height, u8* raw_data);
 
+/// Lookup table for the offsets used to convert an image to Morton order, indexed as [y * 8 + x] inside an 8x8 tile.
+static const u8 morton_lut[64] = {
+    0,  1,  4,  5, 16, 17, 20, 21, // y = 0 (x runs 0..7 along each row)
+    2,  3,  6,  7, 18, 19, 22, 23, // y = 1
+    8,  9, 12, 13, 24, 25, 28, 29, // y = 2
+    10, 11, 14, 15, 26, 27, 30, 31, // y = 3
+    32, 33, 36, 37, 48, 49, 52, 53, // y = 4
+    34, 35, 38, 39, 50, 51, 54, 55, // y = 5
+    40, 41, 44, 45, 56, 57, 60, 61, // y = 6
+    42, 43, 46, 47, 58, 59, 62, 63, // y = 7
+};
+
 /**
- * Interleave the lower 3 bits of each coordinate to get the intra-block offsets, which are
- * arranged in a Z-order curve. More details on the bit manipulation at:
- * https://fgiesen.wordpress.com/2009/12/13/decoding-morton-codes/
+ * Returns the Morton order (Z-order) offset, in pixels, of a coordinate inside its 8x8 block via the lookup table.
+ * @param x X coordinate; only its lower 3 bits are used, so full texture coordinates are safe to pass.
+ * @param y Y coordinate; only its lower 3 bits are used, so full texture coordinates are safe to pass.
  */
 static inline u32 MortonInterleave(u32 x, u32 y) {
-    u32 i = (x & 7) | ((y & 7) << 8); // ---- -210
-    i = (i ^ (i << 2)) & 0x1313;      // ---2 --10
-    i = (i ^ (i << 1)) & 0x1515;      // ---2 -1-0
-    i = (i | (i >> 7)) & 0x3F;
-    return i;
+    return morton_lut[(y & 7) * 8 + (x & 7)];
+}
+
+/**
+ * Copies a linear (row-major) image from src into dst in tiled form: the image is cut into 8x8 pixel tiles that
+ * are written back to back in row-major tile order, with the 64 pixels of each tile arranged in Morton order.
+ * Rows and columns past the last complete 8x8 tile are not copied.
+ * @tparam T Pixel storage type; exactly one T is copied per pixel, so its size selects the pixel size.
+ * @param dst Pointer to the destination, which receives the tiled image.
+ * @param src Pointer to the source image in linear layout (row stride of `width` pixels).
+ * @param width Width of the texture in pixels, should be a multiple of 8.
+ * @param height Height of the texture in pixels, should be a multiple of 8.
+ */
+template<typename T>
+static inline void CopyTextureAndTile(T* dst, const T* src, unsigned int width, unsigned int height) {
+    T* tile = dst; // Start of the tile currently being written
+    for (unsigned int tile_y = 0; tile_y + 8 <= height; tile_y += 8) {
+        for (unsigned int tile_x = 0; tile_x + 8 <= width; tile_x += 8) {
+            const T* row = &src[tile_y * width + tile_x]; // First source row of this tile
+            const u8* lut_row = morton_lut;
+            for (unsigned int r = 0; r < 8; ++r, row += width, lut_row += 8) {
+                for (unsigned int c = 0; c < 8; ++c) {
+                    tile[lut_row[c]] = row[c];
+                }
+            }
+            tile += 8 * 8;
+        }
+    }
 }
 
 /**
@@ -75,7 +110,7 @@ static inline u32 GetMortonOffset(u32 x, u32 y, u32 bytes_per_pixel) {
     const unsigned int block_height = 8;
     const unsigned int coarse_x = x & ~7;
 
-    u32 i = VideoCore::MortonInterleave(x, y);
+    u32 i = VideoCore::MortonInterleave(x & 7, y & 7);
 
     const unsigned int offset = coarse_x * block_height;