c439fc9be9
This reduces the amount of over dispatching when there are odd dimensions (i.e. ASTC 8x5), which rarely evenly divide into 32x32.
1258 lines
38 KiB
Text
1258 lines
38 KiB
Text
// Copyright 2021 yuzu Emulator Project
|
|
// Licensed under GPLv2 or any later version
|
|
// Refer to the license.txt file included.
|
|
|
|
#version 450

// Resource-declaration helpers.
// With VULKAN defined the shader parameters live in a push-constant block and the
// output image uses binding 1; otherwise (OpenGL) each parameter is a plain uniform
// with an explicit location and the output image uses binding 0.
#if defined(VULKAN)

#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 1
#define UNIFORM(n)
#define BEGIN_PUSH_CONSTANTS layout(push_constant) uniform PushConstants {
#define END_PUSH_CONSTANTS };

#else // ^^^ Vulkan ^^^ // vvv OpenGL vvv

#define BINDING_INPUT_BUFFER 0
#define BINDING_OUTPUT_IMAGE 0
#define UNIFORM(n) layout(location = n) uniform
#define BEGIN_PUSH_CONSTANTS
#define END_PUSH_CONSTANTS

#endif
|
|
|
|
// One invocation decodes one compressed block; workgroups are 8x8x1 invocations.
layout(local_size_x = 8, local_size_y = 8, local_size_z = 1) in;

// Shader parameters (push constants on Vulkan, located uniforms on OpenGL).
BEGIN_PUSH_CONSTANTS
// Footprint of one compressed block in texels (x, y).
UNIFORM(1) uvec2 block_dims;
// Added to the input offset once per array layer (multiplied by the invocation's z).
UNIFORM(2) uint layer_stride;
// Multiplied by (block_y >> block_height) when computing the input offset.
UNIFORM(3) uint block_size;
// Left shift applied to the GOB column index when computing the input offset.
UNIFORM(4) uint x_shift;
// Right shift applied to the GOB row index (block_y) to select a block row.
UNIFORM(5) uint block_height;
// Mask applied to the GOB row index (block_y) to select the GOB inside a block.
UNIFORM(6) uint block_height_mask;
END_PUSH_CONSTANTS
|
|
|
|
// One entry of an integer sequence: either a description of an encoding
// (see encoding_values) or a single decoded value.
struct EncodingData {
    uint encoding;         // JUST_BITS, QUINT or TRIT
    uint num_bits;         // Number of plain bits stored per value
    uint bit_value;        // Decoded plain bits of this value
    uint quint_trit_value; // Decoded trit/quint digit of this value (unused for JUST_BITS)
};
|
|
|
|
// Result of parsing the block-mode field at the start of a block (see DecodeBlockInfo).
struct TexelWeightParams {
    uvec2 size;           // Dimensions of the stored weight grid
    uint max_weight;      // Index into encoding_values selecting the weight encoding
    bool dual_plane;      // Two weight planes are stored
    bool error_state;     // The block mode is reserved/invalid
    bool void_extent_ldr; // Block is an LDR void-extent (constant colour) block
    bool void_extent_hdr; // Block is an HDR void-extent block
};
|
|
|
|
// Compressed input, read 128 bits (one uvec4) at a time.
layout(binding = BINDING_INPUT_BUFFER, std430) readonly buffer InputBufferU32 {
    uvec4 astc_data[];
};

// Decoded RGBA8 output; the array layer comes from the invocation's z coordinate.
layout(binding = BINDING_OUTPUT_IMAGE, rgba8) uniform writeonly image2DArray dest_image;
|
|
|
|
// Shifts used by main() to turn a byte position into a swizzled input offset.
const uint GOB_SIZE_X_SHIFT = 6;
const uint GOB_SIZE_Y_SHIFT = 3;
const uint GOB_SIZE_SHIFT = GOB_SIZE_X_SHIFT + GOB_SIZE_Y_SHIFT;

// log2 of the compressed block size in bytes (16 bytes = 128 bits).
const uint BYTES_PER_BLOCK_LOG2 = 4;

// Values stored in EncodingData::encoding.
const int JUST_BITS = 0;
const int QUINT = 1;
const int TRIT = 2;
|
|
|
|
// Every ASTC integer-sequence encoding, ordered by ascending bit length as computed
// by GetBitLength(). Only the encoding kind and the plain-bit count are meaningful
// here; bit_value and quint_trit_value are filled in while decoding.
EncodingData encoding_values[22] = EncodingData[](
    EncodingData(JUST_BITS, 0, 0, 0),
    EncodingData(JUST_BITS, 1, 0, 0),
    EncodingData(TRIT, 0, 0, 0),
    EncodingData(JUST_BITS, 2, 0, 0),
    EncodingData(QUINT, 0, 0, 0),
    EncodingData(TRIT, 1, 0, 0),
    EncodingData(JUST_BITS, 3, 0, 0),
    EncodingData(QUINT, 1, 0, 0),
    EncodingData(TRIT, 2, 0, 0),
    EncodingData(JUST_BITS, 4, 0, 0),
    EncodingData(QUINT, 2, 0, 0),
    EncodingData(TRIT, 3, 0, 0),
    EncodingData(JUST_BITS, 5, 0, 0),
    EncodingData(QUINT, 3, 0, 0),
    EncodingData(TRIT, 4, 0, 0),
    EncodingData(JUST_BITS, 6, 0, 0),
    EncodingData(QUINT, 4, 0, 0),
    EncodingData(TRIT, 5, 0, 0),
    EncodingData(JUST_BITS, 7, 0, 0),
    EncodingData(QUINT, 5, 0, 0),
    EncodingData(TRIT, 6, 0, 0),
    EncodingData(JUST_BITS, 8, 0, 0)
);
|
|
|
|
// Precomputed results of Replicate(value, num_bits, to_bit):
//   value:    the index into the table
//   num_bits: the integer after "REPLICATE_" in the table name,
//             i.e. 4 is num_bits in REPLICATE_4_BIT_TO_8_TABLE
//   to_bit:   the integer after "TO_"
// REPLICATE_BIT_TO_7_TABLE is the 1-bit variant for a 7-bit result.
const uint REPLICATE_BIT_TO_7_TABLE[2] = uint[](0, 127);
const uint REPLICATE_1_BIT_TO_9_TABLE[2] = uint[](0, 511);

// Expansion to 8 bits, 1..5 source bits.
const uint REPLICATE_1_BIT_TO_8_TABLE[2] = uint[](0, 255);
const uint REPLICATE_2_BIT_TO_8_TABLE[4] = uint[](0, 85, 170, 255);
const uint REPLICATE_3_BIT_TO_8_TABLE[8] = uint[](0, 36, 73, 109, 146, 182, 219, 255);
const uint REPLICATE_4_BIT_TO_8_TABLE[16] = uint[](
    0, 17, 34, 51, 68, 85, 102, 119,
    136, 153, 170, 187, 204, 221, 238, 255);
const uint REPLICATE_5_BIT_TO_8_TABLE[32] = uint[](
    0, 8, 16, 24, 33, 41, 49, 57,
    66, 74, 82, 90, 99, 107, 115, 123,
    132, 140, 148, 156, 165, 173, 181, 189,
    198, 206, 214, 222, 231, 239, 247, 255);

// Expansion to 6 bits, 1..5 source bits.
const uint REPLICATE_1_BIT_TO_6_TABLE[2] = uint[](0, 63);
const uint REPLICATE_2_BIT_TO_6_TABLE[4] = uint[](0, 21, 42, 63);
const uint REPLICATE_3_BIT_TO_6_TABLE[8] = uint[](0, 9, 18, 27, 36, 45, 54, 63);
const uint REPLICATE_4_BIT_TO_6_TABLE[16] = uint[](
    0, 4, 8, 12, 17, 21, 25, 29,
    34, 38, 42, 46, 51, 55, 59, 63);
const uint REPLICATE_5_BIT_TO_6_TABLE[32] = uint[](
    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
    33, 35, 37, 39, 41, 43, 45, 47, 49, 51, 53, 55, 57, 59, 61, 63);

// Expansion to 8 bits, 6 and 7 source bits.
const uint REPLICATE_6_BIT_TO_8_TABLE[64] = uint[](
    0, 4, 8, 12, 16, 20, 24, 28, 32, 36, 40, 44, 48, 52, 56, 60,
    65, 69, 73, 77, 81, 85, 89, 93, 97, 101, 105, 109, 113, 117, 121, 125,
    130, 134, 138, 142, 146, 150, 154, 158, 162, 166, 170, 174, 178, 182, 186, 190,
    195, 199, 203, 207, 211, 215, 219, 223, 227, 231, 235, 239, 243, 247, 251, 255);
const uint REPLICATE_7_BIT_TO_8_TABLE[128] = uint[](
    0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 24, 26, 28, 30,
    32, 34, 36, 38, 40, 42, 44, 46, 48, 50, 52, 54, 56, 58, 60, 62,
    64, 66, 68, 70, 72, 74, 76, 78, 80, 82, 84, 86, 88, 90, 92, 94,
    96, 98, 100, 102, 104, 106, 108, 110, 112, 114, 116, 118, 120, 122, 124, 126,
    129, 131, 133, 135, 137, 139, 141, 143, 145, 147, 149, 151, 153, 155, 157, 159,
    161, 163, 165, 167, 169, 171, 173, 175, 177, 179, 181, 183, 185, 187, 189, 191,
    193, 195, 197, 199, 201, 203, 205, 207, 209, 211, 213, 215, 217, 219, 221, 223,
    225, 227, 229, 231, 233, 235, 237, 239, 241, 243, 245, 247, 249, 251, 253, 255);
|
|
|
|
// --- Input block state -------------------------------------------------------
// The 128-bit compressed block handled by this invocation and the read cursor
// (in bits) used by StreamBits().
uvec4 local_buff;
int total_bitsread = 0;
// Reset by main(); not otherwise read in this file.
uint current_index = 0;
int bitsread = 0;

// --- Colour endpoint state ---------------------------------------------------
// Raw colour-endpoint bits and the read cursor used by StreamColorBits().
uvec4 color_endpoint_data;
int color_bitsread = 0;

// Four values, two endpoints, four maximum partitions
uint color_values[32];
int colvals_index = 0;

// --- Weight state ------------------------------------------------------------
// Raw (bit-reversed) weight bits and the read cursor used by StreamColorBits().
uvec4 texel_weight_data;
int texel_bitsread = 0;

// Selects which stream/"vector" the integer-sequence decoder works on:
// false = colour data / result_vector, true = weight data / texel_vector.
bool texel_flag = false;

// Global "vectors" to be pushed into when decoding
EncodingData result_vector[144];
int result_index = 0;

EncodingData texel_vector[144];
int texel_vector_index = 0;

// Per-texel weights for up to two planes, written by UnquantizeTexelWeights().
uint unquantized_texel_weights[2][144];
|
|
|
|
// Computes the byte offset of position `pos` (x in bytes, y in rows) inside one
// 64x8 tile: the result interleaves bits of x and y into a 9-bit value.
uint SwizzleOffset(uvec2 pos) {
    const uint col = pos.x;
    const uint row = pos.y;
    // Each term occupies a disjoint bit range, so OR-ing them equals summing them.
    uint swizzled = col & 15u;            // x % 16
    swizzled |= (row & 1u) << 4;          // (y % 2) * 16
    swizzled |= ((col >> 4) & 1u) << 5;   // ((x % 32) / 16) * 32
    swizzled |= ((row >> 1) & 3u) << 6;   // ((y % 8) / 2) * 64
    swizzled |= ((col >> 5) & 1u) << 8;   // ((x % 64) / 32) * 256
    return swizzled;
}
|
|
|
|
// Widens the low `num_bits` bits of `val` to `to_bit` bits by repeating the bit
// pattern from the most significant end downwards; the last repetition is
// truncated to its top bits when `to_bit` is not a multiple of `num_bits`.
// Returns 0 when either width is 0.
uint Replicate(uint val, uint num_bits, uint to_bit) {
    if (num_bits == 0 || to_bit == 0) {
        return 0;
    }
    const uint pattern = val & uint((1 << num_bits) - 1);
    uint widened = pattern;
    uint filled = num_bits;
    while (filled < to_bit) {
        // Append a whole copy, or only its top bits if fewer bits remain.
        const uint step = min(num_bits, to_bit - filled);
        const uint dropped = num_bits - step;
        widened = (widened << step) | (pattern >> dropped);
        filled += step;
    }
    return widened;
}
|
|
|
|
// Multiplies each component by 0x101, which turns a byte 0xAB into 0xABAB.
uvec4 ReplicateByteTo16(uvec4 value) {
    // value * 0x101 == (value << 8) + value
    return (value << 8) + value;
}
|
|
|
|
// Expands a single bit (0 or 1) to 7 bits via table lookup: 0 -> 0, 1 -> 127.
uint ReplicateBitTo7(uint value) {
    const uint expanded = REPLICATE_BIT_TO_7_TABLE[value];
    return expanded;
}
|
|
|
|
// Expands a single bit (0 or 1) to 9 bits via table lookup: 0 -> 0, 1 -> 511.
uint ReplicateBitTo9(uint value) {
    const uint expanded = REPLICATE_1_BIT_TO_9_TABLE[value];
    return expanded;
}
|
|
|
|
// Table-driven Replicate(value, num_bits, 8) for 1..7 source bits; 8 bits pass
// through unchanged and any other width falls back to the generic Replicate().
uint FastReplicateTo8(uint value, uint num_bits) {
    uint widened;
    switch (num_bits) {
    case 1:
        widened = REPLICATE_1_BIT_TO_8_TABLE[value];
        break;
    case 2:
        widened = REPLICATE_2_BIT_TO_8_TABLE[value];
        break;
    case 3:
        widened = REPLICATE_3_BIT_TO_8_TABLE[value];
        break;
    case 4:
        widened = REPLICATE_4_BIT_TO_8_TABLE[value];
        break;
    case 5:
        widened = REPLICATE_5_BIT_TO_8_TABLE[value];
        break;
    case 6:
        widened = REPLICATE_6_BIT_TO_8_TABLE[value];
        break;
    case 7:
        widened = REPLICATE_7_BIT_TO_8_TABLE[value];
        break;
    case 8:
        widened = value;
        break;
    default:
        widened = Replicate(value, num_bits, 8);
        break;
    }
    return widened;
}
|
|
|
|
// Table-driven Replicate(value, num_bits, 6) for 1..5 source bits; any other
// width falls back to the generic Replicate().
uint FastReplicateTo6(uint value, uint num_bits) {
    uint widened;
    switch (num_bits) {
    case 1:
        widened = REPLICATE_1_BIT_TO_6_TABLE[value];
        break;
    case 2:
        widened = REPLICATE_2_BIT_TO_6_TABLE[value];
        break;
    case 3:
        widened = REPLICATE_3_BIT_TO_6_TABLE[value];
        break;
    case 4:
        widened = REPLICATE_4_BIT_TO_6_TABLE[value];
        break;
    case 5:
        widened = REPLICATE_5_BIT_TO_6_TABLE[value];
        break;
    default:
        widened = Replicate(value, num_bits, 6);
        break;
    }
    return widened;
}
|
|
|
|
// Approximates v / 3 with a multiply by 0x5556 followed by a 16-bit right shift.
uint Div3Floor(uint v) {
    const uint scaled = v * 0x5556;
    return scaled >> 16;
}
|
|
|
|
// Rounded-up division by 3, expressed as Div3Floor(v + 2).
uint Div3Ceil(uint v) {
    const uint biased = v + 2;
    return Div3Floor(biased);
}
|
|
|
|
// Approximates v / 5 with a multiply by 0x3334 followed by a 16-bit right shift.
uint Div5Floor(uint v) {
    const uint scaled = v * 0x3334;
    return scaled >> 16;
}
|
|
|
|
// Rounded-up division by 5, expressed as Div5Floor(v + 4).
uint Div5Ceil(uint v) {
    const uint biased = v + 4;
    return Div5Floor(biased);
}
|
|
|
|
// 32-bit integer mixing function used by Select2DPartition() to derive the
// pseudo-random partition seeds. All arithmetic wraps modulo 2^32.
uint Hash52(uint p) {
    p = p ^ (p >> 15);
    p = p - (p << 17);
    p = p + (p << 7);
    p = p + (p << 4);
    p = p ^ (p >> 5);
    p = p + (p << 16);
    p = p ^ (p >> 7);
    p = p ^ (p >> 3);
    p = p ^ (p << 6);
    p = p ^ (p >> 17);
    return p;
}
|
|
|
|
// Returns the partition (0..3) that texel (x, y) of a block belongs to.
//   seed:            partition index read from the block
//   partition_count: number of partitions in the block
//   small_block:     when true, x and y are doubled before hashing
uint Select2DPartition(uint seed, uint x, uint y, uint partition_count, bool small_block) {
    if (small_block) {
        x *= 2u;
        y *= 2u;
    }

    seed += (partition_count - 1) * 1024;
    const uint rnum = Hash52(seed);

    // Eight 4-bit fields of the hash, each squared.
    uint seeds[8];
    for (uint n = 0; n < 8; ++n) {
        const uint nibble = (rnum >> (4u * n)) & 0xFu;
        seeds[n] = nibble * nibble;
    }

    // Two shift amounts; which one applies to the x and which to the y
    // coefficients depends on the lowest seed bit.
    const uint shift_by_seed = ((seed & 2) != 0) ? 4u : 5u;
    const uint shift_by_count = (partition_count == 3) ? 6u : 5u;
    const bool odd_seed = (seed & 1) != 0;
    const uint sh1 = odd_seed ? shift_by_seed : shift_by_count;
    const uint sh2 = odd_seed ? shift_by_count : shift_by_seed;
    for (uint n = 0; n < 8; n += 2) {
        seeds[n] >>= sh1;
        seeds[n + 1] >>= sh2;
    }

    const uint a = (seeds[0] * x + seeds[1] * y + (rnum >> 14)) & 0x3Fu;
    const uint b = (seeds[2] * x + seeds[3] * y + (rnum >> 10)) & 0x3Fu;
    uint c = (seeds[4] * x + seeds[5] * y + (rnum >> 6)) & 0x3Fu;
    uint d = (seeds[6] * x + seeds[7] * y + (rnum >> 2)) & 0x3Fu;

    // Partitions that do not exist can never win the comparison below.
    if (partition_count < 4) {
        d = 0;
    }
    if (partition_count < 3) {
        c = 0;
    }

    // The largest value wins; ties go to the lowest partition number.
    if (a >= b && a >= c && a >= d) {
        return 0;
    }
    if (b >= c && b >= d) {
        return 1;
    }
    return (c >= d) ? 2u : 3u;
}
|
|
|
|
// Reads `bits` bits starting at bit `offset` from the 128-bit `payload`
// (component 0 holds bits 0..31). The field may straddle two components.
// Returns 0 when `bits` is not positive.
uint ExtractBits(uvec4 payload, int offset, int bits) {
    if (bits <= 0) {
        return 0;
    }
    const int word = offset >> 5;
    const int bit_in_word = offset & 31;
    const int last_word = (offset + bits - 1) >> 5;
    if (last_word == word) {
        // Entire field lies within one 32-bit component.
        return bitfieldExtract(payload[word], bit_in_word, bits);
    }
    // Field straddles two components: stitch the low and high parts together.
    const int low_count = 32 - bit_in_word;
    const uint low_part = bitfieldExtract(payload[word], bit_in_word, low_count);
    const uint high_part = bitfieldExtract(payload[word + 1], 0, bits - low_count);
    return low_part | (high_part << low_count);
}
|
|
|
|
// Reads the next `num_bits` bits of the input block (local_buff) and advances
// the global read cursor total_bitsread by the same amount.
uint StreamBits(uint num_bits) {
    const int count = int(num_bits);
    const int start = total_bitsread;
    total_bitsread = start + count;
    return ExtractBits(local_buff, start, count);
}
|
|
|
|
// Reads the next `num_bits` bits from the stream selected by texel_flag
// (weight data when true, colour endpoint data otherwise) and advances that
// stream's read cursor.
uint StreamColorBits(uint num_bits) {
    const int count = int(num_bits);
    if (texel_flag) {
        const int weight_start = texel_bitsread;
        texel_bitsread += count;
        return ExtractBits(texel_weight_data, weight_start, count);
    }
    const int color_start = color_bitsread;
    color_bitsread += count;
    return ExtractBits(color_endpoint_data, color_start, count);
}
|
|
|
|
// Appends `val` to the global "vector" selected by texel_flag: texel_vector
// when decoding weights, result_vector when decoding colour values.
void ResultEmplaceBack(EncodingData val) {
    if (!texel_flag) {
        result_vector[result_index] = val;
        result_index += 1;
        return;
    }
    texel_vector[texel_vector_index] = val;
    texel_vector_index += 1;
}
|
|
|
|
// Returns the number of bits required to encode n_vals values with the
// encoding at encoding_values[encoding_index]: the plain bits of every value
// plus ceil(8 * n / 5) bits for trits or ceil(7 * n / 3) bits for quints.
uint GetBitLength(uint n_vals, uint encoding_index) {
    const EncodingData enc = encoding_values[encoding_index];
    uint digit_bits = 0;
    if (enc.encoding == TRIT) {
        digit_bits = Div5Ceil(n_vals * 8);
    } else if (enc.encoding == QUINT) {
        digit_bits = Div3Ceil(n_vals * 7);
    }
    return enc.num_bits * n_vals + digit_bits;
}
|
|
|
|
// Number of weights stored for a weight grid of `size`; doubled for dual plane.
uint GetNumWeightValues(uvec2 size, bool dual_plane) {
    const uint planes = dual_plane ? 2u : 1u;
    return size.x * size.y * planes;
}
|
|
|
|
// Number of bits occupied by the weight grid of a block, given the grid size,
// the plane count and the weight encoding index `max_weight`.
uint GetPackedBitSize(uvec2 size, bool dual_plane, uint max_weight) {
    return GetBitLength(GetNumWeightValues(size, dual_plane), max_weight);
}
|
|
|
|
// Returns bit number `pos` of `bits` (0 or 1).
uint BitsBracket(uint bits, uint pos) {
    const uint shifted = bits >> pos;
    return shifted & 1;
}
|
|
|
|
// Returns the inclusive bit range [start, end] of `bits`, shifted down to bit 0.
// The two bounds may be given in either order.
uint BitsOp(uint bits, uint start, uint end) {
    const uint low = min(start, end);
    const uint high = max(start, end);
    if (low == high) {
        return BitsBracket(bits, low);
    }
    const uint mask = (1 << (high - low + 1)) - 1;
    return (bits >> low) & mask;
}
|
|
|
|
// Decodes one quint block (three values) from the current stream and appends
// the values via ResultEmplaceBack(). Each value has `num_bits` plain bits; the
// three quint digits are packed into 7 bits that are interleaved with them.
void DecodeQuintBlock(uint num_bits) {
    uint plain[3];
    uint quints[3];

    // Stream layout: m0, Q[2:0], m1, Q[4:3], m2, Q[6:5]
    plain[0] = StreamColorBits(num_bits);
    uint packed = StreamColorBits(3);
    plain[1] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 3;
    plain[2] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 5;

    const uint q_0 = BitsBracket(packed, 0);
    const uint q_21 = BitsOp(packed, 1, 2);
    const uint q_43 = BitsOp(packed, 3, 4);
    const uint q_65 = BitsOp(packed, 5, 6);

    if (q_21 == 3 && q_65 == 0) {
        quints[0] = 4;
        quints[1] = 4;
        quints[2] = (q_0 << 2) | ((BitsBracket(packed, 4) & ~q_0) << 1) |
                    (BitsBracket(packed, 3) & ~q_0);
    } else {
        uint c = 0;
        if (q_21 == 3) {
            quints[2] = 4;
            c = (q_43 << 3) | ((~q_65 & 3) << 1) | q_0;
        } else {
            quints[2] = q_65;
            c = BitsOp(packed, 0, 4);
        }
        const uint c_low = BitsOp(c, 0, 2);
        const uint c_high = BitsOp(c, 3, 4);
        if (c_low == 5) {
            quints[1] = 4;
            quints[0] = c_high;
        } else {
            quints[1] = c_high;
            quints[0] = c_low;
        }
    }

    for (uint n = 0; n < 3; ++n) {
        ResultEmplaceBack(EncodingData(QUINT, num_bits, plain[n], quints[n]));
    }
}
|
|
|
|
// Decodes one trit block (five values) from the current stream and appends the
// values via ResultEmplaceBack(). Each value has `num_bits` plain bits; the five
// trit digits are packed into 8 bits that are interleaved with them.
void DecodeTritBlock(uint num_bits) {
    uint plain[5];
    uint trits[5];

    // Stream layout: m0, T[1:0], m1, T[3:2], m2, T[4], m3, T[6:5], m4, T[7]
    plain[0] = StreamColorBits(num_bits);
    uint packed = StreamColorBits(2);
    plain[1] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 2;
    plain[2] = StreamColorBits(num_bits);
    packed |= StreamColorBits(1) << 4;
    plain[3] = StreamColorBits(num_bits);
    packed |= StreamColorBits(2) << 5;
    plain[4] = StreamColorBits(num_bits);
    packed |= StreamColorBits(1) << 7;

    // Upper two trits, plus the 5-bit code `c` holding the lower three.
    uint c = 0;
    if (BitsOp(packed, 2, 4) == 7) {
        c = (BitsOp(packed, 5, 7) << 2) | BitsOp(packed, 0, 1);
        trits[4] = 2;
        trits[3] = 2;
    } else {
        c = BitsOp(packed, 0, 4);
        const uint t_7 = BitsBracket(packed, 7);
        const uint t_65 = BitsOp(packed, 5, 6);
        const bool top_is_two = t_65 == 3;
        trits[4] = top_is_two ? 2u : t_7;
        trits[3] = top_is_two ? t_7 : t_65;
    }

    const uint c_10 = BitsOp(c, 0, 1);
    const uint c_32 = BitsOp(c, 2, 3);
    const uint c_4 = BitsBracket(c, 4);
    if (c_10 == 3) {
        const uint c_3 = BitsBracket(c, 3);
        trits[2] = 2;
        trits[1] = c_4;
        trits[0] = (c_3 << 1) | (BitsBracket(c, 2) & ~c_3);
    } else if (c_32 == 3) {
        trits[2] = 2;
        trits[1] = 2;
        trits[0] = c_10;
    } else {
        const uint c_1 = BitsBracket(c, 1);
        trits[2] = c_4;
        trits[1] = c_32;
        trits[0] = (c_1 << 1) | (BitsBracket(c, 0) & ~c_1);
    }

    for (uint n = 0; n < 5; ++n) {
        ResultEmplaceBack(EncodingData(TRIT, num_bits, plain[n], trits[n]));
    }
}
|
|
|
|
// Decodes at least `num_values` values from the current stream using the
// encoding at encoding_values[max_range]. Trit and quint encodings are decoded
// in whole blocks of 5 and 3 values, so more values than requested may be pushed.
void DecodeIntegerSequence(uint max_range, uint num_values) {
    EncodingData val = encoding_values[max_range];
    for (uint decoded = 0; decoded < num_values;) {
        if (val.encoding == QUINT) {
            DecodeQuintBlock(val.num_bits);
            decoded += 3;
        } else if (val.encoding == TRIT) {
            DecodeTritBlock(val.num_bits);
            decoded += 5;
        } else if (val.encoding == JUST_BITS) {
            val.bit_value = StreamColorBits(val.num_bits);
            ResultEmplaceBack(val);
            decoded += 1;
        }
    }
}
|
|
|
|
// Decodes and unquantizes the colour endpoint values of a block into
// color_values[].
//   modes:           colour endpoint mode of each partition
//   num_partitions:  number of partitions in use
//   color_data_bits: number of bits available for colour data
void DecodeColorValues(uvec4 modes, uint num_partitions, uint color_data_bits) {
    // Each mode class (mode >> 2) needs 2 * (class + 1) values.
    uint num_values = 0;
    for (uint part = 0; part < num_partitions; ++part) {
        num_values += 2u * ((modes[part] >> 2) + 1);
    }

    // Find the largest encoding that's within color_data_bits
    // TODO(ameerj): profile with binary search
    int range = 1;
    for (; range < encoding_values.length(); ++range) {
        if (GetBitLength(num_values, range) > color_data_bits) {
            break;
        }
    }
    DecodeIntegerSequence(range - 1, num_values);

    uint out_index = 0;
    for (int itr = 0; itr < result_index && out_index < num_values; ++itr) {
        const EncodingData val = result_vector[itr];
        const uint bitlen = val.num_bits;
        const uint bitval = val.bit_value;

        if (val.encoding == JUST_BITS) {
            color_values[out_index++] = FastReplicateTo8(bitval, bitlen);
            continue;
        }

        // Trit/quint unquantization: T = D * C + B, XOR-ed with A, then scaled.
        const uint A = ReplicateBitTo9(bitval & 1);
        const uint upper = bitval >> 1; // plain bits above the lowest one
        uint B = 0, C = 0, D = 0;
        if (val.encoding == TRIT) {
            D = val.quint_trit_value;
            switch (bitlen) {
            case 1:
                C = 204;
                break;
            case 2: {
                const uint b = upper & 1;
                C = 93;
                B = (b << 8) | (b << 4) | (b << 2) | (b << 1);
                break;
            }
            case 3: {
                const uint cb = upper & 3;
                C = 44;
                B = (cb << 7) | (cb << 2) | cb;
                break;
            }
            case 4: {
                const uint dcb = upper & 7;
                C = 22;
                B = (dcb << 6) | dcb;
                break;
            }
            case 5: {
                const uint edcb = upper & 0xF;
                C = 11;
                B = (edcb << 5) | (edcb >> 2);
                break;
            }
            case 6: {
                const uint fedcb = upper & 0x1F;
                C = 5;
                B = (fedcb << 4) | (fedcb >> 4);
                break;
            }
            }
        } else if (val.encoding == QUINT) {
            D = val.quint_trit_value;
            switch (bitlen) {
            case 1:
                C = 113;
                break;
            case 2: {
                const uint b = upper & 1;
                C = 54;
                B = (b << 8) | (b << 3) | (b << 2);
                break;
            }
            case 3: {
                const uint cb = upper & 3;
                C = 26;
                B = (cb << 7) | (cb << 1) | (cb >> 1);
                break;
            }
            case 4: {
                const uint dcb = upper & 7;
                C = 13;
                B = (dcb << 6) | (dcb >> 1);
                break;
            }
            case 5: {
                const uint edcb = upper & 0xF;
                C = 6;
                B = (edcb << 5) | (edcb >> 3);
                break;
            }
            }
        }

        const uint T = ((D * C) + B) ^ A;
        color_values[out_index++] = (A & 0x80) | (T >> 2);
    }
}
|
|
|
|
// Moves the top bit of `a` into the top bit of `b` (after halving `b`) and
// turns the remaining bits of `a` into a signed 6-bit offset.
// Returns ivec2(new a, new b).
ivec2 BitTransferSigned(int a, int b) {
    const int new_b = (b >> 1) | (a & 0x80);
    int new_a = (a >> 1) & 0x3F;
    if ((new_a & 0x20) != 0) {
        // Sign-extend the 6-bit value.
        new_a -= 0x40;
    }
    return ivec2(new_a, new_b);
}
|
|
|
|
// Clamps every component to the byte range [0, 255].
uvec4 ClampByte(ivec4 color) {
    return uvec4(clamp(color, ivec4(0), ivec4(255)));
}
|
|
|
|
// Averages red and green with blue; alpha and blue pass through unchanged.
// The result is ordered (a, r, g, b).
ivec4 BlueContract(int a, int r, int g, int b) {
    const ivec2 contracted = (ivec2(r, g) + b) >> 1;
    return ivec4(a, contracted, b);
}
|
|
|
|
// Builds the two colour endpoints of one partition from the next entries of
// color_values[] (consumed through colvals_index). Endpoints are stored as
// (a, r, g, b). Modes that are not handled (the HDR modes) yield a fixed colour
// and consume no values.
void ComputeEndpoints(out uvec4 ep1, out uvec4 ep2, uint color_endpoint_mode) {
// Declares `TYPE v[COUNT]` and fills it with the next COUNT colour values.
#define FETCH_COLOR_VALUES(TYPE, COUNT)                                                            \
    TYPE v[COUNT];                                                                                 \
    for (uint fetch_i = 0; fetch_i < COUNT; ++fetch_i) {                                           \
        v[fetch_i] = TYPE(color_values[colvals_index++]);                                          \
    }

// Applies BitTransferSigned to the value pair (v[LO + 1], v[LO]) in place.
#define TRANSFER_PAIR(LO)                                                                          \
    {                                                                                              \
        const ivec2 moved = BitTransferSigned(v[LO + 1], v[LO]);                                   \
        v[LO + 1] = moved.x;                                                                       \
        v[LO] = moved.y;                                                                           \
    }

    switch (color_endpoint_mode) {
    case 0: {
        // Luminance, direct
        FETCH_COLOR_VALUES(uint, 2)
        ep1 = uvec4(0xFF, uvec3(v[0]));
        ep2 = uvec4(0xFF, uvec3(v[1]));
        break;
    }
    case 1: {
        // Luminance, base + offset
        FETCH_COLOR_VALUES(uint, 2)
        const uint lum0 = (v[0] >> 2) | (v[1] & 0xC0);
        const uint lum1 = min(lum0 + (v[1] & 0x3F), 0xFFU);
        ep1 = uvec4(0xFF, uvec3(lum0));
        ep2 = uvec4(0xFF, uvec3(lum1));
        break;
    }
    case 4: {
        // Luminance + alpha, direct
        FETCH_COLOR_VALUES(uint, 4)
        ep1 = uvec4(v[2], uvec3(v[0]));
        ep2 = uvec4(v[3], uvec3(v[1]));
        break;
    }
    case 5: {
        // Luminance + alpha, base + offset
        FETCH_COLOR_VALUES(int, 4)
        TRANSFER_PAIR(0)
        TRANSFER_PAIR(2)
        ep1 = ClampByte(ivec4(v[2], ivec3(v[0])));
        ep2 = ClampByte(ivec4(v[2] + v[3], ivec3(v[0] + v[1])));
        break;
    }
    case 6: {
        // RGB, base + scale
        FETCH_COLOR_VALUES(uint, 4)
        ep1 = uvec4(0xFF, (uvec3(v[0], v[1], v[2]) * v[3]) >> 8);
        ep2 = uvec4(0xFF, v[0], v[1], v[2]);
        break;
    }
    case 8: {
        // RGB, direct
        FETCH_COLOR_VALUES(uint, 6)
        const uint sum_even = v[0] + v[2] + v[4];
        const uint sum_odd = v[1] + v[3] + v[5];
        if (sum_odd >= sum_even) {
            ep1 = uvec4(0xFF, v[0], v[2], v[4]);
            ep2 = uvec4(0xFF, v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(0xFF, int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(0xFF, int(v[0]), int(v[2]), int(v[4])));
        }
        break;
    }
    case 9: {
        // RGB, base + offset
        FETCH_COLOR_VALUES(int, 6)
        TRANSFER_PAIR(0)
        TRANSFER_PAIR(2)
        TRANSFER_PAIR(4)
        if ((v[1] + v[3] + v[5]) >= 0) {
            ep1 = ClampByte(ivec4(0xFF, v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(0xFF, v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(0xFF, v[0], v[2], v[4]));
        }
        break;
    }
    case 10: {
        // RGB, base + scale, plus two alpha values
        FETCH_COLOR_VALUES(uint, 6)
        ep1 = uvec4(v[4], (uvec3(v[0], v[1], v[2]) * v[3]) >> 8);
        ep2 = uvec4(v[5], v[0], v[1], v[2]);
        break;
    }
    case 12: {
        // RGBA, direct
        FETCH_COLOR_VALUES(uint, 8)
        const uint sum_even = v[0] + v[2] + v[4];
        const uint sum_odd = v[1] + v[3] + v[5];
        if (sum_odd >= sum_even) {
            ep1 = uvec4(v[6], v[0], v[2], v[4]);
            ep2 = uvec4(v[7], v[1], v[3], v[5]);
        } else {
            ep1 = uvec4(BlueContract(int(v[7]), int(v[1]), int(v[3]), int(v[5])));
            ep2 = uvec4(BlueContract(int(v[6]), int(v[0]), int(v[2]), int(v[4])));
        }
        break;
    }
    case 13: {
        // RGBA, base + offset
        FETCH_COLOR_VALUES(int, 8)
        TRANSFER_PAIR(0)
        TRANSFER_PAIR(2)
        TRANSFER_PAIR(4)
        TRANSFER_PAIR(6)
        if ((v[1] + v[3] + v[5]) >= 0) {
            ep1 = ClampByte(ivec4(v[6], v[0], v[2], v[4]));
            ep2 = ClampByte(ivec4(v[7] + v[6], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
        } else {
            ep1 = ClampByte(BlueContract(v[6] + v[7], v[0] + v[1], v[2] + v[3], v[4] + v[5]));
            ep2 = ClampByte(BlueContract(v[6], v[0], v[2], v[4]));
        }
        break;
    }
    default: {
        // HDR mode, or more likely a bug computing the color_endpoint_mode
        ep1 = uvec4(0xFF, 0xFF, 0, 0);
        ep2 = uvec4(0xFF, 0xFF, 0, 0);
        break;
    }
    }

#undef FETCH_COLOR_VALUES
#undef TRANSFER_PAIR
}
|
|
|
|
// Converts one decoded weight into its unquantized value. Results above 32 are
// incremented by one, so the largest 6-bit input (63) maps to 64.
uint UnquantizeTexelWeight(EncodingData val) {
    const uint bitval = val.bit_value;
    const uint bitlen = val.num_bits;
    const uint A = ReplicateBitTo7(bitval & 1);
    const uint upper = bitval >> 1; // plain bits above the lowest one
    uint B = 0, C = 0, D = 0;
    uint result = 0;

    if (val.encoding == JUST_BITS) {
        result = FastReplicateTo6(bitval, bitlen);
    } else if (val.encoding == TRIT) {
        D = val.quint_trit_value;
        if (bitlen == 0) {
            // Pure trit: direct lookup.
            const uint trit_weights[3] = uint[](0, 32, 63);
            result = trit_weights[D];
        } else if (bitlen == 1) {
            C = 50;
        } else if (bitlen == 2) {
            const uint b = upper & 1;
            C = 23;
            B = (b << 6) | (b << 2) | b;
        } else if (bitlen == 3) {
            const uint cb = upper & 3;
            C = 11;
            B = (cb << 5) | cb;
        }
    } else if (val.encoding == QUINT) {
        D = val.quint_trit_value;
        if (bitlen == 0) {
            // Pure quint: direct lookup.
            const uint quint_weights[5] = uint[](0, 16, 32, 47, 63);
            result = quint_weights[D];
        } else if (bitlen == 1) {
            C = 28;
        } else if (bitlen == 2) {
            const uint b = upper & 1;
            C = 13;
            B = (b << 6) | (b << 1);
        }
    }

    if (val.encoding != JUST_BITS && bitlen > 0) {
        const uint T = (D * C + B) ^ A;
        result = (A & 0x20) | (T >> 2);
    }
    if (result > 32) {
        result += 1;
    }
    return result;
}
|
|
|
|
// Unquantizes the decoded weights in texel_vector[] and expands the stored
// weight grid to one weight per texel of the block, writing the result to
// unquantized_texel_weights[plane][t * block_dims.x + s].
//   dual_plane: the stream interleaves the weights of two planes
//   size:       dimensions of the stored weight grid
void UnquantizeTexelWeights(bool dual_plane, uvec2 size) {
    uint weight_idx = 0;
    uint unquantized[2][144];
    const uint area = size.x * size.y;
    for (uint itr = 0; itr < texel_vector_index; itr++) {
        unquantized[0][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
        if (dual_plane) {
            ++itr;
            // Stop before touching texel_vector[] when the second-plane weight
            // is missing; checking after the read would index past the decoded data.
            if (itr >= texel_vector_index) {
                break;
            }
            unquantized[1][weight_idx] = UnquantizeTexelWeight(texel_vector[itr]);
        }
        if (++weight_idx >= area) {
            break;
        }
    }

    // Scale factors that map a texel position to a weight-grid position.
    const uint Ds = uint((block_dims.x * 0.5f + 1024) / (block_dims.x - 1));
    const uint Dt = uint((block_dims.y * 0.5f + 1024) / (block_dims.y - 1));
    const uint k_plane_scale = dual_plane ? 2 : 1;
    for (uint plane = 0; plane < k_plane_scale; plane++) {
        for (uint t = 0; t < block_dims.y; t++) {
            for (uint s = 0; s < block_dims.x; s++) {
                const uint cs = Ds * s;
                const uint ct = Dt * t;
                // Grid position in 4.4 fixed point: integer part j*, fraction f*.
                const uint gs = (cs * (size.x - 1) + 32) >> 6;
                const uint gt = (ct * (size.y - 1) + 32) >> 6;
                const uint js = gs >> 4;
                const uint fs = gs & 0xF;
                const uint jt = gt >> 4;
                const uint ft = gt & 0x0F;

                // Bilinear interpolation factors; they sum to 16.
                const uint w11 = (fs * ft + 8) >> 4;
                const uint w10 = ft - w11;
                const uint w01 = fs - w11;
                const uint w00 = 16 - fs - ft + w11;
                const uvec4 w = uvec4(w00, w01, w10, w11);

                // The four surrounding grid weights; samples outside the grid
                // contribute 0.
                const uint v0 = jt * size.x + js;
                uvec4 p = uvec4(0);
                if (v0 < area) {
                    p.x = unquantized[plane][v0];
                }
                if ((v0 + 1) < (area)) {
                    p.y = unquantized[plane][v0 + 1];
                }
                if ((v0 + size.x) < (area)) {
                    p.z = unquantized[plane][(v0 + size.x)];
                }
                if ((v0 + size.x + 1) < (area)) {
                    p.w = unquantized[plane][(v0 + size.x + 1)];
                }
                unquantized_texel_weights[plane][t * block_dims.x + s] = (uint(dot(p, w)) + 8) >> 4;
            }
        }
    }
}
|
|
|
|
// Classifies the block-mode field into one of ten bit layouts (0..9), which
// DecodeBlockInfo() uses to pick how the weight-grid size is encoded.
int FindLayout(uint mode) {
    const bool bit2 = (mode & 4) != 0;
    const bool bit7 = (mode & 0x80) != 0;
    const bool bit8 = (mode & 0x100) != 0;

    if ((mode & 3) != 0) {
        // Layouts 0..4: the low two bits are non-zero.
        if ((mode & 8) == 0) {
            return bit2 ? 1 : 0;
        }
        if (!bit2) {
            return 2;
        }
        return bit8 ? 4 : 3;
    }

    // Layouts 5..9: the low two bits are zero.
    if (!bit8) {
        return bit7 ? 6 : 5;
    }
    if (!bit7) {
        return 9;
    }
    return ((mode & 0x20) != 0) ? 8 : 7;
}
|
|
|
|
// Reads the 11-bit block mode from the input stream and decodes it into the
// weight-grid parameters. Void-extent blocks and reserved modes are reported
// through the flags of the returned structure.
TexelWeightParams DecodeBlockInfo() {
    TexelWeightParams params = TexelWeightParams(uvec2(0), 0, false, false, false, false);
    const uint mode = StreamBits(11);

    // Void-extent block: low nine bits are 0x1fc.
    if ((mode & 0x1ff) == 0x1fc) {
        const bool is_hdr = (mode & 0x200) != 0;
        params.void_extent_hdr = is_hdr;
        params.void_extent_ldr = !is_hdr;
        // Bit 10 and the following stream bit must both be set.
        if ((mode & 0x400) == 0 || StreamBits(1) == 0) {
            params.error_state = true;
        }
        return params;
    }

    // Reserved block modes.
    const bool low_nibble_zero = (mode & 0xf) == 0;
    const bool reserved_pattern = (mode & 3) == 0 && (mode & 0x1c0) == 0x1c0;
    if (low_nibble_zero || reserved_pattern) {
        params.error_state = true;
        return params;
    }

    const uint mode_layout = FindLayout(mode);
    const uint A = (mode >> 5) & 0x3;
    const uint B_two_bits = (mode >> 7) & 0x3;
    const uint B_one_bit = (mode >> 7) & 0x1;
    switch (mode_layout) {
    case 0:
        params.size = uvec2(B_two_bits + 4, A + 2);
        break;
    case 1:
        params.size = uvec2(B_two_bits + 8, A + 2);
        break;
    case 2:
        params.size = uvec2(A + 2, B_two_bits + 8);
        break;
    case 3:
        params.size = uvec2(A + 2, B_one_bit + 6);
        break;
    case 4:
        params.size = uvec2(B_one_bit + 2, A + 2);
        break;
    case 5:
        params.size = uvec2(12, A + 2);
        break;
    case 6:
        params.size = uvec2(A + 2, 12);
        break;
    case 7:
        params.size = uvec2(6, 10);
        break;
    case 8:
        params.size = uvec2(10, 6);
        break;
    case 9:
        params.size = uvec2(A + 6, ((mode >> 9) & 0x3) + 6);
        break;
    default:
        params.error_state = true;
        break;
    }

    // In layout 9, bits 9 and 10 belong to the grid size instead of being flags.
    const bool has_flag_bits = mode_layout != 9;
    params.dual_plane = has_flag_bits && ((mode & 0x400) != 0);

    // Three-bit range selector: bit 4 is its low bit, the upper two bits come
    // from bits 0-1 (layouts 0..4) or bits 2-3 (layouts 5..9). The reserved-mode
    // checks above guarantee a selector of at least 2, so the index is 0..5.
    const uint selector_low = ((mode & 0x10) != 0) ? 1u : 0u;
    const uint selector_high = (mode_layout < 5) ? ((mode & 0x3) << 1) : ((mode & 0xc) >> 1);
    const uint weight_index = (selector_low | selector_high) - 2;

    // encoding_values index: 1..6 for the low-precision ranges, 7..12 when the
    // high-precision bit (bit 9) is set.
    const bool high_precision = has_flag_bits && ((mode & 0x200) != 0);
    params.max_weight = weight_index + (high_precision ? 7u : 1u);
    return params;
}
|
|
|
|
// Writes the error colour (opaque yellow) to every texel of the block whose
// top-left texel is at `coord`.
void FillError(ivec3 coord) {
    const vec4 error_color = vec4(1.0, 1.0, 0.0, 1.0);
    for (uint row = 0; row < block_dims.y; ++row) {
        for (uint col = 0; col < block_dims.x; ++col) {
            imageStore(dest_image, coord + ivec3(col, row, 0), error_color);
        }
    }
}
|
|
|
|
// Decodes an LDR void-extent block: skips 52 bits, reads the 16-bit R, G, B and
// A channels that follow, and writes that constant colour to every texel of the
// block whose top-left texel is at `coord`.
void FillVoidExtentLDR(ivec3 coord) {
    StreamBits(52);

    // Channels are stored in R, G, B, A order.
    const uint red16 = StreamBits(16);
    const uint green16 = StreamBits(16);
    const uint blue16 = StreamBits(16);
    const uint alpha16 = StreamBits(16);
    const vec4 color = vec4(uvec4(red16, green16, blue16, alpha16)) / 65535.0f;

    for (uint row = 0; row < block_dims.y; ++row) {
        for (uint col = 0; col < block_dims.x; ++col) {
            imageStore(dest_image, coord + ivec3(col, row, 0), color);
        }
    }
}
|
|
|
|
// Decodes the 128-bit block held in local_buff and writes its texels to
// dest_image, starting at `coord` (top-left texel of the block, z = layer).
// Invalid or unsupported blocks are filled with the error colour.
void DecompressBlock(ivec3 coord) {
    TexelWeightParams params = DecodeBlockInfo();
    if (params.error_state) {
        FillError(coord);
        return;
    }
    if (params.void_extent_hdr) {
        // HDR void-extent blocks are not supported.
        FillError(coord);
        return;
    }
    if (params.void_extent_ldr) {
        FillVoidExtentLDR(coord);
        return;
    }
    if ((params.size.x > block_dims.x) || (params.size.y > block_dims.y)) {
        FillError(coord);
        return;
    }

    // ASTC limits on the weight grid: at most 64 weights, stored in 24..96 bits.
    // Rejecting anything else also keeps the fixed-size decode arrays in bounds.
    const uint num_weights = GetNumWeightValues(params.size, params.dual_plane);
    const uint weight_bits = GetBitLength(num_weights, params.max_weight);
    if (num_weights > 64 || weight_bits < 24 || weight_bits > 96) {
        FillError(coord);
        return;
    }

    const uint num_partitions = StreamBits(2) + 1;
    if (num_partitions > 4 || (num_partitions == 4 && params.dual_plane)) {
        FillError(coord);
        return;
    }

    int plane_index = -1;
    uint partition_index = 1;
    uvec4 color_endpoint_mode = uvec4(0);
    uint ced_pointer = 0;
    uint base_cem = 0;
    if (num_partitions == 1) {
        color_endpoint_mode.x = StreamBits(4);
        partition_index = 0;
    } else {
        partition_index = StreamBits(10);
        base_cem = StreamBits(6);
    }
    const uint base_mode = base_cem & 3;

    // Bits left for colour data: everything that is not header, weights,
    // extra colour-endpoint-mode bits or the plane selector.
    uint remaining_bits = 128 - weight_bits - total_bitsread;
    uint extra_cem_bits = 0;
    if (base_mode > 0) {
        switch (num_partitions) {
        case 2:
            extra_cem_bits += 2;
            break;
        case 3:
            extra_cem_bits += 5;
            break;
        case 4:
            extra_cem_bits += 8;
            break;
        default:
            // base_mode is only non-zero with more than one partition.
            FillError(coord);
            return;
        }
    }
    remaining_bits -= extra_cem_bits;
    const uint plane_selector_bits = params.dual_plane ? 2u : 0u;
    remaining_bits -= plane_selector_bits;
    if (remaining_bits > 128) {
        // Bad data: the unsigned subtraction wrapped, i.e. the block claims
        // more bits than the 16 bytes it has.
        FillError(coord);
        return;
    }

    // Read color data...
    const uint color_data_bits = remaining_bits;
    while (remaining_bits > 0) {
        const int nb = int(min(remaining_bits, 32U));
        const uint b = StreamBits(nb);
        color_endpoint_data[ced_pointer] = uint(bitfieldExtract(b, 0, nb));
        ++ced_pointer;
        remaining_bits -= nb;
    }
    plane_index = int(StreamBits(plane_selector_bits));

    if (base_mode > 0) {
        // Per-partition modes: a class bit C and two mode bits M per partition.
        const uint extra_cem = StreamBits(extra_cem_bits);
        uint cem = (extra_cem << 6) | base_cem;
        cem >>= 2;
        uvec4 C = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            C[i] = (cem & 1);
            cem >>= 1;
        }
        uvec4 M = uvec4(0);
        for (uint i = 0; i < num_partitions; i++) {
            M[i] = cem & 3;
            cem >>= 2;
        }
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = base_mode;
            if (C[i] == 0) {
                --color_endpoint_mode[i];
            }
            color_endpoint_mode[i] <<= 2;
            color_endpoint_mode[i] |= M[i];
        }
    } else if (num_partitions > 1) {
        // All partitions share one mode.
        const uint cem = base_cem >> 2;
        for (uint i = 0; i < num_partitions; i++) {
            color_endpoint_mode[i] = cem;
        }
    }
    DecodeColorValues(color_endpoint_mode, num_partitions, color_data_bits);

    uvec4 endpoints[4][2];
    for (uint i = 0; i < num_partitions; i++) {
        ComputeEndpoints(endpoints[i][0], endpoints[i][1], color_endpoint_mode[i]);
    }

    // Weights are stored from the end of the block backwards: reverse the
    // whole 128-bit block so they can be read as a forward stream.
    texel_weight_data = bitfieldReverse(local_buff).wzyx;

    // Zero everything past the weight data: mask the partially used byte, then
    // clear all bytes after it.
    const uint last_weight_byte = weight_bits >> 3;
    const uint clear_byte_start = last_weight_byte + 1;
    const uint byte_insert = ExtractBits(texel_weight_data, int(last_weight_byte) * 8, 8) &
                             uint(((1 << (weight_bits % 8)) - 1));
    const uint vec_index = last_weight_byte >> 2;
    texel_weight_data[vec_index] = bitfieldInsert(texel_weight_data[vec_index], byte_insert,
                                                  int(last_weight_byte % 4) * 8, 8);
    for (uint i = clear_byte_start; i < 16; ++i) {
        const uint idx = i >> 2;
        texel_weight_data[idx] = bitfieldInsert(texel_weight_data[idx], 0, int(i % 4) * 8, 8);
    }

    texel_flag = true; // use texel "vector" and bit stream in integer decoding
    DecodeIntegerSequence(params.max_weight, num_weights);

    UnquantizeTexelWeights(params.dual_plane, params.size);

    for (uint j = 0; j < block_dims.y; j++) {
        for (uint i = 0; i < block_dims.x; i++) {
            uint local_partition = 0;
            if (num_partitions > 1) {
                local_partition = Select2DPartition(partition_index, i, j, num_partitions,
                                                    (block_dims.y * block_dims.x) < 32);
            }
            const uvec4 C0 = ReplicateByteTo16(endpoints[local_partition][0]);
            const uvec4 C1 = ReplicateByteTo16(endpoints[local_partition][1]);

            // Endpoints are ordered (a, r, g, b); in dual-plane mode the
            // component selected by plane_index uses the second weight plane.
            uvec4 plane_vec = uvec4(0);
            uvec4 weight_vec = uvec4(0);
            for (uint c = 0; c < 4; c++) {
                if (params.dual_plane && (((plane_index + 1) & 3) == c)) {
                    plane_vec[c] = 1;
                }
                weight_vec[c] = unquantized_texel_weights[plane_vec[c]][j * block_dims.x + i];
            }
            const vec4 Cf =
                vec4((C0 * (uvec4(64) - weight_vec) + C1 * weight_vec + uvec4(32)) / 64);
            const vec4 p = (Cf / 65535.0);
            // Reorder (a, r, g, b) to (r, g, b, a) for the store.
            imageStore(dest_image, coord + ivec3(i, j, 0), p.gbar);
        }
    }
}
|
|
|
|
// Entry point: each invocation locates its 16-byte block in the swizzled input
// buffer, loads it and decodes it into the output image.
void main() {
    const uvec3 invocation = gl_GlobalInvocationID;

    // Position in the swizzled input: x in bytes (16 bytes per block), y in rows.
    const uvec2 byte_pos = uvec2(invocation.x << BYTES_PER_BLOCK_LOG2, invocation.y);

    // Read as soon as possible due to its latency
    const uint swizzle = SwizzleOffset(byte_pos);

    const uint block_y = byte_pos.y >> GOB_SIZE_Y_SHIFT;

    const uint layer_offset = invocation.z * layer_stride;
    const uint block_row_offset = (block_y >> block_height) * block_size;
    const uint gob_row_offset = (block_y & block_height_mask) << GOB_SIZE_SHIFT;
    const uint gob_column_offset = (byte_pos.x >> GOB_SIZE_X_SHIFT) << x_shift;
    const uint offset =
        layer_offset + block_row_offset + gob_row_offset + gob_column_offset + swizzle;

    // Top-left texel of this block; skip blocks that start outside the image.
    const ivec3 coord = ivec3(invocation * uvec3(block_dims, 1));
    if (any(greaterThanEqual(coord, imageSize(dest_image)))) {
        return;
    }

    current_index = 0;
    bitsread = 0;
    // astc_data is indexed in 16-byte units.
    local_buff = astc_data[offset >> 4];
    DecompressBlock(coord);
}
|