aco: shrink buffer stores with undef/zero components

Like image stores, buffer stores write 0 to any component that is not specified, so trailing zero/undef components can be dropped from the stored data.

Foz-DB Navi21:
Totals from 91 (0.11% of 79330) affected shaders:
Instrs: 63327 -> 63121 (-0.33%)
CodeSize: 315312 -> 314440 (-0.28%); split: -0.28%, +0.00%
VGPRs: 3144 -> 3120 (-0.76%)
Latency: 441424 -> 441300 (-0.03%); split: -0.03%, +0.00%
InvThroughput: 65501 -> 65130 (-0.57%)
Copies: 6197 -> 5999 (-3.20%)
PreVGPRs: 2197 -> 2182 (-0.68%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26897>
This commit is contained in:
Georg Lehmann 2023-12-25 15:32:52 +01:00 committed by Marge Bot
parent 862df28f6b
commit 4a6ee2c483

View file

@ -6410,24 +6410,62 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
bool glc = ctx->options->gfx_level == GFX6 ||
((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11);
uint32_t dmask = BITFIELD_MASK(num_components);
/* remove zero/undef elements from data, components which aren't in dmask
* are zeroed anyway
*/
if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
for (uint32_t i = 0; i < instr->num_components; i++) {
nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) ||
nir_scalar_is_undef(comp))
dmask &= ~BITFIELD_BIT(i);
}
/* dmask cannot be 0, at least one vgpr is always read */
if (dmask == 0)
dmask = 1;
/* buffer store only supports consecutive components. */
if (dim == GLSL_SAMPLER_DIM_BUF)
dmask = BITFIELD_MASK(util_last_bit(dmask));
if (dmask != BITFIELD_MASK(num_components)) {
uint32_t dmask_count = util_bitcount(dmask);
RegClass rc = d16 ? v2b : v1;
if (dmask_count == 1) {
data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
} else {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
uint32_t index = 0;
u_foreach_bit (bit, dmask) {
vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
}
data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
vec->definitions[0] = Definition(data);
bld.insert(std::move(vec));
}
}
}
if (dim == GLSL_SAMPLER_DIM_BUF) {
Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
aco_opcode opcode;
if (!d16) {
switch (num_components) {
case 1: opcode = aco_opcode::buffer_store_format_x; break;
case 2: opcode = aco_opcode::buffer_store_format_xy; break;
case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
switch (dmask) {
case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
default: unreachable(">4 channel buffer image store");
}
} else {
switch (num_components) {
case 1: opcode = aco_opcode::buffer_store_format_d16_x; break;
case 2: opcode = aco_opcode::buffer_store_format_d16_xy; break;
case 3: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
case 4: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
switch (dmask) {
case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
default: unreachable(">4 channel buffer image store");
}
}
@ -6454,41 +6492,6 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
uint32_t dmask = BITFIELD_MASK(num_components);
/* remove zero/undef elements from data, components which aren't in dmask
* are zeroed anyway
*/
if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
for (uint32_t i = 0; i < instr->num_components; i++) {
nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) ||
nir_scalar_is_undef(comp))
dmask &= ~BITFIELD_BIT(i);
}
/* dmask cannot be 0, at least one vgpr is always read */
if (dmask == 0)
dmask = 1;
if (dmask != BITFIELD_MASK(num_components)) {
uint32_t dmask_count = util_bitcount(dmask);
RegClass rc = d16 ? v2b : v1;
if (dmask_count == 1) {
data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
} else {
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
uint32_t index = 0;
u_foreach_bit (bit, dmask) {
vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
}
data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
vec->definitions[0] = Definition(data);
bld.insert(std::move(vec));
}
}
}
MIMG_instruction* store =
emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
store->glc = glc;