aco: shrink buffer stores with undef/zero components
Like image stores, buffer stores write 0 for unspecified components.

Foz-DB Navi21:
Totals from 91 (0.11% of 79330) affected shaders:
Instrs: 63327 -> 63121 (-0.33%)
CodeSize: 315312 -> 314440 (-0.28%); split: -0.28%, +0.00%
VGPRs: 3144 -> 3120 (-0.76%)
Latency: 441424 -> 441300 (-0.03%); split: -0.03%, +0.00%
InvThroughput: 65501 -> 65130 (-0.57%)
Copies: 6197 -> 5999 (-3.20%)
PreVGPRs: 2197 -> 2182 (-0.68%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26897>
This commit is contained in:
parent
862df28f6b
commit
4a6ee2c483
1 changed file with 48 additions and 45 deletions
|
|
@@ -6410,24 +6410,62 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
bool glc = ctx->options->gfx_level == GFX6 ||
|
||||
((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11);
|
||||
|
||||
uint32_t dmask = BITFIELD_MASK(num_components);
|
||||
/* remove zero/undef elements from data, components which aren't in dmask
|
||||
* are zeroed anyway
|
||||
*/
|
||||
if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
|
||||
for (uint32_t i = 0; i < instr->num_components; i++) {
|
||||
nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
|
||||
if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) ||
|
||||
nir_scalar_is_undef(comp))
|
||||
dmask &= ~BITFIELD_BIT(i);
|
||||
}
|
||||
|
||||
/* dmask cannot be 0, at least one vgpr is always read */
|
||||
if (dmask == 0)
|
||||
dmask = 1;
|
||||
/* buffer store only supports consecutive components. */
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF)
|
||||
dmask = BITFIELD_MASK(util_last_bit(dmask));
|
||||
|
||||
if (dmask != BITFIELD_MASK(num_components)) {
|
||||
uint32_t dmask_count = util_bitcount(dmask);
|
||||
RegClass rc = d16 ? v2b : v1;
|
||||
if (dmask_count == 1) {
|
||||
data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
|
||||
} else {
|
||||
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
|
||||
uint32_t index = 0;
|
||||
u_foreach_bit (bit, dmask) {
|
||||
vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
|
||||
}
|
||||
data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
|
||||
vec->definitions[0] = Definition(data);
|
||||
bld.insert(std::move(vec));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if (dim == GLSL_SAMPLER_DIM_BUF) {
|
||||
Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa));
|
||||
Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1);
|
||||
aco_opcode opcode;
|
||||
if (!d16) {
|
||||
switch (num_components) {
|
||||
case 1: opcode = aco_opcode::buffer_store_format_x; break;
|
||||
case 2: opcode = aco_opcode::buffer_store_format_xy; break;
|
||||
case 3: opcode = aco_opcode::buffer_store_format_xyz; break;
|
||||
case 4: opcode = aco_opcode::buffer_store_format_xyzw; break;
|
||||
switch (dmask) {
|
||||
case 0x1: opcode = aco_opcode::buffer_store_format_x; break;
|
||||
case 0x3: opcode = aco_opcode::buffer_store_format_xy; break;
|
||||
case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break;
|
||||
case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break;
|
||||
default: unreachable(">4 channel buffer image store");
|
||||
}
|
||||
} else {
|
||||
switch (num_components) {
|
||||
case 1: opcode = aco_opcode::buffer_store_format_d16_x; break;
|
||||
case 2: opcode = aco_opcode::buffer_store_format_d16_xy; break;
|
||||
case 3: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
|
||||
case 4: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
|
||||
switch (dmask) {
|
||||
case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break;
|
||||
case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break;
|
||||
case 0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break;
|
||||
case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break;
|
||||
default: unreachable(">4 channel buffer image store");
|
||||
}
|
||||
}
|
||||
|
|
@@ -6454,41 +6492,6 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr)
|
|||
bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0;
|
||||
aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip;
|
||||
|
||||
uint32_t dmask = BITFIELD_MASK(num_components);
|
||||
/* remove zero/undef elements from data, components which aren't in dmask
|
||||
* are zeroed anyway
|
||||
*/
|
||||
if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) {
|
||||
for (uint32_t i = 0; i < instr->num_components; i++) {
|
||||
nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i);
|
||||
if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) ||
|
||||
nir_scalar_is_undef(comp))
|
||||
dmask &= ~BITFIELD_BIT(i);
|
||||
}
|
||||
|
||||
/* dmask cannot be 0, at least one vgpr is always read */
|
||||
if (dmask == 0)
|
||||
dmask = 1;
|
||||
|
||||
if (dmask != BITFIELD_MASK(num_components)) {
|
||||
uint32_t dmask_count = util_bitcount(dmask);
|
||||
RegClass rc = d16 ? v2b : v1;
|
||||
if (dmask_count == 1) {
|
||||
data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc);
|
||||
} else {
|
||||
aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>(
|
||||
aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)};
|
||||
uint32_t index = 0;
|
||||
u_foreach_bit (bit, dmask) {
|
||||
vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc));
|
||||
}
|
||||
data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes()));
|
||||
vec->definitions[0] = Definition(data);
|
||||
bld.insert(std::move(vec));
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
MIMG_instruction* store =
|
||||
emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data));
|
||||
store->glc = glc;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue