From 4a6ee2c4833b7be8f2ae9e379433fd855a865de3 Mon Sep 17 00:00:00 2001 From: Georg Lehmann Date: Mon, 25 Dec 2023 15:32:52 +0100 Subject: [PATCH] aco: shrink buffer stores with undef/zero components MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Buffer stores store 0 like image stores for unspecified components. Foz-DB Navi21: Totals from 91 (0.11% of 79330) affected shaders: Instrs: 63327 -> 63121 (-0.33%) CodeSize: 315312 -> 314440 (-0.28%); split: -0.28%, +0.00% VGPRs: 3144 -> 3120 (-0.76%) Latency: 441424 -> 441300 (-0.03%); split: -0.03%, +0.00% InvThroughput: 65501 -> 65130 (-0.57%) Copies: 6197 -> 5999 (-3.20%) PreVGPRs: 2197 -> 2182 (-0.68%) Reviewed-by: Daniel Schürmann Part-of: --- .../compiler/aco_instruction_selection.cpp | 93 ++++++++++--------- 1 file changed, 48 insertions(+), 45 deletions(-) diff --git a/src/amd/compiler/aco_instruction_selection.cpp b/src/amd/compiler/aco_instruction_selection.cpp index 63573876f2e..a7059f38ed4 100644 --- a/src/amd/compiler/aco_instruction_selection.cpp +++ b/src/amd/compiler/aco_instruction_selection.cpp @@ -6410,24 +6410,62 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) bool glc = ctx->options->gfx_level == GFX6 || ((access & (ACCESS_VOLATILE | ACCESS_COHERENT)) && ctx->program->gfx_level < GFX11); + uint32_t dmask = BITFIELD_MASK(num_components); + /* remove zero/undef elements from data, components which aren't in dmask + * are zeroed anyway + */ + if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) { + for (uint32_t i = 0; i < instr->num_components; i++) { + nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i); + if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) || + nir_scalar_is_undef(comp)) + dmask &= ~BITFIELD_BIT(i); + } + + /* dmask cannot be 0, at least one vgpr is always read */ + if (dmask == 0) + dmask = 1; + /* buffer store only supports consecutive components. 
*/ + if (dim == GLSL_SAMPLER_DIM_BUF) + dmask = BITFIELD_MASK(util_last_bit(dmask)); + + if (dmask != BITFIELD_MASK(num_components)) { + uint32_t dmask_count = util_bitcount(dmask); + RegClass rc = d16 ? v2b : v1; + if (dmask_count == 1) { + data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc); + } else { + aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( + aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)}; + uint32_t index = 0; + u_foreach_bit (bit, dmask) { + vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc)); + } + data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes())); + vec->definitions[0] = Definition(data); + bld.insert(std::move(vec)); + } + } + } + if (dim == GLSL_SAMPLER_DIM_BUF) { Temp rsrc = bld.as_uniform(get_ssa_temp(ctx, instr->src[0].ssa)); Temp vindex = emit_extract_vector(ctx, get_ssa_temp(ctx, instr->src[1].ssa), 0, v1); aco_opcode opcode; if (!d16) { - switch (num_components) { - case 1: opcode = aco_opcode::buffer_store_format_x; break; - case 2: opcode = aco_opcode::buffer_store_format_xy; break; - case 3: opcode = aco_opcode::buffer_store_format_xyz; break; - case 4: opcode = aco_opcode::buffer_store_format_xyzw; break; + switch (dmask) { + case 0x1: opcode = aco_opcode::buffer_store_format_x; break; + case 0x3: opcode = aco_opcode::buffer_store_format_xy; break; + case 0x7: opcode = aco_opcode::buffer_store_format_xyz; break; + case 0xf: opcode = aco_opcode::buffer_store_format_xyzw; break; default: unreachable(">4 channel buffer image store"); } } else { - switch (num_components) { - case 1: opcode = aco_opcode::buffer_store_format_d16_x; break; - case 2: opcode = aco_opcode::buffer_store_format_d16_xy; break; - case 3: opcode = aco_opcode::buffer_store_format_d16_xyz; break; - case 4: opcode = aco_opcode::buffer_store_format_d16_xyzw; break; + switch (dmask) { + case 0x1: opcode = aco_opcode::buffer_store_format_d16_x; break; + case 0x3: opcode = aco_opcode::buffer_store_format_d16_xy; break; + case 
0x7: opcode = aco_opcode::buffer_store_format_d16_xyz; break; + case 0xf: opcode = aco_opcode::buffer_store_format_d16_xyzw; break; default: unreachable(">4 channel buffer image store"); } } @@ -6454,41 +6492,6 @@ visit_image_store(isel_context* ctx, nir_intrinsic_instr* instr) bool level_zero = nir_src_is_const(instr->src[4]) && nir_src_as_uint(instr->src[4]) == 0; aco_opcode opcode = level_zero ? aco_opcode::image_store : aco_opcode::image_store_mip; - uint32_t dmask = BITFIELD_MASK(num_components); - /* remove zero/undef elements from data, components which aren't in dmask - * are zeroed anyway - */ - if (instr->src[3].ssa->bit_size == 32 || instr->src[3].ssa->bit_size == 16) { - for (uint32_t i = 0; i < instr->num_components; i++) { - nir_scalar comp = nir_scalar_resolved(instr->src[3].ssa, i); - if ((nir_scalar_is_const(comp) && nir_scalar_as_uint(comp) == 0) || - nir_scalar_is_undef(comp)) - dmask &= ~BITFIELD_BIT(i); - } - - /* dmask cannot be 0, at least one vgpr is always read */ - if (dmask == 0) - dmask = 1; - - if (dmask != BITFIELD_MASK(num_components)) { - uint32_t dmask_count = util_bitcount(dmask); - RegClass rc = d16 ? v2b : v1; - if (dmask_count == 1) { - data = emit_extract_vector(ctx, data, ffs(dmask) - 1, rc); - } else { - aco_ptr<Pseudo_instruction> vec{create_instruction<Pseudo_instruction>( - aco_opcode::p_create_vector, Format::PSEUDO, dmask_count, 1)}; - uint32_t index = 0; - u_foreach_bit (bit, dmask) { - vec->operands[index++] = Operand(emit_extract_vector(ctx, data, bit, rc)); - } - data = bld.tmp(RegClass::get(RegType::vgpr, dmask_count * rc.bytes())); - vec->definitions[0] = Definition(data); - bld.insert(std::move(vec)); - } - } - } - MIMG_instruction* store = emit_mimg(bld, opcode, Temp(0, v1), resource, Operand(s4), coords, Operand(data)); store->glc = glc;