aco: optimize 32bit fsign by using fmulz with Inf

A 2-instruction fsign, made possible by the cursed DX9 floating-point rules: with a legacy multiply, +-0.0 times Inf yields +0.0 instead of NaN.

Foz-DB Navi31:
Totals from 3803 (4.86% of 78196) affected shaders:
Instrs: 8436366 -> 8412549 (-0.28%); split: -0.29%, +0.00%
CodeSize: 43174284 -> 43114676 (-0.14%); split: -0.14%, +0.01%
SpillSGPRs: 3241 -> 3247 (+0.19%)
Latency: 66333841 -> 66287361 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 10331902 -> 10316916 (-0.15%); split: -0.15%, +0.01%
VClause: 165455 -> 165472 (+0.01%); split: -0.01%, +0.02%
SClause: 242352 -> 242335 (-0.01%); split: -0.02%, +0.01%
Copies: 604086 -> 605781 (+0.28%); split: -0.04%, +0.32%
Branches: 214017 -> 214013 (-0.00%)
PreSGPRs: 209413 -> 209726 (+0.15%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26765>
This commit is contained in:
Georg Lehmann 2023-12-19 22:16:33 +01:00
parent f60dafb4bd
commit 9ecfd7919b

View file

@@ -2896,16 +2896,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
}
} else if (dst.regClass() == v1) {
if (ctx->block->fp_mode.denorm32 == fp_denorm_flush) {
/* If denormals are flushed, then v_mul_legacy_f32(2.0, src) can become omod. */
src =
bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), Operand::c32(0x40000000), src);
} else {
src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
}
src =
bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
/* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers
* the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
*/
Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, src);
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
Operand::c32(0xbf800000));
} else if (dst.regClass() == v2) {
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));