aco: optimize 32bit fsign by using fmulz with Inf
2-instruction fsign with the power of cursed DX9 floating point rules.

Foz-DB Navi31:
Totals from 3803 (4.86% of 78196) affected shaders:
Instrs: 8436366 -> 8412549 (-0.28%); split: -0.29%, +0.00%
CodeSize: 43174284 -> 43114676 (-0.14%); split: -0.14%, +0.01%
SpillSGPRs: 3241 -> 3247 (+0.19%)
Latency: 66333841 -> 66287361 (-0.07%); split: -0.08%, +0.01%
InvThroughput: 10331902 -> 10316916 (-0.15%); split: -0.15%, +0.01%
VClause: 165455 -> 165472 (+0.01%); split: -0.01%, +0.02%
SClause: 242352 -> 242335 (-0.01%); split: -0.02%, +0.01%
Copies: 604086 -> 605781 (+0.28%); split: -0.04%, +0.32%
Branches: 214017 -> 214013 (-0.00%)
PreSGPRs: 209413 -> 209726 (+0.15%)

Reviewed-by: Daniel Schürmann <daniel@schuermann.dev>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26765>
This commit is contained in:
parent
f60dafb4bd
commit
9ecfd7919b
1 changed files with 7 additions and 10 deletions
|
|
@@ -2896,16 +2896,13 @@ visit_alu_instr(isel_context* ctx, nir_alu_instr* instr)
|
|||
bld.vop1(aco_opcode::v_cvt_f16_i16, Definition(dst), src);
|
||||
}
|
||||
} else if (dst.regClass() == v1) {
|
||||
if (ctx->block->fp_mode.denorm32 == fp_denorm_flush) {
|
||||
/* If denormals are flushed, then v_mul_legacy_f32(2.0, src) can become omod. */
|
||||
src =
|
||||
bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), Operand::c32(0x40000000), src);
|
||||
} else {
|
||||
src = bld.vop2(aco_opcode::v_add_f32, bld.def(v1), Operand::zero(), src);
|
||||
}
|
||||
src =
|
||||
bld.vop3(aco_opcode::v_med3_i32, bld.def(v1), Operand::c32(-1), src, Operand::c32(1u));
|
||||
bld.vop1(aco_opcode::v_cvt_f32_i32, Definition(dst), src);
|
||||
/* Legacy multiply with +Inf means +-0.0 becomes +0.0 and all other numbers
|
||||
* the correctly signed Inf. After that, we only need to clamp between -1.0 and +1.0.
|
||||
*/
|
||||
Temp inf = bld.copy(bld.def(s1), Operand::c32(0x7f800000));
|
||||
src = bld.vop2(aco_opcode::v_mul_legacy_f32, bld.def(v1), inf, src);
|
||||
bld.vop3(aco_opcode::v_med3_f32, Definition(dst), Operand::c32(0x3f800000), src,
|
||||
Operand::c32(0xbf800000));
|
||||
} else if (dst.regClass() == v2) {
|
||||
Temp cond = bld.vopc(aco_opcode::v_cmp_nlt_f64, bld.def(bld.lm), Operand::zero(), src);
|
||||
Temp tmp = bld.copy(bld.def(v1), Operand::c32(0x3FF00000u));
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue