From 3a0d1f83d57db303b85ab281b161a34229484d5a Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Mon, 15 May 2023 16:48:14 -0400 Subject: [PATCH] agx: Stop bit-inexact conversion propagation Despite being mathematically equivalent, the following code sequences are not bit-identical under IEEE 754 rules due to differing internal precision: fadd16 r0l, r2, 0.0 z = f2f16 x fadd16 r1h, r0l, r0h w = fadd z, y versus fadd32 r1h, r2, r0h f2f16(w) = fadd x, f2f32(y) This is probably fine under GL's relaxed floating point precision rules, but it's definitely not ok with the more strict OpenCL or Vulkan. It also is a potential problem with GL invariance rules, if we get different results for the same shader depending whether we did a monolithic compile or a fast link. The place for doing inexact transformations is NIR, when we have the information available to do so correctly. By the time we get to the backend, everything we do needs to be bit-exact to preserve sanity. Fixes dEQP-GLES2.functional.shaders.algorithm.rgb_to_hsl_vertex. We believe that this is a CTS bug, but it's a useful one since it uncovered a serious driver bug that would bite us in the much less friendly Vulkan (or god forbid OpenCL) CTS later. It also seems like a magnet for GL app bugs, the fp16 support we do now is uncovering bad enough bugs as it is. shader-db results are pretty abysmal, though :| total instructions in shared programs: 1537964 -> 1571328 (2.17%) instructions in affected programs: 670231 -> 703595 (4.98%) total bytes in shared programs: 10533984 -> 10732316 (1.88%) bytes in affected programs: 4662414 -> 4860746 (4.25%) total halfregs in shared programs: 483448 -> 474541 (-1.84%) halfregs in affected programs: 58867 -> 49960 (-15.13%) Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/compiler/agx_optimizer.c | 21 +++++++++++++++ src/asahi/compiler/test/test-optimizer.cpp | 30 ++++++++++++++++++++++ 2 files changed, 51 insertions(+) diff --git a/src/asahi/compiler/agx_optimizer.c b/src/asahi/compiler/agx_optimizer.c index 458459f82ba..63d02ca3180 100644 --- a/src/asahi/compiler/agx_optimizer.c +++ b/src/asahi/compiler/agx_optimizer.c @@ -89,6 +89,17 @@ agx_optimizer_fmov(agx_instr **defs, agx_instr *ins) if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2) continue; + /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16 + * into 16-bit instructions, since the latter would implicitly promote to + * a 32-bit instruction which is not exact. + */ + assert(def->src[0].size == AGX_SIZE_32 || + def->src[0].size == AGX_SIZE_16); + assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16); + + if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32) + continue; + ins->src[s] = agx_compose_float_src(src, def->src[0]); } } @@ -154,6 +165,16 @@ agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use) if (use->src[0].neg || use->src[0].abs) return false; + /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into + * 16-bit instructions, since the latter would implicitly promote to a 32-bit + * instruction which is not exact. + */ + assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16); + assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16); + + if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32) + return false; + /* saturate(saturate(x)) = saturate(x) */ I->saturate |= use->saturate; I->dest[0] = use->dest[0]; diff --git a/src/asahi/compiler/test/test-optimizer.cpp b/src/asahi/compiler/test/test-optimizer.cpp index 2b0c6c8b324..9e05c5ee39c 100644 --- a/src/asahi/compiler/test/test-optimizer.cpp +++ b/src/asahi/compiler/test/test-optimizer.cpp @@ -78,6 +78,25 @@ TEST_F(Optimizer, FloatCopyprop) agx_fadd_to(b, out, agx_neg(wx), wy)); } +TEST_F(Optimizer, FloatConversion) +{ + CASE32( + { + agx_index cvt = agx_temp(b->shader, AGX_SIZE_32); + agx_fmov_to(b, cvt, hx); + agx_fadd_to(b, out, cvt, wy); + }, + { agx_fadd_to(b, out, hx, wy); }); + + CASE16( + { + agx_index sum = agx_temp(b->shader, AGX_SIZE_32); + agx_fadd_to(b, sum, wx, wy); + agx_fmov_to(b, out, sum); + }, + { agx_fadd_to(b, out, wx, wy); }); +} + TEST_F(Optimizer, FusedFABSNEG) { CASE32(agx_fadd_to(b, out, agx_fmov(b, agx_abs(wx)), wy), @@ -164,3 +183,14 @@ TEST_F(Optimizer, SkipPreloads) agx_xor_to(b, out, preload, wy); }); } + +TEST_F(Optimizer, NoConversionsOn16BitALU) +{ + NEGCASE16({ + agx_index cvt = agx_temp(b->shader, AGX_SIZE_16); + agx_fmov_to(b, cvt, wx); + agx_fadd_to(b, out, cvt, hy); + }); + + NEGCASE32(agx_fmov_to(b, out, agx_fadd(b, hx, hy))); +}