agx: Stop bit-inexact conversion propagation

Despite being mathematically equivalent, the following code sequences are not
bit-identical under IEEE 754 rules due to differing internal precision:

   fadd16 r0l, r2, 0.0              z = f2f16 x
   fadd16 r1h, r0l, r0h             w = fadd z, y

versus

   fadd32 r1h, r2, r0h              f2f16(w) = fadd x, f2f32(y)

This is probably fine under GL's relaxed floating point precision rules, but
it's definitely not ok with the stricter OpenCL or Vulkan. It is also a
potential problem with GL invariance rules, if we get different results for the
same shader depending on whether we did a monolithic compile or a fast link. The
place for doing inexact transformations is NIR, when we have the information
available to do so correctly. By the time we get to the backend, everything we
do needs to be bit-exact to preserve sanity.

Fixes dEQP-GLES2.functional.shaders.algorithm.rgb_to_hsl_vertex. We believe that
this is a CTS bug, but it's a useful one since it uncovered a serious driver bug
that would bite us in the much less friendly Vulkan (or god forbid OpenCL) CTS
later. It also seems like a magnet for GL app bugs; the fp16 support we do now
is uncovering bad enough bugs as it is.

shader-db results are pretty abysmal, though :|

total instructions in shared programs: 1537964 -> 1571328 (2.17%)
instructions in affected programs: 670231 -> 703595 (4.98%)

total bytes in shared programs: 10533984 -> 10732316 (1.88%)
bytes in affected programs: 4662414 -> 4860746 (4.25%)

total halfregs in shared programs: 483448 -> 474541 (-1.84%)
halfregs in affected programs: 58867 -> 49960 (-15.13%)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23480>
This commit is contained in:
Alyssa Rosenzweig 2023-05-15 16:48:14 -04:00 committed by Marge Bot
parent be5004691c
commit 3a0d1f83d5
2 changed files with 51 additions and 0 deletions

View file

@ -89,6 +89,17 @@ agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
continue;
/* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
* into 16-bit instructions, since the latter would implicitly promote to
* a 32-bit instruction which is not exact.
*/
assert(def->src[0].size == AGX_SIZE_32 ||
def->src[0].size == AGX_SIZE_16);
assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);
if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
continue;
ins->src[s] = agx_compose_float_src(src, def->src[0]);
}
}
@ -154,6 +165,16 @@ agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
if (use->src[0].neg || use->src[0].abs)
return false;
/* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
* 16-bit instructions, since the latter would implicitly promote to a 32-bit
* instruction which is not exact.
*/
assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);
if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
return false;
/* saturate(saturate(x)) = saturate(x) */
I->saturate |= use->saturate;
I->dest[0] = use->dest[0];

View file

@ -78,6 +78,25 @@ TEST_F(Optimizer, FloatCopyprop)
agx_fadd_to(b, out, agx_neg(wx), wy));
}
TEST_F(Optimizer, FloatConversion)
{
CASE32(
{
agx_index cvt = agx_temp(b->shader, AGX_SIZE_32);
agx_fmov_to(b, cvt, hx);
agx_fadd_to(b, out, cvt, wy);
},
{ agx_fadd_to(b, out, hx, wy); });
CASE16(
{
agx_index sum = agx_temp(b->shader, AGX_SIZE_32);
agx_fadd_to(b, sum, wx, wy);
agx_fmov_to(b, out, sum);
},
{ agx_fadd_to(b, out, wx, wy); });
}
TEST_F(Optimizer, FusedFABSNEG)
{
CASE32(agx_fadd_to(b, out, agx_fmov(b, agx_abs(wx)), wy),
@ -164,3 +183,14 @@ TEST_F(Optimizer, SkipPreloads)
agx_xor_to(b, out, preload, wy);
});
}
TEST_F(Optimizer, NoConversionsOn16BitALU)
{
NEGCASE16({
agx_index cvt = agx_temp(b->shader, AGX_SIZE_16);
agx_fmov_to(b, cvt, wx);
agx_fadd_to(b, out, cvt, hy);
});
NEGCASE32(agx_fmov_to(b, out, agx_fadd(b, hx, hy)));
}