From 3a0d1f83d57db303b85ab281b161a34229484d5a Mon Sep 17 00:00:00 2001
From: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Date: Mon, 15 May 2023 16:48:14 -0400
Subject: [PATCH] agx: Stop bit-inexact conversion propagation

Despite being mathematically equivalent, the following code sequences are not
bit-identical under IEEE 754 rules due to differing internal precision:

   fadd16 r0l, r2, 0.0              z = f2f16 x
   fadd16 r1h, r0l, r0h             w = fadd z, y

versus

   fadd32 r1h, r2, r0h              f2f16(w) = fadd x, f2f32(y)

This is probably fine under GL's relaxed floating point precision rules, but
it's definitely not ok with the more strict OpenCL or Vulkan. It also is a
potential problem with GL invariance rules, if we get different results for the
same shader depending whether we did a monolithic compile or a fast link. The
place for doing inexact transformations is NIR, when we have the information
available to do so correctly. By the time we get to the backend, everything we
do needs to be bit-exact to preserve sanity.

Fixes dEQP-GLES2.functional.shaders.algorithm.rgb_to_hsl_vertex. We believe that
this is a CTS bug, but it's a useful one since it uncovered a serious driver bug
that would bite us in the much less friendly Vulkan (or god forbid OpenCL) CTS
later. It also seems like a magnet for GL app bugs, the fp16 support we do now
is uncovering bad enough bugs as it is.

shader-db results are pretty abysmal, though :|

total instructions in shared programs: 1537964 -> 1571328 (2.17%)
instructions in affected programs: 670231 -> 703595 (4.98%)

total bytes in shared programs: 10533984 -> 10732316 (1.88%)
bytes in affected programs: 4662414 -> 4860746 (4.25%)

total halfregs in shared programs: 483448 -> 474541 (-1.84%)
halfregs in affected programs: 58867 -> 49960 (-15.13%)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23480>
---
 src/asahi/compiler/agx_optimizer.c         | 21 +++++++++++++++
 src/asahi/compiler/test/test-optimizer.cpp | 30 ++++++++++++++++++++++
 2 files changed, 51 insertions(+)

diff --git a/src/asahi/compiler/agx_optimizer.c b/src/asahi/compiler/agx_optimizer.c
index 458459f82ba..63d02ca3180 100644
--- a/src/asahi/compiler/agx_optimizer.c
+++ b/src/asahi/compiler/agx_optimizer.c
@@ -89,6 +89,17 @@ agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
       if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
          continue;
 
+      /* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
+       * into 16-bit instructions, since the latter would implicitly promote to
+       * a 32-bit instruction which is not exact.
+       */
+      assert(def->src[0].size == AGX_SIZE_32 ||
+             def->src[0].size == AGX_SIZE_16);
+      assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);
+
+      if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
+         continue;
+
       ins->src[s] = agx_compose_float_src(src, def->src[0]);
    }
 }
@@ -154,6 +165,16 @@ agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
    if (use->src[0].neg || use->src[0].abs)
       return false;
 
+   /* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
+    * 16-bit instructions, since the latter would implicitly promote to a 32-bit
+    * instruction which is not exact.
+    */
+   assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
+   assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);
+
+   if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
+      return false;
+
    /* saturate(saturate(x)) = saturate(x) */
    I->saturate |= use->saturate;
    I->dest[0] = use->dest[0];
diff --git a/src/asahi/compiler/test/test-optimizer.cpp b/src/asahi/compiler/test/test-optimizer.cpp
index 2b0c6c8b324..9e05c5ee39c 100644
--- a/src/asahi/compiler/test/test-optimizer.cpp
+++ b/src/asahi/compiler/test/test-optimizer.cpp
@@ -78,6 +78,25 @@ TEST_F(Optimizer, FloatCopyprop)
           agx_fadd_to(b, out, agx_neg(wx), wy));
 }
 
+TEST_F(Optimizer, FloatConversion)
+{
+   CASE32(
+      {
+         agx_index cvt = agx_temp(b->shader, AGX_SIZE_32);
+         agx_fmov_to(b, cvt, hx);
+         agx_fadd_to(b, out, cvt, wy);
+      },
+      { agx_fadd_to(b, out, hx, wy); });
+
+   CASE16(
+      {
+         agx_index sum = agx_temp(b->shader, AGX_SIZE_32);
+         agx_fadd_to(b, sum, wx, wy);
+         agx_fmov_to(b, out, sum);
+      },
+      { agx_fadd_to(b, out, wx, wy); });
+}
+
 TEST_F(Optimizer, FusedFABSNEG)
 {
    CASE32(agx_fadd_to(b, out, agx_fmov(b, agx_abs(wx)), wy),
@@ -164,3 +183,14 @@ TEST_F(Optimizer, SkipPreloads)
       agx_xor_to(b, out, preload, wy);
    });
 }
+
+TEST_F(Optimizer, NoConversionsOn16BitALU)
+{
+   NEGCASE16({
+      agx_index cvt = agx_temp(b->shader, AGX_SIZE_16);
+      agx_fmov_to(b, cvt, wx);
+      agx_fadd_to(b, out, cvt, hy);
+   });
+
+   NEGCASE32(agx_fmov_to(b, out, agx_fadd(b, hx, hy)));
+}