agx: Stop bit-inexact conversion propagation

Despite being mathematically equivalent, the following code sequences are not
bit-identical under IEEE 754 rules due to differing internal precision:

   fadd16 r0l, r2, 0.0              z = f2f16 x
   fadd16 r1h, r0l, r0h             w = fadd z, y

versus

   fadd32 r1h, r2, r0h              f2f16(w) = fadd x, f2f32(y)

This is probably fine under GL's relaxed floating point precision rules, but
it's definitely not ok with the stricter OpenCL or Vulkan. It is also a
potential problem with GL invariance rules, if we get different results for the
same shader depending on whether we did a monolithic compile or a fast link. The
place for doing inexact transformations is NIR, when we have the information
available to do so correctly. By the time we get to the backend, everything we
do needs to be bit-exact to preserve sanity.

Fixes dEQP-GLES2.functional.shaders.algorithm.rgb_to_hsl_vertex. We believe that
this is a CTS bug, but it's a useful one since it uncovered a serious driver bug
that would bite us in the much less friendly Vulkan (or god forbid OpenCL) CTS
later. It also seems like a magnet for GL app bugs; the fp16 support we do now
is uncovering bad enough bugs as it is.

shader-db results are pretty abysmal, though :|

total instructions in shared programs: 1537964 -> 1571328 (2.17%)
instructions in affected programs: 670231 -> 703595 (4.98%)

total bytes in shared programs: 10533984 -> 10732316 (1.88%)
bytes in affected programs: 4662414 -> 4860746 (4.25%)

total halfregs in shared programs: 483448 -> 474541 (-1.84%)
halfregs in affected programs: 58867 -> 49960 (-15.13%)

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/23480>
This commit is contained in:
Alyssa Rosenzweig 2023-05-15 16:48:14 -04:00 committed by Marge Bot
parent be5004691c
commit 3a0d1f83d5
2 changed files with 51 additions and 0 deletions

View file

@ -89,6 +89,17 @@ agx_optimizer_fmov(agx_instr **defs, agx_instr *ins)
if (ins->op == AGX_OPCODE_FCMPSEL && s >= 2)
continue;
/* We can fold f2f32 into 32-bit instructions, but we can't fold f2f16
* into 16-bit instructions, since the latter would implicitly promote to
* a 32-bit instruction which is not exact.
*/
assert(def->src[0].size == AGX_SIZE_32 ||
def->src[0].size == AGX_SIZE_16);
assert(src.size == AGX_SIZE_32 || src.size == AGX_SIZE_16);
if (src.size == AGX_SIZE_16 && def->src[0].size == AGX_SIZE_32)
continue;
ins->src[s] = agx_compose_float_src(src, def->src[0]);
}
}
@ -154,6 +165,16 @@ agx_optimizer_fmov_rev(agx_instr *I, agx_instr *use)
if (use->src[0].neg || use->src[0].abs)
return false;
/* We can fold f2f16 into 32-bit instructions, but we can't fold f2f32 into
* 16-bit instructions, since the latter would implicitly promote to a 32-bit
* instruction which is not exact.
*/
assert(use->dest[0].size == AGX_SIZE_32 || use->dest[0].size == AGX_SIZE_16);
assert(I->dest[0].size == AGX_SIZE_32 || I->dest[0].size == AGX_SIZE_16);
if (I->dest[0].size == AGX_SIZE_16 && use->dest[0].size == AGX_SIZE_32)
return false;
/* saturate(saturate(x)) = saturate(x) */
I->saturate |= use->saturate;
I->dest[0] = use->dest[0];

View file

@ -78,6 +78,25 @@ TEST_F(Optimizer, FloatCopyprop)
agx_fadd_to(b, out, agx_neg(wx), wy));
}
TEST_F(Optimizer, FloatConversion)
{
CASE32(
{
agx_index cvt = agx_temp(b->shader, AGX_SIZE_32);
agx_fmov_to(b, cvt, hx);
agx_fadd_to(b, out, cvt, wy);
},
{ agx_fadd_to(b, out, hx, wy); });
CASE16(
{
agx_index sum = agx_temp(b->shader, AGX_SIZE_32);
agx_fadd_to(b, sum, wx, wy);
agx_fmov_to(b, out, sum);
},
{ agx_fadd_to(b, out, wx, wy); });
}
TEST_F(Optimizer, FusedFABSNEG)
{
CASE32(agx_fadd_to(b, out, agx_fmov(b, agx_abs(wx)), wy),
@ -164,3 +183,14 @@ TEST_F(Optimizer, SkipPreloads)
agx_xor_to(b, out, preload, wy);
});
}
TEST_F(Optimizer, NoConversionsOn16BitALU)
{
NEGCASE16({
agx_index cvt = agx_temp(b->shader, AGX_SIZE_16);
agx_fmov_to(b, cvt, wx);
agx_fadd_to(b, out, cvt, hy);
});
NEGCASE32(agx_fmov_to(b, out, agx_fadd(b, hx, hy)));
}