ir3: lower 64b registers

After all int64/double lowerings, there might still be 64b registers
left which ir3 currently doesn't handle. This only happens in a small
number of Piglit tests where those registers (or the variables they come
from) did not get DCE'd.

This patch handles 64b registers in ir3 by adding a NIR pass that does
the following:
- @decl_reg -> split in two 32b ones
- @store_reg -> unpack_64_2x32_split_x/y and two separate stores
- @load_reg -> two separate loads and pack_64_2x32_split

After this pass, the 64b vecs used for the original loads/stores are
still present and are also not handled yet by ir3. This patch removes
them by running nir_lower_alu_to_scalar and nir_copy_prop.

Signed-off-by: Job Noorman <jnoorman@igalia.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26175>
2023-11-10 11:57:34 +01:00 · 2023-11-10 11:57:34 +01:00 · 286caa5080
commit 286caa5080
parent 6e7a61df4c
5 changed files with 113 additions and 12 deletions
--- a/src/freedreno/ci/freedreno-a618-fails.txt
+++ b/src/freedreno/ci/freedreno-a618-fails.txt
@@ -106,16 +106,10 @@ spec@arb_shader_image_load_store@qualifiers@r8/strict layout qualifiers/permissi
 # ir3_nir_lower_tess.c:251: lower_block_to_explicit_output: Assertion `util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1)' failed.
 spec@arb_tessellation_shader@execution@tcs-input-read-mat,Crash

-# Some 64b not getting lowered to 32b:
-spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-dvec4-index-wr-before-tcs,Crash
-
 spec@arb_texture_rectangle@1-1-linear-texture,Fail

 spec@arb_vertex_type_2_10_10_10_rev@attrib-p-type-size-match,Fail

-# fails unrelated to GL_ARB_enhanced_layouts
-spec@arb_enhanced_layouts@execution@component-layout@vs-fs-array-dvec3,Crash
-
 # fails on gen1 (a618/a630) with both fd and zink, but passes on gen4..
 # maybe gen1 sqe doesn't handle the count==0 case?
 spec@arb_indirect_parameters@tf-count-arrays,Fail
--- a/src/freedreno/ci/freedreno-a630-fails.txt
+++ b/src/freedreno/ci/freedreno-a630-fails.txt
@@ -109,16 +109,10 @@ spec@arb_shader_image_load_store@qualifiers@r8/strict layout qualifiers/permissi
 # ir3_nir_lower_tess.c:251: lower_block_to_explicit_output: Assertion `util_is_power_of_two_nonzero(nir_intrinsic_write_mask(intr) + 1)' failed.
 spec@arb_tessellation_shader@execution@tcs-input-read-mat,Crash

-# Some 64b not getting lowered to 32b:
-spec@arb_tessellation_shader@execution@variable-indexing@vs-output-array-dvec4-index-wr-before-tcs,Crash
-
 spec@arb_texture_rectangle@1-1-linear-texture,Fail

 spec@arb_vertex_type_2_10_10_10_rev@attrib-p-type-size-match,Fail

-# fails unrelated to GL_ARB_enhanced_layouts
-spec@arb_enhanced_layouts@execution@component-layout@vs-fs-array-dvec3,Crash
-
 # fails on gen1 (a618/a630) with both fd and zink, but passes on gen4..
 # maybe gen1 sqe doesn't handle the count==0 case?
 spec@arb_indirect_parameters@tf-count-arrays,Fail
--- a/src/freedreno/ir3/ir3_context.c
+++ b/src/freedreno/ir3/ir3_context.c
@@ -91,6 +91,21 @@ ir3_context_init(struct ir3_compiler *compiler, struct ir3_shader *shader,
   bool needs_late_alg = false;
   NIR_PASS(progress, ctx->s, nir_lower_locals_to_regs, 1);

+   if (progress) {
+      bool regs_progress = false;
+
+      /* Split 64b registers into two 32b ones. */
+      NIR_PASS(regs_progress, ctx->s, ir3_nir_lower_64b_regs);
+
+      if (regs_progress) {
+         /* After splitting registers, we might still have some 64b vecs. Run
+          * some passes to get rid of them.
+          */
+         NIR_PASS_V(ctx->s, nir_lower_alu_to_scalar, NULL, NULL);
+         NIR_PASS_V(ctx->s, nir_copy_prop);
+      }
+   }
+
   /* we could need cleanup after lower_locals_to_regs */
   while (progress) {
      progress = false;
--- a/src/freedreno/ir3/ir3_nir.h
+++ b/src/freedreno/ir3/ir3_nir.h
@@ -65,6 +65,7 @@ void ir3_nir_lower_gs(nir_shader *shader);
 bool ir3_nir_lower_64b_intrinsics(nir_shader *shader);
 bool ir3_nir_lower_64b_undef(nir_shader *shader);
 bool ir3_nir_lower_64b_global(nir_shader *shader);
+bool ir3_nir_lower_64b_regs(nir_shader *shader);

 void ir3_optimize_loop(struct ir3_compiler *compiler, nir_shader *s);
 void ir3_nir_lower_io_to_temporaries(nir_shader *s);
--- a/src/freedreno/ir3/ir3_nir_lower_64b.c
+++ b/src/freedreno/ir3/ir3_nir_lower_64b.c
@@ -299,3 +299,100 @@ ir3_nir_lower_64b_global(nir_shader *shader)
         shader, lower_64b_global_filter,
         lower_64b_global, NULL);
 }
+
+/*
+ * Lowering for 64b registers:
+ * - @decl_reg -> split in two 32b ones
+ * - @store_reg -> unpack_64_2x32_split_x/y and two separate stores
+ * - @load_reg -> two separate loads and pack_64_2x32_split
+ */
+
+static void
+lower_64b_reg(nir_builder *b, nir_intrinsic_instr *reg)
+{
+   unsigned num_components = nir_intrinsic_num_components(reg);
+   unsigned num_array_elems = nir_intrinsic_num_array_elems(reg);
+
+   nir_def *reg_hi = nir_decl_reg(b, num_components, 32, num_array_elems);
+   nir_def *reg_lo = nir_decl_reg(b, num_components, 32, num_array_elems);
+
+   nir_foreach_reg_store_safe (store_reg_src, reg) {
+      nir_intrinsic_instr *store =
+         nir_instr_as_intrinsic(nir_src_parent_instr(store_reg_src));
+      b->cursor = nir_before_instr(&store->instr);
+
+      nir_def *packed = store->src[0].ssa;
+      nir_def *unpacked_lo = nir_unpack_64_2x32_split_x(b, packed);
+      nir_def *unpacked_hi = nir_unpack_64_2x32_split_y(b, packed);
+      int base = nir_intrinsic_base(store);
+
+      if (store->intrinsic == nir_intrinsic_store_reg) {
+         nir_build_store_reg(b, unpacked_lo, reg_lo, .base = base);
+         nir_build_store_reg(b, unpacked_hi, reg_hi, .base = base);
+      } else {
+         assert(store->intrinsic == nir_intrinsic_store_reg_indirect);
+
+         nir_def *offset = store->src[2].ssa;
+         nir_store_reg_indirect(b, unpacked_lo, reg_lo, offset, .base = base);
+         nir_store_reg_indirect(b, unpacked_hi, reg_hi, offset, .base = base);
+      }
+
+      nir_instr_remove(&store->instr);
+   }
+
+   nir_foreach_reg_load_safe (load_reg_src, reg) {
+      nir_intrinsic_instr *load =
+         nir_instr_as_intrinsic(nir_src_parent_instr(load_reg_src));
+      b->cursor = nir_before_instr(&load->instr);
+
+      int base = nir_intrinsic_base(load);
+      nir_def *load_lo, *load_hi;
+
+      if (load->intrinsic == nir_intrinsic_load_reg) {
+         load_lo =
+            nir_build_load_reg(b, num_components, 32, reg_lo, .base = base);
+         load_hi =
+            nir_build_load_reg(b, num_components, 32, reg_hi, .base = base);
+      } else {
+         assert(load->intrinsic == nir_intrinsic_load_reg_indirect);
+
+         nir_def *offset = load->src[1].ssa;
+         load_lo = nir_load_reg_indirect(b, num_components, 32, reg_lo, offset,
+                                         .base = base);
+         load_hi = nir_load_reg_indirect(b, num_components, 32, reg_hi, offset,
+                                         .base = base);
+      }
+
+      nir_def *packed = nir_pack_64_2x32_split(b, load_lo, load_hi);
+      nir_def_rewrite_uses(&load->def, packed);
+      nir_instr_remove(&load->instr);
+   }
+
+   nir_instr_remove(&reg->instr);
+}
+
+bool
+ir3_nir_lower_64b_regs(nir_shader *shader)
+{
+   bool progress = false;
+
+   nir_foreach_function_impl (impl, shader) {
+      bool impl_progress = false;
+      nir_builder b = nir_builder_create(impl);
+
+      nir_foreach_reg_decl_safe (reg, impl) {
+         if (nir_intrinsic_bit_size(reg) == 64) {
+            lower_64b_reg(&b, reg);
+            impl_progress = true;
+         }
+      }
+
+      if (impl_progress) {
+         nir_metadata_preserve(
+            impl, nir_metadata_block_index | nir_metadata_dominance);
+         progress = true;
+      }
+   }
+
+   return progress;
+}