From 1bbbabedb7a5b6c4f153e1754bcff548c7cdb56c Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= <daniel@schuermann.dev>
Date: Mon, 7 Feb 2022 18:56:11 +0100
Subject: [PATCH] aco/insert_exec_mask: refactor and remove some unnecessary
 WQM handling code

Some cases cannot happen and don't need to be handled anymore.

Reviewed-by: Rhys Perry <pendingchaos02@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/14951>
---
 src/amd/compiler/aco_insert_exec_mask.cpp | 104 +++++-----------------
 1 file changed, 21 insertions(+), 83 deletions(-)
diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp
index f715df08a47..6f441dd6195 100644
--- a/src/amd/compiler/aco_insert_exec_mask.cpp
+++ b/src/amd/compiler/aco_insert_exec_mask.cpp
@@ -63,12 +63,11 @@ struct wqm_ctx {
 struct loop_info {
    Block* loop_header;
    uint16_t num_exec_masks;
-   uint8_t needs;
    bool has_divergent_break;
    bool has_divergent_continue;
    bool has_discard; /* has a discard or demote */
-   loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard)
-       : loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks),
+   loop_info(Block* b, uint16_t num, bool breaks, bool cont, bool discard)
+       : loop_header(b), num_exec_masks(num), has_divergent_break(breaks),
          has_divergent_continue(cont), has_discard(discard)
    {}
 };
@@ -188,7 +187,7 @@ transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
    if (ctx.info[idx].exec.back().second & mask_type_global) {
       Operand exec_mask = ctx.info[idx].exec.back().first;
       if (exec_mask.isUndefined()) {
-         exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), Operand(exec, bld.lm));
+         exec_mask = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm));
          ctx.info[idx].exec.back().first = exec_mask;
       }
 
@@ -202,8 +201,8 @@ transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx)
    assert(ctx.info[idx].exec.back().second & mask_type_wqm);
    assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
    assert(ctx.info[idx].exec.back().first.isTemp());
-   ctx.info[idx].exec.back().first = bld.pseudo(
-      aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
+   ctx.info[idx].exec.back().first =
+      bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
 }
 
 void
@@ -220,8 +219,8 @@ transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx)
       assert(ctx.info[idx].exec.back().second & mask_type_exact);
       assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
       assert(ctx.info[idx].exec.back().first.isTemp());
-      ctx.info[idx].exec.back().first = bld.pseudo(
-         aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
+      ctx.info[idx].exec.back().first =
+         bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
       return;
    }
    /* otherwise, we create an exact mask and push to the stack */
@@ -337,9 +336,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
          uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
          assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
          ctx.info[idx].exec.emplace_back(
-            bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm),
-                       ctx.info[idx].exec.back().first),
-            mask_type);
+            bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first), mask_type);
       }
 
       return i;
@@ -410,41 +407,11 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
             ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type);
          }
       }
+
       assert(ctx.info[idx].exec.size() == info.num_exec_masks);
-
-      /* create a parallelcopy to move the live mask to exec */
-      unsigned i = 0;
-      while (block->instructions[i]->opcode != aco_opcode::p_logical_start) {
-         bld.insert(std::move(block->instructions[i]));
-         i++;
-      }
-
-      if (ctx.handle_wqm) {
-         if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) {
-            if (ctx.info[idx].block_needs == 0 || ctx.info[idx].block_needs == Exact) {
-               ctx.info[idx].exec.back().second |= mask_type_global;
-               transition_to_Exact(ctx, bld, idx);
-               ctx.handle_wqm = false;
-            }
-         }
-         if (ctx.info[idx].block_needs == WQM) {
-            assert(ctx.info[idx].exec.back().second & mask_type_wqm);
-            transition_to_WQM(ctx, bld, idx);
-         }
-      }
-
-      assert(ctx.info[idx].exec.back().first.size() == bld.lm.size());
-      if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) {
-         /* move current exec mask into exec register */
-         ctx.info[idx].exec.back().first = bld.pseudo(
-            aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
-      }
-
       ctx.loop.pop_back();
-      return i;
-   }
 
-   if (preds.size() == 1) {
+   } else if (preds.size() == 1) {
       ctx.info[idx].exec = ctx.info[preds[0]].exec;
    } else {
       assert(preds.size() == 2);
@@ -470,9 +437,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
             continue;
          }
 
-         bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge);
-         Temp phi = bld.pseudo(aco_opcode::p_linear_phi,
-                               in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm),
+         Temp phi = bld.pseudo(aco_opcode::p_linear_phi, bld.def(bld.lm),
                                get_exec_op(ctx.info[preds[0]].exec[i].first),
                                get_exec_op(ctx.info[preds[1]].exec[i].first));
          uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second;
@@ -496,16 +461,14 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instruction>>
             ctx.handle_wqm = false;
          }
       }
-      if (ctx.info[idx].block_needs == WQM) {
-         assert(ctx.info[idx].exec.back().second & mask_type_wqm);
-         transition_to_WQM(ctx, bld, idx);
-      }
    }
 
-   if (block->kind & block_kind_merge && !ctx.info[idx].exec.back().first.isUndefined()) {
+   /* restore exec mask after divergent control flow */
+   if (block->kind & (block_kind_loop_exit | block_kind_merge) &&
+       !ctx.info[idx].exec.back().first.isUndefined()) {
       Operand restore = ctx.info[idx].exec.back().first;
       assert(restore.size() == bld.lm.size());
-      bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), restore);
+      bld.copy(Definition(exec, bld.lm), restore);
       if (!restore.isConstant())
          ctx.info[idx].exec.back().first = Operand(bld.lm);
    }
@@ -518,9 +481,9 @@ process_instructions(exec_ctx& ctx, Block* block, std::vector<aco_ptr<Instructio
                      unsigned idx)
 {
    WQMState state;
-   if (ctx.info[block->index].exec.back().second & mask_type_wqm)
+   if (ctx.info[block->index].exec.back().second & mask_type_wqm) {
       state = WQM;
-   else {
+   } else {
       assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact);
       state = Exact;
    }
@@ -709,7 +672,6 @@ add_branch_code(exec_ctx& ctx, Block* block)
          transition_to_Exact(ctx, bld, idx);
          bld.insert(std::move(branch));
          ctx.handle_wqm = false;
-
       }
    }
 
@@ -718,12 +680,10 @@ add_branch_code(exec_ctx& ctx, Block* block)
       bool has_divergent_break = false;
       bool has_divergent_continue = false;
       bool has_discard = false;
-      uint8_t needs = 0;
       unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth;
 
       for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) {
          Block& loop_block = ctx.program->blocks[i];
-         needs |= ctx.info[i].block_needs;
 
          if (loop_block.kind & block_kind_uses_discard)
             has_discard = true;
@@ -738,25 +698,11 @@ add_branch_code(exec_ctx& ctx, Block* block)
             has_divergent_continue = true;
       }
 
-      if (ctx.handle_wqm) {
-         if (needs & WQM) {
-            aco_ptr<Instruction> branch = std::move(block->instructions.back());
-            block->instructions.pop_back();
-            transition_to_WQM(ctx, bld, idx);
-            bld.insert(std::move(branch));
-         } else {
-            aco_ptr<Instruction> branch = std::move(block->instructions.back());
-            block->instructions.pop_back();
-            transition_to_Exact(ctx, bld, idx);
-            bld.insert(std::move(branch));
-         }
-      }
-
       unsigned num_exec_masks = ctx.info[idx].exec.size();
       if (block->kind & block_kind_top_level)
          num_exec_masks = std::min(num_exec_masks, 2u);
 
-      ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs,
+      ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks,
                             has_divergent_break, has_divergent_continue, has_discard);
    }
 
@@ -780,8 +726,8 @@ add_branch_code(exec_ctx& ctx, Block* block)
       }
 
       if (need_parallelcopy)
-         ctx.info[idx].exec.back().first = bld.pseudo(
-            aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
+         ctx.info[idx].exec.back().first =
+            bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first);
       bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm),
                  block->linear_succs[1], block->linear_succs[0]);
       return;
@@ -799,14 +745,6 @@ add_branch_code(exec_ctx& ctx, Block* block)
    }
 
    if (block->kind & block_kind_branch) {
-
-      if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 &&
-          ctx.info[idx].exec.back().second == mask_type_exact &&
-          ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) {
-         /* return to wqm before branching */
-         ctx.info[idx].exec.pop_back();
-      }
-
       // orig = s_and_saveexec_b64
       assert(block->linear_succs.size() == 2);
       assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z);
@@ -815,7 +753,7 @@ add_branch_code(exec_ctx& ctx, Block* block)
 
       uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact);
       if (ctx.info[idx].exec.back().first.constantEquals(-1u)) {
-         bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond);
+         bld.copy(Definition(exec, bld.lm), cond);
       } else {
          Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc),
                                   Definition(exec, bld.lm), cond, Operand(exec, bld.lm));