From 1bbbabedb7a5b6c4f153e1754bcff548c7cdb56c Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Daniel=20Sch=C3=BCrmann?= Date: Mon, 7 Feb 2022 18:56:11 +0100 Subject: [PATCH] aco/insert_exec_mask: refactor and remove some unnecessary WQM handling code Some cases cannot happen and don't need to be handled anymore. Reviewed-by: Rhys Perry Part-of: --- src/amd/compiler/aco_insert_exec_mask.cpp | 104 +++++----------------- 1 file changed, 21 insertions(+), 83 deletions(-) diff --git a/src/amd/compiler/aco_insert_exec_mask.cpp b/src/amd/compiler/aco_insert_exec_mask.cpp index f715df08a47..6f441dd6195 100644 --- a/src/amd/compiler/aco_insert_exec_mask.cpp +++ b/src/amd/compiler/aco_insert_exec_mask.cpp @@ -63,12 +63,11 @@ struct wqm_ctx { struct loop_info { Block* loop_header; uint16_t num_exec_masks; - uint8_t needs; bool has_divergent_break; bool has_divergent_continue; bool has_discard; /* has a discard or demote */ - loop_info(Block* b, uint16_t num, uint8_t needs_, bool breaks, bool cont, bool discard) - : loop_header(b), num_exec_masks(num), needs(needs_), has_divergent_break(breaks), + loop_info(Block* b, uint16_t num, bool breaks, bool cont, bool discard) + : loop_header(b), num_exec_masks(num), has_divergent_break(breaks), has_divergent_continue(cont), has_discard(discard) {} }; @@ -188,7 +187,7 @@ transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) if (ctx.info[idx].exec.back().second & mask_type_global) { Operand exec_mask = ctx.info[idx].exec.back().first; if (exec_mask.isUndefined()) { - exec_mask = bld.pseudo(aco_opcode::p_parallelcopy, bld.def(bld.lm), Operand(exec, bld.lm)); + exec_mask = bld.copy(bld.def(bld.lm), Operand(exec, bld.lm)); ctx.info[idx].exec.back().first = exec_mask; } @@ -202,8 +201,8 @@ transition_to_WQM(exec_ctx& ctx, Builder bld, unsigned idx) assert(ctx.info[idx].exec.back().second & mask_type_wqm); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); assert(ctx.info[idx].exec.back().first.isTemp()); - ctx.info[idx].exec.back().first = bld.pseudo( - aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = + bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first); } void @@ -220,8 +219,8 @@ transition_to_Exact(exec_ctx& ctx, Builder bld, unsigned idx) assert(ctx.info[idx].exec.back().second & mask_type_exact); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); assert(ctx.info[idx].exec.back().first.isTemp()); - ctx.info[idx].exec.back().first = bld.pseudo( - aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = + bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first); return; } /* otherwise, we create an exact mask and push to the stack */ @@ -337,9 +336,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector> uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); ctx.info[idx].exec.emplace_back( - bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), - ctx.info[idx].exec.back().first), - mask_type); + bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first), mask_type); } return i; @@ -410,41 +407,11 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector> ctx.info[idx].exec.emplace_back(bld.insert(std::move(phi)), type); } } + assert(ctx.info[idx].exec.size() == info.num_exec_masks); - - /* create a parallelcopy to move the live mask to exec */ - unsigned i = 0; - while (block->instructions[i]->opcode != aco_opcode::p_logical_start) { - bld.insert(std::move(block->instructions[i])); - i++; - } - - if (ctx.handle_wqm) { - if (block->kind & block_kind_top_level && ctx.info[idx].exec.size() == 2) { - if (ctx.info[idx].block_needs == 0 || ctx.info[idx].block_needs == Exact) { - ctx.info[idx].exec.back().second |= mask_type_global; - transition_to_Exact(ctx, bld, idx); - ctx.handle_wqm = false; - } - } - if (ctx.info[idx].block_needs == WQM) { - assert(ctx.info[idx].exec.back().second & mask_type_wqm); - transition_to_WQM(ctx, bld, idx); - } - } - - assert(ctx.info[idx].exec.back().first.size() == bld.lm.size()); - if (get_exec_op(ctx.info[idx].exec.back().first).isTemp()) { - /* move current exec mask into exec register */ - ctx.info[idx].exec.back().first = bld.pseudo( - aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); - } - ctx.loop.pop_back(); - return i; - } - if (preds.size() == 1) { + } else if (preds.size() == 1) { ctx.info[idx].exec = ctx.info[preds[0]].exec; } else { assert(preds.size() == 2); @@ -470,9 +437,7 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector> continue; } - bool in_exec = i == num_exec_masks - 1 && !(block->kind & block_kind_merge); - Temp phi = bld.pseudo(aco_opcode::p_linear_phi, - in_exec ? Definition(exec, bld.lm) : bld.def(bld.lm), + Temp phi = bld.pseudo(aco_opcode::p_linear_phi, bld.def(bld.lm), get_exec_op(ctx.info[preds[0]].exec[i].first), get_exec_op(ctx.info[preds[1]].exec[i].first)); uint8_t mask_type = ctx.info[preds[0]].exec[i].second & ctx.info[preds[1]].exec[i].second; @@ -496,16 +461,14 @@ add_coupling_code(exec_ctx& ctx, Block* block, std::vector> ctx.handle_wqm = false; } } - if (ctx.info[idx].block_needs == WQM) { - assert(ctx.info[idx].exec.back().second & mask_type_wqm); - transition_to_WQM(ctx, bld, idx); - } } - if (block->kind & block_kind_merge && !ctx.info[idx].exec.back().first.isUndefined()) { + /* restore exec mask after divergent control flow */ + if (block->kind & (block_kind_loop_exit | block_kind_merge) && + !ctx.info[idx].exec.back().first.isUndefined()) { Operand restore = ctx.info[idx].exec.back().first; assert(restore.size() == bld.lm.size()); - bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), restore); + bld.copy(Definition(exec, bld.lm), restore); if (!restore.isConstant()) ctx.info[idx].exec.back().first = Operand(bld.lm); } @@ -518,9 +481,9 @@ process_instructions(exec_ctx& ctx, Block* block, std::vectorindex].exec.back().second & mask_type_wqm) + if (ctx.info[block->index].exec.back().second & mask_type_wqm) { state = WQM; - else { + } else { assert(!ctx.handle_wqm || ctx.info[block->index].exec.back().second & mask_type_exact); state = Exact; } @@ -709,7 +672,6 @@ add_branch_code(exec_ctx& ctx, Block* block) transition_to_Exact(ctx, bld, idx); bld.insert(std::move(branch)); ctx.handle_wqm = false; - } } @@ -718,12 +680,10 @@ add_branch_code(exec_ctx& ctx, Block* block) bool has_divergent_break = false; bool has_divergent_continue = false; bool has_discard = false; - uint8_t needs = 0; unsigned loop_nest_depth = ctx.program->blocks[idx + 1].loop_nest_depth; for (unsigned i = idx + 1; ctx.program->blocks[i].loop_nest_depth >= loop_nest_depth; i++) { Block& loop_block = ctx.program->blocks[i]; - needs |= ctx.info[i].block_needs; if (loop_block.kind & block_kind_uses_discard) has_discard = true; @@ -738,25 +698,11 @@ add_branch_code(exec_ctx& ctx, Block* block) has_divergent_continue = true; } - if (ctx.handle_wqm) { - if (needs & WQM) { - aco_ptr branch = std::move(block->instructions.back()); - block->instructions.pop_back(); - transition_to_WQM(ctx, bld, idx); - bld.insert(std::move(branch)); - } else { - aco_ptr branch = std::move(block->instructions.back()); - block->instructions.pop_back(); - transition_to_Exact(ctx, bld, idx); - bld.insert(std::move(branch)); - } - } - unsigned num_exec_masks = ctx.info[idx].exec.size(); if (block->kind & block_kind_top_level) num_exec_masks = std::min(num_exec_masks, 2u); - ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, needs, + ctx.loop.emplace_back(&ctx.program->blocks[block->linear_succs[0]], num_exec_masks, has_divergent_break, has_divergent_continue, has_discard); } @@ -780,8 +726,8 @@ add_branch_code(exec_ctx& ctx, Block* block) } if (need_parallelcopy) - ctx.info[idx].exec.back().first = bld.pseudo( - aco_opcode::p_parallelcopy, Definition(exec, bld.lm), ctx.info[idx].exec.back().first); + ctx.info[idx].exec.back().first = + bld.copy(Definition(exec, bld.lm), ctx.info[idx].exec.back().first); bld.branch(aco_opcode::p_cbranch_nz, bld.hint_vcc(bld.def(s2)), Operand(exec, bld.lm), block->linear_succs[1], block->linear_succs[0]); return; @@ -799,14 +745,6 @@ add_branch_code(exec_ctx& ctx, Block* block) } if (block->kind & block_kind_branch) { - - if (ctx.handle_wqm && ctx.info[idx].exec.size() >= 2 && - ctx.info[idx].exec.back().second == mask_type_exact && - ctx.info[idx].exec[ctx.info[idx].exec.size() - 2].second & mask_type_wqm) { - /* return to wqm before branching */ - ctx.info[idx].exec.pop_back(); - } - // orig = s_and_saveexec_b64 assert(block->linear_succs.size() == 2); assert(block->instructions.back()->opcode == aco_opcode::p_cbranch_z); @@ -815,7 +753,7 @@ add_branch_code(exec_ctx& ctx, Block* block) uint8_t mask_type = ctx.info[idx].exec.back().second & (mask_type_wqm | mask_type_exact); if (ctx.info[idx].exec.back().first.constantEquals(-1u)) { - bld.pseudo(aco_opcode::p_parallelcopy, Definition(exec, bld.lm), cond); + bld.copy(Definition(exec, bld.lm), cond); } else { Temp old_exec = bld.sop1(Builder::s_and_saveexec, bld.def(bld.lm), bld.def(s1, scc), Definition(exec, bld.lm), cond, Operand(exec, bld.lm));