llvmpipe: Use full subgroups when possible

Fixes computeFullSubgroups on lavapipe.

cc: mesa-stable

Reviewed-by: Mike Blumenkrantz <michael.blumenkrantz@gmail.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/27603>
(cherry picked from commit eb3c96d5ed4fe8e57d8d225fa6e740282b510a8f)
This commit is contained in:
Konstantin Seurer 2024-02-13 11:25:54 +01:00 committed by Eric Engestrom
parent da3ac67e23
commit 5e178a07a0
2 changed files with 51 additions and 92 deletions

View file

@ -1344,7 +1344,7 @@
"description": "llvmpipe: Use full subgroups when possible", "description": "llvmpipe: Use full subgroups when possible",
"nominated": true, "nominated": true,
"nomination_type": 0, "nomination_type": 0,
"resolution": 0, "resolution": 1,
"main_sha": null, "main_sha": null,
"because_sha": null, "because_sha": null,
"notes": null "notes": null

View file

@ -95,7 +95,7 @@ enum {
CS_ARG_VERTEX_DATA, CS_ARG_VERTEX_DATA,
CS_ARG_PER_THREAD_DATA, CS_ARG_PER_THREAD_DATA,
CS_ARG_OUTER_COUNT, CS_ARG_OUTER_COUNT,
CS_ARG_CORO_X_LOOPS = CS_ARG_OUTER_COUNT, CS_ARG_CORO_SUBGROUP_COUNT = CS_ARG_OUTER_COUNT,
CS_ARG_CORO_PARTIALS, CS_ARG_CORO_PARTIALS,
CS_ARG_CORO_BLOCK_X_SIZE, CS_ARG_CORO_BLOCK_X_SIZE,
CS_ARG_CORO_BLOCK_Y_SIZE, CS_ARG_CORO_BLOCK_Y_SIZE,
@ -374,7 +374,7 @@ generate_compute(struct llvmpipe_context *lp,
else else
arg_types[CS_ARG_VERTEX_DATA] = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); /* mesh shaders only */ arg_types[CS_ARG_VERTEX_DATA] = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); /* mesh shaders only */
arg_types[CS_ARG_PER_THREAD_DATA] = variant->jit_cs_thread_data_ptr_type; /* per thread data */ arg_types[CS_ARG_PER_THREAD_DATA] = variant->jit_cs_thread_data_ptr_type; /* per thread data */
arg_types[CS_ARG_CORO_X_LOOPS] = int32_type; /* coro only - num X loops */ arg_types[CS_ARG_CORO_SUBGROUP_COUNT] = int32_type; /* coro only - subgroup count */
arg_types[CS_ARG_CORO_PARTIALS] = int32_type; /* coro only - partials */ arg_types[CS_ARG_CORO_PARTIALS] = int32_type; /* coro only - partials */
arg_types[CS_ARG_CORO_BLOCK_X_SIZE] = int32_type; /* coro block_x_size */ arg_types[CS_ARG_CORO_BLOCK_X_SIZE] = int32_type; /* coro block_x_size */
arg_types[CS_ARG_CORO_BLOCK_Y_SIZE] = int32_type; /* coro block_y_size */ arg_types[CS_ARG_CORO_BLOCK_Y_SIZE] = int32_type; /* coro block_y_size */
@ -560,23 +560,24 @@ generate_compute(struct llvmpipe_context *lp,
output_array = lp_build_array_alloca(gallivm, output_type, lp_build_const_int32(gallivm, align(MAX2(nir->info.mesh.max_primitives_out, nir->info.mesh.max_vertices_out), 8)), "outputs"); output_array = lp_build_array_alloca(gallivm, output_type, lp_build_const_int32(gallivm, align(MAX2(nir->info.mesh.max_primitives_out, nir->info.mesh.max_vertices_out), 8)), "outputs");
} }
struct lp_build_loop_state loop_state[4]; struct lp_build_loop_state loop_state[2];
LLVMValueRef num_x_loop;
LLVMValueRef vec_length = lp_build_const_int32(gallivm, cs_type.length);
num_x_loop = LLVMBuildAdd(gallivm->builder, block_x_size_arg, vec_length, "");
num_x_loop = LLVMBuildSub(gallivm->builder, num_x_loop, lp_build_const_int32(gallivm, 1), "");
num_x_loop = LLVMBuildUDiv(gallivm->builder, num_x_loop, vec_length, "");
LLVMValueRef partials = LLVMBuildURem(gallivm->builder, block_x_size_arg, vec_length, "");
LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, ""); LLVMValueRef vec_length = lp_build_const_int32(gallivm, cs_type.length);
coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, block_z_size_arg, "");
LLVMValueRef invocation_count = LLVMBuildMul(gallivm->builder, block_x_size_arg, block_y_size_arg, "");
invocation_count = LLVMBuildMul(gallivm->builder, invocation_count, block_z_size_arg, "");
LLVMValueRef partials = LLVMBuildURem(gallivm->builder, invocation_count, vec_length, "");
LLVMValueRef num_subgroup_loop = LLVMBuildAdd(gallivm->builder, invocation_count, lp_build_const_int32(gallivm, cs_type.length - 1), "");
num_subgroup_loop = LLVMBuildUDiv(gallivm->builder, num_subgroup_loop, vec_length, "");
/* build a ptr in memory to store all the frames in later. */ /* build a ptr in memory to store all the frames in later. */
LLVMTypeRef hdl_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); LLVMTypeRef hdl_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
LLVMValueRef coro_mem = LLVMBuildAlloca(gallivm->builder, hdl_ptr_type, "coro_mem"); LLVMValueRef coro_mem = LLVMBuildAlloca(gallivm->builder, hdl_ptr_type, "coro_mem");
LLVMBuildStore(builder, LLVMConstNull(hdl_ptr_type), coro_mem); LLVMBuildStore(builder, LLVMConstNull(hdl_ptr_type), coro_mem);
LLVMValueRef coro_hdls = LLVMBuildArrayAlloca(gallivm->builder, hdl_ptr_type, coro_num_hdls, "coro_hdls"); LLVMValueRef coro_hdls = LLVMBuildArrayAlloca(gallivm->builder, hdl_ptr_type, num_subgroup_loop, "coro_hdls");
unsigned end_coroutine = INT_MAX; unsigned end_coroutine = INT_MAX;
@ -585,22 +586,17 @@ generate_compute(struct llvmpipe_context *lp,
* and calls the coroutine main entrypoint on the first pass, but in subsequent * and calls the coroutine main entrypoint on the first pass, but in subsequent
* passes it checks if the coroutine has completed and resumes it if not. * passes it checks if the coroutine has completed and resumes it if not.
*/ */
/* take x_width - round up to type.length width */
lp_build_loop_begin(&loop_state[3], gallivm,
lp_build_const_int32(gallivm, 0)); /* coroutine reentry loop */
lp_build_loop_begin(&loop_state[2], gallivm,
lp_build_const_int32(gallivm, 0)); /* z loop */
lp_build_loop_begin(&loop_state[1], gallivm, lp_build_loop_begin(&loop_state[1], gallivm,
lp_build_const_int32(gallivm, 0)); /* y loop */ lp_build_const_int32(gallivm, 0)); /* coroutine reentry loop */
lp_build_loop_begin(&loop_state[0], gallivm, lp_build_loop_begin(&loop_state[0], gallivm,
lp_build_const_int32(gallivm, 0)); /* x loop */ lp_build_const_int32(gallivm, 0)); /* subgroup loop */
{ {
LLVMValueRef args[CS_ARG_MAX]; LLVMValueRef args[CS_ARG_MAX];
args[CS_ARG_CONTEXT] = context_ptr; args[CS_ARG_CONTEXT] = context_ptr;
args[CS_ARG_RESOURCES] = resources_ptr; args[CS_ARG_RESOURCES] = resources_ptr;
args[CS_ARG_BLOCK_X_SIZE] = loop_state[0].counter; args[CS_ARG_BLOCK_X_SIZE] = LLVMGetUndef(int32_type);
args[CS_ARG_BLOCK_Y_SIZE] = loop_state[1].counter; args[CS_ARG_BLOCK_Y_SIZE] = LLVMGetUndef(int32_type);
args[CS_ARG_BLOCK_Z_SIZE] = loop_state[2].counter; args[CS_ARG_BLOCK_Z_SIZE] = LLVMGetUndef(int32_type);
args[CS_ARG_GRID_X] = grid_x_arg; args[CS_ARG_GRID_X] = grid_x_arg;
args[CS_ARG_GRID_Y] = grid_y_arg; args[CS_ARG_GRID_Y] = grid_y_arg;
args[CS_ARG_GRID_Z] = grid_z_arg; args[CS_ARG_GRID_Z] = grid_z_arg;
@ -611,34 +607,25 @@ generate_compute(struct llvmpipe_context *lp,
args[CS_ARG_DRAW_ID] = draw_id_arg; args[CS_ARG_DRAW_ID] = draw_id_arg;
args[CS_ARG_VERTEX_DATA] = io_ptr; args[CS_ARG_VERTEX_DATA] = io_ptr;
args[CS_ARG_PER_THREAD_DATA] = thread_data_ptr; args[CS_ARG_PER_THREAD_DATA] = thread_data_ptr;
args[CS_ARG_CORO_X_LOOPS] = num_x_loop; args[CS_ARG_CORO_SUBGROUP_COUNT] = num_subgroup_loop;
args[CS_ARG_CORO_PARTIALS] = partials; args[CS_ARG_CORO_PARTIALS] = partials;
args[CS_ARG_CORO_BLOCK_X_SIZE] = block_x_size_arg; args[CS_ARG_CORO_BLOCK_X_SIZE] = block_x_size_arg;
args[CS_ARG_CORO_BLOCK_Y_SIZE] = block_y_size_arg; args[CS_ARG_CORO_BLOCK_Y_SIZE] = block_y_size_arg;
args[CS_ARG_CORO_BLOCK_Z_SIZE] = block_z_size_arg; args[CS_ARG_CORO_BLOCK_Z_SIZE] = block_z_size_arg;
/* idx = (z * (size_x * size_y) + y * size_x + x */ args[CS_ARG_CORO_IDX] = loop_state[0].counter;
LLVMValueRef coro_hdl_idx = LLVMBuildMul(gallivm->builder, loop_state[2].counter,
LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, ""), "");
coro_hdl_idx = LLVMBuildAdd(gallivm->builder, coro_hdl_idx,
LLVMBuildMul(gallivm->builder, loop_state[1].counter,
num_x_loop, ""), "");
coro_hdl_idx = LLVMBuildAdd(gallivm->builder, coro_hdl_idx,
loop_state[0].counter, "");
args[CS_ARG_CORO_IDX] = coro_hdl_idx;
args[CS_ARG_CORO_MEM] = coro_mem; args[CS_ARG_CORO_MEM] = coro_mem;
if (is_mesh) if (is_mesh)
args[CS_ARG_CORO_OUTPUTS] = output_array; args[CS_ARG_CORO_OUTPUTS] = output_array;
LLVMValueRef coro_entry = LLVMBuildGEP2(gallivm->builder, hdl_ptr_type, coro_hdls, &coro_hdl_idx, 1, ""); LLVMValueRef coro_entry = LLVMBuildGEP2(gallivm->builder, hdl_ptr_type, coro_hdls, &loop_state[0].counter, 1, "");
LLVMValueRef coro_hdl = LLVMBuildLoad2(gallivm->builder, hdl_ptr_type, coro_entry, "coro_hdl"); LLVMValueRef coro_hdl = LLVMBuildLoad2(gallivm->builder, hdl_ptr_type, coro_entry, "coro_hdl");
struct lp_build_if_state ifstate; struct lp_build_if_state ifstate;
LLVMValueRef cmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, loop_state[3].counter, LLVMValueRef cmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, loop_state[1].counter,
lp_build_const_int32(gallivm, 0), ""); lp_build_const_int32(gallivm, 0), "");
/* first time here - call the coroutine function entry point */ /* first time here - call the coroutine function entry point */
lp_build_if(&ifstate, gallivm, cmp); lp_build_if(&ifstate, gallivm, cmp);
@ -651,24 +638,18 @@ generate_compute(struct llvmpipe_context *lp,
lp_build_if(&ifstate2, gallivm, coro_done); lp_build_if(&ifstate2, gallivm, coro_done);
/* if done destroy and force loop exit */ /* if done destroy and force loop exit */
lp_build_coro_destroy(gallivm, coro_hdl); lp_build_coro_destroy(gallivm, coro_hdl);
lp_build_loop_force_set_counter(&loop_state[3], lp_build_const_int32(gallivm, end_coroutine - 1)); lp_build_loop_force_set_counter(&loop_state[1], lp_build_const_int32(gallivm, end_coroutine - 1));
lp_build_else(&ifstate2); lp_build_else(&ifstate2);
/* otherwise resume the coroutine */ /* otherwise resume the coroutine */
lp_build_coro_resume(gallivm, coro_hdl); lp_build_coro_resume(gallivm, coro_hdl);
lp_build_endif(&ifstate2); lp_build_endif(&ifstate2);
lp_build_endif(&ifstate); lp_build_endif(&ifstate);
lp_build_loop_force_reload_counter(&loop_state[3]); lp_build_loop_force_reload_counter(&loop_state[1]);
} }
lp_build_loop_end_cond(&loop_state[0], lp_build_loop_end_cond(&loop_state[0],
num_x_loop, num_subgroup_loop,
NULL, LLVMIntUGE); NULL, LLVMIntUGE);
lp_build_loop_end_cond(&loop_state[1], lp_build_loop_end_cond(&loop_state[1],
block_y_size_arg,
NULL, LLVMIntUGE);
lp_build_loop_end_cond(&loop_state[2],
block_z_size_arg,
NULL, LLVMIntUGE);
lp_build_loop_end_cond(&loop_state[3],
lp_build_const_int32(gallivm, end_coroutine), lp_build_const_int32(gallivm, end_coroutine),
NULL, LLVMIntEQ); NULL, LLVMIntEQ);
@ -680,12 +661,8 @@ generate_compute(struct llvmpipe_context *lp,
LLVMBuildRetVoid(builder); LLVMBuildRetVoid(builder);
/* This is stage (b) - generate the compute shader code inside the coroutine. */ /* This is stage (b) - generate the compute shader code inside the coroutine. */
LLVMValueRef x_size_arg, y_size_arg, z_size_arg;
context_ptr = LLVMGetParam(coro, CS_ARG_CONTEXT); context_ptr = LLVMGetParam(coro, CS_ARG_CONTEXT);
resources_ptr = LLVMGetParam(coro, CS_ARG_RESOURCES); resources_ptr = LLVMGetParam(coro, CS_ARG_RESOURCES);
x_size_arg = LLVMGetParam(coro, CS_ARG_BLOCK_X_SIZE);
y_size_arg = LLVMGetParam(coro, CS_ARG_BLOCK_Y_SIZE);
z_size_arg = LLVMGetParam(coro, CS_ARG_BLOCK_Z_SIZE);
grid_x_arg = LLVMGetParam(coro, CS_ARG_GRID_X); grid_x_arg = LLVMGetParam(coro, CS_ARG_GRID_X);
grid_y_arg = LLVMGetParam(coro, CS_ARG_GRID_Y); grid_y_arg = LLVMGetParam(coro, CS_ARG_GRID_Y);
grid_z_arg = LLVMGetParam(coro, CS_ARG_GRID_Z); grid_z_arg = LLVMGetParam(coro, CS_ARG_GRID_Z);
@ -696,12 +673,12 @@ generate_compute(struct llvmpipe_context *lp,
draw_id_arg = LLVMGetParam(coro, CS_ARG_DRAW_ID); draw_id_arg = LLVMGetParam(coro, CS_ARG_DRAW_ID);
io_ptr = LLVMGetParam(coro, CS_ARG_VERTEX_DATA); io_ptr = LLVMGetParam(coro, CS_ARG_VERTEX_DATA);
thread_data_ptr = LLVMGetParam(coro, CS_ARG_PER_THREAD_DATA); thread_data_ptr = LLVMGetParam(coro, CS_ARG_PER_THREAD_DATA);
num_x_loop = LLVMGetParam(coro, CS_ARG_CORO_X_LOOPS); num_subgroup_loop = LLVMGetParam(coro, CS_ARG_CORO_SUBGROUP_COUNT);
partials = LLVMGetParam(coro, CS_ARG_CORO_PARTIALS); partials = LLVMGetParam(coro, CS_ARG_CORO_PARTIALS);
block_x_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_X_SIZE); block_x_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_X_SIZE);
block_y_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_Y_SIZE); block_y_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_Y_SIZE);
block_z_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_Z_SIZE); block_z_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_Z_SIZE);
LLVMValueRef coro_idx = LLVMGetParam(coro, CS_ARG_CORO_IDX); LLVMValueRef subgroup_id = LLVMGetParam(coro, CS_ARG_CORO_IDX);
coro_mem = LLVMGetParam(coro, CS_ARG_CORO_MEM); coro_mem = LLVMGetParam(coro, CS_ARG_CORO_MEM);
if (is_mesh) if (is_mesh)
output_array = LLVMGetParam(coro, CS_ARG_CORO_OUTPUTS); output_array = LLVMGetParam(coro, CS_ARG_CORO_OUTPUTS);
@ -730,27 +707,32 @@ generate_compute(struct llvmpipe_context *lp,
variant->jit_cs_thread_data_type, variant->jit_cs_thread_data_type,
thread_data_ptr); thread_data_ptr);
LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, "");
coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, block_z_size_arg, "");
/* these are coroutine entrypoint necessities */ /* these are coroutine entrypoint necessities */
LLVMValueRef coro_id = lp_build_coro_id(gallivm); LLVMValueRef coro_id = lp_build_coro_id(gallivm);
LLVMValueRef coro_entry = lp_build_coro_alloc_mem_array(gallivm, coro_mem, coro_idx, coro_num_hdls); LLVMValueRef coro_entry = lp_build_coro_alloc_mem_array(gallivm, coro_mem, subgroup_id, num_subgroup_loop);
LLVMTypeRef mem_ptr_type = LLVMInt8TypeInContext(gallivm->context); LLVMTypeRef mem_ptr_type = LLVMInt8TypeInContext(gallivm->context);
LLVMValueRef alloced_ptr = LLVMBuildLoad2(gallivm->builder, hdl_ptr_type, coro_mem, ""); LLVMValueRef alloced_ptr = LLVMBuildLoad2(gallivm->builder, hdl_ptr_type, coro_mem, "");
alloced_ptr = LLVMBuildGEP2(gallivm->builder, mem_ptr_type, alloced_ptr, &coro_entry, 1, ""); alloced_ptr = LLVMBuildGEP2(gallivm->builder, mem_ptr_type, alloced_ptr, &coro_entry, 1, "");
LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloced_ptr); LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloced_ptr);
LLVMValueRef has_partials = LLVMBuildICmp(gallivm->builder, LLVMIntNE, partials, lp_build_const_int32(gallivm, 0), ""); LLVMValueRef has_partials = LLVMBuildICmp(gallivm->builder, LLVMIntNE, partials, lp_build_const_int32(gallivm, 0), "");
LLVMValueRef tids_x[LP_MAX_VECTOR_LENGTH], tids_y[LP_MAX_VECTOR_LENGTH], tids_z[LP_MAX_VECTOR_LENGTH];
LLVMValueRef base_val = LLVMBuildMul(gallivm->builder, x_size_arg, vec_length, ""); struct lp_build_context bld;
for (i = 0; i < cs_type.length; i++) { lp_build_context_init(&bld, gallivm, lp_uint_type(cs_type));
tids_x[i] = LLVMBuildAdd(gallivm->builder, base_val, lp_build_const_int32(gallivm, i), "");
tids_y[i] = y_size_arg; LLVMValueRef base_val = LLVMBuildMul(gallivm->builder, subgroup_id, vec_length, "");
tids_z[i] = z_size_arg; LLVMValueRef invocation_indices[LP_MAX_VECTOR_LENGTH];
} for (i = 0; i < cs_type.length; i++)
system_values.thread_id[0] = lp_build_gather_values(gallivm, tids_x, cs_type.length); invocation_indices[i] = LLVMBuildAdd(gallivm->builder, base_val, lp_build_const_int32(gallivm, i), "");
system_values.thread_id[1] = lp_build_gather_values(gallivm, tids_y, cs_type.length); LLVMValueRef invocation_index = lp_build_gather_values(gallivm, invocation_indices, cs_type.length);
system_values.thread_id[2] = lp_build_gather_values(gallivm, tids_z, cs_type.length);
LLVMValueRef block_x_size_vec = lp_build_broadcast_scalar(&bld, block_x_size_arg);
LLVMValueRef block_y_size_vec = lp_build_broadcast_scalar(&bld, block_y_size_arg);
system_values.thread_id[0] = LLVMBuildURem(gallivm->builder, invocation_index, block_x_size_vec, "");
system_values.thread_id[1] = LLVMBuildUDiv(gallivm->builder, invocation_index, block_x_size_vec, "");
system_values.thread_id[1] = LLVMBuildURem(gallivm->builder, system_values.thread_id[1], block_y_size_vec, "");
system_values.thread_id[2] = LLVMBuildUDiv(gallivm->builder, invocation_index, block_x_size_vec, "");
system_values.thread_id[2] = LLVMBuildUDiv(gallivm->builder, system_values.thread_id[2], block_y_size_vec, "");
system_values.block_id[0] = grid_x_arg; system_values.block_id[0] = grid_x_arg;
system_values.block_id[1] = grid_y_arg; system_values.block_id[1] = grid_y_arg;
@ -763,38 +745,15 @@ generate_compute(struct llvmpipe_context *lp,
system_values.work_dim = work_dim_arg; system_values.work_dim = work_dim_arg;
system_values.draw_id = draw_id_arg; system_values.draw_id = draw_id_arg;
/* subgroup_id = ((z * block_size_x * block_size_y) + (y * block_size_x) + x) / subgroup_size
*
* this breaks if z or y is zero, so distribute the division to preserve ids
*
* subgroup_id = ((z * block_size_x * block_size_y) / subgroup_size) + ((y * block_size_x) / subgroup_size) + (x / subgroup_size)
*
* except "x" is pre-divided here
*
* subgroup_id = ((z * block_size_x * block_size_y) / subgroup_size) + ((y * block_size_x) / subgroup_size) + x
*/
LLVMValueRef subgroup_id = LLVMBuildUDiv(builder,
LLVMBuildMul(gallivm->builder, z_size_arg, LLVMBuildMul(gallivm->builder, block_x_size_arg, block_y_size_arg, ""), ""),
vec_length, "");
subgroup_id = LLVMBuildAdd(gallivm->builder,
subgroup_id,
LLVMBuildUDiv(builder, LLVMBuildMul(gallivm->builder, y_size_arg, block_x_size_arg, ""), vec_length, ""),
"");
subgroup_id = LLVMBuildAdd(gallivm->builder, subgroup_id, x_size_arg, "");
system_values.subgroup_id = subgroup_id; system_values.subgroup_id = subgroup_id;
LLVMValueRef num_subgroups = LLVMBuildUDiv(builder, system_values.num_subgroups = num_subgroup_loop;
LLVMBuildMul(builder, block_x_size_arg,
LLVMBuildMul(builder, block_y_size_arg, block_z_size_arg, ""), ""),
vec_length, "");
LLVMValueRef subgroup_cmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, num_subgroups, lp_build_const_int32(gallivm, 0), "");
system_values.num_subgroups = LLVMBuildSelect(builder, subgroup_cmp, lp_build_const_int32(gallivm, 1), num_subgroups, "");
system_values.block_size[0] = block_x_size_arg; system_values.block_size[0] = block_x_size_arg;
system_values.block_size[1] = block_y_size_arg; system_values.block_size[1] = block_y_size_arg;
system_values.block_size[2] = block_z_size_arg; system_values.block_size[2] = block_z_size_arg;
LLVMValueRef last_x_loop = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, x_size_arg, LLVMBuildSub(gallivm->builder, num_x_loop, lp_build_const_int32(gallivm, 1), ""), ""); LLVMValueRef last_loop = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, subgroup_id, LLVMBuildSub(gallivm->builder, num_subgroup_loop, lp_build_const_int32(gallivm, 1), ""), "");
LLVMValueRef use_partial_mask = LLVMBuildAnd(gallivm->builder, last_x_loop, has_partials, ""); LLVMValueRef use_partial_mask = LLVMBuildAnd(gallivm->builder, last_loop, has_partials, "");
struct lp_build_if_state if_state; struct lp_build_if_state if_state;
LLVMTypeRef mask_type = LLVMVectorType(int32_type, cs_type.length); LLVMTypeRef mask_type = LLVMVectorType(int32_type, cs_type.length);
LLVMValueRef mask_val = lp_build_alloca(gallivm, mask_type, "mask"); LLVMValueRef mask_val = lp_build_alloca(gallivm, mask_type, "mask");
@ -866,7 +825,7 @@ generate_compute(struct llvmpipe_context *lp,
lp_int_type(cs_type), 0); lp_int_type(cs_type), 0);
struct lp_build_if_state iter0state; struct lp_build_if_state iter0state;
LLVMValueRef is_iter0 = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, coro_idx, LLVMValueRef is_iter0 = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, subgroup_id,
lp_build_const_int32(gallivm, 0), ""); lp_build_const_int32(gallivm, 0), "");
LLVMValueRef vertex_count = LLVMBuildLoad2(gallivm->builder, i32t, mesh_iface.vertex_count, ""); LLVMValueRef vertex_count = LLVMBuildLoad2(gallivm->builder, i32t, mesh_iface.vertex_count, "");
LLVMValueRef prim_count = LLVMBuildLoad2(gallivm->builder, i32t, mesh_iface.prim_count, ""); LLVMValueRef prim_count = LLVMBuildLoad2(gallivm->builder, i32t, mesh_iface.prim_count, "");