radeonsi: convert "create_query_result_cs" shader to nir
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25972>
This commit is contained in:
parent
740a4c3448
commit
c109c3f95c
3 changed files with 381 additions and 216 deletions
|
|
@ -1673,9 +1673,9 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
|||
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
||||
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
|
||||
void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array);
|
||||
void *si_create_query_result_cs(struct si_context *sctx);
|
||||
|
||||
/* si_shaderlib_tgsi.c */
|
||||
void *si_create_query_result_cs(struct si_context *sctx);
|
||||
void *gfx11_create_sh_query_result_cs(struct si_context *sctx);
|
||||
|
||||
/* gfx11_query.c */
|
||||
|
|
|
|||
|
|
@ -873,3 +873,383 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
|||
*vs = create_shader_state(sctx, b.shader);
|
||||
return *vs;
|
||||
}
|
||||
|
||||
/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *          1: read previously accumulated values
 *          2: write accumulated values for chaining
 *          4: write result available
 *          8: convert result to boolean (0/1)
 *         16: only read one dword and use that as result
 *         32: apply timestamp conversion
 *         64: store full 64 bits result
 *        128: store signed 32 bits result
 *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 * SSBO bindings (matching the loads/stores below):
 *  BUFFER[0] = query result buffer
 *  BUFFER[1] = previous summary buffer
 *  BUFFER[2] = next summary buffer or user-supplied buffer
 *
 * Note on reading this function: the C braces placed after each
 * nir_push_if()/nir_push_loop() only mirror the *shader's* control flow
 * visually. Every builder call executes unconditionally at shader-build
 * time; only the emitted instructions are conditional at shader run time.
 */
void *si_create_query_result_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   /* Single-thread workgroup: one invocation accumulates the whole buffer. */
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_query_result_cs");
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ubos = 1;
   b.shader->info.num_ssbos = 3;
   b.shader->num_uniforms = 2;

   /* Frequently used immediates. */
   nir_def *var_undef = nir_undef(&b, 1, 32);
   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *one = nir_imm_int(&b, 1);
   nir_def *two = nir_imm_int(&b, 2);
   nir_def *four = nir_imm_int(&b, 4);
   nir_def *eight = nir_imm_int(&b, 8);
   nir_def *sixteen = nir_imm_int(&b, 16);
   nir_def *thirty_one = nir_imm_int(&b, 31);
   nir_def *sixty_four = nir_imm_int(&b, 64);

   /* uint32_t x, y, z = 0;
    *
    * x/y hold the low/high halves of the accumulated 64-bit result,
    * z is the "result not available" mask (all-ones when unavailable).
    */
   nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
   nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x");
   nir_store_var(&b, x, var_undef, 0x1);
   nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y");
   nir_store_var(&b, y, var_undef, 0x1);
   nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z");
   nir_store_var(&b, z, zero, 0x1);

   /* uint32_t buff_0[4] = load_ubo(0, 0); */
   nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
   /* uint32_t buff_1[4] = load_ubo(1, 16); */
   nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16);

   /* uint32_t b0_bitfield = buff_0.w; */
   nir_def *b0_bitfield = nir_channel(&b, buff_0, 3);

   /* Check result availability.
    * if (b0_bitfield & (1u << 4)) {
    *    ...
    */
   nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen));
   nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); {

      /* int32_t value = load_ssbo(0, fence_offset);
       * z = ~(value >> 31);
       *
       * Arithmetic shift of the sign bit produces all-ones when the fence
       * value is negative (result ready), so z becomes 0 in that case.
       */
      nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0));
      nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
      nir_store_var(&b, z, bitmask, 0x1);

      /* Load result if available.
       * if (value < 0) {
       *    uint32_t result[2] = load_ssbo(0, 0);
       *    x = result[0];
       *    y = result[1];
       * }
       */
      nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); {
         nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
      }
      nir_pop_if(&b, if_negative);
   } nir_push_else(&b, if_one_dword_result); {

      /* } else {
       *    x = 0; y = 0;
       */
      nir_store_var(&b, x, zero, 0x1);
      nir_store_var(&b, y, zero, 0x1);

      /* Load previously accumulated result if requested.
       * if (b0_bitfield & (1u << 0)) {
       *    uint32_t result[3] = load_ssbo(1, 0);
       *    x = result[0];
       *    y = result[1];
       *    z = result[2];
       * }
       */
      nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one));
      nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); {
         nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
         nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1);
      }
      nir_pop_if(&b, if_prev_acc_result);

      /* if (!z) {
       *    uint32_t result_index = 0;
       *    uint32_t pitch = 0;
       *    ...
       */
      nir_def *z_value = nir_load_var(&b, z);
      nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); {
         nir_variable *outer_loop_iter =
            nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter");
         nir_store_var(&b, outer_loop_iter, zero, 0x1);
         nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch");
         nir_store_var(&b, pitch, zero, 0x1);

         /* Outer loop.
          * while (result_index <= result_count) {
          *    ...
          */
         nir_loop *loop_outer = nir_push_loop(&b); {
            nir_def *result_index = nir_load_var(&b, outer_loop_iter);
            nir_def *is_result_index_out_of_bound =
               nir_uge(&b, result_index, nir_channel(&b, buff_0, 2));
            nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_out_of_bound);

            /* Load fence and check result availability.
             * pitch = i * result_stride;
             * uint32_t address = fence_offset + pitch;
             * int32_t value = load_ssbo(0, address);
             * z = ~(value >> 31);
             */
            nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1));
            nir_store_var(&b, pitch, pitch_outer_loop, 0x1);
            nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0));
            nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address);
            nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
            nir_store_var(&b, z, bitmask, 0x1);

            /* if (z) {
             *    break;
             * }
             */
            nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_result_available);

            /* Inner loop iterator.
             * uint32_t i = 0;
             */
            nir_variable *inner_loop_iter =
               nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter");
            nir_store_var(&b, inner_loop_iter, zero, 0x1);

            /* Inner loop.
             * do {
             *    ...
             */
            nir_loop *loop_inner = nir_push_loop(&b); {
               nir_def *pitch_inner_loop = nir_load_var(&b, pitch);
               nir_def *i = nir_load_var(&b, inner_loop_iter);

               /* Load start and end.
                * uint64_t first = load_ssbo(0, pitch);
                * uint64_t second = load_ssbo(0, pitch + end_offset);
                * uint64_t start_half_pair = second - first;
                */
               nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop);
               nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0));
               nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch);
               nir_def *start_half_pair = nir_isub(&b, second, first);

               /* Load second start/end half-pair and take the difference.
                * if (b0_bitfield & (1u << 8)) {
                *    uint64_t first = load_ssbo(0, pitch + 8);
                *    uint64_t second = load_ssbo(0, pitch + end_offset + 8);
                *    uint64_t end_half_pair = second - first;
                *    uint64_t difference = start_half_pair - end_half_pair;
                * }
                *
                * NB: `difference` is always assigned at build time (the
                * builder body runs unconditionally); the nir_if_phi below
                * selects between the then-value (difference) and the
                * else-value (start_half_pair) at shader run time.
                */
               nir_def *difference;
               nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256));
               nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); {
                  first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight));
                  second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight));
                  nir_def *end_half_pair = nir_isub(&b, second, first);
                  difference = nir_isub(&b, start_half_pair, end_half_pair);
               }
               nir_pop_if(&b, if_so_overflow_mode);

               /* uint64_t sum = (x | (uint64_t) y << 32) + difference; */
               nir_def *sum = nir_iadd(&b,
                                       nir_pack_64_2x32_split(&b,
                                                              nir_load_var(&b, x),
                                                              nir_load_var(&b, y)),
                                       nir_if_phi(&b, difference, start_half_pair));
               sum = nir_unpack_64_2x32(&b, sum);

               /* Increment inner loop iterator.
                * i++;
                */
               i = nir_iadd(&b, i, one);
               nir_store_var(&b, inner_loop_iter, i, 0x1);

               /* Update pitch value.
                * pitch = i * pair_stride + pitch;
                */
               nir_def *incremented_pitch = nir_iadd(&b,
                                                     nir_imul(&b, i, nir_channel(&b, buff_1, 1)),
                                                     pitch_outer_loop);
               nir_store_var(&b, pitch, incremented_pitch, 0x1);

               /* Update x and y.
                * x = sum.x;
                * y = sum.x >> 32;
                */
               nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1);
               nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1);

               /* } while (i < pair_count);
                */
               nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2));
               nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); {
                  nir_jump(&b, nir_jump_break);
               }
               nir_pop_if(&b, if_pair_count_exceeded);
            }
            nir_pop_loop(&b, loop_inner);

            /* Increment pair iterator.
             * result_index++;
             */
            nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1);
         }
         nir_pop_loop(&b, loop_outer);
      }
      nir_pop_if(&b, if_not_z);
   }
   nir_pop_if(&b, if_one_dword_result);

   /* Final values of the accumulators after the big if/else above. */
   nir_def *x_value = nir_load_var(&b, x);
   nir_def *y_value = nir_load_var(&b, y);
   nir_def *z_value = nir_load_var(&b, z);

   /* Store accumulated data for chaining.
    * if (b0_bitfield & (1u << 1)) {
    *    store_ssbo(<x, y, z>, 2, 0);
    */
   nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two));
   nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); {
      nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero);
   } nir_push_else(&b, if_acc_chaining); {

      /* Store result availability.
       * } else {
       *    if (b0_bitfield & (1u << 2)) {
       *       store_ssbo((~z & 1), 2, 0);
       *       ...
       */
      nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four));
      nir_if *if_result_available = nir_push_if(&b, is_result_available); {
         nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero);

         /* Store full 64 bits result.
          * if (b0_bitfield & (1u << 6)) {
          *    store_ssbo(<0, 0>, 2, 0);
          * }
          *
          * write_mask 0x2 writes only the .y dword (the high half).
          */
         nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
         nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
            nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero,
                           .write_mask = (1u << 1));
         }
         nir_pop_if(&b, if_result_64_bits);
      } nir_push_else(&b, if_result_available); {

         /* } else {
          *    if (~z) {
          *       ...
          */
         nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value));
         nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); {
            nir_def *ts_x, *ts_y;

            /* Apply timestamp conversion.
             * if (b0_bitfield & (1u << 5)) {
             *    uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000;
             *    uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency;
             *    x = ts_converted.x;
             *    y = ts_converted.x >> 32;
             * }
             */
            nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32));
            nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); {
               /* Add the frequency into the shader for timestamp conversion
                * so that the backend can use the full range of optimizations
                * for divide-by-constant.
                */
               nir_def *clock_crystal_frequency =
                  nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq);

               nir_def *xy_million = nir_imul(&b,
                                              nir_pack_64_2x32_split(&b, x_value, y_value),
                                              nir_imm_int64(&b, 1000000));
               nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency);
               ts_converted = nir_unpack_64_2x32(&b, ts_converted);
               ts_x = nir_channel(&b, ts_converted, 0);
               ts_y = nir_channel(&b, ts_converted, 1);
            }
            nir_pop_if(&b, if_apply_timestamp);

            /* nx/ny = converted values if the timestamp path ran, raw x/y
             * otherwise (nir_if_phi(then_value, else_value)).
             */
            nir_def *nx = nir_if_phi(&b, ts_x, x_value);
            nir_def *ny = nir_if_phi(&b, ts_y, y_value);

            /* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x;
             * y = b0_bitfield & (1u << 3) ? 0 : y;
             */
            nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight));
            nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny);
            nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0)));
            nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx);
            ny = nir_bcsel(&b, is_convert_to_bool, zero, ny);

            /* if (b0_bitfield & (1u << 6)) {
             *    store_ssbo(<x, y>, 2, 0);
             * }
             */
            nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
            nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
               nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero);
            } nir_push_else(&b, if_result_64_bits); {

               /* Clamping.
                * } else {
                *    x = y ? UINT32_MAX : x;
                *    x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x;
                *    store_ssbo(x, 2, 0);
                * }
                */
               nir_def *is_y = nir_ine(&b, ny, zero);
               nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx);
               nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128));
               nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX));
               nx = nir_bcsel(&b, is_signed_32bit_result, min, nx);
               nir_store_ssbo(&b, nx, two, zero);
            }
            nir_pop_if(&b, if_result_64_bits);
         }
         nir_pop_if(&b, if_bitwise_not_z);
      }
      nir_pop_if(&b, if_result_available);
   }
   nir_pop_if(&b, if_acc_chaining);

   return create_shader_state(sctx, b.shader);
}
|
||||
|
|
|
|||
|
|
@ -8,221 +8,6 @@
|
|||
#include "tgsi/tgsi_text.h"
|
||||
#include "tgsi/tgsi_ureg.h"
|
||||
|
||||
/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *          1: read previously accumulated values
 *          2: write accumulated values for chaining
 *          4: write result available
 *          8: convert result to boolean (0/1)
 *         16: only read one dword and use that as result
 *         32: apply timestamp conversion
 *         64: store full 64 bits result
 *        128: store signed 32 bits result
 *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 * BUFFER[0] = query result buffer
 * BUFFER[1] = previous summary buffer
 * BUFFER[2] = next summary buffer or user-supplied buffer
 */
void *si_create_query_result_cs(struct si_context *sctx)
{
   /* TEMP[0].xy = accumulated result so far
    * TEMP[0].z = result not available
    *
    * TEMP[1].x = current result index
    * TEMP[1].y = current pair index
    */
   static const char text_tmpl[] =
      "COMP\n"
      "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
      "DCL BUFFER[0]\n"
      "DCL BUFFER[1]\n"
      "DCL BUFFER[2]\n"
      "DCL CONST[0][0..1]\n"
      "DCL TEMP[0..5]\n"
      "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
      "IMM[1] UINT32 {1, 2, 4, 8}\n"
      "IMM[2] UINT32 {16, 32, 64, 128}\n"
      "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
      "IMM[4] UINT32 {256, 0, 0, 0}\n"

      "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
      "UIF TEMP[5]\n"
         /* Check result availability. */
         "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
         "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
         "MOV TEMP[1], TEMP[0].zzzz\n"
         "NOT TEMP[0].z, TEMP[0].zzzz\n"

         /* Load result if available. */
         "UIF TEMP[1]\n"
            "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
         "ENDIF\n"
      "ELSE\n"
         /* Load previously accumulated result if requested. */
         "MOV TEMP[0], IMM[0].xxxx\n"
         "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
         "UIF TEMP[4]\n"
            "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
         "ENDIF\n"

         "MOV TEMP[1].x, IMM[0].xxxx\n"
         "BGNLOOP\n"
            /* Break if accumulated result so far is not available. */
            "UIF TEMP[0].zzzz\n"
               "BRK\n"
            "ENDIF\n"

            /* Break if result_index >= result_count. */
            "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
            "UIF TEMP[5]\n"
               "BRK\n"
            "ENDIF\n"

            /* Load fence and check result availability */
            "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
            "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
            "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
            "NOT TEMP[0].z, TEMP[0].zzzz\n"
            "UIF TEMP[0].zzzz\n"
               "BRK\n"
            "ENDIF\n"

            "MOV TEMP[1].y, IMM[0].xxxx\n"
            "BGNLOOP\n"
               /* Load start and end. */
               "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
               "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
               "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"

               "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
               "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

               "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"

               "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
               "UIF TEMP[5].zzzz\n"
                  /* Load second start/end half-pair and
                   * take the difference
                   */
                  "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
                  "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
                  "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

                  "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
                  "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
               "ENDIF\n"

               "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"

               /* Increment pair index */
               "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
               "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
               "UIF TEMP[5]\n"
                  "BRK\n"
               "ENDIF\n"
            "ENDLOOP\n"

            /* Increment result index */
            "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
         "ENDLOOP\n"
      "ENDIF\n"

      "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
      "UIF TEMP[4]\n"
         /* Store accumulated data for chaining. */
         "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
      "ELSE\n"
         "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
         "UIF TEMP[4]\n"
            /* Store result availability. */
            "NOT TEMP[0].z, TEMP[0]\n"
            "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
            "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"

            "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
            "UIF TEMP[4]\n"
               "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
            "ENDIF\n"
         "ELSE\n"
            /* Store result if it is available. */
            "NOT TEMP[4], TEMP[0].zzzz\n"
            "UIF TEMP[4]\n"
               /* Apply timestamp conversion */
               "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
               "UIF TEMP[4]\n"
                  "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
                  "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
               "ENDIF\n"

               /* Convert to boolean */
               "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
               "UIF TEMP[4]\n"
                  "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
                  "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
                  "MOV TEMP[0].y, IMM[0].xxxx\n"
               "ENDIF\n"

               "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
               "UIF TEMP[4]\n"
                  "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
               "ELSE\n"
                  /* Clamping */
                  "UIF TEMP[0].yyyy\n"
                     "MOV TEMP[0].x, IMM[0].wwww\n"
                  "ENDIF\n"

                  "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
                  "UIF TEMP[4]\n"
                     "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
                  "ENDIF\n"

                  "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
               "ENDIF\n"
            "ENDIF\n"
         "ENDIF\n"
      "ENDIF\n"

      "END\n";

   /* +32 leaves room for the %u expansion of clock_crystal_freq. */
   char text[sizeof(text_tmpl) + 32];
   struct tgsi_token tokens[1024];
   struct pipe_compute_state state = {};

   /* Hard code the frequency into the shader so that the backend can
    * use the full range of optimizations for divide-by-constant.
    */
   snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);

   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
      /* Translation of this fixed template should never fail. */
      assert(false);
      return NULL;
   }

   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = tokens;

   return sctx->b.create_compute_state(&sctx->b, &state);
}
|
||||
|
||||
/* Create the compute shader that is used to collect the results of gfx10+
|
||||
* shader queries.
|
||||
*
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue