radeonsi: convert "create_query_result_cs" shader to nir

Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25972>
This commit is contained in:
Ganesh Belgur Ramachandra 2023-09-29 13:00:49 -05:00 committed by Marge Bot
parent 740a4c3448
commit c109c3f95c
3 changed files with 381 additions and 216 deletions

View file

@@ -1673,9 +1673,9 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array);
void *si_create_query_result_cs(struct si_context *sctx);
/* si_shaderlib_tgsi.c */
void *si_create_query_result_cs(struct si_context *sctx);
void *gfx11_create_sh_query_result_cs(struct si_context *sctx);
/* gfx11_query.c */

View file

@@ -873,3 +873,383 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
*vs = create_shader_state(sctx, b.shader);
return *vs;
}
/* Create the compute shader that is used to collect the results.
*
* One compute grid with a single thread is launched for every query result
* buffer. The thread (optionally) reads a previous summary buffer, then
* accumulates data from the query result buffer, and writes the result either
* to a summary buffer to be consumed by the next grid invocation or to the
* user-supplied buffer.
*
* Data layout:
*
* CONST
* 0.x = end_offset
* 0.y = result_stride
* 0.z = result_count
* 0.w = bit field:
* 1: read previously accumulated values
* 2: write accumulated values for chaining
* 4: write result available
* 8: convert result to boolean (0/1)
* 16: only read one dword and use that as result
* 32: apply timestamp conversion
* 64: store full 64 bits result
* 128: store signed 32 bits result
* 256: SO_OVERFLOW mode: take the difference of two successive half-pairs
* 1.x = fence_offset
* 1.y = pair_stride
* 1.z = pair_count
*
* SSBO[0] = query result buffer
* SSBO[1] = previous summary buffer
* SSBO[2] = next summary buffer or user-supplied buffer
*/
void *si_create_query_result_cs(struct si_context *sctx)
{
const nir_shader_compiler_options *options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
nir_builder b =
nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_query_result_cs");
/* Single-thread workgroup: one invocation handles one query result buffer. */
b.shader->info.workgroup_size[0] = 1;
b.shader->info.workgroup_size[1] = 1;
b.shader->info.workgroup_size[2] = 1;
b.shader->info.num_ubos = 1;
b.shader->info.num_ssbos = 3;
b.shader->num_uniforms = 2;
/* Frequently used immediates; several double as masks for the bitfield
* tests below (1/2/4/8/16/64 match the CONST 0.w flag values).
*/
nir_def *var_undef = nir_undef(&b, 1, 32);
nir_def *zero = nir_imm_int(&b, 0);
nir_def *one = nir_imm_int(&b, 1);
nir_def *two = nir_imm_int(&b, 2);
nir_def *four = nir_imm_int(&b, 4);
nir_def *eight = nir_imm_int(&b, 8);
nir_def *sixteen = nir_imm_int(&b, 16);
nir_def *thirty_one = nir_imm_int(&b, 31);
nir_def *sixty_four = nir_imm_int(&b, 64);
/* uint32_t x, y, z = 0;
* x/y = low/high dword of the accumulated 64-bit result,
* z = all-ones while the result is not yet available (sign-extended
* fence bit, see the z = ~(value >> 31) computations below).
*/
nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x");
nir_store_var(&b, x, var_undef, 0x1);
nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y");
nir_store_var(&b, y, var_undef, 0x1);
nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z");
nir_store_var(&b, z, zero, 0x1);
/* uint32_t buff_0[4] = load_ubo(0, 0); */
nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
/* uint32_t buff_1[4] = load_ubo(1, 16); */
nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16);
/* uint32_t b0_bitfield = buff_0.w; */
nir_def *b0_bitfield = nir_channel(&b, buff_0, 3);
/* Check result availability.
* if (b0_bitfield & (1u << 4)) {
* ...
*/
nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen));
nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); {
/* int32_t value = load_ssbo(0, fence_offset);
* z = ~(value >> 31);
*/
nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0));
nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
nir_store_var(&b, z, bitmask, 0x1);
/* Load result if available.
* if (value < 0) {
* uint32_t result[2] = load_ssbo(0, 0);
* x = result[0];
* y = result[1];
* }
*/
nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); {
nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero);
nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
}
nir_pop_if(&b, if_negative);
} nir_push_else(&b, if_one_dword_result); {
/* } else {
* x = 0; y = 0;
*/
nir_store_var(&b, x, zero, 0x1);
nir_store_var(&b, y, zero, 0x1);
/* Load previously accumulated result if requested.
* if (b0_bitfield & (1u << 0)) {
* uint32_t result[3] = load_ssbo(1, 0);
* x = result[0];
* y = result[1];
* z = result[2];
* }
*/
nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one));
nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); {
nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero);
nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1);
}
nir_pop_if(&b, if_prev_acc_result);
/* if (!z) {
* uint32_t result_index = 0;
* uint32_t pitch = 0;
* ...
*/
nir_def *z_value = nir_load_var(&b, z);
nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); {
nir_variable *outer_loop_iter =
nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter");
nir_store_var(&b, outer_loop_iter, zero, 0x1);
nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch");
nir_store_var(&b, pitch, zero, 0x1);
/* Outer loop; the break below fires when result_index >= result_count.
* while (result_index < result_count) {
* ...
*/
nir_loop *loop_outer = nir_push_loop(&b); {
nir_def *result_index = nir_load_var(&b, outer_loop_iter);
nir_def *is_result_index_out_of_bound =
nir_uge(&b, result_index, nir_channel(&b, buff_0, 2));
nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); {
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, if_out_of_bound);
/* Load fence and check result availability.
* pitch = result_index * result_stride;
* uint32_t address = fence_offset + pitch;
* int32_t value = load_ssbo(0, address);
* z = ~(value >> 31);
*/
nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1));
nir_store_var(&b, pitch, pitch_outer_loop, 0x1);
nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0));
nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address);
nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
nir_store_var(&b, z, bitmask, 0x1);
/* Break if this result is not available yet.
* if (z) {
* break;
* }
*/
nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); {
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, if_result_available);
/* Inner loop iterator.
* uint32_t i = 0;
*/
nir_variable *inner_loop_iter =
nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter");
nir_store_var(&b, inner_loop_iter, zero, 0x1);
/* Inner loop.
* do {
* ...
*/
nir_loop *loop_inner = nir_push_loop(&b); {
nir_def *pitch_inner_loop = nir_load_var(&b, pitch);
nir_def *i = nir_load_var(&b, inner_loop_iter);
/* Load start and end.
* uint64_t first = load_ssbo(0, pitch);
* uint64_t second = load_ssbo(0, pitch + end_offset);
* uint64_t start_half_pair = second - first;
*/
nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop);
nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0));
nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch);
nir_def *start_half_pair = nir_isub(&b, second, first);
/* Load second start/end half-pair and take the difference.
* if (b0_bitfield & (1u << 8)) {
* uint64_t first = load_ssbo(0, pitch + 8);
* uint64_t second = load_ssbo(0, pitch + end_offset + 8);
* uint64_t end_half_pair = second - first;
* uint64_t difference = start_half_pair - end_half_pair;
* }
*/
/* NOTE: 'difference' is assigned only on the then-path of the
* SO_OVERFLOW branch; the nir_if_phi() below selects between it
* and start_half_pair at run time. The C assignment always
* executes at build time, so this is not an uninitialized read.
*/
nir_def *difference;
nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256));
nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); {
first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight));
second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight));
nir_def *end_half_pair = nir_isub(&b, second, first);
difference = nir_isub(&b, start_half_pair, end_half_pair);
}
nir_pop_if(&b, if_so_overflow_mode);
/* uint64_t sum = (x | (uint64_t) y << 32) + difference; */
nir_def *sum = nir_iadd(&b,
nir_pack_64_2x32_split(&b,
nir_load_var(&b, x),
nir_load_var(&b, y)),
nir_if_phi(&b, difference, start_half_pair));
sum = nir_unpack_64_2x32(&b, sum);
/* Increment inner loop iterator.
* i++;
*/
i = nir_iadd(&b, i, one);
nir_store_var(&b, inner_loop_iter, i, 0x1);
/* Update pitch value.
* pitch = i * pair_stride + pitch;
*/
nir_def *incremented_pitch = nir_iadd(&b,
nir_imul(&b, i, nir_channel(&b, buff_1, 1)),
pitch_outer_loop);
nir_store_var(&b, pitch, incremented_pitch, 0x1);
/* Update x and y with the low/high dwords of the 64-bit sum.
* x = sum & 0xffffffff;
* y = sum >> 32;
*/
nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1);
nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1);
/* } while (i < pair_count);
*/
nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2));
nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); {
nir_jump(&b, nir_jump_break);
}
nir_pop_if(&b, if_pair_count_exceeded);
}
nir_pop_loop(&b, loop_inner);
/* Increment pair iterator.
* result_index++;
*/
nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1);
}
nir_pop_loop(&b, loop_outer);
}
nir_pop_if(&b, if_not_z);
}
nir_pop_if(&b, if_one_dword_result);
nir_def *x_value = nir_load_var(&b, x);
nir_def *y_value = nir_load_var(&b, y);
nir_def *z_value = nir_load_var(&b, z);
/* Store accumulated data for chaining.
* if (b0_bitfield & (1u << 1)) {
* store_ssbo(<x, y, z>, 2, 0);
*/
nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two));
nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); {
nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero);
} nir_push_else(&b, if_acc_chaining); {
/* Store result availability.
* } else {
* if (b0_bitfield & (1u << 2)) {
* store_ssbo((~z & 1), 2, 0);
* ...
*/
nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four));
nir_if *if_result_available = nir_push_if(&b, is_result_available); {
nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero);
/* Store full 64 bits result: zero only the high dword (offset 4)
* via the .y write mask, leaving the availability dword intact.
* if (b0_bitfield & (1u << 6)) {
* store_ssbo(<0, 0>, 2, 0);
* }
*/
nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero,
.write_mask = (1u << 1));
}
nir_pop_if(&b, if_result_64_bits);
} nir_push_else(&b, if_result_available); {
/* Store the result only if it is available.
* } else {
* if (~z) {
* ...
*/
nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value));
nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); {
/* ts_x/ts_y are assigned unconditionally at build time inside the
* timestamp branch below; nir_if_phi() merges them with the
* unconverted x/y values at run time.
*/
nir_def *ts_x, *ts_y;
/* Apply timestamp conversion.
* if (b0_bitfield & (1u << 5)) {
* uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000;
* uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency;
* x = ts_converted & 0xffffffff;
* y = ts_converted >> 32;
* }
*/
nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32));
nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); {
/* Add the frequency into the shader for timestamp conversion
* so that the backend can use the full range of optimizations
* for divide-by-constant.
*/
nir_def *clock_crystal_frequency =
nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq);
nir_def *xy_million = nir_imul(&b,
nir_pack_64_2x32_split(&b, x_value, y_value),
nir_imm_int64(&b, 1000000));
nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency);
ts_converted = nir_unpack_64_2x32(&b, ts_converted);
ts_x = nir_channel(&b, ts_converted, 0);
ts_y = nir_channel(&b, ts_converted, 1);
}
nir_pop_if(&b, if_apply_timestamp);
nir_def *nx = nir_if_phi(&b, ts_x, x_value);
nir_def *ny = nir_if_phi(&b, ts_y, y_value);
/* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x;
* y = b0_bitfield & (1u << 3) ? 0 : y;
*/
nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight));
nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny);
nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0)));
nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx);
ny = nir_bcsel(&b, is_convert_to_bool, zero, ny);
/* if (b0_bitfield & (1u << 6)) {
* store_ssbo(<x, y>, 2, 0);
* }
*/
nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero);
} nir_push_else(&b, if_result_64_bits); {
/* Clamping for the 32-bit result path.
* } else {
* x = y ? UINT32_MAX : x;
* x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x;
* store_ssbo(x, 2, 0);
* }
*/
nir_def *is_y = nir_ine(&b, ny, zero);
nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx);
nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128));
nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX));
nx = nir_bcsel(&b, is_signed_32bit_result, min, nx);
nir_store_ssbo(&b, nx, two, zero);
}
nir_pop_if(&b, if_result_64_bits);
}
nir_pop_if(&b, if_bitwise_not_z);
}
nir_pop_if(&b, if_result_available);
}
nir_pop_if(&b, if_acc_chaining);
return create_shader_state(sctx, b.shader);
}

View file

@@ -8,221 +8,6 @@
#include "tgsi/tgsi_text.h"
#include "tgsi/tgsi_ureg.h"
/* Create the compute shader that is used to collect the results.
*
* One compute grid with a single thread is launched for every query result
* buffer. The thread (optionally) reads a previous summary buffer, then
* accumulates data from the query result buffer, and writes the result either
* to a summary buffer to be consumed by the next grid invocation or to the
* user-supplied buffer.
*
* Data layout:
*
* CONST
* 0.x = end_offset
* 0.y = result_stride
* 0.z = result_count
* 0.w = bit field:
* 1: read previously accumulated values
* 2: write accumulated values for chaining
* 4: write result available
* 8: convert result to boolean (0/1)
* 16: only read one dword and use that as result
* 32: apply timestamp conversion
* 64: store full 64 bits result
* 128: store signed 32 bits result
* 256: SO_OVERFLOW mode: take the difference of two successive half-pairs
* 1.x = fence_offset
* 1.y = pair_stride
* 1.z = pair_count
*
* BUFFER[0] = query result buffer
* BUFFER[1] = previous summary buffer
* BUFFER[2] = next summary buffer or user-supplied buffer
*/
void *si_create_query_result_cs(struct si_context *sctx)
{
/* TEMP[0].xy = accumulated result so far
* TEMP[0].z = result not available
*
* TEMP[1].x = current result index
* TEMP[1].y = current pair index
*/
/* TGSI assembly template; the %u in IMM[3] is substituted below with the
* GPU crystal clock frequency used for timestamp conversion.
*/
static const char text_tmpl[] =
"COMP\n"
"PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
"PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
"PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
"DCL BUFFER[0]\n"
"DCL BUFFER[1]\n"
"DCL BUFFER[2]\n"
"DCL CONST[0][0..1]\n"
"DCL TEMP[0..5]\n"
"IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
"IMM[1] UINT32 {1, 2, 4, 8}\n"
"IMM[2] UINT32 {16, 32, 64, 128}\n"
"IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
"IMM[4] UINT32 {256, 0, 0, 0}\n"
"AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
"UIF TEMP[5]\n"
/* Check result availability. */
"LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
"ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
"MOV TEMP[1], TEMP[0].zzzz\n"
"NOT TEMP[0].z, TEMP[0].zzzz\n"
/* Load result if available. */
"UIF TEMP[1]\n"
"LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
"ENDIF\n"
"ELSE\n"
/* Load previously accumulated result if requested. */
"MOV TEMP[0], IMM[0].xxxx\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
"UIF TEMP[4]\n"
"LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
"ENDIF\n"
"MOV TEMP[1].x, IMM[0].xxxx\n"
"BGNLOOP\n"
/* Break if accumulated result so far is not available. */
"UIF TEMP[0].zzzz\n"
"BRK\n"
"ENDIF\n"
/* Break if result_index >= result_count. */
"USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
"UIF TEMP[5]\n"
"BRK\n"
"ENDIF\n"
/* Load fence and check result availability */
"UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
"LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
"ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
"NOT TEMP[0].z, TEMP[0].zzzz\n"
"UIF TEMP[0].zzzz\n"
"BRK\n"
"ENDIF\n"
"MOV TEMP[1].y, IMM[0].xxxx\n"
"BGNLOOP\n"
/* Load start and end. */
"UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
"UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
"UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
"U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"
"AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
"UIF TEMP[5].zzzz\n"
/* Load second start/end half-pair and
* take the difference
*/
"UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
"LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
"LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"
"U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
"U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
"ENDIF\n"
"U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"
/* Increment pair index */
"UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
"USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
"UIF TEMP[5]\n"
"BRK\n"
"ENDIF\n"
"ENDLOOP\n"
/* Increment result index */
"UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
"ENDLOOP\n"
"ENDIF\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
"UIF TEMP[4]\n"
/* Store accumulated data for chaining. */
"STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
"ELSE\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
"UIF TEMP[4]\n"
/* Store result availability. */
"NOT TEMP[0].z, TEMP[0]\n"
"AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
"UIF TEMP[4]\n"
"STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
"ENDIF\n"
"ELSE\n"
/* Store result if it is available. */
"NOT TEMP[4], TEMP[0].zzzz\n"
"UIF TEMP[4]\n"
/* Apply timestamp conversion */
"AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
"UIF TEMP[4]\n"
"U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
"U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
"ENDIF\n"
/* Convert to boolean */
"AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
"UIF TEMP[4]\n"
"U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
"AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
"MOV TEMP[0].y, IMM[0].xxxx\n"
"ENDIF\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
"UIF TEMP[4]\n"
"STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
"ELSE\n"
/* Clamping */
"UIF TEMP[0].yyyy\n"
"MOV TEMP[0].x, IMM[0].wwww\n"
"ENDIF\n"
"AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
"UIF TEMP[4]\n"
"UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
"ENDIF\n"
"STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
"ENDIF\n"
"ENDIF\n"
"ENDIF\n"
"ENDIF\n"
"END\n";
char text[sizeof(text_tmpl) + 32];
struct tgsi_token tokens[1024];
struct pipe_compute_state state = {};
/* Hard code the frequency into the shader so that the backend can
* use the full range of optimizations for divide-by-constant.
*/
snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);
if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
assert(false);
return NULL;
}
state.ir_type = PIPE_SHADER_IR_TGSI;
state.prog = tokens;
return sctx->b.create_compute_state(&sctx->b, &state);
}
/* Create the compute shader that is used to collect the results of gfx10+
* shader queries.
*