radeonsi: convert "create_query_result_cs" shader to nir
Acked-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com> Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/25972>
This commit is contained in:
parent
740a4c3448
commit
c109c3f95c
3 changed files with 381 additions and 216 deletions
|
|
@ -1673,9 +1673,9 @@ void *si_create_clear_buffer_rmw_cs(struct si_context *sctx);
|
|||
void *si_clear_render_target_shader(struct si_context *sctx, enum pipe_texture_target type);
|
||||
void *si_clear_12bytes_buffer_shader(struct si_context *sctx);
|
||||
void *si_create_fmask_expand_cs(struct si_context *sctx, unsigned num_samples, bool is_array);
|
||||
void *si_create_query_result_cs(struct si_context *sctx);
|
||||
|
||||
/* si_shaderlib_tgsi.c */
|
||||
void *si_create_query_result_cs(struct si_context *sctx);
|
||||
void *gfx11_create_sh_query_result_cs(struct si_context *sctx);
|
||||
|
||||
/* gfx11_query.c */
|
||||
|
|
|
|||
|
|
@ -873,3 +873,383 @@ void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
|
|||
*vs = create_shader_state(sctx, b.shader);
|
||||
return *vs;
|
||||
}
|
||||
|
||||
/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *          1: read previously accumulated values
 *          2: write accumulated values for chaining
 *          4: write result available
 *          8: convert result to boolean (0/1)
 *         16: only read one dword and use that as result
 *         32: apply timestamp conversion
 *         64: store full 64 bits result
 *        128: store signed 32 bits result
 *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 * SSBO bindings (matching the loads/stores below):
 *  BUFFER[0] = query result buffer
 *  BUFFER[1] = previous summary buffer
 *  BUFFER[2] = next summary buffer or user-supplied buffer
 *
 * Note on reading this function: the C braces placed after each
 * nir_push_if()/nir_push_loop() only mirror the *shader's* control flow
 * visually. Every builder call executes unconditionally at shader-build
 * time; only the emitted instructions are conditional at shader run time.
 */
void *si_create_query_result_cs(struct si_context *sctx)
{
   const nir_shader_compiler_options *options =
      sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);

   /* Single-thread workgroup: one invocation accumulates the whole buffer. */
   nir_builder b =
      nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "create_query_result_cs");
   b.shader->info.workgroup_size[0] = 1;
   b.shader->info.workgroup_size[1] = 1;
   b.shader->info.workgroup_size[2] = 1;
   b.shader->info.num_ubos = 1;
   b.shader->info.num_ssbos = 3;
   b.shader->num_uniforms = 2;

   /* Frequently used immediates. */
   nir_def *var_undef = nir_undef(&b, 1, 32);
   nir_def *zero = nir_imm_int(&b, 0);
   nir_def *one = nir_imm_int(&b, 1);
   nir_def *two = nir_imm_int(&b, 2);
   nir_def *four = nir_imm_int(&b, 4);
   nir_def *eight = nir_imm_int(&b, 8);
   nir_def *sixteen = nir_imm_int(&b, 16);
   nir_def *thirty_one = nir_imm_int(&b, 31);
   nir_def *sixty_four = nir_imm_int(&b, 64);

   /* uint32_t x, y, z = 0;
    *
    * x/y hold the low/high halves of the accumulated 64-bit result,
    * z is the "result not available" mask (all-ones when unavailable).
    */
   nir_function_impl *e = nir_shader_get_entrypoint(b.shader);
   nir_variable *x = nir_local_variable_create(e, glsl_uint_type(), "x");
   nir_store_var(&b, x, var_undef, 0x1);
   nir_variable *y = nir_local_variable_create(e, glsl_uint_type(), "y");
   nir_store_var(&b, y, var_undef, 0x1);
   nir_variable *z = nir_local_variable_create(e, glsl_uint_type(), "z");
   nir_store_var(&b, z, zero, 0x1);

   /* uint32_t buff_0[4] = load_ubo(0, 0); */
   nir_def *buff_0 = nir_load_ubo(&b, 4, 32, zero, zero, .range_base = 0, .range = 16);
   /* uint32_t buff_1[4] = load_ubo(1, 16); */
   nir_def *buff_1 = nir_load_ubo(&b, 4, 32, zero, sixteen, .range_base = 16, .range = 16);

   /* uint32_t b0_bitfield = buff_0.w; */
   nir_def *b0_bitfield = nir_channel(&b, buff_0, 3);

   /* Check result availability.
    * if (b0_bitfield & (1u << 4)) {
    *    ...
    */
   nir_def *is_one_dword_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixteen));
   nir_if *if_one_dword_result = nir_push_if(&b, is_one_dword_result); {

      /* int32_t value = load_ssbo(0, fence_offset);
       * z = ~(value >> 31);
       *
       * Arithmetic shift of the sign bit produces all-ones when the fence
       * value is negative (result ready), so z becomes 0 in that case.
       */
      nir_def *value = nir_load_ssbo(&b, 1, 32, zero, nir_channel(&b, buff_1, 0));
      nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
      nir_store_var(&b, z, bitmask, 0x1);

      /* Load result if available.
       * if (value < 0) {
       *    uint32_t result[2] = load_ssbo(0, 0);
       *    x = result[0];
       *    y = result[1];
       * }
       */
      nir_if *if_negative = nir_push_if(&b, nir_ilt(&b, value, zero)); {
         nir_def *result = nir_load_ssbo(&b, 2, 32, zero, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
      }
      nir_pop_if(&b, if_negative);
   } nir_push_else(&b, if_one_dword_result); {

      /* } else {
       *    x = 0; y = 0;
       */
      nir_store_var(&b, x, zero, 0x1);
      nir_store_var(&b, y, zero, 0x1);

      /* Load previously accumulated result if requested.
       * if (b0_bitfield & (1u << 0)) {
       *    uint32_t result[3] = load_ssbo(1, 0);
       *    x = result[0];
       *    y = result[1];
       *    z = result[2];
       * }
       */
      nir_def *is_prev_acc_result = nir_i2b(&b, nir_iand(&b, b0_bitfield, one));
      nir_if *if_prev_acc_result = nir_push_if(&b, is_prev_acc_result); {
         nir_def *result = nir_load_ssbo(&b, 3, 32, one, zero);
         nir_store_var(&b, x, nir_channel(&b, result, 0), 0x1);
         nir_store_var(&b, y, nir_channel(&b, result, 1), 0x1);
         nir_store_var(&b, z, nir_channel(&b, result, 2), 0x1);
      }
      nir_pop_if(&b, if_prev_acc_result);

      /* if (!z) {
       *    uint32_t result_index = 0;
       *    uint32_t pitch = 0;
       *    ...
       */
      nir_def *z_value = nir_load_var(&b, z);
      nir_if *if_not_z = nir_push_if(&b, nir_ieq(&b, z_value, zero)); {
         nir_variable *outer_loop_iter =
            nir_local_variable_create(e, glsl_uint_type(), "outer_loop_iter");
         nir_store_var(&b, outer_loop_iter, zero, 0x1);
         nir_variable *pitch = nir_local_variable_create(e, glsl_uint_type(), "pitch");
         nir_store_var(&b, pitch, zero, 0x1);

         /* Outer loop.
          * while (result_index <= result_count) {
          *    ...
          */
         nir_loop *loop_outer = nir_push_loop(&b); {
            nir_def *result_index = nir_load_var(&b, outer_loop_iter);
            nir_def *is_result_index_out_of_bound =
               nir_uge(&b, result_index, nir_channel(&b, buff_0, 2));
            nir_if *if_out_of_bound = nir_push_if(&b, is_result_index_out_of_bound); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_out_of_bound);

            /* Load fence and check result availability.
             * pitch = i * result_stride;
             * uint32_t address = fence_offset + pitch;
             * int32_t value = load_ssbo(0, address);
             * z = ~(value >> 31);
             */
            nir_def *pitch_outer_loop = nir_imul(&b, result_index, nir_channel(&b, buff_0, 1));
            nir_store_var(&b, pitch, pitch_outer_loop, 0x1);
            nir_def *address = nir_iadd(&b, pitch_outer_loop, nir_channel(&b, buff_1, 0));
            nir_def *value = nir_load_ssbo(&b, 1, 32, zero, address);
            nir_def *bitmask = nir_inot(&b, nir_ishr(&b, value, thirty_one));
            nir_store_var(&b, z, bitmask, 0x1);

            /* if (z) {
             *    break;
             * }
             */
            nir_if *if_result_available = nir_push_if(&b, nir_i2b(&b, bitmask)); {
               nir_jump(&b, nir_jump_break);
            }
            nir_pop_if(&b, if_result_available);

            /* Inner loop iterator.
             * uint32_t i = 0;
             */
            nir_variable *inner_loop_iter =
               nir_local_variable_create(e, glsl_uint_type(), "inner_loop_iter");
            nir_store_var(&b, inner_loop_iter, zero, 0x1);

            /* Inner loop.
             * do {
             *    ...
             */
            nir_loop *loop_inner = nir_push_loop(&b); {
               nir_def *pitch_inner_loop = nir_load_var(&b, pitch);
               nir_def *i = nir_load_var(&b, inner_loop_iter);

               /* Load start and end.
                * uint64_t first = load_ssbo(0, pitch);
                * uint64_t second = load_ssbo(0, pitch + end_offset);
                * uint64_t start_half_pair = second - first;
                */
               nir_def *first = nir_load_ssbo(&b, 1, 64, zero, pitch_inner_loop);
               nir_def *new_pitch = nir_iadd(&b, pitch_inner_loop, nir_channel(&b, buff_0, 0));
               nir_def *second = nir_load_ssbo(&b, 1, 64, zero, new_pitch);
               nir_def *start_half_pair = nir_isub(&b, second, first);

               /* Load second start/end half-pair and take the difference.
                * if (b0_bitfield & (1u << 8)) {
                *    uint64_t first = load_ssbo(0, pitch + 8);
                *    uint64_t second = load_ssbo(0, pitch + end_offset + 8);
                *    uint64_t end_half_pair = second - first;
                *    uint64_t difference = start_half_pair - end_half_pair;
                * }
                *
                * NB: `difference` is always assigned at build time (the
                * builder body runs unconditionally); the nir_if_phi below
                * selects between the then-value (difference) and the
                * else-value (start_half_pair) at shader run time.
                */
               nir_def *difference;
               nir_def *is_so_overflow_mode = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 256));
               nir_if *if_so_overflow_mode = nir_push_if(&b, is_so_overflow_mode); {
                  first = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, pitch_inner_loop, eight));
                  second = nir_load_ssbo(&b, 1, 64, zero, nir_iadd(&b, new_pitch, eight));
                  nir_def *end_half_pair = nir_isub(&b, second, first);
                  difference = nir_isub(&b, start_half_pair, end_half_pair);
               }
               nir_pop_if(&b, if_so_overflow_mode);

               /* uint64_t sum = (x | (uint64_t) y << 32) + difference; */
               nir_def *sum = nir_iadd(&b,
                                       nir_pack_64_2x32_split(&b,
                                                              nir_load_var(&b, x),
                                                              nir_load_var(&b, y)),
                                       nir_if_phi(&b, difference, start_half_pair));
               sum = nir_unpack_64_2x32(&b, sum);

               /* Increment inner loop iterator.
                * i++;
                */
               i = nir_iadd(&b, i, one);
               nir_store_var(&b, inner_loop_iter, i, 0x1);

               /* Update pitch value.
                * pitch = i * pair_stride + pitch;
                */
               nir_def *incremented_pitch = nir_iadd(&b,
                                                     nir_imul(&b, i, nir_channel(&b, buff_1, 1)),
                                                     pitch_outer_loop);
               nir_store_var(&b, pitch, incremented_pitch, 0x1);

               /* Update x and y.
                * x = sum.x;
                * y = sum.x >> 32;
                */
               nir_store_var(&b, x, nir_channel(&b, sum, 0), 0x1);
               nir_store_var(&b, y, nir_channel(&b, sum, 1), 0x1);

               /* } while (i < pair_count);
                */
               nir_def *is_pair_count_exceeded = nir_uge(&b, i, nir_channel(&b, buff_1, 2));
               nir_if *if_pair_count_exceeded = nir_push_if(&b, is_pair_count_exceeded); {
                  nir_jump(&b, nir_jump_break);
               }
               nir_pop_if(&b, if_pair_count_exceeded);
            }
            nir_pop_loop(&b, loop_inner);

            /* Increment pair iterator.
             * result_index++;
             */
            nir_store_var(&b, outer_loop_iter, nir_iadd(&b, result_index, one), 0x1);
         }
         nir_pop_loop(&b, loop_outer);
      }
      nir_pop_if(&b, if_not_z);
   }
   nir_pop_if(&b, if_one_dword_result);

   /* Final values of the accumulators after the big if/else above. */
   nir_def *x_value = nir_load_var(&b, x);
   nir_def *y_value = nir_load_var(&b, y);
   nir_def *z_value = nir_load_var(&b, z);

   /* Store accumulated data for chaining.
    * if (b0_bitfield & (1u << 1)) {
    *    store_ssbo(<x, y, z>, 2, 0);
    */
   nir_def *is_acc_chaining = nir_i2b(&b, nir_iand(&b, b0_bitfield, two));
   nir_if *if_acc_chaining = nir_push_if(&b, is_acc_chaining); {
      nir_store_ssbo(&b, nir_vec3(&b, x_value, y_value, z_value), two, zero);
   } nir_push_else(&b, if_acc_chaining); {

      /* Store result availability.
       * } else {
       *    if (b0_bitfield & (1u << 2)) {
       *       store_ssbo((~z & 1), 2, 0);
       *       ...
       */
      nir_def *is_result_available = nir_i2b(&b, nir_iand(&b, b0_bitfield, four));
      nir_if *if_result_available = nir_push_if(&b, is_result_available); {
         nir_store_ssbo(&b, nir_iand(&b, nir_inot(&b, z_value), one), two, zero);

         /* Store full 64 bits result.
          * if (b0_bitfield & (1u << 6)) {
          *    store_ssbo(<0, 0>, 2, 0);
          * }
          *
          * write_mask 0x2 writes only the .y dword (the high half).
          */
         nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
         nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
            nir_store_ssbo(&b, nir_imm_ivec2(&b, 0, 0), two, zero,
                           .write_mask = (1u << 1));
         }
         nir_pop_if(&b, if_result_64_bits);
      } nir_push_else(&b, if_result_available); {

         /* } else {
          *    if (~z) {
          *       ...
          */
         nir_def *is_bitwise_not_z = nir_i2b(&b, nir_inot(&b, z_value));
         nir_if *if_bitwise_not_z = nir_push_if(&b, is_bitwise_not_z); {
            nir_def *ts_x, *ts_y;

            /* Apply timestamp conversion.
             * if (b0_bitfield & (1u << 5)) {
             *    uint64_t xy_million = (x | (uint64_t) y << 32) * (uint64_t) 1000000;
             *    uint64_t ts_converted = xy_million / (uint64_t) clock_crystal_frequency;
             *    x = ts_converted.x;
             *    y = ts_converted.x >> 32;
             * }
             */
            nir_def *is_apply_timestamp = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 32));
            nir_if *if_apply_timestamp = nir_push_if(&b, is_apply_timestamp); {
               /* Add the frequency into the shader for timestamp conversion
                * so that the backend can use the full range of optimizations
                * for divide-by-constant.
                */
               nir_def *clock_crystal_frequency =
                  nir_imm_int64(&b, sctx->screen->info.clock_crystal_freq);

               nir_def *xy_million = nir_imul(&b,
                                              nir_pack_64_2x32_split(&b, x_value, y_value),
                                              nir_imm_int64(&b, 1000000));
               nir_def *ts_converted = nir_udiv(&b, xy_million, clock_crystal_frequency);
               ts_converted = nir_unpack_64_2x32(&b, ts_converted);
               ts_x = nir_channel(&b, ts_converted, 0);
               ts_y = nir_channel(&b, ts_converted, 1);
            }
            nir_pop_if(&b, if_apply_timestamp);

            /* nx/ny = converted values if the timestamp path ran, raw x/y
             * otherwise (nir_if_phi(then_value, else_value)).
             */
            nir_def *nx = nir_if_phi(&b, ts_x, x_value);
            nir_def *ny = nir_if_phi(&b, ts_y, y_value);

            /* x = b0_bitfield & (1u << 3) ? ((x | (uint64_t) y << 32) != 0) : x;
             * y = b0_bitfield & (1u << 3) ? 0 : y;
             */
            nir_def *is_convert_to_bool = nir_i2b(&b, nir_iand(&b, b0_bitfield, eight));
            nir_def *xy = nir_pack_64_2x32_split(&b, nx, ny);
            nir_def *is_xy = nir_b2i32(&b, nir_ine(&b, xy, nir_imm_int64(&b, 0)));
            nx = nir_bcsel(&b, is_convert_to_bool, is_xy, nx);
            ny = nir_bcsel(&b, is_convert_to_bool, zero, ny);

            /* if (b0_bitfield & (1u << 6)) {
             *    store_ssbo(<x, y>, 2, 0);
             * }
             */
            nir_def *is_result_64_bits = nir_i2b(&b, nir_iand(&b, b0_bitfield, sixty_four));
            nir_if *if_result_64_bits = nir_push_if(&b, is_result_64_bits); {
               nir_store_ssbo(&b, nir_vec2(&b, nx, ny), two, zero);
            } nir_push_else(&b, if_result_64_bits); {

               /* Clamping.
                * } else {
                *    x = y ? UINT32_MAX : x;
                *    x = b0_bitfield & (1u << 7) ? min(x, INT_MAX) : x;
                *    store_ssbo(x, 2, 0);
                * }
                */
               nir_def *is_y = nir_ine(&b, ny, zero);
               nx = nir_bcsel(&b, is_y, nir_imm_int(&b, UINT32_MAX), nx);
               nir_def *is_signed_32bit_result = nir_i2b(&b, nir_iand_imm(&b, b0_bitfield, 128));
               nir_def *min = nir_umin(&b, nx, nir_imm_int(&b, INT_MAX));
               nx = nir_bcsel(&b, is_signed_32bit_result, min, nx);
               nir_store_ssbo(&b, nx, two, zero);
            }
            nir_pop_if(&b, if_result_64_bits);
         }
         nir_pop_if(&b, if_bitwise_not_z);
      }
      nir_pop_if(&b, if_result_available);
   }
   nir_pop_if(&b, if_acc_chaining);

   return create_shader_state(sctx, b.shader);
}
|
||||
|
|
|
|||
|
|
@ -8,221 +8,6 @@
|
|||
#include "tgsi/tgsi_text.h"
|
||||
#include "tgsi/tgsi_ureg.h"
|
||||
|
||||
/* Create the compute shader that is used to collect the results.
 *
 * One compute grid with a single thread is launched for every query result
 * buffer. The thread (optionally) reads a previous summary buffer, then
 * accumulates data from the query result buffer, and writes the result either
 * to a summary buffer to be consumed by the next grid invocation or to the
 * user-supplied buffer.
 *
 * Data layout:
 *
 * CONST
 *  0.x = end_offset
 *  0.y = result_stride
 *  0.z = result_count
 *  0.w = bit field:
 *          1: read previously accumulated values
 *          2: write accumulated values for chaining
 *          4: write result available
 *          8: convert result to boolean (0/1)
 *         16: only read one dword and use that as result
 *         32: apply timestamp conversion
 *         64: store full 64 bits result
 *        128: store signed 32 bits result
 *        256: SO_OVERFLOW mode: take the difference of two successive half-pairs
 *  1.x = fence_offset
 *  1.y = pair_stride
 *  1.z = pair_count
 *
 * BUFFER[0] = query result buffer
 * BUFFER[1] = previous summary buffer
 * BUFFER[2] = next summary buffer or user-supplied buffer
 */
void *si_create_query_result_cs(struct si_context *sctx)
{
   /* TEMP[0].xy = accumulated result so far
    * TEMP[0].z = result not available
    *
    * TEMP[1].x = current result index
    * TEMP[1].y = current pair index
    */
   static const char text_tmpl[] =
      "COMP\n"
      "PROPERTY CS_FIXED_BLOCK_WIDTH 1\n"
      "PROPERTY CS_FIXED_BLOCK_HEIGHT 1\n"
      "PROPERTY CS_FIXED_BLOCK_DEPTH 1\n"
      "DCL BUFFER[0]\n"
      "DCL BUFFER[1]\n"
      "DCL BUFFER[2]\n"
      "DCL CONST[0][0..1]\n"
      "DCL TEMP[0..5]\n"
      "IMM[0] UINT32 {0, 31, 2147483647, 4294967295}\n"
      "IMM[1] UINT32 {1, 2, 4, 8}\n"
      "IMM[2] UINT32 {16, 32, 64, 128}\n"
      "IMM[3] UINT32 {1000000, 0, %u, 0}\n" /* for timestamp conversion */
      "IMM[4] UINT32 {256, 0, 0, 0}\n"

      "AND TEMP[5], CONST[0][0].wwww, IMM[2].xxxx\n"
      "UIF TEMP[5]\n"
         /* Check result availability. */
         "LOAD TEMP[1].x, BUFFER[0], CONST[0][1].xxxx\n"
         "ISHR TEMP[0].z, TEMP[1].xxxx, IMM[0].yyyy\n"
         "MOV TEMP[1], TEMP[0].zzzz\n"
         "NOT TEMP[0].z, TEMP[0].zzzz\n"

         /* Load result if available. */
         "UIF TEMP[1]\n"
            "LOAD TEMP[0].xy, BUFFER[0], IMM[0].xxxx\n"
         "ENDIF\n"
      "ELSE\n"
         /* Load previously accumulated result if requested. */
         "MOV TEMP[0], IMM[0].xxxx\n"
         "AND TEMP[4], CONST[0][0].wwww, IMM[1].xxxx\n"
         "UIF TEMP[4]\n"
            "LOAD TEMP[0].xyz, BUFFER[1], IMM[0].xxxx\n"
         "ENDIF\n"

         "MOV TEMP[1].x, IMM[0].xxxx\n"
         "BGNLOOP\n"
            /* Break if accumulated result so far is not available. */
            "UIF TEMP[0].zzzz\n"
               "BRK\n"
            "ENDIF\n"

            /* Break if result_index >= result_count. */
            "USGE TEMP[5], TEMP[1].xxxx, CONST[0][0].zzzz\n"
            "UIF TEMP[5]\n"
               "BRK\n"
            "ENDIF\n"

            /* Load fence and check result availability */
            "UMAD TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy, CONST[0][1].xxxx\n"
            "LOAD TEMP[5].x, BUFFER[0], TEMP[5].xxxx\n"
            "ISHR TEMP[0].z, TEMP[5].xxxx, IMM[0].yyyy\n"
            "NOT TEMP[0].z, TEMP[0].zzzz\n"
            "UIF TEMP[0].zzzz\n"
               "BRK\n"
            "ENDIF\n"

            "MOV TEMP[1].y, IMM[0].xxxx\n"
            "BGNLOOP\n"
               /* Load start and end. */
               "UMUL TEMP[5].x, TEMP[1].xxxx, CONST[0][0].yyyy\n"
               "UMAD TEMP[5].x, TEMP[1].yyyy, CONST[0][1].yyyy, TEMP[5].xxxx\n"
               "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"

               "UADD TEMP[5].y, TEMP[5].xxxx, CONST[0][0].xxxx\n"
               "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

               "U64ADD TEMP[4].xy, TEMP[3], -TEMP[2]\n"

               "AND TEMP[5].z, CONST[0][0].wwww, IMM[4].xxxx\n"
               "UIF TEMP[5].zzzz\n"
                  /* Load second start/end half-pair and
                   * take the difference
                   */
                  "UADD TEMP[5].xy, TEMP[5], IMM[1].wwww\n"
                  "LOAD TEMP[2].xy, BUFFER[0], TEMP[5].xxxx\n"
                  "LOAD TEMP[3].xy, BUFFER[0], TEMP[5].yyyy\n"

                  "U64ADD TEMP[3].xy, TEMP[3], -TEMP[2]\n"
                  "U64ADD TEMP[4].xy, TEMP[4], -TEMP[3]\n"
               "ENDIF\n"

               "U64ADD TEMP[0].xy, TEMP[0], TEMP[4]\n"

               /* Increment pair index */
               "UADD TEMP[1].y, TEMP[1].yyyy, IMM[1].xxxx\n"
               "USGE TEMP[5], TEMP[1].yyyy, CONST[0][1].zzzz\n"
               "UIF TEMP[5]\n"
                  "BRK\n"
               "ENDIF\n"
            "ENDLOOP\n"

            /* Increment result index */
            "UADD TEMP[1].x, TEMP[1].xxxx, IMM[1].xxxx\n"
         "ENDLOOP\n"
      "ENDIF\n"

      "AND TEMP[4], CONST[0][0].wwww, IMM[1].yyyy\n"
      "UIF TEMP[4]\n"
         /* Store accumulated data for chaining. */
         "STORE BUFFER[2].xyz, IMM[0].xxxx, TEMP[0]\n"
      "ELSE\n"
         "AND TEMP[4], CONST[0][0].wwww, IMM[1].zzzz\n"
         "UIF TEMP[4]\n"
            /* Store result availability. */
            "NOT TEMP[0].z, TEMP[0]\n"
            "AND TEMP[0].z, TEMP[0].zzzz, IMM[1].xxxx\n"
            "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].zzzz\n"

            "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
            "UIF TEMP[4]\n"
               "STORE BUFFER[2].y, IMM[0].xxxx, IMM[0].xxxx\n"
            "ENDIF\n"
         "ELSE\n"
            /* Store result if it is available. */
            "NOT TEMP[4], TEMP[0].zzzz\n"
            "UIF TEMP[4]\n"
               /* Apply timestamp conversion */
               "AND TEMP[4], CONST[0][0].wwww, IMM[2].yyyy\n"
               "UIF TEMP[4]\n"
                  "U64MUL TEMP[0].xy, TEMP[0], IMM[3].xyxy\n"
                  "U64DIV TEMP[0].xy, TEMP[0], IMM[3].zwzw\n"
               "ENDIF\n"

               /* Convert to boolean */
               "AND TEMP[4], CONST[0][0].wwww, IMM[1].wwww\n"
               "UIF TEMP[4]\n"
                  "U64SNE TEMP[0].x, TEMP[0].xyxy, IMM[4].zwzw\n"
                  "AND TEMP[0].x, TEMP[0].xxxx, IMM[1].xxxx\n"
                  "MOV TEMP[0].y, IMM[0].xxxx\n"
               "ENDIF\n"

               "AND TEMP[4], CONST[0][0].wwww, IMM[2].zzzz\n"
               "UIF TEMP[4]\n"
                  "STORE BUFFER[2].xy, IMM[0].xxxx, TEMP[0].xyxy\n"
               "ELSE\n"
                  /* Clamping */
                  "UIF TEMP[0].yyyy\n"
                     "MOV TEMP[0].x, IMM[0].wwww\n"
                  "ENDIF\n"

                  "AND TEMP[4], CONST[0][0].wwww, IMM[2].wwww\n"
                  "UIF TEMP[4]\n"
                     "UMIN TEMP[0].x, TEMP[0].xxxx, IMM[0].zzzz\n"
                  "ENDIF\n"

                  "STORE BUFFER[2].x, IMM[0].xxxx, TEMP[0].xxxx\n"
               "ENDIF\n"
            "ENDIF\n"
         "ENDIF\n"
      "ENDIF\n"

      "END\n";

   /* +32 leaves room for the %u expansion of clock_crystal_freq. */
   char text[sizeof(text_tmpl) + 32];
   struct tgsi_token tokens[1024];
   struct pipe_compute_state state = {};

   /* Hard code the frequency into the shader so that the backend can
    * use the full range of optimizations for divide-by-constant.
    */
   snprintf(text, sizeof(text), text_tmpl, sctx->screen->info.clock_crystal_freq);

   if (!tgsi_text_translate(text, tokens, ARRAY_SIZE(tokens))) {
      /* Translation of this fixed template should never fail. */
      assert(false);
      return NULL;
   }

   state.ir_type = PIPE_SHADER_IR_TGSI;
   state.prog = tokens;

   return sctx->b.create_compute_state(&sctx->b, &state);
}
|
||||
|
||||
/* Create the compute shader that is used to collect the results of gfx10+
|
||||
* shader queries.
|
||||
*
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue