d3d12: ARB_query_buffer_object and GL4.4

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26156>
2023-11-10 14:48:07 -08:00 · 2023-11-10 14:48:07 -08:00 · 9ef621ec2e
commit 9ef621ec2e
parent 6384ccd1cd
8 changed files with 339 additions and 90 deletions
--- a/docs/features.txt
+++ b/docs/features.txt
@ -192,12 +192,12 @@ GL 4.3, GLSL 4.30 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, v
  GL_ARB_vertex_attrib_binding                          DONE (all drivers)


-GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, virgl, zink, iris, crocus/gen7.5+
+GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, virgl, zink, iris, crocus/gen7.5+, d3d12

  GL_MAX_VERTEX_ATTRIB_STRIDE                           DONE (all drivers)
-  GL_ARB_buffer_storage                                 DONE (freedreno, nv50, v3d, vc4, lima, panfrost, asahi, d3d12, softpipe, etnaviv, crocus)
+  GL_ARB_buffer_storage                                 DONE (freedreno, nv50, v3d, vc4, lima, panfrost, asahi, softpipe, etnaviv, crocus)
  GL_ARB_clear_texture                                  DONE (all drivers)
-  GL_ARB_enhanced_layouts                               DONE (freedreno/a3xx+, nv50, softpipe, crocus, d3d12)
+  GL_ARB_enhanced_layouts                               DONE (freedreno/a3xx+, nv50, softpipe, crocus)
  - compile-time constant expressions                   DONE
  - explicit byte offsets for blocks                    DONE
  - forced alignment within blocks                      DONE
@ -206,9 +206,9 @@ GL 4.4, GLSL 4.40 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, v
  - input/output block locations                        DONE
  GL_ARB_multi_bind                                     DONE (all drivers)
  GL_ARB_query_buffer_object                            DONE (freedreno/a6xx)
-  GL_ARB_texture_mirror_clamp_to_edge                   DONE (freedreno, nv50, softpipe, v3d, panfrost, asahi, crocus, d3d12)
-  GL_ARB_texture_stencil8                               DONE (freedreno, nv50, softpipe, v3d, panfrost, d3d12, asahi)
-  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (freedreno, nv50, softpipe, panfrost, d3d12, asahi, crocus)
+  GL_ARB_texture_mirror_clamp_to_edge                   DONE (freedreno, nv50, softpipe, v3d, panfrost, asahi, crocus)
+  GL_ARB_texture_stencil8                               DONE (freedreno, nv50, softpipe, v3d, panfrost, asahi)
+  GL_ARB_vertex_type_10f_11f_11f_rev                    DONE (freedreno, nv50, softpipe, panfrost, asahi, crocus)

 GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, virgl, zink, iris, crocus/gen7.5+

--- a/src/gallium/drivers/d3d12/d3d12_batch.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_batch.cpp
@ -230,6 +230,9 @@ d3d12_start_batch(struct d3d12_context *ctx, struct d3d12_batch *batch)
         batch->has_errors = true;
         return;
      }
+      if (FAILED(ctx->cmdlist->QueryInterface(IID_PPV_ARGS(&ctx->cmdlist2)))) {
+         ctx->cmdlist2 = nullptr;
+      }
      if (FAILED(ctx->cmdlist->QueryInterface(IID_PPV_ARGS(&ctx->cmdlist8)))) {
         ctx->cmdlist8 = nullptr;
      }
--- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.cpp
@ -212,6 +212,170 @@ get_draw_auto(const nir_shader_compiler_options *options)
   return b.shader;
 }

+static struct nir_shader *
+get_query_resolve(const nir_shader_compiler_options *options, const d3d12_compute_transform_key *key)
+{
+   /* Builds the compute shader that accumulates raw 64-bit query data. SSBO bindings [0, num_subqueries)
+    * are the inputs and, unless resolving in place, the next binding is the output. State var channel i
+    * holds sub-query i's result count; channel 3 holds the output index in output-sized elements. */
+   nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, options, "QueryResolve");
+   uint32_t bit_size = key->query_resolve.is_64bit ? 64 : 32;
+   const struct glsl_type *value_type = glsl_uintN_t_type(bit_size);
+
+   assert(!key->query_resolve.is_resolve_in_place || (key->query_resolve.is_64bit && key->query_resolve.num_subqueries == 1));
+   assert(key->query_resolve.num_subqueries == 1 || key->query_resolve.pipe_query_type == PIPE_QUERY_PRIMITIVES_GENERATED);
+   assert(key->query_resolve.num_subqueries <= 3); /* Fourth state var is an output offset */
+
+   nir_variable *inputs[3];
+   for (uint32_t i = 0; i < key->query_resolve.num_subqueries; ++i) {
+      /* Inputs are always 64-bit */
+      inputs[i] = nir_variable_create(b.shader, nir_var_mem_ssbo, glsl_array_type(glsl_uint64_t_type(), 0, 8), "input");
+      inputs[i]->data.binding = i;
+   }
+   nir_variable *output = inputs[0];
+   if (!key->query_resolve.is_resolve_in_place) {
+      output = nir_variable_create(b.shader, nir_var_mem_ssbo, glsl_array_type(value_type, 0, bit_size / 8), "output");
+      output->data.binding = key->query_resolve.num_subqueries;
+   }
+
+   /* How many entries in each sub-query is passed via root constants */
+   nir_variable *state_var = nullptr;
+   nir_def *state_var_data = d3d12_get_state_var(&b, D3D12_STATE_VAR_TRANSFORM_GENERIC0, "state_var", glsl_uvec4_type(), &state_var);
+
+   /* For in-place resolves, we resolve each field of the query. Otherwise, resolve one field into the dest */
+   nir_variable *results[sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64)];
+   uint32_t num_result_values = 1;
+
+   if (key->query_resolve.is_resolve_in_place) {
+      if (key->query_resolve.pipe_query_type == PIPE_QUERY_PIPELINE_STATISTICS)
+         num_result_values = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64);
+      else if (key->query_resolve.pipe_query_type == PIPE_QUERY_SO_STATISTICS)
+         num_result_values = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64);
+   }
+   /* Timer values are always accumulated as 64-bit so they can be scaled before any narrowing */
+   uint32_t var_bit_size = key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED ||
+                           key->query_resolve.pipe_query_type == PIPE_QUERY_TIMESTAMP ? 64 : bit_size;
+   for (uint32_t i = 0; i < num_result_values; ++i) {
+      results[i] = nir_local_variable_create(b.impl, glsl_uintN_t_type(var_bit_size), "result");
+      nir_store_var(&b, results[i], nir_imm_intN_t(&b, 0, var_bit_size), 1);
+   }
+
+   /* For each subquery... */
+   for (uint32_t i = 0; i < key->query_resolve.num_subqueries; ++i) {
+      nir_def *num_results = nir_channel(&b, state_var_data, i);
+
+      uint32_t subquery_index = key->query_resolve.num_subqueries == 1 ?
+         key->query_resolve.single_subquery_index : i;
+      uint32_t base_offset = 0;
+      uint32_t stride = 0;
+      switch (key->query_resolve.pipe_query_type) {
+      case PIPE_QUERY_OCCLUSION_COUNTER:
+      case PIPE_QUERY_OCCLUSION_PREDICATE:
+      case PIPE_QUERY_OCCLUSION_PREDICATE_CONSERVATIVE:
+      case PIPE_QUERY_TIMESTAMP:
+         stride = 1;
+         break;
+      case PIPE_QUERY_TIME_ELAPSED:
+         stride = 2;
+         break;
+      case PIPE_QUERY_SO_STATISTICS:
+      case PIPE_QUERY_PRIMITIVES_EMITTED:
+         stride = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64);
+         break;
+      case PIPE_QUERY_PRIMITIVES_GENERATED:
+         if (subquery_index == 0)
+            stride = sizeof(D3D12_QUERY_DATA_SO_STATISTICS) / sizeof(UINT64);
+         else
+            stride = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64);
+         if (!key->query_resolve.is_resolve_in_place) {
+            if (subquery_index == 1)
+               base_offset = offsetof(D3D12_QUERY_DATA_PIPELINE_STATISTICS, GSPrimitives) / sizeof(UINT64);
+            else if (subquery_index == 2)
+               base_offset = offsetof(D3D12_QUERY_DATA_PIPELINE_STATISTICS, IAPrimitives) / sizeof(UINT64);
+         }
+         break;
+      case PIPE_QUERY_PIPELINE_STATISTICS:
+         stride = sizeof(D3D12_QUERY_DATA_PIPELINE_STATISTICS) / sizeof(UINT64);
+         break;
+      default:
+         unreachable("Unhandled query resolve");
+      }
+      /* A single-field resolve reads only the requested field of each result */
+      if (!key->query_resolve.is_resolve_in_place && key->query_resolve.num_subqueries == 1)
+         base_offset = key->query_resolve.single_result_field_offset;
+
+      nir_def *base_array_index = nir_imm_int(&b, base_offset);
+
+      /* For each query result in this subquery... */
+      nir_variable *loop_counter = nir_local_variable_create(b.impl, glsl_uint_type(), "loop_counter");
+      nir_store_var(&b, loop_counter, nir_imm_int(&b, 0), 1);
+      nir_loop *loop = nir_push_loop(&b);
+
+      nir_def *loop_counter_value = nir_load_var(&b, loop_counter);
+      nir_if *nif = nir_push_if(&b, nir_ieq(&b, loop_counter_value, num_results));
+      nir_jump(&b, nir_jump_break);
+      nir_pop_if(&b, nif);
+
+      /* For each field in the query result, accumulate */
+      nir_def *array_index = nir_iadd(&b, nir_imul_imm(&b, loop_counter_value, stride), base_array_index);
+      for (uint32_t j = 0; j < num_result_values; ++j) {
+         nir_def *new_value;
+         if (key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED) {
+            assert(j == 0 && i == 0);
+            nir_def *start = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, array_index, 8));
+            nir_def *end = nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, 1), 8));
+            new_value = nir_isub(&b, end, start);
+         } else {
+            new_value = nir_u2uN(&b, nir_load_ssbo(&b, 1, 64, nir_imm_int(&b, i), nir_imul_imm(&b, nir_iadd_imm(&b, array_index, j), 8)), var_bit_size);
+         }
+         nir_store_var(&b, results[j], nir_iadd(&b, nir_load_var(&b, results[j]), new_value), 1);
+      }
+
+      nir_store_var(&b, loop_counter, nir_iadd_imm(&b, loop_counter_value, 1), 1);
+      nir_pop_loop(&b, loop);
+   }
+
+   /* Results are accumulated, now store the final values */
+   nir_def *output_base_index = nir_channel(&b, state_var_data, 3);
+   for (uint32_t i = 0; i < num_result_values; ++i) {
+      /* In-place: each field goes back to its own slot. Otherwise the one result lands exactly at the
+       * output index; the requested source field was already selected on load through base_offset. */
+      uint32_t field_offset = key->query_resolve.is_resolve_in_place ? i : 0;
+
+      /* When resolving time elapsed in-place, write [0, time], as the only special case */
+      if (key->query_resolve.is_resolve_in_place &&
+          key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED) {
+         nir_store_ssbo(&b, nir_imm_int64(&b, 0), nir_imm_int(&b, output->data.binding),
+                        nir_imul_imm(&b, output_base_index, bit_size / 8), 1, (gl_access_qualifier)0, bit_size / 8, 0);
+         field_offset++;
+      }
+      nir_def *result_val = nir_load_var(&b, results[i]);
+      if (!key->query_resolve.is_resolve_in_place &&
+          (key->query_resolve.pipe_query_type == PIPE_QUERY_TIME_ELAPSED ||
+           key->query_resolve.pipe_query_type == PIPE_QUERY_TIMESTAMP)) {
+         result_val = nir_f2u64(&b, nir_fmul_imm(&b, nir_u2f64(&b, result_val), key->query_resolve.timestamp_multiplier));
+
+         if (!key->query_resolve.is_64bit) {
+            nir_alu_type rounding_type = key->query_resolve.is_signed ? nir_type_int : nir_type_uint;
+            nir_alu_type src_round = (nir_alu_type)(rounding_type | 64);
+            nir_alu_type dst_round = (nir_alu_type)(rounding_type | bit_size);
+            result_val = nir_convert_alu_types(&b, bit_size, result_val, src_round, dst_round, nir_rounding_mode_undef, true);
+         }
+      }
+      nir_store_ssbo(&b, result_val, nir_imm_int(&b, output->data.binding),
+                     nir_imul_imm(&b, nir_iadd_imm(&b, output_base_index, field_offset), bit_size / 8),
+                     1, (gl_access_qualifier)0, bit_size / 8, 0);
+   }
+
+   nir_validate_shader(b.shader, "creation");
+   b.shader->info.num_ssbos = key->query_resolve.num_subqueries + !key->query_resolve.is_resolve_in_place;
+   b.shader->info.num_ubos = 0;
+
+   NIR_PASS_V(b.shader, nir_lower_convert_alu_types, NULL);
+
+   return b.shader;
+}
+
 static struct nir_shader *
 create_compute_transform(const nir_shader_compiler_options *options, const d3d12_compute_transform_key *key)
 {
@ -224,6 +388,8 @@ create_compute_transform(const nir_shader_compiler_options *options, const d3d12
      return get_fake_so_buffer_vertex_count(options);
   case d3d12_compute_transform_type::draw_auto:
      return get_draw_auto(options);
+   case d3d12_compute_transform_type::query_resolve:
+      return get_query_resolve(options, key);
   default:
      unreachable("Invalid transform");
   }
--- a/src/gallium/drivers/d3d12/d3d12_compute_transforms.h
+++ b/src/gallium/drivers/d3d12/d3d12_compute_transforms.h
@ -45,6 +45,8 @@ enum class d3d12_compute_transform_type
   fake_so_buffer_vertex_count,
   /* Append a buffer filled size with (vertex count, 1, 0, 0) */
   draw_auto,
+   /* Accumulate queries together and write a 32-bit or 64-bit result */
+   query_resolve,
   max,
 };

@ -67,6 +69,23 @@ struct d3d12_compute_transform_key
            uint16_t size;
         } ranges[PIPE_MAX_SO_OUTPUTS];
      } fake_so_buffer_copy_back;
+
+      struct {
+         /* true means the accumulation should be done as uint64, else uint32. */
+         uint8_t is_64bit : 1;
+         /* true means output is written where input[0] was, else output is a separate buffer.
+          * true also means all fields are accumulated, else single_result_field_offset determines
+          * which field is resolved. Implies num_subqueries == 1. */
+         uint8_t is_resolve_in_place : 1;
+         /* Indicates how many subqueries to accumulate together into a final result. When
+          * set to 1, single_subquery_index determines where the data comes from. */
+         uint8_t num_subqueries : 2;
+         uint8_t pipe_query_type : 4;
+         uint8_t single_subquery_index : 2;
+         uint8_t single_result_field_offset : 4;
+         uint8_t is_signed : 1;
+         double timestamp_multiplier;
+      } query_resolve;
   };
 };

@ -83,7 +102,7 @@ struct d3d12_compute_transform_save_restore
 {
   struct d3d12_shader_selector *cs;
   struct pipe_constant_buffer cbuf0;
-   struct pipe_shader_buffer ssbos[2];
+   struct pipe_shader_buffer ssbos[4];
 };

 void
--- a/src/gallium/drivers/d3d12/d3d12_context.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_context.cpp
@ -98,6 +98,8 @@ d3d12_context_destroy(struct pipe_context *pctx)
   for (unsigned i = 0; i < ARRAY_SIZE(ctx->batches); ++i)
      d3d12_destroy_batch(ctx, &ctx->batches[i]);
   ctx->cmdlist->Release();
+   if (ctx->cmdlist2)
+      ctx->cmdlist2->Release();
   if (ctx->cmdlist8)
      ctx->cmdlist8->Release();
   d3d12_descriptor_pool_free(ctx->sampler_pool);
--- a/src/gallium/drivers/d3d12/d3d12_context.h
+++ b/src/gallium/drivers/d3d12/d3d12_context.h
@ -257,6 +257,7 @@ struct d3d12_context {

   uint64_t submit_id;
   ID3D12GraphicsCommandList *cmdlist;
+   ID3D12GraphicsCommandList2 *cmdlist2;
   ID3D12GraphicsCommandList8 *cmdlist8;
   ID3D12GraphicsCommandList *state_fixup_cmdlist;

--- a/src/gallium/drivers/d3d12/d3d12_query.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_query.cpp
@ -23,6 +23,7 @@

 #include "d3d12_query.h"
 #include "d3d12_compiler.h"
+#include "d3d12_compute_transforms.h"
 #include "d3d12_context.h"
 #include "d3d12_resource.h"
 #include "d3d12_screen.h"
@ -184,9 +185,9 @@ d3d12_release_query(struct pipe_context *pctx,
 }

 static bool
-accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent,
-                     unsigned sub_query,
-                     union pipe_query_result *result, bool write)
+accumulate_subresult_cpu(struct d3d12_context *ctx, struct d3d12_query *q_parent,
+                         unsigned sub_query,
+                         union pipe_query_result *result)
 {
   struct pipe_transfer *transfer = NULL;
   struct d3d12_screen *screen = d3d12_screen(ctx->base.screen);
@ -194,8 +195,6 @@ accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent,
   unsigned access = PIPE_MAP_READ;
   void *results;

-   if (write)
-      access |= PIPE_MAP_WRITE;
   access |= PIPE_MAP_UNSYNCHRONIZED;

   results = pipe_buffer_map_range(&ctx->base, q->buffer, q->buffer_offset,
@ -256,32 +255,6 @@ accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent,
      }
   }

-   if (write) {
-      if (q->d3d12qtype == D3D12_QUERY_TYPE_PIPELINE_STATISTICS) {
-         results_stats[0].IAVertices = result->pipeline_statistics.ia_vertices;
-         results_stats[0].IAPrimitives = result->pipeline_statistics.ia_primitives;
-         results_stats[0].VSInvocations = result->pipeline_statistics.vs_invocations;
-         results_stats[0].GSInvocations = result->pipeline_statistics.gs_invocations;
-         results_stats[0].GSPrimitives = result->pipeline_statistics.gs_primitives;
-         results_stats[0].CInvocations = result->pipeline_statistics.c_invocations;
-         results_stats[0].CPrimitives = result->pipeline_statistics.c_primitives;
-         results_stats[0].PSInvocations = result->pipeline_statistics.ps_invocations;
-         results_stats[0].HSInvocations = result->pipeline_statistics.hs_invocations;
-         results_stats[0].DSInvocations = result->pipeline_statistics.ds_invocations;
-         results_stats[0].CSInvocations = result->pipeline_statistics.cs_invocations;
-      } else if (d3d12_query_heap_type(q_parent->type, sub_query) == D3D12_QUERY_HEAP_TYPE_SO_STATISTICS) {
-         results_so[0].NumPrimitivesWritten = result->so_statistics.num_primitives_written;
-         results_so[0].PrimitivesStorageNeeded = result->so_statistics.primitives_storage_needed;
-      } else {
-         if (unlikely(q->d3d12qtype == D3D12_QUERY_TYPE_TIMESTAMP)) {
-            results_u64[0] = 0;
-            results_u64[1] = result->u64;
-         } else {
-            results_u64[0] = result->u64;
-         }
-      }
-   }
-
   pipe_buffer_unmap(&ctx->base, transfer);

   if (q->d3d12qtype == D3D12_QUERY_TYPE_TIMESTAMP)
@ -291,33 +264,33 @@ accumulate_subresult(struct d3d12_context *ctx, struct d3d12_query *q_parent,
 }

 static bool
-accumulate_result(struct d3d12_context *ctx, struct d3d12_query *q,
-                  union pipe_query_result *result, bool write)
+accumulate_result_cpu(struct d3d12_context *ctx, struct d3d12_query *q,
+                      union pipe_query_result *result)
 {
   union pipe_query_result local_result;

   switch (q->type) {
   case PIPE_QUERY_PRIMITIVES_GENERATED:
-      if (!accumulate_subresult(ctx, q, 0, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 0, &local_result))
         return false;
      result->u64 = local_result.so_statistics.primitives_storage_needed;

-      if (!accumulate_subresult(ctx, q, 1, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 1, &local_result))
         return false;
      result->u64 += local_result.pipeline_statistics.gs_primitives;

-      if (!accumulate_subresult(ctx, q, 2, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 2, &local_result))
         return false;
      result->u64 += local_result.pipeline_statistics.ia_primitives;
      return true;
   case PIPE_QUERY_PRIMITIVES_EMITTED:
-      if (!accumulate_subresult(ctx, q, 0, &local_result, write))
+      if (!accumulate_subresult_cpu(ctx, q, 0, &local_result))
         return false;
      result->u64 = local_result.so_statistics.num_primitives_written;
      return true;
   default:
      assert(num_sub_queries(q->type) == 1);
-      return accumulate_subresult(ctx, q, 0, result, write);
+      return accumulate_subresult_cpu(ctx, q, 0, result);
   }
 }

@ -362,21 +335,99 @@ query_ensure_ready(struct d3d12_screen* screen, struct d3d12_context* ctx, struc
   return true;
 }

+static void
+accumulate_subresult_gpu(struct d3d12_context *ctx, struct d3d12_query *q_parent,
+                         unsigned sub_query)
+{
+   struct d3d12_query_impl *subq = &q_parent->subqueries[sub_query];
+   d3d12_compute_transform_save_restore saved;
+   d3d12_save_compute_transform_state(ctx, &saved);
+   /* In-place 64-bit resolve of this one sub-query: unsigned, multiplier of 1.0 */
+   d3d12_compute_transform_key resolve_key;
+   memset(&resolve_key, 0, sizeof(resolve_key));
+   resolve_key.type = d3d12_compute_transform_type::query_resolve;
+   resolve_key.query_resolve.is_64bit = true;
+   resolve_key.query_resolve.is_resolve_in_place = true;
+   resolve_key.query_resolve.num_subqueries = 1;
+   resolve_key.query_resolve.pipe_query_type = q_parent->type;
+   resolve_key.query_resolve.single_subquery_index = sub_query;
+   resolve_key.query_resolve.is_signed = false;
+   resolve_key.query_resolve.timestamp_multiplier = 1.0;
+   ctx->base.bind_compute_state(&ctx->base, d3d12_get_compute_transform(ctx, &resolve_key));
+   /* State var 0 carries this sub-query's curr_query; the other three are zeroed */
+   ctx->transform_state_vars[0] = subq->curr_query;
+   for (unsigned i = 1; i < 4; ++i)
+      ctx->transform_state_vars[i] = 0;
+
+   pipe_shader_buffer query_ssbo[1];
+   query_ssbo[0].buffer = subq->buffer;
+   query_ssbo[0].buffer_offset = subq->buffer_offset;
+   query_ssbo[0].buffer_size = subq->query_size * subq->num_queries;
+   ctx->base.set_shader_buffers(&ctx->base, PIPE_SHADER_COMPUTE, 0, 1, query_ssbo, 1);
+
+   pipe_grid_info dispatch = {};
+   for (unsigned axis = 0; axis < 3; ++axis)
+      dispatch.block[axis] = dispatch.grid[axis] = 1;
+   ctx->base.launch_grid(&ctx->base, &dispatch);
+
+   d3d12_restore_compute_transform_state(ctx, &saved);
+}
+
+static void
+accumulate_result_gpu(struct d3d12_context *ctx, struct d3d12_query *q,
+                      struct pipe_resource *dst, uint32_t dst_offset,
+                      int index, enum pipe_query_value_type result_type)
+{
+   const uint32_t value_size = (result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64) ? 8 : 4;
+   d3d12_compute_transform_save_restore saved;
+   d3d12_save_compute_transform_state(ctx, &saved);
+
+   d3d12_compute_transform_key resolve_key;
+   memset(&resolve_key, 0, sizeof(resolve_key));
+   resolve_key.type = d3d12_compute_transform_type::query_resolve;
+   resolve_key.query_resolve.is_64bit = value_size == 8;
+   resolve_key.query_resolve.is_resolve_in_place = false;
+   resolve_key.query_resolve.num_subqueries = num_sub_queries(q->type);
+   resolve_key.query_resolve.pipe_query_type = q->type;
+   resolve_key.query_resolve.single_result_field_offset = index;
+   resolve_key.query_resolve.is_signed = result_type == PIPE_QUERY_TYPE_I32 || result_type == PIPE_QUERY_TYPE_I64;
+   resolve_key.query_resolve.timestamp_multiplier = d3d12_screen(ctx->base.screen)->timestamp_multiplier;
+   ctx->base.bind_compute_state(&ctx->base, d3d12_get_compute_transform(ctx, &resolve_key));
+   /* One input SSBO per sub-query, with that sub-query's curr_query in the matching state var */
+   pipe_shader_buffer bindings[4];
+   uint32_t num_bound = 0;
+   for (uint32_t i = 0; i < resolve_key.query_resolve.num_subqueries; ++i) {
+      ctx->transform_state_vars[i] = q->subqueries[i].curr_query;
+      pipe_shader_buffer *input = &bindings[num_bound++];
+      input->buffer = q->subqueries[i].buffer;
+      input->buffer_offset = q->subqueries[i].buffer_offset;
+      input->buffer_size = q->subqueries[i].query_size * q->subqueries[i].num_queries;
+   }
+   /* State var 3 is the destination offset counted in result-sized elements */
+   assert(dst_offset % value_size == 0);
+   ctx->transform_state_vars[3] = dst_offset / value_size;
+
+   pipe_shader_buffer *output = &bindings[num_bound++];
+   output->buffer = dst;
+   output->buffer_offset = 0;
+   output->buffer_size = dst->width0;
+   ctx->base.set_shader_buffers(&ctx->base, PIPE_SHADER_COMPUTE, 0, num_bound, bindings, 1 << (num_bound - 1));
+
+   pipe_grid_info dispatch = {};
+   for (unsigned axis = 0; axis < 3; ++axis)
+      dispatch.block[axis] = dispatch.grid[axis] = 1;
+   ctx->base.launch_grid(&ctx->base, &dispatch);
+
+   d3d12_restore_compute_transform_state(ctx, &saved);
+}
+
 static void
 begin_subquery(struct d3d12_context *ctx, struct d3d12_query *q_parent, unsigned sub_query)
 {
   struct d3d12_query_impl *q = &q_parent->subqueries[sub_query];
   if (q->curr_query == q->num_queries) {
-      union pipe_query_result result;
-
-      query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, q_parent, false);
-      d3d12_foreach_submitted_batch(ctx, old_batch) {
-         if (old_batch->fence && old_batch->fence->value <= q_parent->fence_value)
-            d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE);
-      }
-
      /* Accumulate current results and store in first slot */
-      accumulate_subresult(ctx, q_parent, sub_query, &result, true);
+      accumulate_subresult_gpu(ctx, q_parent, sub_query);
      q->curr_query = 1;
   }

@ -412,18 +463,9 @@ begin_timer_query(struct d3d12_context *ctx, struct d3d12_query *q_parent, bool
      q->curr_query = 0;
      query_index = 0;
   } else if (query_index == q->num_queries) {
-      union pipe_query_result result;
-
      /* Accumulate current results and store in first slot */
-
-      query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, q_parent, false);
-      d3d12_foreach_submitted_batch(ctx, old_batch) {
-         if (old_batch->fence && old_batch->fence->value <= q_parent->fence_value)
-            d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE);
-      }
-
-      accumulate_subresult(ctx, q_parent, 0, &result, true);
-      q->curr_query = 2;
+      accumulate_subresult_gpu(ctx, q_parent, 0);
+      q->curr_query = 1;
   }

   ctx->cmdlist->EndQuery(q->query_heap, q->d3d12qtype, query_index);
@ -530,7 +572,39 @@ d3d12_get_query_result(struct pipe_context *pctx,
   if (!query_ensure_ready(screen, ctx, query, wait))
      return false;

-   return accumulate_result(ctx, query, result, false);
+   return accumulate_result_cpu(ctx, query, result);
+}
+
+static void
+d3d12_get_query_result_resource(struct pipe_context *pctx,
+                                struct pipe_query *q,
+                                enum pipe_query_flags flags,
+                                enum pipe_query_value_type result_type,
+                                int index,
+                                struct pipe_resource *resource,
+                                unsigned offset)
+{
+   struct d3d12_context *ctx = d3d12_context(pctx);
+   /* index == -1 requests the availability flag; any other index resolves a result value on the GPU */
+   if (index == -1) {
+      /* Write the "available" bit, which is always true */
+      struct d3d12_resource *res = d3d12_resource(resource);
+      d3d12_transition_resource_state(ctx, res, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_TRANSITION_FLAG_NONE);
+      d3d12_apply_resource_states(ctx, false);
+
+      D3D12_GPU_VIRTUAL_ADDRESS gpuva_base = d3d12_resource_gpu_virtual_address(res) + offset;
+      D3D12_WRITEBUFFERIMMEDIATE_PARAMETER params[2] = {
+         { gpuva_base, 1 },
+         { gpuva_base + sizeof(uint32_t), 0 },
+      };
+      D3D12_WRITEBUFFERIMMEDIATE_MODE modes[2] = { D3D12_WRITEBUFFERIMMEDIATE_MODE_DEFAULT, D3D12_WRITEBUFFERIMMEDIATE_MODE_DEFAULT };
+      /* Use cmdlist2, the interface queried for WriteBufferImmediate; cmdlist8 is optional and may be null */
+      ctx->cmdlist2->WriteBufferImmediate(result_type == PIPE_QUERY_TYPE_I64 || result_type == PIPE_QUERY_TYPE_U64 ? 2 : 1, params, modes);
+      return;
+   }
+
+   struct d3d12_query *query = (struct d3d12_query *)q;
+   accumulate_result_gpu(ctx, query, resource, offset, index, result_type);
 }

 void
@ -598,28 +672,7 @@ d3d12_render_condition(struct pipe_context *pctx,
      query->predicate = d3d12_resource(pipe_buffer_create(pctx->screen, 0,
                                                           PIPE_USAGE_DEFAULT, sizeof(uint64_t)));

-   if (mode == PIPE_RENDER_COND_WAIT) {
-
-      query_ensure_ready(d3d12_screen(ctx->base.screen), ctx, query, false);
-      d3d12_foreach_submitted_batch(ctx, old_batch) {
-         if (old_batch->fence && old_batch->fence->value <= query->fence_value)
-            d3d12_reset_batch(ctx, old_batch, OS_TIMEOUT_INFINITE);
-      }
-
-      union pipe_query_result result;
-      accumulate_result(ctx, (d3d12_query *)pquery, &result, true);
-   }
-
-   struct d3d12_resource *res = (struct d3d12_resource *)query->subqueries[0].buffer;
-   uint64_t source_offset = 0;
-   ID3D12Resource *source = d3d12_resource_underlying(res, &source_offset);
-   source_offset += query->subqueries[0].buffer_offset;
-   d3d12_transition_resource_state(ctx, res, D3D12_RESOURCE_STATE_COPY_SOURCE, D3D12_TRANSITION_FLAG_INVALIDATE_BINDINGS);
-   d3d12_transition_resource_state(ctx, query->predicate, D3D12_RESOURCE_STATE_COPY_DEST, D3D12_TRANSITION_FLAG_NONE);
-   d3d12_apply_resource_states(ctx, false);
-   ctx->cmdlist->CopyBufferRegion(d3d12_resource_resource(query->predicate), 0,
-                                  source, source_offset,
-                                  sizeof(uint64_t));
+   accumulate_result_gpu(ctx, query, &query->predicate->base.b, 0, 0, PIPE_QUERY_TYPE_U64);

   d3d12_transition_resource_state(ctx, query->predicate, D3D12_RESOURCE_STATE_PREDICATION, D3D12_TRANSITION_FLAG_NONE);
   d3d12_apply_resource_states(ctx, false);
@ -656,6 +709,7 @@ d3d12_context_query_init(struct pipe_context *pctx)
   pctx->begin_query = d3d12_begin_query;
   pctx->end_query = d3d12_end_query;
   pctx->get_query_result = d3d12_get_query_result;
+   pctx->get_query_result_resource = d3d12_get_query_result_resource;
   pctx->set_active_query_state = d3d12_set_active_query_state;
   pctx->render_condition = d3d12_render_condition;
 }
--- a/src/gallium/drivers/d3d12/d3d12_screen.cpp
+++ b/src/gallium/drivers/d3d12/d3d12_screen.cpp
@ -193,9 +193,9 @@ d3d12_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
      return 1;

   case PIPE_CAP_GLSL_FEATURE_LEVEL:
-      return 430;
+      return 440;
   case PIPE_CAP_GLSL_FEATURE_LEVEL_COMPATIBILITY:
-      return 430;
+      return 440;
   case PIPE_CAP_ESSL_FEATURE_LEVEL:
      return 310;

@ -335,8 +335,12 @@ d3d12_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
   case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
   case PIPE_CAP_SHADER_ARRAY_COMPONENTS:
   case PIPE_CAP_TEXTURE_MIRROR_CLAMP_TO_EDGE:
+   case PIPE_CAP_QUERY_TIME_ELAPSED:
      return 1;

+   case PIPE_CAP_QUERY_BUFFER_OBJECT:
+      return (screen->opts3.WriteBufferImmediateSupportFlags & D3D12_COMMAND_LIST_SUPPORT_FLAG_DIRECT) != 0;
+
   case PIPE_CAP_MAX_VERTEX_STREAMS:
      return D3D12_SO_BUFFER_SLOT_COUNT;