radv: Remove the BVH depth heuristics

It only helps Quake II RTX and hurts everything else.

Reviewed-by: Friedrich Vock <friedrich.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26481>
This commit is contained in:
Konstantin Seurer 2023-12-10 08:46:31 +01:00 committed by Marge Bot
parent 719619c477
commit 2e4951d3fb
6 changed files with 5 additions and 76 deletions

View file

@@ -107,8 +107,6 @@ struct radv_accel_struct_header {
struct radv_ir_node {
radv_aabb aabb;
/* Generic normalized cost of not merging this node. */
float cost;
};
#define RADV_UNKNOWN_BVH_OFFSET 0xFFFFFFFF

View file

@@ -366,7 +366,6 @@ main(void)
if (is_active) {
REF(radv_ir_node) ir_node = INDEX(radv_ir_node, args.ir, primitive_id);
DEREF(ir_node).aabb = bounds;
DEREF(ir_node).cost = 0.0;
}
uint32_t ir_offset = primitive_id * SIZEOF(radv_ir_node);

View file

@@ -63,12 +63,7 @@ bvh_shaders = [
[
'ploc_internal.comp',
'ploc_internal',
['EXTENDED_SAH=0'],
],
[
'ploc_internal.comp',
'ploc_internal_extended',
['EXTENDED_SAH=1'],
[],
],
]

View file

@@ -116,8 +116,6 @@ push_node(uint32_t children[2], radv_aabb bounds[2])
total_bounds.min = vec3(INFINITY);
total_bounds.max = vec3(-INFINITY);
float cost = 0.0;
for (uint i = 0; i < 2; ++i) {
VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
REF(radv_ir_node) child = REF(radv_ir_node)(node);
@@ -125,15 +123,10 @@ push_node(uint32_t children[2], radv_aabb bounds[2])
total_bounds.min = min(total_bounds.min, bounds[i].min);
total_bounds.max = max(total_bounds.max, bounds[i].max);
cost += DEREF(child).cost;
DEREF(dst_node).children[i] = children[i];
}
DEREF(dst_node).base.aabb = total_bounds;
#if EXTENDED_SAH
DEREF(dst_node).base.cost = cost * 0.5 + BVH_LEVEL_COST;
#endif
DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
return dst_id;
}
@@ -159,9 +152,6 @@ decode_neighbour_offset(uint32_t encoded_offset)
#define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD
shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
#if EXTENDED_SAH
shared float shared_costs[NUM_PLOC_LDS_ITEMS];
#endif
shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];
uint32_t
@@ -187,9 +177,6 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
REF(radv_ir_node) node = REF(radv_ir_node)(addr);
shared_bounds[i - lds_base] = DEREF(node).aabb;
#if EXTENDED_SAH
shared_costs[i - lds_base] = DEREF(node).cost;
#endif
}
}
@@ -199,34 +186,7 @@ combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
radv_aabb combined_bounds;
combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
float area = aabb_surface_area(combined_bounds);
#if EXTENDED_SAH
if (area == 0.0)
return 0.0;
/* p_a and p_b are the probabilities that i or j are hit by a ray:
* Assuming that the current node is hit (p = 1) and the probability of hitting a node
* is proportional to its surface area, p = area * c with p = 1 for the current node.
* -> c = 1 / area
*
* We can use those probabilities to limit the impact of child cost to be proportional to
* its hit probability. (Child cost is the cost of not merging a node which increases with
* tree depth for internal nodes)
*
* Dividing area by both relative costs will make it more likely that we merge nodes with
* a high child cost.
*/
float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area;
float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area;
float combined_cost =
(1.0 + shared_costs[i - lds_base] * p_i) * (1.0 + shared_costs[j - lds_base] * p_j);
return area / combined_cost;
#else
return area;
#endif
return aabb_surface_area(combined_bounds);
}
shared uint32_t shared_aggregate_sum;

View file

@@ -57,10 +57,6 @@ static const uint32_t ploc_spv[] = {
#include "bvh/ploc_internal.spv.h"
};
static const uint32_t ploc_extended_spv[] = {
#include "bvh/ploc_internal_extended.spv.h"
};
static const uint32_t copy_spv[] = {
#include "bvh/copy.spv.h"
};
@@ -87,7 +83,6 @@ enum internal_build_type {
struct build_config {
enum internal_build_type internal_type;
bool extended_sah;
bool compact;
};
@@ -129,11 +124,6 @@ build_config(uint32_t leaf_count, const VkAccelerationStructureBuildGeometryInfo
else
config.internal_type = INTERNAL_BUILD_TYPE_LBVH;
/* 4^(lds stack entry count) assuming we push 1 node on average. */
uint32_t lds_spill_threshold = 1 << (8 * 2);
if (leaf_count < lds_spill_threshold)
config.extended_sah = true;
if (build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR)
config.compact = true;
@@ -306,7 +296,6 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device)
struct radv_meta_state *state = &device->meta_state;
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.copy_pipeline, &state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.ploc_pipeline, &state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.ploc_extended_pipeline, &state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.lbvh_generate_ir_pipeline,
&state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.lbvh_main_pipeline, &state->alloc);
@@ -544,12 +533,6 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, ploc_extended_spv, sizeof(ploc_extended_spv), sizeof(struct ploc_args),
&device->meta_state.accel_struct_build.ploc_extended_pipeline,
&device->meta_state.accel_struct_build.ploc_p_layout);
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args),
&device->meta_state.accel_struct_build.encode_pipeline,
&device->meta_state.accel_struct_build.encode_p_layout);
@@ -1004,19 +987,15 @@ lbvh_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount,
static void
ploc_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount,
const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states,
bool extended_sah)
const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
extended_sah ? cmd_buffer->device->meta_state.accel_struct_build.ploc_extended_pipeline
: cmd_buffer->device->meta_state.accel_struct_build.ploc_pipeline);
cmd_buffer->device->meta_state.accel_struct_build.ploc_pipeline);
for (uint32_t i = 0; i < infoCount; ++i) {
if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC)
continue;
if (bvh_states[i].config.extended_sah != extended_sah)
continue;
uint32_t src_scratch_offset = bvh_states[i].scratch_offset;
uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0])
@@ -1242,8 +1221,7 @@ radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t i
lbvh_build_internal(commandBuffer, infoCount, pInfos, bvh_states, flush_bits);
ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states, false);
ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states, true);
ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states);
cmd_buffer->state.flush_bits |= flush_bits;

View file

@@ -657,7 +657,6 @@ struct radv_meta_state {
VkPipeline lbvh_generate_ir_pipeline;
VkPipelineLayout ploc_p_layout;
VkPipeline ploc_pipeline;
VkPipeline ploc_extended_pipeline;
VkPipelineLayout encode_p_layout;
VkPipeline encode_pipeline;
VkPipeline encode_compact_pipeline;