radv: Remove the BVH depth heuristics

It only helps Quake II RTX and hurts everything else.

Reviewed-by: Friedrich Vock <friedrich.vock@gmx.de>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26481>
This commit is contained in:
Konstantin Seurer 2023-12-10 08:46:31 +01:00 committed by Marge Bot
parent 719619c477
commit 2e4951d3fb
6 changed files with 5 additions and 76 deletions

View file

@@ -107,8 +107,6 @@ struct radv_accel_struct_header {
struct radv_ir_node {
radv_aabb aabb;
/* Generic normalized cost of not merging this node. */
float cost;
};
#define RADV_UNKNOWN_BVH_OFFSET 0xFFFFFFFF

View file

@@ -366,7 +366,6 @@ main(void)
if (is_active) {
REF(radv_ir_node) ir_node = INDEX(radv_ir_node, args.ir, primitive_id);
DEREF(ir_node).aabb = bounds;
DEREF(ir_node).cost = 0.0;
}
uint32_t ir_offset = primitive_id * SIZEOF(radv_ir_node);

View file

@@ -63,12 +63,7 @@ bvh_shaders = [
[
'ploc_internal.comp',
'ploc_internal',
['EXTENDED_SAH=0'],
],
[
'ploc_internal.comp',
'ploc_internal_extended',
['EXTENDED_SAH=1'],
[],
],
]

View file

@@ -116,8 +116,6 @@ push_node(uint32_t children[2], radv_aabb bounds[2])
total_bounds.min = vec3(INFINITY);
total_bounds.max = vec3(-INFINITY);
float cost = 0.0;
for (uint i = 0; i < 2; ++i) {
VOID_REF node = OFFSET(args.bvh, ir_id_to_offset(children[i]));
REF(radv_ir_node) child = REF(radv_ir_node)(node);
@@ -125,15 +123,10 @@ push_node(uint32_t children[2], radv_aabb bounds[2])
total_bounds.min = min(total_bounds.min, bounds[i].min);
total_bounds.max = max(total_bounds.max, bounds[i].max);
cost += DEREF(child).cost;
DEREF(dst_node).children[i] = children[i];
}
DEREF(dst_node).base.aabb = total_bounds;
#if EXTENDED_SAH
DEREF(dst_node).base.cost = cost * 0.5 + BVH_LEVEL_COST;
#endif
DEREF(dst_node).bvh_offset = RADV_UNKNOWN_BVH_OFFSET;
return dst_id;
}
@@ -159,9 +152,6 @@ decode_neighbour_offset(uint32_t encoded_offset)
#define NUM_PLOC_LDS_ITEMS PLOC_WORKGROUP_SIZE + 4 * PLOC_NEIGHBOURHOOD
shared radv_aabb shared_bounds[NUM_PLOC_LDS_ITEMS];
#if EXTENDED_SAH
shared float shared_costs[NUM_PLOC_LDS_ITEMS];
#endif
shared uint32_t nearest_neighbour_indices[NUM_PLOC_LDS_ITEMS];
uint32_t
@@ -187,9 +177,6 @@ load_bounds(VOID_REF ids, uint32_t iter, uint32_t task_index, uint32_t lds_base,
REF(radv_ir_node) node = REF(radv_ir_node)(addr);
shared_bounds[i - lds_base] = DEREF(node).aabb;
#if EXTENDED_SAH
shared_costs[i - lds_base] = DEREF(node).cost;
#endif
}
}
@@ -199,34 +186,7 @@ combined_node_cost(uint32_t lds_base, uint32_t i, uint32_t j)
radv_aabb combined_bounds;
combined_bounds.min = min(shared_bounds[i - lds_base].min, shared_bounds[j - lds_base].min);
combined_bounds.max = max(shared_bounds[i - lds_base].max, shared_bounds[j - lds_base].max);
float area = aabb_surface_area(combined_bounds);
#if EXTENDED_SAH
if (area == 0.0)
return 0.0;
/* p_a and p_b are the probabilities that i or j are hit by a ray:
* Assuming that the current node is hit (p = 1) and the probability of hitting a node
* is proportional to its surface area, p = area * c with p = 1 for the current node.
* -> c = 1 / area
*
* We can use those probabilities to limit the impact of child cost to be proportional to
* its hit probability. (Child cost is the cost of not merging a node which increases with
* tree depth for internal nodes)
*
* Dividing area by both relative costs will make it more likely that we merge nodes with
* a high child cost.
*/
float p_i = aabb_surface_area(shared_bounds[i - lds_base]) / area;
float p_j = aabb_surface_area(shared_bounds[j - lds_base]) / area;
float combined_cost =
(1.0 + shared_costs[i - lds_base] * p_i) * (1.0 + shared_costs[j - lds_base] * p_j);
return area / combined_cost;
#else
return area;
#endif
return aabb_surface_area(combined_bounds);
}
shared uint32_t shared_aggregate_sum;

View file

@@ -57,10 +57,6 @@ static const uint32_t ploc_spv[] = {
#include "bvh/ploc_internal.spv.h"
};
static const uint32_t ploc_extended_spv[] = {
#include "bvh/ploc_internal_extended.spv.h"
};
static const uint32_t copy_spv[] = {
#include "bvh/copy.spv.h"
};
@@ -87,7 +83,6 @@ enum internal_build_type {
struct build_config {
enum internal_build_type internal_type;
bool extended_sah;
bool compact;
};
@@ -129,11 +124,6 @@ build_config(uint32_t leaf_count, const VkAccelerationStructureBuildGeometryInfo
else
config.internal_type = INTERNAL_BUILD_TYPE_LBVH;
/* 4^(lds stack entry count) assuming we push 1 node on average. */
uint32_t lds_spill_threshold = 1 << (8 * 2);
if (leaf_count < lds_spill_threshold)
config.extended_sah = true;
if (build_info->flags & VK_BUILD_ACCELERATION_STRUCTURE_ALLOW_COMPACTION_BIT_KHR)
config.compact = true;
@@ -306,7 +296,6 @@ radv_device_finish_accel_struct_build_state(struct radv_device *device)
struct radv_meta_state *state = &device->meta_state;
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.copy_pipeline, &state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.ploc_pipeline, &state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.ploc_extended_pipeline, &state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.lbvh_generate_ir_pipeline,
&state->alloc);
radv_DestroyPipeline(radv_device_to_handle(device), state->accel_struct_build.lbvh_main_pipeline, &state->alloc);
@@ -544,12 +533,6 @@ radv_device_init_accel_struct_build_state(struct radv_device *device)
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, ploc_extended_spv, sizeof(ploc_extended_spv), sizeof(struct ploc_args),
&device->meta_state.accel_struct_build.ploc_extended_pipeline,
&device->meta_state.accel_struct_build.ploc_p_layout);
if (result != VK_SUCCESS)
goto exit;
result = create_build_pipeline_spv(device, encode_spv, sizeof(encode_spv), sizeof(struct encode_args),
&device->meta_state.accel_struct_build.encode_pipeline,
&device->meta_state.accel_struct_build.encode_p_layout);
@@ -1004,19 +987,15 @@ lbvh_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount,
static void
ploc_build_internal(VkCommandBuffer commandBuffer, uint32_t infoCount,
const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states,
bool extended_sah)
const VkAccelerationStructureBuildGeometryInfoKHR *pInfos, struct bvh_state *bvh_states)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
radv_CmdBindPipeline(commandBuffer, VK_PIPELINE_BIND_POINT_COMPUTE,
extended_sah ? cmd_buffer->device->meta_state.accel_struct_build.ploc_extended_pipeline
: cmd_buffer->device->meta_state.accel_struct_build.ploc_pipeline);
cmd_buffer->device->meta_state.accel_struct_build.ploc_pipeline);
for (uint32_t i = 0; i < infoCount; ++i) {
if (bvh_states[i].config.internal_type != INTERNAL_BUILD_TYPE_PLOC)
continue;
if (bvh_states[i].config.extended_sah != extended_sah)
continue;
uint32_t src_scratch_offset = bvh_states[i].scratch_offset;
uint32_t dst_scratch_offset = (src_scratch_offset == bvh_states[i].scratch.sort_buffer_offset[0])
@@ -1242,8 +1221,7 @@ radv_CmdBuildAccelerationStructuresKHR(VkCommandBuffer commandBuffer, uint32_t i
lbvh_build_internal(commandBuffer, infoCount, pInfos, bvh_states, flush_bits);
ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states, false);
ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states, true);
ploc_build_internal(commandBuffer, infoCount, pInfos, bvh_states);
cmd_buffer->state.flush_bits |= flush_bits;

View file

@@ -657,7 +657,6 @@ struct radv_meta_state {
VkPipeline lbvh_generate_ir_pipeline;
VkPipelineLayout ploc_p_layout;
VkPipeline ploc_pipeline;
VkPipeline ploc_extended_pipeline;
VkPipelineLayout encode_p_layout;
VkPipeline encode_pipeline;
VkPipeline encode_compact_pipeline;