From 0fa7252d8a9d450a55f7e6f835c1c3afd75f1f2e Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sat, 18 Nov 2023 12:19:04 -0400 Subject: [PATCH] asahi: Implement multidraw indirect GS only for now (inserting a passthru GS if needed). This should be optimized later, but it's ~correct. Signed-off-by: Alyssa Rosenzweig Part-of: --- docs/features.txt | 2 +- docs/relnotes/new_features.txt | 1 + src/asahi/clc/asahi_clc.c | 1 + src/asahi/lib/agx_nir_lower_gs.c | 36 ++++-- src/asahi/lib/agx_nir_lower_gs.h | 13 +- src/asahi/lib/agx_nir_lower_ia.c | 121 +++++++++++++++++- src/asahi/lib/shaders/geometry.cl | 96 +++++++++++++- src/asahi/lib/shaders/geometry.h | 29 ++++- .../drivers/asahi/agx_nir_lower_sysvals.c | 30 +++-- src/gallium/drivers/asahi/agx_pipe.c | 2 + src/gallium/drivers/asahi/agx_state.c | 108 +++++++++++++--- src/gallium/drivers/asahi/agx_state.h | 4 +- 12 files changed, 383 insertions(+), 60 deletions(-) diff --git a/docs/features.txt b/docs/features.txt index d66b91ac9e2..3460be1aced 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -228,7 +228,7 @@ GL 4.5, GLSL 4.50 -- all DONE: freedreno/a6xx, nvc0, r600, radeonsi, llvmpipe, v GL 4.6, GLSL 4.60 -- all DONE: radeonsi, virgl, zink, iris, crocus/gen7+, d3d12 GL_ARB_gl_spirv DONE (freedreno, llvmpipe) - GL_ARB_indirect_parameters DONE (freedreno/a6xx+, nvc0, llvmpipe, virgl) + GL_ARB_indirect_parameters DONE (freedreno/a6xx+, nvc0, llvmpipe, virgl, asahi) GL_ARB_pipeline_statistics_query DONE (freedreno/a6xx+, nvc0, r600, llvmpipe, softpipe, crocus/gen6+) GL_ARB_polygon_offset_clamp DONE (freedreno, nv50, nvc0, r600, llvmpipe, v3d, panfrost, crocus) GL_ARB_shader_atomic_counter_ops DONE (freedreno/a5xx+, nvc0, r600, llvmpipe, softpipe, v3d) diff --git a/docs/relnotes/new_features.txt b/docs/relnotes/new_features.txt index 9bb90d71537..2ae9ab08194 100644 --- a/docs/relnotes/new_features.txt +++ b/docs/relnotes/new_features.txt @@ -12,3 +12,4 @@ GL_ARB_base_instance on Asahi OpenGL 4.6 (up from 4.2) 
on d3d12 VK_EXT_depth_clamp_zero_one on RADV GL_ARB_shader_texture_image_samples on Asahi +GL_ARB_indirect_parameters on Asahi diff --git a/src/asahi/clc/asahi_clc.c b/src/asahi/clc/asahi_clc.c index 81c67723608..e34fe798b81 100644 --- a/src/asahi/clc/asahi_clc.c +++ b/src/asahi/clc/asahi_clc.c @@ -215,6 +215,7 @@ compile(void *memctx, const uint32_t *spirv, size_t spirv_size) NIR_PASS_V(nir, nir_lower_convert_alu_types, NULL); NIR_PASS_V(nir, nir_opt_if, nir_opt_if_aggressive_last_continue); + NIR_PASS_V(nir, nir_opt_idiv_const, 16); optimize(nir); diff --git a/src/asahi/lib/agx_nir_lower_gs.c b/src/asahi/lib/agx_nir_lower_gs.c index d2bbb067cb3..e2156b02770 100644 --- a/src/asahi/lib/agx_nir_lower_gs.c +++ b/src/asahi/lib/agx_nir_lower_gs.c @@ -972,13 +972,12 @@ link_libagx(nir_shader *nir, const nir_shader *libagx) void agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx, - bool rasterizer_discard, nir_shader **gs_count, - nir_shader **gs_copy, nir_shader **pre_gs, - enum mesa_prim *out_mode, unsigned *out_count_words) + struct agx_ia_key *ia, bool rasterizer_discard, + nir_shader **gs_count, nir_shader **gs_copy, + nir_shader **pre_gs, enum mesa_prim *out_mode, + unsigned *out_count_words) { link_libagx(vs, libagx); - NIR_PASS_V(vs, nir_lower_idiv, - &(const nir_lower_idiv_options){.allow_fp16 = true}); /* Collect output component counts so we can size the geometry output buffer * appropriately, instead of assuming everything is vec4. 
@@ -1037,6 +1036,17 @@ agx_nir_lower_gs(nir_shader *gs, nir_shader *vs, const nir_shader *libagx, NIR_PASS(progress, gs, nir_opt_loop_unroll); } while (progress); + if (ia->indirect_multidraw) + NIR_PASS_V(gs, agx_nir_lower_multidraw, ia); + + NIR_PASS_V(gs, nir_shader_intrinsics_pass, lower_id, + nir_metadata_block_index | nir_metadata_dominance, NULL); + + link_libagx(gs, libagx); + + NIR_PASS_V(gs, nir_lower_idiv, + &(const nir_lower_idiv_options){.allow_fp16 = true}); + /* All those variables we created should've gone away by now */ NIR_PASS_V(gs, nir_remove_dead_variables, nir_var_function_temp, NULL); @@ -1156,14 +1166,22 @@ agx_nir_prefix_sum_gs(const nir_shader *libagx, unsigned words) } nir_shader * -agx_nir_gs_setup_indirect(const nir_shader *libagx, enum mesa_prim prim) +agx_nir_gs_setup_indirect(const nir_shader *libagx, enum mesa_prim prim, + bool multidraw) { nir_builder b = nir_builder_init_simple_shader( MESA_SHADER_COMPUTE, &agx_nir_options, "GS indirect setup"); - libagx_gs_setup_indirect(&b, nir_load_geometry_param_buffer_agx(&b), - nir_load_input_assembly_buffer_agx(&b), - nir_imm_int(&b, prim)); + if (multidraw) { + uint32_t subgroup_size = 32; + b.shader->info.workgroup_size[0] = subgroup_size; + } + + libagx_gs_setup_indirect( + &b, nir_load_geometry_param_buffer_agx(&b), + nir_load_input_assembly_buffer_agx(&b), nir_imm_int(&b, prim), + nir_channel(&b, nir_load_local_invocation_id(&b), 0), + nir_imm_bool(&b, multidraw)); UNUSED struct agx_uncompiled_shader_info info; agx_preprocess_nir(b.shader, libagx, false, &info); diff --git a/src/asahi/lib/agx_nir_lower_gs.h b/src/asahi/lib/agx_nir_lower_gs.h index 1a5533edffc..a3b81142797 100644 --- a/src/asahi/lib/agx_nir_lower_gs.h +++ b/src/asahi/lib/agx_nir_lower_gs.h @@ -14,16 +14,19 @@ enum mesa_prim; void agx_nir_lower_ia(struct nir_shader *s, struct agx_ia_key *ia); +void agx_nir_lower_multidraw(struct nir_shader *s, struct agx_ia_key *key); + void agx_nir_lower_gs(struct nir_shader *gs, struct 
nir_shader *vs, - const struct nir_shader *libagx, bool rasterizer_discard, - struct nir_shader **gs_count, struct nir_shader **gs_copy, - struct nir_shader **pre_gs, enum mesa_prim *out_mode, - unsigned *out_count_words); + const struct nir_shader *libagx, struct agx_ia_key *ia, + bool rasterizer_discard, struct nir_shader **gs_count, + struct nir_shader **gs_copy, struct nir_shader **pre_gs, + enum mesa_prim *out_mode, unsigned *out_count_words); struct nir_shader *agx_nir_prefix_sum_gs(const struct nir_shader *libagx, unsigned words); struct nir_shader *agx_nir_gs_setup_indirect(const struct nir_shader *libagx, - enum mesa_prim prim); + enum mesa_prim prim, + bool multidraw); #endif diff --git a/src/asahi/lib/agx_nir_lower_ia.c b/src/asahi/lib/agx_nir_lower_ia.c index fb8061f2e1c..3d0331257c5 100644 --- a/src/asahi/lib/agx_nir_lower_ia.c +++ b/src/asahi/lib/agx_nir_lower_ia.c @@ -6,10 +6,26 @@ #include "asahi/compiler/agx_compile.h" #include "compiler/nir/nir_builder.h" #include "shaders/geometry.h" +#include "util/compiler.h" #include "agx_nir_lower_gs.h" #include "libagx_shaders.h" #include "nir.h" #include "nir_builder_opcodes.h" +#include "nir_intrinsics.h" + +/* + * This file implements input assembly in software for geometry/tessellation + * shaders. load_vertex_id is lowered based on the topology. Most of the logic + * lives in CL library routines. + * + * When geom/tess is used, multidraw indirect is implemented by: + * + * 1. Prefix summing the vertex counts across draws. + * 2. Issuing a single indirect draw for the summed vertices. + * 3. Binary searching the prefix sum buffer in software index fetch. + * + * This multidraw implementation kicks off the prefix sum and lowered draw. + */ static nir_def * load_vertex_id(nir_builder *b, struct agx_ia_key *key) @@ -24,9 +40,20 @@ load_vertex_id(nir_builder *b, struct agx_ia_key *key) * vertex ID is just the index as-is. 
*/ if (key->index_size) { + nir_def *ia = nir_load_input_assembly_buffer_agx(b); + + /* + * For multidraw, apply the index buffer offset. For !multidraw, this is + * handled ahead-of-time and baked into the index buffer pointer. + */ + if (key->indirect_multidraw) { + nir_def *first = libagx_multidraw_param(b, ia, nir_load_draw_id(b), + nir_imm_int(b, 2)); + id = nir_iadd(b, id, first); + } + nir_def *address = - libagx_index_buffer(b, nir_load_input_assembly_buffer_agx(b), id, - nir_imm_int(b, key->index_size)); + libagx_index_buffer(b, ia, id, nir_imm_int(b, key->index_size)); nir_def *index = nir_load_global_constant(b, address, key->index_size, 1, key->index_size * 8); @@ -53,9 +80,95 @@ lower_vertex_id(nir_builder *b, nir_intrinsic_instr *intr, void *data) } void -agx_nir_lower_ia(nir_shader *s, struct agx_ia_key *ia) +agx_nir_lower_ia(nir_shader *s, struct agx_ia_key *key) { nir_shader_intrinsics_pass(s, lower_vertex_id, nir_metadata_block_index | nir_metadata_dominance, - ia); + key); +} + +struct multidraw_state { + nir_def *raw_id, *draw, *primitive, *first_vertex, *base_instance; + nir_def *num_vertices; + + bool indexed; +}; + +static nir_def * +map_multidraw_param(nir_builder *b, nir_intrinsic_op intrin, + struct multidraw_state *state) +{ + switch (intrin) { + case nir_intrinsic_load_draw_id: + return state->draw; + + case nir_intrinsic_load_primitive_id: + return state->primitive; + + case nir_intrinsic_load_base_vertex: + return state->indexed ? 
state->first_vertex : nir_imm_int(b, 0); + + case nir_intrinsic_load_first_vertex: + return state->first_vertex; + + case nir_intrinsic_load_base_instance: + return state->base_instance; + + case nir_intrinsic_load_num_vertices: + return state->num_vertices; + + default: + return NULL; + } +} + +static bool +lower_multidraw(nir_builder *b, nir_intrinsic_instr *intr, void *data) +{ + b->cursor = nir_before_instr(&intr->instr); + nir_def *id = map_multidraw_param(b, intr->intrinsic, data); + if (!id) + return false; + + nir_instr_remove(&intr->instr); + nir_def_rewrite_uses(&intr->def, id); + return true; +} + +void +agx_nir_lower_multidraw(nir_shader *s, struct agx_ia_key *key) +{ + assert(key->indirect_multidraw); + + nir_builder b_ = + nir_builder_at(nir_before_impl(nir_shader_get_entrypoint(s))); + nir_builder *b = &b_; + + struct multidraw_state state = { + /* Filled in at the end to avoid recursion */ + .raw_id = nir_undef(b, 1, 32), + .indexed = key->index_size > 0, + }; + + nir_def *ia = nir_load_input_assembly_buffer_agx(b); + state.draw = libagx_multidraw_draw_id(b, ia, state.raw_id); + + state.primitive = libagx_multidraw_primitive_id( + b, ia, state.draw, state.raw_id, nir_imm_int(b, key->mode)); + + state.num_vertices = + libagx_multidraw_param(b, ia, state.draw, nir_imm_int(b, 0)); + + state.first_vertex = libagx_multidraw_param( + b, ia, state.draw, nir_imm_int(b, state.indexed ? 3 : 2)); + + state.base_instance = libagx_multidraw_param( + b, ia, state.draw, nir_imm_int(b, state.indexed ? 
4 : 3)); + + nir_shader_intrinsics_pass(b->shader, lower_multidraw, + nir_metadata_block_index | nir_metadata_dominance, + &state); + + b->cursor = nir_before_impl(b->impl); + nir_def_rewrite_uses(state.raw_id, nir_load_primitive_id(b)); } diff --git a/src/asahi/lib/shaders/geometry.cl b/src/asahi/lib/shaders/geometry.cl index d5cb6b42843..4803154b6de 100644 --- a/src/asahi/lib/shaders/geometry.cl +++ b/src/asahi/lib/shaders/geometry.cl @@ -110,6 +110,40 @@ libagx_index_buffer(constant struct agx_ia_state *p, uint id, return (uintptr_t)&p->index_buffer[id * index_size]; } +uint +libagx_multidraw_draw_id(constant struct agx_ia_state *p, uint raw_id) +{ + global uint *sums = p->prefix_sums; + + /* TODO: replace with binary search or interpolation search */ + uint i = 0; + for (i = 0; raw_id >= sums[i]; ++i) + ; + return i; +} + +uint +libagx_multidraw_param(constant struct agx_ia_state *p, uint draw_id, uint word) +{ + global uint *draw = (global uint *)(p->draws + (draw_id * p->draw_stride)); + return draw[word]; +} + +uint +libagx_multidraw_primitive_id(constant struct agx_ia_state *p, uint draw_id, + uint raw_id, enum mesa_prim mode) +{ + uint start = draw_id > 0 ? p->prefix_sums[draw_id - 1] : 0; + uint raw_offset = raw_id - start; + + /* Note: if we wanted, we could precompute magic divisors in the setup kernel + * to avoid the non-constant division here. 
+ */ + uint vertex_count = libagx_multidraw_param(p, draw_id, 0); + uint primitive_count = u_decomposed_prims_for_vertices(mode, vertex_count); + return raw_offset % primitive_count; +} + uint libagx_setup_xfb_buffer(global struct agx_geometry_params *p, uint i) { @@ -208,12 +242,61 @@ process_draw(global uint *draw, enum mesa_prim mode) return (uint2)(prim_per_instance, instance_count); } +uint2 +process_multidraw(global struct agx_ia_state *s, uint local_id, + enum mesa_prim mode) +{ + uintptr_t draw_ptr = s->draws; + uint draw_stride = s->draw_stride; + + /* Prefix sum the vertex counts (multiplied by instance counts) across draws. + * The number of draws is expected to be small, so this serialization should + * be ok in practice. See libagx_prefix_sum for algorithm details. + */ + uint i, count = 0; + uint len = *(s->count); + uint len_remainder = len % 32; + uint len_rounded_down = len - len_remainder; + + for (i = local_id; i < len_rounded_down; i += 32) { + global uint *draw_ = (global uint *)(draw_ptr + (i * draw_stride)); + uint2 draw = process_draw(draw_, mode); + + /* Total primitives */ + uint value = draw.x * draw.y; + + /* TODO: use inclusive once that's wired up */ + uint value_prefix_sum = sub_group_scan_exclusive_add(value) + value; + s->prefix_sums[i] = count + value_prefix_sum; + count += sub_group_broadcast(value_prefix_sum, 31); + } + + if (local_id < len_remainder) { + global uint *draw_ = (global uint *)(draw_ptr + (i * draw_stride)); + uint2 draw = process_draw(draw_, mode); + uint value = draw.x * draw.y; + + /* TODO: use inclusive once that's wired up */ + s->prefix_sums[i] = count + sub_group_scan_exclusive_add(value) + value; + } + + return (uint2)(len > 0 ? 
s->prefix_sums[len - 1] : 0, 1); +} + void libagx_gs_setup_indirect(global struct agx_geometry_params *p, - global struct agx_ia_state *ia, enum mesa_prim mode) + global struct agx_ia_state *ia, enum mesa_prim mode, + uint local_id, bool multidraw) { - /* Determine the (primitives, instances) grid size. */ - uint2 draw = process_draw(p->input_indirect_desc, mode); + /* Determine the (primitives, instances) grid size. For multidraw, this will + * be a synthetic grid for the entire collection, but that's ok. + */ + uint2 draw = multidraw ? process_multidraw(ia, local_id, mode) + : process_draw((global uint *)ia->draws, mode); + + /* Elect a single lane */ + if (multidraw && local_id != 0) + return; /* There are primitives*instances primitives total */ p->input_primitives = draw.x * draw.y; @@ -227,9 +310,12 @@ libagx_gs_setup_indirect(global struct agx_geometry_params *p, * in elements. Apply that offset now that we have it. For a hardware * indirect draw, the hardware would do this for us, but for software input * assembly we need to do it ourselves. + * + * For multidraw, this happens per-draw in the input assembly instead. We + * could do that for non-multidraw too, but it'd be less efficient. 
 */ - if (ia->index_buffer) { - ia->index_buffer += p->input_indirect_desc[2] * ia->index_size_B; + if (ia->index_buffer && !multidraw) { + ia->index_buffer += ((constant uint *)ia->draws)[2] * ia->index_size_B; } /* We may need to allocate a GS count buffer, do so now */ diff --git a/src/asahi/lib/shaders/geometry.h b/src/asahi/lib/shaders/geometry.h index 593a38b1d0f..2356fd01edb 100644 --- a/src/asahi/lib/shaders/geometry.h +++ b/src/asahi/lib/shaders/geometry.h @@ -10,9 +10,11 @@ #ifndef __OPENCL_VERSION__ #include "util/macros.h" #define GLOBAL(type_) uint64_t +#define CONST(type_) uint64_t #else #define PACKED #define GLOBAL(type_) global type_ * +#define CONST(type_) constant type_ * #endif #ifndef LIBAGX_GEOMETRY_H @@ -30,11 +32,33 @@ struct agx_ia_key { /* Use first vertex as the provoking vertex for flat shading */ bool flatshade_first; + + /* Whether we are doing input assembly for an indirect multidraw that is + * implemented by a single superdraw with a prefix sum of vertex counts per + * draw. This requires lowering lots of sysvals to index into the draw + * descriptors according to the associated dynamic multidraw state. + */ + bool indirect_multidraw; }; struct agx_ia_state { /* Input: index buffer if present. */ - GLOBAL(uchar) index_buffer; + CONST(uchar) index_buffer; + + /* Input: draw count */ + CONST(uint) count; + + /* Input: indirect draw descriptor. Raw pointer since it's strided. */ + uint64_t draws; + + /* For the geom/tess path, this is the temporary prefix sum buffer. + * Caller-allocated. For regular MDI, this is ok since the CPU knows the + * worst-case draw count. + */ + GLOBAL(uint) prefix_sums; + + /* Stride for the draw descriptor array */ + uint32_t draw_stride; /* The index size (1, 2, 4) or 0 if drawing without an index buffer. 
*/ uint8_t index_size_B; @@ -89,9 +113,6 @@ struct agx_geometry_params { */ uint32_t xfb_prims[MAX_VERTEX_STREAMS]; - /* Address of input indirect buffer for indirect GS draw */ - GLOBAL(uint) input_indirect_desc; - /* Within an indirect GS draw, the grid used to dispatch the GS written out * by the GS indirect setup kernel. Unused for direct GS draws. */ diff --git a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c index b9c102faf3a..3ca7b895780 100644 --- a/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c +++ b/src/gallium/drivers/asahi/agx_nir_lower_sysvals.c @@ -117,7 +117,8 @@ load_texture_handle(nir_builder *b, nir_intrinsic_instr *intr, void *base) } static nir_def * -lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr) +lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr, + bool lower_draw_params) { struct agx_draw_uniforms *u = NULL; struct agx_stage_uniforms *s = NULL; @@ -151,6 +152,20 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr) case nir_intrinsic_get_ssbo_size: return load_sysval_indirect(b, 1, 32, stage_table(b), &s->ssbo_size, intr->src[0].ssa); + case nir_intrinsic_load_layer_id_written_agx: + return load_sysval_root(b, 1, 16, &u->layer_id_written); + case nir_intrinsic_load_input_assembly_buffer_agx: + return load_sysval_root(b, 1, 64, &u->input_assembly); + case nir_intrinsic_load_geometry_param_buffer_agx: + return load_sysval_root(b, 1, 64, &u->geometry_params); + default: + break; + } + + if (!lower_draw_params) + return NULL; + + switch (intr->intrinsic) { case nir_intrinsic_load_num_workgroups: return load_sysval(b, 3, 32, AGX_SYSVAL_TABLE_GRID, 0); case nir_intrinsic_load_first_vertex: @@ -166,12 +181,6 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr) load_sysval(b, 1, 32, AGX_SYSVAL_TABLE_PARAMS, 0), nir_imm_int(b, 0)); case nir_intrinsic_load_draw_id: return load_sysval_root(b, 1, 32, &u->draw_id); - case nir_intrinsic_load_layer_id_written_agx: 
- return load_sysval_root(b, 1, 16, &u->layer_id_written); - case nir_intrinsic_load_input_assembly_buffer_agx: - return load_sysval_root(b, 1, 64, &u->input_assembly); - case nir_intrinsic_load_geometry_param_buffer_agx: - return load_sysval_root(b, 1, 64, &u->geometry_params); default: return NULL; } @@ -181,6 +190,7 @@ lower_intrinsic(nir_builder *b, nir_intrinsic_instr *intr) static bool lower_sysvals(nir_builder *b, nir_instr *instr, void *data) { + bool *lower_draw_params = data; b->cursor = nir_before_instr(instr); nir_def *old; nir_def *replacement = NULL; @@ -188,7 +198,7 @@ lower_sysvals(nir_builder *b, nir_instr *instr, void *data) if (instr->type == nir_instr_type_intrinsic) { nir_intrinsic_instr *intr = nir_instr_as_intrinsic(instr); old = &intr->def; - replacement = lower_intrinsic(b, intr); + replacement = lower_intrinsic(b, intr, *lower_draw_params); } else if (instr->type == nir_instr_type_tex) { nir_tex_instr *tex = nir_instr_as_tex(instr); old = &tex->def; @@ -353,11 +363,11 @@ lay_out_uniforms(struct agx_compiled_shader *shader, struct state *state) } bool -agx_nir_lower_sysvals(nir_shader *shader) +agx_nir_lower_sysvals(nir_shader *shader, bool lower_draw_params) { return nir_shader_instructions_pass( shader, lower_sysvals, nir_metadata_block_index | nir_metadata_dominance, - NULL); + &lower_draw_params); } bool diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c index 39381dc9bf6..ba6ce09a285 100644 --- a/src/gallium/drivers/asahi/agx_pipe.c +++ b/src/gallium/drivers/asahi/agx_pipe.c @@ -1643,6 +1643,8 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_SAMPLE_SHADING: case PIPE_CAP_START_INSTANCE: case PIPE_CAP_DRAW_PARAMETERS: + case PIPE_CAP_MULTI_DRAW_INDIRECT: + case PIPE_CAP_MULTI_DRAW_INDIRECT_PARAMS: return 1; case PIPE_CAP_SURFACE_SAMPLE_COUNT: /* TODO: MSRTT */ diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index 
45708253cd9..7fe55d4aff6 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -1645,7 +1645,7 @@ agx_compile_nir(struct agx_device *dev, nir_shader *nir, dev->params.num_dies > 1; key.libagx = dev->libagx; - NIR_PASS_V(nir, agx_nir_lower_sysvals); + NIR_PASS_V(nir, agx_nir_lower_sysvals, true); NIR_PASS_V(nir, agx_nir_layout_uniforms, compiled, &key.reserved_preamble); agx_compile_shader_nir(nir, &key, debug, &binary, &compiled->info); @@ -1712,20 +1712,18 @@ agx_compile_variant(struct agx_device *dev, struct pipe_context *pctx, /* Apply the VS key to the VS before linking it in */ NIR_PASS_V(vs, agx_nir_lower_vbo, &key->vbuf); - NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); + NIR_PASS_V(vs, agx_nir_lower_ia, &key->ia); + NIR_PASS_V(vs, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); NIR_PASS_V(nir, nir_lower_io_to_scalar, nir_var_shader_out, NULL, NULL); - /* Lower IA before VS sysvals to correctly handle indirect multidraws */ - agx_nir_lower_ia(vs, &key->ia); - /* Lower VS sysvals before it's merged in, so we access the correct shader - * stage for UBOs etc. + * stage for UBOs etc. Skip draw parameters, those are lowered later. */ - NIR_PASS_V(vs, agx_nir_lower_sysvals); + NIR_PASS_V(vs, agx_nir_lower_sysvals, false); /* Link VS with GS */ - NIR_PASS_V(nir, agx_nir_lower_gs, vs, dev->libagx, + NIR_PASS_V(nir, agx_nir_lower_gs, vs, dev->libagx, &key->ia, key->rasterizer_discard, &gs_count, &gs_copy, &pre_gs, &gs_out_prim, &gs_out_count_words); ralloc_free(vs); @@ -2178,7 +2176,8 @@ ia_needs_provoking(enum mesa_prim prim) } static bool -agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info) +agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info, + const struct pipe_draw_indirect_info *indirect) { /* Only proceed if there is a geometry shader. Due to input assembly * dependence, we don't bother to dirty track right now. 
@@ -2196,6 +2195,8 @@ agx_update_gs(struct agx_context *ctx, const struct pipe_draw_info *info) .ia.mode = translate_ia_mode(info->mode), .ia.flatshade_first = ia_needs_provoking(info->mode) && ctx->rast->base.flatshade_first, + .ia.indirect_multidraw = + indirect && indirect->indirect_draw_count != NULL, .rasterizer_discard = ctx->rast->base.rasterizer_discard, }; @@ -3403,6 +3404,25 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, .index_size_B = info->index_size, }; + if (indirect) { + struct agx_resource *rsrc = agx_resource(indirect->buffer); + agx_batch_reads(batch, rsrc); + + ia.draws = rsrc->bo->ptr.gpu + indirect->offset; + } + + if (indirect && indirect->indirect_draw_count) { + struct agx_resource *rsrc = agx_resource(indirect->indirect_draw_count); + agx_batch_reads(batch, rsrc); + + ia.count = rsrc->bo->ptr.gpu + indirect->indirect_draw_count_offset; + ia.draw_stride = indirect->stride; + + size_t max_sum_size = sizeof(uint32_t) * indirect->draw_count; + ia.prefix_sums = + agx_pool_alloc_aligned(&batch->pool, max_sum_size, 4).gpu; + } + batch->uniforms.input_assembly = agx_pool_upload_aligned(&batch->pool, &ia, sizeof(ia), 8); @@ -3448,10 +3468,7 @@ agx_batch_geometry_params(struct agx_batch *batch, uint64_t input_index_buffer, unsigned count_buffer_stride = batch->ctx->gs->gs_count_words * 4; if (indirect) { - struct agx_resource *rsrc = agx_resource(indirect->buffer); - params.input_indirect_desc = rsrc->bo->ptr.gpu + indirect->offset; params.count_buffer_stride = count_buffer_stride; - agx_batch_reads(batch, rsrc); } else { unsigned prim_per_instance = u_decomposed_prims_for_vertices(info->mode, draw->count); @@ -3498,20 +3515,23 @@ agx_launch_gs(struct agx_batch *batch, const struct pipe_draw_info *info, if (indirect) { assert(indirect->buffer && "drawauto already handled"); - if (!ctx->gs_setup_indirect[info->mode]) { + bool multidraw = (indirect->indirect_draw_count != NULL); + + if 
(!ctx->gs_setup_indirect[info->mode][multidraw]) { struct agx_shader_key base_key = {0}; - ctx->gs_setup_indirect[info->mode] = agx_compile_nir( - dev, agx_nir_gs_setup_indirect(dev->libagx, info->mode), &base_key, - NULL); + ctx->gs_setup_indirect[info->mode][multidraw] = agx_compile_nir( + dev, agx_nir_gs_setup_indirect(dev->libagx, info->mode, multidraw), + &base_key, NULL); } - const struct pipe_grid_info grid_1x1 = { - .block = {1, 1, 1}, + const struct pipe_grid_info grid_setup = { + .block = {multidraw ? 32 : 1, 1, 1}, .grid = {1, 1, 1}, }; - agx_launch(batch, &grid_1x1, ctx->gs_setup_indirect[info->mode], + agx_launch(batch, &grid_setup, + ctx->gs_setup_indirect[info->mode][multidraw], PIPE_SHADER_COMPUTE); /* Wrap the pool allocation in a fake resource for meta-Gallium use */ @@ -3651,6 +3671,12 @@ agx_needs_passthrough_gs(struct agx_context *ctx, return true; } + /* TODO: also sloppy, we should generate VDM commands from a shader */ + if (indirect && indirect->indirect_draw_count) { + perf_debug_ctx(ctx, "Using passthrough GS due to multidraw indirect"); + return true; + } + /* Transform feedback is layered on geometry shaders, so if transform * feedback is used, we need a GS. 
*/ @@ -3741,6 +3767,38 @@ agx_apply_passthrough_gs(struct agx_context *ctx, } } +static void +util_draw_multi_unroll_indirect(struct pipe_context *pctx, + const struct pipe_draw_info *info, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws) +{ + for (unsigned i = 0; i < indirect->draw_count; ++i) { + const struct pipe_draw_indirect_info subindirect = { + .buffer = indirect->buffer, + .count_from_stream_output = indirect->count_from_stream_output, + .offset = indirect->offset + (i * indirect->stride), + .draw_count = 1, + }; + + pctx->draw_vbo(pctx, info, i, &subindirect, draws, 1); + } +} + +static void +util_draw_multi_upload_indirect(struct pipe_context *pctx, + const struct pipe_draw_info *info, + const struct pipe_draw_indirect_info *indirect, + const struct pipe_draw_start_count_bias *draws) +{ + struct pipe_draw_indirect_info indirect_ = *indirect; + u_upload_data(pctx->const_uploader, 0, 4, 4, &indirect->draw_count, + &indirect_.indirect_draw_count_offset, + &indirect_.indirect_draw_count); + + pctx->draw_vbo(pctx, info, 0, &indirect_, draws, 1); +} + static void agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, unsigned drawid_offset, @@ -3757,6 +3815,14 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } + if (indirect && indirect->draw_count > 1 && !indirect->indirect_draw_count) { + assert(drawid_offset == 0); + assert(num_draws == 1); + + util_draw_multi_upload_indirect(pctx, info, indirect, draws); + return; + } + if (indirect && indirect->count_from_stream_output) { agx_draw_vbo_from_xfb(pctx, info, drawid_offset, indirect); return; @@ -3824,7 +3890,7 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, (ctx->dirty & AGX_DIRTY_VERTEX)) ctx->dirty |= AGX_DIRTY_VS; - agx_update_gs(ctx, info); + agx_update_gs(ctx, info, indirect); if (ctx->gs) { batch->geom_indirect = agx_pool_alloc_aligned_with_bo( @@ -3933,6 +3999,8 @@ 
agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, return; } + assert((!indirect || !indirect->indirect_draw_count) && "multidraw handled"); + /* Update batch masks based on current state */ if (ctx->dirty & AGX_DIRTY_BLEND) { /* TODO: Any point to tracking load? */ diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index f9c92677931..440734c60e0 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -509,7 +509,7 @@ struct agx_context { struct util_dynarray global_buffers; struct agx_compiled_shader *gs_prefix_sums[16]; - struct agx_compiled_shader *gs_setup_indirect[MESA_PRIM_MAX]; + struct agx_compiled_shader *gs_setup_indirect[MESA_PRIM_MAX][2]; struct agx_meta_cache meta; uint32_t syncobj; @@ -795,7 +795,7 @@ void agx_upload_uniforms(struct agx_batch *batch); uint64_t agx_upload_stage_uniforms(struct agx_batch *batch, uint64_t textures, enum pipe_shader_type stage); -bool agx_nir_lower_sysvals(nir_shader *shader); +bool agx_nir_lower_sysvals(nir_shader *shader, bool lower_draw_params); bool agx_nir_layout_uniforms(nir_shader *shader, struct agx_compiled_shader *compiled,