ir3, freedreno: implement GL_ARB_shader_draw_parameters

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/21593>
This commit is contained in:
Amber 2023-02-28 14:14:35 +01:00 committed by Marge Bot
parent 2cc77088b9
commit 7609f83c70
13 changed files with 84 additions and 22 deletions

View file

@ -285,6 +285,10 @@ ir3_compiler_create(struct fd_device *dev, const struct fd_dev_id *dev_id,
compiler->nir_options.force_indirect_unrolling = nir_var_all;
}
if (options->lower_base_vertex) {
compiler->nir_options.lower_base_vertex = true;
}
/* 16-bit ALU op generation is mostly controlled by frontend compiler options, but
* this core NIR option enables some optimizations of 16-bit operations.
*/

View file

@ -67,6 +67,9 @@ struct ir3_compiler_options {
/* True if 16-bit descriptors are used for both 16-bit and 32-bit access. */
bool storage_16bit;
/* If base_vertex should be lowered in nir */
bool lower_base_vertex;
};
struct ir3_compiler {

View file

@ -2204,6 +2204,12 @@ emit_intrinsic(struct ir3_context *ctx, nir_intrinsic_instr *intr)
}
dst[0] = ctx->basevertex;
break;
case nir_intrinsic_load_is_indexed_draw:
if (!ctx->is_indexed_draw) {
ctx->is_indexed_draw = create_driver_param(ctx, IR3_DP_IS_INDEXED_DRAW);
}
dst[0] = ctx->is_indexed_draw;
break;
case nir_intrinsic_load_draw_id:
if (!ctx->draw_id) {
ctx->draw_id = create_driver_param(ctx, IR3_DP_DRAWID);

View file

@ -85,7 +85,7 @@ struct ir3_context {
/* For vertex shaders, keep track of the system values sources */
struct ir3_instruction *vertex_id, *basevertex, *instance_id, *base_instance,
*draw_id, *view_index;
*draw_id, *view_index, *is_indexed_draw;
/* For fragment shaders: */
struct ir3_instruction *samp_id, *samp_mask_in;

View file

@ -864,6 +864,10 @@ ir3_nir_scan_driver_consts(struct ir3_compiler *compiler, nir_shader *shader, st
layout->num_driver_params =
MAX2(layout->num_driver_params, IR3_DP_VTXID_BASE + 1);
break;
case nir_intrinsic_load_is_indexed_draw:
layout->num_driver_params =
MAX2(layout->num_driver_params, IR3_DP_IS_INDEXED_DRAW + 1);
break;
case nir_intrinsic_load_base_instance:
layout->num_driver_params =
MAX2(layout->num_driver_params, IR3_DP_INSTID_BASE + 1);

View file

@ -69,11 +69,12 @@ enum ir3_driver_param {
IR3_DP_VTXID_BASE = 1,
IR3_DP_INSTID_BASE = 2,
IR3_DP_VTXCNT_MAX = 3,
IR3_DP_IS_INDEXED_DRAW = 4, /* Note: boolean, ie. 0 or ~0 */
/* user-clip-plane components, up to 8x vec4's: */
IR3_DP_UCP0_X = 4,
IR3_DP_UCP0_X = 5,
/* .... */
IR3_DP_UCP7_W = 35,
IR3_DP_VS_COUNT = 36, /* must be aligned to vec4 */
IR3_DP_UCP7_W = 36,
IR3_DP_VS_COUNT = 40, /* must be aligned to vec4 */
/* TCS driver params: */
IR3_DP_HS_DEFAULT_OUTER_LEVEL_X = 0,

View file

@ -186,12 +186,25 @@ fd6_memory_barrier(struct pipe_context *pctx, unsigned flags)
if (flags & (PIPE_BARRIER_TEXTURE |
PIPE_BARRIER_IMAGE |
PIPE_BARRIER_INDIRECT_BUFFER |
PIPE_BARRIER_UPDATE_BUFFER |
PIPE_BARRIER_UPDATE_TEXTURE)) {
flushes |= FD6_FLUSH_CACHE | FD6_WAIT_FOR_IDLE;
}
if (flags & PIPE_BARRIER_INDIRECT_BUFFER) {
flushes |= FD6_FLUSH_CACHE | FD6_WAIT_FOR_IDLE;
/* Various firmware bugs/inconsistencies mean that some indirect draw opcodes
* do not wait for WFI's to complete before executing. Add a WAIT_FOR_ME if
* pending for these opcodes. This may result in a few extra WAIT_FOR_ME's
* with these opcodes, but the alternative would add unnecessary WAIT_FOR_ME's
* before draw opcodes that don't need it.
*/
if (fd_context(pctx)->screen->info->a6xx.indirect_draw_wfm_quirk) {
flushes |= FD6_WAIT_FOR_ME;
}
}
if (flags & PIPE_BARRIER_FRAMEBUFFER) {
fd6_texture_barrier(pctx, PIPE_TEXTURE_BARRIER_FRAMEBUFFER);
}

View file

@ -300,12 +300,12 @@ fd6_build_driver_params(struct fd6_emit *emit)
if (emit->vs->need_driver_params) {
ir3_emit_driver_params(emit->vs, dpconstobj, ctx, emit->info,
emit->indirect, emit->draw);
emit->indirect, emit->draw, emit->draw_id);
}
if (emit->gs && emit->gs->need_driver_params) {
ir3_emit_driver_params(emit->gs, dpconstobj, ctx, emit->info,
emit->indirect, emit->draw);
emit->indirect, emit->draw, 0);
}
if (emit->hs && emit->hs->need_driver_params) {
@ -314,7 +314,7 @@ fd6_build_driver_params(struct fd6_emit *emit)
if (emit->ds && emit->ds->need_driver_params) {
ir3_emit_driver_params(emit->ds, dpconstobj, ctx, emit->info,
emit->indirect, emit->draw);
emit->indirect, emit->draw, 0);
}
fd6_ctx->has_dp_state = true;

View file

@ -74,26 +74,39 @@ draw_emit_xfb(struct fd_ringbuffer *ring, struct CP_DRAW_INDX_OFFSET_0 *draw0,
}
static void
draw_emit_indirect(struct fd_ringbuffer *ring,
draw_emit_indirect(struct fd_context *ctx,
struct fd_ringbuffer *ring,
struct CP_DRAW_INDX_OFFSET_0 *draw0,
const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
unsigned index_offset)
unsigned index_offset, uint32_t driver_param)
{
struct fd_resource *ind = fd_resource(indirect->buffer);
if (info->index_size) {
OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 9);
OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value);
OUT_RING(ring,
(A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_INDEXED)
| A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param)));
struct pipe_resource *idx = info->index.resource;
unsigned max_indices = (idx->width0 - index_offset) / info->index_size;
OUT_PKT(ring, CP_DRAW_INDX_INDIRECT, pack_CP_DRAW_INDX_OFFSET_0(*draw0),
A5XX_CP_DRAW_INDX_INDIRECT_INDX_BASE(fd_resource(idx)->bo,
index_offset),
A5XX_CP_DRAW_INDX_INDIRECT_3(.max_indices = max_indices),
A5XX_CP_DRAW_INDX_INDIRECT_INDIRECT(ind->bo, indirect->offset));
OUT_RING(ring, indirect->draw_count);
/* index buffer address (index bo + index_offset) */
OUT_RELOC(ring, fd_resource(idx)->bo, index_offset, 0, 0);
/* maximum number of indices that fit in the index buffer past index_offset */
OUT_RING(ring, max_indices);
OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0);
OUT_RING(ring, indirect->stride);
} else {
OUT_PKT(ring, CP_DRAW_INDIRECT, pack_CP_DRAW_INDX_OFFSET_0(*draw0),
A5XX_CP_DRAW_INDIRECT_INDIRECT(ind->bo, indirect->offset));
OUT_PKT7(ring, CP_DRAW_INDIRECT_MULTI, 6);
OUT_RING(ring, pack_CP_DRAW_INDX_OFFSET_0(*draw0).value);
OUT_RING(ring,
(A6XX_CP_DRAW_INDIRECT_MULTI_1_OPCODE(INDIRECT_OP_NORMAL)
| A6XX_CP_DRAW_INDIRECT_MULTI_1_DST_OFF(driver_param)));
OUT_RING(ring, indirect->draw_count);
OUT_RELOC(ring, ind->bo, indirect->offset, 0, 0);
OUT_RING(ring, indirect->stride);
}
}
@ -228,6 +241,7 @@ fd6_draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info,
emit.state.num_groups = 0;
emit.streamout_mask = 0;
emit.prog = NULL;
emit.draw_id = 0;
if (!(ctx->prog.vs && ctx->prog.fs))
return;
@ -365,7 +379,14 @@ fd6_draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info,
if (indirect->count_from_stream_output) {
draw_emit_xfb(ring, &draw0, info, indirect);
} else {
draw_emit_indirect(ring, &draw0, info, indirect, index_offset);
const struct ir3_const_state *const_state = ir3_const_state(emit.vs);
uint32_t dst_offset_dp = const_state->offsets.driver_param;
/* If unused, pass 0 for DST_OFF: */
if (dst_offset_dp > emit.vs->constlen)
dst_offset_dp = 0;
draw_emit_indirect(ctx, ring, &draw0, info, indirect, index_offset, dst_offset_dp);
}
} else {
draw_emit(ring, &draw0, info, &draws[0], index_offset);
@ -401,6 +422,7 @@ fd6_draw_vbos(struct fd_context *ctx, const struct pipe_draw_info *info,
if (emit.dirty_groups) {
emit.state.num_groups = 0;
emit.draw = &draws[i];
emit.draw_id = info->increment_draw_id ? i : 0;
fd6_emit_3d_state<CHIP>(ring, &emit);
}

View file

@ -183,6 +183,7 @@ struct fd6_emit {
bool rasterflat : 1;
bool primitive_restart : 1;
uint8_t streamout_mask;
uint32_t draw_id;
/* cached to avoid repeated lookups: */
const struct fd6_program_state *prog;

View file

@ -213,6 +213,8 @@ fd_screen_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_COPY_BETWEEN_COMPRESSED_AND_PLAIN_FORMATS:
case PIPE_CAP_CLEAR_TEXTURE:
case PIPE_CAP_MULTI_DRAW_INDIRECT:
case PIPE_CAP_DRAW_PARAMETERS:
return is_a6xx(screen);
case PIPE_CAP_VERTEX_BUFFER_OFFSET_4BYTE_ALIGNED_ONLY:

View file

@ -454,17 +454,19 @@ ir3_emit_driver_params(const struct ir3_shader_variant *v,
struct fd_ringbuffer *ring, struct fd_context *ctx,
const struct pipe_draw_info *info,
const struct pipe_draw_indirect_info *indirect,
const struct pipe_draw_start_count_bias *draw) assert_dt
const struct pipe_draw_start_count_bias *draw,
const uint32_t draw_id) assert_dt
{
assert(v->need_driver_params);
const struct ir3_const_state *const_state = ir3_const_state(v);
uint32_t offset = const_state->offsets.driver_param;
uint32_t vertex_params[IR3_DP_VS_COUNT] = {
[IR3_DP_DRAWID] = 0, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */
[IR3_DP_DRAWID] = draw_id, /* filled by hw (CP_DRAW_INDIRECT_MULTI) */
[IR3_DP_VTXID_BASE] = info->index_size ? draw->index_bias : draw->start,
[IR3_DP_INSTID_BASE] = info->start_instance,
[IR3_DP_VTXCNT_MAX] = ctx->streamout.max_tf_vtx,
[IR3_DP_IS_INDEXED_DRAW] = info->index_size != 0 ? ~0 : 0,
};
if (v->key.ucp_enables) {
struct pipe_clip_state *ucp = &ctx->ucp;
@ -573,7 +575,7 @@ ir3_emit_vs_consts(const struct ir3_shader_variant *v,
/* emit driver params every time: */
if (info && v->need_driver_params) {
ring_wfi(ctx->batch, ring);
ir3_emit_driver_params(v, ring, ctx, info, indirect, draw);
ir3_emit_driver_params(v, ring, ctx, info, indirect, draw, 0);
}
}

View file

@ -570,6 +570,10 @@ ir3_screen_init(struct pipe_screen *pscreen)
.bindless_fb_read_slot = IR3_BINDLESS_IMAGE_OFFSET +
IR3_BINDLESS_IMAGE_COUNT - 1 - screen->max_rts,
};
if (screen->gen >= 6) {
options.lower_base_vertex = true;
}
screen->compiler = ir3_compiler_create(screen->dev, screen->dev_id, &options);
/* TODO do we want to limit things to # of fast cores, or just limit