radeonsi: implement a non-scaled compute blit+resolve and use it on gfx11

This was written from scratch. Only a few pieces were taken from Indrajit's
code. This is also much simpler, and hopefully easier to review. For example,
out-of-bounds coordinates are handled trivially.

The new blit test proves that this is identical to u_blitter except for
a few precision differences (NaNs, sRGB) where the compute blit should be
more precise.

This is only enabled on gfx11 because it's slower than the gfx blit on gfx10.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/17782>
This commit is contained in:
Marek Olšák 2022-07-19 05:08:23 -04:00
parent 098b9a8d02
commit 49237c0eb3
6 changed files with 399 additions and 2 deletions

View file

@ -1219,6 +1219,9 @@ static void si_blit(struct pipe_context *ctx, const struct pipe_blit_info *info)
return;
}
if (si_compute_blit(sctx, info))
return;
si_gfx_blit(ctx, info);
}

View file

@ -35,7 +35,17 @@ static bool si_can_use_compute_blit(struct si_context *sctx, enum pipe_format fo
if (format == PIPE_FORMAT_A8R8_UNORM && is_store)
return false;
if (num_samples > 1)
/* MSAA image stores are broken. AMD_DEBUG=nofmask fixes them, implying that the FMASK
* expand pass doesn't work, but let's use the gfx blit, which should be faster because
* it doesn't require expanding the FMASK.
*
* TODO: Broken MSAA stores can cause app issues, though this issue might only affect
* internal blits, not sure.
*
* EQAA image stores are also unimplemented, which should be rejected here after MSAA
* image stores are fixed.
*/
if (num_samples > 1 && is_store)
return false;
if (util_format_is_depth_or_stencil(format))
@ -1014,3 +1024,103 @@ void si_compute_clear_render_target(struct pipe_context *ctx, struct pipe_surfac
ctx->set_constant_buffer(ctx, PIPE_SHADER_COMPUTE, 0, true, &saved_cb);
}
bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info)
{
/* Compute blits require D16 right now (see the ISA).
*
* Testing on Navi21 showed that the compute blit is slightly slower than the gfx blit.
* The compute blit is even slower with DCC stores. VP13 CATIA_plane_pencil is a good test
* for that because it's mostly just blits.
*
* TODO: benchmark the performance on gfx11
*/
if (sctx->gfx_level < GFX11)
return false;
if (!si_can_use_compute_blit(sctx, info->dst.format, info->dst.resource->nr_samples, true,
vi_dcc_enabled((struct si_texture*)info->dst.resource,
info->dst.level)) ||
!si_can_use_compute_blit(sctx, info->src.format, info->src.resource->nr_samples, false,
vi_dcc_enabled((struct si_texture*)info->src.resource,
info->src.level)))
return false;
if (info->alpha_blend ||
info->num_window_rectangles ||
info->scissor_enable ||
/* No scaling. */
info->dst.box.width != abs(info->src.box.width) ||
info->dst.box.height != abs(info->src.box.height) ||
info->dst.box.depth != abs(info->src.box.depth))
return false;
assert(info->src.box.depth >= 0);
/* Shader images. */
struct pipe_image_view image[2];
image[0].resource = info->src.resource;
image[0].shader_access = image[0].access = PIPE_IMAGE_ACCESS_READ;
image[0].format = info->src.format;
image[0].u.tex.level = info->src.level;
image[0].u.tex.first_layer = 0;
image[0].u.tex.last_layer = util_max_layer(info->src.resource, info->src.level);
image[1].resource = info->dst.resource;
image[1].shader_access = image[1].access = PIPE_IMAGE_ACCESS_WRITE;
image[1].format = info->dst.format;
image[1].u.tex.level = info->dst.level;
image[1].u.tex.first_layer = 0;
image[1].u.tex.last_layer = util_max_layer(info->dst.resource, info->dst.level);
/* Get the shader key. */
const struct util_format_description *dst_desc = util_format_description(info->dst.format);
unsigned i = util_format_get_first_non_void_channel(info->dst.format);
union si_compute_blit_shader_key options;
options.key = 0;
options.always_true = true;
options.src_is_1d = info->src.resource->target == PIPE_TEXTURE_1D ||
info->src.resource->target == PIPE_TEXTURE_1D_ARRAY;
options.dst_is_1d = info->dst.resource->target == PIPE_TEXTURE_1D ||
info->dst.resource->target == PIPE_TEXTURE_1D_ARRAY;
options.src_is_msaa = info->src.resource->nr_samples > 1;
options.dst_is_msaa = info->dst.resource->nr_samples > 1;
/* Resolving integer formats only copies sample 0. log2_samples is then unused. */
options.sample0_only = options.src_is_msaa && !options.dst_is_msaa &&
util_format_is_pure_integer(info->src.format);
unsigned num_samples = MAX2(info->src.resource->nr_samples, info->dst.resource->nr_samples);
options.log2_samples = options.sample0_only ? 0 : util_logbase2(num_samples);
options.flip_x = info->src.box.width < 0;
options.flip_y = info->src.box.height < 0;
options.sint_to_uint = util_format_is_pure_sint(info->src.format) &&
util_format_is_pure_uint(info->dst.format);
options.uint_to_sint = util_format_is_pure_uint(info->src.format) &&
util_format_is_pure_sint(info->dst.format);
options.dst_is_srgb = util_format_is_srgb(info->dst.format);
options.fp16_rtz = !util_format_is_pure_integer(info->dst.format) &&
(dst_desc->channel[i].size <= 10 ||
(dst_desc->channel[i].type == UTIL_FORMAT_TYPE_FLOAT &&
dst_desc->channel[i].size <= 16));
struct hash_entry *entry = _mesa_hash_table_search(sctx->cs_blit_shaders,
(void*)(uintptr_t)options.key);
void *shader = entry ? entry->data : NULL;
if (!shader) {
shader = si_create_blit_cs(sctx, &options);
_mesa_hash_table_insert(sctx->cs_blit_shaders,
(void*)(uintptr_t)options.key, shader);
}
sctx->cs_user_data[0] = (info->src.box.x & 0xffff) | ((info->dst.box.x & 0xffff) << 16);
sctx->cs_user_data[1] = (info->src.box.y & 0xffff) | ((info->dst.box.y & 0xffff) << 16);
sctx->cs_user_data[2] = (info->src.box.z & 0xffff) | ((info->dst.box.z & 0xffff) << 16);
struct pipe_grid_info grid = {0};
set_work_size(&grid, 8, 8, 1, info->dst.box.width, info->dst.box.height, info->dst.box.depth);
si_launch_grid_internal_images(sctx, image, 2, &grid, shader,
SI_OP_SYNC_BEFORE_AFTER |
(info->render_condition_enable ? SI_OP_CS_RENDER_COND_ENABLE : 0));
return true;
}

View file

@ -356,6 +356,13 @@ static void si_destroy_context(struct pipe_context *context)
if (!(sctx->context_flags & SI_CONTEXT_FLAG_AUX))
p_atomic_dec(&context->screen->num_contexts);
if (sctx->cs_blit_shaders) {
hash_table_foreach(sctx->cs_blit_shaders, entry) {
context->delete_compute_state(context, entry->data);
}
_mesa_hash_table_destroy(sctx->cs_blit_shaders, NULL);
}
FREE(sctx);
}
@ -827,6 +834,11 @@ static struct pipe_context *si_create_context(struct pipe_screen *screen, unsign
}
sctx->initial_gfx_cs_size = sctx->gfx_cs.current.cdw;
sctx->cs_blit_shaders = _mesa_hash_table_create_u32_keys(NULL);
if (!sctx->cs_blit_shaders)
goto fail;
return &sctx->b;
fail:
fprintf(stderr, "radeonsi: Failed to create a context.\n");

View file

@ -984,6 +984,7 @@ struct si_context {
void *cs_clear_12bytes_buffer;
void *cs_dcc_retile[32];
void *cs_fmask_expand[3][2]; /* [log2(samples)-1][is_array] */
struct hash_table *cs_blit_shaders;
struct si_screen *screen;
struct util_debug_callback debug;
struct ac_llvm_compiler compiler; /* only non-threaded compilation */
@ -1436,6 +1437,7 @@ void si_retile_dcc(struct si_context *sctx, struct si_texture *tex);
void gfx9_clear_dcc_msaa(struct si_context *sctx, struct pipe_resource *res, uint32_t clear_value,
unsigned flags, enum si_coherency coher);
void si_compute_expand_fmask(struct pipe_context *ctx, struct pipe_resource *tex);
bool si_compute_blit(struct si_context *sctx, const struct pipe_blit_info *info);
void si_init_compute_blit_functions(struct si_context *sctx);
/* si_cp_dma.c */
@ -1547,6 +1549,31 @@ void *si_create_dcc_retile_cs(struct si_context *sctx, struct radeon_surf *surf)
void *gfx9_create_clear_dcc_msaa_cs(struct si_context *sctx, struct si_texture *tex);
void *si_create_passthrough_tcs(struct si_context *sctx);
union si_compute_blit_shader_key {
struct {
/* The key saved in _mesa_hash_table_create_u32_keys() can't be 0. */
bool always_true:1;
/* Declaration modifiers. */
bool src_is_1d:1;
bool dst_is_1d:1;
bool src_is_msaa:1;
bool dst_is_msaa:1;
uint8_t log2_samples:4;
bool sample0_only:1; /* src is MSAA, dst is not MSAA, log2_samples is ignored */
/* Source coordinate modifiers. */
bool flip_x:1;
bool flip_y:1;
/* Output modifiers. */
bool sint_to_uint:1;
bool uint_to_sint:1;
bool dst_is_srgb:1;
bool fp16_rtz:1; /* only for equality with pixel shaders, not necessary otherwise */
};
uint32_t key;
};
void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options);
/* si_shaderlib_tgsi.c */
void *si_get_blitter_vs(struct si_context *sctx, enum blitter_attrib_type type,
unsigned num_layers);

View file

@ -71,6 +71,12 @@ static void unpack_2x16(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_s
*y = nir_ushr(b, src, nir_imm_int(b, 16));
}
static void unpack_2x16_signed(nir_builder *b, nir_ssa_def *src, nir_ssa_def **x, nir_ssa_def **y)
{
*x = nir_i2i32(b, nir_u2u16(b, src));
*y = nir_ishr(b, src, nir_imm_int(b, 16));
}
static nir_ssa_def *
deref_ssa(nir_builder *b, nir_variable *var)
{
@ -347,3 +353,242 @@ void *si_create_passthrough_tcs(struct si_context *sctx)
return create_shader_state(sctx, b.shader);
}
static nir_ssa_def *convert_linear_to_srgb(nir_builder *b, nir_ssa_def *input)
{
/* There are small precision differences compared to CB, so the gfx blit will return slightly
* different results.
*/
nir_ssa_def *cmp[3];
for (unsigned i = 0; i < 3; i++)
cmp[i] = nir_flt(b, nir_channel(b, input, i), nir_imm_float(b, 0.0031308));
nir_ssa_def *ltvals[3];
for (unsigned i = 0; i < 3; i++)
ltvals[i] = nir_fmul(b, nir_channel(b, input, i), nir_imm_float(b, 12.92));
nir_ssa_def *gtvals[3];
for (unsigned i = 0; i < 3; i++) {
gtvals[i] = nir_fpow(b, nir_channel(b, input, i), nir_imm_float(b, 1.0/2.4));
gtvals[i] = nir_fmul(b, gtvals[i], nir_imm_float(b, 1.055));
gtvals[i] = nir_fsub(b, gtvals[i], nir_imm_float(b, 0.055));
}
nir_ssa_def *comp[4];
for (unsigned i = 0; i < 3; i++)
comp[i] = nir_bcsel(b, cmp[i], ltvals[i], gtvals[i]);
comp[3] = nir_channel(b, input, 3);
return nir_vec(b, comp, 4);
}
static nir_ssa_def *image_resolve_msaa(nir_builder *b, nir_variable *img, unsigned num_samples,
nir_ssa_def *coord, enum amd_gfx_level gfx_level)
{
nir_ssa_def *zero = nir_imm_int(b, 0);
nir_ssa_def *result = NULL;
nir_variable *var = NULL;
/* Gfx11 doesn't support samples_identical, so we can't use it. */
if (gfx_level < GFX11) {
/* We need a local variable to get the result out of conditional branches in SSA. */
var = nir_local_variable_create(b->impl, glsl_vec4_type(), NULL);
/* If all samples are identical, load only sample 0. */
nir_push_if(b, nir_image_deref_samples_identical(b, 1, deref_ssa(b, img), coord));
result = nir_image_deref_load(b, 4, 32, deref_ssa(b, img), coord, zero, zero);
nir_store_var(b, var, result, 0xf);
nir_push_else(b, NULL);
}
/* Average all samples. (the only options on gfx11) */
result = NULL;
for (unsigned i = 0; i < num_samples; i++) {
nir_ssa_def *sample = nir_image_deref_load(b, 4, 32, deref_ssa(b, img),
coord, nir_imm_int(b, i), zero);
result = result ? nir_fadd(b, result, sample) : sample;
}
result = nir_fmul_imm(b, result, 1.0 / num_samples); /* average the sum */
if (gfx_level < GFX11) {
/* Exit the conditional branch and get the result out of the branch. */
nir_store_var(b, var, result, 0xf);
nir_pop_if(b, NULL);
result = nir_load_var(b, var);
}
return result;
}
static nir_ssa_def *apply_blit_output_modifiers(nir_builder *b, nir_ssa_def *color,
const union si_compute_blit_shader_key *options)
{
if (options->sint_to_uint)
color = nir_imax(b, color, nir_imm_int(b, 0));
if (options->uint_to_sint)
color = nir_umin(b, color, nir_imm_int(b, INT32_MAX));
if (options->dst_is_srgb)
color = convert_linear_to_srgb(b, color);
/* Convert to FP16 with rtz to match the pixel shader. Not necessary, but it helps verify
* the behavior of the whole shader by comparing it to the gfx blit.
*/
if (options->fp16_rtz)
color = nir_f2f16_rtz(b, color);
return color;
}
/* The compute blit shader.
*
* Differences compared to u_blitter (the gfx blit):
* - u_blitter doesn't preserve NaNs, but the compute blit does
* - u_blitter has lower linear->SRGB precision because the CB block doesn't
* use FP32, but the compute blit does.
*
* Other than that, non-scaled blits are identical to u_blitter.
*
* Implementation details:
* - Out-of-bounds dst coordinates are not clamped at all. The hw drops
* out-of-bounds stores for us.
* - Out-of-bounds src coordinates are clamped by emulating CLAMP_TO_EDGE using
* the image_size NIR intrinsic.
* - X/Y flipping just does this in the shader: -threadIDs - 1
* - MSAA copies are implemented but disabled because MSAA image stores don't
* work.
*/
void *si_create_blit_cs(struct si_context *sctx, const union si_compute_blit_shader_key *options)
{
const nir_shader_compiler_options *nir_options =
sctx->b.screen->get_compiler_options(sctx->b.screen, PIPE_SHADER_IR_NIR, PIPE_SHADER_COMPUTE);
nir_builder b = nir_builder_init_simple_shader(MESA_SHADER_COMPUTE, nir_options,
"blit_non_scaled_cs");
b.shader->info.num_images = 2;
if (options->src_is_msaa)
BITSET_SET(b.shader->info.msaa_images, 0);
if (options->dst_is_msaa)
BITSET_SET(b.shader->info.msaa_images, 1);
b.shader->info.workgroup_size[0] = 8;
b.shader->info.workgroup_size[1] = 8;
b.shader->info.workgroup_size[2] = 1;
b.shader->info.cs.user_data_components_amd = 3;
const struct glsl_type *img_type[2] = {
glsl_image_type(options->src_is_1d ? GLSL_SAMPLER_DIM_1D :
options->src_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
/*is_array*/ true, GLSL_TYPE_FLOAT),
glsl_image_type(options->dst_is_1d ? GLSL_SAMPLER_DIM_1D :
options->dst_is_msaa ? GLSL_SAMPLER_DIM_MS : GLSL_SAMPLER_DIM_2D,
/*is_array*/ true, GLSL_TYPE_FLOAT),
};
nir_variable *img_src = nir_variable_create(b.shader, nir_var_uniform, img_type[0], "img0");
img_src->data.binding = 0;
nir_variable *img_dst = nir_variable_create(b.shader, nir_var_uniform, img_type[1], "img1");
img_dst->data.binding = 1;
nir_ssa_def *zero = nir_imm_int(&b, 0);
/* Instructions. */
/* Let's work with 0-based src and dst coordinates (thread IDs) first. */
nir_ssa_def *dst_xyz = get_global_ids(&b, 3);
nir_ssa_def *src_xyz = dst_xyz;
/* Flip src coordinates. */
for (unsigned i = 0; i < 2; i++) {
if (i ? options->flip_y : options->flip_x) {
/* x goes from 0 to (dim - 1).
* The flipped blit should load from -dim to -1.
* Therefore do: x = -x - 1;
*/
nir_ssa_def *comp = nir_channel(&b, src_xyz, i);
comp = nir_iadd_imm(&b, nir_ineg(&b, comp), -1);
src_xyz = nir_vector_insert_imm(&b, src_xyz, comp, i);
}
}
/* Add box.xyz. */
nir_ssa_def *coord_src = NULL, *coord_dst = NULL;
unpack_2x16_signed(&b, nir_channels(&b, nir_load_user_data_amd(&b), 0x7),
&coord_src, &coord_dst);
coord_dst = nir_iadd(&b, coord_dst, dst_xyz);
coord_src = nir_iadd(&b, coord_src, src_xyz);
/* Clamp to edge for src, only X and Y because Z can't be out of bounds. */
unsigned src_clamp_channels = options->src_is_1d ? 0x1 : 0x3;
nir_ssa_def *dim = nir_image_deref_size(&b, 4, 32, deref_ssa(&b, img_src), zero);
dim = nir_channels(&b, dim, src_clamp_channels);
nir_ssa_def *coord_src_clamped = nir_channels(&b, coord_src, src_clamp_channels);
coord_src_clamped = nir_imax(&b, coord_src_clamped, nir_imm_int(&b, 0));
coord_src_clamped = nir_imin(&b, coord_src_clamped, nir_iadd_imm(&b, dim, -1));
for (unsigned i = 0; i < util_bitcount(src_clamp_channels); i++)
coord_src = nir_vector_insert_imm(&b, coord_src, nir_channel(&b, coord_src_clamped, i), i);
/* Swizzle coordinates for 1D_ARRAY. */
static unsigned swizzle_xz[] = {0, 2, 0, 0};
if (options->src_is_1d)
coord_src = nir_swizzle(&b, coord_src, swizzle_xz, 4);
if (options->dst_is_1d)
coord_dst = nir_swizzle(&b, coord_dst, swizzle_xz, 4);
/* Coordinates must have 4 channels in NIR. */
coord_src = nir_pad_vector(&b, coord_src, 4);
coord_dst = nir_pad_vector(&b, coord_dst, 4);
/* TODO: out-of-bounds image stores have no effect, but we could jump over them for better perf */
/* Execute the image loads and stores. */
unsigned num_samples = 1 << options->log2_samples;
nir_ssa_def *color;
if (options->src_is_msaa && !options->dst_is_msaa && !options->sample0_only) {
/* MSAA resolving (downsampling). */
assert(num_samples > 1);
color = image_resolve_msaa(&b, img_src, num_samples, coord_src, sctx->gfx_level);
color = apply_blit_output_modifiers(&b, color, options);
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
} else if (options->src_is_msaa && options->dst_is_msaa) {
/* MSAA copy. */
nir_ssa_def *color[16];
assert(num_samples > 1);
/* Group loads together and then stores. */
for (unsigned i = 0; i < num_samples; i++) {
color[i] = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src,
nir_imm_int(&b, i), zero);
}
for (unsigned i = 0; i < num_samples; i++)
color[i] = apply_blit_output_modifiers(&b, color[i], options);
for (unsigned i = 0; i < num_samples; i++) {
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
nir_imm_int(&b, i), color[i], zero);
}
} else if (!options->src_is_msaa && options->dst_is_msaa) {
/* MSAA upsampling. */
assert(num_samples > 1);
color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
color = apply_blit_output_modifiers(&b, color, options);
for (unsigned i = 0; i < num_samples; i++) {
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst,
nir_imm_int(&b, i), color, zero);
}
} else {
/* Non-MSAA copy or read sample 0 only. */
/* src2 = sample_index (zero), src3 = lod (zero) */
assert(num_samples == 1);
color = nir_image_deref_load(&b, 4, 32, deref_ssa(&b, img_src), coord_src, zero, zero);
color = apply_blit_output_modifiers(&b, color, options);
nir_image_deref_store(&b, deref_ssa(&b, img_dst), coord_dst, zero, color, zero);
}
return create_shader_state(sctx, b.shader);
}

View file

@ -952,7 +952,7 @@ void si_test_blit(struct si_screen *sscreen, unsigned test_flags)
if (only_cb_resolve)
success = si_msaa_resolve_blit_via_CB(ctx, &info);
else
success = false;
success = si_compute_blit(sctx, &info);
if (success) {
printf(" %-7s", only_cb_resolve ? "resolve" : "comp");