radv: Implement helpers for shader part caching.

Currently, shader part caching logic is duplicated between VS prolog and
PS/TCS epilogs. This commit introduces a common abstraction to
deduplicate the code.

Additionally, there are a few design decisions that diverge from the
current implementation:
1. A simple mutex is used instead of reader-writer lock. Prolog/epilog
   constructions are serialized, removing the need to free duplicate
   objects in case of a race.
2. A CS-local cache is used to quickly look up an entry without holding a
   lock. This eliminates locking in over 99% of cases.
3. A set is used to reduce the number of allocations.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26028>
This commit is contained in:
Tatsuyuki Ishi 2023-11-03 17:20:30 +09:00 committed by Marge Bot
parent 3cd6bb3e5d
commit 611545fbfe
2 changed files with 98 additions and 0 deletions

View file

@ -2207,6 +2207,75 @@ fail:
return NULL;
}
bool
radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct radv_shader_part_cache_ops *ops)
{
cache->ops = ops;
if (!_mesa_set_init(&cache->entries, NULL, cache->ops->hash, cache->ops->equals))
return false;
simple_mtx_init(&cache->lock, mtx_plain);
return true;
}
void
radv_shader_part_cache_finish(struct radv_device *device, struct radv_shader_part_cache *cache)
{
set_foreach (&cache->entries, entry)
radv_shader_part_unref(device, radv_shader_part_from_cache_entry(entry->key));
simple_mtx_destroy(&cache->lock);
ralloc_free(cache->entries.table);
}
/*
 * A cache with atomics-free fast path for prolog / epilog lookups.
 *
 * VS prologs and PS/TCS epilogs are used to support dynamic states. In
 * particular dynamic blend state is heavily used by Zink. These are called
 * every frame as a part of command buffer building, so these functions are
 * on the hot path.
 *
 * Originally this was implemented with a rwlock, but this led to high
 * overhead. To avoid locking altogether in the hot path, the cache is done
 * at two levels: one at device level, and another at each CS. Access to the
 * CS cache is externally synchronized and does not require a lock.
 *
 * The key is looked up in `local_entries` first and, on a miss, in the shared
 * `cache` while holding `cache->lock`. If neither set has it, the part is
 * built with `cache->ops->create` under the lock and recorded in both sets.
 *
 * Returns the shader part for `key`, or NULL if `cache->ops->create` failed;
 * in that case the entries added during the lookup are removed again.
 */
struct radv_shader_part *
radv_shader_part_cache_get(struct radv_device *device, struct radv_shader_part_cache *cache, struct set *local_entries,
                           const void *key)
{
   struct set_entry *local, *global;
   bool local_found = false, global_found = false;
   uint32_t hash = cache->ops->hash(key);

   /* Fast path: the local set is synchronized by the caller, so no lock is taken. */
   local = _mesa_set_search_or_add_pre_hashed(local_entries, hash, key, &local_found);
   if (local_found)
      return radv_shader_part_from_cache_entry(local->key);

   simple_mtx_lock(&cache->lock);
   global = _mesa_set_search_or_add_pre_hashed(&cache->entries, hash, key, &global_found);
   if (global_found) {
      /* Copy the key out while the lock is still held: `global` points into
       * the shared set's storage, which other threads may modify (and, when
       * the set grows, replace) as soon as the lock is released.
       */
      const void *shared_key = global->key;
      simple_mtx_unlock(&cache->lock);

      local->key = shared_key;
      return radv_shader_part_from_cache_entry(shared_key);
   }

   struct radv_shader_part *shader_part = cache->ops->create(device, key);
   if (!shader_part) {
      /* Creation failed: drop the entries that were just added for this key. */
      _mesa_set_remove(&cache->entries, global);
      simple_mtx_unlock(&cache->lock);
      _mesa_set_remove(local_entries, local);
      return NULL;
   }

   /* Make the set entry a pointer to the key, so that the hash and equals
    * functions from radv_shader_part_cache_ops can be directly used.
    */
   global->key = &shader_part->key;
   simple_mtx_unlock(&cache->lock);

   local->key = &shader_part->key;
   return shader_part;
}
static char *
radv_dump_nir_shaders(struct nir_shader *const *shaders, int shader_count)
{

View file

@ -621,6 +621,12 @@ struct radv_shader {
struct radv_shader_part {
uint32_t ref_count;
union {
struct radv_vs_prolog_key vs;
struct radv_ps_epilog_key ps;
struct radv_tcs_epilog_key tcs;
} key;
uint64_t va;
struct radeon_winsys_bo *bo;
@ -635,6 +641,18 @@ struct radv_shader_part {
char *disasm_string;
};
/* Callbacks that specialize a radv_shader_part_cache for one kind of key. */
struct radv_shader_part_cache_ops {
/* Hashes a key; also installed as the hash function of the cache's entry set. */
uint32_t (*hash)(const void *key);
/* Compares two keys; installed as the equality function of the cache's entry set. */
bool (*equals)(const void *a, const void *b);
/* Builds the shader part for a key; the cache treats a NULL result as failure. */
struct radv_shader_part *(*create)(struct radv_device *device, const void *key);
};
/* Shared cache of shader parts, queried through radv_shader_part_cache_get(). */
struct radv_shader_part_cache {
/* Taken by radv_shader_part_cache_get() around its accesses to `entries`. */
simple_mtx_t lock;
/* Hash, equality and creation callbacks for the keys stored in this cache. */
struct radv_shader_part_cache_ops *ops;
/* Set whose entry keys point at the `key` member of the cached radv_shader_part. */
struct set entries;
};
struct radv_pipeline_layout;
struct radv_shader_stage;
@ -722,6 +740,11 @@ struct radv_shader_part *radv_create_tcs_epilog(struct radv_device *device, cons
void radv_shader_part_destroy(struct radv_device *device, struct radv_shader_part *shader_part);
/* Sets up `cache` with `ops`; returns false if the entry set could not be initialized. */
bool radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct radv_shader_part_cache_ops *ops);
/* Unrefs every shader part held by `cache`, then destroys its lock and frees the set's table. */
void radv_shader_part_cache_finish(struct radv_device *device, struct radv_shader_part_cache *cache);
/* Returns the shader part for `key`, consulting `local_entries` before `cache`; NULL if creation fails. */
struct radv_shader_part *radv_shader_part_cache_get(struct radv_device *device, struct radv_shader_part_cache *cache,
struct set *local_entries, const void *key);
uint64_t radv_shader_get_va(const struct radv_shader *shader);
struct radv_shader *radv_find_shader(struct radv_device *device, uint64_t pc);
@ -776,6 +799,12 @@ radv_shader_part_unref(struct radv_device *device, struct radv_shader_part *shad
radv_shader_part_destroy(device, shader_part);
}
/*
 * Recover the shader part that owns a cache entry key.
 *
 * `key` must be the address of the `key` member embedded in a
 * struct radv_shader_part; the enclosing object is returned.
 */
static inline struct radv_shader_part *
radv_shader_part_from_cache_entry(const void *key)
{
   struct radv_shader_part *owner = container_of(key, struct radv_shader_part, key);

   return owner;
}
static inline unsigned
get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
{