radv: Implement helpers for shader part caching.

Currently, shader part caching logic is duplicated between VS prologs and PS/TCS epilogs. This commit introduces a common abstraction to deduplicate the code. Additionally, there are a few design decisions that diverge from the current implementation: 1. A simple mutex is used instead of a reader-writer lock. Prolog/epilog constructions are serialized, removing the need to free duplicate objects in case of a race. 2. A CS-local cache is used to quickly look up an entry without holding a lock. This eliminates locking in over 99% of cases. 3. A set is used to reduce the number of allocations. Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26028>
2023-11-03 17:20:30 +09:00 · 2023-11-03 17:20:30 +09:00 · 611545fbfe
commit 611545fbfe
parent 3cd6bb3e5d
2 changed files with 98 additions and 0 deletions
--- a/src/amd/vulkan/radv_shader.c
+++ b/src/amd/vulkan/radv_shader.c
@ -2207,6 +2207,75 @@ fail:
   return NULL;
 }

+/*
+ * Prepare a shader part cache for use.
+ *
+ * Stores the callback table, initializes the entry set with the table's
+ * hash/equals callbacks and then initializes the lock. Returns false when
+ * the entry set could not be initialized (the lock is left untouched in
+ * that case), true otherwise.
+ */
+bool
+radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct radv_shader_part_cache_ops *ops)
+{
+   cache->ops = ops;
+
+   if (_mesa_set_init(&cache->entries, NULL, ops->hash, ops->equals)) {
+      simple_mtx_init(&cache->lock, mtx_plain);
+      return true;
+   }
+
+   return false;
+}
+
+/*
+ * Tear down a shader part cache.
+ *
+ * Every entry key is mapped back to its owning shader part and that shader
+ * part is unreferenced; afterwards the lock is destroyed and the set's
+ * table is freed.
+ */
+void
+radv_shader_part_cache_finish(struct radv_device *device, struct radv_shader_part_cache *cache)
+{
+   set_foreach (&cache->entries, entry) {
+      struct radv_shader_part *shader_part = radv_shader_part_from_cache_entry(entry->key);
+
+      radv_shader_part_unref(device, shader_part);
+   }
+
+   simple_mtx_destroy(&cache->lock);
+   ralloc_free(cache->entries.table);
+}
+
+/*
+ * A cache with an atomics-free fast path for prolog / epilog lookups.
+ *
+ * VS prologs and PS/TCS epilogs are used to support dynamic states. In
+ * particular, dynamic blend state is heavily used by Zink. These lookups
+ * happen every frame as a part of command buffer building, so this function
+ * is on the hot path.
+ *
+ * Originally this was implemented with a rwlock, but this led to high
+ * overhead. To avoid locking altogether in the hot path, the cache is done
+ * at two levels: one at device level, and another at each CS. Access to the
+ * CS cache is externally synchronized and does not require a lock.
+ *
+ * device:        passed through to ops->create() on a miss in both levels.
+ * cache:         shared cache; its entry set is only touched under cache->lock.
+ * local_entries: caller-owned set, searched and updated without locking.
+ * key:           lookup key, hashed with cache->ops->hash().
+ *
+ * Returns the cached or newly created shader part, or NULL if ops->create()
+ * fails. This function does not take a reference on behalf of the caller.
+ */
+struct radv_shader_part *
+radv_shader_part_cache_get(struct radv_device *device, struct radv_shader_part_cache *cache, struct set *local_entries,
+                           const void *key)
+{
+   struct set_entry *local, *global;
+   bool local_found, global_found;
+   uint32_t hash = cache->ops->hash(key);
+
+   /* Fast path, no lock. On a miss this already inserts the local entry,
+    * temporarily keyed by the caller's key until it is patched below.
+    */
+   local = _mesa_set_search_or_add_pre_hashed(local_entries, hash, key, &local_found);
+   if (local_found)
+      return radv_shader_part_from_cache_entry(local->key);
+
+   simple_mtx_lock(&cache->lock);
+   global = _mesa_set_search_or_add_pre_hashed(&cache->entries, hash, key, &global_found);
+   if (global_found) {
+      /* Copy the key out while the lock is still held: once it is dropped,
+       * another thread may insert into cache->entries and resize its table,
+       * after which "global" must no longer be dereferenced.
+       */
+      const void *shared_key = global->key;
+
+      simple_mtx_unlock(&cache->lock);
+      local->key = shared_key;
+      return radv_shader_part_from_cache_entry(shared_key);
+   }
+
+   /* Creation happens under the lock, so concurrent misses for the same key
+    * are serialized and never build duplicate shader parts.
+    */
+   struct radv_shader_part *shader_part = cache->ops->create(device, key);
+   if (!shader_part) {
+      /* Undo both placeholder insertions so no entry keeps the caller's key. */
+      _mesa_set_remove(&cache->entries, global);
+      simple_mtx_unlock(&cache->lock);
+      _mesa_set_remove(local_entries, local);
+      return NULL;
+   }
+
+   /* Make the set entry a pointer to the key, so that the hash and equals
+    * functions from radv_shader_part_cache_ops can be directly used.
+    */
+   global->key = &shader_part->key;
+   simple_mtx_unlock(&cache->lock);
+   local->key = &shader_part->key;
+   return shader_part;
+}
+
 static char *
 radv_dump_nir_shaders(struct nir_shader *const *shaders, int shader_count)
 {
--- a/src/amd/vulkan/radv_shader.h
+++ b/src/amd/vulkan/radv_shader.h
@ -621,6 +621,12 @@ struct radv_shader {
 struct radv_shader_part {
   uint32_t ref_count;

+   union {
+      struct radv_vs_prolog_key vs;
+      struct radv_ps_epilog_key ps;
+      struct radv_tcs_epilog_key tcs;
+   } key;
+
   uint64_t va;

   struct radeon_winsys_bo *bo;
@ -635,6 +641,18 @@ struct radv_shader_part {
   char *disasm_string;
 };

+/* Callbacks that specialize a shader part cache for one kind of key. */
+struct radv_shader_part_cache_ops {
+   /* Hashes a key; the result is used for pre-hashed set lookups/inserts. */
+   uint32_t (*hash)(const void *key);
+   /* Key equality predicate handed to the cache's entry set. */
+   bool (*equals)(const void *a, const void *b);
+   /* Builds the shader part for a key on a cache miss; a NULL result is
+    * treated as failure by radv_shader_part_cache_get().
+    */
+   struct radv_shader_part *(*create)(struct radv_device *device, const void *key);
+};
+
+/* Shared cache of shader parts, looked up by key. */
+struct radv_shader_part_cache {
+   /* Held by radv_shader_part_cache_get() while "entries" is searched or modified. */
+   simple_mtx_t lock;
+   /* Hash/equals/create callbacks, stored by radv_shader_part_cache_init(). */
+   struct radv_shader_part_cache_ops *ops;
+   /* Set whose keys point at the "key" member of the cached shader parts. */
+   struct set entries;
+};
+
 struct radv_pipeline_layout;
 struct radv_shader_stage;

@ -722,6 +740,11 @@ struct radv_shader_part *radv_create_tcs_epilog(struct radv_device *device, cons

 void radv_shader_part_destroy(struct radv_device *device, struct radv_shader_part *shader_part);

+/* Initializes the cache with the given callbacks; returns false if the entry set cannot be set up. */
+bool radv_shader_part_cache_init(struct radv_shader_part_cache *cache, struct radv_shader_part_cache_ops *ops);
+/* Unreferences every cached shader part and releases the cache's lock and set storage. */
+void radv_shader_part_cache_finish(struct radv_device *device, struct radv_shader_part_cache *cache);
+/* Looks up "key" in local_entries first, then in the shared cache, creating the
+ * shader part on a miss in both; returns NULL if creation fails.
+ */
+struct radv_shader_part *radv_shader_part_cache_get(struct radv_device *device, struct radv_shader_part_cache *cache,
+                                                    struct set *local_entries, const void *key);
+
 uint64_t radv_shader_get_va(const struct radv_shader *shader);
 struct radv_shader *radv_find_shader(struct radv_device *device, uint64_t pc);

@ -776,6 +799,12 @@ radv_shader_part_unref(struct radv_device *device, struct radv_shader_part *shad
      radv_shader_part_destroy(device, shader_part);
 }

+/*
+ * Map a cache set key back to the shader part that embeds it.
+ *
+ * "key" must point at the "key" member of a struct radv_shader_part.
+ */
+static inline struct radv_shader_part *
+radv_shader_part_from_cache_entry(const void *key)
+{
+   struct radv_shader_part *owner = container_of(key, struct radv_shader_part, key);
+
+   return owner;
+}
+
 static inline unsigned
 get_tcs_input_vertex_stride(unsigned tcs_num_inputs)
 {