From cf2dc2d512fc8e621b99e6c53326f0957342ea70 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= <marek.olsak@amd.com>
Date: Fri, 8 Dec 2023 20:10:11 -0500
Subject: [PATCH] winsys/amdgpu: don't layer slabs, use only 1 level of slabs,
 it improves perf

This increases FPS in VP2020/Catia1 by 10-18%.

I have no rational explanation for this.

In the most extreme case, 8192 256B slab BOs (smallest size) are now
allocated from a single 2MB slab.

Reviewed-by: Pierre-Eric Pelloux-Prayer <pierre-eric.pelloux-prayer@amd.com>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26643>
---
 src/gallium/winsys/amdgpu/drm/amdgpu_bo.c     | 95 +++++++------------
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 47 ++++-----
 src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h |  8 +-
 3 files changed, 49 insertions(+), 101 deletions(-)

diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
index 86d57cf2eaa..a999d7e2882 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c
@@ -229,9 +229,7 @@ static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buff
 
 static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws)
 {
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++)
-      pb_slabs_reclaim(&ws->bo_slabs[i]);
-
+   pb_slabs_reclaim(&ws->bo_slabs);
    pb_cache_release_all_buffers(&ws->bo_cache);
 }
 
@@ -615,25 +613,11 @@ bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry)
    return amdgpu_bo_can_reclaim(priv, &bo->b.base);
 }
 
-static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size)
-{
-   /* Find the correct slab allocator for the given size. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      struct pb_slabs *slabs = &ws->bo_slabs[i];
-
-      if (size <= 1 << (slabs->min_order + slabs->num_orders - 1))
-         return slabs;
-   }
-
-   assert(0);
-   return NULL;
-}
-
 static unsigned get_slab_wasted_size(struct amdgpu_winsys *ws, struct amdgpu_bo_slab_entry *bo)
 {
    assert(bo->b.base.size <= bo->entry.slab->entry_size);
    assert(bo->b.base.size < (1 << bo->b.base.alignment_log2) ||
-          bo->b.base.size < 1 << ws->bo_slabs[0].min_order ||
+          bo->b.base.size < 1 << ws->bo_slabs.min_order ||
           bo->b.base.size > bo->entry.slab->entry_size / 2);
    return bo->entry.slab->entry_size - bo->b.base.size;
 }
@@ -642,23 +626,20 @@ static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer *
 {
    struct amdgpu_winsys *ws = amdgpu_winsys(rws);
    struct amdgpu_bo_slab_entry *bo = get_slab_entry_bo(amdgpu_winsys_bo(_buf));
-   struct pb_slabs *slabs;
-
-   slabs = get_slabs(ws, bo->b.base.size);
 
    if (bo->b.base.placement & RADEON_DOMAIN_VRAM)
       ws->slab_wasted_vram -= get_slab_wasted_size(ws, bo);
    else
       ws->slab_wasted_gtt -= get_slab_wasted_size(ws, bo);
 
-   pb_slab_free(slabs, &bo->entry);
+   pb_slab_free(&ws->bo_slabs, &bo->entry);
 }
 
 /* Return the power of two size of a slab entry matching the input size. */
 static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size)
 {
    unsigned entry_size = util_next_power_of_two(size);
-   unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order;
+   unsigned min_entry_size = 1 << ws->bo_slabs.min_order;
 
    return MAX2(entry_size, min_entry_size);
 }
@@ -682,44 +663,37 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
    enum radeon_bo_domain domains = radeon_domain_from_heap(heap);
    enum radeon_bo_flag flags = radeon_flags_from_heap(heap);
    uint32_t base_id;
-   unsigned slab_size = 0;
 
    if (!slab)
       return NULL;
 
    /* Determine the slab buffer size. */
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      unsigned max_entry_size = 1 << (ws->bo_slabs[i].min_order + ws->bo_slabs[i].num_orders - 1);
+   unsigned max_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
 
-      if (entry_size <= max_entry_size) {
-         /* The slab size is twice the size of the largest possible entry. */
-         slab_size = max_entry_size * 2;
+   assert(entry_size <= max_entry_size);
 
-         if (!util_is_power_of_two_nonzero(entry_size)) {
-            assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
+   /* The slab size is twice the size of the largest possible entry. */
+   unsigned slab_size = max_entry_size * 2;
 
-            /* If the entry size is 3/4 of a power of two, we would waste space and not gain
-             * anything if we allocated only twice the power of two for the backing buffer:
-             *   2 * 3/4 = 1.5 usable with buffer size 2
-             *
-             * Allocating 5 times the entry size leads us to the next power of two and results
-             * in a much better memory utilization:
-             *   5 * 3/4 = 3.75 usable with buffer size 4
-             */
-            if (entry_size * 5 > slab_size)
-               slab_size = util_next_power_of_two(entry_size * 5);
-         }
+   if (!util_is_power_of_two_nonzero(entry_size)) {
+      assert(util_is_power_of_two_nonzero(entry_size * 4 / 3));
 
-         /* The largest slab should have the same size as the PTE fragment
-          * size to get faster address translation.
-          */
-         if (i == NUM_SLAB_ALLOCATORS - 1 &&
-             slab_size < ws->info.pte_fragment_size)
-            slab_size = ws->info.pte_fragment_size;
-         break;
-      }
+      /* If the entry size is 3/4 of a power of two, we would waste space and not gain
+       * anything if we allocated only twice the power of two for the backing buffer:
+       *   2 * 3/4 = 1.5 usable with buffer size 2
+       *
+       * Allocating 5 times the entry size leads us to the next power of two and results
+       * in a much better memory utilization:
+       *   5 * 3/4 = 3.75 usable with buffer size 4
+       */
+      if (entry_size * 5 > slab_size)
+         slab_size = util_next_power_of_two(entry_size * 5);
    }
-   assert(slab_size != 0);
+
+   /* Every slab should be at least as large as the PTE fragment
+    * size to get faster address translation.
+    */
+   slab_size = MAX2(slab_size, ws->info.pte_fragment_size);
 
    slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws,
                                                     slab_size, slab_size,
@@ -727,6 +701,7 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
    if (!slab->buffer)
       goto fail;
 
+   /* We can get a buffer from pb_cache that is slightly larger. */
    slab_size = slab->buffer->base.size;
 
    slab->base.num_entries = slab_size / entry_size;
@@ -751,13 +726,9 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s
       bo->b.va = slab->buffer->va + i * entry_size;
       bo->b.unique_id = base_id + i;
 
-      if (is_real_bo(slab->buffer)) {
-         /* The slab is not suballocated. */
-         bo->real = get_real_bo(slab->buffer);
-      } else {
-         /* The slab is allocated out of a bigger slab. */
-         bo->real = get_slab_entry_bo(slab->buffer)->real;
-      }
+      /* The slab is not suballocated. */
+      assert(is_real_bo(slab->buffer));
+      bo->real = get_real_bo(slab->buffer);
 
       bo->entry.slab = &slab->base;
       list_addtail(&bo->entry.head, &slab->base.free);
@@ -1358,8 +1329,7 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
       return amdgpu_bo_sparse_create(ws, size, domain, flags);
    }
 
-   struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1];
-   unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1);
+   unsigned max_slab_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1);
    int heap = radeon_get_heap_index(domain, flags);
 
    /* Sub-allocate small buffers from slabs. */
@@ -1387,13 +1357,12 @@ amdgpu_bo_create(struct amdgpu_winsys *ws,
          }
       }
 
-      struct pb_slabs *slabs = get_slabs(ws, alloc_size);
-      entry = pb_slab_alloc(slabs, alloc_size, heap);
+      entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
       if (!entry) {
          /* Clean up buffer managers and try again. */
          amdgpu_clean_up_buffer_managers(ws);
 
-         entry = pb_slab_alloc(slabs, alloc_size, heap);
+         entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap);
       }
       if (!entry)
          return NULL;
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
index abf4c464071..d54f229c363 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c
@@ -76,10 +76,8 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws)
       util_queue_destroy(&ws->cs_queue);
 
    simple_mtx_destroy(&ws->bo_fence_lock);
-   for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-      if (ws->bo_slabs[i].groups)
-         pb_slabs_deinit(&ws->bo_slabs[i]);
-   }
+   if (ws->bo_slabs.groups)
+      pb_slabs_deinit(&ws->bo_slabs);
    pb_cache_deinit(&ws->bo_cache);
    _mesa_hash_table_destroy(ws->bo_export_table, NULL);
    simple_mtx_destroy(&ws->sws_list_lock);
@@ -454,35 +452,22 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config,
                      * is a struct pointer instead of void*. */
                     (void*)amdgpu_bo_destroy, (void*)amdgpu_bo_can_reclaim);
 
-      unsigned min_slab_order = 8;  /* 256 bytes */
-      unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */
-      unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) /
-                                               NUM_SLAB_ALLOCATORS;
-
-      /* Divide the size order range among slab managers. */
-      for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) {
-         unsigned min_order = min_slab_order;
-         unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator,
-                                   max_slab_order);
-
-         if (!pb_slabs_init(&aws->bo_slabs[i],
-                            min_order, max_order,
-                            RADEON_NUM_HEAPS, true,
-                            aws,
-                            amdgpu_bo_can_reclaim_slab,
-                            amdgpu_bo_slab_alloc,
-                            /* Cast to void* because one of the function parameters
-                             * is a struct pointer instead of void*. */
-                            (void*)amdgpu_bo_slab_free)) {
-            amdgpu_winsys_destroy(&ws->base);
-            simple_mtx_unlock(&dev_tab_mutex);
-            return NULL;
-         }
-
-         min_slab_order = max_order + 1;
+      if (!pb_slabs_init(&aws->bo_slabs,
+                         8,  /* min slab entry size: 256 bytes */
+                         20, /* max slab entry size: 1 MB (slab size = 2 MB) */
+                         RADEON_NUM_HEAPS, true,
+                         aws,
+                         amdgpu_bo_can_reclaim_slab,
+                         amdgpu_bo_slab_alloc,
+                         /* Cast to void* because one of the function parameters
+                          * is a struct pointer instead of void*. */
+                         (void*)amdgpu_bo_slab_free)) {
+         amdgpu_winsys_destroy(&ws->base);
+         simple_mtx_unlock(&dev_tab_mutex);
+         return NULL;
       }
 
-      aws->info.min_alloc_size = 1 << aws->bo_slabs[0].min_order;
+      aws->info.min_alloc_size = 1 << aws->bo_slabs.min_order;
 
       /* init reference */
       pipe_reference_init(&aws->reference, 1);
diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
index 62b583a5d55..4c552461bca 100644
--- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
+++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h
@@ -17,8 +17,6 @@
 
 struct amdgpu_cs;
 
-#define NUM_SLAB_ALLOCATORS 3
-
 /* DRM file descriptors, file descriptions and buffer sharing.
  *
  * amdgpu_device_initialize first argument is a file descriptor (fd)
@@ -70,11 +68,7 @@ struct amdgpu_winsys {
    int fd;
 
    struct pb_cache bo_cache;
-
-   /* Each slab buffer can only contain suballocations of equal sizes, so we
-    * need to layer the allocators, so that we don't waste too much memory.
-    */
-   struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS];
+   struct pb_slabs bo_slabs;  /* Slab allocator. */
 
    amdgpu_device_handle dev;