From cf2dc2d512fc8e621b99e6c53326f0957342ea70 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Marek=20Ol=C5=A1=C3=A1k?= Date: Fri, 8 Dec 2023 20:10:11 -0500 Subject: [PATCH] winsys/amdgpu: don't layer slabs, use only 1 level of slabs, it improves perf This increases FPS in VP2020/Catia1 by 10-18%!!!!!!!!!!!!!!!!!!!!!!! I have no rational explanation for this. In the most extreme case, 8192 256B slab BOs (smallest size) are now allocated from a single 2MB slab. Reviewed-by: Pierre-Eric Pelloux-Prayer Part-of: --- src/gallium/winsys/amdgpu/drm/amdgpu_bo.c | 95 +++++++------------ src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c | 47 ++++----- src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h | 8 +- 3 files changed, 49 insertions(+), 101 deletions(-) diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c index 86d57cf2eaa..a999d7e2882 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_bo.c @@ -229,9 +229,7 @@ static void amdgpu_bo_destroy_or_cache(struct radeon_winsys *rws, struct pb_buff static void amdgpu_clean_up_buffer_managers(struct amdgpu_winsys *ws) { - for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) - pb_slabs_reclaim(&ws->bo_slabs[i]); - + pb_slabs_reclaim(&ws->bo_slabs); pb_cache_release_all_buffers(&ws->bo_cache); } @@ -615,25 +613,11 @@ bool amdgpu_bo_can_reclaim_slab(void *priv, struct pb_slab_entry *entry) return amdgpu_bo_can_reclaim(priv, &bo->b.base); } -static struct pb_slabs *get_slabs(struct amdgpu_winsys *ws, uint64_t size) -{ - /* Find the correct slab allocator for the given size. */ - for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { - struct pb_slabs *slabs = &ws->bo_slabs[i]; - - if (size <= 1 << (slabs->min_order + slabs->num_orders - 1)) - return slabs; - } - - assert(0); - return NULL; -} - static unsigned get_slab_wasted_size(struct amdgpu_winsys *ws, struct amdgpu_bo_slab_entry *bo) { assert(bo->b.base.size <= bo->entry.slab->entry_size); assert(bo->b.base.size < (1 << bo->b.base.alignment_log2) || - bo->b.base.size < 1 << ws->bo_slabs[0].min_order || + bo->b.base.size < 1 << ws->bo_slabs.min_order || bo->b.base.size > bo->entry.slab->entry_size / 2); return bo->entry.slab->entry_size - bo->b.base.size; } @@ -642,23 +626,20 @@ static void amdgpu_bo_slab_destroy(struct radeon_winsys *rws, struct pb_buffer * { struct amdgpu_winsys *ws = amdgpu_winsys(rws); struct amdgpu_bo_slab_entry *bo = get_slab_entry_bo(amdgpu_winsys_bo(_buf)); - struct pb_slabs *slabs; - - slabs = get_slabs(ws, bo->b.base.size); if (bo->b.base.placement & RADEON_DOMAIN_VRAM) ws->slab_wasted_vram -= get_slab_wasted_size(ws, bo); else ws->slab_wasted_gtt -= get_slab_wasted_size(ws, bo); - pb_slab_free(slabs, &bo->entry); + pb_slab_free(&ws->bo_slabs, &bo->entry); } /* Return the power of two size of a slab entry matching the input size. */ static unsigned get_slab_pot_entry_size(struct amdgpu_winsys *ws, unsigned size) { unsigned entry_size = util_next_power_of_two(size); - unsigned min_entry_size = 1 << ws->bo_slabs[0].min_order; + unsigned min_entry_size = 1 << ws->bo_slabs.min_order; return MAX2(entry_size, min_entry_size); } @@ -682,44 +663,37 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s enum radeon_bo_domain domains = radeon_domain_from_heap(heap); enum radeon_bo_flag flags = radeon_flags_from_heap(heap); uint32_t base_id; - unsigned slab_size = 0; if (!slab) return NULL; /* Determine the slab buffer size. */ - for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { - unsigned max_entry_size = 1 << (ws->bo_slabs[i].min_order + ws->bo_slabs[i].num_orders - 1); + unsigned max_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1); - if (entry_size <= max_entry_size) { - /* The slab size is twice the size of the largest possible entry. */ - slab_size = max_entry_size * 2; + assert(entry_size <= max_entry_size); - if (!util_is_power_of_two_nonzero(entry_size)) { - assert(util_is_power_of_two_nonzero(entry_size * 4 / 3)); + /* The slab size is twice the size of the largest possible entry. */ + unsigned slab_size = max_entry_size * 2; - /* If the entry size is 3/4 of a power of two, we would waste space and not gain - * anything if we allocated only twice the power of two for the backing buffer: - * 2 * 3/4 = 1.5 usable with buffer size 2 - * - * Allocating 5 times the entry size leads us to the next power of two and results - * in a much better memory utilization: - * 5 * 3/4 = 3.75 usable with buffer size 4 - */ - if (entry_size * 5 > slab_size) - slab_size = util_next_power_of_two(entry_size * 5); - } + if (!util_is_power_of_two_nonzero(entry_size)) { + assert(util_is_power_of_two_nonzero(entry_size * 4 / 3)); - /* The largest slab should have the same size as the PTE fragment - * size to get faster address translation. - */ - if (i == NUM_SLAB_ALLOCATORS - 1 && - slab_size < ws->info.pte_fragment_size) - slab_size = ws->info.pte_fragment_size; - break; - } + /* If the entry size is 3/4 of a power of two, we would waste space and not gain + * anything if we allocated only twice the power of two for the backing buffer: + * 2 * 3/4 = 1.5 usable with buffer size 2 + * + * Allocating 5 times the entry size leads us to the next power of two and results + * in a much better memory utilization: + * 5 * 3/4 = 3.75 usable with buffer size 4 + */ + if (entry_size * 5 > slab_size) + slab_size = util_next_power_of_two(entry_size * 5); } - assert(slab_size != 0); + + /* The largest slab should have the same size as the PTE fragment + * size to get faster address translation. + */ + slab_size = MAX2(slab_size, ws->info.pte_fragment_size); slab->buffer = amdgpu_winsys_bo(amdgpu_bo_create(ws, slab_size, slab_size, @@ -727,6 +701,7 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s if (!slab->buffer) goto fail; + /* We can get a buffer from pb_cache that is slightly larger. */ slab_size = slab->buffer->base.size; slab->base.num_entries = slab_size / entry_size; @@ -751,13 +726,9 @@ struct pb_slab *amdgpu_bo_slab_alloc(void *priv, unsigned heap, unsigned entry_s bo->b.va = slab->buffer->va + i * entry_size; bo->b.unique_id = base_id + i; - if (is_real_bo(slab->buffer)) { - /* The slab is not suballocated. */ - bo->real = get_real_bo(slab->buffer); - } else { - /* The slab is allocated out of a bigger slab. */ - bo->real = get_slab_entry_bo(slab->buffer)->real; - } + /* The slab is not suballocated. */ + assert(is_real_bo(slab->buffer)); + bo->real = get_real_bo(slab->buffer); bo->entry.slab = &slab->base; list_addtail(&bo->entry.head, &slab->base.free); @@ -1358,8 +1329,7 @@ amdgpu_bo_create(struct amdgpu_winsys *ws, return amdgpu_bo_sparse_create(ws, size, domain, flags); } - struct pb_slabs *last_slab = &ws->bo_slabs[NUM_SLAB_ALLOCATORS - 1]; - unsigned max_slab_entry_size = 1 << (last_slab->min_order + last_slab->num_orders - 1); + unsigned max_slab_entry_size = 1 << (ws->bo_slabs.min_order + ws->bo_slabs.num_orders - 1); int heap = radeon_get_heap_index(domain, flags); /* Sub-allocate small buffers from slabs. */ @@ -1387,13 +1357,12 @@ amdgpu_bo_create(struct amdgpu_winsys *ws, } } - struct pb_slabs *slabs = get_slabs(ws, alloc_size); - entry = pb_slab_alloc(slabs, alloc_size, heap); + entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap); if (!entry) { /* Clean up buffer managers and try again. */ amdgpu_clean_up_buffer_managers(ws); - entry = pb_slab_alloc(slabs, alloc_size, heap); + entry = pb_slab_alloc(&ws->bo_slabs, alloc_size, heap); } if (!entry) return NULL; diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c index abf4c464071..d54f229c363 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.c @@ -76,10 +76,8 @@ static void do_winsys_deinit(struct amdgpu_winsys *ws) util_queue_destroy(&ws->cs_queue); simple_mtx_destroy(&ws->bo_fence_lock); - for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { - if (ws->bo_slabs[i].groups) - pb_slabs_deinit(&ws->bo_slabs[i]); - } + if (ws->bo_slabs.groups) + pb_slabs_deinit(&ws->bo_slabs); pb_cache_deinit(&ws->bo_cache); _mesa_hash_table_destroy(ws->bo_export_table, NULL); simple_mtx_destroy(&ws->sws_list_lock); @@ -454,35 +452,22 @@ amdgpu_winsys_create(int fd, const struct pipe_screen_config *config, * is a struct pointer instead of void*. */ (void*)amdgpu_bo_destroy, (void*)amdgpu_bo_can_reclaim); - unsigned min_slab_order = 8; /* 256 bytes */ - unsigned max_slab_order = 20; /* 1 MB (slab size = 2 MB) */ - unsigned num_slab_orders_per_allocator = (max_slab_order - min_slab_order) / - NUM_SLAB_ALLOCATORS; - - /* Divide the size order range among slab managers. */ - for (unsigned i = 0; i < NUM_SLAB_ALLOCATORS; i++) { - unsigned min_order = min_slab_order; - unsigned max_order = MIN2(min_order + num_slab_orders_per_allocator, - max_slab_order); - - if (!pb_slabs_init(&aws->bo_slabs[i], - min_order, max_order, - RADEON_NUM_HEAPS, true, - aws, - amdgpu_bo_can_reclaim_slab, - amdgpu_bo_slab_alloc, - /* Cast to void* because one of the function parameters - * is a struct pointer instead of void*. */ - (void*)amdgpu_bo_slab_free)) { - amdgpu_winsys_destroy(&ws->base); - simple_mtx_unlock(&dev_tab_mutex); - return NULL; - } - - min_slab_order = max_order + 1; + if (!pb_slabs_init(&aws->bo_slabs, + 8, /* min slab entry size: 256 bytes */ + 20, /* max slab entry size: 1 MB (slab size = 2 MB) */ + RADEON_NUM_HEAPS, true, + aws, + amdgpu_bo_can_reclaim_slab, + amdgpu_bo_slab_alloc, + /* Cast to void* because one of the function parameters + * is a struct pointer instead of void*. */ + (void*)amdgpu_bo_slab_free)) { + amdgpu_winsys_destroy(&ws->base); + simple_mtx_unlock(&dev_tab_mutex); + return NULL; } - aws->info.min_alloc_size = 1 << aws->bo_slabs[0].min_order; + aws->info.min_alloc_size = 1 << aws->bo_slabs.min_order; /* init reference */ pipe_reference_init(&aws->reference, 1); diff --git a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h index 62b583a5d55..4c552461bca 100644 --- a/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h +++ b/src/gallium/winsys/amdgpu/drm/amdgpu_winsys.h @@ -17,8 +17,6 @@ struct amdgpu_cs; -#define NUM_SLAB_ALLOCATORS 3 - /* DRM file descriptors, file descriptions and buffer sharing. * * amdgpu_device_initialize first argument is a file descriptor (fd) @@ -70,11 +68,7 @@ struct amdgpu_winsys { int fd; struct pb_cache bo_cache; - - /* Each slab buffer can only contain suballocations of equal sizes, so we - * need to layer the allocators, so that we don't waste too much memory. - */ - struct pb_slabs bo_slabs[NUM_SLAB_ALLOCATORS]; + struct pb_slabs bo_slabs; /* Slab allocator. */ amdgpu_device_handle dev;