turnip: Add TU_DEBUG=bos to print stats of BOs live at submit time.

I keep needing to hack this in to debug BO leaks, so let's just add it as
an option to use in general.

Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/18679>
This commit is contained in:
Emma Anholt 2022-02-04 20:02:33 -08:00 committed by Marge Bot
parent c25662988f
commit e5cd28cb37
13 changed files with 210 additions and 34 deletions

View file

@ -107,7 +107,7 @@ create_submission_data(struct tu_device *dev, struct tu_autotune *at,
list_del(&submission_data->node);
} else {
submission_data = calloc(1, sizeof(struct tu_submission_data));
tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5);
tu_cs_init(&submission_data->fence_cs, dev, TU_CS_MODE_GROW, 5, "autotune fence cs");
}
submission_data->fence = fence;

View file

@ -1610,13 +1610,13 @@ tu_create_cmd_buffer(struct vk_command_pool *pool,
u_trace_init(&cmd_buffer->trace, &device->trace_context);
list_inithead(&cmd_buffer->renderpass_autotune_results);
tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048);
tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048);
tu_cs_init(&cmd_buffer->pre_chain.draw_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->pre_chain.draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096);
tu_cs_init(&cmd_buffer->cs, device, TU_CS_MODE_GROW, 4096, "cmd cs");
tu_cs_init(&cmd_buffer->draw_cs, device, TU_CS_MODE_GROW, 4096, "draw cs");
tu_cs_init(&cmd_buffer->tile_store_cs, device, TU_CS_MODE_GROW, 2048, "tile store cs");
tu_cs_init(&cmd_buffer->draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "draw epilogue cs");
tu_cs_init(&cmd_buffer->sub_cs, device, TU_CS_MODE_SUB_STREAM, 2048, "draw sub cs");
tu_cs_init(&cmd_buffer->pre_chain.draw_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw cs");
tu_cs_init(&cmd_buffer->pre_chain.draw_epilogue_cs, device, TU_CS_MODE_GROW, 4096, "prechain draw epiligoue cs");
*cmd_buffer_out = &cmd_buffer->vk;

View file

@ -14,7 +14,7 @@ void
tu_cs_init(struct tu_cs *cs,
struct tu_device *device,
enum tu_cs_mode mode,
uint32_t initial_size)
uint32_t initial_size, const char *name)
{
assert(mode != TU_CS_MODE_EXTERNAL);
@ -23,6 +23,7 @@ tu_cs_init(struct tu_cs *cs,
cs->device = device;
cs->mode = mode;
cs->next_bo_size = initial_size;
cs->name = name;
}
/**
@ -128,7 +129,7 @@ tu_cs_add_bo(struct tu_cs *cs, uint32_t size)
VkResult result =
tu_bo_init_new(cs->device, &new_bo, size * sizeof(uint32_t),
TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP);
TU_BO_ALLOC_GPU_READ_ONLY | TU_BO_ALLOC_ALLOW_DUMP, cs->name);
if (result != VK_SUCCESS) {
return result;
}

View file

@ -77,6 +77,7 @@ struct tu_cs
uint32_t *cur;
uint32_t *reserved_end;
uint32_t *end;
const char *name;
struct tu_device *device;
enum tu_cs_mode mode;
@ -111,7 +112,7 @@ void
tu_cs_init(struct tu_cs *cs,
struct tu_device *device,
enum tu_cs_mode mode,
uint32_t initial_size);
uint32_t initial_size, const char *name);
void
tu_cs_init_external(struct tu_cs *cs, struct tu_device *device,

View file

@ -739,7 +739,7 @@ tu_CreateDescriptorPool(VkDevice _device,
if (bo_size) {
if (!(pCreateInfo->flags & VK_DESCRIPTOR_POOL_CREATE_HOST_ONLY_BIT_VALVE)) {
ret = tu_bo_init_new(device, &pool->bo, bo_size, TU_BO_ALLOC_ALLOW_DUMP);
ret = tu_bo_init_new(device, &pool->bo, bo_size, TU_BO_ALLOC_ALLOW_DUMP, "descriptor pool");
if (ret)
goto fail_alloc;

View file

@ -390,6 +390,7 @@ static const struct debug_control tu_debug_options[] = {
{ "unaligned_store", TU_DEBUG_UNALIGNED_STORE },
{ "log_skip_gmem_ops", TU_DEBUG_LOG_SKIP_GMEM_OPS },
{ "dynamic", TU_DEBUG_DYNAMIC },
{ "bos", TU_DEBUG_BOS },
{ NULL, 0 }
};
@ -1615,7 +1616,7 @@ tu_trace_create_ts_buffer(struct u_trace_context *utctx, uint32_t size)
container_of(utctx, struct tu_device, trace_context);
struct tu_bo *bo;
tu_bo_init_new(device, &bo, size, false);
tu_bo_init_new(device, &bo, size, false, "trace");
return bo;
}
@ -1740,7 +1741,7 @@ tu_create_copy_timestamp_cs(struct tu_cmd_buffer *cmdbuf, struct tu_cs** cs,
}
tu_cs_init(*cs, cmdbuf->device, TU_CS_MODE_GROW,
list_length(&cmdbuf->trace.trace_chunks) * 6 + 3);
list_length(&cmdbuf->trace.trace_chunks) * 6 + 3, "trace copy timestamp cs");
tu_cs_begin(*cs);
@ -1932,6 +1933,9 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
u_rwlock_init(&device->dma_bo_lock);
pthread_mutex_init(&device->submit_mutex, NULL);
if (device->instance->debug_flags & TU_DEBUG_BOS)
device->bo_sizes = _mesa_hash_table_create(NULL, _mesa_hash_string, _mesa_key_string_equal);
#ifndef TU_USE_KGSL
vk_device_set_drm_fd(&device->vk, device->fd);
#endif
@ -1996,7 +2000,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
128 * 1024, 0);
result = tu_bo_init_new(device, &device->global_bo, global_size,
TU_BO_ALLOC_ALLOW_DUMP);
TU_BO_ALLOC_ALLOW_DUMP, "global");
if (result != VK_SUCCESS) {
vk_startup_errorf(device->instance, result, "BO init");
goto fail_global_bo;
@ -2062,7 +2066,7 @@ tu_CreateDevice(VkPhysicalDevice physicalDevice,
}
cs = device->perfcntrs_pass_cs;
tu_cs_init(cs, device, TU_CS_MODE_SUB_STREAM, 96);
tu_cs_init(cs, device, TU_CS_MODE_SUB_STREAM, 96, "perfcntrs cs");
for (unsigned i = 0; i < 32; i++) {
struct tu_cs sub_cs;
@ -2223,6 +2227,7 @@ tu_DestroyDevice(VkDevice _device, const VkAllocationCallbacks *pAllocator)
u_rwlock_destroy(&device->dma_bo_lock);
pthread_cond_destroy(&device->timeline_cond);
_mesa_hash_table_destroy(device->bo_sizes, NULL);
vk_free(&device->vk.alloc, device->bo_list);
vk_device_finish(&device->vk);
vk_free(&device->vk.alloc, device);
@ -2260,7 +2265,7 @@ tu_get_scratch_bo(struct tu_device *dev, uint64_t size, struct tu_bo **bo)
unsigned bo_size = 1ull << size_log2;
VkResult result = tu_bo_init_new(dev, &dev->scratch_bos[index].bo, bo_size,
TU_BO_ALLOC_NO_FLAGS);
TU_BO_ALLOC_NO_FLAGS, "scratch");
if (result != VK_SUCCESS) {
mtx_unlock(&dev->scratch_bos[index].construct_mtx);
return result;
@ -2432,12 +2437,15 @@ tu_AllocateMemory(VkDevice _device,
alloc_flags |= TU_BO_ALLOC_REPLAYABLE;
}
result = tu_bo_init_new_explicit_iova(device, &mem->bo,
pAllocateInfo->allocationSize,
client_address, alloc_flags);
char name[64] = "vkAllocateMemory()";
if (device->bo_sizes)
snprintf(name, ARRAY_SIZE(name), "vkAllocateMemory(%ldkb)",
(long)DIV_ROUND_UP(pAllocateInfo->allocationSize, 1024));
result = tu_bo_init_new_explicit_iova(
device, &mem->bo, pAllocateInfo->allocationSize, client_address,
alloc_flags, name);
}
if (result == VK_SUCCESS) {
mem_heap_used = p_atomic_add_return(&mem_heap->used, mem->bo->size);
if (mem_heap_used > mem_heap->size) {
@ -2654,7 +2662,7 @@ tu_CreateEvent(VkDevice _device,
return vk_error(device, VK_ERROR_OUT_OF_HOST_MEMORY);
VkResult result = tu_bo_init_new(device, &event->bo, 0x1000,
TU_BO_ALLOC_NO_FLAGS);
TU_BO_ALLOC_NO_FLAGS, "event");
if (result != VK_SUCCESS)
goto fail_alloc;
@ -3120,3 +3128,106 @@ uint64_t tu_GetDeviceMemoryOpaqueCaptureAddress(
TU_FROM_HANDLE(tu_device_memory, mem, pInfo->memory);
return mem->bo->iova;
}
/* Per-name accounting record for TU_DEBUG=bos.  One of these is stored as
 * the data of each dev->bo_sizes hash table entry; it is created by
 * tu_debug_bos_add() and freed by tu_debug_bos_del() once count drops to 0.
 */
struct tu_debug_bos_entry {
/* Number of BOs currently accounted under this name. */
uint32_t count;
/* Sum of the BO sizes accounted under this name, each rounded up to 4096. */
uint64_t size;
/* strdup()ed copy of the name; also used as the hash table key. */
const char *name;
};
/**
 * Accounts a newly created BO of \p size bytes under \p name in
 * dev->bo_sizes.
 *
 * The first time a name is seen, a tracking entry is allocated and keyed by a
 * strdup()ed copy of the name; later BOs with the same name only bump the
 * entry's counters.  The size is rounded up to 4096 before being added.
 *
 * \return the table-owned copy of the name, or NULL when tracking is disabled
 *         (dev->bo_sizes is NULL) or the bookkeeping allocations failed.
 *         tu_debug_bos_del() ignores BOs whose name is NULL, so a BO that
 *         could not be tracked here is simply skipped there.
 */
const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name)
{
   assert(name);

   if (likely(!dev->bo_sizes))
      return NULL;

   mtx_lock(&dev->bo_mutex);

   struct tu_debug_bos_entry *debug_bos;
   struct hash_entry *entry = _mesa_hash_table_search(dev->bo_sizes, name);

   if (entry) {
      debug_bos = entry->data;
   } else {
      debug_bos = calloc(1, sizeof(*debug_bos));
      char *name_copy = strdup(name);

      /* This is debug-only bookkeeping: if any of the allocations fail, drop
       * the tracking for this BO instead of dereferencing NULL.
       */
      if (!debug_bos || !name_copy ||
          !_mesa_hash_table_insert(dev->bo_sizes, name_copy, debug_bos)) {
         free(name_copy);
         free(debug_bos);
         mtx_unlock(&dev->bo_mutex);
         return NULL;
      }

      debug_bos->name = name_copy;
   }

   debug_bos->count++;
   debug_bos->size += align(size, 4096);

   /* Read the name while the entry is still protected by the lock. */
   const char *tracked_name = debug_bos->name;

   mtx_unlock(&dev->bo_mutex);

   return tracked_name;
}
/**
 * Removes \p bo from the per-name accounting kept in dev->bo_sizes.
 *
 * Does nothing when tracking is disabled (dev->bo_sizes is NULL) or when the
 * BO carries no name.  When the last BO of a name goes away, the hash table
 * entry, its name string and the accounting record are all freed.
 */
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo)
{
   if (likely(!dev->bo_sizes))
      return;
   if (!bo->name)
      return;

   mtx_lock(&dev->bo_mutex);

   struct hash_entry *found = _mesa_hash_table_search(dev->bo_sizes, bo->name);

   /* A BO that is being finished must have been accounted when it was
    * created.
    */
   assert(found);

   struct tu_debug_bos_entry *stats = found->data;

   stats->size -= align(bo->size, 4096);
   stats->count--;

   if (stats->count == 0) {
      /* Last BO with this name: drop the whole record. */
      _mesa_hash_table_remove(dev->bo_sizes, found);
      free((void *) stats->name);
      free(stats);
   }

   mtx_unlock(&dev->bo_mutex);
}
static int debug_bos_count_compare(const void *in_a, const void *in_b)
{
struct tu_debug_bos_entry *a = *(struct tu_debug_bos_entry **)in_a;
struct tu_debug_bos_entry *b = *(struct tu_debug_bos_entry **)in_b;
return a->count - b->count;
}
void
tu_debug_bos_print_stats(struct tu_device *dev)
{
if (likely(!dev->bo_sizes))
return;
mtx_lock(&dev->bo_mutex);
/* Put the HT's sizes data in an array so we can sort by number of allocations. */
struct util_dynarray dyn;
util_dynarray_init(&dyn, NULL);
uint32_t size = 0;
uint32_t count = 0;
hash_table_foreach(dev->bo_sizes, entry)
{
struct tu_debug_bos_entry *debug_bos = (void *) entry->data;
util_dynarray_append(&dyn, struct tu_debug_bos_entry *, debug_bos);
size += debug_bos->size / 1024;
count += debug_bos->count;
}
qsort(dyn.data,
util_dynarray_num_elements(&dyn, struct tu_debug_bos_entry *),
sizeof(struct tu_debug_bos_entryos_entry *), debug_bos_count_compare);
util_dynarray_foreach(&dyn, struct tu_debug_bos_entry *, entryp)
{
struct tu_debug_bos_entry *debug_bos = *entryp;
mesa_logi("%30s: %4d bos, %lld kb\n", debug_bos->name, debug_bos->count,
(long long) (debug_bos->size / 1024));
}
mesa_logi("submitted %d bos (%d MB)\n", count, DIV_ROUND_UP(size, 1024));
util_dynarray_fini(&dyn);
mtx_unlock(&dev->bo_mutex);
}

View file

@ -57,6 +57,7 @@ enum tu_debug_flags
TU_DEBUG_PERF = 1 << 18,
TU_DEBUG_NOLRZFC = 1 << 19,
TU_DEBUG_DYNAMIC = 1 << 20,
TU_DEBUG_BOS = 1 << 21,
};
enum global_shader {
@ -285,6 +286,9 @@ struct tu_device
/* protects imported BOs creation/freeing */
struct u_rwlock dma_bo_lock;
/* Tracking of name -> size allocated for TU_DEBUG_BOS */
struct hash_table *bo_sizes;
/* This array holds all our 'struct tu_bo' allocations. We use this
* so we can add a refcount to our BOs and check if a particular BO
* was already allocated in this device using its GEM handle. This is
@ -496,4 +500,11 @@ tu_u_trace_submission_data_finish(
struct tu_device *device,
struct tu_u_trace_submission_data *submission_data);
const char *
tu_debug_bos_add(struct tu_device *dev, uint64_t size, const char *name);
void
tu_debug_bos_del(struct tu_device *dev, struct tu_bo *bo);
void
tu_debug_bos_print_stats(struct tu_device *dev);
#endif /* TU_DEVICE_H */

View file

@ -23,6 +23,7 @@
#include "drm-uapi/msm_drm.h"
#include "util/debug.h"
#include "util/hash_table.h"
#include "util/timespec.h"
#include "util/os_time.h"
@ -224,6 +225,7 @@ tu_gem_info(const struct tu_device *dev, uint32_t gem_handle, uint32_t info)
return req.value;
}
static VkResult
tu_allocate_userspace_iova(struct tu_device *dev,
uint32_t gem_handle,
@ -295,7 +297,8 @@ tu_bo_init(struct tu_device *dev,
uint32_t gem_handle,
uint64_t size,
uint64_t client_iova,
enum tu_bo_alloc_flags flags)
enum tu_bo_alloc_flags flags,
const char *name)
{
VkResult result = VK_SUCCESS;
uint64_t iova = 0;
@ -312,6 +315,8 @@ tu_bo_init(struct tu_device *dev,
if (result != VK_SUCCESS)
goto fail_bo_list;
name = tu_debug_bos_add(dev, size, name);
mtx_lock(&dev->bo_mutex);
uint32_t idx = dev->bo_count++;
@ -344,6 +349,7 @@ tu_bo_init(struct tu_device *dev,
.iova = iova,
.refcnt = 1,
.bo_list_idx = idx,
.name = name,
};
mtx_unlock(&dev->bo_mutex);
@ -355,12 +361,44 @@ fail_bo_list:
return result;
}
/**
 * Tells the kernel the name of a BO, so that the contents of
 * /debug/dri/0/gem are more useful.
 *
 * On DEBUG builds the name is always set; on other builds it is only set
 * when BO debugging is active (dev->bo_sizes is non-NULL), to reduce
 * overhead.  A failing ioctl is only reported with a one-time warning.
 */
static void
tu_bo_set_kernel_name(struct tu_device *dev, struct tu_bo *bo, const char *name)
{
#ifndef DEBUG
   /* Non-debug build: only name BOs while they are being tracked. */
   if (dev->bo_sizes == NULL)
      return;
#endif

   struct drm_msm_gem_info req = {
      .handle = bo->gem_handle,
      .info = MSM_INFO_SET_NAME,
      .value = (uintptr_t)(void *)name,
      .len = strlen(name),
   };

   const int ret = drmCommandWrite(dev->fd, DRM_MSM_GEM_INFO, &req, sizeof(req));
   if (ret == 0)
      return;

   mesa_logw_once("Failed to set BO name with DRM_MSM_GEM_INFO: %d",
                  ret);
}
VkResult
tu_bo_init_new_explicit_iova(struct tu_device *dev,
struct tu_bo **out_bo,
uint64_t size,
uint64_t client_iova,
enum tu_bo_alloc_flags flags)
enum tu_bo_alloc_flags flags,
const char *name)
{
/* TODO: Choose better flags. As of 2018-11-12, freedreno/drm/msm_bo.c
* always sets `flags = MSM_BO_WC`, and we copy that behavior here.
@ -382,13 +420,16 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
assert(bo && bo->gem_handle == 0);
VkResult result =
tu_bo_init(dev, bo, req.handle, size, client_iova, flags);
tu_bo_init(dev, bo, req.handle, size, client_iova, flags, name);
if (result != VK_SUCCESS)
memset(bo, 0, sizeof(*bo));
else
*out_bo = bo;
/* We don't use bo->name here because in the !TU_DEBUG=bos case bo->name is NULL. */
tu_bo_set_kernel_name(dev, bo, name);
return result;
}
@ -431,7 +472,7 @@ tu_bo_init_dmabuf(struct tu_device *dev,
}
VkResult result =
tu_bo_init(dev, bo, gem_handle, size, 0, TU_BO_ALLOC_NO_FLAGS);
tu_bo_init(dev, bo, gem_handle, size, 0, TU_BO_ALLOC_NO_FLAGS, "dmabuf");
if (result != VK_SUCCESS)
memset(bo, 0, sizeof(*bo));
@ -488,6 +529,8 @@ tu_bo_finish(struct tu_device *dev, struct tu_bo *bo)
if (bo->map)
munmap(bo->map, bo->size);
tu_debug_bos_del(dev, bo);
mtx_lock(&dev->bo_mutex);
dev->bo_count--;
dev->bo_list[bo->bo_list_idx] = dev->bo_list[dev->bo_count];
@ -1142,6 +1185,8 @@ tu_queue_submit_locked(struct tu_queue *queue, struct tu_queue_submit *submit)
mtx_unlock(&queue->device->bo_mutex);
tu_debug_bos_print_stats(queue->device);
if (ret)
return vk_device_set_lost(&queue->device->vk, "submit failed: %m");

View file

@ -51,6 +51,7 @@ struct tu_bo
uint64_t size;
uint64_t iova;
void *map;
const char *name; /* pointer to device->bo_sizes's entry's name */
int32_t refcnt;
#ifndef TU_USE_KGSL
@ -72,13 +73,13 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
struct tu_bo **out_bo,
uint64_t size,
uint64_t client_iova,
enum tu_bo_alloc_flags flags);
enum tu_bo_alloc_flags flags, const char *name);
static inline VkResult
tu_bo_init_new(struct tu_device *dev, struct tu_bo **out_bo, uint64_t size,
enum tu_bo_alloc_flags flags)
enum tu_bo_alloc_flags flags, const char *name)
{
return tu_bo_init_new_explicit_iova(dev, out_bo, size, 0, flags);
return tu_bo_init_new_explicit_iova(dev, out_bo, size, 0, flags, name);
}
VkResult

View file

@ -73,7 +73,8 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
struct tu_bo **out_bo,
uint64_t size,
uint64_t client_iova,
enum tu_bo_alloc_flags flags)
enum tu_bo_alloc_flags flags,
const char *name)
{
assert(client_iova == 0);
@ -101,6 +102,7 @@ tu_bo_init_new_explicit_iova(struct tu_device *dev,
.size = req.mmapsize,
.iova = req.gpuaddr,
.refcnt = 1,
.name = tu_debug_bos_add(dev, req.mmapsize, name),
};
*out_bo = bo;
@ -149,6 +151,7 @@ tu_bo_init_dmabuf(struct tu_device *dev,
.size = info_req.size,
.iova = info_req.gpuaddr,
.refcnt = 1,
.name = tu_debug_bos_add(dev, info_req.size, "dmabuf"),
};
*out_bo = bo;
@ -542,6 +545,9 @@ tu_QueueSubmit2(VkQueue _queue,
}
}
}
tu_debug_bos_print_stats(queue->device);
fail:
vk_free(&queue->device->vk.alloc, cmds);

View file

@ -1706,7 +1706,7 @@ tu6_emit_geom_tess_consts(struct tu_cs *cs,
/* Create the shared tess factor BO the first time tess is used on the device. */
mtx_lock(&dev->mutex);
if (!dev->tess_bo)
tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS);
tu_bo_init_new(dev, &dev->tess_bo, TU_TESS_BO_SIZE, TU_BO_ALLOC_NO_FLAGS, "tess");
mtx_unlock(&dev->mutex);
uint64_t tess_factor_iova = dev->tess_bo->iova;
@ -2372,7 +2372,7 @@ tu_setup_pvtmem(struct tu_device *dev,
dev->physical_device->info->num_sp_cores * pvtmem_bo->per_sp_size;
VkResult result = tu_bo_init_new(dev, &pvtmem_bo->bo, total_size,
TU_BO_ALLOC_NO_FLAGS);
TU_BO_ALLOC_NO_FLAGS, "pvtmem");
if (result != VK_SUCCESS) {
mtx_unlock(&pvtmem_bo->mtx);
return result;

View file

@ -308,7 +308,7 @@ tu_CreateQueryPool(VkDevice _device,
}
VkResult result = tu_bo_init_new(device, &pool->bo,
pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS);
pCreateInfo->queryCount * slot_size, TU_BO_ALLOC_NO_FLAGS, "query pool");
if (result != VK_SUCCESS) {
vk_object_free(&device->vk, pAllocator, pool);
return result;

View file

@ -81,7 +81,7 @@ tu_suballoc_bo_alloc(struct tu_suballoc_bo *suballoc_bo,
if (!suballoc->bo) {
VkResult result = tu_bo_init_new(suballoc->dev, &suballoc->bo,
alloc_size,
suballoc->flags);
suballoc->flags, "suballoc");
if (result != VK_SUCCESS)
return result;
}