From 49225db140a29d0e79aa9c2aafe356694f8051cb Mon Sep 17 00:00:00 2001 From: Alyssa Rosenzweig Date: Sun, 5 Nov 2023 12:26:59 -0400 Subject: [PATCH] asahi: Implement timer queries Everything but the uapi piece. Signed-off-by: Alyssa Rosenzweig Part-of: --- src/asahi/lib/agx_device.c | 23 ++++++++++++ src/asahi/lib/agx_device.h | 17 +++++++++ src/gallium/drivers/asahi/agx_batch.c | 29 ++++++++++++++- src/gallium/drivers/asahi/agx_pipe.c | 19 +++++++++- src/gallium/drivers/asahi/agx_query.c | 53 ++++++++++++++++++++++++++- src/gallium/drivers/asahi/agx_state.c | 3 ++ src/gallium/drivers/asahi/agx_state.h | 19 +++++++++- 7 files changed, 158 insertions(+), 5 deletions(-) diff --git a/src/asahi/lib/agx_device.c b/src/asahi/lib/agx_device.c index 78538cd4e6a..c9110402306 100644 --- a/src/asahi/lib/agx_device.c +++ b/src/asahi/lib/agx_device.c @@ -6,6 +6,7 @@ #include "agx_device.h" #include +#include "util/timespec.h" #include "agx_bo.h" #include "agx_compile.h" #include "decode.h" @@ -19,6 +20,7 @@ #include "util/log.h" #include "util/os_file.h" #include "util/os_mman.h" +#include "util/os_time.h" #include "util/simple_mtx.h" #include "git_sha1.h" #include "nir_serialize.h" @@ -456,3 +458,24 @@ agx_debug_fault(struct agx_device *dev, uint64_t addr) pthread_mutex_unlock(&dev->bo_map_lock); } + +/* + * Sample the raw timer counter from the CPU: cntvct_el0 on AArch64, rdtsc on + * x86. The value is in timer ticks; use agx_gpu_time_to_ns() to convert. + */ +uint64_t +agx_get_gpu_timestamp(struct agx_device *dev) +{ +#if DETECT_ARCH_AARCH64 + uint64_t ret; + __asm__ volatile("mrs \t%0, cntvct_el0" : "=r"(ret)); + return ret; +#elif DETECT_ARCH_X86 || DETECT_ARCH_X86_64 + /* Maps to the above when run under FEX without thunking */ + uint32_t high, low; + __asm__ volatile("rdtsc" : "=a"(low), "=d"(high)); + return (uint64_t)low | ((uint64_t)high << 32); +#else + unreachable("Kernel support for fetching timestamps pending"); +#endif +} diff --git a/src/asahi/lib/agx_device.h b/src/asahi/lib/agx_device.h index 91d0e194a6c..dfc7c1c76c5 100644 --- a/src/asahi/lib/agx_device.h +++ b/src/asahi/lib/agx_device.h @@ -8,6 +8,7 @@ #include
"util/simple_mtx.h" #include "util/sparse_array.h" +#include "util/timespec.h" #include "util/vma.h" #include "agx_bo.h" #include "agx_formats.h" @@ -50,6 +51,7 @@ struct drm_asahi_params_global { uint32_t gpu_generation; uint32_t gpu_variant; uint32_t num_dies; + uint32_t timer_frequency_hz; }; /* How many power-of-two levels in the BO cache do we want? 2^14 minimum chosen @@ -145,4 +147,19 @@ int agx_export_sync_file(struct agx_device *dev, struct agx_bo *bo); void agx_debug_fault(struct agx_device *dev, uint64_t addr); +uint64_t agx_get_gpu_timestamp(struct agx_device *dev); + +static inline uint64_t +agx_gpu_time_to_ns(struct agx_device *dev, uint64_t gpu_time) +{ + /* Convert whole seconds and the sub-second remainder separately: the + * direct product of gpu_time and NSEC_PER_SEC wraps 64 bits once gpu_time + * exceeds roughly 1.8e10 ticks. + */ + uint64_t hz = dev->params.timer_frequency_hz; + uint64_t ns = (gpu_time / hz) * NSEC_PER_SEC; + ns += ((gpu_time % hz) * NSEC_PER_SEC) / hz; + return ns; +} + #endif diff --git a/src/gallium/drivers/asahi/agx_batch.c b/src/gallium/drivers/asahi/agx_batch.c index a1c965a249c..e6de8d9642d 100644 --- a/src/gallium/drivers/asahi/agx_batch.c +++ b/src/gallium/drivers/asahi/agx_batch.c @@ -6,6 +6,7 @@ #include #include "asahi/lib/decode.h" +#include "util/u_dynarray.h" #include "agx_state.h" #define foreach_active(ctx, idx) \ @@ -125,6 +126,7 @@ agx_batch_init(struct agx_context *ctx, util_dynarray_init(&batch->depth_bias, ctx); util_dynarray_init(&batch->occlusion_queries, ctx); util_dynarray_init(&batch->nonocclusion_queries, ctx); + util_dynarray_init(&batch->timestamp_queries, ctx); batch->clear = 0; batch->draw = 0; @@ -167,7 +169,9 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset) assert(ctx->batch != batch); - agx_finish_batch_queries(batch); + uint64_t begin_ts = ~0, end_ts = 0; + /* TODO: UAPI pending */ + agx_finish_batch_queries(batch, begin_ts, end_ts); batch->occlusion_buffer.cpu = NULL; batch->occlusion_buffer.gpu = 0; @@ -205,6 +209,7 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset) util_dynarray_fini(&batch->depth_bias); util_dynarray_fini(&batch->occlusion_queries);
util_dynarray_fini(&batch->nonocclusion_queries); + util_dynarray_fini(&batch->timestamp_queries); if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) { agx_batch_print_stats(dev, batch); @@ -742,3 +747,25 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch) agx_batch_cleanup(ctx, batch, true); } + +void +agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q) +{ + if (q) + util_dynarray_append(&batch->timestamp_queries, struct agx_query *, q); +} + +/* + * Timestamp queries record the time after all current work is finished, + * which we handle as the time after all current batches finish (since we're a + * tiler and would rather not split the batch). So add a query to all active + * batches. + */ +void +agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q) +{ + unsigned idx; + foreach_active(ctx, idx) { + agx_batch_add_timestamp_query(&ctx->batches.slots[idx], q); + } +} diff --git a/src/gallium/drivers/asahi/agx_pipe.c b/src/gallium/drivers/asahi/agx_pipe.c index dc9f9288c11..f88b9ad2955 100644 --- a/src/gallium/drivers/asahi/agx_pipe.c +++ b/src/gallium/drivers/asahi/agx_pipe.c @@ -27,6 +27,8 @@ #include "pipe/p_state.h" #include "util/format/u_format.h" #include "util/half_float.h" +#include "util/macros.h" +#include "util/timespec.h" #include "util/u_drm.h" #include "util/u_gen_mipmap.h" #include "util/u_inlines.h" @@ -1555,6 +1557,8 @@ agx_get_name(struct pipe_screen *pscreen) static int agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) { + struct agx_device *dev = agx_device(pscreen); + switch (param) { case PIPE_CAP_NPOT_TEXTURES: case PIPE_CAP_SHADER_STENCIL_EXPORT: @@ -1580,6 +1584,8 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) return 1; case PIPE_CAP_OCCLUSION_QUERY: + case PIPE_CAP_QUERY_TIMESTAMP: + case PIPE_CAP_QUERY_TIME_ELAPSED: case PIPE_CAP_GENERATE_MIPMAP: case PIPE_CAP_PRIMITIVE_RESTART: case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX: @@ -1587,6 +1593,10 
@@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param) case PIPE_CAP_NATIVE_FENCE_FD: return true; + case PIPE_CAP_TIMER_RESOLUTION: + /* Timer resolution is the length of a single tick in nanos */ + return agx_gpu_time_to_ns(dev, 1); + case PIPE_CAP_SAMPLER_VIEW_TARGET: case PIPE_CAP_TEXTURE_SWIZZLE: case PIPE_CAP_BLEND_EQUATION_SEPARATE: @@ -2146,6 +2156,13 @@ agx_screen_get_fd(struct pipe_screen *pscreen) return agx_device(pscreen)->fd; } +static uint64_t +agx_get_timestamp(struct pipe_screen *pscreen) +{ + struct agx_device *dev = agx_device(pscreen); + return agx_gpu_time_to_ns(dev, agx_get_gpu_timestamp(dev)); +} + struct pipe_screen * agx_screen_create(int fd, struct renderonly *ro, const struct pipe_screen_config *config) @@ -2197,7 +2214,7 @@ agx_screen_create(int fd, struct renderonly *ro, screen->resource_get_handle = agx_resource_get_handle; screen->resource_get_param = agx_resource_get_param; screen->resource_create_with_modifiers = agx_resource_create_with_modifiers; - screen->get_timestamp = u_default_get_timestamp; + screen->get_timestamp = agx_get_timestamp; screen->fence_reference = agx_fence_reference; screen->fence_finish = agx_fence_finish; screen->fence_get_fd = agx_fence_get_fd; diff --git a/src/gallium/drivers/asahi/agx_query.c b/src/gallium/drivers/asahi/agx_query.c index 3e59a654d51..7175efe949a 100644 --- a/src/gallium/drivers/asahi/agx_query.c +++ b/src/gallium/drivers/asahi/agx_query.c @@ -4,7 +4,10 @@ * SPDX-License-Identifier: MIT */ +#include +#include "pipe/p_defines.h" #include "util/u_prim.h" +#include "agx_device.h" #include "agx_state.h" #include "pool.h" @@ -72,6 +75,16 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery) ctx->tf_prims_generated = query; break; + case PIPE_QUERY_TIME_ELAPSED: + ctx->time_elapsed = query; + query->timestamp_begin = UINT64_MAX; + query->timestamp_end = 0; + return true; + + case PIPE_QUERY_TIMESTAMP: + /* No-op */ + break; + default: return false; } @@ -94,6 +107,7 
@@ static bool agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery) { struct agx_context *ctx = agx_context(pctx); + struct agx_device *dev = agx_device(pctx->screen); struct agx_query *query = (struct agx_query *)pquery; ctx->dirty |= AGX_DIRTY_QUERY; @@ -109,6 +123,18 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery) return true; case PIPE_QUERY_PRIMITIVES_EMITTED: ctx->tf_prims_generated = NULL; + return true; + case PIPE_QUERY_TIME_ELAPSED: + ctx->time_elapsed = NULL; + return true; + case PIPE_QUERY_TIMESTAMP: + /* Timestamp logically written now, set up batches to MAX their finish + * time in. If there are no batches, it's just the current time stamp. + */ + agx_add_timestamp_end_query(ctx, query); + + query->timestamp_end = agx_get_gpu_timestamp(dev); + return true; default: return false; @@ -121,6 +147,7 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery, { struct agx_query *query = (struct agx_query *)pquery; struct agx_context *ctx = agx_context(pctx); + struct agx_device *dev = agx_device(pctx->screen); /* For GPU queries, flush the writer. When the writer is flushed, the GPU * will write the value, and when we wait for the writer, the CPU will read @@ -137,6 +164,11 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery, struct agx_batch *writer = query->writer; agx_flush_batch_for_reason(ctx, writer, "GPU query"); agx_sync_batch_for_reason(ctx, writer, "GPU query"); + } else if (query->type == PIPE_QUERY_TIMESTAMP || + query->type == PIPE_QUERY_TIME_ELAPSED) { + /* TODO: Optimize this... timestamp queries are bonkers on tilers. 
*/ + agx_flush_all(ctx, "Timestamp query"); + agx_sync_all(ctx, "Timestamp query"); } /* After syncing, there is no writer left, so query->value is ready */ @@ -154,6 +186,15 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery, vresult->u64 = query->value; return true; + case PIPE_QUERY_TIMESTAMP: + vresult->u64 = agx_gpu_time_to_ns(dev, query->timestamp_end); + return true; + + case PIPE_QUERY_TIME_ELAPSED: + vresult->u64 = + agx_gpu_time_to_ns(dev, query->timestamp_end - query->timestamp_begin); + return true; + default: unreachable("Other queries not yet supported"); } @@ -224,7 +265,8 @@ agx_get_query_address(struct agx_batch *batch, struct agx_query *query) } void -agx_finish_batch_queries(struct agx_batch *batch) +agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts, + uint64_t end_ts) { uint64_t *occlusion = (uint64_t *)batch->occlusion_buffer.cpu; @@ -272,6 +314,15 @@ agx_finish_batch_queries(struct agx_batch *batch) query->ptr.cpu = NULL; query->ptr.gpu = 0; } + + util_dynarray_foreach(&batch->timestamp_queries, struct agx_query *, it) { + struct agx_query *query = *it; + if (query == NULL) + continue; + + query->timestamp_begin = MIN2(query->timestamp_begin, begin_ts); + query->timestamp_end = MAX2(query->timestamp_end, end_ts); + } } static void diff --git a/src/gallium/drivers/asahi/agx_state.c b/src/gallium/drivers/asahi/agx_state.c index be627128b44..4fab4a5ff8d 100644 --- a/src/gallium/drivers/asahi/agx_state.c +++ b/src/gallium/drivers/asahi/agx_state.c @@ -3743,6 +3743,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info, } struct agx_batch *batch = agx_get_batch(ctx); + agx_batch_add_timestamp_query(batch, ctx->time_elapsed); + unsigned idx_size = info->index_size; uint64_t ib = 0; size_t ib_extent = 0; @@ -4127,6 +4129,7 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info) { struct agx_context *ctx = agx_context(pipe); struct agx_batch *batch = 
agx_get_compute_batch(ctx); + agx_batch_add_timestamp_query(batch, ctx->time_elapsed); agx_batch_init_state(batch); diff --git a/src/gallium/drivers/asahi/agx_state.h b/src/gallium/drivers/asahi/agx_state.h index 409bff299ae..b1b8feb0d23 100644 --- a/src/gallium/drivers/asahi/agx_state.h +++ b/src/gallium/drivers/asahi/agx_state.h @@ -322,6 +322,7 @@ struct agx_batch { /* Non-occlusion queries */ struct util_dynarray nonocclusion_queries; + struct util_dynarray timestamp_queries; /* Result buffer where the kernel places command execution information */ union agx_batch_result *result; @@ -482,6 +483,7 @@ struct agx_context { struct agx_query *occlusion_query; struct agx_query *prims_generated; struct agx_query *tf_prims_generated; + struct agx_query *time_elapsed; bool active_queries; struct util_debug_callback debug; @@ -625,7 +627,15 @@ struct agx_query { struct agx_ptr ptr; /* Accumulator flushed to the CPU */ - uint64_t value; + union { + uint64_t value; + uint64_t timestamp_end; + }; + + /* For time elapsed queries, end is in the above union for consistent + * handling with timestamp queries.
+ */ + uint64_t timestamp_begin; }; struct agx_sampler_state { @@ -889,6 +899,10 @@ struct agx_batch *agx_get_compute_batch(struct agx_context *ctx); void agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch); int agx_cleanup_batches(struct agx_context *ctx); +void agx_batch_add_timestamp_query(struct agx_batch *batch, + struct agx_query *q); +void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q); + /* Blit shaders */ void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter, bool render_cond); @@ -910,7 +924,8 @@ uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query); uint64_t agx_get_query_address(struct agx_batch *batch, struct agx_query *query); -void agx_finish_batch_queries(struct agx_batch *batch); +void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts, + uint64_t end_ts); bool agx_render_condition_check_inner(struct agx_context *ctx);