asahi: Implement timer queries

Everything but the uapi piece.

Signed-off-by: Alyssa Rosenzweig <alyssa@rosenzweig.io>
Part-of: <https://gitlab.freedesktop.org/mesa/mesa/-/merge_requests/26056>
This commit is contained in:
Alyssa Rosenzweig 2023-11-05 12:26:59 -04:00 committed by Marge Bot
parent bc8232c4a2
commit 49225db140
7 changed files with 147 additions and 5 deletions

View file

@ -6,6 +6,7 @@
#include "agx_device.h"
#include <inttypes.h>
#include "util/timespec.h"
#include "agx_bo.h"
#include "agx_compile.h"
#include "decode.h"
@ -19,6 +20,7 @@
#include "util/log.h"
#include "util/os_file.h"
#include "util/os_mman.h"
#include "util/os_time.h"
#include "util/simple_mtx.h"
#include "git_sha1.h"
#include "nir_serialize.h"
@ -456,3 +458,20 @@ agx_debug_fault(struct agx_device *dev, uint64_t addr)
pthread_mutex_unlock(&dev->bo_map_lock);
}
/*
 * Read the current value of the raw timestamp counter, in timer ticks.
 *
 * On AArch64 the virtual counter register (cntvct_el0) is read directly. On
 * x86 the time stamp counter is read with rdtsc instead. Any other
 * architecture is unsupported and hits unreachable().
 *
 * The dev argument is not used by either implementation.
 *
 * Returns the 64-bit counter value. Use agx_gpu_time_to_ns() to convert it to
 * nanoseconds.
 */
uint64_t
agx_get_gpu_timestamp(struct agx_device *dev)
{
   /* The architecture macro is DETECT_ARCH_AARCH64. A misspelt name is an
    * undefined macro that silently evaluates to 0, which would skip this
    * branch on the very architecture it is written for.
    */
#if DETECT_ARCH_AARCH64
   uint64_t ret;
   __asm__ volatile("mrs \t%0, cntvct_el0" : "=r"(ret));
   return ret;
#elif DETECT_ARCH_X86 || DETECT_ARCH_X86_64
   /* Maps to the above when run under FEX without thunking */
   uint32_t high, low;
   __asm__ volatile("rdtsc" : "=a"(low), "=d"(high));
   return (uint64_t)low | ((uint64_t)high << 32);
#else
   unreachable("Kernel support for fetching timestamps pending");
#endif
}

View file

@ -8,6 +8,7 @@
#include "util/simple_mtx.h"
#include "util/sparse_array.h"
#include "util/timespec.h"
#include "util/vma.h"
#include "agx_bo.h"
#include "agx_formats.h"
@ -50,6 +51,7 @@ struct drm_asahi_params_global {
uint32_t gpu_generation;
uint32_t gpu_variant;
uint32_t num_dies;
uint32_t timer_frequency_hz;
};
/* How many power-of-two levels in the BO cache do we want? 2^14 minimum chosen
@ -145,4 +147,12 @@ int agx_export_sync_file(struct agx_device *dev, struct agx_bo *bo);
void agx_debug_fault(struct agx_device *dev, uint64_t addr);
/* Returns the current raw timestamp counter value, in timer ticks. Use
 * agx_gpu_time_to_ns() to convert the result to nanoseconds.
 */
uint64_t agx_get_gpu_timestamp(struct agx_device *dev);
/*
 * Convert a timestamp or duration from timer ticks to nanoseconds.
 *
 * The result is floor(gpu_time * NSEC_PER_SEC / timer_frequency_hz). It is
 * computed as whole seconds plus a sub-second remainder, because the direct
 * product gpu_time * NSEC_PER_SEC wraps a 64-bit integer once gpu_time
 * exceeds roughly 1.8e10 ticks. The remainder is always smaller than the
 * frequency, so its product with NSEC_PER_SEC stays in range for any
 * frequency that fits in 32 bits (timer_frequency_hz is declared uint32_t in
 * struct drm_asahi_params_global).
 *
 * dev->params.timer_frequency_hz must be non-zero; it is used as a divisor.
 */
static inline uint64_t
agx_gpu_time_to_ns(struct agx_device *dev, uint64_t gpu_time)
{
   const uint64_t freq_hz = dev->params.timer_frequency_hz;
   const uint64_t whole_secs = gpu_time / freq_hz;
   const uint64_t leftover_ticks = gpu_time % freq_hz;

   return (whole_secs * NSEC_PER_SEC) +
          ((leftover_ticks * NSEC_PER_SEC) / freq_hz);
}
#endif

View file

@ -6,6 +6,7 @@
#include <xf86drm.h>
#include "asahi/lib/decode.h"
#include "util/u_dynarray.h"
#include "agx_state.h"
#define foreach_active(ctx, idx) \
@ -125,6 +126,7 @@ agx_batch_init(struct agx_context *ctx,
util_dynarray_init(&batch->depth_bias, ctx);
util_dynarray_init(&batch->occlusion_queries, ctx);
util_dynarray_init(&batch->nonocclusion_queries, ctx);
util_dynarray_init(&batch->timestamp_queries, ctx);
batch->clear = 0;
batch->draw = 0;
@ -167,7 +169,9 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
assert(ctx->batch != batch);
agx_finish_batch_queries(batch);
uint64_t begin_ts = ~0, end_ts = 0;
/* TODO: UAPI pending */
agx_finish_batch_queries(batch, begin_ts, end_ts);
batch->occlusion_buffer.cpu = NULL;
batch->occlusion_buffer.gpu = 0;
@ -205,6 +209,7 @@ agx_batch_cleanup(struct agx_context *ctx, struct agx_batch *batch, bool reset)
util_dynarray_fini(&batch->depth_bias);
util_dynarray_fini(&batch->occlusion_queries);
util_dynarray_fini(&batch->nonocclusion_queries);
util_dynarray_fini(&batch->timestamp_queries);
if (!(dev->debug & (AGX_DBG_TRACE | AGX_DBG_SYNC))) {
agx_batch_print_stats(dev, batch);
@ -742,3 +747,25 @@ agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch)
agx_batch_cleanup(ctx, batch, true);
}
void
agx_batch_add_timestamp_query(struct agx_batch *batch, struct agx_query *q)
{
if (q)
util_dynarray_append(&batch->timestamp_queries, struct agx_query *, q);
}
/*
 * A timestamp query reports the time at which all work submitted so far has
 * finished. Since this is a tiler and splitting a batch is undesirable, that
 * is treated as the time at which every currently active batch finishes, so
 * the query is attached to each active batch.
 */
void
agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q)
{
   unsigned slot;

   foreach_active(ctx, slot) {
      struct agx_batch *batch = &ctx->batches.slots[slot];

      agx_batch_add_timestamp_query(batch, q);
   }
}

View file

@ -27,6 +27,8 @@
#include "pipe/p_state.h"
#include "util/format/u_format.h"
#include "util/half_float.h"
#include "util/macros.h"
#include "util/timespec.h"
#include "util/u_drm.h"
#include "util/u_gen_mipmap.h"
#include "util/u_inlines.h"
@ -1555,6 +1557,8 @@ agx_get_name(struct pipe_screen *pscreen)
static int
agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
{
struct agx_device *dev = agx_device(pscreen);
switch (param) {
case PIPE_CAP_NPOT_TEXTURES:
case PIPE_CAP_SHADER_STENCIL_EXPORT:
@ -1580,6 +1584,8 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 1;
case PIPE_CAP_OCCLUSION_QUERY:
case PIPE_CAP_QUERY_TIMESTAMP:
case PIPE_CAP_QUERY_TIME_ELAPSED:
case PIPE_CAP_GENERATE_MIPMAP:
case PIPE_CAP_PRIMITIVE_RESTART:
case PIPE_CAP_PRIMITIVE_RESTART_FIXED_INDEX:
@ -1587,6 +1593,10 @@ agx_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
case PIPE_CAP_NATIVE_FENCE_FD:
return true;
case PIPE_CAP_TIMER_RESOLUTION:
/* Timer resolution is the length of a single tick in nanos */
return agx_gpu_time_to_ns(dev, 1);
case PIPE_CAP_SAMPLER_VIEW_TARGET:
case PIPE_CAP_TEXTURE_SWIZZLE:
case PIPE_CAP_BLEND_EQUATION_SEPARATE:
@ -2146,6 +2156,13 @@ agx_screen_get_fd(struct pipe_screen *pscreen)
return agx_device(pscreen)->fd;
}
/*
 * Screen get_timestamp hook: read the raw timestamp counter for the screen's
 * device and return it converted to nanoseconds.
 */
static uint64_t
agx_get_timestamp(struct pipe_screen *pscreen)
{
   struct agx_device *dev = agx_device(pscreen);
   uint64_t ticks = agx_get_gpu_timestamp(dev);

   return agx_gpu_time_to_ns(dev, ticks);
}
struct pipe_screen *
agx_screen_create(int fd, struct renderonly *ro,
const struct pipe_screen_config *config)
@ -2197,7 +2214,7 @@ agx_screen_create(int fd, struct renderonly *ro,
screen->resource_get_handle = agx_resource_get_handle;
screen->resource_get_param = agx_resource_get_param;
screen->resource_create_with_modifiers = agx_resource_create_with_modifiers;
screen->get_timestamp = u_default_get_timestamp;
screen->get_timestamp = agx_get_timestamp;
screen->fence_reference = agx_fence_reference;
screen->fence_finish = agx_fence_finish;
screen->fence_get_fd = agx_fence_get_fd;

View file

@ -4,7 +4,10 @@
* SPDX-License-Identifier: MIT
*/
#include <stdint.h>
#include "pipe/p_defines.h"
#include "util/u_prim.h"
#include "agx_device.h"
#include "agx_state.h"
#include "pool.h"
@ -72,6 +75,16 @@ agx_begin_query(struct pipe_context *pctx, struct pipe_query *pquery)
ctx->tf_prims_generated = query;
break;
case PIPE_QUERY_TIME_ELAPSED:
ctx->time_elapsed = query;
query->timestamp_begin = UINT64_MAX;
query->timestamp_end = 0;
return true;
case PIPE_QUERY_TIMESTAMP:
/* No-op */
break;
default:
return false;
}
@ -94,6 +107,7 @@ static bool
agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
{
struct agx_context *ctx = agx_context(pctx);
struct agx_device *dev = agx_device(pctx->screen);
struct agx_query *query = (struct agx_query *)pquery;
ctx->dirty |= AGX_DIRTY_QUERY;
@ -109,6 +123,18 @@ agx_end_query(struct pipe_context *pctx, struct pipe_query *pquery)
return true;
case PIPE_QUERY_PRIMITIVES_EMITTED:
ctx->tf_prims_generated = NULL;
return true;
case PIPE_QUERY_TIME_ELAPSED:
ctx->time_elapsed = NULL;
return true;
case PIPE_QUERY_TIMESTAMP:
/* Timestamp logically written now, set up batches to MAX their finish
* time in. If there are no batches, it's just the current time stamp.
*/
agx_add_timestamp_end_query(ctx, query);
query->timestamp_end = agx_get_gpu_timestamp(dev);
return true;
default:
return false;
@ -121,6 +147,7 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
{
struct agx_query *query = (struct agx_query *)pquery;
struct agx_context *ctx = agx_context(pctx);
struct agx_device *dev = agx_device(pctx->screen);
/* For GPU queries, flush the writer. When the writer is flushed, the GPU
* will write the value, and when we wait for the writer, the CPU will read
@ -137,6 +164,11 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
struct agx_batch *writer = query->writer;
agx_flush_batch_for_reason(ctx, writer, "GPU query");
agx_sync_batch_for_reason(ctx, writer, "GPU query");
} else if (query->type == PIPE_QUERY_TIMESTAMP ||
query->type == PIPE_QUERY_TIME_ELAPSED) {
/* TODO: Optimize this... timestamp queries are bonkers on tilers. */
agx_flush_all(ctx, "Timestamp query");
agx_sync_all(ctx, "Timestamp query");
}
/* After syncing, there is no writer left, so query->value is ready */
@ -154,6 +186,15 @@ agx_get_query_result(struct pipe_context *pctx, struct pipe_query *pquery,
vresult->u64 = query->value;
return true;
case PIPE_QUERY_TIMESTAMP:
vresult->u64 = agx_gpu_time_to_ns(dev, query->timestamp_end);
return true;
case PIPE_QUERY_TIME_ELAPSED:
vresult->u64 =
agx_gpu_time_to_ns(dev, query->timestamp_end - query->timestamp_begin);
return true;
default:
unreachable("Other queries not yet supported");
}
@ -224,7 +265,8 @@ agx_get_query_address(struct agx_batch *batch, struct agx_query *query)
}
void
agx_finish_batch_queries(struct agx_batch *batch)
agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
uint64_t end_ts)
{
uint64_t *occlusion = (uint64_t *)batch->occlusion_buffer.cpu;
@ -272,6 +314,15 @@ agx_finish_batch_queries(struct agx_batch *batch)
query->ptr.cpu = NULL;
query->ptr.gpu = 0;
}
util_dynarray_foreach(&batch->timestamp_queries, struct agx_query *, it) {
struct agx_query *query = *it;
if (query == NULL)
continue;
query->timestamp_begin = MIN2(query->timestamp_begin, begin_ts);
query->timestamp_end = MAX2(query->timestamp_end, end_ts);
}
}
static void

View file

@ -3743,6 +3743,8 @@ agx_draw_vbo(struct pipe_context *pctx, const struct pipe_draw_info *info,
}
struct agx_batch *batch = agx_get_batch(ctx);
agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
unsigned idx_size = info->index_size;
uint64_t ib = 0;
size_t ib_extent = 0;
@ -4127,6 +4129,7 @@ agx_launch_grid(struct pipe_context *pipe, const struct pipe_grid_info *info)
{
struct agx_context *ctx = agx_context(pipe);
struct agx_batch *batch = agx_get_compute_batch(ctx);
agx_batch_add_timestamp_query(batch, ctx->time_elapsed);
agx_batch_init_state(batch);

View file

@ -322,6 +322,7 @@ struct agx_batch {
/* Non-occlusion queries */
struct util_dynarray nonocclusion_queries;
struct util_dynarray timestamp_queries;
/* Result buffer where the kernel places command execution information */
union agx_batch_result *result;
@ -482,6 +483,7 @@ struct agx_context {
struct agx_query *occlusion_query;
struct agx_query *prims_generated;
struct agx_query *tf_prims_generated;
struct agx_query *time_elapsed;
bool active_queries;
struct util_debug_callback debug;
@ -625,7 +627,15 @@ struct agx_query {
struct agx_ptr ptr;
/* Accumulator flushed to the CPU */
uint64_t value;
union {
uint64_t value;
uint64_t timestamp_end;
};
/* For time elapsed queries, end is in the above union for consistent
* handling with timestamp queries.
*/
uint64_t timestamp_begin;
};
struct agx_sampler_state {
@ -889,6 +899,10 @@ struct agx_batch *agx_get_compute_batch(struct agx_context *ctx);
void agx_batch_reset(struct agx_context *ctx, struct agx_batch *batch);
int agx_cleanup_batches(struct agx_context *ctx);
/* Appends q to the batch's list of timestamp queries; a NULL q is ignored. */
void agx_batch_add_timestamp_query(struct agx_batch *batch,
struct agx_query *q);
/* Adds q as a timestamp query to every active batch of the context. */
void agx_add_timestamp_end_query(struct agx_context *ctx, struct agx_query *q);
/* Blit shaders */
void agx_blitter_save(struct agx_context *ctx, struct blitter_context *blitter,
bool render_cond);
@ -910,7 +924,8 @@ uint16_t agx_get_oq_index(struct agx_batch *batch, struct agx_query *query);
uint64_t agx_get_query_address(struct agx_batch *batch,
struct agx_query *query);
void agx_finish_batch_queries(struct agx_batch *batch);
void agx_finish_batch_queries(struct agx_batch *batch, uint64_t begin_ts,
uint64_t end_ts);
bool agx_render_condition_check_inner(struct agx_context *ctx);