diff --git a/docs/features.txt b/docs/features.txt index dcc38733494..0dbb1b31b8b 100644 --- a/docs/features.txt +++ b/docs/features.txt @@ -447,7 +447,7 @@ Vulkan 1.1 -- all DONE: anv, lvp, radv, tu, vn Vulkan 1.2 -- all DONE: anv, vn VK_KHR_8bit_storage DONE (anv/gen8+, lvp, radv, v3dv, vn) - VK_KHR_buffer_device_address DONE (anv/gen8+, lvp, radv, tu, vn) + VK_KHR_buffer_device_address DONE (anv/gen8+, lvp, radv, tu, v3dv, vn) VK_KHR_create_renderpass2 DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_depth_stencil_resolve DONE (anv, lvp, radv, tu, v3dv, vn) VK_KHR_draw_indirect_count DONE (anv, lvp, radv, tu, vn) diff --git a/src/broadcom/ci/broadcom-rpi4-skips.txt b/src/broadcom/ci/broadcom-rpi4-skips.txt index 656797f286d..6f8aded7a66 100644 --- a/src/broadcom/ci/broadcom-rpi4-skips.txt +++ b/src/broadcom/ci/broadcom-rpi4-skips.txt @@ -44,3 +44,6 @@ dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_clamp dEQP-VK.texture.explicit_lod.2d.sizes.128x128_nearest_linear_mipmap_linear_repeat dEQP-VK.ubo.random.all_out_of_order_offsets.45 dEQP-VK.ubo.random.all_shared_buffer.48 +dEQP-VK.ssbo.phys.layout.3* +dEQP-VK.ssbo.phys.layout.single_struct_array* +dEQP-VK.ssbo.phys.layout.basic_unsized_array* diff --git a/src/broadcom/vulkan/v3dv_cmd_buffer.c b/src/broadcom/vulkan/v3dv_cmd_buffer.c index 567a93dd14f..8dc819006b6 100644 --- a/src/broadcom/vulkan/v3dv_cmd_buffer.c +++ b/src/broadcom/vulkan/v3dv_cmd_buffer.c @@ -2538,6 +2538,10 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, struct v3dv_job *job = cmd_buffer_pre_draw_split_job(cmd_buffer); job->draw_count++; + /* Track VK_KHR_buffer_device_address usage in the job */ + struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; + job->uses_buffer_device_address |= pipeline->uses_buffer_device_address; + /* If this job is serialized (has consumed a barrier) then check if we need * to sync at the binning stage by testing if the binning shaders involved * with the draw call 
require access to external resources. @@ -2545,7 +2549,6 @@ v3dv_cmd_buffer_emit_pre_draw(struct v3dv_cmd_buffer *cmd_buffer, if (job->serialize && (cmd_buffer->state.barrier.bcl_buffer_access || cmd_buffer->state.barrier.bcl_image_access)) { assert(!job->needs_bcl_sync); - struct v3dv_pipeline *pipeline = cmd_buffer->state.gfx.pipeline; if (cmd_buffer_binning_sync_required(cmd_buffer, pipeline, indexed, indirect)) { consume_bcl_sync(cmd_buffer, job); @@ -3721,6 +3724,10 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer, wg_uniform_offsets_out); submit->cfg[6] = uniforms.bo->offset + uniforms.offset; + + /* Track VK_KHR_buffer_device_address usage in the job */ + job->uses_buffer_device_address |= pipeline->uses_buffer_device_address; + v3dv_job_add_bo(job, uniforms.bo); return job; diff --git a/src/broadcom/vulkan/v3dv_device.c b/src/broadcom/vulkan/v3dv_device.c index 10f36ca7031..b7c06566939 100644 --- a/src/broadcom/vulkan/v3dv_device.c +++ b/src/broadcom/vulkan/v3dv_device.c @@ -116,6 +116,7 @@ get_device_extensions(const struct v3dv_physical_device *device, .KHR_8bit_storage = true, .KHR_16bit_storage = true, .KHR_bind_memory2 = true, + .KHR_buffer_device_address = true, .KHR_copy_commands2 = true, .KHR_create_renderpass2 = true, .KHR_dedicated_allocation = true, @@ -1203,6 +1204,10 @@ v3dv_GetPhysicalDeviceFeatures2(VkPhysicalDevice physicalDevice, .vulkanMemoryModel = true, .vulkanMemoryModelDeviceScope = true, .vulkanMemoryModelAvailabilityVisibilityChains = true, + + .bufferDeviceAddress = true, + .bufferDeviceAddressCaptureReplay = false, + .bufferDeviceAddressMultiDevice = false, }; VkPhysicalDeviceVulkan11Features vk11 = { @@ -1975,6 +1980,10 @@ v3dv_CreateDevice(VkPhysicalDevice physicalDevice, device->default_attribute_float = v3dv_pipeline_create_default_attribute_values(device, NULL); + device->device_address_mem_ctx = ralloc_context(NULL); + util_dynarray_init(&device->device_address_bo_list, + device->device_address_mem_ctx); + 
*pDevice = v3dv_device_to_handle(device); return VK_SUCCESS; @@ -2004,6 +2013,8 @@ v3dv_DestroyDevice(VkDevice _device, device->default_attribute_float = NULL; } + ralloc_free(device->device_address_mem_ctx); + /* Bo cache should be removed the last, as any other object could be * freeing their private bos */ @@ -2203,6 +2214,24 @@ fail_create: #endif } +static void +device_add_device_address_bo(struct v3dv_device *device, + struct v3dv_bo *bo) +{ + util_dynarray_append(&device->device_address_bo_list, + struct v3dv_bo *, + bo); +} + +static void +device_remove_device_address_bo(struct v3dv_device *device, + struct v3dv_bo *bo) +{ + util_dynarray_delete_unordered(&device->device_address_bo_list, + struct v3dv_bo *, + bo); +} + VKAPI_ATTR VkResult VKAPI_CALL v3dv_AllocateMemory(VkDevice _device, const VkMemoryAllocateInfo *pAllocateInfo, @@ -2229,6 +2258,7 @@ v3dv_AllocateMemory(VkDevice _device, const struct wsi_memory_allocate_info *wsi_info = NULL; const VkImportMemoryFdInfoKHR *fd_info = NULL; + const VkMemoryAllocateFlagsInfo *flags_info = NULL; vk_foreach_struct_const(ext, pAllocateInfo->pNext) { switch ((unsigned)ext->sType) { case VK_STRUCTURE_TYPE_WSI_MEMORY_ALLOCATE_INFO_MESA: @@ -2238,9 +2268,7 @@ v3dv_AllocateMemory(VkDevice _device, fd_info = (void *)ext; break; case VK_STRUCTURE_TYPE_MEMORY_ALLOCATE_FLAGS_INFO: - /* We don't support VK_KHR_buffer_device_address or multiple - * devices per device group, so we can ignore this. 
- */ + flags_info = (void *)ext; break; case VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO: /* We don't have particular optimizations associated with memory @@ -2288,6 +2316,20 @@ v3dv_AllocateMemory(VkDevice _device, return vk_error(device, result); } + /* If this memory can be used via VK_KHR_buffer_device_address then we + * will need to manually add the BO to any job submit that makes use of + * VK_KHR_buffer_device_address, since such jobs may produce buffer + * load/store operations that may access any buffer memory allocated with + * this flag and we don't have any means to tell which buffers will be + * accessed through this mechanism since they don't even have to be bound + * through descriptor state. + */ + if (flags_info && + (flags_info->flags & VK_MEMORY_ALLOCATE_DEVICE_ADDRESS_BIT_KHR)) { + mem->is_for_device_address = true; + device_add_device_address_bo(device, mem->bo); + } + *pMem = v3dv_device_memory_to_handle(mem); return result; } @@ -2306,6 +2348,9 @@ v3dv_FreeMemory(VkDevice _device, if (mem->bo->map) v3dv_UnmapMemory(_device, _mem); + if (mem->is_for_device_address) + device_remove_device_address_bo(device, mem->bo); + device_free(device, mem); vk_object_free(&device->vk, pAllocator, mem); @@ -2844,3 +2889,28 @@ vk_icdNegotiateLoaderICDInterfaceVersion(uint32_t* pSupportedVersion) *pSupportedVersion = MIN2(*pSupportedVersion, 5u); return VK_SUCCESS; } + +VkDeviceAddress +v3dv_GetBufferDeviceAddress(VkDevice device, + const VkBufferDeviceAddressInfoKHR *pInfo) +{ + V3DV_FROM_HANDLE(v3dv_buffer, buffer, pInfo->buffer); + return buffer->mem_offset + buffer->mem->bo->offset; +} + +uint64_t +v3dv_GetBufferOpaqueCaptureAddress(VkDevice device, + const VkBufferDeviceAddressInfoKHR *pInfo) +{ + /* Not implemented */ + return 0; +} + +uint64_t +v3dv_GetDeviceMemoryOpaqueCaptureAddress( + VkDevice device, + const VkDeviceMemoryOpaqueCaptureAddressInfoKHR *pInfo) +{ + /* Not implemented */ + return 0; +} diff --git 
a/src/broadcom/vulkan/v3dv_pipeline.c b/src/broadcom/vulkan/v3dv_pipeline.c index 6248bb8efbf..3068bdbf7bb 100644 --- a/src/broadcom/vulkan/v3dv_pipeline.c +++ b/src/broadcom/vulkan/v3dv_pipeline.c @@ -178,10 +178,11 @@ static const struct spirv_to_nir_options default_spirv_options = { .variable_pointers = true, .vk_memory_model = true, .vk_memory_model_device_scope = true, + .physical_storage_buffer_address = true, }, .ubo_addr_format = nir_address_format_32bit_index_offset, .ssbo_addr_format = nir_address_format_32bit_index_offset, - .phys_ssbo_addr_format = nir_address_format_64bit_global, + .phys_ssbo_addr_format = nir_address_format_2x32bit_global, .push_const_addr_format = nir_address_format_logical, .shared_addr_format = nir_address_format_32bit_offset, }; @@ -405,6 +406,10 @@ preprocess_nir(nir_shader *nir) nir_var_mem_ubo | nir_var_mem_ssbo, nir_address_format_32bit_index_offset); + NIR_PASS_V(nir, nir_lower_explicit_io, + nir_var_mem_global, + nir_address_format_2x32bit_global); + NIR_PASS_V(nir, nir_lower_load_const_to_scalar); /* Lower a bunch of stuff */ @@ -2320,6 +2325,20 @@ pipeline_add_multiview_gs(struct v3dv_pipeline *pipeline, return true; } +static void +pipeline_check_buffer_device_address(struct v3dv_pipeline *pipeline) +{ + for (int i = BROADCOM_SHADER_VERTEX; i < BROADCOM_SHADER_STAGES; i++) { + struct v3dv_shader_variant *variant = pipeline->shared_data->variants[i]; + if (variant && variant->prog_data.base->has_global_address) { + pipeline->uses_buffer_device_address = true; + return; + } + } + + pipeline->uses_buffer_device_address = false; +} + /* * It compiles a pipeline. 
Note that it also allocate internal object, but if * some allocations success, but other fails, the method is not freeing the @@ -2557,6 +2576,8 @@ pipeline_compile_graphics(struct v3dv_pipeline *pipeline, success: + pipeline_check_buffer_device_address(pipeline); + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; write_creation_feedback(pipeline, pCreateInfo->pNext, @@ -3220,6 +3241,8 @@ pipeline_compile_compute(struct v3dv_pipeline *pipeline, success: + pipeline_check_buffer_device_address(pipeline); + pipeline_feedback.duration = os_time_get_nano() - pipeline_start; write_creation_feedback(pipeline, info->pNext, diff --git a/src/broadcom/vulkan/v3dv_private.h b/src/broadcom/vulkan/v3dv_private.h index d6e160bc074..3107ca10de1 100644 --- a/src/broadcom/vulkan/v3dv_private.h +++ b/src/broadcom/vulkan/v3dv_private.h @@ -513,6 +513,9 @@ struct v3dv_device { struct v3dv_bo *default_attribute_float; VkPhysicalDeviceFeatures features; + void *device_address_mem_ctx; + struct util_dynarray device_address_bo_list; /* Array of struct v3dv_bo * */ + #ifdef ANDROID const void *gralloc; enum { @@ -529,6 +532,7 @@ struct v3dv_device_memory { struct v3dv_bo *bo; const VkMemoryType *type; bool is_for_wsi; + bool is_for_device_address; }; #define V3D_OUTPUT_IMAGE_FORMAT_NO 255 @@ -1059,6 +1063,15 @@ struct v3dv_job { /* If the job executes on the transfer stage of the pipeline */ bool is_transfer; + /* VK_KHR_buffer_device_address allows shaders to use pointers that can + * dereference memory in any buffer that has been flagged with + * VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR. These buffers may not + * be bound via descriptor sets, so we need to make sure that a job that + * uses this functionality includes all these buffers in its kernel + * submission. 
+ */ + bool uses_buffer_device_address; + enum v3dv_job_type type; struct v3dv_device *device; @@ -1951,6 +1964,9 @@ struct v3dv_pipeline { /* Flags for whether optional pipeline stages are present, for convenience */ bool has_gs; + /* Whether any stage in this pipeline uses VK_KHR_buffer_device_address */ + bool uses_buffer_device_address; + /* Spilling memory requirements */ struct { struct v3dv_bo *bo; diff --git a/src/broadcom/vulkan/v3dv_queue.c b/src/broadcom/vulkan/v3dv_queue.c index a3d92466d88..b988ee25218 100644 --- a/src/broadcom/vulkan/v3dv_queue.c +++ b/src/broadcom/vulkan/v3dv_queue.c @@ -770,6 +770,17 @@ handle_cl_job(struct v3dv_queue *queue, if (job->tmu_dirty_rcl) submit.flags |= DRM_V3D_SUBMIT_CL_FLUSH_CACHE; + /* If the job uses VK_KHR_buffer_device_address we need to ensure all + * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR + * are included. + */ + if (job->uses_buffer_device_address) { + util_dynarray_foreach(&queue->device->device_address_bo_list, + struct v3dv_bo *, bo) { + v3dv_job_add_bo(job, *bo); + } + } + submit.bo_handle_count = job->bo_count; uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * submit.bo_handle_count); @@ -913,6 +924,17 @@ handle_csd_job(struct v3dv_queue *queue, struct drm_v3d_submit_csd *submit = &job->csd.submit; + /* If the job uses VK_KHR_buffer_device_address we need to ensure all + * buffers flagged with VK_BUFFER_USAGE_SHADER_DEVICE_ADDRESS_BIT_KHR + * are included. + */ + if (job->uses_buffer_device_address) { + util_dynarray_foreach(&queue->device->device_address_bo_list, + struct v3dv_bo *, bo) { + v3dv_job_add_bo(job, *bo); + } + } + submit->bo_handle_count = job->bo_count; uint32_t *bo_handles = (uint32_t *) malloc(sizeof(uint32_t) * MAX2(4, submit->bo_handle_count * 2));