mesa-24.0.2

-----BEGIN PGP SIGNATURE-----
 
 iQFGBAABCAAwFiEEV1Ud4VuWj2NBwkj2jY4xr8MkKKYFAmXffd4SHGVyaWNAZW5n
 ZXN0cm9tLmNoAAoJEI2OMa/DJCime7oH/3U7BdJqYW0o12sWEk0vHgi1eGYRzAK9
 CZMMMNzNeBBNfBoDBMlHh+r6jnwzmMnOLYcQQIY3jn+QA/md9vdM9GyHxJgSyQcJ
 Up7/9dJeWr9lOvxKiJR1c0Wz6y8cr+aLYJMjVihnHCTFU51cLjh+W1hfOtRhQtVJ
 o8yDtNBXCLyQgyXdPWm/ANYDtYWyuEkaONHq8tL2KaGXM7txjeTn1j8E4/nQe1QQ
 6jmQKGKm148ftoplssNBYyLMWg2f46Fbp3c4s6pJ3fHwCQel8BTV9Rq3mfjtDeVF
 P3rTvBAUZ8hV1Rh51/ZxbFIG6M3FFnm/4ryKH5zPuPQ6WsTthTM9SKk=
 =P30T
 -----END PGP SIGNATURE-----
gpgsig -----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCAAdFiEEzgD2pEY1++nF3Ggr3Ztj+AXPXAMFAmXkohwACgkQ3Ztj+AXP
 XANvdA//aNtR+ZJh3mqwJ1iFvnlzl1B9hqw5fOczjiQs33qq3AD62XyDV+ZmP9Rg
 fevI8VMoDaFTBL+LRSdi5NIwEqSg7g9m2eU1I85YJB6f1yMeJlPOaHoKDhGHF5vK
 pJEgFc1bG6RyS6nmR2PJ1TZ93pSBWPDNSqkXAXm6BbY7+tcAmXSYBz1SgpyaBVC5
 kZslxAkmN5OkITc0M8E2/1ph69Xo30673zGtQ92hZuYzlZpKNB8n+aQpo21nHWpj
 TkI36jjYkW0optGv13Xp2LAixmUvc1F+5uXPudGyw6VbpotutWfQR8cxslhvwUXq
 VAY3fV7wN71+L52dfIJvJGQ74brbJm+gxXZGjcdXjCO1Ux//TC8rAdtIv62DbHeQ
 Lt8eRL+mXCMpRNg/WoIjtYkKNsn/Hunxlvukcs+iFmPirkkEbYsr+82Hdut667Yc
 SX5PzayYCfqnmLx1BONmOX8kl1tA9hGyykrFXI6SawEclQ30+S0k6irfYzQmBlm/
 jVaQCLvR8Y6A2Uj/Pp9sEbw4aR3JhmLPDRiaZMHkTPi0u9OY+YW1OHMBHMEmcACU
 COqu0Ks15WZt4c1mOJSJeocJ4A/OVfGHaJimL8vDXCsVovGQ5o6DcYgvIXm6k58p
 g2MYKZf2q7QwxEgkLN8qAVBcZmJGW4312VZhgI8Z0geDWkmK9lc=
 =Tln2
 -----END PGP SIGNATURE-----

Merge tag 'mesa-24.0.2' into 24/neroreflex

mesa-24.0.2
Denis 2024-03-03 17:15:19 +01:00
commit 097a0be263
87 changed files with 8813 additions and 500 deletions

@@ -316,6 +316,7 @@ fedora/x86_64_build:
.kernel+rootfs:
timeout: 2h # 24.0-only change
extends:
- .container+build-rules
- .debian-container

@@ -43,7 +43,7 @@ rustfmt:
- rustfmt --verbose src/**/lib.rs
- rustfmt --verbose src/**/main.rs
clang-format:
.clang-format:
extends:
- .formatting-check
- .lint-clang-format-rules

File diff suppressed because it is too large.

@@ -1 +1 @@
24.0.1
24.0.2

@@ -1359,7 +1359,7 @@ RADV driver environment variables
``video_decode``
enable experimental video decoding support
``gsfastlaunch2``
use GS_FAST_LAUNCH=2 for Mesh shaders (GFX11+)
use GS_FAST_LAUNCH=2 for Mesh shaders (GFX11+ dGPUs only)
.. envvar:: RADV_TEX_ANISO

@@ -3,6 +3,7 @@ Release Notes
The release notes summarize what's new or changed in each Mesa release.
- :doc:`24.0.2 release notes <relnotes/24.0.2>`
- :doc:`24.0.1 release notes <relnotes/24.0.1>`
- :doc:`24.0.0 release notes <relnotes/24.0.0>`
- :doc:`23.3.3 release notes <relnotes/23.3.3>`
@@ -409,6 +410,7 @@ The release notes summarize what's new or changed in each Mesa release.
:maxdepth: 1
:hidden:
24.0.2 <relnotes/24.0.2>
24.0.1 <relnotes/24.0.1>
24.0.0 <relnotes/24.0.0>
23.3.3 <relnotes/23.3.3>

@@ -19,7 +19,7 @@ SHA256 checksum
::
TBD.
f387192b08c471c545590dd12230a2a343244804b5fe866fec6aea02eab57613 mesa-24.0.1.tar.xz
New features

docs/relnotes/24.0.2.rst (new file, 230 lines)
@@ -0,0 +1,230 @@
Mesa 24.0.2 Release Notes / 2024-02-28
======================================
Mesa 24.0.2 is a bug fix release which fixes bugs found since the 24.0.1 release.
Mesa 24.0.2 implements the OpenGL 4.6 API, but the version reported by
glGetString(GL_VERSION) or glGetIntegerv(GL_MAJOR_VERSION) /
glGetIntegerv(GL_MINOR_VERSION) depends on the particular driver being used.
Some drivers don't support all the features required in OpenGL 4.6. OpenGL
4.6 is **only** available if requested at context creation.
Compatibility contexts may report a lower version depending on each driver.
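An application can see what a given driver actually exposes with a query
along these lines (a minimal sketch, assuming the application has already
created and made current an OpenGL context)::

   #include <stdio.h>
   #include <GL/gl.h>

   /* Print both the version string and the integer context version;
    * with Mesa these reflect the driver's capabilities, not a fixed 4.6. */
   static void print_context_version(void)
   {
      GLint major = 0, minor = 0;
      glGetIntegerv(GL_MAJOR_VERSION, &major);
      glGetIntegerv(GL_MINOR_VERSION, &minor);
      printf("GL_VERSION: %s\n", (const char *)glGetString(GL_VERSION));
      printf("Context version: %d.%d\n", major, minor);
   }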
Mesa 24.0.2 implements the Vulkan 1.3 API, but the version reported by
the apiVersion property of the VkPhysicalDeviceProperties struct
depends on the particular driver being used.
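The reported version can likewise be read back with a small helper (a
minimal sketch; ``phys`` is assumed to be a ``VkPhysicalDevice`` obtained
earlier from ``vkEnumeratePhysicalDevices()``)::

   #include <stdio.h>
   #include <vulkan/vulkan.h>

   /* Decode apiVersion into major.minor.patch with the standard macros. */
   static void print_device_api_version(VkPhysicalDevice phys)
   {
      VkPhysicalDeviceProperties props;
      vkGetPhysicalDeviceProperties(phys, &props);
      printf("apiVersion: %u.%u.%u\n",
             VK_API_VERSION_MAJOR(props.apiVersion),
             VK_API_VERSION_MINOR(props.apiVersion),
             VK_API_VERSION_PATCH(props.apiVersion));
   }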
SHA256 checksum
---------------
::
TBD.
New features
------------
- None
Bug fixes
---------
- KHR-Single-GL46.arrays_of_arrays_gl.AtomicUsage fails on MTL
- GTF-GL46.gtf42.GL3Tests.texture_storage.texture_storage_texture_as_framebuffer_attachment fails on MTL
- [intel][anv][build][regression] - genX_grl.h:27:10: fatal error: grl/grl_cl_kernel.h: No such file or directory
- RX 6600 VDPAU not recognizing HEVC_MAIN_10 correctly
- Running an app on another AMD GPU (offload, DRI_PRIME) produces corrupted frames on Wayland.
- VDPAU declares a texture as "immutable" without also setting its ImmutableLevels attribute.
- RX6600 hardware HEVC video decode fails for VDPAU but works for VA-API. (Can lock up GPU!)
- Rusticl panics when getting program build logs using opencl.hpp
- ue5 game issues lighting Rog Ally 7080u (z1e)
- Missing textures in RoboCop: Rogue City with mesh shaders enabled
- radv: Multiview PSO forgets to export layer in some cases.
- zink: flickering artifacts in Selaco
Changes
-------
Boyuan Zhang (1):
- radeonsi/vcn: only use multi slices reflist when available
Chia-I Wu (1):
- radv: fix pipeline stats mask
Chris Rankin (2):
- vdpau: Declare texture object as immutable using helper function.
- vdpau: Refactor query for video surface formats.
Connor Abbott (1):
- tu: Follow pipeline compatibility rules for dynamic descriptors
Daniel Schürmann (1):
- spirv: Fix SpvOpExpectKHR
Daniel Stone (2):
- egl/wayland: Add opaque-equivalent FourCCs
- egl/wayland: Fix EGL_EXT_present_opaque
Dave Airlie (2):
- nouveau/winsys: fix bda heap leak.
- nvk: fix dri options leak.
David Rosca (1):
- frontends/va: Only set VP9 segmentation fields when segmentation is enabled
Eric Engestrom (10):
- docs: add sha256sum for 24.0.1
- [24.0-only change] ci: increase the kernel+rootfs builds timeout to 2h
- .pick_status.json: Update to c6e855b64b9015235462959b2b7f3e9fc34b2f1f
- .pick_status.json: Update to dce20690542c84ac00509a6db7902dcfc90b25bb
- .pick_status.json: Update to c12300844d3f084ca011a3f54f0cbaa9807418f0
- .pick_status.json: Mark 3b927567ac927316eb11901f50ee1573ead44fd2 as denominated
- .pick_status.json: Update to 423add61e2d5b6ab6b5505d1feec01b93609f8fc
- .pick_status.json: Update to 4071c399a27932ea9253eb8a65d5725504bac6f3
- .pick_status.json: Update to 82ff9204abab5267f82a9ce73f9dca1541ef5ee6
- [24.0 only] disable clang-format
Erik Faye-Lund (1):
- mesa/main: allow GL_BGRA for FBOs
Faith Ekstrand (1):
- nvk: Invalidate the texture cache before MSAA resolves
Hans-Kristian Arntzen (1):
- radv: export multiview in VS/TES/GS for depth-only rendering
Iago Toral Quiroga (1):
- v3d,v3dv: fix BO allocation for shared vars
Ian Romanick (1):
- nir: Mark nir_intrinsic_load_global_block_intel as divergent
Jesse Natalie (1):
- dzn: Don't set view instancing mask until after the PSO
Jordan Justen (1):
- intel/dev: Add 2 additional ADL-N PCI ids
Juston Li (1):
- venus: fix image reqs cache store locking
Karol Herbst (3):
- zink: lower unaligned memory accesses
- rusticl/program: fix CL_PROGRAM_BINARIES for devs with no builds
- meson: do not pull in clc for clover
Konstantin Seurer (5):
- zink: Always set mfence->submit_count to the fence submit_count
- Revert "zink: always force flushes when originating from api frontend"
- llvmpipe: Use full subgroups when possible
- gallivm: Consider the initial mask when terminating loops
- ci: Update llvmpipe trace checksums
Lionel Landwerlin (8):
- vulkan/runtime: add helper to query attachment layout
- anv: fixup push descriptor shader analysis
- anv: reenable ANV_ALWAYS_BINDLESS
- anv: fix Wa_16013994831 macros
- anv: disable Wa_16013994831
- intel/nir: only consider ray query variables in lowering
- anv: limit depth flush on dynamic render pass suspend
- anv: add missing generated file dep
Martin Roukala (né Peres) (1):
- radv/ci: switch vkcts-polaris10 from mupuf to KWS' farm
Michel Dänzer (1):
- egl/wayland: Flush after blitting to linear copy
Mike Blumenkrantz (25):
- zink: prune dmabuf export tracking when adding resource binds
- zink: fix sparse bo placement
- zink: zero allocate resident_defs array in ntv
- zink: move sparse lowering up in file
- zink: run sparse lowering after all optimization passes
- zink: adjust swizzled deref loads by the variable component offset
- zink: clamp zink_gfx_lib_cache::stages_present for generated tcs
- zink: promote gpl libs freeing during shader destroy out of prog loop
- zink: don't add VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT for sparse textures
- zink: delete maxDescriptorBufferBindings checks
- zink: avoid infinite recursion on (very) small BAR systems in bo alloc
- zink: add checks/compat for low-spec descriptor buffer implementations
- zink: add a second fence disambiguation case
- zink: force host-visible allocations for MAP_COHERENT resources
- zink: handle stencil_fallback in zink_clear_depth_stencil
- zink: don't destroy the current batch state on context destroy
- mesa: check driver format support for certain GetInternalformativ queries
- vk/wsi/x11/sw: use swapchain depth for putimage
- zink: only scan active batch states for free states if > 1 exist
- zink: fix longstanding issue with active batch state recycling
- zink: assert that batch_id is valid in zink_screen_check_last_finished()
- zink: clamp in_rp clears to fb size
- zink: fix (dynamic rendering) execution of scissored clears during flush
- zink: lock buffer age when chundering swapchain for readback
- zink: flag acquired swapchain image as readback target on acquire, not present
Patrick Lerda (3):
- r300: fix vertex_buffer related refcnt imbalance
- r300: fix r300_destroy_context() related memory leaks
- r300: fix memory leaks when register allocation fails
Pavel Ondračka (1):
- r300: add explicit flrp lowering
Rhys Perry (2):
- aco/ra: don't initialize assigned in initializer list
- aco/ra: fix GFX9- writelane
Sagar Ghuge (1):
- nir: Allow nir_texop_tg4 in implicit derivative
Samuel Pitoiset (4):
- radv: fix RGP barrier reason for RP barriers inserted by the runtime
- radv: enable GS_FAST_LAUNCH=2 by default for RDNA3 APUs (Phoenix)
- spirv: only consider IO variables when adjusting patch locations for TES
- radv: fix indirect dispatches on compute queue with conditional rendering on GFX7
Tapani Pälli (2):
- intel/blorp: disable use of REP16 independent of format
- iris: make sure DS and TE are sent in pairs on >= gfx125
Yiwei Zhang (2):
- venus: force async pipeline create on threads creating descriptor pools
- venus: fix the cmd stride used for qfb recording
thfrwn (1):
- mesa: fix off-by-one for newblock allocation in dlist_alloc

@@ -187,6 +187,8 @@ CHIPSET(0x46c3, adl_gt2, "ADL GT2", "Intel(R) Graphics")
CHIPSET(0x46d0, adl_n, "ADL-N", "Intel(R) Graphics")
CHIPSET(0x46d1, adl_n, "ADL-N", "Intel(R) Graphics")
CHIPSET(0x46d2, adl_n, "ADL-N", "Intel(R) Graphics")
CHIPSET(0x46d3, adl_n, "ADL-N", "Intel(R) Graphics")
CHIPSET(0x46d4, adl_n, "ADL-N", "Intel(R) Graphics")
CHIPSET(0x9a40, tgl_gt2, "TGL GT2", "Intel(R) Xe Graphics")
CHIPSET(0x9a49, tgl_gt2, "TGL GT2", "Intel(R) Xe Graphics")

@@ -813,7 +813,6 @@ if _opencl != 'disabled'
error('The Clover OpenCL state tracker requires rtti')
endif
with_clc = true
with_gallium_opencl = true
with_opencl_icd = _opencl == 'icd'
else
@@ -838,7 +837,7 @@ if with_gallium_rusticl
endif
dep_clc = null_dep
if with_clc
if with_gallium_opencl or with_clc
dep_clc = dependency('libclc')
endif

@@ -163,7 +163,7 @@ radeonsi-raven-va-full:x86_64:
vkcts-polaris10-valve:
extends:
- .vkcts-test-valve
- .polaris10-test-valve-mupuf
- .polaris10-test-valve-kws
- .radv-valve-manual-rules
timeout: 1h 15m
variables:

@@ -62,7 +62,7 @@ struct assignment {
};
uint32_t affinity = 0;
assignment() = default;
assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_), assigned(-1) {}
assignment(PhysReg reg_, RegClass rc_) : reg(reg_), rc(rc_) { assigned = true; }
void set(const Definition& def)
{
assigned = true;
@@ -1936,19 +1936,6 @@ bool
operand_can_use_reg(amd_gfx_level gfx_level, aco_ptr<Instruction>& instr, unsigned idx, PhysReg reg,
RegClass rc)
{
bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64;
if (gfx_level <= GFX9 && is_writelane && idx <= 1) {
/* v_writelane_b32 can take two sgprs but only if one is m0. */
bool is_other_sgpr =
instr->operands[!idx].isTemp() &&
(!instr->operands[!idx].isFixed() || instr->operands[!idx].physReg() != m0);
if (is_other_sgpr && instr->operands[!idx].tempId() != instr->operands[idx].tempId()) {
instr->operands[idx].setFixed(m0);
return reg == m0;
}
}
if (reg.byte()) {
unsigned stride = get_subdword_operand_stride(gfx_level, instr, idx, rc);
if (reg.byte() % stride)
@@ -2844,6 +2831,18 @@ register_allocation(Program* program, std::vector<IDSet>& live_out_per_block, ra
operand.isFixed() && ctx.assignments[operand.tempId()].reg != operand.physReg();
}
bool is_writelane = instr->opcode == aco_opcode::v_writelane_b32 ||
instr->opcode == aco_opcode::v_writelane_b32_e64;
if (program->gfx_level <= GFX9 && is_writelane && instr->operands[0].isTemp() &&
instr->operands[1].isTemp()) {
/* v_writelane_b32 can take two sgprs but only if one is m0. */
if (ctx.assignments[instr->operands[0].tempId()].reg != m0 &&
ctx.assignments[instr->operands[1].tempId()].reg != m0) {
instr->operands[0].setFixed(m0);
fixed = true;
}
}
if (fixed)
handle_fixed_operands(ctx, register_file, parallelcopy, instr);

@@ -410,3 +410,21 @@ BEGIN_TEST(regalloc.vinterp_fp16)
finish_ra_test(ra_test_policy());
END_TEST
BEGIN_TEST(regalloc.writelane)
//>> v1: %in0:v[0], s1: %in1:s[0], s1: %in2:s[1], s1: %in3:s[2] = p_startpgm
if (!setup_cs("v1 s1 s1 s1", GFX8))
return;
//! s1: %tmp:m0 = p_parallelcopy %in3:s[2]
Temp tmp = bld.copy(bld.def(s1, m0), inputs[3]);
//! s1: %in1_2:m0, s1: %tmp_2:s[0] = p_parallelcopy %in1:s[0], %tmp:m0
//! v1: %tmp2:v[0] = v_writelane_b32_e64 %in1_2:m0, %in2:s[1], %in0:v[0]
Temp tmp2 = bld.writelane(bld.def(v1), inputs[1], inputs[2], inputs[0]);
//! p_unit_test %tmp_2:s[0], %tmp2:v[0]
bld.pseudo(aco_opcode::p_unit_test, tmp, tmp2);
finish_ra_test(ra_test_policy());
END_TEST

@@ -9665,12 +9665,16 @@ radv_emit_dispatch_packets(struct radv_cmd_buffer *cmd_buffer, const struct radv
if (radv_cmd_buffer_uses_mec(cmd_buffer)) {
uint64_t indirect_va = info->va;
const bool needs_align32_workaround =
cmd_buffer->device->physical_device->rad_info.has_async_compute_align32_bug &&
cmd_buffer->qf == RADV_QUEUE_COMPUTE && !radv_is_aligned(indirect_va, 32);
const unsigned ace_predication_size =
4 /* DISPATCH_INDIRECT */ + (needs_align32_workaround ? 6 * 3 /* 3x COPY_DATA */ : 0);
radv_cs_emit_compute_predication(&cmd_buffer->state, cs, cmd_buffer->mec_inv_pred_va,
&cmd_buffer->mec_inv_pred_emitted, 4 /* DISPATCH_INDIRECT size */);
&cmd_buffer->mec_inv_pred_emitted, ace_predication_size);
if (cmd_buffer->device->physical_device->rad_info.has_async_compute_align32_bug &&
cmd_buffer->qf == RADV_QUEUE_COMPUTE && !radv_is_aligned(indirect_va, 32)) {
if (needs_align32_workaround) {
const uint64_t unaligned_va = indirect_va;
UNUSED void *ptr;
uint32_t offset;
@@ -10642,8 +10646,15 @@ VKAPI_ATTR void VKAPI_CALL
radv_CmdPipelineBarrier2(VkCommandBuffer commandBuffer, const VkDependencyInfo *pDependencyInfo)
{
RADV_FROM_HANDLE(radv_cmd_buffer, cmd_buffer, commandBuffer);
enum rgp_barrier_reason barrier_reason;
radv_barrier(cmd_buffer, pDependencyInfo, RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER);
if (cmd_buffer->vk.runtime_rp_barrier) {
barrier_reason = RGP_BARRIER_EXTERNAL_RENDER_PASS_SYNC;
} else {
barrier_reason = RGP_BARRIER_EXTERNAL_CMD_PIPELINE_BARRIER;
}
radv_barrier(cmd_buffer, pDependencyInfo, barrier_reason);
}
static void

@@ -1178,11 +1178,12 @@ get_vs_output_info(const struct radv_graphics_pipeline *pipeline)
static bool
radv_should_export_multiview(const struct radv_shader_stage *stage, const struct radv_pipeline_key *pipeline_key)
{
/* Export the layer in the last VGT stage if multiview is used. When the next stage is unknown
* (with graphics pipeline library), the layer is exported unconditionally.
/* Export the layer in the last VGT stage if multiview is used.
* Also checks for NONE stage, which happens when we have depth-only rendering.
* When the next stage is unknown (with graphics pipeline library), the layer is exported unconditionally.
*/
return pipeline_key->has_multiview_view_index &&
(stage->info.next_stage == MESA_SHADER_FRAGMENT ||
(stage->info.next_stage == MESA_SHADER_FRAGMENT || stage->info.next_stage == MESA_SHADER_NONE ||
!(pipeline_key->lib_flags & VK_GRAPHICS_PIPELINE_LIBRARY_FRAGMENT_SHADER_BIT_EXT)) &&
!(stage->nir->info.outputs_written & VARYING_BIT_LAYER);
}

@@ -1155,7 +1155,7 @@ radv_query_shader(struct radv_cmd_buffer *cmd_buffer, VkPipeline *pipeline, stru
.range = VK_WHOLE_SIZE}}});
/* Encode the number of elements for easy access by the shader. */
pipeline_stats_mask &= (1 << radv_get_pipelinestat_query_size(device)) - 1;
pipeline_stats_mask &= (1 << (radv_get_pipelinestat_query_size(device) / 8)) - 1;
pipeline_stats_mask |= util_bitcount(pipeline_stats_mask) << 16;
avail_offset -= src_offset;

@@ -4327,7 +4327,7 @@ cmd_buffer_create_csd_job(struct v3dv_cmd_buffer *cmd_buffer,
if (cs_variant->prog_data.cs->shared_size > 0) {
job->csd.shared_memory =
v3dv_bo_alloc(cmd_buffer->device,
cs_variant->prog_data.cs->shared_size * wgs_per_sg,
cs_variant->prog_data.cs->shared_size * num_wgs,
"shared_vars", true);
if (!job->csd.shared_memory) {
v3dv_flag_oom(cmd_buffer, NULL);

@@ -79,7 +79,7 @@ subdir('nir')
subdir('spirv')
if with_opencl_spirv
if with_clc
subdir('clc')
endif
if with_gallium

@@ -3133,6 +3133,8 @@ nir_tex_instr_has_implicit_derivative(const nir_tex_instr *instr)
case nir_texop_txb:
case nir_texop_lod:
return true;
case nir_texop_tg4:
return instr->is_gather_implicit_lod;
default:
return false;
}

@@ -189,7 +189,6 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
case nir_intrinsic_load_resume_shader_address_amd:
case nir_intrinsic_load_global_const_block_intel:
case nir_intrinsic_load_reloc_const_intel:
case nir_intrinsic_load_global_block_intel:
case nir_intrinsic_load_btd_global_arg_addr_intel:
case nir_intrinsic_load_btd_local_arg_addr_intel:
case nir_intrinsic_load_mesh_inline_data_intel:
@@ -219,6 +218,13 @@ visit_intrinsic(nir_shader *shader, nir_intrinsic_instr *instr)
is_divergent = false;
break;
/* This is divergent because it specifically loads sequential values into
* successive SIMD lanes.
*/
case nir_intrinsic_load_global_block_intel:
is_divergent = true;
break;
case nir_intrinsic_decl_reg:
is_divergent = nir_intrinsic_divergent(instr);
break;

@@ -4383,6 +4383,7 @@ vtn_handle_composite(struct vtn_builder *b, SpvOp opcode,
break;
}
case SpvOpCopyObject:
case SpvOpExpectKHR:
vtn_copy_value(b, w[3], w[2]);
return;
@@ -6458,18 +6459,18 @@ vtn_handle_body_instruction(struct vtn_builder *b, SpvOp opcode,
vtn_handle_integer_dot(b, opcode, w, count);
break;
case SpvOpBitcast:
vtn_handle_bitcast(b, w, count);
break;
/* TODO: One day, we should probably do something with this information
* For now, though, it's safe to implement them as no-ops.
* Needed for Rusticl sycl support.
*/
case SpvOpAssumeTrueKHR:
break;
case SpvOpExpectKHR:
break;
case SpvOpBitcast:
vtn_handle_bitcast(b, w, count);
break;
case SpvOpVectorExtractDynamic:
case SpvOpVectorInsertDynamic:
case SpvOpVectorShuffle:

@@ -2024,7 +2024,9 @@ adjust_patch_locations(struct vtn_builder *b, struct vtn_variable *var)
for (uint16_t i = 0; i < num_data; i++) {
vtn_assert(data[i].location < VARYING_SLOT_PATCH0);
if (data[i].patch && data[i].location >= VARYING_SLOT_VAR0)
if (data[i].patch &&
(data[i].mode == nir_var_shader_in || data[i].mode == nir_var_shader_out) &&
data[i].location >= VARYING_SLOT_VAR0)
data[i].location += VARYING_SLOT_PATCH0 - VARYING_SLOT_VAR0;
}
}

@@ -73,6 +73,7 @@ static const struct dri2_wl_visual {
*/
int alt_dri_image_format;
int bpp;
int opaque_wl_drm_format;
int rgba_shifts[4];
unsigned int rgba_sizes[4];
} dri2_wl_visuals[] = {
@@ -83,6 +84,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_ABGR16161616F,
0,
64,
WL_DRM_FORMAT_XBGR16F,
{0, 16, 32, 48},
{16, 16, 16, 16},
},
@@ -93,6 +95,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_XBGR16161616F,
0,
64,
WL_DRM_FORMAT_XBGR16F,
{0, 16, 32, -1},
{16, 16, 16, 0},
},
@@ -103,6 +106,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_XRGB2101010,
__DRI_IMAGE_FORMAT_XBGR2101010,
32,
WL_DRM_FORMAT_XRGB2101010,
{20, 10, 0, -1},
{10, 10, 10, 0},
},
@@ -113,6 +117,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_ARGB2101010,
__DRI_IMAGE_FORMAT_ABGR2101010,
32,
WL_DRM_FORMAT_XRGB2101010,
{20, 10, 0, 30},
{10, 10, 10, 2},
},
@@ -123,6 +128,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_XBGR2101010,
__DRI_IMAGE_FORMAT_XRGB2101010,
32,
WL_DRM_FORMAT_XBGR2101010,
{0, 10, 20, -1},
{10, 10, 10, 0},
},
@@ -133,6 +139,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_ABGR2101010,
__DRI_IMAGE_FORMAT_ARGB2101010,
32,
WL_DRM_FORMAT_XBGR2101010,
{0, 10, 20, 30},
{10, 10, 10, 2},
},
@@ -143,6 +150,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_XRGB8888,
__DRI_IMAGE_FORMAT_NONE,
32,
WL_DRM_FORMAT_XRGB8888,
{16, 8, 0, -1},
{8, 8, 8, 0},
},
@@ -153,6 +161,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_ARGB8888,
__DRI_IMAGE_FORMAT_NONE,
32,
WL_DRM_FORMAT_XRGB8888,
{16, 8, 0, 24},
{8, 8, 8, 8},
},
@@ -163,6 +172,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_ABGR8888,
__DRI_IMAGE_FORMAT_NONE,
32,
WL_DRM_FORMAT_XBGR8888,
{0, 8, 16, 24},
{8, 8, 8, 8},
},
@@ -173,6 +183,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_XBGR8888,
__DRI_IMAGE_FORMAT_NONE,
32,
WL_DRM_FORMAT_XBGR8888,
{0, 8, 16, -1},
{8, 8, 8, 0},
},
@@ -183,6 +194,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_RGB565,
__DRI_IMAGE_FORMAT_NONE,
16,
WL_DRM_FORMAT_RGB565,
{11, 5, 0, -1},
{5, 6, 5, 0},
},
@@ -193,6 +205,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_ARGB1555,
__DRI_IMAGE_FORMAT_ABGR1555,
16,
WL_DRM_FORMAT_XRGB1555,
{10, 5, 0, 15},
{5, 5, 5, 1},
},
@@ -203,6 +216,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_XRGB1555,
__DRI_IMAGE_FORMAT_XBGR1555,
16,
WL_DRM_FORMAT_XRGB1555,
{10, 5, 0, -1},
{5, 5, 5, 0},
},
@@ -213,6 +227,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_ARGB4444,
__DRI_IMAGE_FORMAT_XBGR4444,
16,
WL_DRM_FORMAT_XRGB4444,
{8, 4, 0, 12},
{4, 4, 4, 4},
},
@@ -223,6 +238,7 @@ static const struct dri2_wl_visual {
__DRI_IMAGE_FORMAT_XRGB4444,
__DRI_IMAGE_FORMAT_XBGR4444,
16,
WL_DRM_FORMAT_XRGB4444,
{8, 4, 0, -1},
{4, 4, 4, 0},
},
@@ -230,7 +246,7 @@ static const struct dri2_wl_visual {
static int
dri2_wl_visual_idx_from_config(struct dri2_egl_display *dri2_dpy,
const __DRIconfig *config, bool force_opaque)
const __DRIconfig *config)
{
int shifts[4];
unsigned int sizes[4];
@@ -240,16 +256,13 @@ dri2_wl_visual_idx_from_config(struct dri2_egl_display *dri2_dpy,
for (unsigned int i = 0; i < ARRAY_SIZE(dri2_wl_visuals); i++) {
const struct dri2_wl_visual *wl_visual = &dri2_wl_visuals[i];
int cmp_rgb_shifts =
memcmp(shifts, wl_visual->rgba_shifts, 3 * sizeof(shifts[0]));
int cmp_rgb_sizes =
memcmp(sizes, wl_visual->rgba_sizes, 3 * sizeof(sizes[0]));
int cmp_rgba_shifts =
memcmp(shifts, wl_visual->rgba_shifts, 4 * sizeof(shifts[0]));
int cmp_rgba_sizes =
memcmp(sizes, wl_visual->rgba_sizes, 4 * sizeof(sizes[0]));
if (cmp_rgb_shifts == 0 && cmp_rgb_sizes == 0 &&
wl_visual->rgba_shifts[3] == (force_opaque ? -1 : shifts[3]) &&
wl_visual->rgba_sizes[3] == (force_opaque ? 0 : sizes[3])) {
if (cmp_rgba_shifts == 0 && cmp_rgba_sizes == 0)
return i;
}
}
return -1;
@@ -302,7 +315,7 @@ dri2_wl_is_format_supported(void *user_data, uint32_t format)
for (int i = 0; dri2_dpy->driver_configs[i]; i++)
if (j == dri2_wl_visual_idx_from_config(
dri2_dpy, dri2_dpy->driver_configs[i], false))
dri2_dpy, dri2_dpy->driver_configs[i]))
return true;
return false;
@@ -710,43 +723,10 @@ dri2_wl_create_window_surface(_EGLDisplay *disp, _EGLConfig *conf,
dri2_surf->base.Width = window->width;
dri2_surf->base.Height = window->height;
#ifndef NDEBUG
/* Enforce that every visual has an opaque variant (requirement to support
* EGL_EXT_present_opaque)
*/
for (unsigned int i = 0; i < ARRAY_SIZE(dri2_wl_visuals); i++) {
const struct dri2_wl_visual *transparent_visual = &dri2_wl_visuals[i];
if (transparent_visual->rgba_sizes[3] == 0) {
continue;
}
bool found_opaque_equivalent = false;
for (unsigned int j = 0; j < ARRAY_SIZE(dri2_wl_visuals); j++) {
const struct dri2_wl_visual *opaque_visual = &dri2_wl_visuals[j];
if (opaque_visual->rgba_sizes[3] != 0) {
continue;
}
int cmp_rgb_shifts =
memcmp(transparent_visual->rgba_shifts, opaque_visual->rgba_shifts,
3 * sizeof(opaque_visual->rgba_shifts[0]));
int cmp_rgb_sizes =
memcmp(transparent_visual->rgba_sizes, opaque_visual->rgba_sizes,
3 * sizeof(opaque_visual->rgba_sizes[0]));
if (cmp_rgb_shifts == 0 && cmp_rgb_sizes == 0) {
found_opaque_equivalent = true;
break;
}
}
assert(found_opaque_equivalent);
}
#endif
visual_idx = dri2_wl_visual_idx_from_config(dri2_dpy, config,
dri2_surf->base.PresentOpaque);
visual_idx = dri2_wl_visual_idx_from_config(dri2_dpy, config);
assert(visual_idx != -1);
assert(dri2_wl_visuals[visual_idx].dri_image_format !=
__DRI_IMAGE_FORMAT_NONE);
if (dri2_dpy->wl_dmabuf || dri2_dpy->wl_drm) {
dri2_surf->format = dri2_wl_visuals[visual_idx].wl_drm_format;
@@ -1501,6 +1481,9 @@ create_wl_buffer(struct dri2_egl_display *dri2_dpy,
close(fd);
}
if (dri2_surf && dri2_surf->base.PresentOpaque)
fourcc = dri2_wl_visuals[visual_idx].opaque_wl_drm_format;
ret = zwp_linux_buffer_params_v1_create_immed(params, width, height,
fourcc, 0);
zwp_linux_buffer_params_v1_destroy(params);
@@ -1643,6 +1626,12 @@ dri2_wl_swap_buffers_with_damage(_EGLDisplay *disp, _EGLSurface *draw,
dri2_surf->current->dri_image, 0, 0, dri2_surf->base.Width,
dri2_surf->base.Height, 0, 0, dri2_surf->base.Width,
dri2_surf->base.Height, 0);
if (dri2_dpy->flush) {
__DRIdrawable *dri_drawable = dri2_dpy->vtbl->get_dri_drawable(draw);
dri2_dpy->flush->flush(dri_drawable);
}
}
wl_surface_commit(dri2_surf->wl_surface_wrapper);
@@ -2078,7 +2067,7 @@ dri2_wl_add_configs_for_visuals(_EGLDisplay *disp)
/* No match for config. Try if we can blitImage convert to a visual */
c = dri2_wl_visual_idx_from_config(dri2_dpy,
dri2_dpy->driver_configs[i], false);
dri2_dpy->driver_configs[i]);
if (c == -1)
continue;

@@ -2093,7 +2093,7 @@ tu_reset_cmd_buffer(struct vk_command_buffer *vk_cmd_buffer,
memset(&cmd_buffer->descriptors[i].push_set, 0, sizeof(cmd_buffer->descriptors[i].push_set));
cmd_buffer->descriptors[i].push_set.base.type = VK_OBJECT_TYPE_DESCRIPTOR_SET;
cmd_buffer->descriptors[i].max_sets_bound = 0;
cmd_buffer->descriptors[i].dynamic_bound = 0;
cmd_buffer->descriptors[i].max_dynamic_offset_size = 0;
}
u_trace_fini(&cmd_buffer->trace);
@@ -2385,12 +2385,12 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
cmd->state.desc_sets =
tu_cs_draw_state(&cmd->sub_cs, &state_cs,
4 + 4 * descriptors_state->max_sets_bound +
(descriptors_state->dynamic_bound ? 6 : 0));
(descriptors_state->max_dynamic_offset_size ? 6 : 0));
} else {
cmd->state.desc_sets =
tu_cs_draw_state(&cmd->sub_cs, &state_cs,
3 + 2 * descriptors_state->max_sets_bound +
(descriptors_state->dynamic_bound ? 3 : 0));
(descriptors_state->max_dynamic_offset_size ? 3 : 0));
}
cs = &state_cs;
} else {
@@ -2410,7 +2410,7 @@ tu6_emit_descriptor_sets(struct tu_cmd_buffer *cmd,
}
/* Dynamic descriptors get the reserved descriptor set. */
if (descriptors_state->dynamic_bound) {
if (descriptors_state->max_dynamic_offset_size) {
int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
assert(reserved_set_idx >= 0); /* reserved set must be bound */
@@ -2561,22 +2561,26 @@ tu_CmdBindDescriptorSets(VkCommandBuffer commandBuffer,
assert(dyn_idx == dynamicOffsetCount);
if (dynamic_offset_offset) {
descriptors_state->max_dynamic_offset_size =
MAX2(descriptors_state->max_dynamic_offset_size, dynamic_offset_offset);
/* allocate and fill out dynamic descriptor set */
struct tu_cs_memory dynamic_desc_set;
int reserved_set_idx = cmd->device->physical_device->reserved_set_idx;
VkResult result = tu_cs_alloc(&cmd->sub_cs,
dynamic_offset_offset / (4 * A6XX_TEX_CONST_DWORDS),
A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
VkResult result =
tu_cs_alloc(&cmd->sub_cs,
descriptors_state->max_dynamic_offset_size /
(4 * A6XX_TEX_CONST_DWORDS),
A6XX_TEX_CONST_DWORDS, &dynamic_desc_set);
if (result != VK_SUCCESS) {
vk_command_buffer_set_error(&cmd->vk, result);
return;
}
memcpy(dynamic_desc_set.map, descriptors_state->dynamic_descriptors,
dynamic_offset_offset);
descriptors_state->max_dynamic_offset_size);
assert(reserved_set_idx >= 0); /* reserved set must be bound */
descriptors_state->set_iova[reserved_set_idx] = dynamic_desc_set.iova | BINDLESS_DESCRIPTOR_64B;
descriptors_state->dynamic_bound = true;
}
tu_dirty_desc_sets(cmd, pipelineBindPoint);

@@ -54,7 +54,7 @@ struct tu_descriptor_state
uint32_t dynamic_descriptors[MAX_DYNAMIC_BUFFERS_SIZE];
uint64_t set_iova[MAX_SETS];
uint32_t max_sets_bound;
bool dynamic_bound;
uint32_t max_dynamic_offset_size;
};
enum tu_cmd_dirty_bits

@@ -27,6 +27,7 @@
**************************************************************************/
#include "util/u_memory.h"
#include "lp_bld_const.h"
#include "lp_bld_type.h"
#include "lp_bld_init.h"
#include "lp_bld_flow.h"
@@ -271,18 +272,17 @@ void lp_exec_bgnloop(struct lp_exec_mask *mask, bool load)
}
void lp_exec_endloop(struct gallivm_state *gallivm,
struct lp_exec_mask *mask)
struct lp_exec_mask *exec_mask,
struct lp_build_mask_context *mask)
{
LLVMBuilderRef builder = mask->bld->gallivm->builder;
struct function_ctx *ctx = func_ctx(mask);
LLVMBuilderRef builder = exec_mask->bld->gallivm->builder;
struct function_ctx *ctx = func_ctx(exec_mask);
LLVMBasicBlockRef endloop;
LLVMTypeRef int_type = LLVMInt32TypeInContext(mask->bld->gallivm->context);
LLVMTypeRef reg_type = LLVMIntTypeInContext(gallivm->context,
mask->bld->type.width *
mask->bld->type.length);
LLVMTypeRef int_type = LLVMInt32TypeInContext(exec_mask->bld->gallivm->context);
LLVMTypeRef mask_type = LLVMIntTypeInContext(exec_mask->bld->gallivm->context, exec_mask->bld->type.length);
LLVMValueRef i1cond, i2cond, icond, limiter;
assert(mask->break_mask);
assert(exec_mask->break_mask);
assert(ctx->loop_stack_size);
if (ctx->loop_stack_size > LP_MAX_TGSI_NESTING) {
@@ -294,14 +294,14 @@ void lp_exec_endloop(struct gallivm_state *gallivm,
/*
* Restore the cont_mask, but don't pop
*/
mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size - 1].cont_mask;
lp_exec_mask_update(mask);
exec_mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size - 1].cont_mask;
lp_exec_mask_update(exec_mask);
/*
* Unlike the continue mask, the break_mask must be preserved across loop
* iterations
*/
LLVMBuildStore(builder, mask->break_mask, ctx->break_var);
LLVMBuildStore(builder, exec_mask->break_mask, ctx->break_var);
/* Decrement the loop limiter */
limiter = LLVMBuildLoad2(builder, int_type, ctx->loop_limiter, "");
@@ -314,12 +314,18 @@ void lp_exec_endloop(struct gallivm_state *gallivm,
LLVMBuildStore(builder, limiter, ctx->loop_limiter);
/* i1cond = (mask != 0) */
LLVMValueRef end_mask = exec_mask->exec_mask;
if (mask)
end_mask = LLVMBuildAnd(builder, exec_mask->exec_mask, lp_build_mask_value(mask), "");
end_mask = LLVMBuildICmp(builder, LLVMIntNE, end_mask, lp_build_zero(gallivm, exec_mask->bld->type), "");
end_mask = LLVMBuildBitCast(builder, end_mask, mask_type, "");
/* i1cond = (end_mask != 0) */
i1cond = LLVMBuildICmp(
builder,
LLVMIntNE,
LLVMBuildBitCast(builder, mask->exec_mask, reg_type, ""),
LLVMConstNull(reg_type), "i1cond");
end_mask,
LLVMConstNull(mask_type), "i1cond");
/* i2cond = (looplimiter > 0) */
i2cond = LLVMBuildICmp(
@@ -331,7 +337,7 @@ void lp_exec_endloop(struct gallivm_state *gallivm,
/* if( i1cond && i2cond ) */
icond = LLVMBuildAnd(builder, i1cond, i2cond, "");
endloop = lp_build_insert_new_block(mask->bld->gallivm, "endloop");
endloop = lp_build_insert_new_block(exec_mask->bld->gallivm, "endloop");
LLVMBuildCondBr(builder,
icond, ctx->loop_block, endloop);
@@ -341,14 +347,14 @@ void lp_exec_endloop(struct gallivm_state *gallivm,
assert(ctx->loop_stack_size);
--ctx->loop_stack_size;
--ctx->bgnloop_stack_size;
mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size].cont_mask;
mask->break_mask = ctx->loop_stack[ctx->loop_stack_size].break_mask;
exec_mask->cont_mask = ctx->loop_stack[ctx->loop_stack_size].cont_mask;
exec_mask->break_mask = ctx->loop_stack[ctx->loop_stack_size].break_mask;
ctx->loop_block = ctx->loop_stack[ctx->loop_stack_size].loop_block;
ctx->break_var = ctx->loop_stack[ctx->loop_stack_size].break_var;
ctx->break_type = ctx->break_type_stack[ctx->loop_stack_size +
ctx->switch_stack_size];
lp_exec_mask_update(mask);
lp_exec_mask_update(exec_mask);
}
void lp_exec_mask_cond_push(struct lp_exec_mask *mask,

@@ -101,6 +101,8 @@ struct lp_exec_mask {
int function_stack_size;
};
struct lp_build_mask_context;
void lp_exec_mask_function_init(struct lp_exec_mask *mask, int function_idx);
void lp_exec_mask_init(struct lp_exec_mask *mask, struct lp_build_context *bld);
void lp_exec_mask_fini(struct lp_exec_mask *mask);
@@ -112,7 +114,8 @@ void lp_exec_mask_update(struct lp_exec_mask *mask);
void lp_exec_bgnloop_post_phi(struct lp_exec_mask *mask);
void lp_exec_bgnloop(struct lp_exec_mask *mask, bool load_mask);
void lp_exec_endloop(struct gallivm_state *gallivm,
struct lp_exec_mask *mask);
struct lp_exec_mask *exec_mask,
struct lp_build_mask_context *mask);
void lp_exec_mask_cond_push(struct lp_exec_mask *mask,
LLVMValueRef val);
void lp_exec_mask_cond_invert(struct lp_exec_mask *mask);

@@ -2024,7 +2024,7 @@ static void bgnloop(struct lp_build_nir_context *bld_base)
static void endloop(struct lp_build_nir_context *bld_base)
{
struct lp_build_nir_soa_context *bld = (struct lp_build_nir_soa_context *)bld_base;
lp_exec_endloop(bld_base->base.gallivm, &bld->exec_mask);
lp_exec_endloop(bld_base->base.gallivm, &bld->exec_mask, bld->mask);
}
static void if_cond(struct lp_build_nir_context *bld_base, LLVMValueRef cond)

@@ -4268,7 +4268,7 @@ endloop_emit(
{
struct lp_build_tgsi_soa_context * bld = lp_soa_context(bld_base);
lp_exec_endloop(bld_base->base.gallivm, &bld->exec_mask);
lp_exec_endloop(bld_base->base.gallivm, &bld->exec_mask, bld->mask);
}
static void

@@ -118,21 +118,21 @@ vl_video_buffer_is_format_supported(struct pipe_screen *screen,
vl_get_video_buffer_formats(screen, format, resource_formats);
for (i = 0; i < VL_NUM_COMPONENTS; ++i) {
enum pipe_format format = resource_formats[i];
enum pipe_format fmt = resource_formats[i];
if (format == PIPE_FORMAT_NONE)
if (fmt == PIPE_FORMAT_NONE)
continue;
/* we at least need to sample from it */
if (!screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW))
return false;
if (!screen->is_format_supported(screen, fmt, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_SAMPLER_VIEW))
continue;
format = vl_video_buffer_surface_format(format);
if (!screen->is_format_supported(screen, format, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_RENDER_TARGET))
return false;
fmt = vl_video_buffer_surface_format(fmt);
if (screen->is_format_supported(screen, fmt, PIPE_TEXTURE_2D, 0, 0, PIPE_BIND_RENDER_TARGET))
return true;
}
return true;
return false;
}
unsigned

@@ -7135,7 +7135,12 @@ iris_upload_dirty_render_state(struct iris_context *ice,
}
}
#if GFX_VERx10 >= 125
/* This is only used on >= gfx125 for dynamic 3DSTATE_TE emission
* related workarounds.
*/
bool program_needs_wa_14015055625 = false;
#endif
#if INTEL_WA_14015055625_GFX_VER
/* Check if FS stage will use primitive ID overrides for Wa_14015055625. */
@@ -7239,16 +7244,14 @@ iris_upload_dirty_render_state(struct iris_context *ice,
GENX(3DSTATE_PS_length));
iris_emit_merge(batch, shader_psx, psx_state,
GENX(3DSTATE_PS_EXTRA_length));
} else if (stage == MESA_SHADER_TESS_EVAL &&
intel_needs_workaround(batch->screen->devinfo, 14015055625) &&
!program_needs_wa_14015055625) {
/* This program doesn't require Wa_14015055625, so we can enable
* a Tessellation Distribution Mode.
*/
#if GFX_VERx10 >= 125
} else if (stage == MESA_SHADER_TESS_EVAL) {
uint32_t te_state[GENX(3DSTATE_TE_length)] = { 0 };
iris_pack_command(GENX(3DSTATE_TE), te_state, te) {
if (intel_needs_workaround(batch->screen->devinfo, 22012699309))
if (intel_needs_workaround(screen->devinfo, 14015055625) &&
program_needs_wa_14015055625)
te.TessellationDistributionMode = TEDMODE_OFF;
else if (intel_needs_workaround(screen->devinfo, 22012699309))
te.TessellationDistributionMode = TEDMODE_RR_STRICT;
else
te.TessellationDistributionMode = TEDMODE_RR_FREE;

@@ -111,7 +111,7 @@ traces:
checksum: 58a6a276abc0e28fcb2a8acea3342712
gputest/pixmark-piano-v2.trace:
gl-vmware-llvmpipe:
checksum: edc09da55fea262e76686d99548f2cfd
checksum: b0077264046fe6dd2cdec059d9e53bf5
gputest/triangle-v2.trace:
gl-vmware-llvmpipe:
checksum: 7812de00011a3a059892e36cea19c696

@@ -95,7 +95,7 @@ enum {
CS_ARG_VERTEX_DATA,
CS_ARG_PER_THREAD_DATA,
CS_ARG_OUTER_COUNT,
CS_ARG_CORO_X_LOOPS = CS_ARG_OUTER_COUNT,
CS_ARG_CORO_SUBGROUP_COUNT = CS_ARG_OUTER_COUNT,
CS_ARG_CORO_PARTIALS,
CS_ARG_CORO_BLOCK_X_SIZE,
CS_ARG_CORO_BLOCK_Y_SIZE,
@@ -374,7 +374,7 @@ generate_compute(struct llvmpipe_context *lp,
else
arg_types[CS_ARG_VERTEX_DATA] = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0); /* mesh shaders only */
arg_types[CS_ARG_PER_THREAD_DATA] = variant->jit_cs_thread_data_ptr_type; /* per thread data */
arg_types[CS_ARG_CORO_X_LOOPS] = int32_type; /* coro only - num X loops */
arg_types[CS_ARG_CORO_SUBGROUP_COUNT] = int32_type; /* coro only - subgroup count */
arg_types[CS_ARG_CORO_PARTIALS] = int32_type; /* coro only - partials */
arg_types[CS_ARG_CORO_BLOCK_X_SIZE] = int32_type; /* coro block_x_size */
arg_types[CS_ARG_CORO_BLOCK_Y_SIZE] = int32_type; /* coro block_y_size */
@@ -560,23 +560,24 @@ generate_compute(struct llvmpipe_context *lp,
output_array = lp_build_array_alloca(gallivm, output_type, lp_build_const_int32(gallivm, align(MAX2(nir->info.mesh.max_primitives_out, nir->info.mesh.max_vertices_out), 8)), "outputs");
}
struct lp_build_loop_state loop_state[4];
LLVMValueRef num_x_loop;
LLVMValueRef vec_length = lp_build_const_int32(gallivm, cs_type.length);
num_x_loop = LLVMBuildAdd(gallivm->builder, block_x_size_arg, vec_length, "");
num_x_loop = LLVMBuildSub(gallivm->builder, num_x_loop, lp_build_const_int32(gallivm, 1), "");
num_x_loop = LLVMBuildUDiv(gallivm->builder, num_x_loop, vec_length, "");
LLVMValueRef partials = LLVMBuildURem(gallivm->builder, block_x_size_arg, vec_length, "");
struct lp_build_loop_state loop_state[2];
LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, "");
coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, block_z_size_arg, "");
LLVMValueRef vec_length = lp_build_const_int32(gallivm, cs_type.length);
LLVMValueRef invocation_count = LLVMBuildMul(gallivm->builder, block_x_size_arg, block_y_size_arg, "");
invocation_count = LLVMBuildMul(gallivm->builder, invocation_count, block_z_size_arg, "");
LLVMValueRef partials = LLVMBuildURem(gallivm->builder, invocation_count, vec_length, "");
LLVMValueRef num_subgroup_loop = LLVMBuildAdd(gallivm->builder, invocation_count, lp_build_const_int32(gallivm, cs_type.length - 1), "");
num_subgroup_loop = LLVMBuildUDiv(gallivm->builder, num_subgroup_loop, vec_length, "");
/* build a ptr in memory to store all the frames in later. */
LLVMTypeRef hdl_ptr_type = LLVMPointerType(LLVMInt8TypeInContext(gallivm->context), 0);
LLVMValueRef coro_mem = LLVMBuildAlloca(gallivm->builder, hdl_ptr_type, "coro_mem");
LLVMBuildStore(builder, LLVMConstNull(hdl_ptr_type), coro_mem);
LLVMValueRef coro_hdls = LLVMBuildArrayAlloca(gallivm->builder, hdl_ptr_type, coro_num_hdls, "coro_hdls");
LLVMValueRef coro_hdls = LLVMBuildArrayAlloca(gallivm->builder, hdl_ptr_type, num_subgroup_loop, "coro_hdls");
unsigned end_coroutine = INT_MAX;
@@ -585,22 +586,17 @@ generate_compute(struct llvmpipe_context *lp,
* and calls the coroutine main entrypoint on the first pass, but in subsequent
* passes it checks if the coroutine has completed and resumes it if not.
*/
/* take x_width - round up to type.length width */
lp_build_loop_begin(&loop_state[3], gallivm,
lp_build_const_int32(gallivm, 0)); /* coroutine reentry loop */
lp_build_loop_begin(&loop_state[2], gallivm,
lp_build_const_int32(gallivm, 0)); /* z loop */
lp_build_loop_begin(&loop_state[1], gallivm,
lp_build_const_int32(gallivm, 0)); /* y loop */
lp_build_const_int32(gallivm, 0)); /* coroutine reentry loop */
lp_build_loop_begin(&loop_state[0], gallivm,
lp_build_const_int32(gallivm, 0)); /* x loop */
lp_build_const_int32(gallivm, 0)); /* subgroup loop */
{
LLVMValueRef args[CS_ARG_MAX];
args[CS_ARG_CONTEXT] = context_ptr;
args[CS_ARG_RESOURCES] = resources_ptr;
args[CS_ARG_BLOCK_X_SIZE] = loop_state[0].counter;
args[CS_ARG_BLOCK_Y_SIZE] = loop_state[1].counter;
args[CS_ARG_BLOCK_Z_SIZE] = loop_state[2].counter;
args[CS_ARG_BLOCK_X_SIZE] = LLVMGetUndef(int32_type);
args[CS_ARG_BLOCK_Y_SIZE] = LLVMGetUndef(int32_type);
args[CS_ARG_BLOCK_Z_SIZE] = LLVMGetUndef(int32_type);
args[CS_ARG_GRID_X] = grid_x_arg;
args[CS_ARG_GRID_Y] = grid_y_arg;
args[CS_ARG_GRID_Z] = grid_z_arg;
@@ -611,34 +607,25 @@ generate_compute(struct llvmpipe_context *lp,
args[CS_ARG_DRAW_ID] = draw_id_arg;
args[CS_ARG_VERTEX_DATA] = io_ptr;
args[CS_ARG_PER_THREAD_DATA] = thread_data_ptr;
args[CS_ARG_CORO_X_LOOPS] = num_x_loop;
args[CS_ARG_CORO_SUBGROUP_COUNT] = num_subgroup_loop;
args[CS_ARG_CORO_PARTIALS] = partials;
args[CS_ARG_CORO_BLOCK_X_SIZE] = block_x_size_arg;
args[CS_ARG_CORO_BLOCK_Y_SIZE] = block_y_size_arg;
args[CS_ARG_CORO_BLOCK_Z_SIZE] = block_z_size_arg;
/* idx = (z * (size_x * size_y) + y * size_x + x */
LLVMValueRef coro_hdl_idx = LLVMBuildMul(gallivm->builder, loop_state[2].counter,
LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, ""), "");
coro_hdl_idx = LLVMBuildAdd(gallivm->builder, coro_hdl_idx,
LLVMBuildMul(gallivm->builder, loop_state[1].counter,
num_x_loop, ""), "");
coro_hdl_idx = LLVMBuildAdd(gallivm->builder, coro_hdl_idx,
loop_state[0].counter, "");
args[CS_ARG_CORO_IDX] = coro_hdl_idx;
args[CS_ARG_CORO_IDX] = loop_state[0].counter;
args[CS_ARG_CORO_MEM] = coro_mem;
if (is_mesh)
args[CS_ARG_CORO_OUTPUTS] = output_array;
LLVMValueRef coro_entry = LLVMBuildGEP2(gallivm->builder, hdl_ptr_type, coro_hdls, &coro_hdl_idx, 1, "");
LLVMValueRef coro_entry = LLVMBuildGEP2(gallivm->builder, hdl_ptr_type, coro_hdls, &loop_state[0].counter, 1, "");
LLVMValueRef coro_hdl = LLVMBuildLoad2(gallivm->builder, hdl_ptr_type, coro_entry, "coro_hdl");
struct lp_build_if_state ifstate;
LLVMValueRef cmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, loop_state[3].counter,
LLVMValueRef cmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, loop_state[1].counter,
lp_build_const_int32(gallivm, 0), "");
/* first time here - call the coroutine function entry point */
lp_build_if(&ifstate, gallivm, cmp);
@@ -651,24 +638,18 @@ generate_compute(struct llvmpipe_context *lp,
lp_build_if(&ifstate2, gallivm, coro_done);
/* if done destroy and force loop exit */
lp_build_coro_destroy(gallivm, coro_hdl);
lp_build_loop_force_set_counter(&loop_state[3], lp_build_const_int32(gallivm, end_coroutine - 1));
lp_build_loop_force_set_counter(&loop_state[1], lp_build_const_int32(gallivm, end_coroutine - 1));
lp_build_else(&ifstate2);
/* otherwise resume the coroutine */
lp_build_coro_resume(gallivm, coro_hdl);
lp_build_endif(&ifstate2);
lp_build_endif(&ifstate);
lp_build_loop_force_reload_counter(&loop_state[3]);
lp_build_loop_force_reload_counter(&loop_state[1]);
}
lp_build_loop_end_cond(&loop_state[0],
num_x_loop,
num_subgroup_loop,
NULL, LLVMIntUGE);
lp_build_loop_end_cond(&loop_state[1],
block_y_size_arg,
NULL, LLVMIntUGE);
lp_build_loop_end_cond(&loop_state[2],
block_z_size_arg,
NULL, LLVMIntUGE);
lp_build_loop_end_cond(&loop_state[3],
lp_build_const_int32(gallivm, end_coroutine),
NULL, LLVMIntEQ);
@@ -680,12 +661,8 @@ generate_compute(struct llvmpipe_context *lp,
LLVMBuildRetVoid(builder);
/* This is stage (b) - generate the compute shader code inside the coroutine. */
LLVMValueRef x_size_arg, y_size_arg, z_size_arg;
context_ptr = LLVMGetParam(coro, CS_ARG_CONTEXT);
resources_ptr = LLVMGetParam(coro, CS_ARG_RESOURCES);
x_size_arg = LLVMGetParam(coro, CS_ARG_BLOCK_X_SIZE);
y_size_arg = LLVMGetParam(coro, CS_ARG_BLOCK_Y_SIZE);
z_size_arg = LLVMGetParam(coro, CS_ARG_BLOCK_Z_SIZE);
grid_x_arg = LLVMGetParam(coro, CS_ARG_GRID_X);
grid_y_arg = LLVMGetParam(coro, CS_ARG_GRID_Y);
grid_z_arg = LLVMGetParam(coro, CS_ARG_GRID_Z);
@@ -696,12 +673,12 @@ generate_compute(struct llvmpipe_context *lp,
draw_id_arg = LLVMGetParam(coro, CS_ARG_DRAW_ID);
io_ptr = LLVMGetParam(coro, CS_ARG_VERTEX_DATA);
thread_data_ptr = LLVMGetParam(coro, CS_ARG_PER_THREAD_DATA);
num_x_loop = LLVMGetParam(coro, CS_ARG_CORO_X_LOOPS);
num_subgroup_loop = LLVMGetParam(coro, CS_ARG_CORO_SUBGROUP_COUNT);
partials = LLVMGetParam(coro, CS_ARG_CORO_PARTIALS);
block_x_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_X_SIZE);
block_y_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_Y_SIZE);
block_z_size_arg = LLVMGetParam(coro, CS_ARG_CORO_BLOCK_Z_SIZE);
LLVMValueRef coro_idx = LLVMGetParam(coro, CS_ARG_CORO_IDX);
LLVMValueRef subgroup_id = LLVMGetParam(coro, CS_ARG_CORO_IDX);
coro_mem = LLVMGetParam(coro, CS_ARG_CORO_MEM);
if (is_mesh)
output_array = LLVMGetParam(coro, CS_ARG_CORO_OUTPUTS);
@@ -730,27 +707,32 @@ generate_compute(struct llvmpipe_context *lp,
variant->jit_cs_thread_data_type,
thread_data_ptr);
LLVMValueRef coro_num_hdls = LLVMBuildMul(gallivm->builder, num_x_loop, block_y_size_arg, "");
coro_num_hdls = LLVMBuildMul(gallivm->builder, coro_num_hdls, block_z_size_arg, "");
/* these are coroutine entrypoint necessities */
LLVMValueRef coro_id = lp_build_coro_id(gallivm);
LLVMValueRef coro_entry = lp_build_coro_alloc_mem_array(gallivm, coro_mem, coro_idx, coro_num_hdls);
LLVMValueRef coro_entry = lp_build_coro_alloc_mem_array(gallivm, coro_mem, subgroup_id, num_subgroup_loop);
LLVMTypeRef mem_ptr_type = LLVMInt8TypeInContext(gallivm->context);
LLVMValueRef alloced_ptr = LLVMBuildLoad2(gallivm->builder, hdl_ptr_type, coro_mem, "");
alloced_ptr = LLVMBuildGEP2(gallivm->builder, mem_ptr_type, alloced_ptr, &coro_entry, 1, "");
LLVMValueRef coro_hdl = lp_build_coro_begin(gallivm, coro_id, alloced_ptr);
LLVMValueRef has_partials = LLVMBuildICmp(gallivm->builder, LLVMIntNE, partials, lp_build_const_int32(gallivm, 0), "");
LLVMValueRef tids_x[LP_MAX_VECTOR_LENGTH], tids_y[LP_MAX_VECTOR_LENGTH], tids_z[LP_MAX_VECTOR_LENGTH];
LLVMValueRef base_val = LLVMBuildMul(gallivm->builder, x_size_arg, vec_length, "");
for (i = 0; i < cs_type.length; i++) {
tids_x[i] = LLVMBuildAdd(gallivm->builder, base_val, lp_build_const_int32(gallivm, i), "");
tids_y[i] = y_size_arg;
tids_z[i] = z_size_arg;
}
system_values.thread_id[0] = lp_build_gather_values(gallivm, tids_x, cs_type.length);
system_values.thread_id[1] = lp_build_gather_values(gallivm, tids_y, cs_type.length);
system_values.thread_id[2] = lp_build_gather_values(gallivm, tids_z, cs_type.length);
struct lp_build_context bld;
lp_build_context_init(&bld, gallivm, lp_uint_type(cs_type));
LLVMValueRef base_val = LLVMBuildMul(gallivm->builder, subgroup_id, vec_length, "");
LLVMValueRef invocation_indices[LP_MAX_VECTOR_LENGTH];
for (i = 0; i < cs_type.length; i++)
invocation_indices[i] = LLVMBuildAdd(gallivm->builder, base_val, lp_build_const_int32(gallivm, i), "");
LLVMValueRef invocation_index = lp_build_gather_values(gallivm, invocation_indices, cs_type.length);
LLVMValueRef block_x_size_vec = lp_build_broadcast_scalar(&bld, block_x_size_arg);
LLVMValueRef block_y_size_vec = lp_build_broadcast_scalar(&bld, block_y_size_arg);
system_values.thread_id[0] = LLVMBuildURem(gallivm->builder, invocation_index, block_x_size_vec, "");
system_values.thread_id[1] = LLVMBuildUDiv(gallivm->builder, invocation_index, block_x_size_vec, "");
system_values.thread_id[1] = LLVMBuildURem(gallivm->builder, system_values.thread_id[1], block_y_size_vec, "");
system_values.thread_id[2] = LLVMBuildUDiv(gallivm->builder, invocation_index, block_x_size_vec, "");
system_values.thread_id[2] = LLVMBuildUDiv(gallivm->builder, system_values.thread_id[2], block_y_size_vec, "");
system_values.block_id[0] = grid_x_arg;
system_values.block_id[1] = grid_y_arg;
@@ -763,38 +745,15 @@ generate_compute(struct llvmpipe_context *lp,
system_values.work_dim = work_dim_arg;
system_values.draw_id = draw_id_arg;
/* subgroup_id = ((z * block_size_x * block_size_y) + (y * block_size_x) + x) / subgroup_size
*
* this breaks if z or y is zero, so distribute the division to preserve ids
*
* subgroup_id = ((z * block_size_x * block_size_y) / subgroup_size) + ((y * block_size_x) / subgroup_size) + (x / subgroup_size)
*
* except "x" is pre-divided here
*
* subgroup_id = ((z * block_size_x * block_size_y) / subgroup_size) + ((y * block_size_x) / subgroup_size) + x
*/
LLVMValueRef subgroup_id = LLVMBuildUDiv(builder,
LLVMBuildMul(gallivm->builder, z_size_arg, LLVMBuildMul(gallivm->builder, block_x_size_arg, block_y_size_arg, ""), ""),
vec_length, "");
subgroup_id = LLVMBuildAdd(gallivm->builder,
subgroup_id,
LLVMBuildUDiv(builder, LLVMBuildMul(gallivm->builder, y_size_arg, block_x_size_arg, ""), vec_length, ""),
"");
subgroup_id = LLVMBuildAdd(gallivm->builder, subgroup_id, x_size_arg, "");
system_values.subgroup_id = subgroup_id;
LLVMValueRef num_subgroups = LLVMBuildUDiv(builder,
LLVMBuildMul(builder, block_x_size_arg,
LLVMBuildMul(builder, block_y_size_arg, block_z_size_arg, ""), ""),
vec_length, "");
LLVMValueRef subgroup_cmp = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, num_subgroups, lp_build_const_int32(gallivm, 0), "");
system_values.num_subgroups = LLVMBuildSelect(builder, subgroup_cmp, lp_build_const_int32(gallivm, 1), num_subgroups, "");
system_values.num_subgroups = num_subgroup_loop;
system_values.block_size[0] = block_x_size_arg;
system_values.block_size[1] = block_y_size_arg;
system_values.block_size[2] = block_z_size_arg;
LLVMValueRef last_x_loop = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, x_size_arg, LLVMBuildSub(gallivm->builder, num_x_loop, lp_build_const_int32(gallivm, 1), ""), "");
LLVMValueRef use_partial_mask = LLVMBuildAnd(gallivm->builder, last_x_loop, has_partials, "");
LLVMValueRef last_loop = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, subgroup_id, LLVMBuildSub(gallivm->builder, num_subgroup_loop, lp_build_const_int32(gallivm, 1), ""), "");
LLVMValueRef use_partial_mask = LLVMBuildAnd(gallivm->builder, last_loop, has_partials, "");
struct lp_build_if_state if_state;
LLVMTypeRef mask_type = LLVMVectorType(int32_type, cs_type.length);
LLVMValueRef mask_val = lp_build_alloca(gallivm, mask_type, "mask");
@@ -866,7 +825,7 @@ generate_compute(struct llvmpipe_context *lp,
lp_int_type(cs_type), 0);
struct lp_build_if_state iter0state;
LLVMValueRef is_iter0 = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, coro_idx,
LLVMValueRef is_iter0 = LLVMBuildICmp(gallivm->builder, LLVMIntEQ, subgroup_id,
lp_build_const_int32(gallivm, 0), "");
LLVMValueRef vertex_count = LLVMBuildLoad2(gallivm->builder, i32t, mesh_iface.vertex_count, "");
LLVMValueRef prim_count = LLVMBuildLoad2(gallivm->builder, i32t, mesh_iface.prim_count, "");

@@ -80,6 +80,7 @@ r300_optimize_nir(struct nir_shader *s, struct pipe_screen *screen)
NIR_PASS_V(s, nir_lower_vars_to_ssa);
NIR_PASS(progress, s, nir_copy_prop);
NIR_PASS(progress, s, r300_nir_lower_flrp);
NIR_PASS(progress, s, nir_opt_algebraic);
if (s->info.stage == MESA_SHADER_VERTEX) {
if (!is_r500)

@@ -689,6 +689,7 @@ static void allocate_temporary_registers(struct radeon_compiler *c, void *user)
if (!ra_allocate(graph)) {
rc_error(c, "Ran out of hardware temporaries\n");
ralloc_free(graph);
return;
}

@@ -357,6 +357,7 @@ static void do_advanced_regalloc(struct regalloc_state * s)
if (!ra_allocate(graph)) {
rc_error(s->C, "Ran out of hardware temporaries\n");
ralloc_free(graph);
return;
}

@@ -86,10 +86,15 @@ static void r300_destroy_context(struct pipe_context* context)
if (r300->draw)
draw_destroy(r300->draw);
for (unsigned i = 0; i < r300->nr_vertex_buffers; i++)
pipe_vertex_buffer_unreference(&r300->vertex_buffer[i]);
if (r300->uploader)
u_upload_destroy(r300->uploader);
if (r300->context.stream_uploader)
u_upload_destroy(r300->context.stream_uploader);
if (r300->context.const_uploader)
u_upload_destroy(r300->context.const_uploader);
/* XXX: This function assumes r300->query_list was initialized */
r300_release_referenced_objects(r300);
@@ -99,6 +104,7 @@ static void r300_destroy_context(struct pipe_context* context)
r300->rws->ctx_destroy(r300->ctx);
rc_destroy_regalloc_state(&r300->fs_regalloc_state);
rc_destroy_regalloc_state(&r300->vs_regalloc_state);
/* XXX: No way to tell if this was initialized or not? */
slab_destroy_child(&r300->pool_transfers);
@@ -125,6 +131,9 @@ static void r300_destroy_context(struct pipe_context* context)
FREE(r300->vertex_stream_state.state);
}
}
FREE(r300->stencilref_fallback);
FREE(r300);
}

@@ -525,6 +525,7 @@ static void r300_translate_fragment_shader(
abort();
}
free(compiler.code->constants.Constants);
rc_destroy(&compiler.Base);
r300_dummy_fragment_shader(r300, shader);
return;

@@ -307,15 +307,15 @@ static rvcn_dec_message_hevc_t get_h265_msg(struct radeon_decoder *dec,
result.sps_info_flags |= pic->pps->sps->separate_colour_plane_flag << 8;
if (((struct si_screen *)dec->screen)->info.family == CHIP_CARRIZO)
result.sps_info_flags |= 1 << 9;
if (pic->UseRefPicList == true)
if (pic->UseRefPicList == true) {
result.sps_info_flags |= 1 << 10;
result.sps_info_flags |= 1 << 12;
}
if (pic->UseStRpsBits == true && pic->pps->st_rps_bits != 0) {
result.sps_info_flags |= 1 << 11;
result.st_rps_bits = pic->pps->st_rps_bits;
}
result.sps_info_flags |= 1 << 12;
result.chroma_format = pic->pps->sps->chroma_format_idc;
result.bit_depth_luma_minus8 = pic->pps->sps->bit_depth_luma_minus8;
result.bit_depth_chroma_minus8 = pic->pps->sps->bit_depth_chroma_minus8;

@@ -1390,7 +1390,7 @@ v3d_launch_grid(struct pipe_context *pctx, const struct pipe_grid_info *info)
v3d->compute_shared_memory =
v3d_bo_alloc(v3d->screen,
v3d->prog.compute->prog_data.compute->shared_size *
wgs_per_sg,
num_wgs,
"shared_vars");
}

@@ -12,7 +12,7 @@ traces:
checksum: 57ddd36b117adc9216c65c10d914a37e
gputest/pixmark-piano-v2.trace:
gl-virgl:
checksum: cbe50265c2d1a114fd75bf12407fbad9
checksum: 3b760606c18aebda1ad0eff6eb03203a
gputest/triangle-v2.trace:
gl-virgl:
checksum: 7812de00011a3a059892e36cea19c696

@@ -4774,7 +4774,7 @@ nir_to_spirv(struct nir_shader *s, const struct zink_shader_info *sinfo, uint32_
/* this could be huge, so only alloc if needed since it's extremely unlikely to
* ever be used by anything except cts
*/
ctx.resident_defs = ralloc_array_size(ctx.mem_ctx,
ctx.resident_defs = rzalloc_array_size(ctx.mem_ctx,
sizeof(SpvId), entry->ssa_alloc);
if (!ctx.resident_defs)
goto fail;

@@ -458,10 +458,13 @@ get_batch_state(struct zink_context *ctx, struct zink_batch *batch)
}
simple_mtx_unlock(&screen->free_batch_states_lock);
}
if (!bs && ctx->batch_states) {
/* states are stored sequentially, so if the first one doesn't work, none of them will */
if (zink_screen_check_last_finished(screen, ctx->batch_states->fence.batch_id) ||
find_unused_state(ctx->batch_states)) {
/* states are stored sequentially, so if the first one doesn't work, none of them will */
if (!bs && ctx->batch_states && ctx->batch_states->next) {
/* only a submitted state can be reused */
if (p_atomic_read(&ctx->batch_states->fence.submitted) &&
/* a submitted state must have completed before it can be reused */
(zink_screen_check_last_finished(screen, ctx->batch_states->fence.batch_id) ||
p_atomic_read(&ctx->batch_states->fence.completed))) {
bs = ctx->batch_states;
pop_batch_state(ctx);
}

@@ -548,7 +548,7 @@ bo_sparse_create(struct zink_screen *screen, uint64_t size)
bo->base.base.alignment_log2 = util_logbase2(ZINK_SPARSE_BUFFER_PAGE_SIZE);
bo->base.base.size = size;
bo->base.vtbl = &bo_sparse_vtbl;
unsigned placement = zink_mem_type_idx_from_bits(screen, ZINK_HEAP_DEVICE_LOCAL_SPARSE, VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT);
unsigned placement = zink_mem_type_idx_from_types(screen, ZINK_HEAP_DEVICE_LOCAL_SPARSE, UINT32_MAX);
assert(placement != UINT32_MAX);
bo->base.base.placement = placement;
bo->unique_id = p_atomic_inc_return(&screen->pb.next_bo_unique_id);
@@ -622,6 +622,8 @@ zink_bo_create(struct zink_screen *screen, uint64_t size, unsigned alignment, en
low_bound *= 2; //nvidia has fat textures or something
unsigned vk_heap_idx = screen->info.mem_props.memoryTypes[mem_type_idx].heapIndex;
reclaim_all = screen->info.mem_props.memoryHeaps[vk_heap_idx].size <= low_bound;
if (reclaim_all)
reclaim_all = clean_up_buffer_managers(screen);
}
entry = pb_slab_alloc_reclaimed(slabs, alloc_size, mem_type_idx, reclaim_all);
if (!entry) {

View file

@ -94,10 +94,10 @@ zink_heap_from_domain_flags(VkMemoryPropertyFlags domains, enum zink_alloc_flag
}
static ALWAYS_INLINE unsigned
zink_mem_type_idx_from_bits(struct zink_screen *screen, enum zink_heap heap, uint32_t bits)
zink_mem_type_idx_from_types(struct zink_screen *screen, enum zink_heap heap, uint32_t types)
{
for (unsigned i = 0; i < screen->heap_count[heap]; i++) {
if (bits & BITFIELD_BIT(screen->heap_map[heap][i])) {
if (types & BITFIELD_BIT(screen->heap_map[heap][i])) {
return screen->heap_map[heap][i];
}
}
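As the rename suggests, the mask argument is a set of memory-type indices (the shape of VkMemoryRequirements::memoryTypeBits), not heap bits. A hedged usage sketch based on the callers in this commit:
    /* first type in the heap that the allocation's requirements allow */
    unsigned idx = zink_mem_type_idx_from_types(screen, heap, reqs.memoryTypeBits);
    if (idx == UINT32_MAX) {
       /* no compatible type in this heap: demote to a more compatible heap,
        * as resource_object_create() does */
    }
    /* UINT32_MAX as the mask accepts anything, i.e. "first type in this heap" */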

View file

@ -98,8 +98,8 @@ clear_in_rp(struct pipe_context *pctx,
return;
cr.rect.offset.x = scissor_state->minx;
cr.rect.offset.y = scissor_state->miny;
cr.rect.extent.width = MIN2(fb->width, scissor_state->maxx - scissor_state->minx);
cr.rect.extent.height = MIN2(fb->height, scissor_state->maxy - scissor_state->miny);
cr.rect.extent.width = MIN2(fb->width - cr.rect.offset.x, scissor_state->maxx - scissor_state->minx);
cr.rect.extent.height = MIN2(fb->height - cr.rect.offset.y, scissor_state->maxy - scissor_state->miny);
} else {
cr.rect.extent.width = fb->width;
cr.rect.extent.height = fb->height;
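A worked example of why the clamp must subtract the offset (numbers illustrative): a 100x100 framebuffer with scissor minx/miny = 60 and maxx/maxy = 120:
    /* old: extent.width = MIN2(100, 120 - 60) = 60 -> 60 + 60 = 120, past the fb edge
     * new: extent.width = MIN2(100 - 60, 120 - 60) = 40 -> 60 + 40 = 100, in bounds */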
@ -644,6 +644,8 @@ zink_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *dst,
bool render_condition_enabled)
{
struct zink_context *ctx = zink_context(pctx);
/* check for stencil fallback */
bool blitting = ctx->blitting;
zink_flush_dgc_if_enabled(ctx);
bool render_condition_active = ctx->render_condition_active;
if (!render_condition_enabled && render_condition_active) {
@ -656,14 +658,16 @@ zink_clear_depth_stencil(struct pipe_context *pctx, struct pipe_surface *dst,
dsty + height > ctx->fb_state.height)
cur_attachment = false;
if (!cur_attachment) {
util_blitter_save_framebuffer(ctx->blitter, &ctx->fb_state);
set_clear_fb(pctx, NULL, dst);
zink_blit_barriers(ctx, NULL, zink_resource(dst->texture), false);
ctx->blitting = true;
if (!blitting) {
util_blitter_save_framebuffer(ctx->blitter, &ctx->fb_state);
set_clear_fb(pctx, NULL, dst);
zink_blit_barriers(ctx, NULL, zink_resource(dst->texture), false);
ctx->blitting = true;
}
}
struct pipe_scissor_state scissor = {dstx, dsty, dstx + width, dsty + height};
pctx->clear(pctx, clear_flags, &scissor, NULL, depth, stencil);
if (!cur_attachment) {
if (!cur_attachment && !blitting) {
util_blitter_restore_fb_state(ctx->blitter);
ctx->blitting = false;
}

View file

@ -3543,6 +3543,88 @@ invert_point_coord(nir_shader *nir)
nir_metadata_dominance, NULL);
}
static bool
is_residency_code(nir_def *src)
{
nir_instr *parent = src->parent_instr;
while (1) {
if (parent->type == nir_instr_type_intrinsic) {
ASSERTED nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
assert(intr->intrinsic == nir_intrinsic_is_sparse_texels_resident);
return false;
}
if (parent->type == nir_instr_type_tex)
return true;
assert(parent->type == nir_instr_type_alu);
nir_alu_instr *alu = nir_instr_as_alu(parent);
parent = alu->src[0].src.ssa->parent_instr;
}
}
static bool
lower_sparse_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data)
{
if (instr->intrinsic == nir_intrinsic_sparse_residency_code_and) {
b->cursor = nir_before_instr(&instr->instr);
nir_def *src0;
if (is_residency_code(instr->src[0].ssa))
src0 = nir_is_sparse_texels_resident(b, 1, instr->src[0].ssa);
else
src0 = instr->src[0].ssa;
nir_def *src1;
if (is_residency_code(instr->src[1].ssa))
src1 = nir_is_sparse_texels_resident(b, 1, instr->src[1].ssa);
else
src1 = instr->src[1].ssa;
nir_def *def = nir_iand(b, src0, src1);
nir_def_rewrite_uses_after(&instr->def, def, &instr->instr);
nir_instr_remove(&instr->instr);
return true;
}
if (instr->intrinsic != nir_intrinsic_is_sparse_texels_resident)
return false;
/* vulkan vec can only be a vec4, but this is (maybe) vec5,
* so just rewrite as the first component since ntv is going to use a different
* method for storing the residency value anyway
*/
b->cursor = nir_before_instr(&instr->instr);
nir_instr *parent = instr->src[0].ssa->parent_instr;
if (is_residency_code(instr->src[0].ssa)) {
assert(parent->type == nir_instr_type_alu);
nir_alu_instr *alu = nir_instr_as_alu(parent);
nir_def_rewrite_uses_after(instr->src[0].ssa, nir_channel(b, alu->src[0].src.ssa, 0), parent);
nir_instr_remove(parent);
} else {
nir_def *src;
if (parent->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
assert(intr->intrinsic == nir_intrinsic_is_sparse_texels_resident);
src = intr->src[0].ssa;
} else {
assert(parent->type == nir_instr_type_alu);
nir_alu_instr *alu = nir_instr_as_alu(parent);
src = alu->src[0].src.ssa;
}
if (instr->def.bit_size != 32) {
if (instr->def.bit_size == 1)
src = nir_ieq_imm(b, src, 1);
else
src = nir_u2uN(b, src, instr->def.bit_size);
}
nir_def_rewrite_uses(&instr->def, src);
nir_instr_remove(&instr->instr);
}
return true;
}
static bool
lower_sparse(nir_shader *shader)
{
return nir_shader_intrinsics_pass(shader, lower_sparse_instr,
nir_metadata_dominance, NULL);
}
static bool
add_derefs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
{
@ -3679,7 +3761,7 @@ add_derefs_instr(nir_builder *b, nir_intrinsic_instr *intr, void *data)
}
/* filter needed components */
if (intr->num_components < load->num_components)
load = nir_channels(b, load, BITFIELD_MASK(intr->num_components) << c);
load = nir_channels(b, load, BITFIELD_MASK(intr->num_components) << (c - var->data.location_frac));
nir_def_rewrite_uses(&intr->def, load);
} else {
nir_def *store = intr->src[0].ssa;
@ -3936,6 +4018,7 @@ zink_shader_compile(struct zink_screen *screen, bool can_shobj, struct zink_shad
zs->can_inline = false;
} else if (need_optimize)
optimize_nir(nir, zs, true);
NIR_PASS_V(nir, lower_sparse);
struct zink_shader_object obj = compile_module(screen, zs, nir, can_shobj, pg);
ralloc_free(nir);
@ -4570,88 +4653,6 @@ scan_nir(struct zink_screen *screen, nir_shader *shader, struct zink_shader *zs)
}
}
static bool
is_residency_code(nir_def *src)
{
nir_instr *parent = src->parent_instr;
while (1) {
if (parent->type == nir_instr_type_intrinsic) {
ASSERTED nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
assert(intr->intrinsic == nir_intrinsic_is_sparse_texels_resident);
return false;
}
if (parent->type == nir_instr_type_tex)
return true;
assert(parent->type == nir_instr_type_alu);
nir_alu_instr *alu = nir_instr_as_alu(parent);
parent = alu->src[0].src.ssa->parent_instr;
}
}
static bool
lower_sparse_instr(nir_builder *b, nir_intrinsic_instr *instr, void *data)
{
if (instr->intrinsic == nir_intrinsic_sparse_residency_code_and) {
b->cursor = nir_before_instr(&instr->instr);
nir_def *src0;
if (is_residency_code(instr->src[0].ssa))
src0 = nir_is_sparse_texels_resident(b, 1, instr->src[0].ssa);
else
src0 = instr->src[0].ssa;
nir_def *src1;
if (is_residency_code(instr->src[1].ssa))
src1 = nir_is_sparse_texels_resident(b, 1, instr->src[1].ssa);
else
src1 = instr->src[1].ssa;
nir_def *def = nir_iand(b, src0, src1);
nir_def_rewrite_uses_after(&instr->def, def, &instr->instr);
nir_instr_remove(&instr->instr);
return true;
}
if (instr->intrinsic != nir_intrinsic_is_sparse_texels_resident)
return false;
/* vulkan vec can only be a vec4, but this is (maybe) vec5,
* so just rewrite as the first component since ntv is going to use a different
* method for storing the residency value anyway
*/
b->cursor = nir_before_instr(&instr->instr);
nir_instr *parent = instr->src[0].ssa->parent_instr;
if (is_residency_code(instr->src[0].ssa)) {
assert(parent->type == nir_instr_type_alu);
nir_alu_instr *alu = nir_instr_as_alu(parent);
nir_def_rewrite_uses_after(instr->src[0].ssa, nir_channel(b, alu->src[0].src.ssa, 0), parent);
nir_instr_remove(parent);
} else {
nir_def *src;
if (parent->type == nir_instr_type_intrinsic) {
nir_intrinsic_instr *intr = nir_instr_as_intrinsic(parent);
assert(intr->intrinsic == nir_intrinsic_is_sparse_texels_resident);
src = intr->src[0].ssa;
} else {
assert(parent->type == nir_instr_type_alu);
nir_alu_instr *alu = nir_instr_as_alu(parent);
src = alu->src[0].src.ssa;
}
if (instr->def.bit_size != 32) {
if (instr->def.bit_size == 1)
src = nir_ieq_imm(b, src, 1);
else
src = nir_u2uN(b, src, instr->def.bit_size);
}
nir_def_rewrite_uses(&instr->def, src);
nir_instr_remove(&instr->instr);
}
return true;
}
static bool
lower_sparse(nir_shader *shader)
{
return nir_shader_intrinsics_pass(shader, lower_sparse_instr,
nir_metadata_dominance, NULL);
}
static bool
match_tex_dests_instr(nir_builder *b, nir_instr *in, void *data)
{
@ -5301,11 +5302,20 @@ mem_access_size_align_cb(nir_intrinsic_op intrin, uint8_t bytes,
assert(util_is_power_of_two_nonzero(align));
return (nir_mem_access_size_align){
.num_components = MIN2(bytes / (bit_size / 8), 4),
.bit_size = bit_size,
.align = bit_size / 8,
};
/* simply drop the bit_size for unaligned load/stores */
if (align < (bit_size / 8)) {
return (nir_mem_access_size_align){
.num_components = MIN2(bytes / align, 4),
.bit_size = align * 8,
.align = align,
};
} else {
return (nir_mem_access_size_align){
.num_components = MIN2(bytes / (bit_size / 8), 4),
.bit_size = bit_size,
.align = bit_size / 8,
};
}
}
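A worked example of the new unaligned path (values illustrative): an 8-byte load with bit_size = 32 but only 2-byte alignment:
    /* align (2) < bit_size / 8 (4), so the access is re-expressed as:
     *   num_components = MIN2(8 / 2, 4) = 4
     *   bit_size       = 2 * 8          = 16
     * a vec2 of 32-bit becomes a vec4 of 16-bit covering the same bytes */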
static nir_mem_access_size_align
@ -5468,7 +5478,6 @@ zink_shader_create(struct zink_screen *screen, struct nir_shader *nir)
NIR_PASS_V(nir, lower_basevertex);
NIR_PASS_V(nir, lower_baseinstance);
NIR_PASS_V(nir, lower_sparse);
NIR_PASS_V(nir, split_bitfields);
NIR_PASS_V(nir, nir_lower_frexp); /* TODO: Use the spirv instructions for this. */
@ -5744,20 +5753,6 @@ zink_gfx_shader_free(struct zink_screen *screen, struct zink_shader *shader)
}
}
while (util_dynarray_contains(&shader->pipeline_libs, struct zink_gfx_lib_cache*)) {
struct zink_gfx_lib_cache *libs = util_dynarray_pop(&shader->pipeline_libs, struct zink_gfx_lib_cache*);
//this condition is equivalent to verifying that, for each bit stages_present_i in stages_present,
//stages_present_i implies libs->stages_present_i
if ((stages_present & ~(libs->stages_present & stages_present)) != 0)
continue;
if (!libs->removed) {
libs->removed = true;
simple_mtx_lock(&screen->pipeline_libs_lock[idx]);
_mesa_set_remove_key(&screen->pipeline_libs[idx], libs);
simple_mtx_unlock(&screen->pipeline_libs_lock[idx]);
}
zink_gfx_lib_cache_unref(screen, libs);
}
if (stage == MESA_SHADER_FRAGMENT || !shader->non_fs.is_generated) {
prog->shaders[stage] = NULL;
prog->stages_remaining &= ~BITFIELD_BIT(stage);
@ -5773,6 +5768,17 @@ zink_gfx_shader_free(struct zink_screen *screen, struct zink_shader *shader)
}
zink_gfx_program_reference(screen, &prog, NULL);
}
while (util_dynarray_contains(&shader->pipeline_libs, struct zink_gfx_lib_cache*)) {
struct zink_gfx_lib_cache *libs = util_dynarray_pop(&shader->pipeline_libs, struct zink_gfx_lib_cache*);
if (!libs->removed) {
libs->removed = true;
unsigned idx = zink_program_cache_stages(libs->stages_present);
simple_mtx_lock(&screen->pipeline_libs_lock[idx]);
_mesa_set_remove_key(&screen->pipeline_libs[idx], libs);
simple_mtx_unlock(&screen->pipeline_libs_lock[idx]);
}
zink_gfx_lib_cache_unref(screen, libs);
}
if (shader->info.stage == MESA_SHADER_TESS_EVAL &&
shader->non_fs.generated_tcs) {
/* automatically destroy generated tcs shaders when tes is destroyed */

View file

@ -187,21 +187,31 @@ zink_context_destroy(struct pipe_context *pctx)
screen->free_batch_states = ctx->batch_states;
screen->last_free_batch_state = screen->free_batch_states;
}
while (screen->last_free_batch_state->next)
screen->last_free_batch_state = screen->last_free_batch_state->next;
}
while (screen->last_free_batch_state && screen->last_free_batch_state->next)
screen->last_free_batch_state = screen->last_free_batch_state->next;
if (ctx->free_batch_states) {
if (screen->free_batch_states)
screen->last_free_batch_state->next = ctx->free_batch_states;
else
else {
screen->free_batch_states = ctx->free_batch_states;
screen->last_free_batch_state = ctx->last_free_batch_state;
screen->last_free_batch_state = ctx->last_free_batch_state;
}
}
simple_mtx_unlock(&screen->free_batch_states_lock);
while (screen->last_free_batch_state && screen->last_free_batch_state->next)
screen->last_free_batch_state = screen->last_free_batch_state->next;
if (ctx->batch.state) {
zink_clear_batch_state(ctx, ctx->batch.state);
zink_batch_state_destroy(screen, ctx->batch.state);
if (screen->free_batch_states)
screen->last_free_batch_state->next = ctx->batch.state;
else {
screen->free_batch_states = ctx->batch.state;
screen->last_free_batch_state = screen->free_batch_states;
}
}
while (screen->last_free_batch_state && screen->last_free_batch_state->next)
screen->last_free_batch_state = screen->last_free_batch_state->next;
simple_mtx_unlock(&screen->free_batch_states_lock);
for (unsigned i = 0; i < 2; i++) {
util_idalloc_fini(&ctx->di.bindless[i].tex_slots);
@ -2837,6 +2847,29 @@ begin_rendering(struct zink_context *ctx)
ctx->dynamic_fb.attachments[PIPE_MAX_COLOR_BUFS+1].loadOp = VK_ATTACHMENT_LOAD_OP_CLEAR;
}
}
}
if (changed_size || changed_layout)
ctx->rp_changed = true;
ctx->rp_loadop_changed = false;
ctx->rp_layout_changed = false;
}
/* always assemble clear_buffers mask:
* if a scissored clear must be triggered during glFlush,
* the renderpass metadata may be unchanged (e.g., LOAD from previous rp),
* but the buffer mask must still be returned
*/
if (ctx->clears_enabled) {
for (int i = 0; i < ctx->fb_state.nr_cbufs; i++) {
/* these are no-ops */
if (!ctx->fb_state.cbufs[i] || !zink_fb_clear_enabled(ctx, i))
continue;
/* these need actual clear calls inside the rp */
if (zink_fb_clear_needs_explicit(&ctx->fb_clears[i]))
clear_buffers |= (PIPE_CLEAR_COLOR0 << i);
}
if (ctx->fb_state.zsbuf && zink_fb_clear_enabled(ctx, PIPE_MAX_COLOR_BUFS)) {
struct zink_framebuffer_clear *fb_clear = &ctx->fb_clears[PIPE_MAX_COLOR_BUFS];
struct zink_framebuffer_clear_data *clear = zink_fb_clear_element(fb_clear, 0);
if (zink_fb_clear_needs_explicit(fb_clear)) {
for (int j = !zink_fb_clear_element_needs_explicit(clear);
(clear_buffers & PIPE_CLEAR_DEPTHSTENCIL) != PIPE_CLEAR_DEPTHSTENCIL && j < zink_fb_clear_count(fb_clear);
@ -2844,10 +2877,6 @@ begin_rendering(struct zink_context *ctx)
clear_buffers |= zink_fb_clear_element(fb_clear, j)->zs.bits;
}
}
if (changed_size || changed_layout)
ctx->rp_changed = true;
ctx->rp_loadop_changed = false;
ctx->rp_layout_changed = false;
}
if (!ctx->rp_changed && ctx->batch.in_rp)
@ -3803,7 +3832,6 @@ zink_flush(struct pipe_context *pctx,
struct zink_batch *batch = &ctx->batch;
struct zink_fence *fence = NULL;
struct zink_screen *screen = zink_screen(ctx->base.screen);
unsigned submit_count = 0;
VkSemaphore export_sem = VK_NULL_HANDLE;
/* triggering clears will force has_work */
@ -3864,8 +3892,7 @@ zink_flush(struct pipe_context *pctx,
}
}
/* TODO: if swapchains gain timeline semaphore semantics, `flags` can be eliminated and no-op fence can return timeline id */
if (!batch->has_work && flags) {
if (!batch->has_work) {
if (pfence) {
/* reuse last fence */
fence = ctx->last_fence;
@ -3882,7 +3909,6 @@ zink_flush(struct pipe_context *pctx,
tc_driver_internal_flush_notify(ctx->tc);
} else {
fence = &batch->state->fence;
submit_count = batch->state->usage.submit_count;
if (deferred && !(flags & PIPE_FLUSH_FENCE_FD) && pfence)
deferred_fence = true;
else
@ -3906,7 +3932,7 @@ zink_flush(struct pipe_context *pctx,
mfence->fence = fence;
mfence->sem = export_sem;
if (fence) {
mfence->submit_count = submit_count;
mfence->submit_count = zink_batch_state(fence)->usage.submit_count;
util_dynarray_append(&fence->mfences, struct zink_tc_fence *, mfence);
}
if (export_sem) {

View file

@ -185,7 +185,12 @@ zink_fence_finish(struct zink_screen *screen, struct pipe_context *pctx, struct
if (submit_diff > 1)
return true;
if (fence->submitted && zink_screen_check_last_finished(screen, fence->batch_id))
/* - if fence is submitted, batch_id is nonzero and can be checked
* - if fence is not submitted here, it must be reset; batch_id will be 0 and submitted is false
* in either case, the fence has finished
*/
if ((fence->submitted && zink_screen_check_last_finished(screen, fence->batch_id)) ||
(!fence->submitted && submit_diff))
return true;
return fence_wait(screen, fence, timeout_ns);

View file

@ -561,6 +561,8 @@ kopper_acquire(struct zink_screen *screen, struct zink_resource *res, uint64_t t
if (cdt->swapchain->images[res->obj->dt_idx].readback)
zink_resource(cdt->swapchain->images[res->obj->dt_idx].readback)->valid = false;
res->obj->image = cdt->swapchain->images[res->obj->dt_idx].image;
if (!cdt->age_locked)
zink_kopper_update_last_written(res);
cdt->swapchain->images[res->obj->dt_idx].acquired = false;
if (!cdt->swapchain->images[res->obj->dt_idx].init) {
/* swapchain images are initially in the UNDEFINED layout */
@ -792,7 +794,7 @@ zink_kopper_present_queue(struct zink_screen *screen, struct zink_resource *res)
cpi->res = res;
cpi->swapchain = cdt->swapchain;
cpi->indefinite_acquire = res->obj->indefinite_acquire;
res->obj->last_dt_idx = cpi->image = res->obj->dt_idx;
cpi->image = res->obj->dt_idx;
cpi->info.sType = VK_STRUCTURE_TYPE_PRESENT_INFO_KHR;
cpi->info.pNext = NULL;
cpi->info.waitSemaphoreCount = 1;
@ -812,11 +814,13 @@ zink_kopper_present_queue(struct zink_screen *screen, struct zink_resource *res)
* * Any other color buffers' ages are incremented by 1 if
* their age was previously greater than 0.
*/
for (int i = 0; i < cdt->swapchain->num_images; i++) {
if (i == res->obj->dt_idx)
cdt->swapchain->images[i].age = 1;
else if (cdt->swapchain->images[i].age > 0)
cdt->swapchain->images[i].age += 1;
if (!cdt->age_locked) {
for (int i = 0; i < cdt->swapchain->num_images; i++) {
if (i == res->obj->dt_idx)
cdt->swapchain->images[i].age = 1;
else if (cdt->swapchain->images[i].age > 0)
cdt->swapchain->images[i].age += 1;
}
}
if (util_queue_is_initialized(&screen->flush_queue)) {
p_atomic_inc(&cpi->swapchain->async_presents);
@ -832,6 +836,12 @@ zink_kopper_present_queue(struct zink_screen *screen, struct zink_resource *res)
res->obj->dt_idx = UINT32_MAX;
}
void
zink_kopper_update_last_written(struct zink_resource *res)
{
res->obj->last_dt_idx = res->obj->dt_idx;
}
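A worked example of the quoted age rules, assuming three swapchain images presented in order 0, 1, 2:
    /* ages start at {0, 0, 0} (0 = undefined content)
     * present image 0: {1, 0, 0}  (only ages > 0 are incremented)
     * present image 1: {2, 1, 0}
     * present image 2: {3, 2, 1}
     * while cdt->age_locked is set for a readback, this bookkeeping is
     * skipped entirely, freezing the ages until the readback finishes */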
static void
kopper_ensure_readback(struct zink_screen *screen, struct zink_resource *res)
{
@ -873,14 +883,17 @@ zink_kopper_acquire_readback(struct zink_context *ctx, struct zink_resource *res
if (++cdt->readback_counter >= ZINK_READBACK_THRESHOLD)
kopper_ensure_readback(screen, res);
while (res->obj->dt_idx != last_dt_idx) {
cdt->age_locked = true;
if (res->obj->dt_idx != UINT32_MAX && !zink_kopper_present_readback(ctx, res))
break;
cdt->age_locked = true;
do {
ret = kopper_acquire(screen, res, 0);
} while (!is_swapchain_kill(ret) && (ret == VK_NOT_READY || ret == VK_TIMEOUT));
if (is_swapchain_kill(ret)) {
kill_swapchain(ctx, res);
*readback = NULL;
cdt->age_locked = false;
return false;
}
}
@ -936,6 +949,10 @@ zink_kopper_present_readback(struct zink_context *ctx, struct zink_resource *res
simple_mtx_lock(&screen->semaphores_lock);
util_dynarray_append(&screen->semaphores, VkSemaphore, acquire);
simple_mtx_unlock(&screen->semaphores_lock);
struct kopper_displaytarget *cdt = res->obj->dt;
cdt->age_locked = false;
return zink_screen_handle_vkresult(screen, error);
}

View file

@ -95,6 +95,8 @@ struct kopper_displaytarget
bool is_kill;
VkPresentModeKHR present_mode;
unsigned readback_counter;
bool age_locked; //disables buffer age during readback
};
struct zink_context;
@ -119,6 +121,9 @@ zink_kopper_acquired(const struct kopper_displaytarget *cdt, uint32_t idx)
return idx != UINT32_MAX && cdt->swapchain->images[idx].acquired;
}
void
zink_kopper_update_last_written(struct zink_resource *res);
struct kopper_displaytarget *
zink_kopper_displaytarget_create(struct zink_screen *screen, unsigned tex_usage,
enum pipe_format format, unsigned width,

View file

@ -1006,6 +1006,8 @@ create_lib_cache(struct zink_gfx_program *prog, bool generated_tcs)
{
struct zink_gfx_lib_cache *libs = CALLOC_STRUCT(zink_gfx_lib_cache);
libs->stages_present = prog->stages_present;
if (generated_tcs)
libs->stages_present &= ~BITFIELD_BIT(MESA_SHADER_TESS_CTRL);
simple_mtx_init(&libs->lock, mtx_plain);
if (generated_tcs)
_mesa_set_init(&libs->libs, NULL, hash_pipeline_lib_generated_tcs, equals_pipeline_lib_generated_tcs);

View file

@ -729,7 +729,8 @@ init_ici(struct zink_screen *screen, VkImageCreateInfo *ici, const struct pipe_r
case PIPE_TEXTURE_3D:
ici->imageType = VK_IMAGE_TYPE_3D;
ici->flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
if (!(templ->flags & PIPE_RESOURCE_FLAG_SPARSE))
ici->flags |= VK_IMAGE_CREATE_2D_ARRAY_COMPATIBLE_BIT;
if (screen->info.have_EXT_image_2d_view_of_3d)
ici->flags |= VK_IMAGE_CREATE_2D_VIEW_COMPATIBLE_BIT_EXT;
break;
@ -1180,6 +1181,10 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t
mai.pNext = NULL;
mai.allocationSize = reqs.size;
enum zink_heap heap = zink_heap_from_domain_flags(flags, aflags);
if (templ->flags & PIPE_RESOURCE_FLAG_MAP_COHERENT) {
if (!(vk_domain_from_heap(heap) & VK_MEMORY_PROPERTY_HOST_COHERENT_BIT))
heap = zink_heap_from_domain_flags(flags & ~VK_MEMORY_PROPERTY_DEVICE_LOCAL_BIT, aflags);
}
VkMemoryDedicatedAllocateInfo ded_alloc_info = {
.sType = VK_STRUCTURE_TYPE_MEMORY_DEDICATED_ALLOCATE_INFO,
@ -1267,7 +1272,7 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t
alignment = MAX2(alignment, screen->info.props.limits.minMemoryMapAlignment);
obj->alignment = alignment;
if (zink_mem_type_idx_from_bits(screen, heap, reqs.memoryTypeBits) == UINT32_MAX) {
if (zink_mem_type_idx_from_types(screen, heap, reqs.memoryTypeBits) == UINT32_MAX) {
/* not valid based on reqs; demote to more compatible type */
switch (heap) {
case ZINK_HEAP_DEVICE_LOCAL_VISIBLE:
@ -1279,7 +1284,7 @@ resource_object_create(struct zink_screen *screen, const struct pipe_resource *t
default:
break;
}
assert(zink_mem_type_idx_from_bits(screen, heap, reqs.memoryTypeBits) != UINT32_MAX);
assert(zink_mem_type_idx_from_types(screen, heap, reqs.memoryTypeBits) != UINT32_MAX);
}
retry:
@ -1611,6 +1616,11 @@ add_resource_bind(struct zink_context *ctx, struct zink_resource *res, unsigned
box.depth = util_num_layers(&res->base.b, i);
ctx->base.resource_copy_region(&ctx->base, &res->base.b, i, 0, 0, 0, &staging.base.b, i, &box);
}
if (old_obj->exportable) {
simple_mtx_lock(&ctx->batch.state->exportable_lock);
_mesa_set_remove_key(&ctx->batch.state->dmabuf_exports, &staging);
simple_mtx_unlock(&ctx->batch.state->exportable_lock);
}
zink_resource_object_reference(screen, &old_obj, NULL);
return true;
}

View file

@ -837,6 +837,9 @@ zink_get_param(struct pipe_screen *pscreen, enum pipe_cap param)
return 1;
case PIPE_CAP_BINDLESS_TEXTURE:
if (zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB &&
(screen->info.db_props.maxDescriptorBufferBindings < 2 || screen->info.db_props.maxSamplerDescriptorBufferBindings < 2))
return 0;
return screen->info.have_EXT_descriptor_indexing;
case PIPE_CAP_TEXTURE_BUFFER_OFFSET_ALIGNMENT:
@ -3465,20 +3468,11 @@ zink_internal_create_screen(const struct pipe_screen_config *config, int64_t dev
mesa_logw("zink: bug detected: inputAttachmentDescriptorSize(%u) > %u", (unsigned)screen->info.db_props.inputAttachmentDescriptorSize, ZINK_FBFETCH_DESCRIPTOR_SIZE);
can_db = false;
}
if (screen->compact_descriptors) {
if (screen->info.db_props.maxDescriptorBufferBindings < 3) {
if (zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB) {
mesa_loge("Cannot use db descriptor mode with compact descriptors with maxDescriptorBufferBindings < 3");
goto fail;
}
can_db = false;
}
} else {
if (screen->info.db_props.maxDescriptorBufferBindings < 5) {
if (zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB) {
mesa_loge("Cannot use db descriptor mode with maxDescriptorBufferBindings < 5");
goto fail;
}
if (screen->info.db_props.maxDescriptorBufferBindings < 2 || screen->info.db_props.maxSamplerDescriptorBufferBindings < 2) {
if (zink_descriptor_mode == ZINK_DESCRIPTOR_MODE_DB) {
/* allow for testing, but disable bindless */
mesa_logw("Cannot use bindless and db descriptor mode with (maxDescriptorBufferBindings||maxSamplerDescriptorBufferBindings) < 2");
} else {
can_db = false;
}
}

View file

@ -61,6 +61,7 @@ static inline bool
zink_screen_check_last_finished(struct zink_screen *screen, uint32_t batch_id)
{
const uint32_t check_id = (uint32_t)batch_id;
assert(check_id);
/* last_finished may have wrapped */
if (screen->last_finished < UINT_MAX / 2) {
/* last_finished has wrapped, batch_id has not */
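The new assert documents that batch_id 0 is reserved for "never submitted" (which the zink_fence_finish change above now handles before calling in here). For the wrap handling, a common wraparound-safe ordering test (a generic sketch, not necessarily the exact logic used here) is:
    /* true when batch_id was issued at or before last_finished, even across
     * a uint32 wrap, provided the two ids are less than 2^31 apart */
    static inline bool id_finished(uint32_t batch_id, uint32_t last_finished)
    {
       return (int32_t)(last_finished - batch_id) >= 0;
    }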

View file

@ -505,7 +505,12 @@ impl Program {
for (i, d) in self.devs.iter().enumerate() {
let mut ptr = ptrs[i];
let info = lock.dev_build(d);
let spirv = info.spirv.as_ref().unwrap().to_bin();
// no spirv means nothing to write
let Some(spirv) = info.spirv.as_ref() else {
continue;
};
let spirv = spirv.to_bin();
unsafe {
// 1. binary format version

View file

@ -56,8 +56,10 @@ void vlVaHandlePictureParameterBufferVP9(vlVaDriver *drv, vlVaContext *context,
context->desc.vp9.picture_parameter.pic_fields.refresh_frame_context = vp9->pic_fields.bits.refresh_frame_context;
context->desc.vp9.picture_parameter.pic_fields.frame_context_idx = vp9->pic_fields.bits.frame_context_idx;
context->desc.vp9.picture_parameter.pic_fields.segmentation_enabled = vp9->pic_fields.bits.segmentation_enabled;
context->desc.vp9.picture_parameter.pic_fields.segmentation_temporal_update = vp9->pic_fields.bits.segmentation_temporal_update;
context->desc.vp9.picture_parameter.pic_fields.segmentation_update_map = vp9->pic_fields.bits.segmentation_update_map;
context->desc.vp9.picture_parameter.pic_fields.segmentation_temporal_update =
vp9->pic_fields.bits.segmentation_enabled && vp9->pic_fields.bits.segmentation_temporal_update;
context->desc.vp9.picture_parameter.pic_fields.segmentation_update_map =
vp9->pic_fields.bits.segmentation_enabled && vp9->pic_fields.bits.segmentation_update_map;
context->desc.vp9.picture_parameter.pic_fields.last_ref_frame = vp9->pic_fields.bits.last_ref_frame;
context->desc.vp9.picture_parameter.pic_fields.last_ref_frame_sign_bias = vp9->pic_fields.bits.last_ref_frame_sign_bias;
context->desc.vp9.picture_parameter.pic_fields.golden_ref_frame = vp9->pic_fields.bits.golden_ref_frame;

View file

@ -108,6 +108,8 @@ vlVdpVideoSurfaceQueryGetPutBitsYCbCrCapabilities(VdpDevice device, VdpChromaTyp
{
vlVdpDevice *dev;
struct pipe_screen *pscreen;
VdpYCbCrFormat ycbcrFormat;
bool supported;
if (!is_supported)
return VDP_STATUS_INVALID_POINTER;
@ -122,47 +124,50 @@ vlVdpVideoSurfaceQueryGetPutBitsYCbCrCapabilities(VdpDevice device, VdpChromaTyp
mtx_lock(&dev->mutex);
ycbcrFormat = bits_ycbcr_format;
switch(bits_ycbcr_format) {
case VDP_YCBCR_FORMAT_NV12:
*is_supported = surface_chroma_type == VDP_CHROMA_TYPE_420;
supported = surface_chroma_type == VDP_CHROMA_TYPE_420;
break;
case VDP_YCBCR_FORMAT_YV12:
*is_supported = surface_chroma_type == VDP_CHROMA_TYPE_420;
supported = surface_chroma_type == VDP_CHROMA_TYPE_420;
/* We can convert YV12 to NV12 on the fly! */
if (*is_supported &&
pscreen->is_video_format_supported(pscreen,
PIPE_FORMAT_NV12,
PIPE_VIDEO_PROFILE_UNKNOWN,
PIPE_VIDEO_ENTRYPOINT_BITSTREAM)) {
mtx_unlock(&dev->mutex);
return VDP_STATUS_OK;
}
ycbcrFormat = VDP_YCBCR_FORMAT_NV12;
break;
case VDP_YCBCR_FORMAT_UYVY:
case VDP_YCBCR_FORMAT_YUYV:
*is_supported = surface_chroma_type == VDP_CHROMA_TYPE_422;
supported = surface_chroma_type == VDP_CHROMA_TYPE_422;
break;
case VDP_YCBCR_FORMAT_Y8U8V8A8:
case VDP_YCBCR_FORMAT_V8U8Y8A8:
*is_supported = surface_chroma_type == VDP_CHROMA_TYPE_444;
supported = surface_chroma_type == VDP_CHROMA_TYPE_444;
break;
case VDP_YCBCR_FORMAT_P010:
case VDP_YCBCR_FORMAT_P016:
/* Do any other profiles imply support for this chroma type? */
supported = (surface_chroma_type == VDP_CHROMA_TYPE_420_16)
&& vl_codec_supported(pscreen, PIPE_VIDEO_PROFILE_HEVC_MAIN_10, false);
break;
default:
*is_supported = false;
supported = false;
break;
}
if (*is_supported &&
if (supported &&
!pscreen->is_video_format_supported(pscreen,
FormatYCBCRToPipe(bits_ycbcr_format),
FormatYCBCRToPipe(ycbcrFormat),
PIPE_VIDEO_PROFILE_UNKNOWN,
PIPE_VIDEO_ENTRYPOINT_BITSTREAM)) {
*is_supported = false;
supported = false;
}
*is_supported = supported;
mtx_unlock(&dev->mutex);
return VDP_STATUS_OK;

View file

@ -605,15 +605,17 @@ blorp_clear(struct blorp_batch *batch,
if (batch->blorp->isl_dev->info->ver < 6)
use_simd16_replicated_data = false;
/* From the BSpec: 47719 Replicate Data:
/* From the BSpec: 47719 (TGL/DG2/MTL) Replicate Data:
*
* "Replicate Data Render Target Write message should not be used
* on all projects TGL+."
*
* Xe2 spec (57350) does not mention this restriction.
*
* See 14017879046, 14017880152 for additional information.
*/
if (batch->blorp->isl_dev->info->ver >= 12 &&
format == ISL_FORMAT_R10G10B10_FLOAT_A2_UNORM)
batch->blorp->isl_dev->info->ver < 20)
use_simd16_replicated_data = false;
if (compute)

View file

@ -122,8 +122,7 @@ brw_nir_ubo_surface_index_is_pushable(nir_src src)
if (intrin && intrin->intrinsic == nir_intrinsic_resource_intel) {
return (nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_pushable) &&
nir_src_is_const(intrin->src[1]);
nir_resource_intel_pushable);
}
return nir_src_is_const(src);
@ -146,6 +145,14 @@ brw_nir_ubo_surface_index_get_push_block(nir_src src)
return nir_intrinsic_resource_block_intel(intrin);
}
/* This helper return the binding table index of a surface access (any
* buffer/image/etc...). It works off the source of one of the intrinsics
* (load_ubo, load_ssbo, store_ssbo, load_image, store_image, etc...).
*
* If the source is constant, then this is the binding table index. If we're
* going through a resource_intel intrinsic, then we need to check
* src[1] of that intrinsic.
*/
static inline unsigned
brw_nir_ubo_surface_index_get_bti(nir_src src)
{
@ -155,8 +162,19 @@ brw_nir_ubo_surface_index_get_bti(nir_src src)
assert(src.ssa->parent_instr->type == nir_instr_type_intrinsic);
nir_intrinsic_instr *intrin = nir_instr_as_intrinsic(src.ssa->parent_instr);
assert(intrin->intrinsic == nir_intrinsic_resource_intel);
assert(nir_src_is_const(intrin->src[1]));
if (!intrin || intrin->intrinsic != nir_intrinsic_resource_intel)
return UINT32_MAX;
/* In practice we could even drop this intrinsic because bindless
* accesses always operate from a base offset coming from a push constant, so
* they can never be constant.
*/
if (nir_intrinsic_resource_access_intel(intrin) &
nir_resource_intel_bindless)
return UINT32_MAX;
if (!nir_src_is_const(intrin->src[1]))
return UINT32_MAX;
return nir_src_as_uint(intrin->src[1]);
}

View file

@ -543,8 +543,11 @@ brw_nir_lower_ray_queries(nir_shader *shader,
};
/* Map all query variable to internal type variables */
nir_foreach_function_temp_variable(var, state.impl)
nir_foreach_function_temp_variable(var, state.impl) {
if (!var->data.ray_query)
continue;
register_opaque_var(var, &state);
}
hash_table_foreach(state.queries, entry)
create_internal_var(entry->data, &state);

View file

@ -2168,6 +2168,14 @@ anv_physical_device_try_create(struct vk_instance *vk_instance,
goto fail_fd;
}
/* Disable Wa_16013994831 on Gfx12.0 because we found other cases where we
* need to always disable preemption :
* - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5963
* - https://gitlab.freedesktop.org/mesa/mesa/-/issues/5662
*/
if (devinfo.verx10 == 120)
BITSET_CLEAR(devinfo.workarounds, INTEL_WA_16013994831);
if (!devinfo.has_context_isolation) {
result = vk_errorf(instance, VK_ERROR_INCOMPATIBLE_DRIVER,
"Vulkan requires context isolation for %s", devinfo.name);

View file

@ -1975,6 +1975,34 @@ add_push_entry(struct anv_pipeline_push_map *push_map,
};
}
static bool
binding_should_use_surface_binding_table(const struct apply_pipeline_layout_state *state,
const struct anv_descriptor_set_binding_layout *binding)
{
if ((binding->data & ANV_DESCRIPTOR_BTI_SURFACE_STATE) == 0)
return false;
if (state->pdevice->always_use_bindless &&
(binding->data & ANV_DESCRIPTOR_SURFACE))
return false;
return true;
}
static bool
binding_should_use_sampler_binding_table(const struct apply_pipeline_layout_state *state,
const struct anv_descriptor_set_binding_layout *binding)
{
if ((binding->data & ANV_DESCRIPTOR_BTI_SAMPLER_STATE) == 0)
return false;
if (state->pdevice->always_use_bindless &&
(binding->data & ANV_DESCRIPTOR_SAMPLER))
return false;
return true;
}
void
anv_nir_apply_pipeline_layout(nir_shader *shader,
const struct anv_physical_device *pdevice,
@ -2146,7 +2174,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
state.set[set].binding[b].surface_offset = BINDLESS_OFFSET;
state.set[set].binding[b].sampler_offset = BINDLESS_OFFSET;
if (binding->data & ANV_DESCRIPTOR_BTI_SURFACE_STATE) {
if (binding_should_use_surface_binding_table(&state, binding)) {
if (map->surface_count + array_size * array_multiplier > MAX_BINDING_TABLE_SIZE ||
anv_descriptor_requires_bindless(pdevice, binding, false) ||
brw_shader_stage_requires_bindless_resources(shader->info.stage)) {
@ -2177,7 +2205,7 @@ anv_nir_apply_pipeline_layout(nir_shader *shader,
assert(map->surface_count <= MAX_BINDING_TABLE_SIZE);
}
if (binding->data & ANV_DESCRIPTOR_BTI_SAMPLER_STATE) {
if (binding_should_use_sampler_binding_table(&state, binding)) {
if (map->sampler_count + array_size * array_multiplier > MAX_SAMPLER_TABLE_SIZE ||
anv_descriptor_requires_bindless(pdevice, binding, true) ||
brw_shader_stage_requires_bindless_resources(shader->info.stage)) {

View file

@ -126,18 +126,17 @@ anv_nir_loads_push_desc_buffer(nir_shader *nir,
if (intrin->intrinsic != nir_intrinsic_load_ubo)
continue;
const nir_const_value *const_bt_idx =
nir_src_as_const_value(intrin->src[0]);
if (const_bt_idx == NULL)
const unsigned bt_idx =
brw_nir_ubo_surface_index_get_bti(intrin->src[0]);
if (bt_idx == UINT32_MAX)
continue;
const unsigned bt_idx = const_bt_idx[0].u32;
const struct anv_pipeline_binding *binding =
&bind_map->surface_to_descriptor[bt_idx];
if (binding->set == ANV_DESCRIPTOR_SET_DESCRIPTORS &&
binding->index == push_set)
binding->index == push_set) {
return true;
}
}
}
}
@ -162,6 +161,7 @@ anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
if (push_set_layout == NULL)
return 0;
/* Assume every UBO can be promoted first. */
uint32_t ubos_fully_promoted = 0;
for (uint32_t b = 0; b < push_set_layout->binding_count; b++) {
const struct anv_descriptor_set_binding_layout *bind_layout =
@ -174,6 +174,10 @@ anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
ubos_fully_promoted |= BITFIELD_BIT(bind_layout->descriptor_index);
}
/* For each load_ubo intrinsic, if the descriptor index or the offset is
* not a constant, we cannot promote it to a push constant. Then check the
* offset + size against the push ranges.
*/
nir_foreach_function_impl(impl, nir) {
nir_foreach_block(block, impl) {
nir_foreach_instr(instr, block) {
@ -184,45 +188,65 @@ anv_nir_push_desc_ubo_fully_promoted(nir_shader *nir,
if (intrin->intrinsic != nir_intrinsic_load_ubo)
continue;
if (!brw_nir_ubo_surface_index_is_pushable(intrin->src[0]))
/* Don't check the load_ubo from descriptor buffers */
nir_intrinsic_instr *resource =
intrin->src[0].ssa->parent_instr->type == nir_instr_type_intrinsic ?
nir_instr_as_intrinsic(intrin->src[0].ssa->parent_instr) : NULL;
if (resource == NULL || resource->intrinsic != nir_intrinsic_resource_intel)
continue;
const unsigned bt_idx =
brw_nir_ubo_surface_index_get_bti(intrin->src[0]);
/* Skip if this isn't a load from push descriptor buffer. */
const struct anv_pipeline_binding *binding =
&bind_map->surface_to_descriptor[bt_idx];
if (binding->set != push_set)
/* Skip load_ubo not loading from the push descriptor */
if (nir_intrinsic_desc_set(resource) != push_set)
continue;
uint32_t binding = nir_intrinsic_binding(resource);
/* If we have indirect indexing in the binding, no push promotion
* is possible for the entire binding.
*/
if (!nir_src_is_const(resource->src[1])) {
for (uint32_t i = 0; i < push_set_layout->binding[binding].array_size; i++) {
ubos_fully_promoted &=
~BITFIELD_BIT(push_set_layout->binding[binding].descriptor_index + i);
}
continue;
}
const nir_const_value *const_bt_id =
nir_src_as_const_value(resource->src[1]);
uint32_t bt_id = const_bt_id[0].u32;
const struct anv_pipeline_binding *pipe_bind =
&bind_map->surface_to_descriptor[bt_id];
const uint32_t desc_idx =
push_set_layout->binding[binding->binding].descriptor_index;
assert(desc_idx < MAX_PUSH_DESCRIPTORS);
bool promoted = false;
push_set_layout->binding[binding].descriptor_index;
/* If the offset in the entry is dynamic, we can't tell whether it was
* promoted or not.
*/
const nir_const_value *const_load_offset =
nir_src_as_const_value(intrin->src[1]);
if (const_load_offset != NULL) {
/* Check if the load was promoted to a push constant. */
const unsigned load_offset = const_load_offset[0].u32;
const int load_bytes = nir_intrinsic_dest_components(intrin) *
(intrin->def.bit_size / 8);
if (const_load_offset == NULL) {
ubos_fully_promoted &= ~BITFIELD_BIT(desc_idx);
continue;
}
for (unsigned i = 0; i < ARRAY_SIZE(bind_map->push_ranges); i++) {
if (bind_map->push_ranges[i].set == binding->set &&
bind_map->push_ranges[i].index == desc_idx &&
bind_map->push_ranges[i].start * 32 <= load_offset &&
(bind_map->push_ranges[i].start +
bind_map->push_ranges[i].length) * 32 >=
(load_offset + load_bytes)) {
promoted = true;
break;
}
/* Check if the load was promoted to a push constant. */
const unsigned load_offset = const_load_offset[0].u32;
const int load_bytes = nir_intrinsic_dest_components(intrin) *
(intrin->def.bit_size / 8);
bool promoted = false;
for (unsigned i = 0; i < ARRAY_SIZE(bind_map->push_ranges); i++) {
if (bind_map->push_ranges[i].set == pipe_bind->set &&
bind_map->push_ranges[i].index == desc_idx &&
bind_map->push_ranges[i].start * 32 <= load_offset &&
(bind_map->push_ranges[i].start +
bind_map->push_ranges[i].length) * 32 >=
(load_offset + load_bytes)) {
promoted = true;
break;
}
}

View file

@ -1728,7 +1728,7 @@ anv_pipeline_account_shader(struct anv_pipeline *pipeline,
if (shader->push_desc_info.used_set_buffer) {
pipeline->use_push_descriptor_buffer |=
BITFIELD_BIT(mesa_to_vk_shader_stage(shader->stage));
mesa_to_vk_shader_stage(shader->stage);
}
if (shader->push_desc_info.used_descriptors &
~shader->push_desc_info.fully_promoted_ubo_descriptors)

View file

@ -8370,8 +8370,9 @@ void genX(CmdEndRendering)(
"MSAA resolve");
}
if (gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE) {
if (!(gfx->rendering_flags & VK_RENDERING_SUSPENDING_BIT) &&
(gfx->depth_att.resolve_mode != VK_RESOLVE_MODE_NONE ||
gfx->stencil_att.resolve_mode != VK_RESOLVE_MODE_NONE)) {
/* We are about to do some MSAA resolves. We need to flush so that the
* result of writes to the MSAA depth attachments show up in the sampler
* when we blit to the single-sampled resolve target.

View file

@ -68,7 +68,7 @@ static const uint32_t genX(vk_to_intel_blend_op)[] = {
static void
genX(streamout_prologue)(struct anv_cmd_buffer *cmd_buffer)
{
#if GFX_VERx10 >= 120
#if INTEL_WA_16013994831_GFX_VER
/* Wa_16013994831 - Disable preemption during streamout, enable back
* again if XFB not used by the current pipeline.
*

View file

@ -80,7 +80,9 @@ emit_common_so_memcpy(struct anv_batch *batch, struct anv_device *device,
anv_batch_emit(batch, GENX(3DSTATE_MESH_CONTROL), mesh);
anv_batch_emit(batch, GENX(3DSTATE_TASK_CONTROL), task);
}
#endif
#if INTEL_WA_16013994831_GFX_VER
/* Wa_16013994831 - Disable preemption during streamout. */
if (intel_needs_workaround(device->info, 16013994831))
genX(batch_set_preemption)(batch, device->info, _3D, false);

View file

@ -201,6 +201,6 @@ libgrl = static_library(
idep_grl = declare_dependency(
link_with : libgrl,
dependencies : libgrl_deps,
sources : grl_metakernel_h,
sources : [grl_metakernel_h, grl_cl_kernel_h],
include_directories : include_directories('include', 'gpu'),
)

View file

@ -1220,7 +1220,7 @@ dlist_alloc(struct gl_context *ctx, OpCode opcode, GLuint bytes, bool align8)
ctx->ListState.CurrentPos++;
}
if (ctx->ListState.CurrentPos + numNodes + contNodes > BLOCK_SIZE) {
if (ctx->ListState.CurrentPos + numNodes + contNodes >= BLOCK_SIZE) {
/* This block is full. Allocate a new block and chain to it */
Node *newblock;
Node *n = ctx->ListState.CurrentBlock + ctx->ListState.CurrentPos;

View file

@ -2659,6 +2659,16 @@ _mesa_base_fbo_format(const struct gl_context *ctx, GLenum internalFormat)
case GL_RGB565:
return _mesa_is_gles(ctx) || ctx->Extensions.ARB_ES2_compatibility
? GL_RGB : 0;
case GL_BGRA:
/* EXT_texture_format_BGRA8888 only adds this as color-renderable for
* GLES 2 and later
*/
if (_mesa_has_EXT_texture_format_BGRA8888(ctx) && _mesa_is_gles2(ctx))
return GL_RGBA;
else
return 0;
default:
return 0;
}

View file

@ -1112,6 +1112,12 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
if (get_pname == 0)
goto end;
/* if the resource is unsupported, zero is returned */
if (!st_QueryTextureFormatSupport(ctx, target, internalformat)) {
buffer[0] = 0;
break;
}
_mesa_GetIntegerv(get_pname, buffer);
break;
}
@ -1123,6 +1129,12 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
if (!_mesa_is_array_texture(target))
goto end;
/* if the resource is unsupported, zero is returned */
if (!st_QueryTextureFormatSupport(ctx, target, internalformat)) {
buffer[0] = 0;
break;
}
_mesa_GetIntegerv(GL_MAX_ARRAY_TEXTURE_LAYERS, buffer);
break;
@ -1137,6 +1149,12 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
unsigned i;
GLint current_value;
/* if the resource is unsupported, zero is returned */
if (!st_QueryTextureFormatSupport(ctx, target, internalformat)) {
buffer[0] = 0;
break;
}
/* Combining the dimensions. Note that for array targets, this would
* automatically include the value of MAX_LAYERS, as that value is
* returned as MAX_HEIGHT or MAX_DEPTH */
@ -1515,6 +1533,14 @@ _mesa_GetInternalformativ(GLenum target, GLenum internalformat, GLenum pname,
if (targetIndex < 0 || targetIndex == TEXTURE_BUFFER_INDEX)
goto end;
/* If the resource is not supported for image textures,
* or if image textures are not supported, NONE is returned.
*/
if (!st_QueryTextureFormatSupport(ctx, target, internalformat)) {
buffer[0] = GL_NONE;
break;
}
/* From spec: "Equivalent to calling GetTexParameter with <value> set
* to IMAGE_FORMAT_COMPATIBILITY_TYPE."
*

View file

@ -39,6 +39,7 @@
#include "glformats.h"
#include "texobj.h"
#include "teximage.h"
#include "textureview.h"
#include "api_exec_decl.h"
#include "state_tracker/st_cb_texture.h"
@ -179,7 +180,7 @@ register_surface(struct gl_context *ctx, GLboolean isOutput,
}
/* This will disallow respecifying the storage. */
tex->Immutable = GL_TRUE;
_mesa_set_texture_view_state(ctx, tex, target, 1);
_mesa_unlock_texture(ctx, tex);
_mesa_reference_texobj(&surf->textures[i], tex);

View file

@ -1507,6 +1507,49 @@ st_QuerySamplesForFormat(struct gl_context *ctx, GLenum target,
return num_sample_counts;
}
/* check whether any texture can be allocated for a given format */
bool
st_QueryTextureFormatSupport(struct gl_context *ctx, GLenum target, GLenum internalFormat)
{
struct st_context *st = st_context(ctx);
/* If an sRGB framebuffer is unsupported, sRGB formats behave like linear
* formats.
*/
if (!ctx->Extensions.EXT_sRGB) {
internalFormat = _mesa_get_linear_internalformat(internalFormat);
}
/* multisample textures need >= 2 samples */
unsigned min_samples = target == GL_TEXTURE_2D_MULTISAMPLE ||
target == GL_TEXTURE_2D_MULTISAMPLE_ARRAY ? 1 : 0;
unsigned max_samples = min_samples ? 16 : 1;
/* compressed textures will be allocated as e.g., RGBA8, so check that instead */
enum pipe_format pf = st_choose_format(st, internalFormat, GL_NONE, GL_NONE,
PIPE_TEXTURE_2D, 0, 0, 0,
false, false);
if (util_format_is_compressed(pf)) {
enum pipe_format fmts[2] = {0};
pf = st_mesa_format_to_pipe_format(st, st_pipe_format_to_mesa_format(pf));
fmts[0] = pf;
for (unsigned i = max_samples; i > min_samples; i >>= 1) {
if (find_supported_format(st->screen, fmts, PIPE_TEXTURE_2D,
i, i, PIPE_BIND_SAMPLER_VIEW, false))
return true;
}
return false;
}
for (unsigned i = max_samples; i > min_samples; i >>= 1) {
if (st_choose_format(st, internalFormat, GL_NONE, GL_NONE,
PIPE_TEXTURE_2D, i, i, PIPE_BIND_SAMPLER_VIEW,
false, false))
return true;
}
return false;
}
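The min/max encoding is worth spelling out, since min_samples = 1 here means "must support at least 2 samples" (illustrative trace of the probing loop):
    /* non-MS target: min_samples = 0, max_samples = 1  -> probes 1
     * MS target:     min_samples = 1, max_samples = 16 -> probes 16, 8, 4, 2
     * the loop stops before i == min_samples, so multisample targets are
     * never satisfied by a single-sampled allocation */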
/**
* ARB_internalformat_query2 driver hook.

View file

@ -70,7 +70,8 @@ extern mesa_format
st_ChooseTextureFormat(struct gl_context * ctx, GLenum target,
GLint internalFormat,
GLenum format, GLenum type);
bool
st_QueryTextureFormatSupport(struct gl_context *ctx, GLenum target, GLenum internalFormat);
void
st_QueryInternalFormat(struct gl_context *ctx, GLenum target,
GLenum internalFormat, GLenum pname, GLint *params);

View file

@ -3100,6 +3100,7 @@ dzn_cmd_buffer_update_pipeline(struct dzn_cmd_buffer *cmdbuf, uint32_t bindpoint
ID3D12PipelineState *old_pipeline_state =
cmdbuf->state.pipeline ? cmdbuf->state.pipeline->state : NULL;
uint32_t view_instance_mask = 0;
if (cmdbuf->state.bindpoint[bindpoint].dirty & DZN_CMD_BINDPOINT_DIRTY_PIPELINE) {
if (cmdbuf->state.bindpoint[bindpoint].root_sig != pipeline->root.sig) {
cmdbuf->state.bindpoint[bindpoint].root_sig = pipeline->root.sig;
@ -3135,9 +3136,9 @@ dzn_cmd_buffer_update_pipeline(struct dzn_cmd_buffer *cmdbuf, uint32_t bindpoint
ID3D12GraphicsCommandList1_IASetPrimitiveTopology(cmdbuf->cmdlist, gfx->ia.topology);
dzn_graphics_pipeline_get_state(gfx, &cmdbuf->state.pipeline_variant);
if (gfx->multiview.native_view_instancing)
ID3D12GraphicsCommandList1_SetViewInstanceMask(cmdbuf->cmdlist, gfx->multiview.view_mask);
view_instance_mask = gfx->multiview.view_mask;
else
ID3D12GraphicsCommandList1_SetViewInstanceMask(cmdbuf->cmdlist, 1);
view_instance_mask = 1;
if (gfx->zsa.dynamic_depth_bias && gfx->use_gs_for_polygon_mode_point)
cmdbuf->state.bindpoint[bindpoint].dirty |= DZN_CMD_BINDPOINT_DIRTY_SYSVALS;
@ -3150,6 +3151,11 @@ dzn_cmd_buffer_update_pipeline(struct dzn_cmd_buffer *cmdbuf, uint32_t bindpoint
ID3D12GraphicsCommandList1_SetPipelineState(cmdbuf->cmdlist, pipeline->state);
cmdbuf->state.pipeline = pipeline;
}
/* Deferring this until after the pipeline has been set due to an NVIDIA driver bug
* when view instancing mask is set with no pipeline bound. */
if (view_instance_mask)
ID3D12GraphicsCommandList1_SetViewInstanceMask(cmdbuf->cmdlist, view_instance_mask);
}
static void

View file

@ -917,7 +917,9 @@ nvk_CmdEndRendering(VkCommandBuffer commandBuffer)
if (need_resolve) {
struct nv_push *p = nvk_cmd_buffer_push(cmd, 2);
P_IMMD(p, NV9097, WAIT_FOR_IDLE, 0);
P_IMMD(p, NVA097, INVALIDATE_TEXTURE_DATA_CACHE, {
.lines = LINES_ALL,
});
nvk_meta_resolve_rendering(cmd, &vk_render);
}

View file

@ -158,6 +158,9 @@ nvk_DestroyInstance(VkInstance _instance,
if (!instance)
return;
driDestroyOptionCache(&instance->dri_options);
driDestroyOptionInfo(&instance->available_dri_options);
vk_instance_finish(&instance->vk);
vk_free(&instance->vk.alloc, instance);
}

View file

@ -351,6 +351,7 @@ nouveau_ws_device_new(drmDevicePtr drm_device)
out_err:
if (device->has_vm_bind) {
util_vma_heap_finish(&device->vma_heap);
util_vma_heap_finish(&device->bda_heap);
simple_mtx_destroy(&device->vma_mutex);
}
if (ver)
@ -372,6 +373,7 @@ nouveau_ws_device_destroy(struct nouveau_ws_device *device)
if (device->has_vm_bind) {
util_vma_heap_finish(&device->vma_heap);
util_vma_heap_finish(&device->bda_heap);
simple_mtx_destroy(&device->vma_mutex);
}

View file

@ -390,6 +390,8 @@ vn_CreateDescriptorPool(VkDevice device,
vn_async_vkCreateDescriptorPool(dev->primary_ring, device, pCreateInfo,
NULL, &pool_handle);
vn_tls_set_async_pipeline_create();
*pDescriptorPool = pool_handle;
return VK_SUCCESS;

View file

@ -570,6 +570,8 @@ vn_CreateDevice(VkPhysicalDevice physicalDevice,
vn_log(instance, "%s", physical_dev->properties.vulkan_1_2.driverInfo);
}
vn_tls_set_async_pipeline_create();
*pDevice = vn_device_to_handle(dev);
return VK_SUCCESS;

View file

@ -231,8 +231,15 @@ vn_image_store_reqs_in_cache(struct vn_device *dev,
assert(cache->ht);
simple_mtx_lock(&cache->mutex);
uint32_t cache_entry_count = _mesa_hash_table_num_entries(cache->ht);
if (cache_entry_count == IMAGE_REQS_CACHE_MAX_ENTRIES) {
/* Check if entry was added before lock */
if (_mesa_hash_table_search(cache->ht, key)) {
simple_mtx_unlock(&cache->mutex);
return;
}
if (_mesa_hash_table_num_entries(cache->ht) ==
IMAGE_REQS_CACHE_MAX_ENTRIES) {
/* Evict/use the last entry in the lru list for this new entry */
cache_entry =
list_last_entry(&cache->lru, struct vn_image_reqs_cache_entry, head);
@ -242,11 +249,11 @@ vn_image_store_reqs_in_cache(struct vn_device *dev,
} else {
cache_entry = vk_zalloc(alloc, sizeof(*cache_entry), VN_DEFAULT_ALIGN,
VK_SYSTEM_ALLOCATION_SCOPE_OBJECT);
if (!cache_entry) {
simple_mtx_unlock(&cache->mutex);
return;
}
}
simple_mtx_unlock(&cache->mutex);
if (!cache_entry)
return;
for (uint32_t i = 0; i < plane_count; i++)
cache_entry->requirements[i] = requirements[i];
@ -254,12 +261,10 @@ vn_image_store_reqs_in_cache(struct vn_device *dev,
memcpy(cache_entry->key, key, SHA1_DIGEST_LENGTH);
cache_entry->plane_count = plane_count;
simple_mtx_lock(&cache->mutex);
if (!_mesa_hash_table_search(cache->ht, cache_entry->key)) {
_mesa_hash_table_insert(dev->image_reqs_cache.ht, cache_entry->key,
cache_entry);
list_add(&cache_entry->head, &cache->lru);
}
_mesa_hash_table_insert(dev->image_reqs_cache.ht, cache_entry->key,
cache_entry);
list_add(&cache_entry->head, &cache->lru);
simple_mtx_unlock(&cache->mutex);
}

View file

@ -600,7 +600,7 @@ vn_queue_submission_add_query_feedback(struct vn_queue_submission *submit,
VkCommandBuffer *feedback_cmd_handle =
vn_get_feedback_cmd_handle(submit, feedback_cmds, cmd_count);
const uint32_t stride = submit->batch_type == VK_STRUCTURE_TYPE_SUBMIT_INFO
? sizeof(VkCommandBuffer *)
? sizeof(VkCommandBuffer)
: sizeof(VkCommandBufferSubmitInfo);
struct vn_feedback_cmd_pool *feedback_cmd_pool = NULL;
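The stride selects how far apart successive handles sit in the caller's array: packed VkCommandBuffer handles for VkSubmitInfo, or handles embedded in VkCommandBufferSubmitInfo for VkSubmitInfo2. The old sizeof(VkCommandBuffer *) only worked because dispatchable handles happen to be pointer-sized; the element type itself is what the walk steps over. A sketch of the access pattern (hedged, simplified from vn_get_feedback_cmd_handle()):
    VkCommandBuffer *handle =
       (VkCommandBuffer *)((uint8_t *)feedback_cmds + cmd_count * stride);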

View file

@ -174,6 +174,12 @@ struct vk_command_buffer {
struct vk_framebuffer *framebuffer;
VkRect2D render_area;
/**
* True if we are currently inside a CmdPipelineBarrier() inserted by
* the runtime's vk_render_pass.c
*/
bool runtime_rp_barrier;
/* This uses the same trick as STACK_ARRAY */
struct vk_attachment_state *attachments;
struct vk_attachment_state _attachments[8];

View file

@ -1392,13 +1392,40 @@ can_use_attachment_initial_layout(struct vk_command_buffer *cmd_buffer,
return true;
}
static void
set_attachment_layout(struct vk_command_buffer *cmd_buffer,
uint32_t att_idx,
uint32_t view_mask,
VkImageLayout layout,
VkImageLayout stencil_layout)
uint32_t
vk_command_buffer_get_attachment_layout(const struct vk_command_buffer *cmd_buffer,
const struct vk_image *image,
VkImageLayout *out_layout,
VkImageLayout *out_stencil_layout)
{
const struct vk_render_pass *render_pass = cmd_buffer->render_pass;
assert(render_pass != NULL);
const struct vk_subpass *subpass =
&render_pass->subpasses[cmd_buffer->subpass_idx];
int first_view = ffs(subpass->view_mask) - 1;
for (uint32_t a = 0; a < render_pass->attachment_count; a++) {
if (cmd_buffer->attachments[a].image_view->image == image) {
*out_layout = cmd_buffer->attachments[a].views[first_view].layout;
*out_stencil_layout =
cmd_buffer->attachments[a].views[first_view].stencil_layout;
return a;
}
}
unreachable("Image not found in attachments");
}
void
vk_command_buffer_set_attachment_layout(struct vk_command_buffer *cmd_buffer,
uint32_t att_idx,
VkImageLayout layout,
VkImageLayout stencil_layout)
{
const struct vk_render_pass *render_pass = cmd_buffer->render_pass;
const struct vk_subpass *subpass =
&render_pass->subpasses[cmd_buffer->subpass_idx];
uint32_t view_mask = subpass->view_mask;
struct vk_attachment_state *att_state = &cmd_buffer->attachments[att_idx];
u_foreach_bit(view, view_mask) {
@ -1650,9 +1677,10 @@ begin_subpass(struct vk_command_buffer *cmd_buffer,
};
__vk_append_struct(color_attachment, color_initial_layout);
set_attachment_layout(cmd_buffer, sp_att->attachment,
subpass->view_mask,
sp_att->layout, VK_IMAGE_LAYOUT_UNDEFINED);
vk_command_buffer_set_attachment_layout(cmd_buffer,
sp_att->attachment,
sp_att->layout,
VK_IMAGE_LAYOUT_UNDEFINED);
}
} else {
/* We've seen at least one of the views of this attachment before so
@ -1770,9 +1798,10 @@ begin_subpass(struct vk_command_buffer *cmd_buffer,
&stencil_initial_layout);
}
set_attachment_layout(cmd_buffer, sp_att->attachment,
subpass->view_mask,
sp_att->layout, sp_att->stencil_layout);
vk_command_buffer_set_attachment_layout(cmd_buffer,
sp_att->attachment,
sp_att->layout,
sp_att->stencil_layout);
}
} else {
/* We've seen at least one of the views of this attachment before so
@ -2048,8 +2077,10 @@ begin_subpass(struct vk_command_buffer *cmd_buffer,
.pImageMemoryBarriers = image_barrier_count > 0 ?
image_barriers : NULL,
};
cmd_buffer->runtime_rp_barrier = true;
disp->CmdPipelineBarrier2(vk_command_buffer_to_handle(cmd_buffer),
&dependency_info);
cmd_buffer->runtime_rp_barrier = false;
}
STACK_ARRAY_FINISH(image_barriers);
@ -2227,8 +2258,10 @@ end_subpass(struct vk_command_buffer *cmd_buffer,
.memoryBarrierCount = 1,
.pMemoryBarriers = &mem_barrier,
};
cmd_buffer->runtime_rp_barrier = true;
disp->CmdPipelineBarrier2(vk_command_buffer_to_handle(cmd_buffer),
&dependency_info);
cmd_buffer->runtime_rp_barrier = false;
}
}
@ -2455,8 +2488,10 @@ vk_common_CmdEndRenderPass2(VkCommandBuffer commandBuffer,
.imageMemoryBarrierCount = image_barrier_count,
.pImageMemoryBarriers = image_barriers,
};
cmd_buffer->runtime_rp_barrier = true;
disp->CmdPipelineBarrier2(vk_command_buffer_to_handle(cmd_buffer),
&dependency_info);
cmd_buffer->runtime_rp_barrier = false;
}
STACK_ARRAY_FINISH(image_barriers);

View file

@ -29,6 +29,9 @@
extern "C" {
#endif
struct vk_command_buffer;
struct vk_image;
/**
* Pseudo-extension struct that may be chained into VkRenderingAttachmentInfo
* to indicate an initial layout for the attachment. This is only allowed if
@ -425,9 +428,9 @@ vk_subpass_dependency_is_fb_local(const VkSubpassDependency2 *dep,
VK_PIPELINE_STAGE_2_LATE_FRAGMENT_TESTS_BIT |
VK_PIPELINE_STAGE_2_COLOR_ATTACHMENT_OUTPUT_BIT;
const VkPipelineStageFlags2 src_framebuffer_space_stages =
const VkPipelineStageFlags2 src_framebuffer_space_stages =
framebuffer_space_stages | VK_PIPELINE_STAGE_2_TOP_OF_PIPE_BIT;
const VkPipelineStageFlags2 dst_framebuffer_space_stages =
const VkPipelineStageFlags2 dst_framebuffer_space_stages =
framebuffer_space_stages | VK_PIPELINE_STAGE_2_BOTTOM_OF_PIPE_BIT;
/* Check for framebuffer-space dependency. */
@ -439,6 +442,18 @@ vk_subpass_dependency_is_fb_local(const VkSubpassDependency2 *dep,
return dep->dependencyFlags & VK_DEPENDENCY_BY_REGION_BIT;
}
uint32_t
vk_command_buffer_get_attachment_layout(const struct vk_command_buffer *cmd_buffer,
const struct vk_image *image,
VkImageLayout *out_layout,
VkImageLayout *out_stencil_layout);
void
vk_command_buffer_set_attachment_layout(struct vk_command_buffer *cmd_buffer,
uint32_t att_idx,
VkImageLayout layout,
VkImageLayout stencil_layout);
#ifdef __cplusplus
}
#endif

View file

@ -1666,7 +1666,7 @@ x11_present_to_x11_sw(struct x11_swapchain *chain, uint32_t image_index,
chain->gc,
image->base.row_pitches[0] / 4,
chain->extent.height,
0,0,0,24,
0,0,0,chain->depth,
image->base.row_pitches[0] * chain->extent.height,
image->base.cpu_map);
xcb_discard_reply(chain->conn, cookie.sequence);
@ -1681,7 +1681,7 @@ x11_present_to_x11_sw(struct x11_swapchain *chain, uint32_t image_index,
chain->gc,
image->base.row_pitches[0] / 4,
this_lines,
0,y_start,0,24,
0,y_start,0,chain->depth,
this_lines * stride_b,
(const uint8_t *)myptr + (y_start * stride_b));
xcb_discard_reply(chain->conn, cookie.sequence);