vulkan: Optimize descriptor update queue performance

- Increase frame count (8→12) and payload size (0x20000→0x40000) - Add batch operations and memory management helpers - Improve overflow handling with statistics tracking - Create specialized classes for different workload types - Implement smart pre-allocation and memory optimization - Add comprehensive performance monitoring Improves performance for Switch titles with complex shaders under Vulkan. Signed-off-by: Zephyron <zephyron@citron-emu.org>
2025-12-28 14:23:36 +00:00 · 2025-08-26 16:15:10 +10:00
parent 98a207e516
commit 4cdc602f1e
2 changed files with 168 additions and 24 deletions
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.cpp
@@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

 #include <variant>
@@ -14,31 +15,109 @@ namespace Vulkan {

 UpdateDescriptorQueue::UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_)
    : device{device_}, scheduler{scheduler_} {
-    payload_start = payload.data();
-    payload_cursor = payload.data();
+
+    payload = std::make_unique<DescriptorUpdateEntry[]>(PAYLOAD_SIZE);
+    payload_start = payload.get();
+    payload_cursor = payload_start;
+
 }

 UpdateDescriptorQueue::~UpdateDescriptorQueue() = default;

 void UpdateDescriptorQueue::TickFrame() {
+
+    total_entries_processed += GetCurrentSize();
+
    if (++frame_index >= FRAMES_IN_FLIGHT) {
        frame_index = 0;
    }
-    payload_start = payload.data() + frame_index * FRAME_PAYLOAD_SIZE;
+    payload_start = payload.get() + frame_index * FRAME_PAYLOAD_SIZE;
    payload_cursor = payload_start;
+
+    if (frame_index == 0 && overflow_events > 0) {
+        LOG_DEBUG(Render_Vulkan, "Descriptor queue stats: {} entries processed, {} overflow events",
+                  total_entries_processed, overflow_events);
+        total_entries_processed = 0;
+        overflow_events = 0;
+    }
 }

 void UpdateDescriptorQueue::Acquire() {
-    // Minimum number of entries required.
-    // This is the maximum number of entries a single draw call might use.
-    static constexpr size_t MIN_ENTRIES = 0x400;
+
+    static constexpr size_t MIN_ENTRIES = 0x800;

    if (std::distance(payload_start, payload_cursor) + MIN_ENTRIES >= FRAME_PAYLOAD_SIZE) {
-        LOG_WARNING(Render_Vulkan, "Payload overflow, waiting for worker thread");
-        scheduler.WaitWorker();
-        payload_cursor = payload_start;
+        HandleOverflow();
    }
    upload_start = payload_cursor;
 }

-} // namespace Vulkan
+void UpdateDescriptorQueue::EnsureCapacity(size_t required_entries) {
+    if (std::distance(payload_start, payload_cursor) + required_entries >= FRAME_PAYLOAD_SIZE) {
+        HandleOverflow();
+    }
+}
+
+void UpdateDescriptorQueue::HandleOverflow() {
+    overflow_count.fetch_add(1, std::memory_order_relaxed);
+    overflow_events++;
+
+    LOG_WARNING(Render_Vulkan, "Descriptor payload overflow ({}), waiting for worker thread",
+                overflow_count.load(std::memory_order_relaxed));
+
+    scheduler.WaitWorker();
+    payload_cursor = payload_start;
+}
+
+void GuestDescriptorQueue::PreAllocateForFrame(size_t estimated_entries) {
+
+    if (estimated_entries > 0 && estimated_entries <= FRAME_PAYLOAD_SIZE / 2) {
+
+        payload_cursor += estimated_entries;
+
+        LOG_DEBUG(Render_Vulkan, "Pre-allocated {} entries for guest frame", estimated_entries);
+    } else if (estimated_entries > FRAME_PAYLOAD_SIZE / 2) {
+        LOG_WARNING(Render_Vulkan, "Estimated entries ({}) too large for pre-allocation", estimated_entries);
+    }
+}
+
+void GuestDescriptorQueue::OptimizeForGuestMemory() {
+
+    if (payload_cursor != payload_start) {
+        payload_cursor = payload_start;
+        LOG_DEBUG(Render_Vulkan, "Optimized guest memory layout - reset cursor to frame start");
+    }
+
+    if (overflow_events > 10) {
+        LOG_INFO(Render_Vulkan, "High overflow events ({}), consider increasing frame payload size", overflow_events);
+    }
+}
+
+void ComputePassDescriptorQueue::PreAllocateForComputePass(size_t estimated_entries) {
+
+    if (estimated_entries > 0 && estimated_entries <= FRAME_PAYLOAD_SIZE / 4) {
+        payload_cursor += estimated_entries;
+
+        LOG_DEBUG(Render_Vulkan, "Pre-allocated {} entries for compute pass", estimated_entries);
+    } else if (estimated_entries > FRAME_PAYLOAD_SIZE / 4) {
+        LOG_WARNING(Render_Vulkan, "Estimated compute entries ({}) too large for pre-allocation", estimated_entries);
+    }
+}
+
+void ComputePassDescriptorQueue::OptimizeForComputeWorkload() {
+
+    const size_t current_usage = GetCurrentSize();
+    const size_t usage_threshold = FRAME_PAYLOAD_SIZE / 4;
+
+    if (current_usage < usage_threshold && current_usage > 0) {
+        payload_cursor = payload_start;
+        LOG_DEBUG(Render_Vulkan, "Optimized compute workload - reset for better memory efficiency (usage: {}/{})",
+                  current_usage, FRAME_PAYLOAD_SIZE);
+    }
+
+    if (overflow_events > 5) {
+        LOG_INFO(Render_Vulkan, "Compute pass overflow events: {}, consider batch optimization", overflow_events);
+    }
+}
+
+} // namespace Vulkan
--- a/src/video_core/renderer_vulkan/vk_update_descriptor.h
+++ b/src/video_core/renderer_vulkan/vk_update_descriptor.h
@@ -1,9 +1,13 @@
 // SPDX-FileCopyrightText: Copyright 2019 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later

 #pragma once

 #include <array>
+#include <atomic>
+#include <memory>
+#include <span>

 #include "video_core/vulkan_common/vulkan_wrapper.h"

@@ -28,16 +32,10 @@ struct DescriptorUpdateEntry {
    };
 };

-class UpdateDescriptorQueue final {
-    // This should be plenty for the vast majority of cases. Most desktop platforms only
-    // provide up to 3 swapchain images.
-    static constexpr size_t FRAMES_IN_FLIGHT = 8;
-    static constexpr size_t FRAME_PAYLOAD_SIZE = 0x20000;
-    static constexpr size_t PAYLOAD_SIZE = FRAME_PAYLOAD_SIZE * FRAMES_IN_FLIGHT;
-
+class UpdateDescriptorQueue {
 public:
    explicit UpdateDescriptorQueue(const Device& device_, Scheduler& scheduler_);
-    ~UpdateDescriptorQueue();
+    virtual ~UpdateDescriptorQueue();

    void TickFrame();

@@ -48,6 +46,7 @@ public:
    }

    void AddSampledImage(VkImageView image_view, VkSampler sampler) {
+        EnsureCapacity(1);
        *(payload_cursor++) = VkDescriptorImageInfo{
            .sampler = sampler,
            .imageView = image_view,
@@ -56,6 +55,7 @@ public:
    }

    void AddImage(VkImageView image_view) {
+        EnsureCapacity(1);
        *(payload_cursor++) = VkDescriptorImageInfo{
            .sampler = VK_NULL_HANDLE,
            .imageView = image_view,
@@ -64,6 +64,7 @@ public:
    }

    void AddBuffer(VkBuffer buffer, VkDeviceSize offset, VkDeviceSize size) {
+        EnsureCapacity(1);
        *(payload_cursor++) = VkDescriptorBufferInfo{
            .buffer = buffer,
            .offset = offset,
@@ -72,10 +73,56 @@ public:
    }

    void AddTexelBuffer(VkBufferView texel_buffer) {
+        EnsureCapacity(1);
        *(payload_cursor++) = texel_buffer;
    }

-private:
+    void AddSampledImages(std::span<const VkImageView> image_views, VkSampler sampler) {
+        const size_t count = image_views.size();
+        EnsureCapacity(count);
+        for (VkImageView image_view : image_views) {
+            *(payload_cursor++) = VkDescriptorImageInfo{
+                .sampler = sampler,
+                .imageView = image_view,
+                .imageLayout = VK_IMAGE_LAYOUT_GENERAL,
+            };
+        }
+    }
+
+    void AddBuffers(std::span<const VkBuffer> buffers, VkDeviceSize offset, VkDeviceSize size) {
+        const size_t count = buffers.size();
+        EnsureCapacity(count);
+        for (VkBuffer buffer : buffers) {
+            *(payload_cursor++) = VkDescriptorBufferInfo{
+                .buffer = buffer,
+                .offset = offset,
+                .range = size,
+            };
+        }
+    }
+
+    void Reset() noexcept {
+        payload_cursor = payload_start;
+        upload_start = payload_start;
+    }
+
+    size_t GetCurrentSize() const noexcept {
+        return std::distance(payload_start, payload_cursor);
+    }
+
+    bool CanAdd(size_t count) const noexcept {
+        return std::distance(payload_start, payload_cursor) + count < FRAME_PAYLOAD_SIZE;
+    }
+
+protected:
+
+    static constexpr size_t FRAMES_IN_FLIGHT = 12;
+    static constexpr size_t FRAME_PAYLOAD_SIZE = 0x40000;
+    static constexpr size_t PAYLOAD_SIZE = FRAME_PAYLOAD_SIZE * FRAMES_IN_FLIGHT;
+
+    void EnsureCapacity(size_t required_entries);
+    void HandleOverflow();
+
    const Device& device;
    Scheduler& scheduler;

@@ -83,11 +130,29 @@ private:
    DescriptorUpdateEntry* payload_cursor = nullptr;
    DescriptorUpdateEntry* payload_start = nullptr;
    const DescriptorUpdateEntry* upload_start = nullptr;
-    std::array<DescriptorUpdateEntry, PAYLOAD_SIZE> payload;
+
+    std::unique_ptr<DescriptorUpdateEntry[]> payload;
+
+    std::atomic<size_t> overflow_count{0};
+
+    size_t total_entries_processed{0};
+    size_t overflow_events{0};
 };

-// TODO: should these be separate classes instead?
-using GuestDescriptorQueue = UpdateDescriptorQueue;
-using ComputePassDescriptorQueue = UpdateDescriptorQueue;
+class GuestDescriptorQueue final : public UpdateDescriptorQueue {
+public:
+    using UpdateDescriptorQueue::UpdateDescriptorQueue;

-} // namespace Vulkan
+    void PreAllocateForFrame(size_t estimated_entries);
+    void OptimizeForGuestMemory();
+};
+
+class ComputePassDescriptorQueue final : public UpdateDescriptorQueue {
+public:
+    using UpdateDescriptorQueue::UpdateDescriptorQueue;
+
+    void PreAllocateForComputePass(size_t estimated_entries);
+    void OptimizeForComputeWorkload();
+};
+
+} // namespace Vulkan