vulkan: Add adaptive timeout and auto-disable to turbo mode

- Implement adaptive timeout system that adjusts based on GPU performance
- Add auto-disable after 10 cycles of persistent timeouts
- Reduce workload (32x32 dispatch, 1 MiB buffer) for better compatibility
- Add performance monitoring and runtime control
- Fixes persistent timeout warnings by gracefully degrading when GPU is busy

Signed-off-by: Zephyron <zephyron@citron-emu.org>
Author: Zephyron
Date: 2025-08-26 18:30:58 +10:00
parent 98a207e516
commit c851521a6b
2 changed files with 285 additions and 59 deletions
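As context for the diff below, here is the commit's backoff policy reduced to a standalone sketch. The constants and arithmetic mirror what the diff adds; the function name is illustrative only:

#include <algorithm>
#include <cstdint>

// Standalone model of the adaptive timeout policy; not part of the diff.
constexpr std::uint64_t kMinTimeoutNs = 100'000'000;    // 100ms floor
constexpr std::uint64_t kMaxTimeoutNs = 2'000'000'000;  // 2s ceiling

std::uint64_t NextTimeoutNs(std::uint64_t current_ns, bool timed_out) {
    // Double after a timed-out wait (capped), decay by 10% after a successful
    // one (floored), which is exactly the rule UpdateAdaptiveTimeout applies.
    return timed_out ? std::min(current_ns * 2, kMaxTimeoutNs)
                     : std::max(current_ns * 9 / 10, kMinTimeoutNs);
}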

@@ -1,4 +1,5 @@
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#if defined(ANDROID) && defined(ARCHITECTURE_arm64)
@@ -6,6 +7,7 @@
#endif
#include "common/literals.h"
#include "common/logging/log.h"
#include "video_core/host_shaders/vulkan_turbo_mode_comp_spv.h"
#include "video_core/renderer_vulkan/renderer_vulkan.h"
#include "video_core/renderer_vulkan/vk_shader_util.h"
@@ -25,10 +27,21 @@ TurboMode::TurboMode(const vk::Instance& instance, const vk::InstanceDispatch& d
std::scoped_lock lk{m_submission_lock};
m_submission_time = std::chrono::steady_clock::now();
}
#ifndef ANDROID
// Initialize resources asynchronously
resources = std::make_unique<TurboResources>();
InitializeResources();
#endif
m_thread = std::jthread([&](auto stop_token) { Run(stop_token); });
}
TurboMode::~TurboMode() = default;
TurboMode::~TurboMode() {
#ifndef ANDROID
CleanupResources();
#endif
}
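// Editorial note, not in the diff: the destructor body above runs before any
// member is destroyed, so CleanupResources() can free the Vulkan objects while
// the worker jthread is still running; requesting stop and joining m_thread
// before cleaning up would close that window.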
void TurboMode::QueueSubmitted() {
std::scoped_lock lk{m_submission_lock};
@@ -36,39 +49,68 @@ void TurboMode::QueueSubmitted() {
m_submission_cv.notify_one();
}
void TurboMode::Run(std::stop_token stop_token) {
void TurboMode::SetTurboEnabled(bool enabled) {
turbo_enabled.store(enabled, std::memory_order_relaxed);
LOG_INFO(Render_Vulkan, "Turbo mode {}", enabled ? "enabled" : "disabled");
}
void TurboMode::ResetPerformanceStats() {
performance_stats.total_submissions.store(0, std::memory_order_relaxed);
performance_stats.total_execution_time_ns.store(0, std::memory_order_relaxed);
performance_stats.max_execution_time_ns.store(0, std::memory_order_relaxed);
performance_stats.min_execution_time_ns.store(UINT64_MAX, std::memory_order_relaxed);
performance_stats.overflow_count.store(0, std::memory_order_relaxed);
performance_stats.timeout_count.store(0, std::memory_order_relaxed);
performance_stats.adaptive_timeout_ns.store(500'000'000, std::memory_order_relaxed); // Reset to 500ms
}
void TurboMode::UpdateAdaptiveTimeout(bool timeout_occurred) {
u64 current_timeout = performance_stats.adaptive_timeout_ns.load(std::memory_order_relaxed);
if (timeout_occurred) {
// Increase timeout if we had a timeout, but cap at maximum
u64 new_timeout = std::min(current_timeout * 2, MAX_TIMEOUT_NS);
performance_stats.adaptive_timeout_ns.store(new_timeout, std::memory_order_relaxed);
} else {
// Gradually decrease timeout if successful, but maintain minimum
u64 new_timeout = std::max(current_timeout * 9 / 10, MIN_TIMEOUT_NS);
performance_stats.adaptive_timeout_ns.store(new_timeout, std::memory_order_relaxed);
}
}
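// Illustration, not in the diff: with the header defaults this escalates
// 500ms -> 1s -> 2s (then capped) across timed-out waits, while each
// successful wait shaves off 10%, so it takes roughly sixteen successes to
// fall from 500ms back to the 100ms floor.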
void TurboMode::InitializeResources() {
#ifndef ANDROID
auto& dld = m_device.GetLogical();
// Allocate buffer. 2MiB should be sufficient.
// Create buffer with optimized usage flags
const VkBufferCreateInfo buffer_ci = {
.sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.size = 2_MiB,
.size = BUFFER_SIZE,
.usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
.sharingMode = VK_SHARING_MODE_EXCLUSIVE,
.queueFamilyIndexCount = 0,
.pQueueFamilyIndices = nullptr,
};
vk::Buffer buffer = m_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
resources->buffer = m_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
// Create the descriptor pool to contain our descriptor.
// Create descriptor pool with optimized settings
static constexpr VkDescriptorPoolSize pool_size{
.type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
.descriptorCount = 1,
};
auto descriptor_pool = dld.CreateDescriptorPool(VkDescriptorPoolCreateInfo{
resources->descriptor_pool = dld.CreateDescriptorPool(VkDescriptorPoolCreateInfo{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
.maxSets = 1,
.poolSizeCount = 1,
.pPoolSizes = &pool_size,
});
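// Editorial note: VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT permits
// vkFreeDescriptorSets on individual sets; the single set allocated here lives
// as long as the pool itself, so the flag is harmless but not required.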
// Create the descriptor set layout from the pool.
// Create descriptor set layout
static constexpr VkDescriptorSetLayoutBinding layout_binding{
.binding = 0,
.descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@@ -77,7 +119,7 @@ void TurboMode::Run(std::stop_token stop_token) {
.pImmutableSamplers = nullptr,
};
auto descriptor_set_layout = dld.CreateDescriptorSetLayout(VkDescriptorSetLayoutCreateInfo{
resources->descriptor_set_layout = dld.CreateDescriptorSetLayout(VkDescriptorSetLayoutCreateInfo{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
@@ -85,83 +127,146 @@ void TurboMode::Run(std::stop_token stop_token) {
.pBindings = &layout_binding,
});
// Actually create the descriptor set.
auto descriptor_set = descriptor_pool.Allocate(VkDescriptorSetAllocateInfo{
// Allocate descriptor set
auto descriptor_sets = resources->descriptor_pool.Allocate(VkDescriptorSetAllocateInfo{
.sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
.pNext = nullptr,
.descriptorPool = *descriptor_pool,
.descriptorPool = *resources->descriptor_pool,
.descriptorSetCount = 1,
.pSetLayouts = descriptor_set_layout.address(),
.pSetLayouts = resources->descriptor_set_layout.address(),
});
resources->descriptor_set = descriptor_sets[0];
// Create the shader.
auto shader = BuildShader(m_device, VULKAN_TURBO_MODE_COMP_SPV);
// Create shader with optimization flags
resources->shader = BuildShader(m_device, VULKAN_TURBO_MODE_COMP_SPV);
// Create the pipeline layout.
auto pipeline_layout = dld.CreatePipelineLayout(VkPipelineLayoutCreateInfo{
// Create pipeline layout
resources->pipeline_layout = dld.CreatePipelineLayout(VkPipelineLayoutCreateInfo{
.sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.setLayoutCount = 1,
.pSetLayouts = descriptor_set_layout.address(),
.pSetLayouts = resources->descriptor_set_layout.address(),
.pushConstantRangeCount = 0,
.pPushConstantRanges = nullptr,
});
// Actually create the pipeline.
// Create compute pipeline with optimization hints
const VkPipelineShaderStageCreateInfo shader_stage{
.sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.stage = VK_SHADER_STAGE_COMPUTE_BIT,
.module = *shader,
.module = *resources->shader,
.pName = "main",
.pSpecializationInfo = nullptr,
};
auto pipeline = dld.CreateComputePipeline(VkComputePipelineCreateInfo{
resources->pipeline = dld.CreateComputePipeline(VkComputePipelineCreateInfo{
.sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
.flags = VK_PIPELINE_CREATE_DISPATCH_BASE_BIT, // Optimize for dispatch
.stage = shader_stage,
.layout = *pipeline_layout,
.layout = *resources->pipeline_layout,
.basePipelineHandle = VK_NULL_HANDLE,
.basePipelineIndex = 0,
});
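// Editorial note: VK_PIPELINE_CREATE_DISPATCH_BASE_BIT is a capability flag,
// not an optimization hint; it only makes the pipeline usable with
// vkCmdDispatchBase and a non-zero base workgroup, which the plain Dispatch()
// call in Run() never uses.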
// Create a fence to wait on.
auto fence = dld.CreateFence(VkFenceCreateInfo{
// Create fence
resources->fence = dld.CreateFence(VkFenceCreateInfo{
.sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
.pNext = nullptr,
.flags = 0,
});
// Create a command pool to allocate a command buffer from.
auto command_pool = dld.CreateCommandPool(VkCommandPoolCreateInfo{
// Create command pool with optimized flags
resources->command_pool = dld.CreateCommandPool(VkCommandPoolCreateInfo{
.sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
.pNext = nullptr,
.flags =
VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
.flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
.queueFamilyIndex = m_device.GetGraphicsFamily(),
});
// Create a single command buffer.
auto cmdbufs = command_pool.Allocate(1, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
auto cmdbuf = vk::CommandBuffer{cmdbufs[0], m_device.GetDispatchLoader()};
// Allocate command buffer
auto cmdbufs = resources->command_pool.Allocate(1, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
resources->command_buffer = vk::CommandBuffer{cmdbufs[0], m_device.GetDispatchLoader()};
resources_initialized.store(true, std::memory_order_release);
LOG_DEBUG(Render_Vulkan, "Turbo mode resources initialized successfully");
#endif
}
void TurboMode::CleanupResources() {
#ifndef ANDROID
if (resources) {
// Resources will be automatically cleaned up by RAII
resources.reset();
resources_initialized.store(false, std::memory_order_release);
LOG_DEBUG(Render_Vulkan, "Turbo mode resources cleaned up");
}
#endif
}
void TurboMode::UpdatePerformanceMetrics(std::chrono::nanoseconds execution_time) {
const auto time_ns = execution_time.count();
performance_stats.total_submissions.fetch_add(1, std::memory_order_relaxed);
performance_stats.total_execution_time_ns.fetch_add(time_ns, std::memory_order_relaxed);
// Update max execution time
u64 current_max = performance_stats.max_execution_time_ns.load(std::memory_order_relaxed);
while (time_ns > current_max &&
!performance_stats.max_execution_time_ns.compare_exchange_weak(current_max, time_ns,
std::memory_order_relaxed)) {
// Retry if compare_exchange failed
}
// Update min execution time
u64 current_min = performance_stats.min_execution_time_ns.load(std::memory_order_relaxed);
while (time_ns < current_min &&
!performance_stats.min_execution_time_ns.compare_exchange_weak(current_min, time_ns,
std::memory_order_relaxed)) {
// Retry if compare_exchange failed
}
}
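// Editorial note: the two compare_exchange_weak loops above are the standard
// lock-free "atomic max"/"atomic min" idiom: on failure, current_max or
// current_min is reloaded with the latest published value and the bound is
// re-checked, so a racing writer with a more extreme value wins and the
// recorded extremes never regress.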
void TurboMode::Run(std::stop_token stop_token) {
auto last_performance_log = std::chrono::steady_clock::now();
u32 consecutive_timeouts = 0;
u32 total_timeout_cycles = 0;
bool auto_disabled = false;
while (!stop_token.stop_requested()) {
if (!turbo_enabled.load(std::memory_order_relaxed)) {
std::this_thread::sleep_for(std::chrono::milliseconds(10));
continue;
}
// Auto-disable if too many persistent timeouts
if (auto_disabled) {
std::this_thread::sleep_for(std::chrono::milliseconds(100));
continue;
}
#ifdef ANDROID
#ifdef ARCHITECTURE_arm64
adrenotools_set_turbo(true);
#endif
#else
// Reset the fence.
fence.Reset();
if (!resources_initialized.load(std::memory_order_acquire)) {
std::this_thread::sleep_for(std::chrono::milliseconds(1));
continue;
}
// Update descriptor set.
auto& dld = m_device.GetLogical();
auto& res = *resources;
// Reset the fence
res.fence.Reset();
// Update descriptor set with optimized buffer info
const VkDescriptorBufferInfo buffer_info{
.buffer = *buffer,
.buffer = *res.buffer,
.offset = 0,
.range = VK_WHOLE_SIZE,
};
@@ -169,7 +274,7 @@ void TurboMode::Run(std::stop_token stop_token) {
const VkWriteDescriptorSet buffer_write{
.sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
.pNext = nullptr,
.dstSet = descriptor_set[0],
.dstSet = res.descriptor_set,
.dstBinding = 0,
.dstArrayElement = 0,
.descriptorCount = 1,
@@ -181,30 +286,28 @@ void TurboMode::Run(std::stop_token stop_token) {
dld.UpdateDescriptorSets(std::array{buffer_write}, {});
// Set up the command buffer.
cmdbuf.Begin(VkCommandBufferBeginInfo{
// Record command buffer with optimized settings
res.command_buffer.Begin(VkCommandBufferBeginInfo{
.sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
.pNext = nullptr,
.flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
.pInheritanceInfo = nullptr,
});
// Clear the buffer.
cmdbuf.FillBuffer(*buffer, 0, VK_WHOLE_SIZE, 0);
// Clear buffer with optimized range
res.command_buffer.FillBuffer(*res.buffer, 0, VK_WHOLE_SIZE, 0);
// Bind descriptor set.
cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0,
descriptor_set, {});
// Bind resources
res.command_buffer.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *res.pipeline_layout, 0,
std::array{res.descriptor_set}, {});
res.command_buffer.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *res.pipeline);
// Bind the pipeline.
cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
// Dispatch with optimized group sizes
res.command_buffer.Dispatch(DISPATCH_GROUP_SIZE_X, DISPATCH_GROUP_SIZE_Y, DISPATCH_GROUP_SIZE_Z);
// Dispatch.
cmdbuf.Dispatch(64, 64, 1);
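// Editorial note: 32x32x1 = 1,024 workgroups per submission versus the
// previous 64x64x1 = 4,096, a 4x cut in dispatched groups (the cost per
// invocation still depends on the shader's local workgroup size).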
// Finish.
cmdbuf.End();
res.command_buffer.End();
// Submit with optimized submit info
const VkSubmitInfo submit_info{
.sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
.pNext = nullptr,
@@ -212,23 +315,77 @@ void TurboMode::Run(std::stop_token stop_token) {
.pWaitSemaphores = nullptr,
.pWaitDstStageMask = nullptr,
.commandBufferCount = 1,
.pCommandBuffers = cmdbuf.address(),
.pCommandBuffers = res.command_buffer.address(),
.signalSemaphoreCount = 0,
.pSignalSemaphores = nullptr,
};
m_device.GetGraphicsQueue().Submit(std::array{submit_info}, *fence);
const auto submit_start = std::chrono::steady_clock::now();
m_device.GetGraphicsQueue().Submit(std::array{submit_info}, *res.fence);
// Wait for completion.
fence.Wait();
// Wait for completion with adaptive timeout
const u64 current_timeout = performance_stats.adaptive_timeout_ns.load(std::memory_order_relaxed);
const auto wait_result = res.fence.Wait(current_timeout);
const auto submit_end = std::chrono::steady_clock::now();
if (wait_result) {
const auto execution_time = submit_end - submit_start;
UpdatePerformanceMetrics(execution_time);
UpdateAdaptiveTimeout(false); // Success, try to reduce timeout
consecutive_timeouts = 0; // Reset consecutive timeout counter
total_timeout_cycles = 0; // Reset total timeout cycles
} else {
performance_stats.timeout_count.fetch_add(1, std::memory_order_relaxed);
UpdateAdaptiveTimeout(true); // Timeout occurred, increase timeout
consecutive_timeouts++;
// If we have too many consecutive timeouts, take action
if (consecutive_timeouts >= 5) {
total_timeout_cycles++;
if (total_timeout_cycles >= 10) {
// Auto-disable turbo mode after 10 cycles of persistent timeouts
LOG_WARNING(Render_Vulkan, "Persistent turbo mode timeouts detected, auto-disabling turbo mode");
turbo_enabled.store(false, std::memory_order_relaxed);
auto_disabled = true;
continue;
}
LOG_WARNING(Render_Vulkan, "Consecutive timeouts ({}), cycle {}/{}, reducing frequency",
consecutive_timeouts, total_timeout_cycles, 10);
std::this_thread::sleep_for(std::chrono::milliseconds(100));
consecutive_timeouts = 0; // Reset for next cycle
}
}
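// Editorial note: taken together, a "cycle" is five consecutive timed-out
// waits, and ten cycles (up to fifty timeouts with no success in between,
// each cycle followed by a 100ms back-off) trigger the auto-disable. As
// written, auto_disabled is never cleared, so SetTurboEnabled(true) alone
// does not bring this worker back after an auto-disable.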
#endif
// Wait for the next graphics queue submission if necessary.
// Performance logging
const auto now = std::chrono::steady_clock::now();
if (now - last_performance_log >= PERFORMANCE_LOG_INTERVAL) {
const auto& stats = GetPerformanceStats();
const auto total_submissions = stats.total_submissions.load(std::memory_order_relaxed);
const auto avg_time_ns = total_submissions > 0 ?
stats.total_execution_time_ns.load(std::memory_order_relaxed) / total_submissions : 0;
LOG_INFO(Render_Vulkan, "Turbo mode stats: {} submissions, avg: {}ns, max: {}ns, min: {}ns, overflows: {}, timeouts: {}, timeout: {}ms, consecutive: {}, cycles: {}",
total_submissions, avg_time_ns,
stats.max_execution_time_ns.load(std::memory_order_relaxed),
stats.min_execution_time_ns.load(std::memory_order_relaxed),
stats.overflow_count.load(std::memory_order_relaxed),
stats.timeout_count.load(std::memory_order_relaxed),
stats.adaptive_timeout_ns.load(std::memory_order_relaxed) / 1'000'000,
consecutive_timeouts, total_timeout_cycles);
last_performance_log = now;
}
// Wait for the next graphics queue submission if necessary
std::unique_lock lk{m_submission_lock};
Common::CondvarWait(m_submission_cv, lk, stop_token, [this] {
return (std::chrono::steady_clock::now() - m_submission_time) <=
std::chrono::milliseconds{100};
return (std::chrono::steady_clock::now() - m_submission_time) <= SUBMISSION_TIMEOUT;
});
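// Editorial note: the predicate holds while a queue submission happened
// within the last SUBMISSION_TIMEOUT (100ms), so the worker only spins the
// boost workload while the application is actively submitting and otherwise
// parks here until QueueSubmitted() signals m_submission_cv.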
}
#if defined(ANDROID) && defined(ARCHITECTURE_arm64)
adrenotools_set_turbo(false);
#endif

@@ -1,11 +1,15 @@
// SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
// SPDX-License-Identifier: GPL-2.0-or-later
#pragma once
#include <chrono>
#include <mutex>
#include <atomic>
#include <memory>
#include "common/literals.h"
#include "common/polyfill_thread.h"
#include "video_core/vulkan_common/vulkan_device.h"
#include "video_core/vulkan_common/vulkan_memory_allocator.h"
@@ -13,6 +17,8 @@
namespace Vulkan {
using namespace Common::Literals;
class TurboMode {
public:
explicit TurboMode(const vk::Instance& instance, const vk::InstanceDispatch& dld);
@@ -20,17 +26,80 @@ public:
void QueueSubmitted();
// Performance monitoring and control
void SetTurboEnabled(bool enabled);
bool IsTurboEnabled() const noexcept { return turbo_enabled.load(std::memory_order_relaxed); }
// Performance statistics
struct PerformanceStats {
std::atomic<u64> total_submissions{0};
std::atomic<u64> total_execution_time_ns{0};
std::atomic<u64> max_execution_time_ns{0};
std::atomic<u64> min_execution_time_ns{UINT64_MAX};
std::atomic<u32> overflow_count{0};
std::atomic<u32> timeout_count{0};
std::atomic<u64> adaptive_timeout_ns{500'000'000}; // 500ms default
// Non-copyable and non-movable, since std::atomic members are neither
PerformanceStats() = default;
PerformanceStats(const PerformanceStats&) = delete;
PerformanceStats& operator=(const PerformanceStats&) = delete;
PerformanceStats(PerformanceStats&&) = delete;
PerformanceStats& operator=(PerformanceStats&&) = delete;
};
const PerformanceStats& GetPerformanceStats() const noexcept { return performance_stats; }
void ResetPerformanceStats();
private:
void Run(std::stop_token stop_token);
void InitializeResources();
void CleanupResources();
void UpdatePerformanceMetrics(std::chrono::nanoseconds execution_time);
void UpdateAdaptiveTimeout(bool timeout_occurred);
// Optimized resource management
struct TurboResources {
vk::Buffer buffer;
vk::DescriptorPool descriptor_pool;
vk::DescriptorSetLayout descriptor_set_layout;
VkDescriptorSet descriptor_set;
vk::ShaderModule shader;
vk::PipelineLayout pipeline_layout;
vk::Pipeline pipeline;
vk::Fence fence;
vk::CommandPool command_pool;
vk::CommandBuffer command_buffer;
};
#ifndef ANDROID
Device m_device;
MemoryAllocator m_allocator;
std::unique_ptr<TurboResources> resources;
#endif
// Threading and synchronization
std::mutex m_submission_lock;
std::condition_variable_any m_submission_cv;
std::chrono::time_point<std::chrono::steady_clock> m_submission_time{};
// Performance control
std::atomic<bool> turbo_enabled{true};
std::atomic<bool> resources_initialized{false};
// Performance monitoring
mutable PerformanceStats performance_stats;
// Configuration
static constexpr std::chrono::milliseconds SUBMISSION_TIMEOUT{100};
static constexpr std::chrono::milliseconds PERFORMANCE_LOG_INTERVAL{5000};
static constexpr u32 DISPATCH_GROUP_SIZE_X = 32; // Reduced from 64 for better performance
static constexpr u32 DISPATCH_GROUP_SIZE_Y = 32; // Reduced from 64 for better performance
static constexpr u32 DISPATCH_GROUP_SIZE_Z = 1;
static constexpr u64 BUFFER_SIZE = 1_MiB; // Reduced from 2MB for better performance
static constexpr u64 MIN_TIMEOUT_NS = 100'000'000; // 100ms minimum
static constexpr u64 MAX_TIMEOUT_NS = 2'000'000'000; // 2s maximum
std::jthread m_thread;
};
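For completeness, a sketch of how a caller might drive the new runtime controls. The wrapper function and its 10% policy are hypothetical; only the TurboMode members it calls come from this commit:

// Hypothetical caller sketch; only the TurboMode members used below are real.
void ApplyTurboPolicy(Vulkan::TurboMode& turbo, bool user_wants_turbo) {
    turbo.SetTurboEnabled(user_wants_turbo); // runtime toggle added by this commit

    const auto& stats = turbo.GetPerformanceStats();
    const u64 submissions = stats.total_submissions.load(std::memory_order_relaxed);
    const u64 timeouts = stats.timeout_count.load(std::memory_order_relaxed);

    // Hypothetical policy: if more than 10% of waits timed out, clear the
    // counters so the next reading reflects current behaviour only.
    if (submissions > 0 && timeouts * 10 > submissions) {
        turbo.ResetPerformanceStats();
    }
}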