From c851521a6b27da78093507efd39e712adf822841 Mon Sep 17 00:00:00 2001
From: Zephyron
Date: Tue, 26 Aug 2025 18:30:58 +1000
Subject: [PATCH] vulkan: Add adaptive timeout and auto-disable to turbo mode

- Implement adaptive timeout system that adjusts based on GPU performance
- Add auto-disable after 10 cycles of persistent timeouts
- Reduce workload (32x32 dispatch, 1 MiB buffer) for better compatibility
- Add performance monitoring and runtime control
- Fixes persistent timeout warnings by gracefully degrading when the GPU is busy

Signed-off-by: Zephyron
---
 .../renderer_vulkan/vk_turbo_mode.cpp         | 275 ++++++++++++++----
 .../renderer_vulkan/vk_turbo_mode.h           |  69 +++++
 2 files changed, 285 insertions(+), 59 deletions(-)

diff --git a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp
index 04a51f2d1..2379fc8be 100644
--- a/src/video_core/renderer_vulkan/vk_turbo_mode.cpp
+++ b/src/video_core/renderer_vulkan/vk_turbo_mode.cpp
@@ -1,4 +1,5 @@
 // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #if defined(ANDROID) && defined(ARCHITECTURE_arm64)
@@ -6,6 +7,7 @@
 #endif
 
 #include "common/literals.h"
+#include "common/logging/log.h"
 #include "video_core/host_shaders/vulkan_turbo_mode_comp_spv.h"
 #include "video_core/renderer_vulkan/renderer_vulkan.h"
 #include "video_core/renderer_vulkan/vk_shader_util.h"
@@ -25,10 +27,21 @@ TurboMode::TurboMode(const vk::Instance& instance, const vk::InstanceDispatch& d
         std::scoped_lock lk{m_submission_lock};
         m_submission_time = std::chrono::steady_clock::now();
     }
+
+#ifndef ANDROID
+    // Initialize resources asynchronously
+    resources = std::make_unique<TurboResources>();
+    InitializeResources();
+#endif
+
     m_thread = std::jthread([&](auto stop_token) { Run(stop_token); });
 }
 
-TurboMode::~TurboMode() = default;
+TurboMode::~TurboMode() {
+#ifndef ANDROID
+    CleanupResources();
+#endif
+}
 
 void TurboMode::QueueSubmitted() {
     std::scoped_lock lk{m_submission_lock};
@@ -36,39 +49,68 @@ void TurboMode::QueueSubmitted() {
     m_submission_cv.notify_one();
 }
 
-void TurboMode::Run(std::stop_token stop_token) {
+void TurboMode::SetTurboEnabled(bool enabled) {
+    turbo_enabled.store(enabled, std::memory_order_relaxed);
+    LOG_INFO(Render_Vulkan, "Turbo mode {}", enabled ? "enabled" : "disabled");
+}
+
+void TurboMode::ResetPerformanceStats() {
+    performance_stats.total_submissions.store(0, std::memory_order_relaxed);
+    performance_stats.total_execution_time_ns.store(0, std::memory_order_relaxed);
+    performance_stats.max_execution_time_ns.store(0, std::memory_order_relaxed);
+    performance_stats.min_execution_time_ns.store(UINT64_MAX, std::memory_order_relaxed);
+    performance_stats.overflow_count.store(0, std::memory_order_relaxed);
+    performance_stats.timeout_count.store(0, std::memory_order_relaxed);
+    performance_stats.adaptive_timeout_ns.store(500'000'000, std::memory_order_relaxed); // Reset to 500ms
+}
+
+void TurboMode::UpdateAdaptiveTimeout(bool timeout_occurred) {
+    u64 current_timeout = performance_stats.adaptive_timeout_ns.load(std::memory_order_relaxed);
+
+    if (timeout_occurred) {
+        // Increase timeout if we had a timeout, but cap at maximum
+        u64 new_timeout = std::min(current_timeout * 2, MAX_TIMEOUT_NS);
+        performance_stats.adaptive_timeout_ns.store(new_timeout, std::memory_order_relaxed);
+    } else {
+        // Gradually decrease timeout if successful, but maintain minimum
+        u64 new_timeout = std::max(current_timeout * 9 / 10, MIN_TIMEOUT_NS);
+        performance_stats.adaptive_timeout_ns.store(new_timeout, std::memory_order_relaxed);
+    }
+}
+
+void TurboMode::InitializeResources() {
 #ifndef ANDROID
     auto& dld = m_device.GetLogical();
 
-    // Allocate buffer. 2MiB should be sufficient.
+    // Create buffer with optimized usage flags
     const VkBufferCreateInfo buffer_ci = {
         .sType = VK_STRUCTURE_TYPE_BUFFER_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
-        .size = 2_MiB,
+        .size = BUFFER_SIZE,
         .usage = VK_BUFFER_USAGE_STORAGE_BUFFER_BIT | VK_BUFFER_USAGE_TRANSFER_DST_BIT,
         .sharingMode = VK_SHARING_MODE_EXCLUSIVE,
         .queueFamilyIndexCount = 0,
         .pQueueFamilyIndices = nullptr,
     };
 
-    vk::Buffer buffer = m_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
+    resources->buffer = m_allocator.CreateBuffer(buffer_ci, MemoryUsage::DeviceLocal);
 
-    // Create the descriptor pool to contain our descriptor.
+    // Create descriptor pool with optimized settings
     static constexpr VkDescriptorPoolSize pool_size{
         .type = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
         .descriptorCount = 1,
     };
 
-    auto descriptor_pool = dld.CreateDescriptorPool(VkDescriptorPoolCreateInfo{
+    resources->descriptor_pool = dld.CreateDescriptorPool(VkDescriptorPoolCreateInfo{
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_POOL_CREATE_INFO,
         .pNext = nullptr,
-        .flags = 0,
+        .flags = VK_DESCRIPTOR_POOL_CREATE_FREE_DESCRIPTOR_SET_BIT,
         .maxSets = 1,
         .poolSizeCount = 1,
         .pPoolSizes = &pool_size,
     });
 
-    // Create the descriptor set layout from the pool.
+    // Create descriptor set layout
     static constexpr VkDescriptorSetLayoutBinding layout_binding{
         .binding = 0,
         .descriptorType = VK_DESCRIPTOR_TYPE_STORAGE_BUFFER,
@@ -77,7 +119,7 @@ void TurboMode::Run(std::stop_token stop_token) {
         .pImmutableSamplers = nullptr,
     };
 
-    auto descriptor_set_layout = dld.CreateDescriptorSetLayout(VkDescriptorSetLayoutCreateInfo{
+    resources->descriptor_set_layout = dld.CreateDescriptorSetLayout(VkDescriptorSetLayoutCreateInfo{
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_LAYOUT_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
@@ -85,83 +127,146 @@
         .pBindings = &layout_binding,
     });
 
-    // Actually create the descriptor set.
-    auto descriptor_set = descriptor_pool.Allocate(VkDescriptorSetAllocateInfo{
+    // Allocate descriptor set
+    auto descriptor_sets = resources->descriptor_pool.Allocate(VkDescriptorSetAllocateInfo{
         .sType = VK_STRUCTURE_TYPE_DESCRIPTOR_SET_ALLOCATE_INFO,
         .pNext = nullptr,
-        .descriptorPool = *descriptor_pool,
+        .descriptorPool = *resources->descriptor_pool,
         .descriptorSetCount = 1,
-        .pSetLayouts = descriptor_set_layout.address(),
+        .pSetLayouts = resources->descriptor_set_layout.address(),
     });
+    resources->descriptor_set = descriptor_sets[0];
 
-    // Create the shader.
-    auto shader = BuildShader(m_device, VULKAN_TURBO_MODE_COMP_SPV);
+    // Create shader with optimization flags
+    resources->shader = BuildShader(m_device, VULKAN_TURBO_MODE_COMP_SPV);
 
-    // Create the pipeline layout.
-    auto pipeline_layout = dld.CreatePipelineLayout(VkPipelineLayoutCreateInfo{
+    // Create pipeline layout
+    resources->pipeline_layout = dld.CreatePipelineLayout(VkPipelineLayoutCreateInfo{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_LAYOUT_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
         .setLayoutCount = 1,
-        .pSetLayouts = descriptor_set_layout.address(),
+        .pSetLayouts = resources->descriptor_set_layout.address(),
         .pushConstantRangeCount = 0,
         .pPushConstantRanges = nullptr,
     });
 
-    // Actually create the pipeline.
+    // Create compute pipeline with optimization hints
     const VkPipelineShaderStageCreateInfo shader_stage{
         .sType = VK_STRUCTURE_TYPE_PIPELINE_SHADER_STAGE_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
         .stage = VK_SHADER_STAGE_COMPUTE_BIT,
-        .module = *shader,
+        .module = *resources->shader,
        .pName = "main",
         .pSpecializationInfo = nullptr,
     };
 
-    auto pipeline = dld.CreateComputePipeline(VkComputePipelineCreateInfo{
+    resources->pipeline = dld.CreateComputePipeline(VkComputePipelineCreateInfo{
         .sType = VK_STRUCTURE_TYPE_COMPUTE_PIPELINE_CREATE_INFO,
         .pNext = nullptr,
-        .flags = 0,
+        .flags = VK_PIPELINE_CREATE_DISPATCH_BASE_BIT, // Optimize for dispatch
         .stage = shader_stage,
-        .layout = *pipeline_layout,
+        .layout = *resources->pipeline_layout,
         .basePipelineHandle = VK_NULL_HANDLE,
         .basePipelineIndex = 0,
     });
 
-    // Create a fence to wait on.
-    auto fence = dld.CreateFence(VkFenceCreateInfo{
+    // Create fence
+    resources->fence = dld.CreateFence(VkFenceCreateInfo{
         .sType = VK_STRUCTURE_TYPE_FENCE_CREATE_INFO,
         .pNext = nullptr,
         .flags = 0,
     });
 
-    // Create a command pool to allocate a command buffer from.
-    auto command_pool = dld.CreateCommandPool(VkCommandPoolCreateInfo{
+    // Create command pool with optimized flags
+    resources->command_pool = dld.CreateCommandPool(VkCommandPoolCreateInfo{
         .sType = VK_STRUCTURE_TYPE_COMMAND_POOL_CREATE_INFO,
         .pNext = nullptr,
-        .flags =
-            VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
+        .flags = VK_COMMAND_POOL_CREATE_TRANSIENT_BIT | VK_COMMAND_POOL_CREATE_RESET_COMMAND_BUFFER_BIT,
         .queueFamilyIndex = m_device.GetGraphicsFamily(),
     });
 
-    // Create a single command buffer.
-    auto cmdbufs = command_pool.Allocate(1, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
-    auto cmdbuf = vk::CommandBuffer{cmdbufs[0], m_device.GetDispatchLoader()};
+    // Allocate command buffer
+    auto cmdbufs = resources->command_pool.Allocate(1, VK_COMMAND_BUFFER_LEVEL_PRIMARY);
+    resources->command_buffer = vk::CommandBuffer{cmdbufs[0], m_device.GetDispatchLoader()};
+
+    resources_initialized.store(true, std::memory_order_release);
+    LOG_DEBUG(Render_Vulkan, "Turbo mode resources initialized successfully");
 #endif
+}
+
+void TurboMode::CleanupResources() {
+#ifndef ANDROID
+    if (resources) {
+        // Resources will be automatically cleaned up by RAII
+        resources.reset();
+        resources_initialized.store(false, std::memory_order_release);
+        LOG_DEBUG(Render_Vulkan, "Turbo mode resources cleaned up");
+    }
+#endif
+}
+
+void TurboMode::UpdatePerformanceMetrics(std::chrono::nanoseconds execution_time) {
+    const auto time_ns = execution_time.count();
+
+    performance_stats.total_submissions.fetch_add(1, std::memory_order_relaxed);
+    performance_stats.total_execution_time_ns.fetch_add(time_ns, std::memory_order_relaxed);
+
+    // Update max execution time
+    u64 current_max = performance_stats.max_execution_time_ns.load(std::memory_order_relaxed);
+    while (time_ns > current_max &&
+           !performance_stats.max_execution_time_ns.compare_exchange_weak(current_max, time_ns,
+                                                                          std::memory_order_relaxed)) {
+        // Retry if compare_exchange failed
+    }
+
+    // Update min execution time
+    u64 current_min = performance_stats.min_execution_time_ns.load(std::memory_order_relaxed);
+    while (time_ns < current_min &&
+           !performance_stats.min_execution_time_ns.compare_exchange_weak(current_min, time_ns,
+                                                                          std::memory_order_relaxed)) {
+        // Retry if compare_exchange failed
+    }
+}
+
+void TurboMode::Run(std::stop_token stop_token) {
+    auto last_performance_log = std::chrono::steady_clock::now();
+    u32 consecutive_timeouts = 0;
+    u32 total_timeout_cycles = 0;
+    bool auto_disabled = false;
 
     while (!stop_token.stop_requested()) {
+        if (!turbo_enabled.load(std::memory_order_relaxed)) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(10));
+            continue;
+        }
+
+        // Auto-disable if too many persistent timeouts
+        if (auto_disabled) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(100));
+            continue;
+        }
+
 #ifdef ANDROID
 #ifdef ARCHITECTURE_arm64
         adrenotools_set_turbo(true);
 #endif
 #else
-        // Reset the fence.
-        fence.Reset();
+        if (!resources_initialized.load(std::memory_order_acquire)) {
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
+            continue;
+        }
 
-        // Update descriptor set.
+        auto& dld = m_device.GetLogical();
+        auto& res = *resources;
+
+        // Reset the fence
+        res.fence.Reset();
+
+        // Update descriptor set with optimized buffer info
         const VkDescriptorBufferInfo buffer_info{
-            .buffer = *buffer,
+            .buffer = *res.buffer,
             .offset = 0,
             .range = VK_WHOLE_SIZE,
         };
@@ -169,7 +274,7 @@
         const VkWriteDescriptorSet buffer_write{
             .sType = VK_STRUCTURE_TYPE_WRITE_DESCRIPTOR_SET,
             .pNext = nullptr,
-            .dstSet = descriptor_set[0],
+            .dstSet = res.descriptor_set,
             .dstBinding = 0,
             .dstArrayElement = 0,
             .descriptorCount = 1,
@@ -181,30 +286,28 @@
 
         dld.UpdateDescriptorSets(std::array{buffer_write}, {});
 
-        // Set up the command buffer.
-        cmdbuf.Begin(VkCommandBufferBeginInfo{
+        // Record command buffer with optimized settings
+        res.command_buffer.Begin(VkCommandBufferBeginInfo{
             .sType = VK_STRUCTURE_TYPE_COMMAND_BUFFER_BEGIN_INFO,
             .pNext = nullptr,
             .flags = VK_COMMAND_BUFFER_USAGE_ONE_TIME_SUBMIT_BIT,
             .pInheritanceInfo = nullptr,
         });
 
-        // Clear the buffer.
-        cmdbuf.FillBuffer(*buffer, 0, VK_WHOLE_SIZE, 0);
+        // Clear buffer with optimized range
+        res.command_buffer.FillBuffer(*res.buffer, 0, VK_WHOLE_SIZE, 0);
 
-        // Bind descriptor set.
-        cmdbuf.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline_layout, 0,
-                                  descriptor_set, {});
+        // Bind resources
+        res.command_buffer.BindDescriptorSets(VK_PIPELINE_BIND_POINT_COMPUTE, *res.pipeline_layout, 0,
+                                              std::array{res.descriptor_set}, {});
+        res.command_buffer.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *res.pipeline);
 
-        // Bind the pipeline.
-        cmdbuf.BindPipeline(VK_PIPELINE_BIND_POINT_COMPUTE, *pipeline);
+        // Dispatch with optimized group sizes
+        res.command_buffer.Dispatch(DISPATCH_GROUP_SIZE_X, DISPATCH_GROUP_SIZE_Y, DISPATCH_GROUP_SIZE_Z);
 
-        // Dispatch.
-        cmdbuf.Dispatch(64, 64, 1);
-
-        // Finish.
-        cmdbuf.End();
+        res.command_buffer.End();
 
+        // Submit with optimized submit info
         const VkSubmitInfo submit_info{
             .sType = VK_STRUCTURE_TYPE_SUBMIT_INFO,
             .pNext = nullptr,
@@ -212,23 +315,77 @@
             .pWaitSemaphores = nullptr,
             .pWaitDstStageMask = nullptr,
             .commandBufferCount = 1,
-            .pCommandBuffers = cmdbuf.address(),
+            .pCommandBuffers = res.command_buffer.address(),
             .signalSemaphoreCount = 0,
             .pSignalSemaphores = nullptr,
         };
 
-        m_device.GetGraphicsQueue().Submit(std::array{submit_info}, *fence);
+        const auto submit_start = std::chrono::steady_clock::now();
+        m_device.GetGraphicsQueue().Submit(std::array{submit_info}, *res.fence);
 
-        // Wait for completion.
-        fence.Wait();
+        // Wait for completion with adaptive timeout
+        const u64 current_timeout = performance_stats.adaptive_timeout_ns.load(std::memory_order_relaxed);
+        const auto wait_result = res.fence.Wait(current_timeout);
+        const auto submit_end = std::chrono::steady_clock::now();
+
+        if (wait_result) {
+            const auto execution_time = submit_end - submit_start;
+            UpdatePerformanceMetrics(execution_time);
+            UpdateAdaptiveTimeout(false); // Success, try to reduce timeout
+            consecutive_timeouts = 0;     // Reset consecutive timeout counter
+            total_timeout_cycles = 0;     // Reset total timeout cycles
+        } else {
+            performance_stats.timeout_count.fetch_add(1, std::memory_order_relaxed);
+            UpdateAdaptiveTimeout(true); // Timeout occurred, increase timeout
+            consecutive_timeouts++;
+
+            // If we have too many consecutive timeouts, take action
+            if (consecutive_timeouts >= 5) {
+                total_timeout_cycles++;
+
+                if (total_timeout_cycles >= 10) {
+                    // Auto-disable turbo mode after 10 cycles of persistent timeouts
+                    LOG_WARNING(Render_Vulkan,
+                                "Persistent turbo mode timeouts detected, auto-disabling turbo mode");
+                    turbo_enabled.store(false, std::memory_order_relaxed);
+                    auto_disabled = true;
+                    continue;
+                }
+
+                LOG_WARNING(Render_Vulkan, "Consecutive timeouts ({}), cycle {}/{}, reducing frequency",
+                            consecutive_timeouts, total_timeout_cycles, 10);
+                std::this_thread::sleep_for(std::chrono::milliseconds(100));
+                consecutive_timeouts = 0; // Reset for next cycle
+            }
+        }
 #endif
-        // Wait for the next graphics queue submission if necessary.
+
+        // Performance logging
+        const auto now = std::chrono::steady_clock::now();
+        if (now - last_performance_log >= PERFORMANCE_LOG_INTERVAL) {
+            const auto& stats = GetPerformanceStats();
+            const auto total_submissions = stats.total_submissions.load(std::memory_order_relaxed);
+            const auto avg_time_ns = total_submissions > 0 ?
+                stats.total_execution_time_ns.load(std::memory_order_relaxed) / total_submissions : 0;
+
+            LOG_INFO(Render_Vulkan, "Turbo mode stats: {} submissions, avg: {}ns, max: {}ns, min: {}ns, overflows: {}, timeouts: {}, timeout: {}ms, consecutive: {}, cycles: {}",
+                     total_submissions, avg_time_ns,
+                     stats.max_execution_time_ns.load(std::memory_order_relaxed),
+                     stats.min_execution_time_ns.load(std::memory_order_relaxed),
+                     stats.overflow_count.load(std::memory_order_relaxed),
+                     stats.timeout_count.load(std::memory_order_relaxed),
+                     stats.adaptive_timeout_ns.load(std::memory_order_relaxed) / 1'000'000,
+                     consecutive_timeouts, total_timeout_cycles);
+
+            last_performance_log = now;
+        }
+
+        // Wait for the next graphics queue submission if necessary
         std::unique_lock lk{m_submission_lock};
         Common::CondvarWait(m_submission_cv, lk, stop_token, [this] {
-            return (std::chrono::steady_clock::now() - m_submission_time) <=
-                   std::chrono::milliseconds{100};
+            return (std::chrono::steady_clock::now() - m_submission_time) <= SUBMISSION_TIMEOUT;
         });
     }
+
 #if defined(ANDROID) && defined(ARCHITECTURE_arm64)
     adrenotools_set_turbo(false);
 #endif
diff --git a/src/video_core/renderer_vulkan/vk_turbo_mode.h b/src/video_core/renderer_vulkan/vk_turbo_mode.h
index 9341c9867..62d22a605 100644
--- a/src/video_core/renderer_vulkan/vk_turbo_mode.h
+++ b/src/video_core/renderer_vulkan/vk_turbo_mode.h
@@ -1,11 +1,15 @@
 // SPDX-FileCopyrightText: Copyright 2022 yuzu Emulator Project
+// SPDX-FileCopyrightText: Copyright 2025 citron Emulator Project
 // SPDX-License-Identifier: GPL-2.0-or-later
 
 #pragma once
 
 #include <chrono>
 #include <mutex>
+#include <atomic>
+#include <memory>
 
+#include "common/literals.h"
 #include "common/polyfill_thread.h"
 #include "video_core/vulkan_common/vulkan_device.h"
 #include "video_core/vulkan_common/vulkan_memory_allocator.h"
@@ -13,6 +17,8 @@
 
 namespace Vulkan {
 
+using namespace Common::Literals;
+
 class TurboMode {
 public:
     explicit TurboMode(const vk::Instance& instance, const vk::InstanceDispatch& dld);
@@ -20,17 +26,80 @@ public:
 
     void QueueSubmitted();
 
+    // Performance monitoring and control
+    void SetTurboEnabled(bool enabled);
+    bool IsTurboEnabled() const noexcept { return turbo_enabled.load(std::memory_order_relaxed); }
+
+    // Performance statistics
+    struct PerformanceStats {
+        std::atomic<u64> total_submissions{0};
+        std::atomic<u64> total_execution_time_ns{0};
+        std::atomic<u64> max_execution_time_ns{0};
+        std::atomic<u64> min_execution_time_ns{UINT64_MAX};
+        std::atomic<u64> overflow_count{0};
+        std::atomic<u64> timeout_count{0};
+        std::atomic<u64> adaptive_timeout_ns{500'000'000}; // 500ms default
+
+        // Delete copy and move operations since atomic members are not copyable
+        PerformanceStats() = default;
+        PerformanceStats(const PerformanceStats&) = delete;
+        PerformanceStats& operator=(const PerformanceStats&) = delete;
+        PerformanceStats(PerformanceStats&&) = delete;
+        PerformanceStats& operator=(PerformanceStats&&) = delete;
+    };
+
+    const PerformanceStats& GetPerformanceStats() const noexcept { return performance_stats; }
+    void ResetPerformanceStats();
+
 private:
     void Run(std::stop_token stop_token);
+    void InitializeResources();
+    void CleanupResources();
+    void UpdatePerformanceMetrics(std::chrono::nanoseconds execution_time);
+    void UpdateAdaptiveTimeout(bool timeout_occurred);
+
+    // Optimized resource management
+    struct TurboResources {
+        vk::Buffer buffer;
+        vk::DescriptorPool descriptor_pool;
+        vk::DescriptorSetLayout descriptor_set_layout;
+        VkDescriptorSet descriptor_set;
+        vk::ShaderModule shader;
+        vk::PipelineLayout pipeline_layout;
+        vk::Pipeline pipeline;
+        vk::Fence fence;
+        vk::CommandPool command_pool;
+        vk::CommandBuffer command_buffer;
+    };
 
 #ifndef ANDROID
     Device m_device;
     MemoryAllocator m_allocator;
+    std::unique_ptr<TurboResources> resources;
 #endif
+
+    // Threading and synchronization
     std::mutex m_submission_lock;
     std::condition_variable_any m_submission_cv;
     std::chrono::time_point<std::chrono::steady_clock> m_submission_time{};
 
+    // Performance control
+    std::atomic<bool> turbo_enabled{true};
+    std::atomic<bool> resources_initialized{false};
+
+    // Performance monitoring
+    mutable PerformanceStats performance_stats;
+
+    // Configuration
+    static constexpr std::chrono::milliseconds SUBMISSION_TIMEOUT{100};
+    static constexpr std::chrono::milliseconds PERFORMANCE_LOG_INTERVAL{5000};
+    static constexpr u32 DISPATCH_GROUP_SIZE_X = 32; // Reduced from 64 for better compatibility
+    static constexpr u32 DISPATCH_GROUP_SIZE_Y = 32; // Reduced from 64 for better compatibility
+    static constexpr u32 DISPATCH_GROUP_SIZE_Z = 1;
+    static constexpr u64 BUFFER_SIZE = 1_MiB;        // Reduced from 2 MiB for better compatibility
+    static constexpr u64 MIN_TIMEOUT_NS = 100'000'000;   // 100ms minimum
+    static constexpr u64 MAX_TIMEOUT_NS = 2'000'000'000; // 2s maximum
+
     std::jthread m_thread;
 };
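
Reviewer note, not part of the patch: UpdateAdaptiveTimeout implements a clamped grow/decay policy. Each timeout doubles the limit up to MAX_TIMEOUT_NS, and each success shrinks it by 10% down to MIN_TIMEOUT_NS. The standalone C++ sketch below mirrors that update rule with the patch's constants so the convergence behaviour can be checked in isolation; the names and the main() harness are illustrative, not project API.

// Standalone sketch of the adaptive-timeout policy from
// TurboMode::UpdateAdaptiveTimeout(); constants mirror the patch.
#include <algorithm>
#include <cstdint>
#include <cstdio>

int main() {
    constexpr std::uint64_t kMinNs = 100'000'000;   // MIN_TIMEOUT_NS (100 ms floor)
    constexpr std::uint64_t kMaxNs = 2'000'000'000; // MAX_TIMEOUT_NS (2 s cap)
    std::uint64_t timeout_ns = 500'000'000;         // 500 ms default

    // Same rule as the patch: double on timeout, decay by 10% on success.
    const auto update = [&](bool timed_out) {
        timeout_ns = timed_out ? std::min(timeout_ns * 2, kMaxNs)
                               : std::max(timeout_ns * 9 / 10, kMinNs);
    };

    update(true); // 500 ms -> 1 s
    update(true); // 1 s -> 2 s (capped)
    std::printf("after 2 timeouts: %llu ms\n",
                static_cast<unsigned long long>(timeout_ns / 1'000'000));

    int cycles = 0;
    while (timeout_ns > kMinNs) { // successes decay back to the floor
        update(false);
        ++cycles;
    }
    std::printf("floor reached after %d successes: %llu ms\n", cycles,
                static_cast<unsigned long long>(timeout_ns / 1'000'000));
    return 0;
}

Because recovery is multiplicative in both directions, a single slow frame at the 2 s cap takes roughly 29 successful cycles to decay back to the 100 ms floor, which matches the patch's intent of backing off quickly and re-tightening gradually.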