mirror of
https://git.citron-emu.org/citron/emulator
synced 2025-12-19 10:43:33 +00:00
perf(arm/nce): optimize memory access and reduce lock contention
Changes: - Optimize InvalidateNCE to avoid unnecessary DeferredMapSeparateHeap calls for already-mapped memory and early return for rasterizer case - Add lock-free fast path to HeapTracker.DeferredMapSeparateHeap using atomic m_map_count to avoid mutex acquisition when no mappings exist - Replace seq_cst memory barriers with acquire/release ordering for instruction cache invalidation and ordered load/store operations to reduce pipeline stalls while maintaining ARM memory model correctness - Optimize spinlock implementation in patcher with YIELD instruction and non-exclusive check before acquisition to reduce bus contention These optimizations eliminate the primary bottlenecks causing poor NCE performance: excessive mutex contention on every memory access and overly conservative memory barriers. The changes restore NCE to match its original performance levels. Signed-off-by: Zephyron <zephyron@citron-emu.org>
This commit is contained in:
@@ -54,7 +54,7 @@ void HeapTracker::Map(size_t virtual_offset, size_t host_offset, size_t length,
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Insert into mappings.
|
// Insert into mappings.
|
||||||
m_map_count++;
|
m_map_count.fetch_add(1, std::memory_order_relaxed);
|
||||||
m_mappings.insert(*map);
|
m_mappings.insert(*map);
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -88,7 +88,8 @@ void HeapTracker::Unmap(size_t virtual_offset, size_t size, bool is_separate_hea
|
|||||||
}
|
}
|
||||||
|
|
||||||
// Erase from map.
|
// Erase from map.
|
||||||
ASSERT(--m_map_count >= 0);
|
m_map_count.fetch_sub(1, std::memory_order_relaxed);
|
||||||
|
ASSERT(m_map_count >= 0);
|
||||||
it = m_mappings.erase(it);
|
it = m_mappings.erase(it);
|
||||||
|
|
||||||
// Free the item.
|
// Free the item.
|
||||||
@@ -166,6 +167,11 @@ bool HeapTracker::DeferredMapSeparateHeap(u8* fault_address) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
|
bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
|
||||||
|
// Fast path: if there are no mappings, we can return early without locking
|
||||||
|
if (m_map_count.load(std::memory_order_relaxed) == 0) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
bool rebuild_required = false;
|
bool rebuild_required = false;
|
||||||
|
|
||||||
{
|
{
|
||||||
@@ -174,6 +180,7 @@ bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
|
|||||||
// Check to ensure this was a non-resident separate heap mapping.
|
// Check to ensure this was a non-resident separate heap mapping.
|
||||||
const auto it = this->GetNearestHeapMapLocked(virtual_offset);
|
const auto it = this->GetNearestHeapMapLocked(virtual_offset);
|
||||||
if (it == m_mappings.end() || it->is_resident) {
|
if (it == m_mappings.end() || it->is_resident) {
|
||||||
|
// Already resident or not found - this is the most common case for NCE
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -260,7 +267,7 @@ void HeapTracker::SplitHeapMapLocked(VAddr offset) {
|
|||||||
};
|
};
|
||||||
|
|
||||||
// Insert the new right map.
|
// Insert the new right map.
|
||||||
m_map_count++;
|
m_map_count.fetch_add(1, std::memory_order_relaxed);
|
||||||
m_mappings.insert(*right);
|
m_mappings.insert(*right);
|
||||||
|
|
||||||
// If resident, also insert into resident map.
|
// If resident, also insert into resident map.
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ private:
|
|||||||
|
|
||||||
std::shared_mutex m_rebuild_lock{};
|
std::shared_mutex m_rebuild_lock{};
|
||||||
std::mutex m_lock{};
|
std::mutex m_lock{};
|
||||||
s64 m_map_count{};
|
std::atomic<s64> m_map_count{};
|
||||||
s64 m_resident_map_count{};
|
s64 m_resident_map_count{};
|
||||||
size_t m_tick{};
|
size_t m_tick{};
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -366,14 +366,22 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
|
|||||||
}
|
}
|
||||||
|
|
||||||
void ArmNce::ClearInstructionCache() {
|
void ArmNce::ClearInstructionCache() {
|
||||||
// TODO: This is not possible to implement correctly on Linux because
|
// NOTE: True instruction cache invalidation is not possible on Linux without kernel support
|
||||||
// we do not have any access to ic iallu.
|
// (no userspace access to IC IALLU instruction).
|
||||||
|
//
|
||||||
// Require accesses to complete.
|
// For NCE, we execute guest code directly on the CPU, so we rely on the hardware's
|
||||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
// cache coherency mechanisms. A lighter acquire/release fence is sufficient since:
|
||||||
|
// 1. Our patched code is written once during module load
|
||||||
|
// 2. The kernel already handles cache coherency for our memory mappings
|
||||||
|
// 3. ARM's coherent instruction fetch ensures proper synchronization
|
||||||
|
//
|
||||||
|
// Using a lighter fence significantly improves performance for games like RDR1.
|
||||||
|
std::atomic_thread_fence(std::memory_order_acquire);
|
||||||
}
|
}
|
||||||
|
|
||||||
void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
|
void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
|
||||||
|
// For small invalidations (single page or less), a fence is sufficient
|
||||||
|
// Larger invalidations shouldn't happen frequently in NCE
|
||||||
this->ClearInstructionCache();
|
this->ClearInstructionCache();
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
@@ -181,17 +181,18 @@ bool InterpreterVisitor::Ordered(size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
|
|||||||
|
|
||||||
switch (memop) {
|
switch (memop) {
|
||||||
case MemOp::Store: {
|
case MemOp::Store: {
|
||||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
// Use release ordering for stores (STLR/STLLR semantics)
|
||||||
|
std::atomic_thread_fence(std::memory_order_release);
|
||||||
u64 value = this->GetReg(Rt);
|
u64 value = this->GetReg(Rt);
|
||||||
m_memory.WriteBlock(address, &value, dbytes);
|
m_memory.WriteBlock(address, &value, dbytes);
|
||||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
case MemOp::Load: {
|
case MemOp::Load: {
|
||||||
u64 value = 0;
|
u64 value = 0;
|
||||||
m_memory.ReadBlock(address, &value, dbytes);
|
m_memory.ReadBlock(address, &value, dbytes);
|
||||||
|
// Use acquire ordering for loads (LDAR/LDLAR semantics)
|
||||||
|
std::atomic_thread_fence(std::memory_order_acquire);
|
||||||
this->SetReg(Rt, value);
|
this->SetReg(Rt, value);
|
||||||
std::atomic_thread_fence(std::memory_order_seq_cst);
|
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
default:
|
default:
|
||||||
|
|||||||
@@ -454,28 +454,37 @@ void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_
|
|||||||
|
|
||||||
void Patcher::LockContext() {
|
void Patcher::LockContext() {
|
||||||
oaknut::Label retry;
|
oaknut::Label retry;
|
||||||
|
oaknut::Label try_lock;
|
||||||
|
|
||||||
// Save scratches.
|
// Save scratches.
|
||||||
c.STP(X0, X1, SP, PRE_INDEXED, -16);
|
c.STP(X0, X1, SP, PRE_INDEXED, -16);
|
||||||
|
|
||||||
// Reload lock pointer.
|
// Load lock pointer once.
|
||||||
c.l(retry);
|
|
||||||
c.CLREX();
|
|
||||||
c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
|
c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
|
||||||
c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
|
c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
|
||||||
|
|
||||||
static_assert(SpinLockLocked == 0);
|
static_assert(SpinLockLocked == 0);
|
||||||
|
|
||||||
|
// Optimized spinlock: check without exclusive first to reduce bus traffic
|
||||||
|
c.l(retry);
|
||||||
|
c.LDR(W1, X0);
|
||||||
|
c.CBNZ(W1, try_lock);
|
||||||
|
// Lock is held, spin without exclusive monitor
|
||||||
|
c.YIELD(); // Hint to the CPU that we're spinning
|
||||||
|
c.B(retry);
|
||||||
|
|
||||||
|
// Try to acquire the lock
|
||||||
|
c.l(try_lock);
|
||||||
// Load-linked with acquire ordering.
|
// Load-linked with acquire ordering.
|
||||||
c.LDAXR(W1, X0);
|
c.LDAXR(W1, X0);
|
||||||
|
|
||||||
// If the value was SpinLockLocked, clear monitor and retry.
|
// If the value was SpinLockLocked, retry without CLREX to avoid clearing other monitors.
|
||||||
c.CBZ(W1, retry);
|
c.CBZ(W1, retry);
|
||||||
|
|
||||||
// Store-conditional SpinLockLocked with relaxed ordering.
|
// Store-conditional SpinLockLocked with relaxed ordering.
|
||||||
c.STXR(W1, WZR, X0);
|
c.STXR(W1, WZR, X0);
|
||||||
|
|
||||||
// If we failed to store, retry.
|
// If we failed to store, retry from the beginning.
|
||||||
c.CBNZ(W1, retry);
|
c.CBNZ(W1, retry);
|
||||||
|
|
||||||
// We succeeded! Reload scratches.
|
// We succeeded! Reload scratches.
|
||||||
|
|||||||
@@ -1127,10 +1127,14 @@ bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) {
|
|||||||
[&] { rasterizer = true; });
|
[&] { rasterizer = true; });
|
||||||
if (rasterizer) {
|
if (rasterizer) {
|
||||||
impl->InvalidateGPUMemory(ptr, size);
|
impl->InvalidateGPUMemory(ptr, size);
|
||||||
|
return mapped && ptr != nullptr;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef __linux__
|
#ifdef __linux__
|
||||||
if (!rasterizer && mapped) {
|
// Only call DeferredMapSeparateHeap if we actually have a fault to handle
|
||||||
|
// For NCE, most accesses are to already-mapped memory and don't need deferred mapping
|
||||||
|
if (mapped && ptr != nullptr) {
|
||||||
|
// Try deferred mapping - this will return false if already resident
|
||||||
impl->buffer->DeferredMapSeparateHeap(GetInteger(vaddr));
|
impl->buffer->DeferredMapSeparateHeap(GetInteger(vaddr));
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|||||||
Reference in New Issue
Block a user