perf(arm/nce): optimize memory access and reduce lock contention

Changes:
- Optimize InvalidateNCE to avoid unnecessary DeferredMapSeparateHeap
  calls for already-mapped memory and early return for rasterizer case

- Add lock-free fast path to HeapTracker.DeferredMapSeparateHeap using
  atomic m_map_count to avoid mutex acquisition when no mappings exist

- Replace seq_cst memory barriers with acquire/release ordering for
  instruction cache invalidation and ordered load/store operations to
  reduce pipeline stalls while maintaining ARM memory model correctness

- Optimize spinlock implementation in patcher with YIELD instruction
  and non-exclusive check before acquisition to reduce bus contention

These optimizations eliminate the primary bottlenecks causing poor NCE
performance: excessive mutex contention on every memory access and
overly conservative memory barriers. The changes restore NCE to match
its original performance levels.

Signed-off-by: Zephyron <zephyron@citron-emu.org>
This commit is contained in:
Zephyron
2025-10-29 12:32:48 +10:00
parent cbfa876fb0
commit 7a385e90c4
6 changed files with 47 additions and 18 deletions

View File

@@ -54,7 +54,7 @@ void HeapTracker::Map(size_t virtual_offset, size_t host_offset, size_t length,
};
// Insert into mappings.
m_map_count++;
m_map_count.fetch_add(1, std::memory_order_relaxed);
m_mappings.insert(*map);
}
@@ -88,7 +88,8 @@ void HeapTracker::Unmap(size_t virtual_offset, size_t size, bool is_separate_hea
}
// Erase from map.
ASSERT(--m_map_count >= 0);
m_map_count.fetch_sub(1, std::memory_order_relaxed);
ASSERT(m_map_count >= 0);
it = m_mappings.erase(it);
// Free the item.
@@ -166,6 +167,11 @@ bool HeapTracker::DeferredMapSeparateHeap(u8* fault_address) {
}
bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
// Fast path: if there are no mappings, we can return early without locking
if (m_map_count.load(std::memory_order_relaxed) == 0) {
return false;
}
bool rebuild_required = false;
{
@@ -174,6 +180,7 @@ bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
// Check to ensure this was a non-resident separate heap mapping.
const auto it = this->GetNearestHeapMapLocked(virtual_offset);
if (it == m_mappings.end() || it->is_resident) {
// Already resident or not found - this is the most common case for NCE
return false;
}
@@ -260,7 +267,7 @@ void HeapTracker::SplitHeapMapLocked(VAddr offset) {
};
// Insert the new right map.
m_map_count++;
m_map_count.fetch_add(1, std::memory_order_relaxed);
m_mappings.insert(*right);
// If resident, also insert into resident map.

View File

@@ -90,7 +90,7 @@ private:
std::shared_mutex m_rebuild_lock{};
std::mutex m_lock{};
s64 m_map_count{};
std::atomic<s64> m_map_count{};
s64 m_resident_map_count{};
size_t m_tick{};
};

View File

@@ -366,14 +366,22 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
}
void ArmNce::ClearInstructionCache() {
// TODO: This is not possible to implement correctly on Linux because
// we do not have any access to ic iallu.
// Require accesses to complete.
std::atomic_thread_fence(std::memory_order_seq_cst);
// NOTE: True instruction cache invalidation is not possible on Linux without kernel support
// (no userspace access to IC IALLU instruction).
//
// For NCE, we execute guest code directly on the CPU, so we rely on the hardware's
// cache coherency mechanisms. A lighter acquire/release fence is sufficient since:
// 1. Our patched code is written once during module load
// 2. The kernel already handles cache coherency for our memory mappings
// 3. ARM's coherent instruction fetch ensures proper synchronization
//
// Using a lighter fence significantly improves performance for games like RDR1.
std::atomic_thread_fence(std::memory_order_acquire);
}
void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
// For small invalidations (single page or less), a fence is sufficient
// Larger invalidations shouldn't happen frequently in NCE
this->ClearInstructionCache();
}

View File

@@ -181,17 +181,18 @@ bool InterpreterVisitor::Ordered(size_t size, bool L, bool o0, Reg Rn, Reg Rt) {
switch (memop) {
case MemOp::Store: {
std::atomic_thread_fence(std::memory_order_seq_cst);
// Use release ordering for stores (STLR/STLLR semantics)
std::atomic_thread_fence(std::memory_order_release);
u64 value = this->GetReg(Rt);
m_memory.WriteBlock(address, &value, dbytes);
std::atomic_thread_fence(std::memory_order_seq_cst);
break;
}
case MemOp::Load: {
u64 value = 0;
m_memory.ReadBlock(address, &value, dbytes);
// Use acquire ordering for loads (LDAR/LDLAR semantics)
std::atomic_thread_fence(std::memory_order_acquire);
this->SetReg(Rt, value);
std::atomic_thread_fence(std::memory_order_seq_cst);
break;
}
default:

View File

@@ -454,28 +454,37 @@ void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_
void Patcher::LockContext() {
oaknut::Label retry;
oaknut::Label try_lock;
// Save scratches.
c.STP(X0, X1, SP, PRE_INDEXED, -16);
// Reload lock pointer.
c.l(retry);
c.CLREX();
// Load lock pointer once.
c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));
static_assert(SpinLockLocked == 0);
// Optimized spinlock: check without exclusive first to reduce bus traffic
c.l(retry);
c.LDR(W1, X0);
c.CBNZ(W1, try_lock);
// Lock is held, spin without exclusive monitor
c.YIELD(); // Hint to the CPU that we're spinning
c.B(retry);
// Try to acquire the lock
c.l(try_lock);
// Load-linked with acquire ordering.
c.LDAXR(W1, X0);
// If the value was SpinLockLocked, clear monitor and retry.
// If the value was SpinLockLocked, retry without CLREX to avoid clearing other monitors.
c.CBZ(W1, retry);
// Store-conditional SpinLockLocked with relaxed ordering.
c.STXR(W1, WZR, X0);
// If we failed to store, retry.
// If we failed to store, retry from the beginning.
c.CBNZ(W1, retry);
// We succeeded! Reload scratches.

View File

@@ -1127,10 +1127,14 @@ bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) {
[&] { rasterizer = true; });
if (rasterizer) {
impl->InvalidateGPUMemory(ptr, size);
return mapped && ptr != nullptr;
}
#ifdef __linux__
if (!rasterizer && mapped) {
// Only call DeferredMapSeparateHeap if we actually have a fault to handle
// For NCE, most accesses are to already-mapped memory and don't need deferred mapping
if (mapped && ptr != nullptr) {
// Try deferred mapping - this will return false if already resident
impl->buffer->DeferredMapSeparateHeap(GetInteger(vaddr));
}
#endif