Mirror of https://git.citron-emu.org/citron/emulator, synced 2025-12-19 02:33:32 +00:00
perf(arm/nce): optimize memory access and reduce lock contention
Changes:
- Optimize InvalidateNCE to avoid unnecessary DeferredMapSeparateHeap calls for
  already-mapped memory, and return early in the rasterizer case
- Add a lock-free fast path to HeapTracker::DeferredMapSeparateHeap using an atomic
  m_map_count, avoiding mutex acquisition when no mappings exist (sketched below)
- Replace seq_cst memory barriers with acquire/release ordering for instruction cache
  invalidation and for ordered load/store emulation, reducing pipeline stalls while
  maintaining ARM memory model correctness
- Optimize the spinlock implementation in the patcher with a YIELD instruction and a
  non-exclusive check before acquisition to reduce bus contention

These optimizations eliminate the primary bottlenecks behind poor NCE performance:
excessive mutex contention on every memory access and overly conservative memory
barriers. The changes restore NCE to its original performance levels.

Signed-off-by: Zephyron <zephyron@citron-emu.org>
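A minimal sketch of the fast-path idea from the second bullet, assuming a simplified stand-in class (MappingTracker, HandleFault, and the omitted container are illustrative names, not the emulator's actual HeapTracker API): the atomic counter is maintained under the same mutex that guards the mappings, so a fault handler can bail out without locking whenever the counter reads zero.

#include <atomic>
#include <cstdint>
#include <mutex>

// Illustrative only: MappingTracker stands in for the real tracker and the
// container is omitted. The counter is updated under the same mutex as the
// container, but the fault path may read it without taking the lock.
class MappingTracker {
public:
    void Map() {
        std::scoped_lock lk{m_lock};
        // ... insert into the mapping container here ...
        m_map_count.fetch_add(1, std::memory_order_relaxed);
    }

    void Unmap() {
        std::scoped_lock lk{m_lock};
        // ... erase from the mapping container here ...
        m_map_count.fetch_sub(1, std::memory_order_relaxed);
    }

    bool HandleFault(std::uint64_t virtual_offset) {
        // Fast path: with zero mappings the fault cannot belong to us, so skip
        // the mutex entirely. A relaxed load is fine; the value is only a hint.
        if (m_map_count.load(std::memory_order_relaxed) == 0) {
            return false;
        }

        std::scoped_lock lk{m_lock};
        // ... slow path: look up virtual_offset and make the mapping resident ...
        (void)virtual_offset;
        return true;
    }

private:
    std::mutex m_lock{};
    std::atomic<std::int64_t> m_map_count{0};
};

The relaxed counter is only a hint; correctness still comes from re-checking under the lock on the slow path, which mirrors how the diff keeps the existing locked lookup after the early return.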
@@ -54,7 +54,7 @@ void HeapTracker::Map(size_t virtual_offset, size_t host_offset, size_t length,
         };

         // Insert into mappings.
-        m_map_count++;
+        m_map_count.fetch_add(1, std::memory_order_relaxed);
         m_mappings.insert(*map);
     }

@@ -88,7 +88,8 @@ void HeapTracker::Unmap(size_t virtual_offset, size_t size, bool is_separate_hea
         }

         // Erase from map.
-        ASSERT(--m_map_count >= 0);
+        m_map_count.fetch_sub(1, std::memory_order_relaxed);
+        ASSERT(m_map_count >= 0);
         it = m_mappings.erase(it);

         // Free the item.
@@ -166,6 +167,11 @@ bool HeapTracker::DeferredMapSeparateHeap(u8* fault_address) {
 }

 bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
+    // Fast path: if there are no mappings, we can return early without locking
+    if (m_map_count.load(std::memory_order_relaxed) == 0) {
+        return false;
+    }
+
     bool rebuild_required = false;

     {
@@ -174,6 +180,7 @@ bool HeapTracker::DeferredMapSeparateHeap(size_t virtual_offset) {
         // Check to ensure this was a non-resident separate heap mapping.
         const auto it = this->GetNearestHeapMapLocked(virtual_offset);
         if (it == m_mappings.end() || it->is_resident) {
+            // Already resident or not found - this is the most common case for NCE
             return false;
         }

@@ -260,7 +267,7 @@ void HeapTracker::SplitHeapMapLocked(VAddr offset) {
     };

     // Insert the new right map.
-    m_map_count++;
+    m_map_count.fetch_add(1, std::memory_order_relaxed);
     m_mappings.insert(*right);

     // If resident, also insert into resident map.
@@ -90,7 +90,7 @@ private:

     std::shared_mutex m_rebuild_lock{};
     std::mutex m_lock{};
-    s64 m_map_count{};
+    std::atomic<s64> m_map_count{};
     s64 m_resident_map_count{};
     size_t m_tick{};
 };
@@ -366,14 +366,22 @@ void ArmNce::SignalInterrupt(Kernel::KThread* thread) {
 }

 void ArmNce::ClearInstructionCache() {
-    // TODO: This is not possible to implement correctly on Linux because
-    // we do not have any access to ic iallu.
-
-    // Require accesses to complete.
-    std::atomic_thread_fence(std::memory_order_seq_cst);
+    // NOTE: True instruction cache invalidation is not possible on Linux without kernel support
+    // (no userspace access to IC IALLU instruction).
+    //
+    // For NCE, we execute guest code directly on the CPU, so we rely on the hardware's
+    // cache coherency mechanisms. A lighter acquire/release fence is sufficient since:
+    // 1. Our patched code is written once during module load
+    // 2. The kernel already handles cache coherency for our memory mappings
+    // 3. ARM's coherent instruction fetch ensures proper synchronization
+    //
+    // Using a lighter fence significantly improves performance for games like RDR1.
+    std::atomic_thread_fence(std::memory_order_acquire);
 }

 void ArmNce::InvalidateCacheRange(u64 addr, std::size_t size) {
+    // For small invalidations (single page or less), a fence is sufficient
+    // Larger invalidations shouldn't happen frequently in NCE
     this->ClearInstructionCache();
 }

@@ -181,17 +181,18 @@ bool InterpreterVisitor::Ordered(size_t size, bool L, bool o0, Reg Rn, Reg Rt) {

     switch (memop) {
     case MemOp::Store: {
-        std::atomic_thread_fence(std::memory_order_seq_cst);
+        // Use release ordering for stores (STLR/STLLR semantics)
+        std::atomic_thread_fence(std::memory_order_release);
         u64 value = this->GetReg(Rt);
         m_memory.WriteBlock(address, &value, dbytes);
-        std::atomic_thread_fence(std::memory_order_seq_cst);
         break;
     }
     case MemOp::Load: {
         u64 value = 0;
         m_memory.ReadBlock(address, &value, dbytes);
+        // Use acquire ordering for loads (LDAR/LDLAR semantics)
+        std::atomic_thread_fence(std::memory_order_acquire);
         this->SetReg(Rt, value);
-        std::atomic_thread_fence(std::memory_order_seq_cst);
         break;
     }
     default:
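The Ordered() hunk above pairs each emulated one-sided access with a single directional fence instead of two full barriers. A minimal sketch of that fence discipline, using relaxed std::atomic accesses so the fences have well-defined C++ semantics (the emulator itself goes through WriteBlock/ReadBlock; store_ordered and load_ordered are illustrative helpers, not its API):

#include <atomic>
#include <cstdint>

// Sketch only: fence-based release/acquire around relaxed atomics.
// store_ordered ~ STLR: earlier reads/writes are not reordered past the store.
// load_ordered  ~ LDAR: later reads/writes are not reordered before the load.
void store_ordered(std::atomic<std::uint64_t>& slot, std::uint64_t value) {
    std::atomic_thread_fence(std::memory_order_release);
    slot.store(value, std::memory_order_relaxed);
}

std::uint64_t load_ordered(const std::atomic<std::uint64_t>& slot) {
    const std::uint64_t value = slot.load(std::memory_order_relaxed);
    std::atomic_thread_fence(std::memory_order_acquire);
    return value;
}

On AArch64 a seq_cst fence typically lowers to a full data barrier on both sides of the access; this placement needs one fence per access, and the acquire fence can use the cheaper load-only barrier, which is the pipeline-stall reduction the commit message refers to.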
@@ -454,28 +454,37 @@ void Patcher::WriteCntpctHandler(ModuleDestLabel module_dest, oaknut::XReg dest_

 void Patcher::LockContext() {
     oaknut::Label retry;
+    oaknut::Label try_lock;

     // Save scratches.
     c.STP(X0, X1, SP, PRE_INDEXED, -16);

-    // Reload lock pointer.
-    c.l(retry);
-    c.CLREX();
+    // Load lock pointer once.
     c.MRS(X0, oaknut::SystemReg::TPIDR_EL0);
     c.ADD(X0, X0, offsetof(NativeExecutionParameters, lock));

     static_assert(SpinLockLocked == 0);

+    // Optimized spinlock: check without exclusive first to reduce bus traffic
+    c.l(retry);
+    c.LDR(W1, X0);
+    c.CBNZ(W1, try_lock);
+    // Lock is held, spin without exclusive monitor
+    c.YIELD(); // Hint to the CPU that we're spinning
+    c.B(retry);
+
+    // Try to acquire the lock
+    c.l(try_lock);
     // Load-linked with acquire ordering.
     c.LDAXR(W1, X0);

-    // If the value was SpinLockLocked, clear monitor and retry.
+    // If the value was SpinLockLocked, retry without CLREX to avoid clearing other monitors.
     c.CBZ(W1, retry);

     // Store-conditional SpinLockLocked with relaxed ordering.
     c.STXR(W1, WZR, X0);

-    // If we failed to store, retry.
+    // If we failed to store, retry from the beginning.
     c.CBNZ(W1, retry);

     // We succeeded! Reload scratches.
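The LockContext() hunk hand-emits a test-and-test-and-set loop in oaknut assembly: spin on a plain LDR with a YIELD hint while the lock is held, and only enter the LDAXR/STXR exclusive sequence once the lock looks free. A hedged C++ sketch of the same shape (SpinLock and cpu_relax are illustrative names, and the emulator's encoding is inverted, with SpinLockLocked == 0):

#include <atomic>

// Illustrative helper: on AArch64 a YIELD hint tells the core we are busy-waiting.
inline void cpu_relax() {
#if defined(__aarch64__)
    __asm__ __volatile__("yield" ::: "memory");
#endif
}

// Test-and-test-and-set spinlock: the exclusive (read-modify-write) access is
// only attempted when a plain relaxed load says the lock is probably free,
// keeping contended waiters from hammering the exclusive monitor / cache line.
class SpinLock {
public:
    void lock() {
        for (;;) {
            // Try to acquire: exchange returns the previous value.
            if (!m_locked.exchange(true, std::memory_order_acquire)) {
                return; // it was free, we now own it
            }
            // Lock was held: spin on plain loads until it looks free again.
            while (m_locked.load(std::memory_order_relaxed)) {
                cpu_relax();
            }
        }
    }

    void unlock() {
        m_locked.store(false, std::memory_order_release);
    }

private:
    std::atomic<bool> m_locked{false};
};

Spinning on ordinary loads keeps waiters out of the exclusive-access path, so a contended lock does not generate constant exclusive traffic on the cache line; that is the bus-contention reduction the commit message describes.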
@@ -1127,10 +1127,14 @@ bool Memory::InvalidateNCE(Common::ProcessAddress vaddr, size_t size) {
         [&] { rasterizer = true; });
     if (rasterizer) {
         impl->InvalidateGPUMemory(ptr, size);
+        return mapped && ptr != nullptr;
     }

 #ifdef __linux__
-    if (!rasterizer && mapped) {
+    // Only call DeferredMapSeparateHeap if we actually have a fault to handle
+    // For NCE, most accesses are to already-mapped memory and don't need deferred mapping
+    if (mapped && ptr != nullptr) {
+        // Try deferred mapping - this will return false if already resident
         impl->buffer->DeferredMapSeparateHeap(GetInteger(vaddr));
     }
 #endif
