From 9ea0a36fc6abb04306b0e6169ee9212ca8630449 Mon Sep 17 00:00:00 2001 From: Collecting Date: Sun, 28 Dec 2025 19:07:41 +0000 Subject: [PATCH] video_core: Force inlining of BCn decoder functions Apply aggressive inlining attributes to BCn decoding logic to eliminate function call overhead during texture decompression. This allows the compiler to better analyze loops for vectorization. Includes portability guards to ensure compatibility across Clang, GCC, and MSVC. Signed-off-by: Collecting --- externals/bc_decoder/bc_decoder.cpp | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) diff --git a/externals/bc_decoder/bc_decoder.cpp b/externals/bc_decoder/bc_decoder.cpp index 536c44f34..fa1cffb6c 100644 --- a/externals/bc_decoder/bc_decoder.cpp +++ b/externals/bc_decoder/bc_decoder.cpp @@ -1,6 +1,7 @@ // SPDX-License-Identifier: MPL-2.0 // Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/) // Copyright 2019 The SwiftShader Authors. All Rights Reserved. 
+// SPDX-FileCopyrightText: 2025 citron Emulator Project // This BCn Decoder is directly derivative of Swiftshader's BCn Decoder found at: https://github.com/google/swiftshader/blob/d070309f7d154d6764cbd514b1a5c8bfcef61d06/src/Device/BC_Decoder.cpp // This file does not follow the Skyline code conventions but has certain Skyline specific code @@ -16,6 +17,11 @@ namespace { constexpr int BlockHeight = 4; struct BC_color { + #if defined(__clang__) || defined(__GNUC__) + __attribute__((always_inline)) + #elif defined(_MSC_VER) + __forceinline + #endif void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, bool hasAlphaChannel, bool hasSeparateAlpha) const { Color c[4]; c[0].extract565(c0); @@ -30,6 +36,9 @@ namespace { } } + #ifdef __clang__ + #pragma clang loop vectorize(enable) interleave(enable) + #endif for (int j = 0; j < BlockHeight && (y + j) < dstH; j++) { size_t dstOffset = j * dstPitch; size_t idxOffset = j * BlockHeight; @@ -108,6 +117,11 @@ namespace { static_assert(sizeof(BC_color) == 8, "BC_color must be 8 bytes"); struct BC_channel { + #if defined(__clang__) || defined(__GNUC__) + __attribute__((always_inline)) + #elif defined(_MSC_VER) + __forceinline + #endif void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, size_t channel, bool isSigned) const { int c[8] = {0}; @@ -131,6 +145,9 @@ namespace { c[7] = isSigned ? 
127 : 255; } + #ifdef __clang__ + #pragma clang loop vectorize(enable) interleave(enable) + #endif for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++) { for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++) { dst[channel + (i * dstBpp) + (j * dstPitch)] = static_cast<uint8_t>(c[getIdx((j * BlockHeight) + i)]); @@ -149,8 +166,16 @@ namespace { static_assert(sizeof(BC_channel) == 8, "BC_channel must be 8 bytes"); struct BC_alpha { + #if defined(__clang__) || defined(__GNUC__) + __attribute__((always_inline)) + #elif defined(_MSC_VER) + __forceinline + #endif void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp) const { dst += 3; // Write only to alpha (channel 3) + #ifdef __clang__ + #pragma clang loop vectorize(enable) interleave(enable) + #endif for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++, dst += dstPitch) { uint8_t *dstRow = dst; for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, dstRow += dstBpp) { @@ -1520,3 +1545,4 @@ namespace bcn { block->decode(dst, x, y, width, height, pitch); } } +