video_core: Force inlining of BCn decoder functions

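Apply force-inline attributes (`__attribute__((always_inline))` on Clang/GCC, `__forceinline` on MSVC) to the `decode` methods of the BCn block structs (BC_color, BC_channel, BC_alpha) to eliminate function call overhead during texture decompression. This allows the compiler to better analyze loops for vectorization. Also adds `#pragma clang loop vectorize(enable) interleave(enable)` hints on the per-row decode loops; these are guarded by `#ifdef __clang__` and have no effect on other compilers. Includes portability guards to ensure compatibility across Clang, GCC, and MSVC.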

Signed-off-by: Collecting <collecting@noreply.localhost>
This commit is contained in:
Collecting
2025-12-28 19:07:41 +00:00
parent c5f35b0712
commit 9ea0a36fc6

View File

@@ -1,6 +1,7 @@
// SPDX-License-Identifier: MPL-2.0
// Copyright © 2022 Skyline Team and Contributors (https://github.com/skyline-emu/)
// Copyright 2019 The SwiftShader Authors. All Rights Reserved.
// SPDX-FileCopyrightText: 2025 citron Emulator Project
// This BCn Decoder is directly derivative of Swiftshader's BCn Decoder found at: https://github.com/google/swiftshader/blob/d070309f7d154d6764cbd514b1a5c8bfcef61d06/src/Device/BC_Decoder.cpp
// This file does not follow the Skyline code conventions but has certain Skyline specific code
@@ -16,6 +17,11 @@ namespace {
constexpr int BlockHeight = 4;
struct BC_color {
#if defined(__clang__) || defined(__GNUC__)
__attribute__((always_inline))
#elif defined(_MSC_VER)
__forceinline
#endif
void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, bool hasAlphaChannel, bool hasSeparateAlpha) const {
Color c[4];
c[0].extract565(c0);
@@ -30,6 +36,9 @@ namespace {
}
}
#ifdef __clang__
#pragma clang loop vectorize(enable) interleave(enable)
#endif
for (int j = 0; j < BlockHeight && (y + j) < dstH; j++) {
size_t dstOffset = j * dstPitch;
size_t idxOffset = j * BlockHeight;
@@ -108,6 +117,11 @@ namespace {
static_assert(sizeof(BC_color) == 8, "BC_color must be 8 bytes");
struct BC_channel {
#if defined(__clang__) || defined(__GNUC__)
__attribute__((always_inline))
#elif defined(_MSC_VER)
__forceinline
#endif
void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp, size_t channel, bool isSigned) const {
int c[8] = {0};
@@ -131,6 +145,9 @@ namespace {
c[7] = isSigned ? 127 : 255;
}
#ifdef __clang__
#pragma clang loop vectorize(enable) interleave(enable)
#endif
for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++) {
for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++) {
dst[channel + (i * dstBpp) + (j * dstPitch)] = static_cast<uint8_t>(c[getIdx((j * BlockHeight) + i)]);
@@ -149,8 +166,16 @@ namespace {
static_assert(sizeof(BC_channel) == 8, "BC_channel must be 8 bytes");
struct BC_alpha {
#if defined(__clang__) || defined(__GNUC__)
__attribute__((always_inline))
#elif defined(_MSC_VER)
__forceinline
#endif
void decode(uint8_t *dst, size_t x, size_t y, size_t dstW, size_t dstH, size_t dstPitch, size_t dstBpp) const {
dst += 3; // Write only to alpha (channel 3)
#ifdef __clang__
#pragma clang loop vectorize(enable) interleave(enable)
#endif
for (size_t j = 0; j < BlockHeight && (y + j) < dstH; j++, dst += dstPitch) {
uint8_t *dstRow = dst;
for (size_t i = 0; i < BlockWidth && (x + i) < dstW; i++, dstRow += dstBpp) {
@@ -1520,3 +1545,4 @@ namespace bcn {
block->decode(dst, x, y, width, height, pitch);
}
}