Small cleanup of nnue_feature_transformer.h

Passed Non-regression STC:
LLR: 2.94 (-2.94,2.94) <-1.75,0.25>
Total: 285760 W: 73716 L: 73768 D: 138276
Ptnml(0-2): 777, 30775, 79851, 30677, 800
https://tests.stockfishchess.org/tests/view/676f78681a2f267f205485aa

closes https://github.com/official-stockfish/Stockfish/pull/5745

No functional change
This commit is contained in:
Shawn Xu
2024-12-26 23:11:41 -08:00
committed by Disservin
parent d1a1ff4f17
commit c47e6fcf84
+99 -94
View File
@@ -146,10 +146,10 @@ using psqt_vec_t = int32x4_t;
#endif #endif
#ifdef VECTOR
// Compute optimal SIMD register count for feature transformer accumulation. // Compute optimal SIMD register count for feature transformer accumulation.
template<IndexType TransformedFeatureWidth, IndexType HalfDimensions>
class SIMDTiling {
#ifdef VECTOR
// We use __m* types as template arguments, which causes GCC to emit warnings // We use __m* types as template arguments, which causes GCC to emit warnings
// about losing some attribute information. This is irrelevant to us as we // about losing some attribute information. This is irrelevant to us as we
// only take their size, so the following pragmas are harmless. // only take their size, so the following pragmas are harmless.
@@ -160,8 +160,8 @@ using psqt_vec_t = int32x4_t;
template<typename SIMDRegisterType, typename LaneType, int NumLanes, int MaxRegisters> template<typename SIMDRegisterType, typename LaneType, int NumLanes, int MaxRegisters>
static constexpr int BestRegisterCount() { static constexpr int BestRegisterCount() {
#define RegisterSize sizeof(SIMDRegisterType) constexpr std::size_t RegisterSize = sizeof(SIMDRegisterType);
#define LaneSize sizeof(LaneType) constexpr std::size_t LaneSize = sizeof(LaneType);
static_assert(RegisterSize >= LaneSize); static_assert(RegisterSize >= LaneSize);
static_assert(MaxRegisters <= NumRegistersSIMD); static_assert(MaxRegisters <= NumRegistersSIMD);
@@ -181,10 +181,24 @@ static constexpr int BestRegisterCount() {
return 1; return 1;
} }
#if defined(__GNUC__) #if defined(__GNUC__)
#pragma GCC diagnostic pop #pragma GCC diagnostic pop
#endif #endif
public:
static constexpr int NumRegs =
BestRegisterCount<vec_t, WeightType, TransformedFeatureWidth, NumRegistersSIMD>();
static constexpr int NumPsqtRegs =
BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2;
static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets");
#endif #endif
};
// Input feature converter // Input feature converter
@@ -196,17 +210,7 @@ class FeatureTransformer {
static constexpr IndexType HalfDimensions = TransformedFeatureDimensions; static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
private: private:
#ifdef VECTOR using Tiling = SIMDTiling<TransformedFeatureDimensions, HalfDimensions>;
static constexpr int NumRegs =
BestRegisterCount<vec_t, WeightType, TransformedFeatureDimensions, NumRegistersSIMD>();
static constexpr int NumPsqtRegs =
BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2;
static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets");
#endif
public: public:
// Output type // Output type
@@ -478,8 +482,8 @@ class FeatureTransformer {
#ifdef VECTOR #ifdef VECTOR
// Gcc-10.2 unnecessarily spills AVX2 registers if this array // Gcc-10.2 unnecessarily spills AVX2 registers if this array
// is defined in the VECTOR code below, once in each branch. // is defined in the VECTOR code below, once in each branch.
vec_t acc[NumRegs]; vec_t acc[Tiling::NumRegs];
psqt_vec_t psqt[NumPsqtRegs]; psqt_vec_t psqt[Tiling::NumPsqtRegs];
#endif #endif
const Square ksq = pos.square<KING>(Perspective); const Square ksq = pos.square<KING>(Perspective);
@@ -504,14 +508,14 @@ class FeatureTransformer {
#ifdef VECTOR #ifdef VECTOR
if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1) if ((removed.size() == 1 || removed.size() == 2) && added.size() == 1)
{ {
auto accIn = auto* accIn =
reinterpret_cast<const vec_t*>(&(computed->*accPtr).accumulation[Perspective][0]); reinterpret_cast<const vec_t*>(&(computed->*accPtr).accumulation[Perspective][0]);
auto accOut = reinterpret_cast<vec_t*>(&(next->*accPtr).accumulation[Perspective][0]); auto* accOut = reinterpret_cast<vec_t*>(&(next->*accPtr).accumulation[Perspective][0]);
const IndexType offsetR0 = HalfDimensions * removed[0]; const IndexType offsetR0 = HalfDimensions * removed[0];
auto columnR0 = reinterpret_cast<const vec_t*>(&weights[offsetR0]); auto* columnR0 = reinterpret_cast<const vec_t*>(&weights[offsetR0]);
const IndexType offsetA = HalfDimensions * added[0]; const IndexType offsetA = HalfDimensions * added[0];
auto columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]); auto* columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
if (removed.size() == 1) if (removed.size() == 1)
{ {
@@ -521,22 +525,22 @@ class FeatureTransformer {
else else
{ {
const IndexType offsetR1 = HalfDimensions * removed[1]; const IndexType offsetR1 = HalfDimensions * removed[1];
auto columnR1 = reinterpret_cast<const vec_t*>(&weights[offsetR1]); auto* columnR1 = reinterpret_cast<const vec_t*>(&weights[offsetR1]);
for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i) for (IndexType i = 0; i < HalfDimensions * sizeof(WeightType) / sizeof(vec_t); ++i)
accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA[i]), accOut[i] = vec_sub_16(vec_add_16(accIn[i], columnA[i]),
vec_add_16(columnR0[i], columnR1[i])); vec_add_16(columnR0[i], columnR1[i]));
} }
auto accPsqtIn = reinterpret_cast<const psqt_vec_t*>( auto* accPsqtIn = reinterpret_cast<const psqt_vec_t*>(
&(computed->*accPtr).psqtAccumulation[Perspective][0]); &(computed->*accPtr).psqtAccumulation[Perspective][0]);
auto accPsqtOut = auto* accPsqtOut =
reinterpret_cast<psqt_vec_t*>(&(next->*accPtr).psqtAccumulation[Perspective][0]); reinterpret_cast<psqt_vec_t*>(&(next->*accPtr).psqtAccumulation[Perspective][0]);
const IndexType offsetPsqtR0 = PSQTBuckets * removed[0]; const IndexType offsetPsqtR0 = PSQTBuckets * removed[0];
auto columnPsqtR0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR0]); auto* columnPsqtR0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR0]);
const IndexType offsetPsqtA = PSQTBuckets * added[0]; const IndexType offsetPsqtA = PSQTBuckets * added[0];
auto columnPsqtA = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtA]); auto* columnPsqtA = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtA]);
if (removed.size() == 1) if (removed.size() == 1)
{ {
@@ -548,7 +552,8 @@ class FeatureTransformer {
else else
{ {
const IndexType offsetPsqtR1 = PSQTBuckets * removed[1]; const IndexType offsetPsqtR1 = PSQTBuckets * removed[1];
auto columnPsqtR1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR1]); auto* columnPsqtR1 =
reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offsetPsqtR1]);
for (std::size_t i = 0; for (std::size_t i = 0;
i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i) i < PSQTBuckets * sizeof(PSQTWeightType) / sizeof(psqt_vec_t); ++i)
@@ -559,69 +564,69 @@ class FeatureTransformer {
} }
else else
{ {
for (IndexType i = 0; i < HalfDimensions / TileHeight; ++i) for (IndexType i = 0; i < HalfDimensions / Tiling::TileHeight; ++i)
{ {
// Load accumulator // Load accumulator
auto accTileIn = reinterpret_cast<const vec_t*>( auto* accTileIn = reinterpret_cast<const vec_t*>(
&(computed->*accPtr).accumulation[Perspective][i * TileHeight]); &(computed->*accPtr).accumulation[Perspective][i * Tiling::TileHeight]);
for (IndexType j = 0; j < NumRegs; ++j) for (IndexType j = 0; j < Tiling::NumRegs; ++j)
acc[j] = vec_load(&accTileIn[j]); acc[j] = vec_load(&accTileIn[j]);
// Difference calculation for the deactivated features // Difference calculation for the deactivated features
for (const auto index : removed) for (const auto index : removed)
{ {
const IndexType offset = HalfDimensions * index + i * TileHeight; const IndexType offset = HalfDimensions * index + i * Tiling::TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]); auto* column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (IndexType j = 0; j < NumRegs; ++j) for (IndexType j = 0; j < Tiling::NumRegs; ++j)
acc[j] = vec_sub_16(acc[j], column[j]); acc[j] = vec_sub_16(acc[j], column[j]);
} }
// Difference calculation for the activated features // Difference calculation for the activated features
for (const auto index : added) for (const auto index : added)
{ {
const IndexType offset = HalfDimensions * index + i * TileHeight; const IndexType offset = HalfDimensions * index + i * Tiling::TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]); auto* column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (IndexType j = 0; j < NumRegs; ++j) for (IndexType j = 0; j < Tiling::NumRegs; ++j)
acc[j] = vec_add_16(acc[j], column[j]); acc[j] = vec_add_16(acc[j], column[j]);
} }
// Store accumulator // Store accumulator
auto accTileOut = reinterpret_cast<vec_t*>( auto* accTileOut = reinterpret_cast<vec_t*>(
&(next->*accPtr).accumulation[Perspective][i * TileHeight]); &(next->*accPtr).accumulation[Perspective][i * Tiling::TileHeight]);
for (IndexType j = 0; j < NumRegs; ++j) for (IndexType j = 0; j < Tiling::NumRegs; ++j)
vec_store(&accTileOut[j], acc[j]); vec_store(&accTileOut[j], acc[j]);
} }
for (IndexType i = 0; i < PSQTBuckets / PsqtTileHeight; ++i) for (IndexType i = 0; i < PSQTBuckets / Tiling::PsqtTileHeight; ++i)
{ {
// Load accumulator // Load accumulator
auto accTilePsqtIn = reinterpret_cast<const psqt_vec_t*>( auto* accTilePsqtIn = reinterpret_cast<const psqt_vec_t*>(
&(computed->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); &(computed->*accPtr).psqtAccumulation[Perspective][i * Tiling::PsqtTileHeight]);
for (std::size_t j = 0; j < NumPsqtRegs; ++j) for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j)
psqt[j] = vec_load_psqt(&accTilePsqtIn[j]); psqt[j] = vec_load_psqt(&accTilePsqtIn[j]);
// Difference calculation for the deactivated features // Difference calculation for the deactivated features
for (const auto index : removed) for (const auto index : removed)
{ {
const IndexType offset = PSQTBuckets * index + i * PsqtTileHeight; const IndexType offset = PSQTBuckets * index + i * Tiling::PsqtTileHeight;
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]); auto* columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
for (std::size_t j = 0; j < NumPsqtRegs; ++j) for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j)
psqt[j] = vec_sub_psqt_32(psqt[j], columnPsqt[j]); psqt[j] = vec_sub_psqt_32(psqt[j], columnPsqt[j]);
} }
// Difference calculation for the activated features // Difference calculation for the activated features
for (const auto index : added) for (const auto index : added)
{ {
const IndexType offset = PSQTBuckets * index + i * PsqtTileHeight; const IndexType offset = PSQTBuckets * index + i * Tiling::PsqtTileHeight;
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]); auto* columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
for (std::size_t j = 0; j < NumPsqtRegs; ++j) for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j)
psqt[j] = vec_add_psqt_32(psqt[j], columnPsqt[j]); psqt[j] = vec_add_psqt_32(psqt[j], columnPsqt[j]);
} }
// Store accumulator // Store accumulator
auto accTilePsqtOut = reinterpret_cast<psqt_vec_t*>( auto* accTilePsqtOut = reinterpret_cast<psqt_vec_t*>(
&(next->*accPtr).psqtAccumulation[Perspective][i * PsqtTileHeight]); &(next->*accPtr).psqtAccumulation[Perspective][i * Tiling::PsqtTileHeight]);
for (std::size_t j = 0; j < NumPsqtRegs; ++j) for (std::size_t j = 0; j < Tiling::NumPsqtRegs; ++j)
vec_store_psqt(&accTilePsqtOut[j], psqt[j]); vec_store_psqt(&accTilePsqtOut[j], psqt[j]);
} }
} }
@@ -700,88 +705,88 @@ class FeatureTransformer {
accumulator.computed[Perspective] = true; accumulator.computed[Perspective] = true;
#ifdef VECTOR #ifdef VECTOR
vec_t acc[NumRegs]; vec_t acc[Tiling::NumRegs];
psqt_vec_t psqt[NumPsqtRegs]; psqt_vec_t psqt[Tiling::NumPsqtRegs];
for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j) for (IndexType j = 0; j < HalfDimensions / Tiling::TileHeight; ++j)
{ {
auto accTile = auto* accTile = reinterpret_cast<vec_t*>(
reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]); &accumulator.accumulation[Perspective][j * Tiling::TileHeight]);
auto entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * TileHeight]); auto* entryTile = reinterpret_cast<vec_t*>(&entry.accumulation[j * Tiling::TileHeight]);
for (IndexType k = 0; k < NumRegs; ++k) for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = entryTile[k]; acc[k] = entryTile[k];
int i = 0; std::size_t i = 0;
for (; i < int(std::min(removed.size(), added.size())); ++i) for (; i < std::min(removed.size(), added.size()); ++i)
{ {
IndexType indexR = removed[i]; IndexType indexR = removed[i];
const IndexType offsetR = HalfDimensions * indexR + j * TileHeight; const IndexType offsetR = HalfDimensions * indexR + j * Tiling::TileHeight;
auto columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]); auto* columnR = reinterpret_cast<const vec_t*>(&weights[offsetR]);
IndexType indexA = added[i]; IndexType indexA = added[i];
const IndexType offsetA = HalfDimensions * indexA + j * TileHeight; const IndexType offsetA = HalfDimensions * indexA + j * Tiling::TileHeight;
auto columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]); auto* columnA = reinterpret_cast<const vec_t*>(&weights[offsetA]);
for (unsigned k = 0; k < NumRegs; ++k) for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k])); acc[k] = vec_add_16(acc[k], vec_sub_16(columnA[k], columnR[k]));
} }
for (; i < int(removed.size()); ++i) for (; i < removed.size(); ++i)
{ {
IndexType index = removed[i]; IndexType index = removed[i];
const IndexType offset = HalfDimensions * index + j * TileHeight; const IndexType offset = HalfDimensions * index + j * Tiling::TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]); auto* column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (unsigned k = 0; k < NumRegs; ++k) for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_sub_16(acc[k], column[k]); acc[k] = vec_sub_16(acc[k], column[k]);
} }
for (; i < int(added.size()); ++i) for (; i < added.size(); ++i)
{ {
IndexType index = added[i]; IndexType index = added[i];
const IndexType offset = HalfDimensions * index + j * TileHeight; const IndexType offset = HalfDimensions * index + j * Tiling::TileHeight;
auto column = reinterpret_cast<const vec_t*>(&weights[offset]); auto* column = reinterpret_cast<const vec_t*>(&weights[offset]);
for (unsigned k = 0; k < NumRegs; ++k) for (IndexType k = 0; k < Tiling::NumRegs; ++k)
acc[k] = vec_add_16(acc[k], column[k]); acc[k] = vec_add_16(acc[k], column[k]);
} }
for (IndexType k = 0; k < NumRegs; k++) for (IndexType k = 0; k < Tiling::NumRegs; k++)
vec_store(&entryTile[k], acc[k]); vec_store(&entryTile[k], acc[k]);
for (IndexType k = 0; k < NumRegs; k++) for (IndexType k = 0; k < Tiling::NumRegs; k++)
vec_store(&accTile[k], acc[k]); vec_store(&accTile[k], acc[k]);
} }
for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j) for (IndexType j = 0; j < PSQTBuckets / Tiling::PsqtTileHeight; ++j)
{ {
auto accTilePsqt = reinterpret_cast<psqt_vec_t*>( auto* accTilePsqt = reinterpret_cast<psqt_vec_t*>(
&accumulator.psqtAccumulation[Perspective][j * PsqtTileHeight]); &accumulator.psqtAccumulation[Perspective][j * Tiling::PsqtTileHeight]);
auto entryTilePsqt = auto* entryTilePsqt =
reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * PsqtTileHeight]); reinterpret_cast<psqt_vec_t*>(&entry.psqtAccumulation[j * Tiling::PsqtTileHeight]);
for (std::size_t k = 0; k < NumPsqtRegs; ++k) for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = entryTilePsqt[k]; psqt[k] = entryTilePsqt[k];
for (int i = 0; i < int(removed.size()); ++i) for (std::size_t i = 0; i < removed.size(); ++i)
{ {
IndexType index = removed[i]; IndexType index = removed[i];
const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]); auto* columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
for (std::size_t k = 0; k < NumPsqtRegs; ++k) for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]); psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
} }
for (int i = 0; i < int(added.size()); ++i) for (std::size_t i = 0; i < added.size(); ++i)
{ {
IndexType index = added[i]; IndexType index = added[i];
const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight; const IndexType offset = PSQTBuckets * index + j * Tiling::PsqtTileHeight;
auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]); auto* columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
for (std::size_t k = 0; k < NumPsqtRegs; ++k) for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]); psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
} }
for (std::size_t k = 0; k < NumPsqtRegs; ++k) for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
vec_store_psqt(&entryTilePsqt[k], psqt[k]); vec_store_psqt(&entryTilePsqt[k], psqt[k]);
for (std::size_t k = 0; k < NumPsqtRegs; ++k) for (std::size_t k = 0; k < Tiling::NumPsqtRegs; ++k)
vec_store_psqt(&accTilePsqt[k], psqt[k]); vec_store_psqt(&accTilePsqt[k], psqt[k]);
} }