mirror of
https://github.com/opelly27/Stockfish.git
synced 2026-05-20 06:17:49 +00:00
Manually align arrays on the stack
as a workaround to issues with overaligned alignas() on stack variables in gcc < 9.3 on windows. closes https://github.com/official-stockfish/Stockfish/pull/3217 fixes #3216 No functional change
This commit is contained in:
committed by
Joost VandeVondele
parent
a260c9a8a2
commit
3f6451eff7
@@ -25,6 +25,7 @@
|
||||
#include "../position.h"
|
||||
#include "../misc.h"
|
||||
#include "../uci.h"
|
||||
#include "../types.h"
|
||||
|
||||
#include "evaluate_nnue.h"
|
||||
|
||||
@@ -126,10 +127,28 @@ namespace Eval::NNUE {
|
||||
// Evaluation function. Perform differential calculation.
|
||||
Value evaluate(const Position& pos) {
|
||||
|
||||
alignas(kCacheLineSize) TransformedFeatureType
|
||||
transformed_features[FeatureTransformer::kBufferSize];
|
||||
// We manually align the arrays on the stack because with gcc < 9.3
|
||||
// overaligning stack variables with alignas() doesn't work correctly.
|
||||
|
||||
constexpr uint64_t alignment = kCacheLineSize;
|
||||
|
||||
#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
|
||||
TransformedFeatureType transformed_features_unaligned[
|
||||
FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
|
||||
char buffer_unaligned[Network::kBufferSize + alignment];
|
||||
|
||||
auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
|
||||
auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
|
||||
#else
|
||||
alignas(alignment)
|
||||
TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
|
||||
alignas(alignment) char buffer[Network::kBufferSize];
|
||||
#endif
|
||||
|
||||
ASSERT_ALIGNED(transformed_features, alignment);
|
||||
ASSERT_ALIGNED(buffer, alignment);
|
||||
|
||||
feature_transformer->Transform(pos, transformed_features);
|
||||
alignas(kCacheLineSize) char buffer[Network::kBufferSize];
|
||||
const auto output = network->Propagate(transformed_features, buffer);
|
||||
|
||||
return static_cast<Value>(output[0] / FV_SCALE);
|
||||
|
||||
@@ -74,12 +74,12 @@ namespace Eval::NNUE::Layers {
|
||||
const auto out = reinterpret_cast<__m256i*>(output);
|
||||
for (IndexType i = 0; i < kNumChunks; ++i) {
|
||||
const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
|
||||
_mm256_loadA_si256(&in[i * 4 + 0]),
|
||||
_mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
|
||||
_mm256_load_si256(&in[i * 4 + 0]),
|
||||
_mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
|
||||
const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
|
||||
_mm256_loadA_si256(&in[i * 4 + 2]),
|
||||
_mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
|
||||
_mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
|
||||
_mm256_load_si256(&in[i * 4 + 2]),
|
||||
_mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
|
||||
_mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
|
||||
_mm256_packs_epi16(words0, words1), kZero), kOffsets));
|
||||
}
|
||||
constexpr IndexType kStart = kNumChunks * kSimdWidth;
|
||||
|
||||
@@ -43,29 +43,6 @@
|
||||
#include <arm_neon.h>
|
||||
#endif
|
||||
|
||||
// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
|
||||
// compiled with older g++ crashes because the output memory is not aligned
|
||||
// even though alignas is specified.
|
||||
#if defined(USE_AVX2)
|
||||
#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
|
||||
#define _mm256_loadA_si256 _mm256_loadu_si256
|
||||
#define _mm256_storeA_si256 _mm256_storeu_si256
|
||||
#else
|
||||
#define _mm256_loadA_si256 _mm256_load_si256
|
||||
#define _mm256_storeA_si256 _mm256_store_si256
|
||||
#endif
|
||||
#endif
|
||||
|
||||
#if defined(USE_AVX512)
|
||||
#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
|
||||
#define _mm512_loadA_si512 _mm512_loadu_si512
|
||||
#define _mm512_storeA_si512 _mm512_storeu_si512
|
||||
#else
|
||||
#define _mm512_loadA_si512 _mm512_load_si512
|
||||
#define _mm512_storeA_si512 _mm512_store_si512
|
||||
#endif
|
||||
#endif
|
||||
|
||||
namespace Eval::NNUE {
|
||||
|
||||
// Version of the evaluation file
|
||||
|
||||
@@ -36,16 +36,16 @@ namespace Eval::NNUE {
|
||||
|
||||
#ifdef USE_AVX512
|
||||
typedef __m512i vec_t;
|
||||
#define vec_load(a) _mm512_loadA_si512(a)
|
||||
#define vec_store(a,b) _mm512_storeA_si512(a,b)
|
||||
#define vec_load(a) _mm512_load_si512(a)
|
||||
#define vec_store(a,b) _mm512_store_si512(a,b)
|
||||
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
|
||||
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
|
||||
static constexpr IndexType kNumRegs = 8; // only 8 are needed
|
||||
|
||||
#elif USE_AVX2
|
||||
typedef __m256i vec_t;
|
||||
#define vec_load(a) _mm256_loadA_si256(a)
|
||||
#define vec_store(a,b) _mm256_storeA_si256(a,b)
|
||||
#define vec_load(a) _mm256_load_si256(a)
|
||||
#define vec_store(a,b) _mm256_store_si256(a,b)
|
||||
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
|
||||
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
|
||||
static constexpr IndexType kNumRegs = 16;
|
||||
@@ -157,11 +157,11 @@ namespace Eval::NNUE {
|
||||
#if defined(USE_AVX2)
|
||||
auto out = reinterpret_cast<__m256i*>(&output[offset]);
|
||||
for (IndexType j = 0; j < kNumChunks; ++j) {
|
||||
__m256i sum0 = _mm256_loadA_si256(
|
||||
__m256i sum0 = _mm256_load_si256(
|
||||
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
|
||||
__m256i sum1 = _mm256_loadA_si256(
|
||||
__m256i sum1 = _mm256_load_si256(
|
||||
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
|
||||
_mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
|
||||
_mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
|
||||
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
|
||||
}
|
||||
|
||||
|
||||
Reference in New Issue
Block a user