Manually align arrays on the stack

as a workaround to issues with overaligned alignas() on stack variables in gcc < 9.3 on windows.

closes https://github.com/official-stockfish/Stockfish/pull/3217

fixes #3216

No functional change
This commit is contained in:
Tomasz Sobczyk
2020-11-03 11:23:35 +01:00
committed by Joost VandeVondele
parent a260c9a8a2
commit 3f6451eff7
8 changed files with 64 additions and 38 deletions
+22 -3
View File
@@ -25,6 +25,7 @@
#include "../position.h"
#include "../misc.h"
#include "../uci.h"
#include "../types.h"
#include "evaluate_nnue.h"
@@ -126,10 +127,28 @@ namespace Eval::NNUE {
// Evaluation function. Perform differential calculation.
Value evaluate(const Position& pos) {
alignas(kCacheLineSize) TransformedFeatureType
transformed_features[FeatureTransformer::kBufferSize];
// We manually align the arrays on the stack because with gcc < 9.3
// overaligning stack variables with alignas() doesn't work correctly.
constexpr uint64_t alignment = kCacheLineSize;
#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
TransformedFeatureType transformed_features_unaligned[
FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
char buffer_unaligned[Network::kBufferSize + alignment];
auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
#else
alignas(alignment)
TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
alignas(alignment) char buffer[Network::kBufferSize];
#endif
ASSERT_ALIGNED(transformed_features, alignment);
ASSERT_ALIGNED(buffer, alignment);
feature_transformer->Transform(pos, transformed_features);
alignas(kCacheLineSize) char buffer[Network::kBufferSize];
const auto output = network->Propagate(transformed_features, buffer);
return static_cast<Value>(output[0] / FV_SCALE);
+5 -5
View File
@@ -74,12 +74,12 @@ namespace Eval::NNUE::Layers {
const auto out = reinterpret_cast<__m256i*>(output);
for (IndexType i = 0; i < kNumChunks; ++i) {
const __m256i words0 = _mm256_srai_epi16(_mm256_packs_epi32(
_mm256_loadA_si256(&in[i * 4 + 0]),
_mm256_loadA_si256(&in[i * 4 + 1])), kWeightScaleBits);
_mm256_load_si256(&in[i * 4 + 0]),
_mm256_load_si256(&in[i * 4 + 1])), kWeightScaleBits);
const __m256i words1 = _mm256_srai_epi16(_mm256_packs_epi32(
_mm256_loadA_si256(&in[i * 4 + 2]),
_mm256_loadA_si256(&in[i * 4 + 3])), kWeightScaleBits);
_mm256_storeA_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
_mm256_load_si256(&in[i * 4 + 2]),
_mm256_load_si256(&in[i * 4 + 3])), kWeightScaleBits);
_mm256_store_si256(&out[i], _mm256_permutevar8x32_epi32(_mm256_max_epi8(
_mm256_packs_epi16(words0, words1), kZero), kOffsets));
}
constexpr IndexType kStart = kNumChunks * kSimdWidth;
-23
View File
@@ -43,29 +43,6 @@
#include <arm_neon.h>
#endif
// HACK: Use _mm256_loadu_si256() instead of _mm256_load_si256. Otherwise a binary
// compiled with older g++ crashes because the output memory is not aligned
// even though alignas is specified.
#if defined(USE_AVX2)
#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
#define _mm256_loadA_si256 _mm256_loadu_si256
#define _mm256_storeA_si256 _mm256_storeu_si256
#else
#define _mm256_loadA_si256 _mm256_load_si256
#define _mm256_storeA_si256 _mm256_store_si256
#endif
#endif
#if defined(USE_AVX512)
#if defined(__GNUC__ ) && (__GNUC__ < 9) && defined(_WIN32) && !defined(__clang__)
#define _mm512_loadA_si512 _mm512_loadu_si512
#define _mm512_storeA_si512 _mm512_storeu_si512
#else
#define _mm512_loadA_si512 _mm512_load_si512
#define _mm512_storeA_si512 _mm512_store_si512
#endif
#endif
namespace Eval::NNUE {
// Version of the evaluation file
+7 -7
View File
@@ -36,16 +36,16 @@ namespace Eval::NNUE {
#ifdef USE_AVX512
typedef __m512i vec_t;
#define vec_load(a) _mm512_loadA_si512(a)
#define vec_store(a,b) _mm512_storeA_si512(a,b)
#define vec_load(a) _mm512_load_si512(a)
#define vec_store(a,b) _mm512_store_si512(a,b)
#define vec_add_16(a,b) _mm512_add_epi16(a,b)
#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
static constexpr IndexType kNumRegs = 8; // only 8 are needed
#elif USE_AVX2
typedef __m256i vec_t;
#define vec_load(a) _mm256_loadA_si256(a)
#define vec_store(a,b) _mm256_storeA_si256(a,b)
#define vec_load(a) _mm256_load_si256(a)
#define vec_store(a,b) _mm256_store_si256(a,b)
#define vec_add_16(a,b) _mm256_add_epi16(a,b)
#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
static constexpr IndexType kNumRegs = 16;
@@ -157,11 +157,11 @@ namespace Eval::NNUE {
#if defined(USE_AVX2)
auto out = reinterpret_cast<__m256i*>(&output[offset]);
for (IndexType j = 0; j < kNumChunks; ++j) {
__m256i sum0 = _mm256_loadA_si256(
__m256i sum0 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
__m256i sum1 = _mm256_loadA_si256(
__m256i sum1 = _mm256_load_si256(
&reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
_mm256_storeA_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
_mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
_mm256_packs_epi16(sum0, sum1), kZero), kControl));
}