mirror of
https://github.com/opelly27/Stockfish.git
synced 2026-05-20 06:17:49 +00:00
Affine transform robust implementation
The size of the weights in the last layer is less than 512 bits, which causes the AVX512 code path to read past the end of the weights. No error occurs because the current implementation guarantees that an array of zeros follows the weights, so the extra products are zero and the sum is still correct. However, this is a mistake that could lead to unexpected bugs in the future. AVX2 instructions are now used for the smaller input size. No measurable slowdown on AVX512. Closes https://github.com/official-stockfish/Stockfish/pull/3298 No functional change.
This commit is contained in:
committed by
Joost VandeVondele
parent
4d30438400
commit
303713b560
@@ -301,20 +301,40 @@ namespace Eval::NNUE::Layers {
|
|||||||
}
|
}
|
||||||
else if constexpr (kOutputDimensions == 1)
|
else if constexpr (kOutputDimensions == 1)
|
||||||
{
|
{
|
||||||
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
#if defined (USE_AVX512)
|
||||||
|
if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) != 0)
|
||||||
vec_t sum0 = vec_setzero();
|
|
||||||
|
|
||||||
const auto row0 = reinterpret_cast<const vec_t*>(&weights_[0]);
|
|
||||||
|
|
||||||
for (int j = 0; j < (int)kNumChunks; ++j)
|
|
||||||
{
|
{
|
||||||
const vec_t in = input_vector[j];
|
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
||||||
|
const auto input_vector256 = reinterpret_cast<const __m256i*>(input);
|
||||||
|
|
||||||
vec_add_dpbusd_32(sum0, in, row0[j]);
|
__m256i sum0 = _mm256_setzero_si256();
|
||||||
|
const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
|
||||||
|
|
||||||
|
for (int j = 0; j < (int)kNumChunks; ++j)
|
||||||
|
{
|
||||||
|
const __m256i in = input_vector256[j];
|
||||||
|
m256_add_dpbusd_epi32(sum0, in, row0[j]);
|
||||||
|
}
|
||||||
|
output[0] = m256_hadd(sum0, biases_[0]);
|
||||||
}
|
}
|
||||||
|
else
|
||||||
|
#endif
|
||||||
|
{
|
||||||
|
#if defined (USE_AVX512)
|
||||||
|
constexpr IndexType kNumChunks = kPaddedInputDimensions / (kSimdWidth * 2);
|
||||||
|
#else
|
||||||
|
constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
|
||||||
|
#endif
|
||||||
|
vec_t sum0 = vec_setzero();
|
||||||
|
const auto row0 = reinterpret_cast<const vec_t*>(&weights_[0]);
|
||||||
|
|
||||||
output[0] = vec_hadd(sum0, biases_[0]);
|
for (int j = 0; j < (int)kNumChunks; ++j)
|
||||||
|
{
|
||||||
|
const vec_t in = input_vector[j];
|
||||||
|
vec_add_dpbusd_32(sum0, in, row0[j]);
|
||||||
|
}
|
||||||
|
output[0] = vec_hadd(sum0, biases_[0]);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
#else
|
#else
|
||||||
|
|||||||
Reference in New Issue
Block a user