Fix for MSVC compilation.

MSVC needs two more explicit casts to compile new affine_transform_sparse_input.
See https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm256_castsi256_ps
and https://www.intel.com/content/www/us/en/docs/intrinsics-guide/index.html#text=_mm_castsi128_ps

closes https://github.com/official-stockfish/Stockfish/pull/4616

No functional change
This commit is contained in:
Andreas Matthies
2023-06-13 06:24:04 +02:00
committed by Joost VandeVondele
parent 92c949e12e
commit 7922e07af8
2 changed files with 3 additions and 2 deletions
+1
View File
@@ -19,6 +19,7 @@ Alexander Kure
Alexander Pagel (Lolligerhans) Alexander Pagel (Lolligerhans)
Alfredo Menezes (lonfom169) Alfredo Menezes (lonfom169)
Ali AlZhrani (Cooffe) Ali AlZhrani (Cooffe)
Andreas Matthies (Matthies)
Andrei Vetrov (proukornew) Andrei Vetrov (proukornew)
Andrew Grant (AndyGrant) Andrew Grant (AndyGrant)
Andrey Neporada (nepal) Andrey Neporada (nepal)
@@ -98,10 +98,10 @@ namespace Stockfish::Eval::NNUE::Layers {
#define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512()) #define vec_nnz(a) _mm512_cmpgt_epi32_mask(a, _mm512_setzero_si512())
#elif defined (USE_AVX2) #elif defined (USE_AVX2)
using vec_t = __m256i; using vec_t = __m256i;
#define vec_nnz(a) _mm256_movemask_ps((__m256)_mm256_cmpgt_epi32(a, _mm256_setzero_si256())) #define vec_nnz(a) _mm256_movemask_ps(_mm256_castsi256_ps(_mm256_cmpgt_epi32(a, _mm256_setzero_si256())))
#elif defined (USE_SSSE3) #elif defined (USE_SSSE3)
using vec_t = __m128i; using vec_t = __m128i;
#define vec_nnz(a) _mm_movemask_ps((__m128)_mm_cmpgt_epi32(a, _mm_setzero_si128())) #define vec_nnz(a) _mm_movemask_ps(_mm_castsi128_ps(_mm_cmpgt_epi32(a, _mm_setzero_si128())))
#endif #endif
constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t); constexpr IndexType InputSimdWidth = sizeof(vec_t) / sizeof(std::int32_t);
// Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we process in chunks of max(InputSimdWidth, 8) // Inputs are processed InputSimdWidth at a time and outputs are processed 8 at a time so we process in chunks of max(InputSimdWidth, 8)