Support VNNI on 256bit vectors

due to downclocking on current chips (tested up to cascade lake) supporting avx512 and vnni512, it is better to use avx2 or vnni256 in multithreaded (in particular hyperthreaded) engine use. In single threaded use, the picture is different. gcc compilation for vnni256 requires a toolchain for gcc >= 9. closes https://github.com/official-stockfish/Stockfish/pull/3038 No functional change
2026-05-20 08:37:44 +00:00 · 2020-08-20 16:59:27 -07:00
parent e453f09f06
commit 701b2427bd
3 changed files with 42 additions and 11 deletions
@@ -85,8 +85,10 @@ namespace Eval::NNUE::Layers {

  #elif defined(USE_AVX2)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
-      const __m256i kOnes = _mm256_set1_epi16(1);
      const auto input_vector = reinterpret_cast<const __m256i*>(input);
+  #if !defined(USE_VNNI)
+      const __m256i kOnes = _mm256_set1_epi16(1);
+  #endif

  #elif defined(USE_SSE2)
      constexpr IndexType kNumChunks = kPaddedInputDimensions / kSimdWidth;
@@ -145,9 +147,13 @@ namespace Eval::NNUE::Layers {
        __m256i sum = _mm256_setzero_si256();
        const auto row = reinterpret_cast<const __m256i*>(&weights_[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
+  #if defined(USE_VNNI)
+          sum = _mm256_dpbusd_epi32(sum, _mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
+  #else
          __m256i product = _mm256_maddubs_epi16(_mm256_loadA_si256(&input_vector[j]), _mm256_load_si256(&row[j]));
          product = _mm256_madd_epi16(product, kOnes);
          sum = _mm256_add_epi32(sum, product);
+  #endif
        }
        __m128i sum128 = _mm_add_epi32(_mm256_castsi256_si128(sum), _mm256_extracti128_si256(sum, 1));
        sum128 = _mm_add_epi32(sum128, _mm_shuffle_epi32(sum128, _MM_PERM_BADC));