New NNUE architecture and net

Introduces a new NNUE network architecture and associated network parameters, as obtained by a new pytorch trainer. The network is already very strong at short TC, without regression at longer TC, and has potential for further improvements. https://tests.stockfishchess.org/tests/view/60a159c65085663412d0921d TC: 10s+0.1s, 1 thread ELO: 21.74 +-3.4 (95%) LOS: 100.0% Total: 10000 W: 1559 L: 934 D: 7507 Ptnml(0-2): 38, 701, 2972, 1176, 113 https://tests.stockfishchess.org/tests/view/60a187005085663412d0925b TC: 60s+0.6s, 1 thread ELO: 5.85 +-1.7 (95%) LOS: 100.0% Total: 20000 W: 1381 L: 1044 D: 17575 Ptnml(0-2): 27, 885, 7864, 1172, 52 https://tests.stockfishchess.org/tests/view/60a2beede229097940a03806 TC: 20s+0.2s, 8 threads LLR: 2.93 (-2.94,2.94) <0.50,3.50> Total: 34272 W: 1610 L: 1452 D: 31210 Ptnml(0-2): 30, 1285, 14350, 1439, 32 https://tests.stockfishchess.org/tests/view/60a2d687e229097940a03c72 TC: 60s+0.6s, 8 threads LLR: 2.94 (-2.94,2.94) <-2.50,0.50> Total: 45544 W: 1262 L: 1214 D: 43068 Ptnml(0-2): 12, 1129, 20442, 1177, 12 The network has been trained (by vondele) using the https://github.com/glinscott/nnue-pytorch/ trainer (started by glinscott), specifically the branch https://github.com/Sopel97/nnue-pytorch/tree/experiment_56. The data used are in 64 billion positions (193GB total) generated and scored with the current master net d8: https://drive.google.com/file/d/1hOOYSDKgOOp38ZmD0N4DV82TOLHzjUiF/view?usp=sharing d9: https://drive.google.com/file/d/1VlhnHL8f-20AXhGkILujnNXHwy9T-MQw/view?usp=sharing d10: https://drive.google.com/file/d/1ZC5upzBYMmMj1gMYCkt6rCxQG0GnO3Kk/view?usp=sharing fishtest_d9: https://drive.google.com/file/d/1GQHt0oNgKaHazwJFTRbXhlCN3FbUedFq/view?usp=sharing This network also contains a few architectural changes with respect to the current master: Size changed from 256x2-32-32-1 to 512x2-16-32-1 ~15-20% slower ~2x larger adds a special path for 16 valued ClippedReLU fixes affine transform code for 16 inputs/outputs, buy using InputDimensions instead of PaddedInputDimensions this is safe now because the inputs are processed in groups of 4 in the current affine transform code The feature set changed from HalfKP to HalfKAv2 Includes information about the kings like HalfKA Packs king features better, resulting in 8% size reduction compared to HalfKA The board is flipped for the black's perspective, instead of rotated like in the current master PSQT values for each feature the feature transformer now outputs a part that is fowarded directly to the output and allows learning piece values more directly than the previous network architecture. The effect is visible for high imbalance positions, where the current master network outputs evaluations skewed towards zero. 8 PSQT values per feature, chosen based on (popcount(pos.pieces()) - 1) / 4 initialized to classical material values on the start of the training 8 subnetworks (512x2->16->32->1), chosen based on (popcount(pos.pieces()) - 1) / 4 only one subnetwork is evaluated for any position, no or marginal speed loss A diagram of the network is available: https://user-images.githubusercontent.com/8037982/118656988-553a1700-b7eb-11eb-82ef-56a11cbebbf2.png A more complete description: https://github.com/glinscott/nnue-pytorch/blob/master/docs/nnue.md closes https://github.com/official-stockfish/Stockfish/pull/3474 Bench: 3806488
2026-05-20 06:17:49 +00:00 · 2021-05-18 17:36:26 +02:00
parent f90274d8ce
commit e8d64af123
13 changed files with 265 additions and 173 deletions
@@ -35,45 +35,82 @@ namespace Stockfish::Eval::NNUE {
  // vector registers.
  #define VECTOR

+  static_assert(PSQTBuckets == 8, "Assumed by the current choice of constants.");
+
  #ifdef USE_AVX512
  typedef __m512i vec_t;
+  typedef __m256i psqt_vec_t;
  #define vec_load(a) _mm512_load_si512(a)
  #define vec_store(a,b) _mm512_store_si512(a,b)
  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_load_psqt(a) _mm256_load_si256(a)
+  #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
+  #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
+  #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
+  #define vec_zero_psqt() _mm256_setzero_si256()
  static constexpr IndexType NumRegs = 8; // only 8 are needed
+  static constexpr IndexType NumPsqtRegs = 1;

  #elif USE_AVX2
  typedef __m256i vec_t;
+  typedef __m256i psqt_vec_t;
  #define vec_load(a) _mm256_load_si256(a)
  #define vec_store(a,b) _mm256_store_si256(a,b)
  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_load_psqt(a) _mm256_load_si256(a)
+  #define vec_store_psqt(a,b) _mm256_store_si256(a,b)
+  #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
+  #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
+  #define vec_zero_psqt() _mm256_setzero_si256()
  static constexpr IndexType NumRegs = 16;
+  static constexpr IndexType NumPsqtRegs = 1;

  #elif USE_SSE2
  typedef __m128i vec_t;
+  typedef __m128i psqt_vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_load_psqt(a) (*(a))
+  #define vec_store_psqt(a,b) *(a)=(b)
+  #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
+  #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
+  #define vec_zero_psqt() _mm_setzero_si128()
  static constexpr IndexType NumRegs = Is64Bit ? 16 : 8;
+  static constexpr IndexType NumPsqtRegs = 2;

  #elif USE_MMX
  typedef __m64 vec_t;
+  typedef std::int32_t psqt_vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_pi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_load_psqt(a) (*(a))
+  #define vec_store_psqt(a,b) *(a)=(b)
+  #define vec_add_psqt_32(a,b) a+b
+  #define vec_sub_psqt_32(a,b) a-b
+  #define vec_zero_psqt() 0
  static constexpr IndexType NumRegs = 8;
+  static constexpr IndexType NumPsqtRegs = 8;

  #elif USE_NEON
  typedef int16x8_t vec_t;
+  typedef int32x4_t psqt_vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) vaddq_s16(a,b)
  #define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_load_psqt(a) (*(a))
+  #define vec_store_psqt(a,b) *(a)=(b)
+  #define vec_add_psqt_32(a,b) vaddq_s32(a,b)
+  #define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
+  #define vec_zero_psqt() psqt_vec_t{0}
  static constexpr IndexType NumRegs = 16;
+  static constexpr IndexType NumPsqtRegs = 2;

  #else
  #undef VECTOR
@@ -87,9 +124,13 @@ namespace Stockfish::Eval::NNUE {
    // Number of output dimensions for one side
    static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;

+    static constexpr int LazyThreshold = 1400;
+
    #ifdef VECTOR
    static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2;
+    static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
    static_assert(HalfDimensions % TileHeight == 0, "TileHeight must divide HalfDimensions");
+    static_assert(PSQTBuckets % PsqtTileHeight == 0, "PsqtTileHeight must divide PSQTBuckets");
    #endif

   public:
@@ -115,6 +156,8 @@ namespace Stockfish::Eval::NNUE {
        biases[i] = read_little_endian<BiasType>(stream);
      for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i)
        weights[i] = read_little_endian<WeightType>(stream);
+      for (std::size_t i = 0; i < PSQTBuckets * InputDimensions; ++i)
+        psqtWeights[i] = read_little_endian<PSQTWeightType>(stream);
      return !stream.fail();
    }

@@ -128,11 +171,21 @@ namespace Stockfish::Eval::NNUE {
    }

    // Convert input features
-    void transform(const Position& pos, OutputType* output) const {
+    std::pair<std::int32_t, bool> transform(const Position& pos, OutputType* output, int bucket) const {
      update_accumulator(pos, WHITE);
      update_accumulator(pos, BLACK);

+      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
      const auto& accumulation = pos.state()->accumulator.accumulation;
+      const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
+
+      const auto psqt = (
+            psqtAccumulation[static_cast<int>(perspectives[0])][bucket]
+          - psqtAccumulation[static_cast<int>(perspectives[1])][bucket]
+        ) / 2;
+
+      if (abs(psqt) > LazyThreshold * OutputScale)
+        return { psqt, true };

  #if defined(USE_AVX512)
      constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
@@ -163,7 +216,6 @@ namespace Stockfish::Eval::NNUE {
      const int8x8_t Zero = {0};
  #endif

-      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
      for (IndexType p = 0; p < 2; ++p) {
        const IndexType offset = HalfDimensions * p;

@@ -240,6 +292,8 @@ namespace Stockfish::Eval::NNUE {
  #if defined(USE_MMX)
      _mm_empty();
  #endif
+
+      return { psqt, false };
    }

   private:
@@ -255,6 +309,7 @@ namespace Stockfish::Eval::NNUE {
      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
      // is defined in the VECTOR code below, once in each branch
      vec_t acc[NumRegs];
+      psqt_vec_t psqt[NumPsqtRegs];
  #endif

      // Look for a usable accumulator of an earlier position. We keep track
@@ -333,12 +388,52 @@ namespace Stockfish::Eval::NNUE {
          }
        }

+        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
+        {
+          // Load accumulator
+          auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
+            &st->accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]);
+          for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+            psqt[k] = vec_load_psqt(&accTilePsqt[k]);
+
+          for (IndexType i = 0; states_to_update[i]; ++i)
+          {
+            // Difference calculation for the deactivated features
+            for (const auto index : removed[i])
+            {
+              const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+              auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
+              for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+                psqt[k] = vec_sub_psqt_32(psqt[k], columnPsqt[k]);
+            }
+
+            // Difference calculation for the activated features
+            for (const auto index : added[i])
+            {
+              const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+              auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
+              for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+                psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+            }
+
+            // Store accumulator
+            accTilePsqt = reinterpret_cast<psqt_vec_t*>(
+              &states_to_update[i]->accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]);
+            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+              vec_store_psqt(&accTilePsqt[k], psqt[k]);
+          }
+        }
+
  #else
        for (IndexType i = 0; states_to_update[i]; ++i)
        {
          std::memcpy(states_to_update[i]->accumulator.accumulation[perspective],
              st->accumulator.accumulation[perspective],
              HalfDimensions * sizeof(BiasType));
+
+          for (std::size_t k = 0; k < PSQTBuckets; ++k)
+            states_to_update[i]->accumulator.psqtAccumulation[perspective][k] = st->accumulator.psqtAccumulation[perspective][k];
+
          st = states_to_update[i];

          // Difference calculation for the deactivated features
@@ -348,6 +443,9 @@ namespace Stockfish::Eval::NNUE {

            for (IndexType j = 0; j < HalfDimensions; ++j)
              st->accumulator.accumulation[perspective][j] -= weights[offset + j];
+
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+              st->accumulator.psqtAccumulation[perspective][k] -= psqtWeights[index * PSQTBuckets + k];
          }

          // Difference calculation for the activated features
@@ -357,6 +455,9 @@ namespace Stockfish::Eval::NNUE {

            for (IndexType j = 0; j < HalfDimensions; ++j)
              st->accumulator.accumulation[perspective][j] += weights[offset + j];
+
+            for (std::size_t k = 0; k < PSQTBuckets; ++k)
+              st->accumulator.psqtAccumulation[perspective][k] += psqtWeights[index * PSQTBuckets + k];
          }
        }
  #endif
@@ -392,16 +493,42 @@ namespace Stockfish::Eval::NNUE {
            vec_store(&accTile[k], acc[k]);
        }

+        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
+        {
+          for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+            psqt[k] = vec_zero_psqt();
+
+          for (const auto index : active)
+          {
+            const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
+            auto columnPsqt = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
+
+            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+              psqt[k] = vec_add_psqt_32(psqt[k], columnPsqt[k]);
+          }
+
+          auto accTilePsqt = reinterpret_cast<psqt_vec_t*>(
+            &accumulator.psqtAccumulation[perspective][j * PsqtTileHeight]);
+          for (std::size_t k = 0; k < NumPsqtRegs; ++k)
+            vec_store_psqt(&accTilePsqt[k], psqt[k]);
+        }
+
  #else
        std::memcpy(accumulator.accumulation[perspective], biases,
            HalfDimensions * sizeof(BiasType));

+        for (std::size_t k = 0; k < PSQTBuckets; ++k)
+          accumulator.psqtAccumulation[perspective][k] = 0;
+
        for (const auto index : active)
        {
          const IndexType offset = HalfDimensions * index;

          for (IndexType j = 0; j < HalfDimensions; ++j)
            accumulator.accumulation[perspective][j] += weights[offset + j];
+
+          for (std::size_t k = 0; k < PSQTBuckets; ++k)
+            accumulator.psqtAccumulation[perspective][k] += psqtWeights[index * PSQTBuckets + k];
        }
  #endif
      }
@@ -413,9 +540,11 @@ namespace Stockfish::Eval::NNUE {

    using BiasType = std::int16_t;
    using WeightType = std::int16_t;
+    using PSQTWeightType = std::int32_t;

    alignas(CacheLineSize) BiasType biases[HalfDimensions];
    alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions];
+    alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
  };

 }  // namespace Stockfish::Eval::NNUE