Merge remote-tracking branch 'remotes/official/master' into merge

2026-05-20 08:37:44 +00:00 · 2020-11-28 06:19:16 +08:00
parent 92b14a5ba2 190dd26b9f
commit 0b2ae6cb64
16 changed files with 1086 additions and 988 deletions
@@ -44,6 +44,7 @@ Daniel Dugovic (ddugovic)
 Dariusz Orzechowski (dorzechowski)
 David Zar
 Daylen Yang (daylen)
 Deshawn Mohan-Smith (GoldenRare)
 DiscanX
 Dominik Schlösser (domschl)
 double-beep
@@ -64,7 +65,6 @@ Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
 Deshawn Mohan-Smith (GoldenRare)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
 Gregor Cramer
@@ -112,6 +112,7 @@ Mark Tenzer (31m059)
 marotear
 Matthew Lai (matthewlai)
 Matthew Sullivan (Matt14916)
 Maxim Molchanov (Maxim)
 Michael An (man)
 Michael Byrne (MichaelB7)
 Michael Chaly (Vizvezdenec)
@@ -41,7 +41,7 @@ BINDIR = $(PREFIX)/bin
 ### Built-in benchmark for pgo-builds
 PGO_TRAINING_DATA_FILE = pgo_training_data.bin
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -84,11 +84,11 @@ using namespace Trace;
 namespace {
  // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1 =  Value(1400);
+  constexpr Value LazyThreshold1 =  Value(1565);
-  constexpr Value LazyThreshold2 =  Value(1300);
+  constexpr Value LazyThreshold2 =  Value(1102);
-  constexpr Value SpaceThreshold = Value(12222);
+  constexpr Value SpaceThreshold = Value(11551);
-  constexpr Value NNUEThreshold1 =   Value(550);
+  constexpr Value NNUEThreshold1 =   Value(682);
-  constexpr Value NNUEThreshold2 =   Value(150);
+  constexpr Value NNUEThreshold2 =   Value(176);
  // KingAttackWeights[PieceType] contains king attack weights by piece type
  constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -930,7 +930,7 @@ Value Eval::evaluate(const Position& pos) {
  {
      // Scale and shift NNUE for compatibility with search and classical evaluation
      auto  adjusted_NNUE = [&](){
-         int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count<PAWN>();
+         int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
         return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
      };
@@ -940,16 +940,18 @@ Value Eval::evaluate(const Position& pos) {
      bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
      bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
-      v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
      // If the classical eval is small and imbalance large, use NNUE nevertheless.
      // For the case of opposite colored bishops, switch to NNUE eval with
      // small probability if the classical eval is less than the threshold.
-      if (   largePsq
+      if (   largePsq && !strongClassical
-          && (abs(v) * 16 < NNUEThreshold2 * r50
+          && (   abs(v) * 16 < NNUEThreshold2 * r50
-          || (   pos.opposite_bishops()
+              || (   pos.opposite_bishops()
-              && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
+                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
-              && !(pos.this_thread()->nodes & 0xB))))
+                  && !(pos.this_thread()->nodes & 0xB))))
          v = adjusted_NNUE();
  }
@@ -585,11 +585,10 @@ namespace CommandLine {
 string argv0;            // path+name of the executable binary, as given by argv[0]
 string binaryDirectory;  // path of the executable directory
 string workingDirectory; // path of the working directory
 string pathSeparator;    // Separator for our current OS
 void init(int argc, char* argv[]) {
    (void)argc;
-    string separator;
+    string pathSeparator;
    // extract the path+name of the executable binary
    argv0 = argv[0];
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Code for calculating NNUE evaluation function
@@ -40,330 +40,313 @@
 namespace Eval::NNUE {
-    const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
+  const uint32_t kpp_board_index[PIECE_NB][COLOR_NB] = {
-        // convention: W - us, B - them
+   // convention: W - us, B - them
-        // viewed from other side, W and B are reversed
+   // viewed from other side, W and B are reversed
-        { PS_NONE,     PS_NONE     },
+      { PS_NONE,     PS_NONE     },
-        { PS_W_PAWN,   PS_B_PAWN   },
+      { PS_W_PAWN,   PS_B_PAWN   },
-        { PS_W_KNIGHT, PS_B_KNIGHT },
+      { PS_W_KNIGHT, PS_B_KNIGHT },
-        { PS_W_BISHOP, PS_B_BISHOP },
+      { PS_W_BISHOP, PS_B_BISHOP },
-        { PS_W_ROOK,   PS_B_ROOK   },
+      { PS_W_ROOK,   PS_B_ROOK   },
-        { PS_W_QUEEN,  PS_B_QUEEN  },
+      { PS_W_QUEEN,  PS_B_QUEEN  },
-        { PS_W_KING,   PS_B_KING   },
+      { PS_W_KING,   PS_B_KING   },
-        { PS_NONE,     PS_NONE     },
+      { PS_NONE,     PS_NONE     },
-        { PS_NONE,     PS_NONE     },
+      { PS_NONE,     PS_NONE     },
-        { PS_B_PAWN,   PS_W_PAWN   },
+      { PS_B_PAWN,   PS_W_PAWN   },
-        { PS_B_KNIGHT, PS_W_KNIGHT },
+      { PS_B_KNIGHT, PS_W_KNIGHT },
-        { PS_B_BISHOP, PS_W_BISHOP },
+      { PS_B_BISHOP, PS_W_BISHOP },
-        { PS_B_ROOK,   PS_W_ROOK   },
+      { PS_B_ROOK,   PS_W_ROOK   },
-        { PS_B_QUEEN,  PS_W_QUEEN  },
+      { PS_B_QUEEN,  PS_W_QUEEN  },
-        { PS_B_KING,   PS_W_KING   },
+      { PS_B_KING,   PS_W_KING   },
-        { PS_NONE,     PS_NONE     }
+      { PS_NONE,     PS_NONE     }
-    };
+  };
-    // Input feature converter
+  // Input feature converter
-    LargePagePtr<FeatureTransformer> feature_transformer;
+  LargePagePtr<FeatureTransformer> feature_transformer;
-    // Evaluation function
+  // Evaluation function
-    AlignedPtr<Network> network;
+  AlignedPtr<Network> network;
-    // Evaluation function file name
+  // Evaluation function file name
-    std::string fileName;
+  std::string fileName;
-    // Saved evaluation function file name
+  // Saved evaluation function file name
-    std::string savedfileName = "nn.bin";
+  std::string savedfileName = "nn.bin";
-    // Get a string that represents the structure of the evaluation function
+  // Get a string that represents the structure of the evaluation function
-    std::string get_architecture_string() {
+  std::string get_architecture_string() {
-        return "Features=" + FeatureTransformer::get_structure_string() +
+    return "Features=" + FeatureTransformer::get_structure_string() +
-            ",Network=" + Network::get_structure_string();
+        ",Network=" + Network::get_structure_string();
-    }
+  }
-    std::string get_layers_info() {
+  std::string get_layers_info() {
-        return
+    return
-            FeatureTransformer::get_layers_info()
+        FeatureTransformer::get_layers_info()
-            + '\n' + Network::get_layers_info();
+        + '\n' + Network::get_layers_info();
-    }
+  }
-    UseNNUEMode useNNUE;
+  UseNNUEMode useNNUE;
-    std::string eval_file_loaded = "None";
+  std::string eval_file_loaded = "None";
-    namespace Detail {
+  namespace Detail {
-        // Initialize the evaluation function parameters
+  // Initialize the evaluation function parameters
-        template <typename T>
+  template <typename T>
-        void initialize(AlignedPtr<T>& pointer) {
+  void initialize(AlignedPtr<T>& pointer) {
-            pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
+    pointer.reset(reinterpret_cast<T*>(std_aligned_alloc(alignof(T), sizeof(T))));
-            std::memset(pointer.get(), 0, sizeof(T));
+    std::memset(pointer.get(), 0, sizeof(T));
-        }
+  }
-        template <typename T>
+  template <typename T>
-        void initialize(LargePagePtr<T>& pointer) {
+  void initialize(LargePagePtr<T>& pointer) {
-            static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
+    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
    std::memset(pointer.get(), 0, sizeof(T));
  }
-            pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
+  // Read evaluation function parameters
-            std::memset(pointer.get(), 0, sizeof(T));
+  template <typename T>
-        }
+  bool ReadParameters(std::istream& stream, T& reference) {
-        // Read evaluation function parameters
+    std::uint32_t header;
-        template <typename T>
+    header = read_little_endian<std::uint32_t>(stream);
-        bool ReadParameters(std::istream& stream, T& reference) {
+    if (!stream || header != T::GetHashValue()) return false;
    return reference.ReadParameters(stream);
  }
-            std::uint32_t header;
+  // write evaluation function parameters
-            header = read_little_endian<std::uint32_t>(stream);
+  template <typename T>
  bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
    constexpr std::uint32_t header = T::GetHashValue();
-            if (!stream || header != T::GetHashValue())
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
                return false;
-            return reference.ReadParameters(stream);
+    return pointer->WriteParameters(stream);
-        }
+  }
-        // write evaluation function parameters
+  template <typename T>
-        template <typename T>
+  bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
-        bool WriteParameters(std::ostream& stream, const AlignedPtr<T>& pointer) {
+    constexpr std::uint32_t header = T::GetHashValue();
            constexpr std::uint32_t header = T::GetHashValue();
-            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
-            return pointer->WriteParameters(stream);
+    return pointer->WriteParameters(stream);
-        }
+  }
  }  // namespace Detail
-        template <typename T>
+  // Initialize the evaluation function parameters
-        bool WriteParameters(std::ostream& stream, const LargePagePtr<T>& pointer) {
+  void initialize() {
            constexpr std::uint32_t header = T::GetHashValue();
-            stream.write(reinterpret_cast<const char*>(&header), sizeof(header));
+    Detail::initialize(feature_transformer);
    Detail::initialize(network);
  }
-            return pointer->WriteParameters(stream);
+  // Read network header
-        }
+  bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
-    }  // namespace Detail
+  {
    std::uint32_t version, size;
-    // Initialize the evaluation function parameters
+    version     = read_little_endian<std::uint32_t>(stream);
-    void initialize() {
+    *hash_value = read_little_endian<std::uint32_t>(stream);
    size        = read_little_endian<std::uint32_t>(stream);
    if (!stream || version != kVersion) return false;
    architecture->resize(size);
    stream.read(&(*architecture)[0], size);
    return !stream.fail();
  }
-        Detail::initialize(feature_transformer);
+  // write the header
-        Detail::initialize(network);
+  bool write_header(std::ostream& stream,
-    }
+    std::uint32_t hash_value, const std::string& architecture) {
-    // Read network header
+    stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
-    bool read_header(std::istream& stream, std::uint32_t* hash_value, std::string* architecture)
+    stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
    {
        std::uint32_t version, size;
-        version     = read_little_endian<std::uint32_t>(stream);
+    const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
        *hash_value = read_little_endian<std::uint32_t>(stream);
        size        = read_little_endian<std::uint32_t>(stream);
-        if (!stream || version != kVersion)
+    stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
-            return false;
+    stream.write(architecture.data(), size);
-        architecture->resize(size);
+    return !stream.fail();
-        stream.read(&(*architecture)[0], size);
+  }
-        return !stream.fail();
+  // Read network parameters
-    }
+  bool ReadParameters(std::istream& stream) {
-    // write the header
+    std::uint32_t hash_value;
-    bool write_header(std::ostream& stream,
+    std::string architecture;
-        std::uint32_t hash_value, const std::string& architecture) {
+    if (!read_header(stream, &hash_value, &architecture)) return false;
    if (hash_value != kHashValue) return false;
    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
    if (!Detail::ReadParameters(stream, *network)) return false;
    return stream && stream.peek() == std::ios::traits_type::eof();
  }
-        stream.write(reinterpret_cast<const char*>(&kVersion), sizeof(kVersion));
+  // write evaluation function parameters
-        stream.write(reinterpret_cast<const char*>(&hash_value), sizeof(hash_value));
+  bool WriteParameters(std::ostream& stream) {
-        const std::uint32_t size = static_cast<std::uint32_t>(architecture.size());
+    if (!write_header(stream, kHashValue, get_architecture_string()))
        return false;
-        stream.write(reinterpret_cast<const char*>(&size), sizeof(size));
+    if (!Detail::WriteParameters(stream, feature_transformer))
-        stream.write(architecture.data(), size);
+        return false;
-        return !stream.fail();
+    if (!Detail::WriteParameters(stream, network))
-    }
+        return false;
-    // Read network parameters
+    return !stream.fail();
-    bool ReadParameters(std::istream& stream) {
+}
-        std::uint32_t hash_value;
+  // Evaluation function. Perform differential calculation.
-        std::string architecture;
+  Value evaluate(const Position& pos) {
        if (!read_header(stream, &hash_value, &architecture))
            return false;
-        if (hash_value != kHashValue)
+    // We manually align the arrays on the stack because with gcc < 9.3
-            return false;
+    // overaligning stack variables with alignas() doesn't work correctly.
-        if (!Detail::ReadParameters(stream, *feature_transformer))
+    constexpr uint64_t alignment = kCacheLineSize;
            return false;
        if (!Detail::ReadParameters(stream, *network))
            return false;
        return stream && stream.peek() == std::ios::traits_type::eof();
    }
    // write evaluation function parameters
    bool WriteParameters(std::ostream& stream) {
        if (!write_header(stream, kHashValue, get_architecture_string()))
            return false;
        if (!Detail::WriteParameters(stream, feature_transformer))
            return false;
        if (!Detail::WriteParameters(stream, network))
            return false;
        return !stream.fail();
    }
    // Evaluation function. Perform differential calculation.
    Value evaluate(const Position& pos) {
        // We manually align the arrays on the stack because with gcc < 9.3
        // overaligning stack variables with alignas() doesn't work correctly.
        constexpr uint64_t alignment = kCacheLineSize;
 #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
-        TransformedFeatureType transformed_features_unaligned[
+    TransformedFeatureType transformed_features_unaligned[
-          FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
+      FeatureTransformer::kBufferSize + alignment / sizeof(TransformedFeatureType)];
-        char buffer_unaligned[Network::kBufferSize + alignment];
+    char buffer_unaligned[Network::kBufferSize + alignment];
-        auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
+    auto* transformed_features = align_ptr_up<alignment>(&transformed_features_unaligned[0]);
-        auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&buffer_unaligned[0]);
 #else
-        alignas(alignment)
+    alignas(alignment)
-          TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
+      TransformedFeatureType transformed_features[FeatureTransformer::kBufferSize];
-        alignas(alignment) char buffer[Network::kBufferSize];
+    alignas(alignment) char buffer[Network::kBufferSize];
 #endif
-        ASSERT_ALIGNED(transformed_features, alignment);
+    ASSERT_ALIGNED(transformed_features, alignment);
-        ASSERT_ALIGNED(buffer, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
-        feature_transformer->Transform(pos, transformed_features);
+    feature_transformer->Transform(pos, transformed_features);
    const auto output = network->Propagate(transformed_features, buffer);
    return static_cast<Value>(output[0] / FV_SCALE);
  }
-        const auto output = network->Propagate(transformed_features, buffer);
+  // Load eval, from a file stream or a memory stream
  bool load_eval(std::string name, std::istream& stream) {
-        return static_cast<Value>(output[0] / FV_SCALE);
+    initialize();
-    }
+    fileName = name;
    return ReadParameters(stream);
 }
-    // Load eval, from a file stream or a memory stream
+static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-    bool load_eval(std::string name, std::istream& stream) {
+{
  if (mode == "false")
    return UseNNUEMode::False;
  else if (mode == "true")
     return UseNNUEMode::True;
  else if (mode == "pure")
    return UseNNUEMode::Pure;
-        initialize();
+  return UseNNUEMode::False;
 }
-        fileName = name;
+void init() {
        return ReadParameters(stream);
    }
-    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
    {
        if (mode == "false")
          return UseNNUEMode::False;
        else if (mode == "true")
          return UseNNUEMode::True;
        else if (mode == "pure")
          return UseNNUEMode::Pure;
-        return UseNNUEMode::False;
+  if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
-    }
+  {
    eval_file_loaded.clear();
    return;
  }
-    void init() {
+  std::string eval_file = std::string(Options["EvalFile"]);
        useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
        if (Options["SkipLoadingEval"] || useNNUE == UseNNUEMode::False)
        {
            eval_file_loaded.clear();
            return;
        }
        std::string eval_file = std::string(Options["EvalFile"]);
 #if defined(DEFAULT_NNUE_DIRECTORY)
 #define stringify2(x) #x
 #define stringify(x) stringify2(x)
-        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory , stringify(DEFAULT_NNUE_DIRECTORY) };
 #else
-        std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
+  std::vector<std::string> dirs = { "" , CommandLine::binaryDirectory };
 #endif
-        for (std::string directory : dirs)
+  for (std::string directory : dirs)
-        {
+  {
-            if (eval_file_loaded != eval_file)
+    if (eval_file_loaded != eval_file)
-            {
+    {
-                std::ifstream stream(directory + eval_file, std::ios::binary);
+      std::ifstream stream(directory + eval_file, std::ios::binary);
-                if (load_eval(eval_file, stream))
+      if (load_eval(eval_file, stream))
-                {
+      {
-                    sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
+        sync_cout << "info string Loaded eval file " << directory + eval_file << sync_endl;
-                    eval_file_loaded = eval_file;
+        eval_file_loaded = eval_file;
-                }
+      }
-                else
+      else
-                {
+      {
-                    sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
+        sync_cout << "info string ERROR: failed to load eval file " << directory + eval_file << sync_endl;
-                    eval_file_loaded.clear();
+        eval_file_loaded.clear();
-                }
+      }
-            }
+    }
-        }
+  }
 #undef stringify2
 #undef stringify
-    }
+}
-    /// NNUE::verify() verifies that the last net used was loaded successfully
+/// NNUE::verify() verifies that the last net used was loaded successfully
-    void verify_eval_file_loaded() {
+void verify_eval_file_loaded() {
-        std::string eval_file = std::string(Options["EvalFile"]);
+  std::string eval_file = std::string(Options["EvalFile"]);
-        if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
+  if (useNNUE != UseNNUEMode::False && eval_file_loaded != eval_file)
-        {
+  {
-            UCI::OptionsMap defaults;
+    UCI::OptionsMap defaults;
-            UCI::init(defaults);
+    UCI::init(defaults);
-            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-            std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
+    std::string msg2 = "The option is set to true, but the network file " + eval_file + " was not loaded successfully.";
-            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-            std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
+    std::string msg4 = "The default net can be downloaded from: https://tests.stockfishchess.org/api/nn/" + std::string(defaults["EvalFile"]);
-            std::string msg5 = "The engine will be terminated now.";
+    std::string msg5 = "The engine will be terminated now.";
-            sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
-            sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
-            sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
-            sync_cout << "info string ERROR: " << msg4 << sync_endl;
+    sync_cout << "info string ERROR: " << msg4 << sync_endl;
-            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
-            std::exit(EXIT_FAILURE);
+    std::exit(EXIT_FAILURE);
-        }
+  }
-        if (useNNUE != UseNNUEMode::False)
+  if (useNNUE != UseNNUEMode::False)
-            sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
+    sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
-        else
+  else
-            sync_cout << "info string classical evaluation enabled" << sync_endl;
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+}
-    /// In training we override eval file so this is useful.
+/// In training we override eval file so this is useful.
-    void verify_any_net_loaded() {
+void verify_any_net_loaded() {
-        if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
+  if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
-        {
+  {
-            UCI::OptionsMap defaults;
+    UCI::OptionsMap defaults;
-            UCI::init(defaults);
+    UCI::init(defaults);
-            std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
+    std::string msg1 = "If the UCI option \"Use NNUE\" is set to true, network evaluation parameters compatible with the engine must be available.";
-            std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
+    std::string msg2 = "The option is set to true, but the network file was not loaded successfully.";
-            std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
+    std::string msg3 = "The UCI option EvalFile might need to specify the full path, including the directory name, to the network file.";
-            std::string msg5 = "The engine will be terminated now.";
+    std::string msg5 = "The engine will be terminated now.";
-            sync_cout << "info string ERROR: " << msg1 << sync_endl;
+    sync_cout << "info string ERROR: " << msg1 << sync_endl;
-            sync_cout << "info string ERROR: " << msg2 << sync_endl;
+    sync_cout << "info string ERROR: " << msg2 << sync_endl;
-            sync_cout << "info string ERROR: " << msg3 << sync_endl;
+    sync_cout << "info string ERROR: " << msg3 << sync_endl;
-            sync_cout << "info string ERROR: " << msg5 << sync_endl;
+    sync_cout << "info string ERROR: " << msg5 << sync_endl;
-            std::exit(EXIT_FAILURE);
+    std::exit(EXIT_FAILURE);
-        }
+  }
-        if (useNNUE != UseNNUEMode::False)
+  if (useNNUE != UseNNUEMode::False)
-            sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
+    sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
-        else
+  else
-            sync_cout << "info string classical evaluation enabled" << sync_endl;
+    sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+}
 } // namespace Eval::NNUE
@@ -1,21 +1,23 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // header used in NNUE evaluation function
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED
@@ -25,84 +27,83 @@
 #include <memory>
 // header used in NNUE evaluation function
 namespace Eval::NNUE {
-    enum struct UseNNUEMode
+  enum struct UseNNUEMode
-    {
+  {
-        False,
+    False,
-        True,
+    True,
-        Pure
+    Pure
-    };
+  };
-    // Hash value of evaluation function structure
+  // Hash value of evaluation function structure
-    constexpr std::uint32_t kHashValue =
+  constexpr std::uint32_t kHashValue =
-        FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
+      FeatureTransformer::GetHashValue() ^ Network::GetHashValue();
-    // Deleter for automating release of memory area
+  // Deleter for automating release of memory area
-    template <typename T>
+  template <typename T>
-    struct AlignedDeleter {
+  struct AlignedDeleter {
-        void operator()(T* ptr) const {
+    void operator()(T* ptr) const {
-            ptr->~T();
+      ptr->~T();
-            std_aligned_free(ptr);
+      std_aligned_free(ptr);
-        }
+    }
-    };
+  };
-    template <typename T>
+  template <typename T>
-    struct LargePageDeleter {
+  struct LargePageDeleter {
-        void operator()(T* ptr) const {
+    void operator()(T* ptr) const {
-            ptr->~T();
+      ptr->~T();
-            aligned_large_pages_free(ptr);
+      aligned_large_pages_free(ptr);
-        }
+    }
-    };
+  };
-    template <typename T>
+  template <typename T>
-    using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
+  using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
-    template <typename T>
+  template <typename T>
-    using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
+  using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
-    // Input feature converter
+  // Input feature converter
-    extern LargePagePtr<FeatureTransformer> feature_transformer;
+  extern LargePagePtr<FeatureTransformer> feature_transformer;
-    // Evaluation function
+  // Evaluation function
-    extern AlignedPtr<Network> network;
+  extern AlignedPtr<Network> network;
-    // Evaluation function file name
+  // Evaluation function file name
-    extern std::string fileName;
+  extern std::string fileName;
-    // Saved evaluation function file name
+  // Saved evaluation function file name
-    extern std::string savedfileName;
+  extern std::string savedfileName;
-    extern UseNNUEMode useNNUE;
+  extern UseNNUEMode useNNUE;
-    extern std::string eval_file_loaded;
+  extern std::string eval_file_loaded;
-    // Get a string that represents the structure of the evaluation function
+  // Get a string that represents the structure of the evaluation function
-    std::string get_architecture_string();
+  std::string get_architecture_string();
-    std::string get_layers_info();
+  std::string get_layers_info();
-    // read the header
+  // read the header
-    bool read_header(std::istream& stream,
+  bool read_header(std::istream& stream,
-        std::uint32_t* hash_value, std::string* architecture);
+      std::uint32_t* hash_value, std::string* architecture);
-    // write the header
+  // write the header
-    bool write_header(std::ostream& stream,
+  bool write_header(std::ostream& stream,
-        std::uint32_t hash_value, const std::string& architecture);
+      std::uint32_t hash_value, const std::string& architecture);
-    // read evaluation function parameters
+  // read evaluation function parameters
-    bool ReadParameters(std::istream& stream);
+  bool ReadParameters(std::istream& stream);
-    // write evaluation function parameters
+  // write evaluation function parameters
-    bool WriteParameters(std::ostream& stream);
+  bool WriteParameters(std::ostream& stream);
-    Value evaluate(const Position& pos);
+  Value evaluate(const Position& pos);
-    bool load_eval(std::string name, std::istream& stream);
+  bool load_eval(std::string name, std::istream& stream);
-    void init();
+  void init();
-    void verify_eval_file_loaded();
+  void verify_eval_file_loaded();
-    void verify_any_net_loaded();
+  void verify_any_net_loaded();
 }  // namespace Eval::NNUE
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // A class template that represents the input feature set of the NNUE evaluation function
@@ -22,7 +22,6 @@
 #define NNUE_FEATURE_SET_H_INCLUDED
 #include "features_common.h"
 #include <array>
 namespace Eval::NNUE::Features {
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 //Common header of input features of NNUE evaluation function
@@ -21,30 +21,29 @@
 #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
 #define NNUE_FEATURES_COMMON_H_INCLUDED
-#include "evaluate.h"
+#include "../../evaluate.h"
-
+#include "../nnue_common.h"
 #include "nnue/nnue_common.h"
 namespace Eval::NNUE::Features {
-    class IndexList;
+  class IndexList;
-    template <typename... FeatureTypes>
+  template <typename... FeatureTypes>
-    class FeatureSet;
+  class FeatureSet;
-    // Trigger to perform full calculations instead of difference only
+  // Events that trigger a full calculation instead of a difference-only update
-    enum class TriggerEvent {
+  enum class TriggerEvent {
-        kNone, // Calculate the difference whenever possible
+    kNone, // calculate the difference whenever possible
-        kFriendKingMoved, // calculate full evaluation when own king moves
+    kFriendKingMoved, // calculate full evaluation when own king moves
-        kEnemyKingMoved, // calculate full evaluation when opponent king moves
+    kEnemyKingMoved, // calculate full evaluation when opponent king moves
-        kAnyKingMoved, // calculate full evaluation when any king moves
+    kAnyKingMoved, // calculate full evaluation when either king moves
-        kAnyPieceMoved, // always calculate full evaluation
+    kAnyPieceMoved, // always calculate full evaluation
-    };
+  };
-    enum class Side {
+  enum class Side {  // side expressed relative to the side to move
-        kFriend, // side to move
+    kFriend, // side to move
-        kEnemy, // opponent
+    kEnemy, // opponent of the side to move
-    };
+  };
 }  // namespace Eval::NNUE::Features
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Definition of index list of input features
@@ -21,43 +21,43 @@
 #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
-#include "position.h"
+#include "../../position.h"
-
+#include "../nnue_architecture.h"
 #include "nnue/nnue_architecture.h"
 namespace Eval::NNUE::Features {
-    // Class template used for feature index list
+  // Fixed-capacity list of T stored inline in an array of MaxSize elements; used for feature index lists
-    template <typename T, std::size_t MaxSize>
+  template <typename T, std::size_t MaxSize>
-    class ValueList {
+  class ValueList {
-    public:
+   public:
-        std::size_t size() const { return size_; }
+    std::size_t size() const { return size_; }  // number of elements currently in use
-        void resize(std::size_t size) { size_ = size; }
+    void resize(std::size_t size) { size_ = size; }  // only changes the count: no bounds check, no element initialization
-        void push_back(const T& value) { values_[size_++] = value; }
+    void push_back(const T& value) { values_[size_++] = value; }  // no capacity check: caller must ensure size_ < MaxSize
-        T& operator[](std::size_t index) { return values_[index]; }
+    T& operator[](std::size_t index) { return values_[index]; }  // unchecked access
-        T* begin() { return values_; }
+    T* begin() { return values_; }
-        T* end() { return values_ + size_; }
+    T* end() { return values_ + size_; }  // one past the last element in use
-        const T& operator[](std::size_t index) const { return values_[index]; }
+    const T& operator[](std::size_t index) const { return values_[index]; }  // unchecked access
-        const T* begin() const { return values_; }
+    const T* begin() const { return values_; }
-        const T* end() const { return values_ + size_; }
+    const T* end() const { return values_ + size_; }  // one past the last element in use
-        void swap(ValueList& other) {
+    void swap(ValueList& other) {  // exchanges the elements and sizes of the two lists
-            const std::size_t max_size = std::max(size_, other.size_);
+      const std::size_t max_size = std::max(size_, other.size_);  // cover the longer of the two lists
-            for (std::size_t i = 0; i < max_size; ++i) {
+      for (std::size_t i = 0; i < max_size; ++i) {
-                std::swap(values_[i], other.values_[i]);
+        std::swap(values_[i], other.values_[i]);  // NOTE(review): also touches slots past the shorter list's size_, which may be uninitialized now - confirm this is acceptable
-            }
+      }
-            std::swap(size_, other.size_);
+      std::swap(size_, other.size_);
-        }
+    }
-    private:
+   private:
-        T values_[MaxSize] = {};
+    T values_[MaxSize];  // no initializer any more: slots at or beyond size_ may hold indeterminate values
-        std::size_t size_ = 0;
+    std::size_t size_ = 0;  // count of valid entries at the front of values_
-    };
+  };
-    //Type of feature index list
+  // Type of feature index list: a ValueList of IndexType with capacity RawFeatures::kMaxActiveDimensions
-    class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+  class IndexList
-    };
+      : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
  };
 }  // namespace Eval::NNUE::Features
@@ -223,13 +223,13 @@ namespace Eval::NNUE::Layers {
        return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
      };
      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 #if defined (USE_VNNI)
      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
        acc = _mm512_dpbusd_epi32(acc, a, b);
 #else
      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
        __m512i product0 = _mm512_maddubs_epi16(a, b);
-        product0 = _mm512_madd_epi16(product0, kOnes512);
+        return _mm512_madd_epi16(product0, kOnes512);
        acc = _mm512_add_epi32(acc, product0);
 #endif
      };
@@ -256,14 +256,13 @@ namespace Eval::NNUE::Layers {
        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
      };
      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
 #if defined (USE_VNNI)
      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
        acc = _mm256_dpbusd_epi32(acc, a, b);
 #else
      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
        __m256i product0 = _mm256_maddubs_epi16(a, b);
-        product0 = _mm256_madd_epi16(product0, kOnes256);
+        return _mm256_madd_epi16(product0, kOnes256);
        acc = _mm256_add_epi32(acc, product0);
 #endif
      };
@@ -288,10 +287,9 @@ namespace Eval::NNUE::Layers {
        return _mm_add_epi32(sum0, bias);
      };
-      [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
        __m128i product0 = _mm_maddubs_epi16(a, b);
-        product0 = _mm_madd_epi16(product0, kOnes128);
+        return _mm_madd_epi16(product0, kOnes128);
        acc = _mm_add_epi32(acc, product0);
      };
 #endif
@@ -335,15 +333,6 @@ namespace Eval::NNUE::Layers {
          const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
          __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
          __m512i sum01a = _mm512_setzero_si512();
          __m512i sum23a = _mm512_setzero_si512();
          __m512i sum45a = _mm512_setzero_si512();
          __m512i sum67a = _mm512_setzero_si512();
          __m512i sum01b = _mm512_setzero_si512();
          __m512i sum23b = _mm512_setzero_si512();
          __m512i sum45b = _mm512_setzero_si512();
          __m512i sum67b = _mm512_setzero_si512();
          const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
          const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
          const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
@@ -356,6 +345,16 @@ namespace Eval::NNUE::Layers {
          const __m256i in256 = input_vector256[0];
          const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
 #if defined (USE_VNNI)
          __m512i sum01a = _mm512_setzero_si512();
          __m512i sum23a = _mm512_setzero_si512();
          __m512i sum45a = _mm512_setzero_si512();
          __m512i sum67a = _mm512_setzero_si512();
          __m512i sum01b = _mm512_setzero_si512();
          __m512i sum23b = _mm512_setzero_si512();
          __m512i sum45b = _mm512_setzero_si512();
          __m512i sum67b = _mm512_setzero_si512();
          m512_add_dpbusd_epi32(sum01a, in, row01a);
          m512_add_dpbusd_epi32(sum23a, in, row23a);
          m512_add_dpbusd_epi32(sum45a, in, row45a);
@@ -364,6 +363,16 @@ namespace Eval::NNUE::Layers {
          m512_add_dpbusd_epi32(sum23b, in, row23b);
          m512_add_dpbusd_epi32(sum45b, in, row45b);
          m512_add_dpbusd_epi32(sum67b, in, row67b);
 #else
          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
 #endif
          *outptr = m512_hadd256x16(
            sum01a, sum23a, sum45a, sum67a,
@@ -384,48 +393,80 @@ namespace Eval::NNUE::Layers {
          if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
          {
            __m512i sum0 = _mm512_setzero_si512();
            __m512i sum1 = _mm512_setzero_si512();
            __m512i sum2 = _mm512_setzero_si512();
            __m512i sum3 = _mm512_setzero_si512();
            const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
            const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
            const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
            const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
-            for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
            __m512i sum0 = _mm512_setzero_si512();
            __m512i sum1 = _mm512_setzero_si512();
            __m512i sum2 = _mm512_setzero_si512();
            __m512i sum3 = _mm512_setzero_si512();
            const IndexType kStart = 0;
 #else
            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
            const IndexType kStart = 1;
 #endif
            for (IndexType j = kStart; j < kNumChunks512; ++j)
            {
              const __m512i in = input_vector512[j];
 #if defined (USE_VNNI)
              m512_add_dpbusd_epi32(sum0, in, row0[j]);
              m512_add_dpbusd_epi32(sum1, in, row1[j]);
              m512_add_dpbusd_epi32(sum2, in, row2[j]);
              m512_add_dpbusd_epi32(sum3, in, row3[j]);
 #else
              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
 #endif
            }
            *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
          }
          else
          {
            __m256i sum0 = _mm256_setzero_si256();
            __m256i sum1 = _mm256_setzero_si256();
            __m256i sum2 = _mm256_setzero_si256();
            __m256i sum3 = _mm256_setzero_si256();
            const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
            const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
            const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
            const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
-            for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
            __m256i sum0 = _mm256_setzero_si256();
            __m256i sum1 = _mm256_setzero_si256();
            __m256i sum2 = _mm256_setzero_si256();
            __m256i sum3 = _mm256_setzero_si256();
            const IndexType kStart = 0;
 #else
            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
            const IndexType kStart = 1;
 #endif
            for (IndexType j = kStart; j < kNumChunks256; ++j)
            {
              const __m256i in = input_vector256[j];
 #if defined (USE_VNNI)
              m256_add_dpbusd_epi32(sum0, in, row0[j]);
              m256_add_dpbusd_epi32(sum1, in, row1[j]);
              m256_add_dpbusd_epi32(sum2, in, row2[j]);
              m256_add_dpbusd_epi32(sum3, in, row3[j]);
 #else
              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
 #endif
            }
            *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -436,30 +477,50 @@ namespace Eval::NNUE::Layers {
      {
        if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
        {
          __m512i sum0 = _mm512_setzero_si512();
          const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
-          for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
          __m512i sum0 = _mm512_setzero_si512();
          const IndexType kStart = 0;
 #else
          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
          const IndexType kStart = 1;
 #endif
          for (IndexType j = kStart; j < kNumChunks512; ++j)
          {
            const __m512i in = input_vector512[j];
 #if defined (USE_VNNI)
            m512_add_dpbusd_epi32(sum0, in, row0[j]);
 #else
            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
 #endif
          }
          output[0] = m512_hadd(sum0, biases_[0]);
        }
        else
        {
          __m256i sum0 = _mm256_setzero_si256();
          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
-          for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
          __m256i sum0 = _mm256_setzero_si256();
          const IndexType kStart = 0;
 #else
          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
          const IndexType kStart = 1;
 #endif
          for (IndexType j = kStart; j < kNumChunks256; ++j)
          {
            const __m256i in = input_vector256[j];
 #if defined (USE_VNNI)
            m256_add_dpbusd_epi32(sum0, in, row0[j]);
 #else
            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
 #endif
          }
          output[0] = m256_hadd(sum0, biases_[0]);
@@ -493,24 +554,40 @@ namespace Eval::NNUE::Layers {
          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
          __m256i sum0 = _mm256_setzero_si256();
          __m256i sum1 = _mm256_setzero_si256();
          __m256i sum2 = _mm256_setzero_si256();
          __m256i sum3 = _mm256_setzero_si256();
          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
          const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
          const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
          const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
-          for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
          __m256i sum0 = _mm256_setzero_si256();
          __m256i sum1 = _mm256_setzero_si256();
          __m256i sum2 = _mm256_setzero_si256();
          __m256i sum3 = _mm256_setzero_si256();
          const IndexType kStart = 0;
 #else
          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
          const IndexType kStart = 1;
 #endif
          for (IndexType j = kStart; j < kNumChunks; ++j)
          {
            const __m256i in = input_vector[j];
 #if defined (USE_VNNI)
            m256_add_dpbusd_epi32(sum0, in, row0[j]);
            m256_add_dpbusd_epi32(sum1, in, row1[j]);
            m256_add_dpbusd_epi32(sum2, in, row2[j]);
            m256_add_dpbusd_epi32(sum3, in, row3[j]);
 #else
            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
 #endif
          }
          *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -518,15 +595,25 @@ namespace Eval::NNUE::Layers {
      }
      else if constexpr (kOutputDimensions == 1)
      {
        __m256i sum0 = _mm256_setzero_si256();
        const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
-        for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
        __m256i sum0 = _mm256_setzero_si256();
        const IndexType kStart = 0;
 #else
        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
        const IndexType kStart = 1;
 #endif
        for (IndexType j = kStart; j < kNumChunks; ++j)
        {
          const __m256i in = input_vector[j];
-            m256_add_dpbusd_epi32(sum0, in, row0[j]);
+#if defined (USE_VNNI)
          m256_add_dpbusd_epi32(sum0, in, row0[j]);
 #else
          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
 #endif
        }
        output[0] = m256_hadd(sum0, biases_[0]);
@@ -559,24 +646,24 @@ namespace Eval::NNUE::Layers {
          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
          __m128i sum0 = _mm_setzero_si128();
          __m128i sum1 = _mm_setzero_si128();
          __m128i sum2 = _mm_setzero_si128();
          __m128i sum3 = _mm_setzero_si128();
          const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
          const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
          const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
          const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
-          for (int j = 0; j < (int)kNumChunks; j += 1)
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
          for (int j = 1; j < (int)kNumChunks; ++j)
          {
            const __m128i in = input_vector[j];
-            m128_add_dpbusd_epi32(sum0, in, row0[j]);
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
-            m128_add_dpbusd_epi32(sum1, in, row1[j]);
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
-            m128_add_dpbusd_epi32(sum2, in, row2[j]);
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
-            m128_add_dpbusd_epi32(sum3, in, row3[j]);
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
          }
          *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -584,16 +671,12 @@ namespace Eval::NNUE::Layers {
      }
      else if constexpr (kOutputDimensions == 1)
      {
        __m128i sum0 = _mm_setzero_si128();
        const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
-        for (int j = 0; j < (int)kNumChunks; j += 1)
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
        {
          const __m128i in = input_vector[j];
-          m128_add_dpbusd_epi32(sum0, in, row0[j]);
+        for (int j = 1; j < (int)kNumChunks; ++j)
-        }
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
        output[0] = m128_hadd(sum0, biases_[0]);
      }
@@ -1,34 +1,35 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Class for difference calculation of NNUE evaluation function
 #ifndef NNUE_ACCUMULATOR_H_INCLUDED
 #define NNUE_ACCUMULATOR_H_INCLUDED
 #include "nnue_architecture.h"
 // Class for difference calculation of NNUE evaluation function
 namespace Eval::NNUE {
-    // Class that holds the result of affine transformation of input features
+  // Class that holds the result of affine transformation of input features
-    struct alignas(kCacheLineSize) Accumulator {
+  struct alignas(kCacheLineSize) Accumulator {  // aligned to kCacheLineSize
-        std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];
+      std::int16_t accumulation[2][kRefreshTriggers.size()][kTransformedFeatureDimensions];  // [2][refresh trigger][transformed feature]; the leading 2 is presumably one slot per perspective - TODO confirm
-        bool computed_accumulation;
+      bool computed_accumulation;  // presumably true once accumulation is up to date; has no initializer here - verify against users
-    };
+  };
 }  // namespace Eval::NNUE
@@ -1,36 +1,37 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Input features and network structure used in NNUE evaluation function
 #ifndef NNUE_ARCHITECTURE_H_INCLUDED
 #define NNUE_ARCHITECTURE_H_INCLUDED
 // Defines the network structure
 #include "architectures/halfkp_256x2-32-32.h"
 // Input features and network structure used in NNUE evaluation function
 namespace Eval::NNUE {
-    static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
+  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");  // transformed feature count must be a multiple of kMaxSimdWidth
-    static_assert(Network::kOutputDimensions == 1, "");
+  static_assert(Network::kOutputDimensions == 1, "");  // the network must produce exactly one output value
-    static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");
+  static_assert(std::is_same<Network::OutputType, std::int32_t>::value, "");  // the network output type must be std::int32_t
-    // Trigger for full calculation instead of difference calculation
+  // Trigger for full calculation instead of difference calculation (taken from the raw feature set)
-    constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
+  constexpr auto kRefreshTriggers = RawFeatures::kRefreshTriggers;
 }  // namespace Eval::NNUE
@@ -1,19 +1,19 @@
 /*
-    Stockfish, a UCI chess playing engine derived from Glaurung 2.1
+  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
-    Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
+  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
-    Stockfish is free software: you can redistribute it and/or modify
+  Stockfish is free software: you can redistribute it and/or modify
-    it under the terms of the GNU General Public License as published by
+  it under the terms of the GNU General Public License as published by
-    the Free Software Foundation, either version 3 of the License, or
+  the Free Software Foundation, either version 3 of the License, or
-    (at your option) any later version.
+  (at your option) any later version.
-    Stockfish is distributed in the hope that it will be useful,
+  Stockfish is distributed in the hope that it will be useful,
-    but WITHOUT ANY WARRANTY; without even the implied warranty of
+  but WITHOUT ANY WARRANTY; without even the implied warranty of
-    MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
+  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
-    GNU General Public License for more details.
+  GNU General Public License for more details.
-    You should have received a copy of the GNU General Public License
+  You should have received a copy of the GNU General Public License
-    along with this program.  If not, see <http://www.gnu.org/licenses/>.
+  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // A class that converts the input features of the NNUE evaluation function
@@ -23,7 +23,6 @@
 #include "nnue_common.h"
 #include "nnue_architecture.h"
 #include "features/index_list.h"
 #include <cstring>
@@ -31,456 +30,486 @@
 namespace Eval::NNUE {
-    // If vector instructions are enabled, we update and refresh the
+  // If vector instructions are enabled, we update and refresh the
-    // accumulator tile by tile such that each tile fits in the CPU's
+  // accumulator tile by tile such that each tile fits in the CPU's
-    // vector registers.
+  // vector registers.
-#define TILING
+  #define VECTOR
-#ifdef USE_AVX512
+  #ifdef USE_AVX512
-    typedef __m512i vec_t;
+  typedef __m512i vec_t;
-#define vec_load(a) _mm512_load_si512(a)
+  #define vec_load(a) _mm512_load_si512(a)
-#define vec_store(a,b) _mm512_store_si512(a,b)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
-#define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-#define vec_zero _mm512_setzero_si512()
+  #define vec_zero _mm512_setzero_si512()
-    static constexpr IndexType kNumRegs = 8; // only 8 are needed
+  static constexpr IndexType kNumRegs = 8; // only 8 are needed
-#elif USE_AVX2
+  #elif USE_AVX2
-    typedef __m256i vec_t;
+  typedef __m256i vec_t;
-#define vec_load(a) _mm256_load_si256(a)
+  #define vec_load(a) _mm256_load_si256(a)
-#define vec_store(a,b) _mm256_store_si256(a,b)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
-#define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-#define vec_zero _mm256_setzero_si256()
+  #define vec_zero _mm256_setzero_si256()
    static constexpr IndexType kNumRegs = 16;
 #elif USE_SSE2
    typedef __m128i vec_t;
 #define vec_load(a) (*(a))
 #define vec_store(a,b) *(a)=(b)
 #define vec_add_16(a,b) _mm_add_epi16(a,b)
 #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
 #define vec_zero _mm_setzero_si128()
    static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
 #elif USE_MMX
    typedef __m64 vec_t;
 #define vec_load(a) (*(a))
 #define vec_store(a,b) *(a)=(b)
 #define vec_add_16(a,b) _mm_add_pi16(a,b)
 #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
 #define vec_zero _mm_setzero_si64()
    static constexpr IndexType kNumRegs = 8;
 #elif USE_NEON
    typedef int16x8_t vec_t;
 #define vec_load(a) (*(a))
 #define vec_store(a,b) *(a)=(b)
 #define vec_add_16(a,b) vaddq_s16(a,b)
 #define vec_sub_16(a,b) vsubq_s16(a,b)
 #define vec_zero {0}
  static constexpr IndexType kNumRegs = 16;
  #elif USE_SSE2
  typedef __m128i vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_epi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
  #define vec_zero _mm_setzero_si128()
  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
  #elif USE_MMX
  typedef __m64 vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) _mm_add_pi16(a,b)
  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
  #define vec_zero _mm_setzero_si64()
  static constexpr IndexType kNumRegs = 8;
  #elif USE_NEON
  typedef int16x8_t vec_t;
  #define vec_load(a) (*(a))
  #define vec_store(a,b) *(a)=(b)
  #define vec_add_16(a,b) vaddq_s16(a,b)
  #define vec_sub_16(a,b) vsubq_s16(a,b)
  #define vec_zero {0}
  static constexpr IndexType kNumRegs = 16;
  #else
  #undef VECTOR
  #endif
  // Input feature converter
  class FeatureTransformer {
   private:
    // Number of output dimensions for one side
    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
    #ifdef VECTOR
    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
    #endif
   public:
    // Output type
    using OutputType = TransformedFeatureType;
    // Number of input/output dimensions
    static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
    static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
    // Size of forward propagation buffer
    static constexpr std::size_t kBufferSize =
        kOutputDimensions * sizeof(OutputType);
    static constexpr int kLayerIndex = 0;
    // Hash value embedded in the evaluation file
    // It is the raw feature set's hash XOR-ed with the total output width.
    static constexpr std::uint32_t GetHashValue() {
      constexpr auto kFeatureHash = RawFeatures::kHashValue;
      return kFeatureHash ^ kOutputDimensions;
    }
    // Name of this transformer in the form
    // "<raw feature name>[<input dimensions>-><half dimensions>x2]".
    static std::string get_name() {
      std::string name = RawFeatures::get_name();
      name += "[";
      name += std::to_string(kInputDimensions);
      name += "->";
      name += std::to_string(kHalfDimensions);
      name += "x2]";
      return name;
    }
    // Returns a string describing the structure of this layer
    static std::string get_structure_string() {
      // The structure description is simply the transformer's name.
      std::string structure = get_name();
      return structure;
    }
    // One-line summary of this layer: "  - <layer index> - <name>".
    static std::string get_layers_info() {
      return "  - " + std::to_string(kLayerIndex) + " - " + get_name();
    }
    // Read network parameters
    // The stream holds the biases first, then the weights; every value is
    // decoded with read_little_endian. Returns false if the stream has failed.
    bool ReadParameters(std::istream& stream) {
      constexpr std::size_t kNumBiases = kHalfDimensions;
      constexpr std::size_t kNumWeights = kHalfDimensions * kInputDimensions;

      for (std::size_t b = 0; b < kNumBiases; ++b)
        biases_[b] = read_little_endian<BiasType>(stream);

      for (std::size_t w = 0; w < kNumWeights; ++w)
        weights_[w] = read_little_endian<WeightType>(stream);

      const bool stream_ok = !stream.fail();
      return stream_ok;
    }
    // Write network parameters
    // Biases are written first, then the weights. Every value is emitted
    // least-significant byte first so that the output is exactly what
    // ReadParameters decodes with read_little_endian, whatever the byte order
    // of the host. (Dumping the arrays raw would only round-trip on
    // little-endian machines.) Returns false if the stream has failed.
    bool WriteParameters(std::ostream& stream) const {
      // Serialises one integer as sizeof(value) little-endian bytes.
      const auto write_le = [&stream](auto value) {
        // Converting to an unsigned type keeps the two's-complement bit
        // pattern of negative values and makes the shifts well defined.
        const auto bits = static_cast<unsigned long long>(value);
        unsigned char bytes[sizeof(value)];
        for (std::size_t b = 0; b < sizeof(value); ++b)
          bytes[b] = static_cast<unsigned char>(bits >> (8 * b));
        stream.write(reinterpret_cast<const char*>(bytes), sizeof(value));
      };

      for (std::size_t i = 0; i < kHalfDimensions; ++i)
        write_le(biases_[i]);
      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
        write_le(weights_[i]);
      return !stream.fail();
    }
    // Proceed with the difference calculation if possible
    // Returns true when the accumulator of the current state is usable after
    // the call: either it was already computed, or the previous state's
    // accumulator was computed and update_accumulator() has been run.
    // Returns false when neither is available.
    bool update_accumulator_if_possible(const Position& pos) const {
      const auto current = pos.state();
      if (current->accumulator.computed_accumulation)
        return true;

      const auto parent = current->previous;
      const bool can_update =
          parent && parent->accumulator.computed_accumulation;
      if (can_update)
        update_accumulator(pos);
      return can_update;
    }
    // Convert input features
    // First makes sure the accumulator of the current state is available
    // (incrementally if possible, otherwise via a full refresh). Then, for each
    // of the two perspectives, sums the accumulator slices of all refresh
    // triggers and stores the clipped result in `output`: the side to move
    // occupies the first kHalfDimensions entries, the opponent the second half.
    // NOTE(review): the aligned SIMD stores below presumably require `output`
    // to be aligned to the vector width -- confirm against the caller.
    void Transform(const Position& pos, OutputType* output) const {
      if (!update_accumulator_if_possible(pos))
        refresh_accumulator(pos);
      const auto& accumulation = pos.state()->accumulator.accumulation;
      // Per-ISA constants: number of output vectors per perspective and the
      // helper values used to clip (and, where needed, reorder) the result.
  #if defined(USE_AVX512)
      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
      // The pack instruction works within 128-bit lanes; this permutation of
      // the 64-bit groups puts the packed bytes back into element order.
      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
      const __m512i kZero = _mm512_setzero_si512();
  #elif defined(USE_AVX2)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
      // Same purpose as above: undo the lane-wise interleaving of the pack.
      constexpr int kControl = 0b11011000;
      const __m256i kZero = _mm256_setzero_si256();
  #elif defined(USE_SSE2)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
  #ifdef USE_SSE41
      const __m128i kZero = _mm_setzero_si128();
  #else
      // Without SSE4.1 there is no _mm_max_epi8; max(x, 0) is emulated below
      // with a saturating add and subtract of -128.
      const __m128i k0x80s = _mm_set1_epi8(-128);
  #endif
  #elif defined(USE_MMX)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
      const __m64 k0x80s = _mm_set1_pi8(-128);
  #elif defined(USE_NEON)
      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
      const int8x8_t kZero = {0};
  #endif
      // Index 0 is the side to move, index 1 the opponent.
      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
      for (IndexType p = 0; p < 2; ++p) {
        const IndexType offset = kHalfDimensions * p;
        // In every variant below, slice 0 seeds the sum and the slices of the
        // remaining refresh triggers (i >= 1) are added on top of it.
  #if defined(USE_AVX512)
        auto out = reinterpret_cast<__m512i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m512i sum0 = _mm512_load_si512(
              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m512i sum1 = _mm512_load_si512(
              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
                  accumulation[perspectives[p]][i])[j * 2 + 0]);
              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
                  accumulation[perspectives[p]][i])[j * 2 + 1]);
          }
          // Saturating pack to 8 bit, clip negatives to zero, restore order.
          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
        }
  #elif defined(USE_AVX2)
        auto out = reinterpret_cast<__m256i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m256i sum0 = _mm256_load_si256(
              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m256i sum1 = _mm256_load_si256(
              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
              sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
                  accumulation[perspectives[p]][i])[j * 2 + 0]);
              sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
                  accumulation[perspectives[p]][i])[j * 2 + 1]);
          }
          // Saturating pack to 8 bit, clip negatives to zero, restore order.
          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
        }
  #elif defined(USE_SSE2)
        auto out = reinterpret_cast<__m128i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
              accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
              accumulation[perspectives[p]][0])[j * 2 + 1]);
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
            sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
                accumulation[perspectives[p]][i])[j * 2 + 0]);
            sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
                accumulation[perspectives[p]][i])[j * 2 + 1]);
          }
      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
          // Clip negatives to zero: directly with SSE4.1, otherwise through
          // the saturating add/subtract of -128.
          _mm_store_si128(&out[j],
  #ifdef USE_SSE41
              _mm_max_epi8(packedbytes, kZero)
  #else
              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
  #endif
          );
        }
  #elif defined(USE_MMX)
        auto out = reinterpret_cast<__m64*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
              accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
              accumulation[perspectives[p]][0])[j * 2 + 1]);
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
              sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
                  accumulation[perspectives[p]][i])[j * 2 + 0]);
              sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
                  accumulation[perspectives[p]][i])[j * 2 + 1]);
          }
          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
          // Same max(x, 0) emulation as the plain SSE2 path.
          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
        }
  #elif defined(USE_NEON)
        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
              accumulation[perspectives[p]][0])[j];
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
              sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
                  accumulation[perspectives[p]][i])[j]);
          }
          // Saturating narrow to 8 bit, then clip negatives to zero.
          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
        }
  #else
        // Portable fallback: one element at a time, clamped to [0, 127].
        for (IndexType j = 0; j < kHalfDimensions; ++j) {
          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
              sum += accumulation[static_cast<int>(perspectives[p])][i][j];
          }
          output[offset + j] = static_cast<OutputType>(
              std::max<int>(0, std::min<int>(127, sum)));
        }
  #endif
      }
  #if defined(USE_MMX)
      // Leave MMX state so that later x87 floating-point code works again.
      _mm_empty();
  #endif
    }
   private:
    // Calculate cumulative value without using difference calculation
    void refresh_accumulator(const Position& pos) const {
  #ifdef VECTOR
      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
      // is defined in the VECTOR code below, once in each branch
      vec_t acc[kNumRegs];
  #endif
      auto& accumulator = pos.state()->accumulator;
      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
        Features::IndexList active_indices[2];
        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
                                           active_indices);
          for (Color perspective : { WHITE, BLACK }) {
 #ifdef VECTOR
            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
              auto accTile = reinterpret_cast<vec_t*>(
                  &accumulator.accumulation[perspective][i][j * kTileHeight]);
              if (i == 0) {
                auto biasesTile = reinterpret_cast<const vec_t*>(
                    &biases_[j * kTileHeight]);
                for (IndexType k = 0; k < kNumRegs; ++k)
                  acc[k] = biasesTile[k];
              } else {
                for (IndexType k = 0; k < kNumRegs; ++k)
                  acc[k] = vec_zero;
              }
              for (const auto index : active_indices[perspective]) {
                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
                for (IndexType k = 0; k < kNumRegs; ++k)
                  acc[k] = vec_add_16(acc[k], column[k]);
              }
              for (IndexType k = 0; k < kNumRegs; k++)
                vec_store(&accTile[k], acc[k]);
            }
 #else
-#undef TILING
+            if (i == 0) {
-
+              std::memcpy(accumulator.accumulation[perspective][i], biases_,
-#endif
+                          kHalfDimensions * sizeof(BiasType));
-
+            } else {
-    // Input feature converter
+              std::memset(accumulator.accumulation[perspective][i], 0,
-    class FeatureTransformer {
+                          kHalfDimensions * sizeof(BiasType));
    private:
        // Number of output dimensions for one side
        static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
 #ifdef TILING
        static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
        static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
 #endif
    public:
        // Output type
        using OutputType = TransformedFeatureType;
        // Number of input/output dimensions
        static constexpr IndexType kInputDimensions = RawFeatures::kDimensions;
        static constexpr IndexType kOutputDimensions = kHalfDimensions * 2;
        // Size of forward propagation buffer
        static constexpr std::size_t kBufferSize =
            kOutputDimensions * sizeof(OutputType);
        static constexpr int kLayerIndex = 0;
        // Hash value embedded in the evaluation file
        static constexpr std::uint32_t GetHashValue() {
            return RawFeatures::kHashValue ^ kOutputDimensions;
        }
        static std::string get_name() {
            return RawFeatures::get_name() + "[" +
                std::to_string(kInputDimensions) + "->" +
                std::to_string(kHalfDimensions) + "x2]";
        }
        // a string representing the structure
        static std::string get_structure_string() {
            return get_name();
        }
        static std::string get_layers_info() {
            std::string info = "  - ";
            info += std::to_string(kLayerIndex);
            info += " - ";
            info += get_name();
            return info;
        }
        // Read network parameters
        bool ReadParameters(std::istream& stream) {
            for (std::size_t i = 0; i < kHalfDimensions; ++i)
                biases_[i] = read_little_endian<BiasType>(stream);
            for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
                weights_[i] = read_little_endian<WeightType>(stream);
            return !stream.fail();
        }
        // write parameters
        bool WriteParameters(std::ostream& stream) const {
            stream.write(reinterpret_cast<const char*>(biases_),
                kHalfDimensions * sizeof(BiasType));
            stream.write(reinterpret_cast<const char*>(weights_),
                kHalfDimensions * kInputDimensions * sizeof(WeightType));
            return !stream.fail();
        }
        // Proceed with the difference calculation if possible
        bool update_accumulator_if_possible(const Position& pos) const {
            const auto now = pos.state();
            if (now->accumulator.computed_accumulation)
                return true;
            const auto prev = now->previous;
            if (prev && prev->accumulator.computed_accumulation) {
                update_accumulator(pos);
                return true;
            }
-            return false;
+            for (const auto index : active_indices[perspective]) {
-        }
+              const IndexType offset = kHalfDimensions * index;
        // Convert input features
        void Transform(const Position& pos, OutputType* output) const {
            if (!update_accumulator_if_possible(pos))
              refresh_accumulator(pos);
            const auto& accumulation = pos.state()->accumulator.accumulation;
 #if defined(USE_AVX2)
            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
            constexpr int kControl = 0b11011000;
            const __m256i kZero = _mm256_setzero_si256();
 #elif defined(USE_SSE2)
            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
 #ifdef USE_SSE41
            const __m128i kZero = _mm_setzero_si128();
 #else
            const __m128i k0x80s = _mm_set1_epi8(-128);
 #endif
 #elif defined(USE_MMX)
            constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
            const __m64 k0x80s = _mm_set1_pi8(-128);
 #elif defined(USE_NEON)
            constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
            const int8x8_t kZero = {0};
 #endif
            const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
            for (IndexType p = 0; p < 2; ++p) {
                const IndexType offset = kHalfDimensions * p;
 #if defined(USE_AVX2)
                auto out = reinterpret_cast<__m256i*>(&output[offset]);
                for (IndexType j = 0; j < kNumChunks; ++j) {
                    __m256i sum0 = _mm256_load_si256(
                        &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
                    __m256i sum1 = _mm256_load_si256(
                      &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
                        sum0 = _mm256_add_epi16(sum0, reinterpret_cast<const __m256i*>(
                            accumulation[perspectives[p]][i])[j * 2 + 0]);
                        sum1 = _mm256_add_epi16(sum1, reinterpret_cast<const __m256i*>(
                            accumulation[perspectives[p]][i])[j * 2 + 1]);
                    }
                    _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
                        _mm256_packs_epi16(sum0, sum1), kZero), kControl));
                }
 #elif defined(USE_SSE2)
                auto out = reinterpret_cast<__m128i*>(&output[offset]);
                for (IndexType j = 0; j < kNumChunks; ++j) {
                    __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
                        accumulation[perspectives[p]][0])[j * 2 + 0]);
                    __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
                        accumulation[perspectives[p]][0])[j * 2 + 1]);
                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
                        sum0 = _mm_add_epi16(sum0, reinterpret_cast<const __m128i*>(
                            accumulation[perspectives[p]][i])[j * 2 + 0]);
                        sum1 = _mm_add_epi16(sum1, reinterpret_cast<const __m128i*>(
                            accumulation[perspectives[p]][i])[j * 2 + 1]);
                    }
                    const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
                    _mm_store_si128(&out[j],
 #ifdef USE_SSE41
                        _mm_max_epi8(packedbytes, kZero)
 #else
                        _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
 #endif
                    );
                }
 #elif defined(USE_MMX)
                auto out = reinterpret_cast<__m64*>(&output[offset]);
                for (IndexType j = 0; j < kNumChunks; ++j) {
                    __m64 sum0 = *(&reinterpret_cast<const __m64*>(
                        accumulation[perspectives[p]][0])[j * 2 + 0]);
                    __m64 sum1 = *(&reinterpret_cast<const __m64*>(
                        accumulation[perspectives[p]][0])[j * 2 + 1]);
                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
                        sum0 = _mm_add_pi16(sum0, reinterpret_cast<const __m64*>(
                            accumulation[perspectives[p]][i])[j * 2 + 0]);
                        sum1 = _mm_add_pi16(sum1, reinterpret_cast<const __m64*>(
                            accumulation[perspectives[p]][i])[j * 2 + 1]);
                    }
                    const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
                    out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
                }
 #elif defined(USE_NEON)
                const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
                for (IndexType j = 0; j < kNumChunks; ++j) {
                    int16x8_t sum = reinterpret_cast<const int16x8_t*>(
                        accumulation[perspectives[p]][0])[j];
                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
                        sum = vaddq_s16(sum, reinterpret_cast<const int16x8_t*>(
                            accumulation[perspectives[p]][i])[j]);
                    }
                    out[j] = vmax_s8(vqmovn_s16(sum), kZero);
                }
 #else
                for (IndexType j = 0; j < kHalfDimensions; ++j) {
                    BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
                    for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
                        sum += accumulation[static_cast<int>(perspectives[p])][i][j];
                    }
                    output[offset + j] = static_cast<OutputType>(
                        std::max<int>(0, std::min<int>(127, sum)));
                }
 #endif
              for (IndexType j = 0; j < kHalfDimensions; ++j)
                accumulator.accumulation[perspective][i][j] += weights_[offset + j];
            }
 #if defined(USE_MMX)
            _mm_empty();
 #endif
          }
        }
    private:
        // Calculate cumulative value without using difference calculation
        void refresh_accumulator(const Position& pos) const {
            auto& accumulator = pos.state()->accumulator;
            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                Features::IndexList active_indices[2];
                RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
                                                   active_indices);
                for (Color perspective : { WHITE, BLACK }) {
 #ifdef TILING
                    for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
                        auto accTile = reinterpret_cast<vec_t*>(
                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
                        vec_t acc[kNumRegs];
                        if (i == 0) {
                            auto biasesTile = reinterpret_cast<const vec_t*>(
                                &biases_[j * kTileHeight]);
                            for (unsigned k = 0; k < kNumRegs; ++k)
                                acc[k] = biasesTile[k];
                        } else {
                            for (unsigned k = 0; k < kNumRegs; ++k)
                                acc[k] = vec_zero;
                        }
                        for (const auto index : active_indices[perspective]) {
                            const IndexType offset = kHalfDimensions * index + j * kTileHeight;
                            auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
                            for (unsigned k = 0; k < kNumRegs; ++k)
                                acc[k] = vec_add_16(acc[k], column[k]);
                        }
                        for (unsigned k = 0; k < kNumRegs; k++)
                            vec_store(&accTile[k], acc[k]);
                    }
 #else
                    if (i == 0) {
                        std::memcpy(accumulator.accumulation[perspective][i], biases_,
                                    kHalfDimensions * sizeof(BiasType));
                    } else {
                        std::memset(accumulator.accumulation[perspective][i], 0,
                                    kHalfDimensions * sizeof(BiasType));
                    }
                    for (const auto index : active_indices[perspective]) {
                        const IndexType offset = kHalfDimensions * index;
                        for (IndexType j = 0; j < kHalfDimensions; ++j)
                            accumulator.accumulation[perspective][i][j] += weights_[offset + j];
                    }
 #endif
                }
            }
 #if defined(USE_MMX)
-            _mm_empty();
+        _mm_empty();
 #endif
-            accumulator.computed_accumulation = true;
+        accumulator.computed_accumulation = true;
    }
    // Calculate cumulative value using difference calculation
    void update_accumulator(const Position& pos) const {
  #ifdef VECTOR
      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
      // is defined in the VECTOR code below, once in each branch
      vec_t acc[kNumRegs];
  #endif
    const auto& prev_accumulator = pos.state()->previous->accumulator;
    auto& accumulator = pos.state()->accumulator;
    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
      Features::IndexList removed_indices[2], added_indices[2];
      bool reset[2] = { false, false };
      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
                                          removed_indices, added_indices, reset);
 #ifdef VECTOR
      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
        for (Color perspective : { WHITE, BLACK }) {
          auto accTile = reinterpret_cast<vec_t*>(
              &accumulator.accumulation[perspective][i][j * kTileHeight]);
          if (reset[perspective]) {
            if (i == 0) {
              auto biasesTile = reinterpret_cast<const vec_t*>(
                  &biases_[j * kTileHeight]);
              for (IndexType k = 0; k < kNumRegs; ++k)
                acc[k] = biasesTile[k];
            } else {
              for (IndexType k = 0; k < kNumRegs; ++k)
                acc[k] = vec_zero;
            }
          } else {
            auto prevAccTile = reinterpret_cast<const vec_t*>(
                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
            for (IndexType k = 0; k < kNumRegs; ++k)
              acc[k] = vec_load(&prevAccTile[k]);
            // Difference calculation for the deactivated features
            for (const auto index : removed_indices[perspective]) {
              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
              for (IndexType k = 0; k < kNumRegs; ++k)
                acc[k] = vec_sub_16(acc[k], column[k]);
            }
          }
          { // Difference calculation for the activated features
            for (const auto index : added_indices[perspective]) {
              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
              for (IndexType k = 0; k < kNumRegs; ++k)
                acc[k] = vec_add_16(acc[k], column[k]);
            }
          }
          for (IndexType k = 0; k < kNumRegs; ++k)
            vec_store(&accTile[k], acc[k]);
        }
-
+      }
        // Calculate cumulative value using difference calculation
        void update_accumulator(const Position& pos) const {
            const auto& prev_accumulator = pos.state()->previous->accumulator;
            auto& accumulator = pos.state()->accumulator;
            for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
                Features::IndexList removed_indices[2], added_indices[2];
                bool reset[2] = { false, false };
                RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
                                                    removed_indices, added_indices, reset);
 #ifdef TILING
                for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
                    for (Color perspective : { WHITE, BLACK }) {
                        auto accTile = reinterpret_cast<vec_t*>(
                            &accumulator.accumulation[perspective][i][j * kTileHeight]);
                        vec_t acc[kNumRegs];
                        if (reset[perspective]) {
                            if (i == 0) {
                                auto biasesTile = reinterpret_cast<const vec_t*>(
                                    &biases_[j * kTileHeight]);
                                for (unsigned k = 0; k < kNumRegs; ++k)
                                    acc[k] = biasesTile[k];
                            } else {
                                for (unsigned k = 0; k < kNumRegs; ++k)
                                    acc[k] = vec_zero;
                            }
                        } else {
                            auto prevAccTile = reinterpret_cast<const vec_t*>(
                                &prev_accumulator.accumulation[perspective][i][j * kTileHeight]);
                            for (IndexType k = 0; k < kNumRegs; ++k)
                                acc[k] = vec_load(&prevAccTile[k]);
                            // Difference calculation for the deactivated features
                            for (const auto index : removed_indices[perspective]) {
                                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
                                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
                                for (IndexType k = 0; k < kNumRegs; ++k)
                                    acc[k] = vec_sub_16(acc[k], column[k]);
                            }
                        }
                        { // Difference calculation for the activated features
                          for (const auto index : added_indices[perspective]) {
                              const IndexType offset = kHalfDimensions * index + j * kTileHeight;
                              auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
                              for (IndexType k = 0; k < kNumRegs; ++k)
                                  acc[k] = vec_add_16(acc[k], column[k]);
                          }
                        }
                        for (IndexType k = 0; k < kNumRegs; ++k)
                          vec_store(&accTile[k], acc[k]);
                    }
                }
 #if defined(USE_MMX)
-                _mm_empty();
+      _mm_empty();
 #endif
 #else
-                for (Color perspective : { WHITE, BLACK }) {
+      for (Color perspective : { WHITE, BLACK }) {
-                    if (reset[perspective]) {
+        if (reset[perspective]) {
-                        if (i == 0) {
+          if (i == 0) {
-                            std::memcpy(accumulator.accumulation[perspective][i], biases_,
+            std::memcpy(accumulator.accumulation[perspective][i], biases_,
-                                        kHalfDimensions * sizeof(BiasType));
+                        kHalfDimensions * sizeof(BiasType));
-                        } else {
+          } else {
-                            std::memset(accumulator.accumulation[perspective][i], 0,
+            std::memset(accumulator.accumulation[perspective][i], 0,
-                                        kHalfDimensions * sizeof(BiasType));
+                        kHalfDimensions * sizeof(BiasType));
-                        }
+          }
-                    } else {
+        } else {
-                        std::memcpy(accumulator.accumulation[perspective][i],
+          std::memcpy(accumulator.accumulation[perspective][i],
-                                    prev_accumulator.accumulation[perspective][i],
+                      prev_accumulator.accumulation[perspective][i],
-                                    kHalfDimensions * sizeof(BiasType));
+                      kHalfDimensions * sizeof(BiasType));
-                        // Difference calculation for the deactivated features
+          // Difference calculation for the deactivated features
-                        for (const auto index : removed_indices[perspective]) {
+          for (const auto index : removed_indices[perspective]) {
-                            const IndexType offset = kHalfDimensions * index;
+            const IndexType offset = kHalfDimensions * index;
-                            for (IndexType j = 0; j < kHalfDimensions; ++j)
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
-                                accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
+              accumulator.accumulation[perspective][i][j] -= weights_[offset + j];
-                        }
+          }
                    }
                    { // Difference calculation for the activated features
                        for (const auto index : added_indices[perspective]) {
                          const IndexType offset = kHalfDimensions * index;
                          for (IndexType j = 0; j < kHalfDimensions; ++j)
                              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
                        }
                    }
                }
 #endif
            }
            accumulator.computed_accumulation = true;
        }
        { // Difference calculation for the activated features
          for (const auto index : added_indices[perspective]) {
            const IndexType offset = kHalfDimensions * index;
-        using BiasType = std::int16_t;
+            for (IndexType j = 0; j < kHalfDimensions; ++j)
-        using WeightType = std::int16_t;
+              accumulator.accumulation[perspective][i][j] += weights_[offset + j];
          }
        }
      }
 #endif
      }
      accumulator.computed_accumulation = true;
    }
-        // Make the learning class a friend
+    using BiasType = std::int16_t;
-        friend class Trainer<FeatureTransformer>;
+    using WeightType = std::int16_t;
-        alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
+    // Make the learning class a friend
-        alignas(kCacheLineSize)
+    friend class Trainer<FeatureTransformer>;
-            WeightType weights_[kHalfDimensions * kInputDimensions];
+
-    };
+    alignas(kCacheLineSize) BiasType biases_[kHalfDimensions];
    alignas(kCacheLineSize)
        WeightType weights_[kHalfDimensions * kInputDimensions];
  };
 }  // namespace Eval::NNUE
-#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
@@ -176,8 +176,8 @@ namespace {
            score -=  Doubled * doubled
                    + WeakLever * more_than_one(lever);
-        if (blocked && r > RANK_4)
+        if (blocked && r >= RANK_5)
-            score += BlockedPawn[r-4];
+            score += BlockedPawn[r - RANK_5];
    }
    return score;
@@ -59,7 +59,7 @@ namespace {
  // Razor and futility margins
  constexpr int RazorMargin = 510;
  // Returns a margin that scales linearly with the depth d. `improving` is a
  // bool, so subtracting it lowers the effective depth by one when it is true
  // and leaves it unchanged otherwise. The removed/added pair below differs
  // only in the per-depth multiplier.
  Value futility_margin(Depth d, bool improving) {
-    return Value(223 * (d - improving));
+    return Value(234 * (d - improving));
  }
  // Reductions lookup table, initialized at startup
@@ -67,7 +67,7 @@ namespace {
  // Computes a depth reduction from the Reductions lookup table.
  //   i  - when false, allows the extra +1 term below (the visible call site
  //        passes `improving` here)
  //   d  - index into Reductions (the visible call site passes the depth)
  //   mn - index into Reductions (the visible call site passes the move count)
  // The two table entries are multiplied, an offset is added and the sum is
  // divided by 1024 using integer division. The boolean term contributes one
  // more ply only when i is false and the raw product exceeds the threshold.
  Depth reduction(bool i, Depth d, int mn) {
    int r = Reductions[d] * Reductions[mn];
-    return (r + 509) / 1024 + (!i && r > 894);
+    return (r + 503) / 1024 + (!i && r > 915);
  }
  constexpr int futility_move_count(bool improving, Depth depth) {
@@ -188,7 +188,7 @@ namespace {
 // Fills Reductions[1 .. MAX_MOVES - 1]; index 0 is not written by this loop.
 // Each entry is the product, truncated to int, of two terms:
 //   - a constant plus 2 * ln(Threads.size()), so it depends on the thread count
 //   - ln(i + 0.25 * ln(i)), which depends only on the index i
 void Search::init() {
  for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
+      Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
 }
@@ -404,7 +404,7 @@ void Thread::search() {
              beta  = std::min(prev + delta, VALUE_INFINITE);
              // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
+              int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
              contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                      : -make_score(dct, dct / 2));
@@ -824,7 +824,7 @@ namespace {
        && (ss-1)->statScore < 22977
        &&  eval >= beta
        &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168
        && !excludedMove
        &&  pos.non_pawn_material(us)
        && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -832,7 +832,7 @@ namespace {
        assert(eval - beta >= 0);
        // Null move dynamic reduction based on depth and value
-        Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3);
        ss->currentMove = MOVE_NULL;
        ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -849,7 +849,7 @@ namespace {
            if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY)
                nullValue = beta;
-            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13))
+            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14))
                return nullValue;
            assert(!thisThread->nmpMinPly); // Recursive verification is not allowed
@@ -868,7 +868,7 @@ namespace {
        }
    }
-    probCutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 183 - 49 * improving;
    // Step 10. ProbCut (~10 Elo)
    // If we have a good enough capture and a reduced search returns a value
@@ -1036,7 +1036,7 @@ moves_loop: // When in check, search starts from here
              // Futility pruning: parent node (~5 Elo)
              if (   lmrDepth < 7
                  && !ss->inCheck
-                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
+                  && ss->staticEval + 266 + 170 * lmrDepth <= alpha
                  &&  (*contHist[0])[movedPiece][to_sq(move)]
                    + (*contHist[1])[movedPiece][to_sq(move)]
                    + (*contHist[3])[movedPiece][to_sq(move)]
@@ -1044,7 +1044,7 @@ moves_loop: // When in check, search starts from here
                  continue;
              // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                  continue;
          }
          else
@@ -1055,8 +1055,8 @@ moves_loop: // When in check, search starts from here
                  && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
                  continue;
-              // See based pruning
+              // SEE based pruning
-              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
+              if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo)
                  continue;
          }
      }
@@ -1150,12 +1150,12 @@ moves_loop: // When in check, search starts from here
              || moveCountPruning
              || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
              || cutNode
-              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024))
      {
          Depth r = reduction(improving, depth, moveCount);
          // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
              r--;
          // Increase reduction if other threads are searching this position
@@ -1208,10 +1208,10 @@ moves_loop: // When in check, search starts from here
                             - 5287;
              // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
+              if (ss->statScore >= -105 && (ss-1)->statScore < -103)
                  r--;
-              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
+              else if ((ss-1)->statScore >= -122 && ss->statScore < -129)
                  r++;
              // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
@@ -1225,7 +1225,7 @@ moves_loop: // When in check, search starts from here
              // Unless giving check, this capture is likely bad
              if (   !givesCheck
-                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
+                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha)
                  r++;
          }
@@ -1499,7 +1499,7 @@ moves_loop: // When in check, search starts from here
        if (PvNode && bestValue > alpha)
            alpha = bestValue;
-        futilityBase = bestValue + 145;
+        futilityBase = bestValue + 155;
    }
    const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -204,8 +204,8 @@ enum PieceType {
 // Piece codes. The enumerators after W_PAWN and B_PAWN take consecutive
 // values, so with the added lines every black piece equals its white
 // counterpart plus 8. PIECE_NB is fixed at 16 rather than derived from the
 // last enumerator.
 enum Piece {
  NO_PIECE,
-  W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
+  W_PAWN = PAWN,     W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
-  B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
+  B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
  PIECE_NB = 16
 };