Merge remote-tracking branch 'remotes/official/master' into merge

2026-05-20 14:27:45 +00:00 · 2020-11-28 06:19:16 +08:00
parent 92b14a5ba2 190dd26b9f
commit 0b2ae6cb64
16 changed files with 1086 additions and 988 deletions
@@ -44,6 +44,7 @@ Daniel Dugovic (ddugovic)
 Dariusz Orzechowski (dorzechowski)
 David Zar
 Daylen Yang (daylen)
 Deshawn Mohan-Smith (GoldenRare)
 DiscanX
 Dominik Schlösser (domschl)
 double-beep
@@ -64,7 +65,6 @@ Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
 Gian-Carlo Pascutto (gcp)
 Deshawn Mohan-Smith (GoldenRare)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
 Gregor Cramer
@@ -112,6 +112,7 @@ Mark Tenzer (31m059)
 marotear
 Matthew Lai (matthewlai)
 Matthew Sullivan (Matt14916)
 Maxim Molchanov (Maxim)
 Michael An (man)
 Michael Byrne (MichaelB7)
 Michael Chaly (Vizvezdenec)
@@ -41,7 +41,7 @@ BINDIR = $(PREFIX)/bin
 ### Built-in benchmark for pgo-builds
 PGO_TRAINING_DATA_FILE = pgo_training_data.bin
 PGOBENCH = ./$(EXE) bench
-PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 output_file_name $(PGO_TRAINING_DATA_FILE)
+PGOGENSFEN = ./$(EXE) gensfen depth 3 loop 1000 sfen_format bin output_file_name $(PGO_TRAINING_DATA_FILE)
 ### Source and object files
 SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp \
@@ -84,11 +84,11 @@ using namespace Trace;
 namespace {
  // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1 =  Value(1400);
+  constexpr Value LazyThreshold1 =  Value(1565);
-  constexpr Value LazyThreshold2 =  Value(1300);
+  constexpr Value LazyThreshold2 =  Value(1102);
-  constexpr Value SpaceThreshold = Value(12222);
+  constexpr Value SpaceThreshold = Value(11551);
-  constexpr Value NNUEThreshold1 =   Value(550);
+  constexpr Value NNUEThreshold1 =   Value(682);
-  constexpr Value NNUEThreshold2 =   Value(150);
+  constexpr Value NNUEThreshold2 =   Value(176);
  // KingAttackWeights[PieceType] contains king attack weights by piece type
  constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -930,7 +930,7 @@ Value Eval::evaluate(const Position& pos) {
  {
      // Scale and shift NNUE for compatibility with search and classical evaluation
      auto  adjusted_NNUE = [&](){
-         int mat = pos.non_pawn_material() + PieceValue[MG][PAWN] * pos.count<PAWN>();
+         int mat = pos.non_pawn_material() + PawnValueMg * pos.count<PAWN>();
         return NNUE::evaluate(pos) * (720 + mat / 32) / 1024 + Tempo;
      };
@@ -940,13 +940,15 @@ Value Eval::evaluate(const Position& pos) {
      bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
      bool  classical = largePsq || (psq > PawnValueMg / 4 && !(pos.this_thread()->nodes & 0xB));
-      v = classical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
+      bool strongClassical = pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2;
      v = classical || strongClassical ? Evaluation<NO_TRACE>(pos).value() : adjusted_NNUE();
      // If the classical eval is small and imbalance large, use NNUE nevertheless.
      // For the case of opposite colored bishops, switch to NNUE eval with
      // small probability if the classical eval is less than the threshold.
-      if (   largePsq
+      if (   largePsq && !strongClassical
-          && (abs(v) * 16 < NNUEThreshold2 * r50
+          && (   abs(v) * 16 < NNUEThreshold2 * r50
              || (   pos.opposite_bishops()
                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50
                  && !(pos.this_thread()->nodes & 0xB))))
@@ -585,11 +585,10 @@ namespace CommandLine {
 string argv0;            // path+name of the executable binary, as given by argv[0]
 string binaryDirectory;  // path of the executable directory
 string workingDirectory; // path of the working directory
 string pathSeparator;    // Separator for our current OS
 void init(int argc, char* argv[]) {
    (void)argc;
-    string separator;
+    string pathSeparator;
    // extract the path+name of the executable binary
    argv0 = argv[0];
@@ -102,7 +102,6 @@ namespace Eval::NNUE {
  void initialize(LargePagePtr<T>& pointer) {
    static_assert(alignof(T) <= 4096, "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
    pointer.reset(reinterpret_cast<T*>(aligned_large_pages_alloc(sizeof(T))));
    std::memset(pointer.get(), 0, sizeof(T));
  }
@@ -113,10 +112,7 @@ namespace Eval::NNUE {
    std::uint32_t header;
    header = read_little_endian<std::uint32_t>(stream);
-
+    if (!stream || header != T::GetHashValue()) return false;
            if (!stream || header != T::GetHashValue())
                return false;
    return reference.ReadParameters(stream);
  }
@@ -155,13 +151,9 @@ namespace Eval::NNUE {
    version     = read_little_endian<std::uint32_t>(stream);
    *hash_value = read_little_endian<std::uint32_t>(stream);
    size        = read_little_endian<std::uint32_t>(stream);
-
+    if (!stream || version != kVersion) return false;
        if (!stream || version != kVersion)
            return false;
    architecture->resize(size);
    stream.read(&(*architecture)[0], size);
    return !stream.fail();
  }
@@ -185,20 +177,13 @@ namespace Eval::NNUE {
    std::uint32_t hash_value;
    std::string architecture;
-        if (!read_header(stream, &hash_value, &architecture))
+    if (!read_header(stream, &hash_value, &architecture)) return false;
-            return false;
+    if (hash_value != kHashValue) return false;
-
+    if (!Detail::ReadParameters(stream, *feature_transformer)) return false;
-        if (hash_value != kHashValue)
+    if (!Detail::ReadParameters(stream, *network)) return false;
            return false;
        if (!Detail::ReadParameters(stream, *feature_transformer))
            return false;
        if (!Detail::ReadParameters(stream, *network))
            return false;
    return stream && stream.peek() == std::ios::traits_type::eof();
  }
  // write evaluation function parameters
  bool WriteParameters(std::ostream& stream) {
@@ -212,7 +197,8 @@ namespace Eval::NNUE {
        return false;
    return !stream.fail();
-    }
+}
  // Evaluation function. Perform differential calculation.
  Value evaluate(const Position& pos) {
@@ -238,8 +224,6 @@ namespace Eval::NNUE {
    ASSERT_ALIGNED(buffer, alignment);
    feature_transformer->Transform(pos, transformed_features);
    const auto output = network->Propagate(transformed_features, buffer);
    return static_cast<Value>(output[0] / FV_SCALE);
@@ -249,13 +233,12 @@ namespace Eval::NNUE {
  bool load_eval(std::string name, std::istream& stream) {
    initialize();
    fileName = name;
    return ReadParameters(stream);
-    }
+}
-    static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
+static UseNNUEMode nnue_mode_from_option(const UCI::Option& mode)
-    {
+{
  if (mode == "false")
    return UseNNUEMode::False;
  else if (mode == "true")
@@ -264,9 +247,9 @@ namespace Eval::NNUE {
    return UseNNUEMode::Pure;
  return UseNNUEMode::False;
-    }
+}
-    void init() {
+void init() {
  useNNUE = nnue_mode_from_option(Options["Use NNUE"]);
@@ -306,10 +289,10 @@ namespace Eval::NNUE {
 #undef stringify2
 #undef stringify
-    }
+}
-    /// NNUE::verify() verifies that the last net used was loaded successfully
+/// NNUE::verify() verifies that the last net used was loaded successfully
-    void verify_eval_file_loaded() {
+void verify_eval_file_loaded() {
  std::string eval_file = std::string(Options["EvalFile"]);
@@ -337,10 +320,10 @@ namespace Eval::NNUE {
    sync_cout << "info string NNUE evaluation using " << eval_file << " enabled" << sync_endl;
  else
    sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+}
-    /// In training we override eval file so this is useful.
+/// In training we override eval file so this is useful.
-    void verify_any_net_loaded() {
+void verify_any_net_loaded() {
  if (!Options["SkipLoadingEval"] && useNNUE != UseNNUEMode::False && eval_file_loaded.empty())
  {
@@ -364,6 +347,6 @@ namespace Eval::NNUE {
    sync_cout << "info string NNUE evaluation using " << eval_file_loaded << " enabled" << sync_endl;
  else
    sync_cout << "info string classical evaluation enabled" << sync_endl;
-    }
+}
 } // namespace Eval::NNUE
@@ -16,6 +16,8 @@
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // header used in NNUE evaluation function
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED
@@ -25,7 +27,6 @@
 #include <memory>
 // header used in NNUE evaluation function
 namespace Eval::NNUE {
  enum struct UseNNUEMode
@@ -22,7 +22,6 @@
 #define NNUE_FEATURE_SET_H_INCLUDED
 #include "features_common.h"
 #include <array>
 namespace Eval::NNUE::Features {
@@ -21,9 +21,8 @@
 #ifndef NNUE_FEATURES_COMMON_H_INCLUDED
 #define NNUE_FEATURES_COMMON_H_INCLUDED
-#include "evaluate.h"
+#include "../../evaluate.h"
-
+#include "../nnue_common.h"
 #include "nnue/nnue_common.h"
 namespace Eval::NNUE::Features {
@@ -21,9 +21,8 @@
 #ifndef NNUE_FEATURES_INDEX_LIST_H_INCLUDED
 #define NNUE_FEATURES_INDEX_LIST_H_INCLUDED
-#include "position.h"
+#include "../../position.h"
-
+#include "../nnue_architecture.h"
 #include "nnue/nnue_architecture.h"
 namespace Eval::NNUE::Features {
@@ -51,12 +50,13 @@ namespace Eval::NNUE::Features {
    }
   private:
-        T values_[MaxSize] = {};
+    T values_[MaxSize];
    std::size_t size_ = 0;
  };
  //Type of feature index list
-    class IndexList : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
+  class IndexList
      : public ValueList<IndexType, RawFeatures::kMaxActiveDimensions> {
  };
 }  // namespace Eval::NNUE::Features
@@ -223,13 +223,13 @@ namespace Eval::NNUE::Layers {
        return _mm512_add_epi32(_mm512_permutexvar_epi32(indices, x), bias);
      };
      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 #if defined (USE_VNNI)
      [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
        acc = _mm512_dpbusd_epi32(acc, a, b);
 #else
      [[maybe_unused]] auto m512_dpbusd_epi32 = [=](__m512i a, __m512i b) -> __m512i {
        __m512i product0 = _mm512_maddubs_epi16(a, b);
-        product0 = _mm512_madd_epi16(product0, kOnes512);
+        return _mm512_madd_epi16(product0, kOnes512);
        acc = _mm512_add_epi32(acc, product0);
 #endif
      };
@@ -256,14 +256,13 @@ namespace Eval::NNUE::Layers {
        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
      };
      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
 #if defined (USE_VNNI)
      [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
        acc = _mm256_dpbusd_epi32(acc, a, b);
 #else
      [[maybe_unused]] auto m256_dpbusd_epi32 = [=](__m256i a, __m256i b) -> __m256i {
        __m256i product0 = _mm256_maddubs_epi16(a, b);
-        product0 = _mm256_madd_epi16(product0, kOnes256);
+        return _mm256_madd_epi16(product0, kOnes256);
        acc = _mm256_add_epi32(acc, product0);
 #endif
      };
@@ -288,10 +287,9 @@ namespace Eval::NNUE::Layers {
        return _mm_add_epi32(sum0, bias);
      };
-      [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
+      [[maybe_unused]] auto m128_dpbusd_epi32 = [=](__m128i a, __m128i b) -> __m128i {
        __m128i product0 = _mm_maddubs_epi16(a, b);
-        product0 = _mm_madd_epi16(product0, kOnes128);
+        return _mm_madd_epi16(product0, kOnes128);
        acc = _mm_add_epi32(acc, product0);
      };
 #endif
@@ -335,15 +333,6 @@ namespace Eval::NNUE::Layers {
          const __m512i bias = *reinterpret_cast<const __m512i*>(&biases_[i]);
          __m512i* outptr = reinterpret_cast<__m512i*>(&output[i]);
          __m512i sum01a = _mm512_setzero_si512();
          __m512i sum23a = _mm512_setzero_si512();
          __m512i sum45a = _mm512_setzero_si512();
          __m512i sum67a = _mm512_setzero_si512();
          __m512i sum01b = _mm512_setzero_si512();
          __m512i sum23b = _mm512_setzero_si512();
          __m512i sum45b = _mm512_setzero_si512();
          __m512i sum67b = _mm512_setzero_si512();
          const auto row01a = *reinterpret_cast<const __m512i*>(&weights_[offset01a]);
          const auto row23a = *reinterpret_cast<const __m512i*>(&weights_[offset23a]);
          const auto row45a = *reinterpret_cast<const __m512i*>(&weights_[offset45a]);
@@ -356,6 +345,16 @@ namespace Eval::NNUE::Layers {
          const __m256i in256 = input_vector256[0];
          const __m512i in = _mm512_inserti64x4(_mm512_castsi256_si512(in256), in256, 1);
 #if defined (USE_VNNI)
          __m512i sum01a = _mm512_setzero_si512();
          __m512i sum23a = _mm512_setzero_si512();
          __m512i sum45a = _mm512_setzero_si512();
          __m512i sum67a = _mm512_setzero_si512();
          __m512i sum01b = _mm512_setzero_si512();
          __m512i sum23b = _mm512_setzero_si512();
          __m512i sum45b = _mm512_setzero_si512();
          __m512i sum67b = _mm512_setzero_si512();
          m512_add_dpbusd_epi32(sum01a, in, row01a);
          m512_add_dpbusd_epi32(sum23a, in, row23a);
          m512_add_dpbusd_epi32(sum45a, in, row45a);
@@ -364,6 +363,16 @@ namespace Eval::NNUE::Layers {
          m512_add_dpbusd_epi32(sum23b, in, row23b);
          m512_add_dpbusd_epi32(sum45b, in, row45b);
          m512_add_dpbusd_epi32(sum67b, in, row67b);
 #else
          __m512i sum01a = m512_dpbusd_epi32(in, row01a);
          __m512i sum23a = m512_dpbusd_epi32(in, row23a);
          __m512i sum45a = m512_dpbusd_epi32(in, row45a);
          __m512i sum67a = m512_dpbusd_epi32(in, row67a);
          __m512i sum01b = m512_dpbusd_epi32(in, row01b);
          __m512i sum23b = m512_dpbusd_epi32(in, row23b);
          __m512i sum45b = m512_dpbusd_epi32(in, row45b);
          __m512i sum67b = m512_dpbusd_epi32(in, row67b);
 #endif
          *outptr = m512_hadd256x16(
            sum01a, sum23a, sum45a, sum67a,
@@ -384,48 +393,80 @@ namespace Eval::NNUE::Layers {
          if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
          {
            __m512i sum0 = _mm512_setzero_si512();
            __m512i sum1 = _mm512_setzero_si512();
            __m512i sum2 = _mm512_setzero_si512();
            __m512i sum3 = _mm512_setzero_si512();
            const auto row0 = reinterpret_cast<const __m512i*>(&weights_[offset0]);
            const auto row1 = reinterpret_cast<const __m512i*>(&weights_[offset1]);
            const auto row2 = reinterpret_cast<const __m512i*>(&weights_[offset2]);
            const auto row3 = reinterpret_cast<const __m512i*>(&weights_[offset3]);
-            for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
            __m512i sum0 = _mm512_setzero_si512();
            __m512i sum1 = _mm512_setzero_si512();
            __m512i sum2 = _mm512_setzero_si512();
            __m512i sum3 = _mm512_setzero_si512();
            const IndexType kStart = 0;
 #else
            __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
            __m512i sum1 = m512_dpbusd_epi32(input_vector512[0], row1[0]);
            __m512i sum2 = m512_dpbusd_epi32(input_vector512[0], row2[0]);
            __m512i sum3 = m512_dpbusd_epi32(input_vector512[0], row3[0]);
            const IndexType kStart = 1;
 #endif
            for (IndexType j = kStart; j < kNumChunks512; ++j)
            {
              const __m512i in = input_vector512[j];
 #if defined (USE_VNNI)
              m512_add_dpbusd_epi32(sum0, in, row0[j]);
              m512_add_dpbusd_epi32(sum1, in, row1[j]);
              m512_add_dpbusd_epi32(sum2, in, row2[j]);
              m512_add_dpbusd_epi32(sum3, in, row3[j]);
 #else
              sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
              sum1 = _mm512_add_epi32(sum1, m512_dpbusd_epi32(in, row1[j]));
              sum2 = _mm512_add_epi32(sum2, m512_dpbusd_epi32(in, row2[j]));
              sum3 = _mm512_add_epi32(sum3, m512_dpbusd_epi32(in, row3[j]));
 #endif
            }
            *outptr = m512_haddx4(sum0, sum1, sum2, sum3, bias);
          }
          else
          {
            __m256i sum0 = _mm256_setzero_si256();
            __m256i sum1 = _mm256_setzero_si256();
            __m256i sum2 = _mm256_setzero_si256();
            __m256i sum3 = _mm256_setzero_si256();
            const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
            const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
            const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
            const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
-            for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
            __m256i sum0 = _mm256_setzero_si256();
            __m256i sum1 = _mm256_setzero_si256();
            __m256i sum2 = _mm256_setzero_si256();
            __m256i sum3 = _mm256_setzero_si256();
            const IndexType kStart = 0;
 #else
            __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
            __m256i sum1 = m256_dpbusd_epi32(input_vector256[0], row1[0]);
            __m256i sum2 = m256_dpbusd_epi32(input_vector256[0], row2[0]);
            __m256i sum3 = m256_dpbusd_epi32(input_vector256[0], row3[0]);
            const IndexType kStart = 1;
 #endif
            for (IndexType j = kStart; j < kNumChunks256; ++j)
            {
              const __m256i in = input_vector256[j];
 #if defined (USE_VNNI)
              m256_add_dpbusd_epi32(sum0, in, row0[j]);
              m256_add_dpbusd_epi32(sum1, in, row1[j]);
              m256_add_dpbusd_epi32(sum2, in, row2[j]);
              m256_add_dpbusd_epi32(sum3, in, row3[j]);
 #else
              sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
              sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
              sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
              sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
 #endif
            }
            *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -436,30 +477,50 @@ namespace Eval::NNUE::Layers {
      {
        if constexpr (kPaddedInputDimensions % (kSimdWidth * 2) == 0)
        {
          __m512i sum0 = _mm512_setzero_si512();
          const auto row0 = reinterpret_cast<const __m512i*>(&weights_[0]);
-          for (IndexType j = 0; j < kNumChunks512; ++j)
+#if defined (USE_VNNI)
          __m512i sum0 = _mm512_setzero_si512();
          const IndexType kStart = 0;
 #else
          __m512i sum0 = m512_dpbusd_epi32(input_vector512[0], row0[0]);
          const IndexType kStart = 1;
 #endif
          for (IndexType j = kStart; j < kNumChunks512; ++j)
          {
            const __m512i in = input_vector512[j];
 #if defined (USE_VNNI)
            m512_add_dpbusd_epi32(sum0, in, row0[j]);
 #else
            sum0 = _mm512_add_epi32(sum0, m512_dpbusd_epi32(in, row0[j]));
 #endif
          }
          output[0] = m512_hadd(sum0, biases_[0]);
        }
        else
        {
          __m256i sum0 = _mm256_setzero_si256();
          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
-          for (IndexType j = 0; j < kNumChunks256; ++j)
+#if defined (USE_VNNI)
          __m256i sum0 = _mm256_setzero_si256();
          const IndexType kStart = 0;
 #else
          __m256i sum0 = m256_dpbusd_epi32(input_vector256[0], row0[0]);
          const IndexType kStart = 1;
 #endif
          for (IndexType j = kStart; j < kNumChunks256; ++j)
          {
            const __m256i in = input_vector256[j];
 #if defined (USE_VNNI)
            m256_add_dpbusd_epi32(sum0, in, row0[j]);
 #else
            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
 #endif
          }
          output[0] = m256_hadd(sum0, biases_[0]);
@@ -493,24 +554,40 @@ namespace Eval::NNUE::Layers {
          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
          __m256i sum0 = _mm256_setzero_si256();
          __m256i sum1 = _mm256_setzero_si256();
          __m256i sum2 = _mm256_setzero_si256();
          __m256i sum3 = _mm256_setzero_si256();
          const auto row0 = reinterpret_cast<const __m256i*>(&weights_[offset0]);
          const auto row1 = reinterpret_cast<const __m256i*>(&weights_[offset1]);
          const auto row2 = reinterpret_cast<const __m256i*>(&weights_[offset2]);
          const auto row3 = reinterpret_cast<const __m256i*>(&weights_[offset3]);
-          for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
          __m256i sum0 = _mm256_setzero_si256();
          __m256i sum1 = _mm256_setzero_si256();
          __m256i sum2 = _mm256_setzero_si256();
          __m256i sum3 = _mm256_setzero_si256();
          const IndexType kStart = 0;
 #else
          __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
          __m256i sum1 = m256_dpbusd_epi32(input_vector[0], row1[0]);
          __m256i sum2 = m256_dpbusd_epi32(input_vector[0], row2[0]);
          __m256i sum3 = m256_dpbusd_epi32(input_vector[0], row3[0]);
          const IndexType kStart = 1;
 #endif
          for (IndexType j = kStart; j < kNumChunks; ++j)
          {
            const __m256i in = input_vector[j];
 #if defined (USE_VNNI)
            m256_add_dpbusd_epi32(sum0, in, row0[j]);
            m256_add_dpbusd_epi32(sum1, in, row1[j]);
            m256_add_dpbusd_epi32(sum2, in, row2[j]);
            m256_add_dpbusd_epi32(sum3, in, row3[j]);
 #else
            sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
            sum1 = _mm256_add_epi32(sum1, m256_dpbusd_epi32(in, row1[j]));
            sum2 = _mm256_add_epi32(sum2, m256_dpbusd_epi32(in, row2[j]));
            sum3 = _mm256_add_epi32(sum3, m256_dpbusd_epi32(in, row3[j]));
 #endif
          }
          *outptr = m256_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -518,15 +595,25 @@ namespace Eval::NNUE::Layers {
      }
      else if constexpr (kOutputDimensions == 1)
      {
        __m256i sum0 = _mm256_setzero_si256();
        const auto row0 = reinterpret_cast<const __m256i*>(&weights_[0]);
-        for (IndexType j = 0; j < kNumChunks; ++j)
+#if defined (USE_VNNI)
        __m256i sum0 = _mm256_setzero_si256();
        const IndexType kStart = 0;
 #else
        __m256i sum0 = m256_dpbusd_epi32(input_vector[0], row0[0]);
        const IndexType kStart = 1;
 #endif
        for (IndexType j = kStart; j < kNumChunks; ++j)
        {
          const __m256i in = input_vector[j];
 #if defined (USE_VNNI)
          m256_add_dpbusd_epi32(sum0, in, row0[j]);
 #else
          sum0 = _mm256_add_epi32(sum0, m256_dpbusd_epi32(in, row0[j]));
 #endif
        }
        output[0] = m256_hadd(sum0, biases_[0]);
@@ -559,24 +646,24 @@ namespace Eval::NNUE::Layers {
          const __m128i bias = *reinterpret_cast<const __m128i*>(&biases_[i]);
          __m128i* outptr = reinterpret_cast<__m128i*>(&output[i]);
          __m128i sum0 = _mm_setzero_si128();
          __m128i sum1 = _mm_setzero_si128();
          __m128i sum2 = _mm_setzero_si128();
          __m128i sum3 = _mm_setzero_si128();
          const auto row0 = reinterpret_cast<const __m128i*>(&weights_[offset0]);
          const auto row1 = reinterpret_cast<const __m128i*>(&weights_[offset1]);
          const auto row2 = reinterpret_cast<const __m128i*>(&weights_[offset2]);
          const auto row3 = reinterpret_cast<const __m128i*>(&weights_[offset3]);
-          for (int j = 0; j < (int)kNumChunks; j += 1)
+          __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
          __m128i sum1 = m128_dpbusd_epi32(input_vector[0], row1[0]);
          __m128i sum2 = m128_dpbusd_epi32(input_vector[0], row2[0]);
          __m128i sum3 = m128_dpbusd_epi32(input_vector[0], row3[0]);
          for (int j = 1; j < (int)kNumChunks; ++j)
          {
            const __m128i in = input_vector[j];
-            m128_add_dpbusd_epi32(sum0, in, row0[j]);
+            sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(in, row0[j]));
-            m128_add_dpbusd_epi32(sum1, in, row1[j]);
+            sum1 = _mm_add_epi32(sum1, m128_dpbusd_epi32(in, row1[j]));
-            m128_add_dpbusd_epi32(sum2, in, row2[j]);
+            sum2 = _mm_add_epi32(sum2, m128_dpbusd_epi32(in, row2[j]));
-            m128_add_dpbusd_epi32(sum3, in, row3[j]);
+            sum3 = _mm_add_epi32(sum3, m128_dpbusd_epi32(in, row3[j]));
          }
          *outptr = m128_haddx4(sum0, sum1, sum2, sum3, bias);
@@ -584,16 +671,12 @@ namespace Eval::NNUE::Layers {
      }
      else if constexpr (kOutputDimensions == 1)
      {
        __m128i sum0 = _mm_setzero_si128();
        const auto row0 = reinterpret_cast<const __m128i*>(&weights_[0]);
-        for (int j = 0; j < (int)kNumChunks; j += 1)
+        __m128i sum0 = m128_dpbusd_epi32(input_vector[0], row0[0]);
        {
          const __m128i in = input_vector[j];
-          m128_add_dpbusd_epi32(sum0, in, row0[j]);
+        for (int j = 1; j < (int)kNumChunks; ++j)
-        }
+          sum0 = _mm_add_epi32(sum0, m128_dpbusd_epi32(input_vector[j], row0[j]));
        output[0] = m128_hadd(sum0, biases_[0]);
      }
@@ -16,12 +16,13 @@
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Class for difference calculation of NNUE evaluation function
 #ifndef NNUE_ACCUMULATOR_H_INCLUDED
 #define NNUE_ACCUMULATOR_H_INCLUDED
 #include "nnue_architecture.h"
 // Class for difference calculation of NNUE evaluation function
 namespace Eval::NNUE {
  // Class that holds the result of affine transformation of input features
@@ -16,13 +16,14 @@
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Input features and network structure used in NNUE evaluation function
 #ifndef NNUE_ARCHITECTURE_H_INCLUDED
 #define NNUE_ARCHITECTURE_H_INCLUDED
 // Defines the network structure
 #include "architectures/halfkp_256x2-32-32.h"
 // Input features and network structure used in NNUE evaluation function
 namespace Eval::NNUE {
  static_assert(kTransformedFeatureDimensions % kMaxSimdWidth == 0, "");
@@ -23,7 +23,6 @@
 #include "nnue_common.h"
 #include "nnue_architecture.h"
 #include "features/index_list.h"
 #include <cstring>
@@ -34,57 +33,57 @@ namespace Eval::NNUE {
  // If vector instructions are enabled, we update and refresh the
  // accumulator tile by tile such that each tile fits in the CPU's
  // vector registers.
-#define TILING
+  #define VECTOR
-#ifdef USE_AVX512
+  #ifdef USE_AVX512
  typedef __m512i vec_t;
-#define vec_load(a) _mm512_load_si512(a)
+  #define vec_load(a) _mm512_load_si512(a)
-#define vec_store(a,b) _mm512_store_si512(a,b)
+  #define vec_store(a,b) _mm512_store_si512(a,b)
-#define vec_add_16(a,b) _mm512_add_epi16(a,b)
+  #define vec_add_16(a,b) _mm512_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
+  #define vec_sub_16(a,b) _mm512_sub_epi16(a,b)
-#define vec_zero _mm512_setzero_si512()
+  #define vec_zero _mm512_setzero_si512()
  static constexpr IndexType kNumRegs = 8; // only 8 are needed
-#elif USE_AVX2
+  #elif USE_AVX2
  typedef __m256i vec_t;
-#define vec_load(a) _mm256_load_si256(a)
+  #define vec_load(a) _mm256_load_si256(a)
-#define vec_store(a,b) _mm256_store_si256(a,b)
+  #define vec_store(a,b) _mm256_store_si256(a,b)
-#define vec_add_16(a,b) _mm256_add_epi16(a,b)
+  #define vec_add_16(a,b) _mm256_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
+  #define vec_sub_16(a,b) _mm256_sub_epi16(a,b)
-#define vec_zero _mm256_setzero_si256()
+  #define vec_zero _mm256_setzero_si256()
  static constexpr IndexType kNumRegs = 16;
-#elif USE_SSE2
+  #elif USE_SSE2
  typedef __m128i vec_t;
-#define vec_load(a) (*(a))
+  #define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
+  #define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) _mm_add_epi16(a,b)
+  #define vec_add_16(a,b) _mm_add_epi16(a,b)
-#define vec_sub_16(a,b) _mm_sub_epi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_epi16(a,b)
-#define vec_zero _mm_setzero_si128()
+  #define vec_zero _mm_setzero_si128()
  static constexpr IndexType kNumRegs = Is64Bit ? 16 : 8;
-#elif USE_MMX
+  #elif USE_MMX
  typedef __m64 vec_t;
-#define vec_load(a) (*(a))
+  #define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
+  #define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) _mm_add_pi16(a,b)
+  #define vec_add_16(a,b) _mm_add_pi16(a,b)
-#define vec_sub_16(a,b) _mm_sub_pi16(a,b)
+  #define vec_sub_16(a,b) _mm_sub_pi16(a,b)
-#define vec_zero _mm_setzero_si64()
+  #define vec_zero _mm_setzero_si64()
  static constexpr IndexType kNumRegs = 8;
-#elif USE_NEON
+  #elif USE_NEON
  typedef int16x8_t vec_t;
-#define vec_load(a) (*(a))
+  #define vec_load(a) (*(a))
-#define vec_store(a,b) *(a)=(b)
+  #define vec_store(a,b) *(a)=(b)
-#define vec_add_16(a,b) vaddq_s16(a,b)
+  #define vec_add_16(a,b) vaddq_s16(a,b)
-#define vec_sub_16(a,b) vsubq_s16(a,b)
+  #define vec_sub_16(a,b) vsubq_s16(a,b)
-#define vec_zero {0}
+  #define vec_zero {0}
  static constexpr IndexType kNumRegs = 16;
-#else
+  #else
-#undef TILING
+  #undef VECTOR
-#endif
+  #endif
  // Input feature converter
  class FeatureTransformer {
@@ -93,10 +92,10 @@ namespace Eval::NNUE {
    // Number of output dimensions for one side
    static constexpr IndexType kHalfDimensions = kTransformedFeatureDimensions;
-#ifdef TILING
+    #ifdef VECTOR
    static constexpr IndexType kTileHeight = kNumRegs * sizeof(vec_t) / 2;
    static_assert(kHalfDimensions % kTileHeight == 0, "kTileHeight must divide kHalfDimensions");
-#endif
+    #endif
   public:
    // Output type
@@ -142,10 +141,8 @@ namespace Eval::NNUE {
      for (std::size_t i = 0; i < kHalfDimensions; ++i)
        biases_[i] = read_little_endian<BiasType>(stream);
      for (std::size_t i = 0; i < kHalfDimensions * kInputDimensions; ++i)
        weights_[i] = read_little_endian<WeightType>(stream);
      return !stream.fail();
    }
@@ -184,34 +181,58 @@ namespace Eval::NNUE {
      const auto& accumulation = pos.state()->accumulator.accumulation;
-#if defined(USE_AVX2)
+  #if defined(USE_AVX512)
      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth * 2);
      static_assert(kHalfDimensions % (kSimdWidth * 2) == 0);
      const __m512i kControl = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
      const __m512i kZero = _mm512_setzero_si512();
  #elif defined(USE_AVX2)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
      constexpr int kControl = 0b11011000;
      const __m256i kZero = _mm256_setzero_si256();
-#elif defined(USE_SSE2)
+  #elif defined(USE_SSE2)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
-#ifdef USE_SSE41
+  #ifdef USE_SSE41
      const __m128i kZero = _mm_setzero_si128();
-#else
+  #else
      const __m128i k0x80s = _mm_set1_epi8(-128);
-#endif
+  #endif
-#elif defined(USE_MMX)
+  #elif defined(USE_MMX)
      constexpr IndexType kNumChunks = kHalfDimensions / kSimdWidth;
      const __m64 k0x80s = _mm_set1_pi8(-128);
-#elif defined(USE_NEON)
+  #elif defined(USE_NEON)
      constexpr IndexType kNumChunks = kHalfDimensions / (kSimdWidth / 2);
      const int8x8_t kZero = {0};
-#endif
+  #endif
      const Color perspectives[2] = {pos.side_to_move(), ~pos.side_to_move()};
      for (IndexType p = 0; p < 2; ++p) {
        const IndexType offset = kHalfDimensions * p;
-#if defined(USE_AVX2)
+  #if defined(USE_AVX512)
        auto out = reinterpret_cast<__m512i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m512i sum0 = _mm512_load_si512(
              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 0]);
          __m512i sum1 = _mm512_load_si512(
              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]][0])[j * 2 + 1]);
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
              sum0 = _mm512_add_epi16(sum0, reinterpret_cast<const __m512i*>(
                  accumulation[perspectives[p]][i])[j * 2 + 0]);
              sum1 = _mm512_add_epi16(sum1, reinterpret_cast<const __m512i*>(
                  accumulation[perspectives[p]][i])[j * 2 + 1]);
          }
          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(kControl,
              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), kZero)));
        }
  #elif defined(USE_AVX2)
        auto out = reinterpret_cast<__m256i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m256i sum0 = _mm256_load_si256(
@@ -229,7 +250,7 @@ namespace Eval::NNUE {
              _mm256_packs_epi16(sum0, sum1), kZero), kControl));
        }
-#elif defined(USE_SSE2)
+  #elif defined(USE_SSE2)
        auto out = reinterpret_cast<__m128i*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
@@ -247,16 +268,16 @@ namespace Eval::NNUE {
          _mm_store_si128(&out[j],
-#ifdef USE_SSE41
+  #ifdef USE_SSE41
              _mm_max_epi8(packedbytes, kZero)
-#else
+  #else
              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-#endif
+  #endif
          );
        }
-#elif defined(USE_MMX)
+  #elif defined(USE_MMX)
        auto out = reinterpret_cast<__m64*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
@@ -274,7 +295,7 @@ namespace Eval::NNUE {
          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
        }
-#elif defined(USE_NEON)
+  #elif defined(USE_NEON)
        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
        for (IndexType j = 0; j < kNumChunks; ++j) {
          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
@@ -288,7 +309,7 @@ namespace Eval::NNUE {
          out[j] = vmax_s8(vqmovn_s16(sum), kZero);
        }
-#else
+  #else
        for (IndexType j = 0; j < kHalfDimensions; ++j) {
          BiasType sum = accumulation[static_cast<int>(perspectives[p])][0][j];
          for (IndexType i = 1; i < kRefreshTriggers.size(); ++i) {
@@ -298,37 +319,41 @@ namespace Eval::NNUE {
          output[offset + j] = static_cast<OutputType>(
              std::max<int>(0, std::min<int>(127, sum)));
        }
-#endif
+  #endif
      }
-#if defined(USE_MMX)
+  #if defined(USE_MMX)
      _mm_empty();
-#endif
+  #endif
    }
   private:
    // Calculate cumulative value without using difference calculation
    void refresh_accumulator(const Position& pos) const {
  #ifdef VECTOR
      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
      // is defined in the VECTOR code below, once in each branch
      vec_t acc[kNumRegs];
  #endif
      auto& accumulator = pos.state()->accumulator;
      for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
        Features::IndexList active_indices[2];
        RawFeatures::append_active_indices(pos, kRefreshTriggers[i],
                                           active_indices);
          for (Color perspective : { WHITE, BLACK }) {
-#ifdef TILING
+#ifdef VECTOR
-                    for (unsigned j = 0; j < kHalfDimensions / kTileHeight; ++j) {
+            for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
              auto accTile = reinterpret_cast<vec_t*>(
                  &accumulator.accumulation[perspective][i][j * kTileHeight]);
                        vec_t acc[kNumRegs];
              if (i == 0) {
                auto biasesTile = reinterpret_cast<const vec_t*>(
                    &biases_[j * kTileHeight]);
-                            for (unsigned k = 0; k < kNumRegs; ++k)
+                for (IndexType k = 0; k < kNumRegs; ++k)
                  acc[k] = biasesTile[k];
              } else {
-                            for (unsigned k = 0; k < kNumRegs; ++k)
+                for (IndexType k = 0; k < kNumRegs; ++k)
                  acc[k] = vec_zero;
              }
@@ -336,11 +361,11 @@ namespace Eval::NNUE {
                const IndexType offset = kHalfDimensions * index + j * kTileHeight;
                auto column = reinterpret_cast<const vec_t*>(&weights_[offset]);
-                            for (unsigned k = 0; k < kNumRegs; ++k)
+                for (IndexType k = 0; k < kNumRegs; ++k)
                  acc[k] = vec_add_16(acc[k], column[k]);
              }
-                        for (unsigned k = 0; k < kNumRegs; k++)
+              for (IndexType k = 0; k < kNumRegs; k++)
                vec_store(&accTile[k], acc[k]);
            }
 #else
@@ -373,6 +398,11 @@ namespace Eval::NNUE {
    // Calculate cumulative value using difference calculation
    void update_accumulator(const Position& pos) const {
  #ifdef VECTOR
      // Gcc-10.2 unnecessarily spills AVX2 registers if this array
      // is defined in the VECTOR code below, once in each branch
      vec_t acc[kNumRegs];
  #endif
    const auto& prev_accumulator = pos.state()->previous->accumulator;
    auto& accumulator = pos.state()->accumulator;
    for (IndexType i = 0; i < kRefreshTriggers.size(); ++i) {
@@ -381,21 +411,20 @@ namespace Eval::NNUE {
      RawFeatures::append_changed_indices(pos, kRefreshTriggers[i],
                                          removed_indices, added_indices, reset);
-#ifdef TILING
+#ifdef VECTOR
      for (IndexType j = 0; j < kHalfDimensions / kTileHeight; ++j) {
        for (Color perspective : { WHITE, BLACK }) {
          auto accTile = reinterpret_cast<vec_t*>(
              &accumulator.accumulation[perspective][i][j * kTileHeight]);
                        vec_t acc[kNumRegs];
          if (reset[perspective]) {
            if (i == 0) {
              auto biasesTile = reinterpret_cast<const vec_t*>(
                  &biases_[j * kTileHeight]);
-                                for (unsigned k = 0; k < kNumRegs; ++k)
+              for (IndexType k = 0; k < kNumRegs; ++k)
                acc[k] = biasesTile[k];
            } else {
-                                for (unsigned k = 0; k < kNumRegs; ++k)
+              for (IndexType k = 0; k < kNumRegs; ++k)
                acc[k] = vec_zero;
            }
          } else {
@@ -483,4 +512,4 @@ namespace Eval::NNUE {
 }  // namespace Eval::NNUE
-#endif //#ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
+#endif // #ifndef NNUE_FEATURE_TRANSFORMER_H_INCLUDED
@@ -176,8 +176,8 @@ namespace {
            score -=  Doubled * doubled
                    + WeakLever * more_than_one(lever);
-        if (blocked && r > RANK_4)
+        if (blocked && r >= RANK_5)
-            score += BlockedPawn[r-4];
+            score += BlockedPawn[r - RANK_5];
    }
    return score;
@@ -59,7 +59,7 @@ namespace {
  // Razor and futility margins
  constexpr int RazorMargin = 510;
  // Futility margin, linear in the depth: a fixed multiplier times
  // (d - improving). `improving` is a bool, so when it is true the value
  // being scaled is one lower (assuming Depth is an integer type).
  Value futility_margin(Depth d, bool improving) {
-    return Value(223 * (d - improving));
+    return Value(234 * (d - improving));
  }
  // Reductions lookup table, initialized at startup
@@ -67,7 +67,7 @@ namespace {
  // Depth reduction for a move. Multiplies the Reductions[] entries for
  // depth `d` and move number `mn`, adds an offset and divides by 1024,
  // then adds one more when `i` is false and the raw product exceeds a
  // threshold. The caller in this file passes (improving, depth, moveCount).
  Depth reduction(bool i, Depth d, int mn) {
    // Raw product of the two table entries; reused for the threshold test.
    int r = Reductions[d] * Reductions[mn];
-    return (r + 509) / 1024 + (!i && r > 894);
+    return (r + 503) / 1024 + (!i && r > 915);
  }
  constexpr int futility_move_count(bool improving, Depth depth) {
@@ -188,7 +188,7 @@ namespace {
 // Fills Reductions[1 .. MAX_MOVES - 1]; index 0 is not written here.
 // Each entry is (base + 2 * log(Threads.size())) * log(i + 0.25 * log(i)),
 // truncated to int, so the values depend on Threads.size() at call time.
 void Search::init() {
  for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((22.0 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
+      Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
 }
@@ -404,7 +404,7 @@ void Thread::search() {
              beta  = std::min(prev + delta, VALUE_INFINITE);
              // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (105 - ct / 2) * prev / (abs(prev) + 149);
+              int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
              contempt = (us == WHITE ?  make_score(dct, dct / 2)
                                      : -make_score(dct, dct / 2));
@@ -824,7 +824,7 @@ namespace {
        && (ss-1)->statScore < 22977
        &&  eval >= beta
        &&  eval >= ss->staticEval
-        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 182
+        &&  ss->staticEval >= beta - 30 * depth - 28 * improving + 84 * ss->ttPv + 168
        && !excludedMove
        &&  pos.non_pawn_material(us)
        && (ss->ply >= thisThread->nmpMinPly || us != thisThread->nmpColor))
@@ -832,7 +832,7 @@ namespace {
        assert(eval - beta >= 0);
        // Null move dynamic reduction based on depth and value
-        Depth R = (982 + 85 * depth) / 256 + std::min(int(eval - beta) / 192, 3);
+        Depth R = (1015 + 85 * depth) / 256 + std::min(int(eval - beta) / 191, 3);
        ss->currentMove = MOVE_NULL;
        ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -849,7 +849,7 @@ namespace {
            if (nullValue >= VALUE_TB_WIN_IN_MAX_PLY)
                nullValue = beta;
-            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 13))
+            if (thisThread->nmpMinPly || (abs(beta) < VALUE_KNOWN_WIN && depth < 14))
                return nullValue;
            assert(!thisThread->nmpMinPly); // Recursive verification is not allowed
@@ -868,7 +868,7 @@ namespace {
        }
    }
-    probCutBeta = beta + 176 - 49 * improving;
+    probCutBeta = beta + 183 - 49 * improving;
    // Step 10. ProbCut (~10 Elo)
    // If we have a good enough capture and a reduced search returns a value
@@ -1036,7 +1036,7 @@ moves_loop: // When in check, search starts from here
              // Futility pruning: parent node (~5 Elo)
              if (   lmrDepth < 7
                  && !ss->inCheck
-                  && ss->staticEval + 283 + 170 * lmrDepth <= alpha
+                  && ss->staticEval + 266 + 170 * lmrDepth <= alpha
                  &&  (*contHist[0])[movedPiece][to_sq(move)]
                    + (*contHist[1])[movedPiece][to_sq(move)]
                    + (*contHist[3])[movedPiece][to_sq(move)]
@@ -1044,7 +1044,7 @@ moves_loop: // When in check, search starts from here
                  continue;
              // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(29 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
                  continue;
          }
          else
@@ -1055,8 +1055,8 @@ moves_loop: // When in check, search starts from here
                  && captureHistory[movedPiece][to_sq(move)][type_of(pos.piece_on(to_sq(move)))] < 0)
                  continue;
-              // See based pruning
+              // SEE based pruning
-              if (!pos.see_ge(move, Value(-221) * depth)) // (~25 Elo)
+              if (!pos.see_ge(move, Value(-213) * depth)) // (~25 Elo)
                  continue;
          }
      }
@@ -1150,12 +1150,12 @@ moves_loop: // When in check, search starts from here
              || moveCountPruning
              || ss->staticEval + PieceValue[EG][pos.captured_piece()] <= alpha
              || cutNode
-              || thisThread->ttHitAverage < 427 * TtHitAverageResolution * TtHitAverageWindow / 1024))
+              || thisThread->ttHitAverage < 432 * TtHitAverageResolution * TtHitAverageWindow / 1024))
      {
          Depth r = reduction(improving, depth, moveCount);
          // Decrease reduction if the ttHit running average is large
-          if (thisThread->ttHitAverage > 509 * TtHitAverageResolution * TtHitAverageWindow / 1024)
+          if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
              r--;
          // Increase reduction if other threads are searching this position
@@ -1208,10 +1208,10 @@ moves_loop: // When in check, search starts from here
                             - 5287;
              // Decrease/increase reduction by comparing opponent's stat score (~10 Elo)
-              if (ss->statScore >= -106 && (ss-1)->statScore < -104)
+              if (ss->statScore >= -105 && (ss-1)->statScore < -103)
                  r--;
-              else if ((ss-1)->statScore >= -119 && ss->statScore < -140)
+              else if ((ss-1)->statScore >= -122 && ss->statScore < -129)
                  r++;
              // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
@@ -1225,7 +1225,7 @@ moves_loop: // When in check, search starts from here
              // Unless giving check, this capture is likely bad
              if (   !givesCheck
-                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 213 * depth <= alpha)
+                  && ss->staticEval + PieceValue[EG][pos.captured_piece()] + 210 * depth <= alpha)
                  r++;
          }
@@ -1499,7 +1499,7 @@ moves_loop: // When in check, search starts from here
        if (PvNode && bestValue > alpha)
            alpha = bestValue;
-        futilityBase = bestValue + 145;
+        futilityBase = bestValue + 155;
    }
    const PieceToHistory* contHist[] = { (ss-1)->continuationHistory, (ss-2)->continuationHistory,
@@ -204,8 +204,8 @@ enum PieceType {
 // Piece codes. NO_PIECE is 0 (first enumerator, no initializer). The white
 // and black rows list the same six piece kinds in the same order, and the
 // black row starts 8 above the white row, so each black code equals its
 // white counterpart plus 8. PIECE_NB = 16 is one past the largest code.
 enum Piece {
  NO_PIECE,
-  W_PAWN = 1, W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
+  W_PAWN = PAWN,     W_KNIGHT, W_BISHOP, W_ROOK, W_QUEEN, W_KING,
-  B_PAWN = 9, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
+  B_PAWN = PAWN + 8, B_KNIGHT, B_BISHOP, B_ROOK, B_QUEEN, B_KING,
  PIECE_NB = 16
 };