Merge pull request #5161 from Disservin/cluster

Merge SF master in the cluster branch
Merge branch 'master' into cluster
2026-05-20 13:17:44 +00:00 · 2024-04-10 22:18:16 +02:00 · 2024-04-10 18:46:26 +02:00 · 2024-04-02 08:49:48 +02:00 · 2024-03-30 12:38:02 +01:00 · 2024-03-29 13:13:07 +01:00
40 changed files with 2389 additions and 1292 deletions
@@ -30,7 +30,7 @@ jobs:
      - name: Comment on PR
        if: steps.clang-format.outcome == 'failure'
-        uses: thollander/actions-comment-pull-request@1d3973dc4b8e1399c0620d3f2b1aa5e795465308 # @v2.4.3
+        uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # @v2.5.0
        with:
          message: |
            clang-format 17 needs to be run on this PR.
@@ -42,7 +42,7 @@ jobs:
      - name: Comment on PR
        if: steps.clang-format.outcome != 'failure'
-        uses: thollander/actions-comment-pull-request@1d3973dc4b8e1399c0620d3f2b1aa5e795465308 # @v2.4.3
+        uses: thollander/actions-comment-pull-request@fabd468d3a1a0b97feee5f6b9e499eab0dd903f6 # @v2.5.0
        with:
          message: |
            _(execution **${{ github.run_id }}** / attempt **${{ github.run_attempt }}**)_
@@ -16,6 +16,8 @@ jobs:
    if: github.repository == 'official-stockfish/Stockfish' && (github.ref == 'refs/heads/master' || (startsWith(github.ref_name, 'sf_') && github.ref_type == 'tag'))
    runs-on: ubuntu-latest
    steps:
      - uses: actions/checkout@v4
      # returns null if no pre-release exists
      - name: Get Commit SHA of Latest Pre-release
        run: |
@@ -23,14 +25,40 @@ jobs:
          sudo apt-get update
          sudo apt-get install -y curl jq
-          echo "COMMIT_SHA=$(jq -r 'map(select(.prerelease)) | first | .tag_name' <<< $(curl -s https://api.github.com/repos/${{ github.repository_owner }}/Stockfish/releases))" >> $GITHUB_ENV
+          echo "COMMIT_SHA_TAG=$(jq -r 'map(select(.prerelease)) | first | .tag_name' <<< $(curl -s https://api.github.com/repos/${{ github.repository_owner }}/Stockfish/releases))" >> $GITHUB_ENV
-        # delete old previous pre-release and tag
+      # delete old previous pre-release and tag
-      - uses: actions/checkout@v4
+      - run: gh release delete ${{ env.COMMIT_SHA_TAG }} --cleanup-tag
-      - run: gh release delete ${{ env.COMMIT_SHA }} --cleanup-tag
+        if: env.COMMIT_SHA_TAG != 'null'
        if: env.COMMIT_SHA != 'null'
        env:
          GITHUB_TOKEN: ${{ secrets.GITHUB_TOKEN }}
      # Make sure that an old ci that still runs on master doesn't recreate a prerelease
      - name: Check Pullable Commits
        id: check_commits
        run: |
          git fetch
          CHANGES=$(git rev-list HEAD..origin/master --count)
          echo "CHANGES=$CHANGES" >> $GITHUB_ENV
      - name: Get last commit SHA
        id: last_commit
        run: echo "COMMIT_SHA=$(git rev-parse HEAD | cut -c 1-8)" >> $GITHUB_ENV
      - name: Get commit date
        id: commit_date
        run: echo "COMMIT_DATE=$(git show -s --date=format:'%Y%m%d' --format=%cd HEAD)" >> $GITHUB_ENV
      # Create a new pre-release, the other upload_binaries.yml will upload the binaries
      # to this pre-release.
      - name: Create Prerelease
        if: github.ref_name == 'master' && env.CHANGES == '0'
        uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981
        with:
          name: Stockfish dev-${{ env.COMMIT_DATE }}-${{ env.COMMIT_SHA }}
          tag_name: stockfish-dev-${{ env.COMMIT_DATE }}-${{ env.COMMIT_SHA }}
          prerelease: true
  Matrix:
    runs-on: ubuntu-latest
    outputs:
@@ -65,6 +65,7 @@ jobs:
      - name: Create tar
        if: runner.os != 'Windows'
        run: |
          chmod +x ./stockfish/stockfish-$NAME-$BINARY$EXT
          tar -cvf stockfish-$NAME-$BINARY.tar stockfish
      - name: Create zip
@@ -97,7 +98,7 @@ jobs:
      - name: Prerelease
        if: github.ref_name == 'master' && env.CHANGES == '0'
        continue-on-error: true
-        uses: softprops/action-gh-release@de2c0eb89ae2a093876385947365aca7b0e5f844 # @v1
+        uses: softprops/action-gh-release@4634c16e79c963813287e889244c50009e7f0981
        with:
          name: Stockfish dev-${{ env.COMMIT_DATE }}-${{ env.COMMIT_SHA }}
          tag_name: stockfish-dev-${{ env.COMMIT_DATE }}-${{ env.COMMIT_SHA }}
@@ -204,6 +204,7 @@ sf-x
 Shahin M. Shahin (peregrine)
 Shane Booth (shane31)
 Shawn Varghese (xXH4CKST3RXx)
 Shawn Xu (xu-shawn)
 Siad Daboul (Topologist)
 Stefan Geschwentner (locutus2)
 Stefano Cardanobile (Stefano80)
@@ -59,6 +59,33 @@ This distribution of Stockfish consists of the following files:
  * a file with the .nnue extension, storing the neural network for the NNUE
    evaluation. Binary distributions will have this file embedded.
 ## Stockfish on distributed memory systems
 The cluster branch allows for running Stockfish on a cluster of servers (nodes)
 that are connected with a high-speed and low-latency network, using the message
 passing interface (MPI). In this case, one MPI process should be run per node,
 and UCI options can be used to set the number of threads/hash per node as usual.
 Typically, the engine will be invoked as
 ```
 mpirun -np N /path/to/stockfish
 ```
 where ```N``` stands for the number of MPI processes used (alternatives to ```mpirun```
 include ```mpiexec``` and ```srun```). Use 1 MPI rank per node, and employ threading
 according to the cores per node. To build the cluster
 branch, it is sufficient to specify ```COMPCXX=mpicxx``` (or e.g. CC depending on the name
 of the compiler providing MPI support) on the make command line, and do a clean build:
 ```
 make -j ARCH=x86-64-modern clean build COMPCXX=mpicxx mpi=yes
 ```
 Make sure that the MPI installation is configured to support ```MPI_THREAD_MULTIPLE```;
 this might require adding system-specific compiler options to the Makefile. Stockfish employs
 non-blocking (asynchronous) communication, and benefits from an MPI
 implementation that efficiently supports this. Some MPI implementations might benefit
 from leaving 1 core/thread free for these asynchronous communications, and might require
 setting additional environment variables. ```mpirun``` should forward stdin/stdout
 to ```rank 0``` only (e.g. ```srun --input=0 --output=0```).
 Refer to your MPI documentation for more info.
 ## Contributing
 __See [Contributing Guide](CONTRIBUTING.md).__
@@ -53,17 +53,17 @@ PGOBENCH = $(WINE_PATH) ./$(EXE) bench
 ### Source and object files
 SRCS = benchmark.cpp bitboard.cpp evaluate.cpp main.cpp \
-	misc.cpp movegen.cpp movepick.cpp position.cpp \
+	misc.cpp movegen.cpp movepick.cpp position.cpp cluster.cpp \
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
-	nnue/evaluate_nnue.cpp nnue/features/half_ka_v2_hm.cpp
+	nnue/nnue_misc.cpp nnue/features/half_ka_v2_hm.cpp nnue/network.cpp
 HEADERS = benchmark.h bitboard.h evaluate.h misc.h movegen.h movepick.h \
-		nnue/evaluate_nnue.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \
+		nnue/nnue_misc.h nnue/features/half_ka_v2_hm.h nnue/layers/affine_transform.h \
 		nnue/layers/affine_transform_sparse_input.h nnue/layers/clipped_relu.h nnue/layers/simd.h \
 		nnue/layers/sqr_clipped_relu.h nnue/nnue_accumulator.h nnue/nnue_architecture.h \
 		nnue/nnue_common.h nnue/nnue_feature_transformer.h position.h \
 		search.h syzygy/tbprobe.h thread.h thread_win32_osx.h timeman.h \
-		tt.h tune.h types.h uci.h ucioption.h perft.h
+		tt.h tune.h types.h uci.h ucioption.h perft.h nnue/network.h cluster.h
 OBJS = $(notdir $(SRCS:.cpp=.o))
@@ -100,6 +100,7 @@ VPATH = syzygy:nnue:nnue/features
 # vnni512 = yes/no    --- -mavx512vnni       --- Use Intel Vector Neural Network Instructions 512
 # neon = yes/no       --- -DUSE_NEON         --- Use ARM SIMD architecture
 # dotprod = yes/no    --- -DUSE_NEON_DOTPROD --- Use ARM advanced SIMD Int8 dot product instructions
 # mpi = yes/no        --- -DUSE_MPI        --- Use Message Passing Interface
 #
 # Note that Makefile is space sensitive, so when adding new architectures
 # or modifying existing flags, you have to make sure there are no extra spaces
@@ -149,6 +150,7 @@ avx512 = no
 vnni256 = no
 vnni512 = no
 neon = no
 mpi = no
 dotprod = no
 arm_version = 0
 STRIP = strip
@@ -791,6 +793,15 @@ ifeq ($(OS), Android)
 	LDFLAGS += -fPIE -pie
 endif
 ### 3.10 MPI
 ifneq (,$(findstring mpi, $(CXX)))
 	mpi = yes
 endif
 ifeq ($(mpi),yes)
 	CXXFLAGS += -DUSE_MPI -Wno-cast-qual -fexceptions
        DEPENDFLAGS += -DUSE_MPI
 endif
 ### ==========================================================================
 ### Section 4. Public Targets
 ### ==========================================================================
@@ -1013,6 +1024,7 @@ config-sanity: net
 	@echo "vnni256: '$(vnni256)'"
 	@echo "vnni512: '$(vnni512)'"
 	@echo "neon: '$(neon)'"
 	@echo "mpi: '$(mpi)'"
 	@echo "dotprod: '$(dotprod)'"
 	@echo "arm_version: '$(arm_version)'"
 	@echo "target_windows: '$(target_windows)'"
@@ -0,0 +1,480 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifdef USE_MPI
    #include <algorithm>
    #include <array>
    #include <atomic>
    #include <cassert>
    #include <chrono>
    #include <cstddef>
    #include <cstdint>
    #include <cstdlib>
    #include <iostream>
    #include <istream>
    #include <map>
    #include <mpi.h>
    #include <mutex>
    #include <string>
    #include <thread>
    #include <vector>
    #include "cluster.h"
    #include "thread.h"
    #include "timeman.h"
    #include "tt.h"
    #include "search.h"
 namespace Stockfish {
 namespace Cluster {
 // Rank of this process and total number of ranks in MPI_COMM_WORLD.
 // Both are filled in by init(); until then they hold the sentinel values below.
 static int world_rank = MPI_PROC_NULL;
 static int world_size = 0;
 // The ranks exchange basic info (signals) using a dedicated communicator.
 // reqSignals is the request of the all-reduce posted by signals_send(), and
 // signalsCallCounter counts how many such all-reduces this rank has posted.
 static MPI_Comm    signalsComm        = MPI_COMM_NULL;
 static MPI_Request reqSignals         = MPI_REQUEST_NULL;
 static uint64_t    signalsCallCounter = 0;
 // Signals are the number of nodes searched, stop, table base hits, transposition table saves.
 // The enumerators index signalsSend/signalsRecv; SIG_NB is the number of signals.
 enum Signals : int {
    SIG_NODES = 0,
    SIG_STOP  = 1,
    SIG_TB    = 2,
    SIG_TTS   = 3,
    SIG_NB    = 4
 };
 // Local contribution (send) and cluster-wide sum (recv) of the signal all-reduce
 static uint64_t signalsSend[SIG_NB] = {};
 static uint64_t signalsRecv[SIG_NB] = {};
 // Contribution of all other ranks (received sum minus the local value that was sent),
 // refreshed by signals_process()
 static uint64_t nodesSearchedOthers = 0;
 static uint64_t tbHitsOthers        = 0;
 static uint64_t TTsavesOthers       = 0;
 // Sum of the stop signals received in the last completed signal round
 static uint64_t stopSignalsPosted   = 0;
 // The UCI threads of each rank exchange input using a dedicated communicator
 static MPI_Comm InputComm = MPI_COMM_NULL;
 // bestMove requires MoveInfo communicators and data types
 static MPI_Comm     MoveComm   = MPI_COMM_NULL;
 static MPI_Datatype MIDatatype = MPI_DATATYPE_NULL;
 // TT entries are communicated with a dedicated communicator.
 // The two buffers alternate between the send and the receive role, selected by the
 // parity of sendRecvPosted (the number of send/receive rounds posted so far).
 // The TTCacheCounter tracks the number of local elements that are ready to be sent.
 static MPI_Comm                                 TTComm = MPI_COMM_NULL;
 static std::array<std::vector<KeyedTTEntry>, 2> TTSendRecvBuffs;
 static std::array<MPI_Request, 2> reqsTTSendRecv = {MPI_REQUEST_NULL, MPI_REQUEST_NULL};
 static uint64_t                   sendRecvPosted = 0;
 static std::atomic<uint64_t>      TTCacheCounter = {};
 /// Initialize MPI and associated data types. Note that the MPI library must be configured
 /// to support MPI_THREAD_MULTIPLE, since multiple threads access MPI simultaneously.
 void init() {
    int thread_support;
    MPI_Init_thread(nullptr, nullptr, MPI_THREAD_MULTIPLE, &thread_support);
    if (thread_support < MPI_THREAD_MULTIPLE)
    {
        std::cerr << "Stockfish requires support for MPI_THREAD_MULTIPLE." << std::endl;
        std::exit(EXIT_FAILURE);
    }
    MPI_Comm_rank(MPI_COMM_WORLD, &world_rank);
    MPI_Comm_size(MPI_COMM_WORLD, &world_size);
    const std::array<MPI_Aint, 5> MIdisps = {offsetof(MoveInfo, move), offsetof(MoveInfo, ponder),
                                             offsetof(MoveInfo, depth), offsetof(MoveInfo, score),
                                             offsetof(MoveInfo, rank)};
    MPI_Type_create_hindexed_block(5, 1, MIdisps.data(), MPI_INT, &MIDatatype);
    MPI_Type_commit(&MIDatatype);
    MPI_Comm_dup(MPI_COMM_WORLD, &InputComm);
    MPI_Comm_dup(MPI_COMM_WORLD, &TTComm);
    MPI_Comm_dup(MPI_COMM_WORLD, &MoveComm);
    MPI_Comm_dup(MPI_COMM_WORLD, &signalsComm);
 }
 /// Finalize MPI and free the associated data types.
 void finalize() {
    MPI_Type_free(&MIDatatype);
    MPI_Comm_free(&InputComm);
    MPI_Comm_free(&TTComm);
    MPI_Comm_free(&MoveComm);
    MPI_Comm_free(&signalsComm);
    MPI_Finalize();
 }
 /// Number of ranks in the cluster, as determined by init()
 int size() {
    return world_size;
 }
 /// Index of this process among the ranks, as determined by init()
 int rank() {
    return world_rank;
 }
 /// Size both send/receive buffers for TTCacheSize entries per thread and per rank,
 /// and reset every slot to a default-constructed entry.
 void ttSendRecvBuff_resize(size_t nThreads) {
    const size_t entries = TTCacheSize * world_size * nThreads;
    for (auto& buffer : TTSendRecvBuffs)
        buffer.assign(entries, KeyedTTEntry());
 }
 /// Cluster-aware replacement for std::getline. Only the root (rank 0) reads from the
 /// input stream; the line length, the line itself and the stream state are then broadcast
 /// over InputComm, so that the UCI threads of all ranks see the same input and can set up
 /// the position, etc. Returns the stream state obtained by the root.
 bool getline(std::istream& input, std::string& str) {
    int               length;
    int               ok;
    std::vector<char> chars;
    if (is_root())
    {
        ok = static_cast<bool>(std::getline(input, str));
        chars.assign(str.begin(), str.end());
        length = chars.size();
    }
    // Some MPI implementations use busy-wait polling, while we need yielding as otherwise
    // the UCI thread on the non-root ranks would be consuming resources. Hence the length
    // is broadcast asynchronously and the non-root ranks sleep between completion tests.
    static MPI_Request reqInput = MPI_REQUEST_NULL;
    MPI_Ibcast(&length, 1, MPI_INT, 0, InputComm, &reqInput);
    if (is_root())
        MPI_Wait(&reqInput, MPI_STATUS_IGNORE);
    else
    {
        int done;
        MPI_Test(&reqInput, &done, MPI_STATUS_IGNORE);
        while (!done)
        {
            std::this_thread::sleep_for(std::chrono::milliseconds(10));
            MPI_Test(&reqInput, &done, MPI_STATUS_IGNORE);
        }
    }
    // With the length known everywhere, broadcast the characters of the line
    if (!is_root())
        chars.resize(length);
    MPI_Bcast(chars.data(), length, MPI_CHAR, 0, InputComm);
    if (!is_root())
        str.assign(chars.begin(), chars.end());
    // Finally share whether the read on the root succeeded
    MPI_Bcast(&ok, 1, MPI_INT, 0, InputComm);
    return ok;
 }
 /// Sending part of the signal communication loop
 namespace {
 void signals_send(const ThreadPool& threads) {
    // Snapshot the local counters and the stop flag into the send buffer
    signalsSend[SIG_NODES] = threads.nodes_searched();
    signalsSend[SIG_STOP]  = threads.stop;
    signalsSend[SIG_TB]    = threads.tb_hits();
    signalsSend[SIG_TTS]   = threads.TT_saves();
    // Post a non-blocking sum over all ranks; completion is tracked through reqSignals
    MPI_Iallreduce(signalsSend, signalsRecv, SIG_NB, MPI_UINT64_T, MPI_SUM, signalsComm,
                   &reqSignals);
    signalsCallCounter += 1;
 }
 /// Processing part of the signal communication loop.
 /// For the counters (nodes, TB hits, TT saves) only the sum over the other ranks is
 /// stored, obtained by subtracting what this rank sent from the received total. Local
 /// counters can then be added at any time, which gives finer grained progress during
 /// early iterations and node counts that exactly match the non-MPI code in the single
 /// rank case. This call also propagates the stop signal between ranks.
 void signals_process(ThreadPool& threads) {
    const auto fromOthers = [](Signals sig) { return signalsRecv[sig] - signalsSend[sig]; };
    nodesSearchedOthers   = fromOthers(SIG_NODES);
    tbHitsOthers          = fromOthers(SIG_TB);
    TTsavesOthers         = fromOthers(SIG_TTS);
    stopSignalsPosted     = signalsRecv[SIG_STOP];
    // Any rank reporting stop makes this rank stop as well
    if (stopSignalsPosted > 0)
        threads.stop = true;
 }
 /// Post the next round of the TT exchange: a non-blocking receive from the previous rank
 /// and a non-blocking send to the next rank (ranks wrap around). The parity of
 /// sendRecvPosted selects which of the two buffers receives and which one is sent.
 void sendrecv_post() {
    ++sendRecvPosted;
    auto&     recvBuf  = TTSendRecvBuffs[sendRecvPosted % 2];
    auto&     sendBuf  = TTSendRecvBuffs[(sendRecvPosted + 1) % 2];
    const int prevRank = (rank() + size() - 1) % size();
    const int nextRank = (rank() + 1) % size();
    MPI_Irecv(recvBuf.data(), recvBuf.size() * sizeof(KeyedTTEntry), MPI_BYTE, prevRank, 42,
              TTComm, &reqsTTSendRecv[0]);
    MPI_Isend(sendBuf.data(), sendBuf.size() * sizeof(KeyedTTEntry), MPI_BYTE, nextRank, 42,
              TTComm, &reqsTTSendRecv[1]);
 }
 }
 /// During search, most message passing is asynchronous, but at the end of
 /// search it makes sense to bring them to a common, finalized state.
 /// On return, every rank has posted the same number of signal rounds and of
 /// TT send/receive rounds, and none of these requests is outstanding anymore.
 void signals_sync(ThreadPool& threads) {
    // Keep the signal loop running until the summed stop signals reach the number of
    // ranks, i.e. until a completed round had the stop flag set on every rank.
    while (stopSignalsPosted < uint64_t(size()))
        signals_poll(threads);
    // Finalize outstanding messages of the signal loops.
    // We might have issued one call less than needed on some ranks.
    uint64_t globalCounter;
    // The largest number of signal rounds posted by any rank
    MPI_Allreduce(&signalsCallCounter, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
    if (signalsCallCounter < globalCounter)
    {
        // Complete the pending round, then post the missing one to catch up
        MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
        signals_send(threads);
    }
    assert(signalsCallCounter == globalCounter);
    // Complete the last round and take its results into account
    MPI_Wait(&reqSignals, MPI_STATUS_IGNORE);
    signals_process(threads);
    // Finalize outstanding messages in the sendRecv loop
    MPI_Allreduce(&sendRecvPosted, &globalCounter, 1, MPI_UINT64_T, MPI_MAX, MoveComm);
    // Post as many additional rounds as needed to match the rank that posted the most
    while (sendRecvPosted < globalCounter)
    {
        MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
        sendrecv_post();
    }
    assert(sendRecvPosted == globalCounter);
    // Wait for the final send/receive pair, leaving no request pending
    MPI_Waitall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), MPI_STATUSES_IGNORE);
 }
 /// Initialize signal counters to zero.
 void signals_init() {
    stopSignalsPosted = tbHitsOthers = TTsavesOthers = nodesSearchedOthers = 0;
    signalsSend[SIG_NODES] = signalsRecv[SIG_NODES] = 0;
    signalsSend[SIG_TB] = signalsRecv[SIG_TB] = 0;
    signalsSend[SIG_TTS] = signalsRecv[SIG_TTS] = 0;
    signalsSend[SIG_STOP] = signalsRecv[SIG_STOP] = 0;
 }
 /// Check whether the pending signal round has completed. If so, process its results
 /// and immediately post the next round; otherwise return without doing anything.
 void signals_poll(ThreadPool& threads) {
    int completed;
    MPI_Test(&reqSignals, &completed, MPI_STATUS_IGNORE);
    if (!completed)
        return;
    signals_process(threads);
    signals_send(threads);
 }
 /// Print basic info related to the cluster performance as an "info depth ... cluster" line:
 /// the number of signal rounds posted and signal rounds per second (sps), the number of
 /// send/receive rounds posted (sendRecvs) and the number of buffer entries exchanged per
 /// second (srpps), and the number of TT saves and TT saves per second (TTSavesps).
 /// If srpps approximately equals TTSavesps, the exchange loop has enough bandwidth.
 /// The rates are computed as count * 1000 / elapsed.
 void cluster_info(const ThreadPool& threads, Depth depth, TimePoint elapsed) {
    // All rates divide by the elapsed time. Clamp it to at least 1, so that a zero value
    // cannot cause a division by zero and a negative value cannot wrap around when it is
    // converted to unsigned.
    const uint64_t safeElapsed = elapsed > 0 ? uint64_t(elapsed) : 1;
    uint64_t       TTSaves     = TT_saves(threads);
    sync_cout << "info depth " << depth << " cluster "
              << " signals " << signalsCallCounter << " sps "
              << signalsCallCounter * 1000 / safeElapsed << " sendRecvs " << sendRecvPosted
              << " srpps " << TTSendRecvBuffs[0].size() * sendRecvPosted * 1000 / safeElapsed
              << " TTSaves " << TTSaves << " TTSavesps " << TTSaves * 1000 / safeElapsed
              << sync_endl;
 }
 /// When a TT entry is saved, additional steps are taken if the entry is of sufficient depth.
 /// If sufficient entries have been collected, a communication is initiated.
 /// If a communication has been completed, the received results are saved to the TT.
 ///
 /// TT          : table that receives the entries sent by the other ranks
 /// threads     : all search threads of this rank, their caches are packed for sending
 /// thread      : the worker performing this save
 /// tte         : the local entry to write
 /// k .. generation8 : forwarded unchanged to TTEntry::save()
 void save(TranspositionTable& TT,
          ThreadPool&         threads,
          Search::Worker*     thread,
          TTEntry*            tte,
          Key                 k,
          Value               v,
          bool                PvHit,
          Bound               b,
          Depth               d,
          Move                m,
          Value               ev,
          uint8_t             generation8) {
    // Standard save to the TT
    tte->save(k, v, PvHit, b, d, m, ev, generation8);
    // If the entry is of sufficient depth to be worth communicating, take action.
    if (d > 3)
    {
        // count the TTsaves to information: this should be relatively similar
        // to the number of entries we can send/recv.
        thread->TTsaves.fetch_add(1, std::memory_order_relaxed);
        // Add to thread's send buffer, the locking here avoids races when the master thread
        // prepares the send buffer.
        {
            std::lock_guard<std::mutex> lk(thread->ttCache.mutex);
            thread->ttCache.buffer.replace(KeyedTTEntry(k, *tte));
            ++TTCacheCounter;
        }
        // Number of buffer slots reserved for each rank: TTCacheSize entries per thread
        size_t recvBuffPerRankSize = threads.size() * TTCacheSize;
        // Communicate on main search thread, as soon the threads combined have collected
        // sufficient data to fill the send buffers.
        if (thread == threads.main_thread()->worker.get() && TTCacheCounter > recvBuffPerRankSize)
        {
            // Test communication status
            int flag;
            MPI_Testall(reqsTTSendRecv.size(), reqsTTSendRecv.data(), &flag, MPI_STATUSES_IGNORE);
            // Current communication is complete
            if (flag)
            {
                // Save all received entries to TT, and store our TTCaches, ready for the next round of communication
                // The buffer selected by the parity of sendRecvPosted is the one that was
                // just received into; it is sent on by the next call to sendrecv_post().
                for (size_t irank = 0; irank < size_t(size()); ++irank)
                {
                    if (irank
                        == size_t(
                          rank()))  // this is our part, fill the part of the buffer for sending
                    {
                        // Copy from the thread caches to the right spot in the buffer
                        size_t i = irank * recvBuffPerRankSize;
                        for (auto&& th : threads)
                        {
                            std::lock_guard<std::mutex> lk(th->worker->ttCache.mutex);
                            for (auto&& e : th->worker->ttCache.buffer)
                                TTSendRecvBuffs[sendRecvPosted % 2][i++] = e;
                            // Reset thread's send buffer
                            th->worker->ttCache.buffer = {};
                        }
                        TTCacheCounter = 0;
                    }
                    else  // process data received from the corresponding rank.
                        for (size_t i = irank * recvBuffPerRankSize;
                             i < (irank + 1) * recvBuffPerRankSize; ++i)
                        {
                            auto&&   e = TTSendRecvBuffs[sendRecvPosted % 2][i];
                            // 'found' is required by the probe interface, its value is not used here
                            bool     found;
                            TTEntry* replace_tte;
                            replace_tte = TT.probe(e.first, found);
                            // Store the received entry, stamped with the local TT generation
                            replace_tte->save(e.first, e.second.value(), e.second.is_pv(),
                                              e.second.bound(), e.second.depth(), e.second.move(),
                                              e.second.eval(), TT.generation());
                        }
                }
                // Start next communication
                sendrecv_post();
                // Force check of time on the next occasion, the above actions might have taken some time.
                thread->main_manager()->callsCnt = 0;
            }
        }
    }
 }
 /// Picks the bestMove across ranks, and sends the associated info and PV to the root of the cluster.
 /// Note that this bestMove and PV must be output by the root, to guarantee proper ordering of output.
 ///
 /// mi     : on entry the MoveInfo of this rank, on return the cluster-wide selected MoveInfo
 /// PVLine : on the root it is replaced by the PV line of the selected rank, if that rank is
 ///          not the root itself; it is left untouched on all other ranks
 ///
 /// Every rank votes for its move with a weight of (score - minScore + depth), where minScore
 /// is the lowest score over all ranks. The move with the most votes is selected; unless a
 /// move gets strictly more votes than the root's move, the root's own MoveInfo is kept.
 /// TODO update to the scheme in master.. can this use aggregation of votes?
 void pick_moves(MoveInfo& mi, std::string& PVLine) {
    // Only the root needs storage for the gathered data. The receive buffer argument of
    // MPI_Gather is significant at the root only, so the other ranks pass an empty vector.
    std::vector<MoveInfo> gathered(is_root() ? size() : 0);
    MPI_Gather(&mi, 1, MIDatatype, gathered.data(), 1, MIDatatype, 0, MoveComm);
    if (is_root())
    {
        std::map<int, int> votes;
        int                minScore = gathered[0].score;
        for (const MoveInfo& info : gathered)
        {
            minScore         = std::min(minScore, info.score);
            votes[info.move] = 0;
        }
        for (const MoveInfo& info : gathered)
            votes[info.move] += info.score - minScore + info.depth;
        // gathered[0] is the root's own entry, which serves as the baseline
        int bestVote = votes[gathered[0].move];
        for (const MoveInfo& info : gathered)
        {
            if (votes[info.move] > bestVote)
            {
                bestVote = votes[info.move];
                mi       = info;
            }
        }
    }
    // Send around the final result
    MPI_Bcast(&mi, 1, MIDatatype, 0, MoveComm);
    // Send PV line to root as needed: first its length, then its characters
    if (mi.rank != 0 && mi.rank == rank())
    {
        std::vector<char> chars(PVLine.begin(), PVLine.end());
        int               length = chars.size();
        MPI_Send(&length, 1, MPI_INT, 0, 42, MoveComm);
        MPI_Send(chars.data(), length, MPI_CHAR, 0, 42, MoveComm);
    }
    if (mi.rank != 0 && is_root())
    {
        int               length;
        std::vector<char> chars;
        MPI_Recv(&length, 1, MPI_INT, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
        chars.resize(length);
        MPI_Recv(chars.data(), length, MPI_CHAR, mi.rank, 42, MoveComm, MPI_STATUS_IGNORE);
        PVLine.assign(chars.begin(), chars.end());
    }
 }
 /// Cluster-wide number of nodes searched: the local count plus the contribution of the
 /// other ranks, the latter being lazily updated in the signal loop.
 uint64_t nodes_searched(const ThreadPool& threads) {
    const uint64_t localNodes = threads.nodes_searched();
    return nodesSearchedOthers + localNodes;
 }
 /// Cluster-wide number of table base hits: the local count plus the contribution of the
 /// other ranks, the latter being lazily updated in the signal loop.
 uint64_t tb_hits(const ThreadPool& threads) {
    const uint64_t localHits = threads.tb_hits();
    return tbHitsOthers + localHits;
 }
 /// Cluster-wide number of saves to the TT buffers: the local count plus the contribution
 /// of the other ranks, the latter being lazily updated in the signal loop.
 uint64_t TT_saves(const ThreadPool& threads) {
    const uint64_t localSaves = threads.TT_saves();
    return TTsavesOthers + localSaves;
 }
 }
 }
 #else
    #include "cluster.h"
    #include "thread.h"
 namespace Stockfish {
 namespace Cluster {
 // Without MPI there are no other ranks: each query simply forwards to the thread pool.
 uint64_t nodes_searched(const ThreadPool& threads) {
    return threads.nodes_searched();
 }
 uint64_t tb_hits(const ThreadPool& threads) {
    return threads.tb_hits();
 }
 uint64_t TT_saves(const ThreadPool& threads) {
    return threads.TT_saves();
 }
 }
 }
 #endif  // USE_MPI
@@ -0,0 +1,157 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2020 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef CLUSTER_H_INCLUDED
 #define CLUSTER_H_INCLUDED
 #include <algorithm>
 #include <array>
 #include <istream>
 #include <string>
 #include "tt.h"
 namespace Stockfish {
 class Thread;
 class ThreadPool;
 namespace Search {
 class Worker;
 }
 /// The Cluster namespace contains functionality required to run on distributed
 /// memory architectures using MPI as the message passing interface. On a high level,
 /// a 'lazy SMP'-like scheme is implemented where TT saves of sufficient depth are
 /// collected on each rank and distributed to, and used by, all other ranks,
 /// which search essentially independently.  The root (MPI rank 0) of the cluster
 /// is responsible for all I/O and time management, communicating this info to
 /// the other ranks as needed. UCI options such as Threads and Hash specify these
 /// quantities per MPI rank.  It is recommended to have one rank (MPI process) per node.
 /// For the non-MPI case, wrappers that will be compiler-optimized away are provided.
 namespace Cluster {
 /// Basic info to find the cluster-wide bestMove.
 /// All members are plain ints, so that the struct can be described to MPI
 /// as five MPI_INT fields located by their offsets.
 struct MoveInfo {
    int move;    // best move of the rank; presumably the raw integer encoding of a Move
    int ponder;  // ponder move; presumably encoded like 'move'
    int depth;   // depth associated with the move, added to the rank's vote
    int score;   // score associated with the move, used relative to the lowest score
    int rank;    // MPI rank this info originates from
 };
 #ifdef USE_MPI
 // store the TTEntry with its full key, so it can be saved on the receiver side
 using KeyedTTEntry                = std::pair<Key, TTEntry>;
 // Number of entries exchanged per thread and per rank in one communication round;
 // presumably also the capacity of each thread's TTCache - TODO confirm at its declaration
 constexpr std::size_t TTCacheSize = 16;
 // Threads locally cache their high-depth TT entries till a batch can be sent by MPI.
 // The underlying array is maintained as a heap whose front() is the entry of lowest depth.
 template<std::size_t N>
 class TTCache: public std::array<KeyedTTEntry, N> {
    // Orders entries by decreasing depth, which turns the standard heap algorithms
    // into a min-heap on depth. The call operator is const, as expected of a comparator.
    struct Compare {
        bool operator()(const KeyedTTEntry& lhs, const KeyedTTEntry& rhs) const {
            return lhs.second.depth() > rhs.second.depth();
        }
    };
    Compare compare;
   public:
    // Keep a heap of entries replacing low depth with high depth entries.
    // If value is deeper than the shallowest cached entry, it takes that entry's place
    // and true is returned. Otherwise the cache is left unchanged and false is returned.
    bool replace(const KeyedTTEntry& value) {
        if (!compare(value, this->front()))
            return false;
        // Move the shallowest entry to the back, overwrite it, and restore the heap
        std::pop_heap(this->begin(), this->end(), compare);
        this->back() = value;
        std::push_heap(this->begin(), this->end(), compare);
        return true;
    }
 };
 // Initialize MPI, the MoveInfo datatype and the dedicated communicators
 void        init();
 // Free the datatype and the communicators, and finalize MPI
 void        finalize();
 // Read a line on the root and broadcast it, together with the stream state, to all ranks
 bool        getline(std::istream& input, std::string& str);
 // Total number of ranks
 int         size();
 // Rank (index) of this process
 int         rank();
 // The root is the rank with index 0
 inline bool is_root() { return rank() == 0; }
 // Save to the local TT and, for entries of sufficient depth, take part in the TT exchange
 void        save(TranspositionTable&,
                 ThreadPool&,
                 Search::Worker* thread,
                 TTEntry*        tte,
                 Key             k,
                 Value           v,
                 bool            PvHit,
                 Bound           b,
                 Depth           d,
                 Move            m,
                 Value           ev,
                 uint8_t         generation8);
 // Select the cluster-wide best move and transfer the matching PV line to the root
 void        pick_moves(MoveInfo& mi, std::string& PVLine);
 // Resize and clear the TT send/receive buffers for the given number of threads
 void        ttSendRecvBuff_resize(size_t nThreads);
 // Cluster-wide counters: local value plus the lazily updated contribution of other ranks
 uint64_t    nodes_searched(const ThreadPool&);
 uint64_t    tb_hits(const ThreadPool&);
 uint64_t    TT_saves(const ThreadPool&);
 // Print an info line with cluster communication statistics
 void        cluster_info(const ThreadPool&, Depth depth, TimePoint elapsed);
 // Signal loop: reset the counters, poll for completion, synchronize at the end of search
 void        signals_init();
 void        signals_poll(ThreadPool& threads);
 void        signals_sync(ThreadPool& threads);
 #else
 // Non-MPI build: the process is a cluster of a single rank, which is also the root.
 // The wrappers below either do nothing or forward to the plain, local operation.
 // Parameters that are not used are left unnamed to avoid unused-parameter warnings.
 inline void init() {}
 inline void finalize() {}
 // Plain std::getline, returning the resulting stream state
 inline bool getline(std::istream& input, std::string& str) {
    return static_cast<bool>(std::getline(input, str));
 }
 constexpr int  size() { return 1; }
 constexpr int  rank() { return 0; }
 constexpr bool is_root() { return true; }
 // Only the standard save to the local TT entry is performed
 inline void    save(TranspositionTable&,
                    ThreadPool&,
                    Search::Worker*,
                    TTEntry* tte,
                    Key      k,
                    Value    v,
                    bool     PvHit,
                    Bound    b,
                    Depth    d,
                    Move     m,
                    Value    ev,
                    uint8_t  generation8) {
    tte->save(k, v, PvHit, b, d, m, ev, generation8);
 }
 // With a single rank the local MoveInfo and PV line are already the final ones
 inline void pick_moves(MoveInfo&, std::string&) {}
 inline void ttSendRecvBuff_resize(size_t) {}
 // Defined in cluster.cpp, where they forward to the thread pool
 uint64_t    nodes_searched(const ThreadPool&);
 uint64_t    tb_hits(const ThreadPool&);
 uint64_t    TT_saves(const ThreadPool&);
 inline void cluster_info(const ThreadPool&, Depth, TimePoint) {}
 inline void signals_init() {}
 inline void signals_poll(ThreadPool&) {}
 inline void signals_sync(ThreadPool&) {}
 #endif /* USE_MPI */
 }
 }
 #endif  // #ifndef CLUSTER_H_INCLUDED
@@ -22,161 +22,18 @@
 #include <cassert>
 #include <cmath>
 #include <cstdlib>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <optional>
 #include <sstream>
 #include <unordered_map>
 #include <vector>
-#include "incbin/incbin.h"
+#include "nnue/network.h"
-#include "misc.h"
+#include "nnue/nnue_misc.h"
 #include "nnue/evaluate_nnue.h"
 #include "nnue/nnue_architecture.h"
 #include "position.h"
 #include "types.h"
 #include "uci.h"
 #include "ucioption.h"
 // Macro to embed the default efficiently updatable neural network (NNUE) file
 // data in the engine binary (using incbin.h, by Dale Weiler).
 // Each macro invocation declares three variables named after its first
 // argument; for EmbeddedNNUEBig (and likewise for EmbeddedNNUESmall) these are
 //     const unsigned char        gEmbeddedNNUEBigData[];  // the embedded data
 //     const unsigned char *const gEmbeddedNNUEBigEnd;     // a marker to the end
 //     const unsigned int         gEmbeddedNNUEBigSize;    // the size of the embedded file
 // Note that this does not work in Microsoft Visual Studio.
 #if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF)
 INCBIN(EmbeddedNNUEBig, EvalFileDefaultNameBig);
 INCBIN(EmbeddedNNUESmall, EvalFileDefaultNameSmall);
 #else
 const unsigned char        gEmbeddedNNUEBigData[1]   = {0x0};
 const unsigned char* const gEmbeddedNNUEBigEnd       = &gEmbeddedNNUEBigData[1];
 const unsigned int         gEmbeddedNNUEBigSize      = 1;
 const unsigned char        gEmbeddedNNUESmallData[1] = {0x0};
 const unsigned char* const gEmbeddedNNUESmallEnd     = &gEmbeddedNNUESmallData[1];
 const unsigned int         gEmbeddedNNUESmallSize    = 1;
 #endif
 namespace Stockfish {
 namespace Eval {
 // Loads the NNUE network for every entry of evalFiles. This runs at startup
 // time, or when the engine receives a UCI command
 // "setoption name EvalFile value nn-[a-z0-9]{12}.nnue".
 // The requested file name is taken from the entry's UCI option, falling back
 // to the entry's default name when the option is empty. Candidate locations
 // are tried in order until one loads: internally (the default network may be
 // embedded in the binary), the active working directory, the engine directory
 // and, if distro packagers defined it, DEFAULT_NNUE_DIRECTORY.
 // Returns the updated map; an entry whose network could not be loaded keeps
 // its previous "current" name.
 NNUE::EvalFiles NNUE::load_networks(const std::string& rootDirectory,
                                    const OptionsMap&  options,
                                    NNUE::EvalFiles    evalFiles) {
    // C++ way to prepare a buffer for a memory stream
    class MemoryBuffer: public std::basic_streambuf<char> {
       public:
        MemoryBuffer(char* p, size_t n) {
            setg(p, p, p + n);
            setp(p, p + n);
        }
    };
    const std::string embedded = "<internal>";
    for (auto& [netSize, evalFile] : evalFiles)
    {
        std::string requested = options[evalFile.optionName];
        if (requested.empty())
            requested = evalFile.defaultName;
        std::vector<std::string> candidates = {embedded, "", rootDirectory};
 #if defined(DEFAULT_NNUE_DIRECTORY)
        candidates.push_back(stringify(DEFAULT_NNUE_DIRECTORY));
 #endif
        for (const std::string& directory : candidates)
        {
            // Nothing left to do once the requested net is the one in use
            if (evalFile.current == requested)
                break;
            std::optional<std::string> description;
            if (directory == embedded)
            {
                // Only the default net can be served from the binary
                if (requested != evalFile.defaultName)
                    continue;
                const bool   useSmall = netSize == Small;
                MemoryBuffer buffer(
                  const_cast<char*>(reinterpret_cast<const char*>(
                    useSmall ? gEmbeddedNNUESmallData : gEmbeddedNNUEBigData)),
                  size_t(useSmall ? gEmbeddedNNUESmallSize : gEmbeddedNNUEBigSize));
                (void) gEmbeddedNNUEBigEnd;  // Silence warning on unused variable
                (void) gEmbeddedNNUESmallEnd;
                std::istream stream(&buffer);
                description = NNUE::load_eval(stream, netSize);
            }
            else
            {
                std::ifstream stream(directory + requested, std::ios::binary);
                description = NNUE::load_eval(stream, netSize);
            }
            if (description)
            {
                evalFile.current        = requested;
                evalFile.netDescription = *description;
            }
        }
    }
    return evalFiles;
 }
 // Verifies that the last net used was loaded successfully
 void NNUE::verify(const OptionsMap&                                        options,
                  const std::unordered_map<Eval::NNUE::NetSize, EvalFile>& evalFiles) {
    for (const auto& [netSize, evalFile] : evalFiles)
    {
        std::string user_eval_file = options[evalFile.optionName];
        if (user_eval_file.empty())
            user_eval_file = evalFile.defaultName;
        if (evalFile.current != user_eval_file)
        {
            std::string msg1 =
              "Network evaluation parameters compatible with the engine must be available.";
            std::string msg2 =
              "The network file " + user_eval_file + " was not loaded successfully.";
            std::string msg3 = "The UCI option EvalFile might need to specify the full path, "
                               "including the directory name, to the network file.";
            std::string msg4 = "The default net can be downloaded from: "
                               "https://tests.stockfishchess.org/api/nn/"
                             + evalFile.defaultName;
            std::string msg5 = "The engine will be terminated now.";
            sync_cout << "info string ERROR: " << msg1 << sync_endl;
            sync_cout << "info string ERROR: " << msg2 << sync_endl;
            sync_cout << "info string ERROR: " << msg3 << sync_endl;
            sync_cout << "info string ERROR: " << msg4 << sync_endl;
            sync_cout << "info string ERROR: " << msg5 << sync_endl;
            exit(EXIT_FAILURE);
        }
        sync_cout << "info string NNUE evaluation using " << user_eval_file << sync_endl;
    }
 }
 }
 // Returns a static, purely materialistic evaluation of the position from
 // the point of view of the given color. It can be divided by PawnValue to get
 // an approximation of the material advantage on the board in terms of pawns.
@@ -188,28 +45,42 @@ int Eval::simple_eval(const Position& pos, Color c) {
 // Evaluate is the evaluator for the outer world. It returns a static evaluation
 // of the position from the point of view of the side to move.
-Value Eval::evaluate(const Position& pos, int optimism) {
+Value Eval::evaluate(const Eval::NNUE::Networks& networks, const Position& pos, int optimism) {
    assert(!pos.checkers());
    int  simpleEval = simple_eval(pos, pos.side_to_move());
-    bool smallNet   = std::abs(simpleEval) > 1050;
+    bool smallNet   = std::abs(simpleEval) > SmallNetThreshold;
    bool psqtOnly   = std::abs(simpleEval) > PsqtOnlyThreshold;
    int  nnueComplexity;
    int  v;
-    int nnueComplexity;
+    Value nnue = smallNet ? networks.small.evaluate(pos, true, &nnueComplexity, psqtOnly)
                          : networks.big.evaluate(pos, true, &nnueComplexity, false);
-    Value nnue = smallNet ? NNUE::evaluate<NNUE::Small>(pos, true, &nnueComplexity)
+    const auto adjustEval = [&](int optDiv, int nnueDiv, int pawnCountConstant, int pawnCountMul,
-                          : NNUE::evaluate<NNUE::Big>(pos, true, &nnueComplexity);
+                                int npmConstant, int evalDiv, int shufflingConstant,
                                int shufflingDiv) {
        // Blend optimism and eval with nnue complexity and material imbalance
        optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / optDiv;
        nnue -= nnue * (nnueComplexity * 5 / 3) / nnueDiv;
-    // Blend optimism and eval with nnue complexity and material imbalance
+        int npm = pos.non_pawn_material() / 64;
-    optimism += optimism * (nnueComplexity + std::abs(simpleEval - nnue)) / 512;
+        v       = (nnue * (npm + pawnCountConstant + pawnCountMul * pos.count<PAWN>())
-    nnue -= nnue * (nnueComplexity + std::abs(simpleEval - nnue)) / 32768;
+             + optimism * (npmConstant + npm))
          / evalDiv;
-    int npm = pos.non_pawn_material() / 64;
+        // Damp down the evaluation linearly when shuffling
-    int v   = (nnue * (915 + npm + 9 * pos.count<PAWN>()) + optimism * (154 + npm)) / 1024;
+        int shuffling = pos.rule50_count();
        v             = v * (shufflingConstant - shuffling) / shufflingDiv;
    };
-    // Damp down the evaluation linearly when shuffling
+    if (!smallNet)
-    int shuffling = pos.rule50_count();
+        adjustEval(513, 32395, 919, 11, 145, 1036, 178, 204);
-    v             = v * (200 - shuffling) / 214;
+    else if (psqtOnly)
        adjustEval(517, 32857, 908, 7, 155, 1019, 224, 238);
    else
        adjustEval(499, 32793, 903, 9, 147, 1067, 208, 211);
    // Guarantee evaluation does not hit the tablebase range
    v = std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
@@ -221,25 +92,24 @@ Value Eval::evaluate(const Position& pos, int optimism) {
 // a string (suitable for outputting to stdout) that contains the detailed
 // descriptions and values of each evaluation term. Useful for debugging.
 // Trace scores are from white's point of view
-std::string Eval::trace(Position& pos) {
+std::string Eval::trace(Position& pos, const Eval::NNUE::Networks& networks) {
    if (pos.checkers())
        return "Final evaluation: none (in check)";
    std::stringstream ss;
    ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2);
-    ss << '\n' << NNUE::trace(pos) << '\n';
+    ss << '\n' << NNUE::trace(pos, networks) << '\n';
    ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);
-    Value v;
+    Value v = networks.big.evaluate(pos, false);
-    v = NNUE::evaluate<NNUE::Big>(pos, false);
+    v       = pos.side_to_move() == WHITE ? v : -v;
-    v = pos.side_to_move() == WHITE ? v : -v;
+    ss << "NNUE evaluation        " << 0.01 * UCI::to_cp(v, pos) << " (white side)\n";
    ss << "NNUE evaluation        " << 0.01 * UCI::to_cp(v) << " (white side)\n";
-    v = evaluate(pos, VALUE_ZERO);
+    v = evaluate(networks, pos, VALUE_ZERO);
    v = pos.side_to_move() == WHITE ? v : -v;
-    ss << "Final evaluation       " << 0.01 * UCI::to_cp(v) << " (white side)";
+    ss << "Final evaluation       " << 0.01 * UCI::to_cp(v, pos) << " (white side)";
    ss << " [with scaled NNUE, ...]";
    ss << "\n";
@@ -20,49 +20,33 @@
 #define EVALUATE_H_INCLUDED
 #include <string>
 #include <unordered_map>
 #include "types.h"
 namespace Stockfish {
 class Position;
 class OptionsMap;
 namespace Eval {
-std::string trace(Position& pos);
+constexpr inline int SmallNetThreshold = 1165, PsqtOnlyThreshold = 2500;
 int   simple_eval(const Position& pos, Color c);
 Value evaluate(const Position& pos, int optimism);
 // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
 // for the build process (profile-build and fishtest) to work. Do not change the
-// name of the macro, as it is used in the Makefile.
+// name of the macro or the location where this macro is defined, as it is used
-#define EvalFileDefaultNameBig "nn-b1a57edbea57.nnue"
+// in the Makefile/Fishtest.
 #define EvalFileDefaultNameBig "nn-ae6a388e4a1a.nnue"
 #define EvalFileDefaultNameSmall "nn-baff1ede1f90.nnue"
 struct EvalFile {
    // UCI option name
    std::string optionName;
    // Default net name, will use one of the macros above
    std::string defaultName;
    // Selected net name, either via uci option or default
    std::string current;
    // Net description extracted from the net file
    std::string netDescription;
 };
 namespace NNUE {
 struct Networks;
 }
-enum NetSize : int;
+std::string trace(Position& pos, const Eval::NNUE::Networks& networks);
-using EvalFiles = std::unordered_map<Eval::NNUE::NetSize, EvalFile>;
+int   simple_eval(const Position& pos, Color c);
 Value evaluate(const NNUE::Networks& networks, const Position& pos, int optimism);
 EvalFiles load_networks(const std::string&, const OptionsMap&, EvalFiles);
 void      verify(const OptionsMap&, const EvalFiles&);
 }  // namespace NNUE
 }  // namespace Eval
@@ -17,10 +17,8 @@
 */
 #include <iostream>
 #include <unordered_map>
 #include "bitboard.h"
 #include "evaluate.h"
 #include "misc.h"
 #include "position.h"
 #include "tune.h"
@@ -31,7 +29,9 @@ using namespace Stockfish;
 int main(int argc, char* argv[]) {
-    std::cout << engine_info() << std::endl;
+    Cluster::init();
    if (Cluster::is_root())
        std::cout << engine_info() << std::endl;
    Bitboards::init();
    Position::init();
@@ -40,9 +40,9 @@ int main(int argc, char* argv[]) {
    Tune::init(uci.options);
    uci.evalFiles = Eval::NNUE::load_networks(uci.workingDirectory(), uci.options, uci.evalFiles);
    uci.loop();
    Cluster::finalize();
    return 0;
 }
@@ -75,7 +75,7 @@ namespace Stockfish {
 namespace {
 // Version number or dev.
-constexpr std::string_view version = "16.1";
+constexpr std::string_view version = "dev";
 // Our fancy logging facility. The trick here is to replace cin.rdbuf() and
 // cout.rdbuf() with two Tie objects that tie cin and cout to a file stream. We
@@ -596,14 +596,15 @@ namespace WinProcGroup {
 #ifndef _WIN32
-void bindThisThread(size_t) {}
+void bind_this_thread(size_t) {}
 #else
 namespace {
 // Retrieves logical processor information using Windows-specific
 // API and returns the best node id for the thread with index idx. Original
 // code from Texel by Peter Österlund.
-static int best_node(size_t idx) {
+int best_node(size_t idx) {
    int   threads      = 0;
    int   nodes        = 0;
@@ -668,10 +669,11 @@ static int best_node(size_t idx) {
    // then return -1 and let the OS to decide what to do.
    return idx < groups.size() ? groups[idx] : -1;
 }
 }
 // Sets the group affinity of the current thread
-void bindThisThread(size_t idx) {
+void bind_this_thread(size_t idx) {
    // Use only local variables to be thread-safe
    int node = best_node(idx);
@@ -25,6 +25,7 @@
 #include <cstddef>
 #include <cstdint>
 #include <iosfwd>
 #include <memory>
 #include <string>
 #include <vector>
@@ -49,6 +50,30 @@ void* aligned_large_pages_alloc(size_t size);
 // nop if mem == nullptr
 void aligned_large_pages_free(void* mem);
 // Deleter that runs the object's destructor and then hands its storage to
 // std_aligned_free(). Used as the deleter type of AlignedPtr.
 template<typename T>
 struct AlignedDeleter {
    void operator()(T* object) const {
        std::destroy_at(object);
        std_aligned_free(object);
    }
 };
 // Deleter that runs the object's destructor and then hands its storage to
 // aligned_large_pages_free(). Used as the deleter type of LargePagePtr.
 template<typename T>
 struct LargePageDeleter {
    void operator()(T* object) const {
        std::destroy_at(object);
        aligned_large_pages_free(object);
    }
 };
 // Owning pointer whose deleter is AlignedDeleter<T>
 template<typename T>
 using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 // Owning pointer whose deleter is LargePageDeleter<T>
 template<typename T>
 using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
 void dbg_hit_on(bool cond, int slot = 0);
 void dbg_mean_of(int64_t value, int slot = 0);
 void dbg_stdev_of(int64_t value, int slot = 0);
@@ -175,7 +200,7 @@ inline uint64_t mul_hi64(uint64_t a, uint64_t b) {
 // called to set group affinity for each thread. Original code from Texel by
 // Peter Österlund.
 namespace WinProcGroup {
-void bindThisThread(size_t idx);
+void bind_this_thread(size_t idx);
 }
@@ -190,18 +190,18 @@ void MovePicker::score() {
            m.value += bool(pos.check_squares(pt) & to) * 16384;
            // bonus for escaping from capture
-            m.value += threatenedPieces & from ? (pt == QUEEN && !(to & threatenedByRook)   ? 50000
+            m.value += threatenedPieces & from ? (pt == QUEEN && !(to & threatenedByRook)   ? 51000
-                                                  : pt == ROOK && !(to & threatenedByMinor) ? 25000
+                                                  : pt == ROOK && !(to & threatenedByMinor) ? 24950
-                                                  : !(to & threatenedByPawn)                ? 15000
+                                                  : !(to & threatenedByPawn)                ? 14450
                                                                                            : 0)
                                               : 0;
            // malus for putting piece en prise
            m.value -= !(threatenedPieces & from)
-                       ? (pt == QUEEN ? bool(to & threatenedByRook) * 50000
+                       ? (pt == QUEEN ? bool(to & threatenedByRook) * 48150
-                                          + bool(to & threatenedByMinor) * 10000
+                                          + bool(to & threatenedByMinor) * 10650
-                          : pt == ROOK ? bool(to & threatenedByMinor) * 25000
+                          : pt == ROOK ? bool(to & threatenedByMinor) * 24500
-                          : pt != PAWN ? bool(to & threatenedByPawn) * 15000
+                          : pt != PAWN ? bool(to & threatenedByPawn) * 14950
                                       : 0)
                       : 0;
        }
@@ -241,7 +241,7 @@ Move MovePicker::select(Pred filter) {
 // moves left, picking the move with the highest score from a list of generated moves.
 Move MovePicker::next_move(bool skipQuiets) {
-    auto quiet_threshold = [](Depth d) { return -3330 * d; };
+    auto quiet_threshold = [](Depth d) { return -3550 * d; };
 top:
    switch (stage)
@@ -19,6 +19,7 @@
 #ifndef MOVEPICK_H_INCLUDED
 #define MOVEPICK_H_INCLUDED
 #include <algorithm>
 #include <array>
 #include <cassert>
 #include <cmath>
@@ -28,8 +29,8 @@
 #include <type_traits>  // IWYU pragma: keep
 #include "movegen.h"
 #include "types.h"
 #include "position.h"
 #include "types.h"
 namespace Stockfish {
@@ -69,10 +70,11 @@ class StatsEntry {
    operator const T&() const { return entry; }
    void operator<<(int bonus) {
        assert(std::abs(bonus) <= D);  // Ensure range is [-D, D]
        static_assert(D <= std::numeric_limits<T>::max(), "D overflows T");
-        entry += bonus - entry * std::abs(bonus) / D;
+        // Make sure that bonus is in range [-D, D]
        int clampedBonus = std::clamp(bonus, -D, D);
        entry += clampedBonus - entry * std::abs(clampedBonus) / D;
        assert(std::abs(entry) <= D);
    }
@@ -1,482 +0,0 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Code for calculating NNUE evaluation function
 #include "evaluate_nnue.h"
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <iomanip>
 #include <iostream>
 #include <optional>
 #include <sstream>
 #include <string_view>
 #include <type_traits>
 #include <unordered_map>
 #include "../evaluate.h"
 #include "../misc.h"
 #include "../position.h"
 #include "../types.h"
 #include "../uci.h"
 #include "nnue_accumulator.h"
 #include "nnue_common.h"
 namespace Stockfish::Eval::NNUE {
 // Input feature converter
 LargePagePtr<FeatureTransformer<TransformedFeatureDimensionsBig, &StateInfo::accumulatorBig>>
  featureTransformerBig;
 LargePagePtr<FeatureTransformer<TransformedFeatureDimensionsSmall, &StateInfo::accumulatorSmall>>
  featureTransformerSmall;
 // Evaluation function
 AlignedPtr<Network<TransformedFeatureDimensionsBig, L2Big, L3Big>>       networkBig[LayerStacks];
 AlignedPtr<Network<TransformedFeatureDimensionsSmall, L2Small, L3Small>> networkSmall[LayerStacks];
 // Helpers for allocating, reading and writing the evaluation function parameters
 namespace Detail {
 // Gives the pointer fresh storage from std_aligned_alloc(), sized and aligned
 // for T, and fills that storage with zero bytes.
 template<typename T>
 void initialize(AlignedPtr<T>& pointer) {
    void* const storage = std_aligned_alloc(alignof(T), sizeof(T));
    pointer.reset(static_cast<T*>(storage));
    std::memset(storage, 0, sizeof(T));
 }
 // Gives the pointer fresh storage from aligned_large_pages_alloc(), sized
 // for T, and fills that storage with zero bytes.
 template<typename T>
 void initialize(LargePagePtr<T>& pointer) {
    static_assert(alignof(T) <= 4096,
                  "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
    void* const storage = aligned_large_pages_alloc(sizeof(T));
    pointer.reset(static_cast<T*>(storage));
    std::memset(storage, 0, sizeof(T));
 }
 // Reads one component: a 32-bit little-endian hash that must equal
 // T::get_hash_value(), followed by the component's own parameters.
 // Returns false if the stream failed, the hash differs, or the component
 // itself reports failure.
 template<typename T>
 bool read_parameters(std::istream& stream, T& reference) {
    const auto storedHash = read_little_endian<std::uint32_t>(stream);
    if (stream && storedHash == T::get_hash_value())
        return reference.read_parameters(stream);
    return false;
 }
 // Write evaluation function parameters: first the component's hash value as
 // a 32-bit little-endian integer, then the component's own parameters.
 // Returns whatever the component's write_parameters() returns.
 template<typename T>
 bool write_parameters(std::ostream& stream, const T& reference) {
    write_little_endian<std::uint32_t>(stream, T::get_hash_value());
    return reference.write_parameters(stream);
 }
 }  // namespace Detail
 // (Re)allocates and zeroes the feature transformer and every layer stack of
 // the selected net: the small one when netSize == Small, the big one otherwise.
 static void initialize(NetSize netSize) {
    const bool useSmall = netSize == Small;
    if (useSmall)
        Detail::initialize(featureTransformerSmall);
    else
        Detail::initialize(featureTransformerBig);
    for (std::size_t i = 0; i < LayerStacks; ++i)
    {
        if (useSmall)
            Detail::initialize(networkSmall[i]);
        else
            Detail::initialize(networkBig[i]);
    }
 }
 // Reads the network header: three 32-bit little-endian fields (version,
 // hash, description length) followed by the description bytes.
 // *hashValue is always overwritten; *desc is only touched when the stream
 // is still good and the version equals Version.
 // Returns false on a stream failure or version mismatch.
 static bool read_header(std::istream& stream, std::uint32_t* hashValue, std::string* desc) {
    const std::uint32_t version    = read_little_endian<std::uint32_t>(stream);
    *hashValue                     = read_little_endian<std::uint32_t>(stream);
    const std::uint32_t descLength = read_little_endian<std::uint32_t>(stream);
    if (stream && version == Version)
    {
        desc->resize(descLength);
        stream.read(desc->data(), descLength);
        return !stream.fail();
    }
    return false;
 }
 // Write network header: Version, hashValue and the description length as
 // 32-bit little-endian integers, followed by the description bytes.
 // Returns false if the stream is in a failed state afterwards.
 static bool write_header(std::ostream& stream, std::uint32_t hashValue, const std::string& desc) {
    write_little_endian<std::uint32_t>(stream, Version);
    write_little_endian<std::uint32_t>(stream, hashValue);
    write_little_endian<std::uint32_t>(stream, std::uint32_t(desc.size()));
    stream.write(&desc[0], desc.size());
    return !stream.fail();
 }
 // Read network parameters
 static bool read_parameters(std::istream& stream, NetSize netSize, std::string& netDescription) {
    std::uint32_t hashValue;
    if (!read_header(stream, &hashValue, &netDescription))
        return false;
    if (hashValue != HashValue[netSize])
        return false;
    if (netSize == Big && !Detail::read_parameters(stream, *featureTransformerBig))
        return false;
    if (netSize == Small && !Detail::read_parameters(stream, *featureTransformerSmall))
        return false;
    for (std::size_t i = 0; i < LayerStacks; ++i)
    {
        if (netSize == Big && !Detail::read_parameters(stream, *(networkBig[i])))
            return false;
        if (netSize == Small && !Detail::read_parameters(stream, *(networkSmall[i])))
            return false;
    }
    return stream && stream.peek() == std::ios::traits_type::eof();
 }
 // Writes a complete net of the given size: header (with HashValue[netSize]),
 // then the feature transformer, then every layer stack.
 // Returns false as soon as one part fails; otherwise the final stream state.
 static bool
 write_parameters(std::ostream& stream, NetSize netSize, const std::string& netDescription) {
    if (!write_header(stream, HashValue[netSize], netDescription))
        return false;
    bool ok = true;
    if (netSize == Big)
        ok = Detail::write_parameters(stream, *featureTransformerBig);
    else if (netSize == Small)
        ok = Detail::write_parameters(stream, *featureTransformerSmall);
    // Stop at the first layer stack that fails to write
    for (std::size_t i = 0; ok && i < LayerStacks; ++i)
    {
        if (netSize == Big)
            ok = Detail::write_parameters(stream, *(networkBig[i]));
        else if (netSize == Small)
            ok = Detail::write_parameters(stream, *(networkSmall[i]));
    }
    return ok && bool(stream);
 }
 void hint_common_parent_position(const Position& pos) {
    int simpleEval = simple_eval(pos, pos.side_to_move());
    if (std::abs(simpleEval) > 1050)
        featureTransformerSmall->hint_common_access(pos);
    else
        featureTransformerBig->hint_common_access(pos);
 }
 // Evaluation function. Perform differential calculation.
 // Net_Size selects which feature transformer / layer stacks are used.
 // When `adjusted` is true the PSQT and positional terms are blended with
 // weights (1024 - delta) and (1024 + delta); otherwise they are simply summed.
 // If `complexity` is non-null it receives |psqt - positional| / OutputScale.
 template<NetSize Net_Size>
 Value evaluate(const Position& pos, bool adjusted, int* complexity) {
    // We manually align the arrays on the stack because with gcc < 9.3
    // overaligning stack variables with alignas() doesn't work correctly.
    constexpr uint64_t alignment = CacheLineSize;
    constexpr int      delta     = 24;
 #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
    // Over-allocate by one alignment unit, then round the start address up
    TransformedFeatureType transformedFeaturesUnaligned
      [FeatureTransformer < Net_Size == Small ? TransformedFeatureDimensionsSmall
                                              : TransformedFeatureDimensionsBig,
       nullptr > ::BufferSize + alignment / sizeof(TransformedFeatureType)];
    auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
 #else
    alignas(alignment) TransformedFeatureType
      transformedFeatures[FeatureTransformer < Net_Size == Small ? TransformedFeatureDimensionsSmall
                                                                 : TransformedFeatureDimensionsBig,
                          nullptr > ::BufferSize];
 #endif
    ASSERT_ALIGNED(transformedFeatures, alignment);
    // The bucket (and thus the layer stack) is chosen from the total piece count
    const int  bucket     = (pos.count<ALL_PIECES>() - 1) / 4;
    const auto psqt       = Net_Size == Small
                            ? featureTransformerSmall->transform(pos, transformedFeatures, bucket)
                            : featureTransformerBig->transform(pos, transformedFeatures, bucket);
    const auto positional = Net_Size == Small ? networkSmall[bucket]->propagate(transformedFeatures)
                                              : networkBig[bucket]->propagate(transformedFeatures);
    if (complexity)
        *complexity = std::abs(psqt - positional) / OutputScale;
    // Give more value to positional evaluation when adjusted flag is set
    if (adjusted)
        return static_cast<Value>(((1024 - delta) * psqt + (1024 + delta) * positional)
                                  / (1024 * OutputScale));
    else
        return static_cast<Value>((psqt + positional) / OutputScale);
 }
 // Explicit instantiations for the two supported net sizes
 template Value evaluate<Big>(const Position& pos, bool adjusted, int* complexity);
 template Value evaluate<Small>(const Position& pos, bool adjusted, int* complexity);
 // Per-bucket evaluation breakdown produced by trace_evaluate()
 struct NnueEvalTrace {
    // Both arrays below are sized by LayerStacks and indexed by bucket
    static_assert(LayerStacks == PSQTBuckets);
    Value       psqt[LayerStacks];        // PSQT term of each bucket, divided by OutputScale
    Value       positional[LayerStacks];  // layer-stack term of each bucket, divided by OutputScale
    std::size_t correctBucket;            // bucket selected by the position's piece count
 };
 // Evaluates pos with the big net once per bucket and records the PSQT and
 // positional terms of each, together with the bucket that the position's
 // piece count actually selects.
 static NnueEvalTrace trace_evaluate(const Position& pos) {
    // We manually align the arrays on the stack because with gcc < 9.3
    // overaligning stack variables with alignas() doesn't work correctly.
    constexpr uint64_t alignment = CacheLineSize;
 #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
    TransformedFeatureType transformedFeaturesUnaligned
      [FeatureTransformer<TransformedFeatureDimensionsBig, nullptr>::BufferSize
       + alignment / sizeof(TransformedFeatureType)];
    auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
 #else
    alignas(alignment) TransformedFeatureType
      transformedFeatures[FeatureTransformer<TransformedFeatureDimensionsBig, nullptr>::BufferSize];
 #endif
    ASSERT_ALIGNED(transformedFeatures, alignment);
    NnueEvalTrace t{};
    // Same bucket formula as the regular evaluation
    t.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
    for (IndexType bucket = 0; bucket < LayerStacks; ++bucket)
    {
        const auto materialist = featureTransformerBig->transform(pos, transformedFeatures, bucket);
        const auto positional  = networkBig[bucket]->propagate(transformedFeatures);
        t.psqt[bucket]       = static_cast<Value>(materialist / OutputScale);
        t.positional[bucket] = static_cast<Value>(positional / OutputScale);
    }
    return t;
 }
 constexpr std::string_view PieceToChar(" PNBRQK  pnbrqk");
 // Converts a Value into (centi)pawns and writes it in a buffer as a sign
 // character followed by four characters; the layout depends on magnitude.
 // The buffer must have capacity for at least 5 chars; no terminator is written.
 static void format_cp_compact(Value v, char* buffer) {
    const int cp = std::abs(UCI::to_cp(v));
    // Decimal digit of cp at the given place value (1, 10, 100, ...)
    const auto digitAt = [cp](int place) { return char('0' + cp / place % 10); };
    buffer[0] = v < 0 ? '-' : v > 0 ? '+' : ' ';
    if (cp >= 10000)
    {
        // Whole pawns only, padded with a trailing space
        buffer[1] = char('0' + cp / 10000);
        buffer[2] = digitAt(1000);
        buffer[3] = digitAt(100);
        buffer[4] = ' ';
    }
    else if (cp >= 1000)
    {
        // Two integer digits and one decimal
        buffer[1] = digitAt(1000);
        buffer[2] = digitAt(100);
        buffer[3] = '.';
        buffer[4] = digitAt(10);
    }
    else
    {
        // One integer digit and two decimals
        buffer[1] = digitAt(100);
        buffer[2] = '.';
        buffer[3] = digitAt(10);
        buffer[4] = digitAt(1);
    }
 }
 // Writes a Value to the stream in pawns: a sign character ('-', '+' or a
 // space for zero) followed by the magnitude in fixed notation, width 6,
 // always keeping two decimals.
 static void format_cp_aligned_dot(Value v, std::stringstream& stream) {
    char sign = ' ';
    if (v < 0)
        sign = '-';
    else if (v > 0)
        sign = '+';
    const double magnitude = std::abs(0.01 * UCI::to_cp(v));
    stream << sign << std::setiosflags(std::ios::fixed) << std::setw(6) << std::setprecision(2)
           << magnitude;
 }
 // Returns a string with the value of each piece on a board,
 // and a table for (PSQT, Layers) values bucket by bucket.
 // The position is modified temporarily (pieces are removed and put back),
 // which is why it is taken by non-const reference.
 std::string trace(Position& pos) {
    std::stringstream ss;
    // Character canvas: each square is a box 3 rows high and 8 columns wide,
    // with shared borders; the last column holds the string terminator.
    char board[3 * 8 + 1][8 * 8 + 2];
    std::memset(board, ' ', sizeof(board));
    for (int row = 0; row < 3 * 8 + 1; ++row)
        board[row][8 * 8 + 1] = '\0';
    // A lambda to output one box of the board
    auto writeSquare = [&board](File file, Rank rank, Piece pc, Value value) {
        // Top-left corner of the box; rank 8 is drawn at the top
        const int x = int(file) * 8;
        const int y = (7 - int(rank)) * 3;
        for (int i = 1; i < 8; ++i)
            board[y][x + i] = board[y + 3][x + i] = '-';
        for (int i = 1; i < 3; ++i)
            board[y + i][x] = board[y + i][x + 8] = '|';
        board[y][x] = board[y][x + 8] = board[y + 3][x + 8] = board[y + 3][x] = '+';
        // First inner row: piece letter; second inner row: its estimated value
        if (pc != NO_PIECE)
            board[y + 1][x + 4] = PieceToChar[pc];
        if (value != VALUE_NONE)
            format_cp_compact(value, &board[y + 2][x + 2]);
    };
    // We estimate the value of each piece by doing a differential evaluation from
    // the current base eval, simulating the removal of the piece from its square.
    Value base = evaluate<NNUE::Big>(pos);
    base       = pos.side_to_move() == WHITE ? base : -base;
    for (File f = FILE_A; f <= FILE_H; ++f)
        for (Rank r = RANK_1; r <= RANK_8; ++r)
        {
            Square sq = make_square(f, r);
            Piece  pc = pos.piece_on(sq);
            Value  v  = VALUE_NONE;
            // Kings are never removed, so they keep VALUE_NONE
            if (pc != NO_PIECE && type_of(pc) != KING)
            {
                auto st = pos.state();
                pos.remove_piece(sq);
                // Mark the big accumulator as not computed for both colors
                // after changing the board
                st->accumulatorBig.computed[WHITE] = false;
                st->accumulatorBig.computed[BLACK] = false;
                Value eval = evaluate<NNUE::Big>(pos);
                eval       = pos.side_to_move() == WHITE ? eval : -eval;
                v          = base - eval;
                // Restore the piece and mark the accumulator as not computed again
                pos.put_piece(pc, sq);
                st->accumulatorBig.computed[WHITE] = false;
                st->accumulatorBig.computed[BLACK] = false;
            }
            writeSquare(f, r, pc, v);
        }
    ss << " NNUE derived piece values:\n";
    for (int row = 0; row < 3 * 8 + 1; ++row)
        ss << board[row] << '\n';
    ss << '\n';
    // Second part: per-bucket table of PSQT / positional / total values
    auto t = trace_evaluate(pos);
    ss << " NNUE network contributions "
       << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl
       << "+------------+------------+------------+------------+\n"
       << "|   Bucket   |  Material  | Positional |   Total    |\n"
       << "|            |   (PSQT)   |  (Layers)  |            |\n"
       << "+------------+------------+------------+------------+\n";
    for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket)
    {
        ss << "|  " << bucket << "        ";
        ss << " |  ";
        format_cp_aligned_dot(t.psqt[bucket], ss);
        ss << "  "
           << " |  ";
        format_cp_aligned_dot(t.positional[bucket], ss);
        ss << "  "
           << " |  ";
        format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss);
        ss << "  "
           << " |";
        if (bucket == t.correctBucket)
            ss << " <-- this bucket is used";
        ss << '\n';
    }
    ss << "+------------+------------+------------+------------+\n";
    return ss.str();
 }
 // Load eval, from a file stream or a memory stream. The storage of the
 // selected net is re-initialized first, so it is reset even when loading
 // fails. Returns the net description on success, std::nullopt otherwise.
 std::optional<std::string> load_eval(std::istream& stream, NetSize netSize) {
    initialize(netSize);
    std::string description;
    if (!read_parameters(stream, netSize, description))
        return std::nullopt;
    return description;
 }
 // Save eval, to a file stream or a memory stream. Nothing is written and
 // false is returned when name is empty or equal to "None".
 bool save_eval(std::ostream&      stream,
               NetSize            netSize,
               const std::string& name,
               const std::string& netDescription) {
    const bool hasNet = !name.empty() && name != "None";
    return hasNet && write_parameters(stream, netSize, netDescription);
 }
 // Save eval, to a file given by its name. Without an explicit filename the
 // net is written under its default name, which is only allowed when the
 // currently loaded net is that default net. The outcome is reported on
 // standard output and returned.
 bool save_eval(const std::optional<std::string>&                              filename,
               NetSize                                                        netSize,
               const std::unordered_map<Eval::NNUE::NetSize, Eval::EvalFile>& evalFiles) {
    const char* const defaultName =
      netSize == Small ? EvalFileDefaultNameSmall : EvalFileDefaultNameBig;
    std::string target;
    if (filename.has_value())
        target = *filename;
    else if (evalFiles.at(netSize).current == defaultName)
        target = defaultName;
    else
    {
        sync_cout << "Failed to export a net. "
                     "A non-embedded net can only be saved if the filename is specified"
                  << sync_endl;
        return false;
    }
    std::ofstream stream(target, std::ios_base::binary);
    const bool    saved = save_eval(stream, netSize, evalFiles.at(netSize).current,
                                    evalFiles.at(netSize).netDescription);
    const std::string report =
      saved ? "Network saved successfully to " + target : "Failed to export a net";
    sync_cout << report << sync_endl;
    return saved;
 }
 }  // namespace Stockfish::Eval::NNUE
@@ -1,93 +0,0 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // header used in NNUE evaluation function
 #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
 #define NNUE_EVALUATE_NNUE_H_INCLUDED
 #include <cstdint>
 #include <iosfwd>
 #include <memory>
 #include <optional>
 #include <string>
 #include <unordered_map>
 #include "../misc.h"
 #include "../types.h"
 #include "nnue_architecture.h"
 #include "nnue_feature_transformer.h"
 namespace Stockfish {
 class Position;
 namespace Eval {
 struct EvalFile;
 }
 }
 namespace Stockfish::Eval::NNUE {
 // Hash value of evaluation function structure.
 // One entry per net size: the first entry is built from the Big dimensions
 // (TransformedFeatureDimensionsBig, L2Big, L3Big), the second from the Small
 // ones. Each entry XORs the feature transformer hash with the network hash.
 constexpr std::uint32_t HashValue[2] = {
  FeatureTransformer<TransformedFeatureDimensionsBig, nullptr>::get_hash_value()
    ^ Network<TransformedFeatureDimensionsBig, L2Big, L3Big>::get_hash_value(),
  FeatureTransformer<TransformedFeatureDimensionsSmall, nullptr>::get_hash_value()
    ^ Network<TransformedFeatureDimensionsSmall, L2Small, L3Small>::get_hash_value()};
 // Deleter that destroys the object in place and then releases its storage
 // with std_aligned_free(). Used as the deleter type of AlignedPtr.
 template<typename T>
 struct AlignedDeleter {
    void operator()(T* object) const {
        // Run the destructor explicitly before the raw memory is released
        object->~T();
        std_aligned_free(object);
    }
 };
 // Deleter that destroys the object in place and then releases its storage
 // with aligned_large_pages_free(). Used as the deleter type of LargePagePtr.
 template<typename T>
 struct LargePageDeleter {
    void operator()(T* object) const {
        // Run the destructor explicitly before the raw memory is released
        object->~T();
        aligned_large_pages_free(object);
    }
 };
 // Owning pointer whose deleter is AlignedDeleter<T>
 template<typename T>
 using AlignedPtr = std::unique_ptr<T, AlignedDeleter<T>>;
 // Owning pointer whose deleter is LargePageDeleter<T>
 template<typename T>
 using LargePagePtr = std::unique_ptr<T, LargePageDeleter<T>>;
 // Returns a string for the given position.
 // NOTE(review): presumably a human-readable evaluation breakdown - confirm
 // against the definition, which is not part of this header.
 std::string trace(Position& pos);
 // Evaluates pos with the net selected by Net_Size. When complexity is not
 // null it is an output parameter.
 // NOTE(review): the exact meaning of adjusted and complexity is defined by
 // the implementation - confirm there.
 template<NetSize Net_Size>
 Value evaluate(const Position& pos, bool adjusted = false, int* complexity = nullptr);
 void  hint_common_parent_position(const Position& pos);
 // Loads a net from a stream; yields the net description on success and
 // std::nullopt on failure.
 std::optional<std::string> load_eval(std::istream& stream, NetSize netSize);
 // Saves a net to a stream; returns false without writing when name is empty
 // or "None".
 bool                       save_eval(std::ostream&      stream,
                                     NetSize            netSize,
                                     const std::string& name,
                                     const std::string& netDescription);
 // Saves a net to the named file, or to the default file name when filename
 // holds no value and the current net is the default one.
 bool                       save_eval(const std::optional<std::string>& filename,
                                     NetSize                           netSize,
                                     const std::unordered_map<Eval::NNUE::NetSize, Eval::EvalFile>&);
 }  // namespace Stockfish::Eval::NNUE
 #endif  // #ifndef NNUE_EVALUATE_NNUE_H_INCLUDED
@@ -0,0 +1,444 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #include "network.h"
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
 #include <fstream>
 #include <iostream>
 #include <optional>
 #include <type_traits>
 #include <vector>
 #include "../cluster.h"
 #include "../evaluate.h"
 #include "../incbin/incbin.h"
 #include "../misc.h"
 #include "../position.h"
 #include "../types.h"
 #include "nnue_architecture.h"
 #include "nnue_common.h"
 #include "nnue_misc.h"
 namespace {
 // Macro to embed the default efficiently updatable neural network (NNUE) files
 // in the engine binary (using incbin.h, by Dale Weiler).
 // Each INCBIN(Name, file) invocation is expected to declare three variables,
 // shown here for Name = EmbeddedNNUEBig; EmbeddedNNUESmall follows the same
 // pattern:
 //     const unsigned char        gEmbeddedNNUEBigData[];  // the embedded data
 //     const unsigned char *const gEmbeddedNNUEBigEnd;     // a marker to the end
 //     const unsigned int         gEmbeddedNNUEBigSize;    // the size of the embedded file
 // Note that this does not work in Microsoft Visual Studio.
 // When embedding is unavailable or switched off (NNUE_EMBEDDING_OFF), the
 // #else branch defines the same names as one-byte placeholders so that the
 // code referring to them still compiles.
 #if !defined(_MSC_VER) && !defined(NNUE_EMBEDDING_OFF)
 INCBIN(EmbeddedNNUEBig, EvalFileDefaultNameBig);
 INCBIN(EmbeddedNNUESmall, EvalFileDefaultNameSmall);
 #else
 const unsigned char        gEmbeddedNNUEBigData[1]   = {0x0};
 const unsigned char* const gEmbeddedNNUEBigEnd       = &gEmbeddedNNUEBigData[1];
 const unsigned int         gEmbeddedNNUEBigSize      = 1;
 const unsigned char        gEmbeddedNNUESmallData[1] = {0x0};
 const unsigned char* const gEmbeddedNNUESmallEnd     = &gEmbeddedNNUESmallData[1];
 const unsigned int         gEmbeddedNNUESmallSize    = 1;
 #endif
 // Non-owning description of one embedded network image: pointer to its data,
 // end marker and size, stored exactly as handed to the constructor.
 struct EmbeddedNNUE {
    const unsigned char* data;
    const unsigned char* end;
    const unsigned int   size;

    EmbeddedNNUE(const unsigned char* embeddedData,
                 const unsigned char* embeddedEnd,
                 const unsigned int   embeddedSize) :
        data(embeddedData),
        end(embeddedEnd),
        size(embeddedSize) {}
 };
 using namespace Stockfish::Eval::NNUE;
 // Returns the description of the embedded image for the requested type.
 // Every value other than EmbeddedNNUEType::BIG selects the small net.
 EmbeddedNNUE get_embedded(EmbeddedNNUEType type) {
    const bool wantBig = (type == EmbeddedNNUEType::BIG);
    return wantBig
           ? EmbeddedNNUE(gEmbeddedNNUEBigData, gEmbeddedNNUEBigEnd, gEmbeddedNNUEBigSize)
           : EmbeddedNNUE(gEmbeddedNNUESmallData, gEmbeddedNNUESmallEnd, gEmbeddedNNUESmallSize);
 }
 }
 namespace Stockfish::Eval::NNUE {
 namespace Detail {
 // Replaces the pointee with freshly allocated storage of sizeof(T) bytes,
 // aligned to alignof(T), and fills that storage with zero bytes.
 template<typename T>
 void initialize(AlignedPtr<T>& pointer) {
    void* const storage = std_aligned_alloc(alignof(T), sizeof(T));
    pointer.reset(reinterpret_cast<T*>(storage));
    std::memset(pointer.get(), 0, sizeof(T));
 }
 // Replaces the pointee with sizeof(T) bytes obtained from
 // aligned_large_pages_alloc() and fills that storage with zero bytes.
 template<typename T>
 void initialize(LargePagePtr<T>& pointer) {
    static_assert(alignof(T) <= 4096,
                  "aligned_large_pages_alloc() may fail for such a big alignment requirement of T");
    void* const storage = aligned_large_pages_alloc(sizeof(T));
    pointer.reset(reinterpret_cast<T*>(storage));
    std::memset(pointer.get(), 0, sizeof(T));
 }
 // Reads one component from the stream: a little-endian 32-bit hash that must
 // equal T::get_hash_value(), followed by the component's own parameters.
 // Returns false when the stream failed, the hash differs, or the component
 // rejects its parameters.
 template<typename T>
 bool read_parameters(std::istream& stream, T& reference) {
    const std::uint32_t storedHash = read_little_endian<std::uint32_t>(stream);
    const bool          headerOk   = stream && storedHash == T::get_hash_value();
    return headerOk && reference.read_parameters(stream);
 }
 // Writes one component to the stream: T::get_hash_value() as a little-endian
 // 32-bit value, followed by the component's own parameters. The result is
 // whatever the component's write_parameters() reports.
 template<typename T>
 bool write_parameters(std::ostream& stream, const T& reference) {
    const std::uint32_t componentHash = T::get_hash_value();
    write_little_endian<std::uint32_t>(stream, componentHash);
    return reference.write_parameters(stream);
 }
 }  // namespace Detail
 // Tries to load the net named evalfilePath (the default name when empty).
 // Candidate sources are visited in order: the embedded net ("<internal>",
 // only when the requested name is the default one), the name as given, the
 // name relative to rootDirectory and, when DEFAULT_NNUE_DIRECTORY is defined,
 // relative to that directory. The search ends at the first source that loads.
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::load(const std::string& rootDirectory, std::string evalfilePath) {
    std::vector<std::string> dirs = {"<internal>", "", rootDirectory};
 #if defined(DEFAULT_NNUE_DIRECTORY)
    dirs.push_back(stringify(DEFAULT_NNUE_DIRECTORY));
 #endif
    if (evalfilePath.empty())
        evalfilePath = evalFile.defaultName;
    for (const std::string& directory : dirs)
    {
        // A successful load records the name in evalFile.current
        if (evalFile.current == evalfilePath)
            break;
        const bool internal = (directory == "<internal>");
        if (!internal)
            load_user_net(directory, evalfilePath);
        else if (evalfilePath == evalFile.defaultName)
            load_internal();
    }
 }
 // Saves the net to a file and reports the outcome on the synchronized output.
 // With an explicit filename that file is written. Without one, the export is
 // refused unless the current net is the default one, in which case the
 // default name is used as the target.
 template<typename Arch, typename Transformer>
 bool Network<Arch, Transformer>::save(const std::optional<std::string>& filename) const {
    const bool explicitName = filename.has_value();
    if (!explicitName && evalFile.current != evalFile.defaultName)
    {
        sync_cout << "Failed to export a net. "
                     "A non-embedded net can only be saved if the filename is specified"
                  << sync_endl;
        return false;
    }
    const std::string target = explicitName ? filename.value() : evalFile.defaultName;
    // The file is opened in binary mode; a stream that failed to open makes
    // the stream-based overload report failure.
    std::ofstream out(target, std::ios_base::binary);
    const bool    ok = save(out, evalFile.current, evalFile.netDescription);
    if (ok)
        sync_cout << ("Network saved successfully to " + target) << sync_endl;
    else
        sync_cout << "Failed to export a net" << sync_endl;
    return ok;
 }
 // Evaluates pos with this network. The layer stack is chosen from the piece
 // count: (count - 1) / 4. With psqtOnly set, only the psqt part is computed
 // and the positional part counts as 0. When complexity is not null it
 // receives |psqt - positional| / OutputScale (0 with psqtOnly). With adjusted
 // set, the positional part is weighted slightly above the psqt part.
 template<typename Arch, typename Transformer>
 Value Network<Arch, Transformer>::evaluate(const Position& pos,
                                           bool            adjusted,
                                           int*            complexity,
                                           bool            psqtOnly) const {
    // We manually align the arrays on the stack because with gcc < 9.3
    // overaligning stack variables with alignas() doesn't work correctly.
    constexpr uint64_t alignment = CacheLineSize;
 #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
    TransformedFeatureType transformedFeaturesUnaligned
      [FeatureTransformer<Arch::TransformedFeatureDimensions, nullptr>::BufferSize
       + alignment / sizeof(TransformedFeatureType)];
    auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
 #else
    alignas(alignment) TransformedFeatureType transformedFeatures
      [FeatureTransformer<Arch::TransformedFeatureDimensions, nullptr>::BufferSize];
 #endif
    ASSERT_ALIGNED(transformedFeatures, alignment);
    const int  bucket = (pos.count<ALL_PIECES>() - 1) / 4;
    const auto psqt   = featureTransformer->transform(pos, transformedFeatures, bucket, psqtOnly);
    const auto positional = psqtOnly ? 0 : (network[bucket]->propagate(transformedFeatures));
    if (complexity)
        *complexity = psqtOnly ? 0 : std::abs(psqt - positional) / OutputScale;
    if (!adjusted)
        return static_cast<Value>((psqt + positional) / OutputScale);
    // Give more value to positional evaluation when adjusted flag is set:
    // the weights are (1024 - delta) for psqt and (1024 + delta) for positional.
    constexpr int delta = 24;
    return static_cast<Value>(((1024 - delta) * psqt + (1024 + delta) * positional)
                              / (1024 * OutputScale));
 }
 // Checks that the net named evalfilePath (the default name when empty) is
 // the one currently loaded. On success the root cluster node announces the
 // net in use; otherwise an explanation is printed and the process exits
 // with EXIT_FAILURE.
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::verify(std::string evalfilePath) const {
    if (evalfilePath.empty())
        evalfilePath = evalFile.defaultName;
    if (evalFile.current == evalfilePath)
    {
        if (Cluster::is_root())
            sync_cout << "info string NNUE evaluation using " << evalfilePath << sync_endl;
        return;
    }
    // Each entry is printed as its own "info string ERROR" line
    const std::string report[] = {
      "Network evaluation parameters compatible with the engine must be available.",
      "The network file " + evalfilePath + " was not loaded successfully.",
      "The UCI option EvalFile might need to specify the full path, "
      "including the directory name, to the network file.",
      "The default net can be downloaded from: "
      "https://tests.stockfishchess.org/api/nn/"
        + evalFile.defaultName,
      "The engine will be terminated now."};
    for (const std::string& line : report)
        sync_cout << "info string ERROR: " << line << sync_endl;
    exit(EXIT_FAILURE);
 }
 // Forwards the hint to the feature transformer's hint_common_access() for
 // the given position; psqtOnly is passed through unchanged.
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::hint_common_access(const Position& pos, bool psqtOnly) const {
    featureTransformer->hint_common_access(pos, psqtOnly);
 }
 // Evaluates pos once per layer stack and records, for every stack, the psqt
 // and positional parts divided by OutputScale. correctBucket holds the stack
 // index derived from the piece count: (count - 1) / 4.
 template<typename Arch, typename Transformer>
 NnueEvalTrace Network<Arch, Transformer>::trace_evaluate(const Position& pos) const {
    // We manually align the arrays on the stack because with gcc < 9.3
    // overaligning stack variables with alignas() doesn't work correctly.
    constexpr uint64_t alignment = CacheLineSize;
 #if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
    TransformedFeatureType transformedFeaturesUnaligned
      [FeatureTransformer<Arch::TransformedFeatureDimensions, nullptr>::BufferSize
       + alignment / sizeof(TransformedFeatureType)];
    auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
 #else
    alignas(alignment) TransformedFeatureType transformedFeatures
      [FeatureTransformer<Arch::TransformedFeatureDimensions, nullptr>::BufferSize];
 #endif
    ASSERT_ALIGNED(transformedFeatures, alignment);
    NnueEvalTrace result{};
    result.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
    for (IndexType stack = 0; stack < LayerStacks; ++stack)
    {
        // The full transform is always requested here (psqtOnly = false)
        const auto psqtPart =
          featureTransformer->transform(pos, transformedFeatures, stack, false);
        const auto positionalPart = network[stack]->propagate(transformedFeatures);
        result.psqt[stack]        = static_cast<Value>(psqtPart / OutputScale);
        result.positional[stack]  = static_cast<Value>(positionalPart / OutputScale);
    }
    return result;
 }
 // Tries to load the net from the file dir + evalfilePath, opened in binary
 // mode. On success the current net name and its description are recorded.
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::load_user_net(const std::string& dir,
                                               const std::string& evalfilePath) {
    std::ifstream file(dir + evalfilePath, std::ios::binary);
    if (const auto description = load(file))
    {
        evalFile.current        = evalfilePath;
        evalFile.netDescription = *description;
    }
 }
 // Tries to load the net from the data embedded in the binary. On success the
 // current net name is set to the default name and the description is recorded.
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::load_internal() {
    // Stream buffer whose get and put areas both span a given memory range
    class MemoryBuffer: public std::basic_streambuf<char> {
       public:
        MemoryBuffer(char* begin, size_t length) {
            setg(begin, begin, begin + length);
            setp(begin, begin + length);
        }
    };
    const auto  embedded = get_embedded(embeddedType);
    const char* bytes    = reinterpret_cast<const char*>(embedded.data);
    // The buffer interface wants a non-const pointer, hence the const_cast
    MemoryBuffer buffer(const_cast<char*>(bytes), size_t(embedded.size));
    std::istream memoryStream(&buffer);
    if (const auto description = load(memoryStream))
    {
        evalFile.current        = evalFile.defaultName;
        evalFile.netDescription = *description;
    }
 }
 // (Re)allocates and zero-fills the feature transformer and every layer stack
 template<typename Arch, typename Transformer>
 void Network<Arch, Transformer>::initialize() {
    Detail::initialize(featureTransformer);
    for (auto& layerStack : network)
        Detail::initialize(layerStack);
 }
 // Writes the net to a stream. Nothing is written and false is returned when
 // name is empty or equal to "None"; otherwise the result of
 // write_parameters() is returned.
 template<typename Arch, typename Transformer>
 bool Network<Arch, Transformer>::save(std::ostream&      stream,
                                      const std::string& name,
                                      const std::string& netDescription) const {
    const bool hasName = !name.empty() && name != "None";
    return hasName && write_parameters(stream, netDescription);
 }
 // Re-initializes the network storage and reads the net from a stream. Yields
 // the net description on success and std::nullopt when reading fails.
 template<typename Arch, typename Transformer>
 std::optional<std::string> Network<Arch, Transformer>::load(std::istream& stream) {
    initialize();
    std::string netDescription;
    if (read_parameters(stream, netDescription))
        return std::make_optional(netDescription);
    return std::nullopt;
 }
 // Reads the network header: three little-endian 32-bit values (version, hash
 // and description length) followed by the description bytes. *hashValue is
 // written before any validation. Returns false when the stream failed, the
 // version differs from Version, or the description could not be read.
 template<typename Arch, typename Transformer>
 bool Network<Arch, Transformer>::read_header(std::istream&  stream,
                                             std::uint32_t* hashValue,
                                             std::string*   desc) const {
    const std::uint32_t fileVersion = read_little_endian<std::uint32_t>(stream);
    *hashValue                      = read_little_endian<std::uint32_t>(stream);
    const std::uint32_t descLength  = read_little_endian<std::uint32_t>(stream);
    if (!stream || fileVersion != Version)
        return false;
    desc->resize(descLength);
    stream.read(desc->data(), descLength);
    return !stream.fail();
 }
 // Writes the network header: Version, hashValue and the description length
 // as little-endian 32-bit values, followed by the description bytes.
 // Returns false when the stream is in a failed state afterwards.
 template<typename Arch, typename Transformer>
 bool Network<Arch, Transformer>::write_header(std::ostream&      stream,
                                              std::uint32_t      hashValue,
                                              const std::string& desc) const {
    const auto descLength = std::uint32_t(desc.size());
    write_little_endian<std::uint32_t>(stream, Version);
    write_little_endian<std::uint32_t>(stream, hashValue);
    write_little_endian<std::uint32_t>(stream, descLength);
    stream.write(desc.data(), desc.size());
    return !stream.fail();
 }
 // Reads a complete net: header, feature transformer and every layer stack.
 // The hash stored in the header must equal Network::hash, and the stream
 // must be exactly at its end once everything has been read.
 template<typename Arch, typename Transformer>
 bool Network<Arch, Transformer>::read_parameters(std::istream& stream,
                                                 std::string&  netDescription) const {
    std::uint32_t fileHash;
    if (!read_header(stream, &fileHash, &netDescription) || fileHash != Network::hash)
        return false;
    if (!Detail::read_parameters(stream, *featureTransformer))
        return false;
    for (const auto& layerStack : network)
    {
        if (!Detail::read_parameters(stream, *layerStack))
            return false;
    }
    // Trailing bytes after the last layer stack make the load fail
    return stream && stream.peek() == std::ios::traits_type::eof();
 }
 // Writes a complete net: header (with Network::hash), feature transformer
 // and every layer stack. Stops at the first part that reports failure; the
 // final result is the state of the stream.
 template<typename Arch, typename Transformer>
 bool Network<Arch, Transformer>::write_parameters(std::ostream&      stream,
                                                  const std::string& netDescription) const {
    const bool prefixOk = write_header(stream, Network::hash, netDescription)
                       && Detail::write_parameters(stream, *featureTransformer);
    if (!prefixOk)
        return false;
    for (const auto& layerStack : network)
    {
        if (!Detail::write_parameters(stream, *layerStack))
            return false;
    }
    return bool(stream);
 }
 // Explicit template instantiation.
 // The member functions of Network are defined in this file, so the class is
 // instantiated here for two argument lists: the Big dimensions with the
 // accumulatorBig member of StateInfo, and the Small dimensions with the
 // accumulatorSmall member.
 template class Network<
  NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>,
  FeatureTransformer<TransformedFeatureDimensionsBig, &StateInfo::accumulatorBig>>;
 template class Network<
  NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>,
  FeatureTransformer<TransformedFeatureDimensionsSmall, &StateInfo::accumulatorSmall>>;
 }  // namespace Stockfish::Eval::NNUE
@@ -0,0 +1,120 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef NETWORK_H_INCLUDED
 #define NETWORK_H_INCLUDED
 #include <cstdint>
 #include <iostream>
 #include <optional>
 #include <string>
 #include <utility>
 #include "../misc.h"
 #include "../position.h"
 #include "../types.h"
 #include "nnue_architecture.h"
 #include "nnue_feature_transformer.h"
 #include "nnue_misc.h"
 namespace Stockfish::Eval::NNUE {
 // Selects which embedded network image a Network instance is associated with
 enum class EmbeddedNNUEType {
    BIG,
    SMALL,
 };
 // One NNUE network: a feature transformer (Transformer) plus LayerStacks
 // instances of the layer architecture (Arch), together with the description
 // of the file the parameters were loaded from.
 template<typename Arch, typename Transformer>
 class Network {
   public:
    // file is taken by value and moved into the member, so callers passing a
    // temporary do not pay for an extra copy of its strings.
    Network(EvalFile file, EmbeddedNNUEType type) :
        evalFile(std::move(file)),
        embeddedType(type) {}

    // Tries to load the net named evalfilePath (the default name when empty)
    // from the embedded data or from the candidate directories.
    void load(const std::string& rootDirectory, std::string evalfilePath);
    // Saves the net to filename; without a filename only the default net can
    // be saved, under its default name. Returns whether the export succeeded.
    bool save(const std::optional<std::string>& filename) const;

    // Evaluates pos. When complexity is not null it is an output parameter;
    // psqtOnly restricts the computation to the psqt part.
    Value evaluate(const Position& pos,
                   bool            adjusted   = false,
                   int*            complexity = nullptr,
                   bool            psqtOnly   = false) const;

    // Forwards the hint to the feature transformer
    void hint_common_access(const Position& pos, bool psqtOnly) const;

    // Terminates the process when the requested net is not the loaded one
    void          verify(std::string evalfilePath) const;
    // Evaluates pos once per layer stack and returns the per-stack values
    NnueEvalTrace trace_evaluate(const Position& pos) const;

   private:
    // Load from the file <directory><name>, or from the embedded data
    void load_user_net(const std::string&, const std::string&);
    void load_internal();

    // (Re)allocates and zero-fills the transformer and the layer stacks
    void initialize();

    // Stream-based counterparts of the public save() and load()
    bool                       save(std::ostream&, const std::string&, const std::string&) const;
    std::optional<std::string> load(std::istream&);

    // Header: version, hash and description
    bool read_header(std::istream&, std::uint32_t*, std::string*) const;
    bool write_header(std::ostream&, std::uint32_t, const std::string&) const;

    // Header followed by the transformer and all layer stacks
    bool read_parameters(std::istream&, std::string&) const;
    bool write_parameters(std::ostream&, const std::string&) const;

    // Input feature converter
    LargePagePtr<Transformer> featureTransformer;

    // Evaluation function
    AlignedPtr<Arch> network[LayerStacks];

    // Default name, name of the currently loaded net and its description
    EvalFile         evalFile;
    // Which embedded image load_internal() reads from
    EmbeddedNNUEType embeddedType;

    // Hash value of evaluation function structure
    static constexpr std::uint32_t hash = Transformer::get_hash_value() ^ Arch::get_hash_value();
 };
 // Definitions of the network types.
 // Each net size pairs a feature transformer, bound to the matching
 // accumulator member of StateInfo, with a layer architecture of the same
 // dimensions.
 using SmallFeatureTransformer =
  FeatureTransformer<TransformedFeatureDimensionsSmall, &StateInfo::accumulatorSmall>;
 using SmallNetworkArchitecture =
  NetworkArchitecture<TransformedFeatureDimensionsSmall, L2Small, L3Small>;
 using BigFeatureTransformer =
  FeatureTransformer<TransformedFeatureDimensionsBig, &StateInfo::accumulatorBig>;
 using BigNetworkArchitecture = NetworkArchitecture<TransformedFeatureDimensionsBig, L2Big, L3Big>;
 // Complete networks built from the pairs above
 using NetworkBig   = Network<BigNetworkArchitecture, BigFeatureTransformer>;
 using NetworkSmall = Network<SmallNetworkArchitecture, SmallFeatureTransformer>;
 // Bundle of the big and the small network. Both are taken as rvalues and
 // moved into the members.
 struct Networks {
    NetworkBig   big;
    NetworkSmall small;

    Networks(NetworkBig&& bigNet, NetworkSmall&& smallNet) :
        big(std::move(bigNet)),
        small(std::move(smallNet)) {}
 };
 }  // namespace Stockfish::Eval::NNUE
 #endif
@@ -34,6 +34,7 @@ struct alignas(CacheLineSize) Accumulator {
    std::int16_t accumulation[2][Size];
    std::int32_t psqtAccumulation[2][PSQTBuckets];
    bool         computed[2];
    bool         computedPSQT[2];
 };
 }  // namespace Stockfish::Eval::NNUE
@@ -37,13 +37,8 @@ namespace Stockfish::Eval::NNUE {
 // Input features used in evaluation function
 using FeatureSet = Features::HalfKAv2_hm;
 // Identifies a net size; as unscoped enumerators without initializers,
 // Big is 0 and Small is 1.
 enum NetSize : int {
    Big,
    Small
 };
 // Number of input feature dimensions after conversion
-constexpr IndexType TransformedFeatureDimensionsBig = 2560;
+constexpr IndexType TransformedFeatureDimensionsBig = 3072;
 constexpr int       L2Big                           = 15;
 constexpr int       L3Big                           = 32;
@@ -55,7 +50,7 @@ constexpr IndexType PSQTBuckets = 8;
 constexpr IndexType LayerStacks = 8;
 template<IndexType L1, int L2, int L3>
-struct Network {
+struct NetworkArchitecture {
    static constexpr IndexType TransformedFeatureDimensions = L1;
    static constexpr int       FC_0_OUTPUTS                 = L2;
    static constexpr int       FC_1_OUTPUTS                 = L3;
@@ -250,18 +250,21 @@ class FeatureTransformer {
    }
    // Convert input features
-    std::int32_t transform(const Position& pos, OutputType* output, int bucket) const {
+    std::int32_t
-        update_accumulator<WHITE>(pos);
+    transform(const Position& pos, OutputType* output, int bucket, bool psqtOnly) const {
-        update_accumulator<BLACK>(pos);
+        update_accumulator<WHITE>(pos, psqtOnly);
        update_accumulator<BLACK>(pos, psqtOnly);
        const Color perspectives[2]  = {pos.side_to_move(), ~pos.side_to_move()};
        const auto& accumulation     = (pos.state()->*accPtr).accumulation;
        const auto& psqtAccumulation = (pos.state()->*accPtr).psqtAccumulation;
-
+        const auto  psqt =
        const auto psqt =
          (psqtAccumulation[perspectives[0]][bucket] - psqtAccumulation[perspectives[1]][bucket])
          / 2;
        if (psqtOnly)
            return psqt;
        const auto& accumulation = (pos.state()->*accPtr).accumulation;
        for (IndexType p = 0; p < 2; ++p)
        {
@@ -312,20 +315,22 @@ class FeatureTransformer {
        return psqt;
    }  // end of function transform()
-    void hint_common_access(const Position& pos) const {
+    void hint_common_access(const Position& pos, bool psqtOnly) const {
-        hint_common_access_for_perspective<WHITE>(pos);
+        hint_common_access_for_perspective<WHITE>(pos, psqtOnly);
-        hint_common_access_for_perspective<BLACK>(pos);
+        hint_common_access_for_perspective<BLACK>(pos, psqtOnly);
    }
   private:
    template<Color Perspective>
    [[nodiscard]] std::pair<StateInfo*, StateInfo*>
-    try_find_computed_accumulator(const Position& pos) const {
+    try_find_computed_accumulator(const Position& pos, bool psqtOnly) const {
        // Look for a usable accumulator of an earlier position. We keep track
        // of the estimated gain in terms of features to be added/subtracted.
        StateInfo *st = pos.state(), *next = nullptr;
        int        gain = FeatureSet::refresh_cost(pos);
-        while (st->previous && !(st->*accPtr).computed[Perspective])
+        while (st->previous
               && (!(st->*accPtr).computedPSQT[Perspective]
                   || (!psqtOnly && !(st->*accPtr).computed[Perspective])))
        {
            // This governs when a full feature refresh is needed and how many
            // updates are better than just one full refresh.
@@ -347,7 +352,8 @@ class FeatureTransformer {
    template<Color Perspective, size_t N>
    void update_accumulator_incremental(const Position& pos,
                                        StateInfo*      computed_st,
-                                        StateInfo*      states_to_update[N]) const {
+                                        StateInfo*      states_to_update[N],
                                        bool            psqtOnly) const {
        static_assert(N > 0);
        assert(states_to_update[N - 1] == nullptr);
@@ -383,7 +389,8 @@ class FeatureTransformer {
            for (; i >= 0; --i)
            {
-                (states_to_update[i]->*accPtr).computed[Perspective] = true;
+                (states_to_update[i]->*accPtr).computed[Perspective]     = !psqtOnly;
                (states_to_update[i]->*accPtr).computedPSQT[Perspective] = true;
                const StateInfo* end_state = i == 0 ? computed_st : states_to_update[i - 1];
@@ -403,31 +410,34 @@ class FeatureTransformer {
        {
            assert(states_to_update[0]);
-            auto accIn =
+            if (!psqtOnly)
              reinterpret_cast<const vec_t*>(&(st->*accPtr).accumulation[Perspective][0]);
            auto accOut = reinterpret_cast<vec_t*>(
              &(states_to_update[0]->*accPtr).accumulation[Perspective][0]);
            const IndexType offsetR0 = HalfDimensions * removed[0][0];
            auto            columnR0 = reinterpret_cast<const vec_t*>(&weights[offsetR0]);
            const IndexType offsetA  = HalfDimensions * added[0][0];
            auto            columnA  = reinterpret_cast<const vec_t*>(&weights[offsetA]);
            if (removed[0].size() == 1)
            {
-                for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t);
+                auto accIn =
-                     ++k)
+                  reinterpret_cast<const vec_t*>(&(st->*accPtr).accumulation[Perspective][0]);
-                    accOut[k] = vec_add_16(vec_sub_16(accIn[k], columnR0[k]), columnA[k]);
+                auto accOut = reinterpret_cast<vec_t*>(
-            }
+                  &(states_to_update[0]->*accPtr).accumulation[Perspective][0]);
            else
            {
                const IndexType offsetR1 = HalfDimensions * removed[0][1];
                auto            columnR1 = reinterpret_cast<const vec_t*>(&weights[offsetR1]);
-                for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t);
+                const IndexType offsetR0 = HalfDimensions * removed[0][0];
-                     ++k)
+                auto            columnR0 = reinterpret_cast<const vec_t*>(&weights[offsetR0]);
-                    accOut[k] = vec_sub_16(vec_add_16(accIn[k], columnA[k]),
+                const IndexType offsetA  = HalfDimensions * added[0][0];
-                                           vec_add_16(columnR0[k], columnR1[k]));
+                auto            columnA  = reinterpret_cast<const vec_t*>(&weights[offsetA]);
                if (removed[0].size() == 1)
                {
                    for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t);
                         ++k)
                        accOut[k] = vec_add_16(vec_sub_16(accIn[k], columnR0[k]), columnA[k]);
                }
                else
                {
                    const IndexType offsetR1 = HalfDimensions * removed[0][1];
                    auto            columnR1 = reinterpret_cast<const vec_t*>(&weights[offsetR1]);
                    for (IndexType k = 0; k < HalfDimensions * sizeof(std::int16_t) / sizeof(vec_t);
                         ++k)
                        accOut[k] = vec_sub_16(vec_add_16(accIn[k], columnA[k]),
                                               vec_add_16(columnR0[k], columnR1[k]));
                }
            }
            auto accPsqtIn =
@@ -461,41 +471,43 @@ class FeatureTransformer {
        }
        else
        {
-            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
+            if (!psqtOnly)
-            {
+                for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
                // Load accumulator
                auto accTileIn = reinterpret_cast<const vec_t*>(
                  &(st->*accPtr).accumulation[Perspective][j * TileHeight]);
                for (IndexType k = 0; k < NumRegs; ++k)
                    acc[k] = vec_load(&accTileIn[k]);
                for (IndexType i = 0; states_to_update[i]; ++i)
                {
-                    // Difference calculation for the deactivated features
+                    // Load accumulator
-                    for (const auto index : removed[i])
+                    auto accTileIn = reinterpret_cast<const vec_t*>(
-                    {
+                      &(st->*accPtr).accumulation[Perspective][j * TileHeight]);
                        const IndexType offset = HalfDimensions * index + j * TileHeight;
                        auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
                        for (IndexType k = 0; k < NumRegs; ++k)
                            acc[k] = vec_sub_16(acc[k], column[k]);
                    }
                    // Difference calculation for the activated features
                    for (const auto index : added[i])
                    {
                        const IndexType offset = HalfDimensions * index + j * TileHeight;
                        auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
                        for (IndexType k = 0; k < NumRegs; ++k)
                            acc[k] = vec_add_16(acc[k], column[k]);
                    }
                    // Store accumulator
                    auto accTileOut = reinterpret_cast<vec_t*>(
                      &(states_to_update[i]->*accPtr).accumulation[Perspective][j * TileHeight]);
                    for (IndexType k = 0; k < NumRegs; ++k)
-                        vec_store(&accTileOut[k], acc[k]);
+                        acc[k] = vec_load(&accTileIn[k]);
                    for (IndexType i = 0; states_to_update[i]; ++i)
                    {
                        // Difference calculation for the deactivated features
                        for (const auto index : removed[i])
                        {
                            const IndexType offset = HalfDimensions * index + j * TileHeight;
                            auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
                            for (IndexType k = 0; k < NumRegs; ++k)
                                acc[k] = vec_sub_16(acc[k], column[k]);
                        }
                        // Difference calculation for the activated features
                        for (const auto index : added[i])
                        {
                            const IndexType offset = HalfDimensions * index + j * TileHeight;
                            auto column = reinterpret_cast<const vec_t*>(&weights[offset]);
                            for (IndexType k = 0; k < NumRegs; ++k)
                                acc[k] = vec_add_16(acc[k], column[k]);
                        }
                        // Store accumulator
                        auto accTileOut =
                          reinterpret_cast<vec_t*>(&(states_to_update[i]->*accPtr)
                                                      .accumulation[Perspective][j * TileHeight]);
                        for (IndexType k = 0; k < NumRegs; ++k)
                            vec_store(&accTileOut[k], acc[k]);
                    }
                }
            }
            for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
            {
@@ -537,8 +549,10 @@ class FeatureTransformer {
 #else
        for (IndexType i = 0; states_to_update[i]; ++i)
        {
-            std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective],
+            if (!psqtOnly)
-                        (st->*accPtr).accumulation[Perspective], HalfDimensions * sizeof(BiasType));
+                std::memcpy((states_to_update[i]->*accPtr).accumulation[Perspective],
                            (st->*accPtr).accumulation[Perspective],
                            HalfDimensions * sizeof(BiasType));
            for (std::size_t k = 0; k < PSQTBuckets; ++k)
                (states_to_update[i]->*accPtr).psqtAccumulation[Perspective][k] =
@@ -549,10 +563,12 @@ class FeatureTransformer {
            // Difference calculation for the deactivated features
            for (const auto index : removed[i])
            {
-                const IndexType offset = HalfDimensions * index;
+                if (!psqtOnly)
-
+                {
-                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    const IndexType offset = HalfDimensions * index;
-                    (st->*accPtr).accumulation[Perspective][j] -= weights[offset + j];
+                    for (IndexType j = 0; j < HalfDimensions; ++j)
                        (st->*accPtr).accumulation[Perspective][j] -= weights[offset + j];
                }
                for (std::size_t k = 0; k < PSQTBuckets; ++k)
                    (st->*accPtr).psqtAccumulation[Perspective][k] -=
@@ -562,10 +578,12 @@ class FeatureTransformer {
            // Difference calculation for the activated features
            for (const auto index : added[i])
            {
-                const IndexType offset = HalfDimensions * index;
+                if (!psqtOnly)
-
+                {
-                for (IndexType j = 0; j < HalfDimensions; ++j)
+                    const IndexType offset = HalfDimensions * index;
-                    (st->*accPtr).accumulation[Perspective][j] += weights[offset + j];
+                    for (IndexType j = 0; j < HalfDimensions; ++j)
                        (st->*accPtr).accumulation[Perspective][j] += weights[offset + j];
                }
                for (std::size_t k = 0; k < PSQTBuckets; ++k)
                    (st->*accPtr).psqtAccumulation[Perspective][k] +=
@@ -576,7 +594,7 @@ class FeatureTransformer {
    }
    template<Color Perspective>
-    void update_accumulator_refresh(const Position& pos) const {
+    void update_accumulator_refresh(const Position& pos, bool psqtOnly) const {
 #ifdef VECTOR
        // Gcc-10.2 unnecessarily spills AVX2 registers if this array
        // is defined in the VECTOR code below, once in each branch
@@ -587,40 +605,71 @@ class FeatureTransformer {
        // Refresh the accumulator
        // Could be extracted to a separate function because it's done in 2 places,
        // but it's unclear if compilers would correctly handle register allocation.
-        auto& accumulator                 = pos.state()->*accPtr;
+        auto& accumulator                     = pos.state()->*accPtr;
-        accumulator.computed[Perspective] = true;
+        accumulator.computed[Perspective]     = !psqtOnly;
        accumulator.computedPSQT[Perspective] = true;
        FeatureSet::IndexList active;
        FeatureSet::append_active_indices<Perspective>(pos, active);
 #ifdef VECTOR
-        for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
+        if (!psqtOnly)
-        {
+            for (IndexType j = 0; j < HalfDimensions / TileHeight; ++j)
            auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
            for (IndexType k = 0; k < NumRegs; ++k)
                acc[k] = biasesTile[k];
            for (const auto index : active)
            {
-                const IndexType offset = HalfDimensions * index + j * TileHeight;
+                auto biasesTile = reinterpret_cast<const vec_t*>(&biases[j * TileHeight]);
-                auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
+                for (IndexType k = 0; k < NumRegs; ++k)
                    acc[k] = biasesTile[k];
-                for (unsigned k = 0; k < NumRegs; ++k)
+                int i = 0;
-                    acc[k] = vec_add_16(acc[k], column[k]);
+                for (; i < int(active.size()) - 1; i += 2)
                {
                    IndexType       index0  = active[i];
                    IndexType       index1  = active[i + 1];
                    const IndexType offset0 = HalfDimensions * index0 + j * TileHeight;
                    const IndexType offset1 = HalfDimensions * index1 + j * TileHeight;
                    auto            column0 = reinterpret_cast<const vec_t*>(&weights[offset0]);
                    auto            column1 = reinterpret_cast<const vec_t*>(&weights[offset1]);
                    for (unsigned k = 0; k < NumRegs; ++k)
                        acc[k] = vec_add_16(acc[k], vec_add_16(column0[k], column1[k]));
                }
                for (; i < int(active.size()); ++i)
                {
                    IndexType       index  = active[i];
                    const IndexType offset = HalfDimensions * index + j * TileHeight;
                    auto            column = reinterpret_cast<const vec_t*>(&weights[offset]);
                    for (unsigned k = 0; k < NumRegs; ++k)
                        acc[k] = vec_add_16(acc[k], column[k]);
                }
                auto accTile =
                  reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
                for (unsigned k = 0; k < NumRegs; k++)
                    vec_store(&accTile[k], acc[k]);
            }
            auto accTile =
              reinterpret_cast<vec_t*>(&accumulator.accumulation[Perspective][j * TileHeight]);
            for (unsigned k = 0; k < NumRegs; k++)
                vec_store(&accTile[k], acc[k]);
        }
        for (IndexType j = 0; j < PSQTBuckets / PsqtTileHeight; ++j)
        {
            for (std::size_t k = 0; k < NumPsqtRegs; ++k)
                psqt[k] = vec_zero_psqt();
-            for (const auto index : active)
+            int i = 0;
            for (; i < int(active.size()) - 1; i += 2)
            {
                IndexType       index0  = active[i];
                IndexType       index1  = active[i + 1];
                const IndexType offset0 = PSQTBuckets * index0 + j * PsqtTileHeight;
                const IndexType offset1 = PSQTBuckets * index1 + j * PsqtTileHeight;
                auto columnPsqt0 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset0]);
                auto columnPsqt1 = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset1]);
                for (std::size_t k = 0; k < NumPsqtRegs; ++k)
                    psqt[k] =
                      vec_add_psqt_32(psqt[k], vec_add_psqt_32(columnPsqt0[k], columnPsqt1[k]));
            }
            for (; i < int(active.size()); ++i)
            {
                IndexType       index  = active[i];
                const IndexType offset = PSQTBuckets * index + j * PsqtTileHeight;
                auto columnPsqt        = reinterpret_cast<const psqt_vec_t*>(&psqtWeights[offset]);
@@ -635,18 +684,21 @@ class FeatureTransformer {
        }
 #else
-        std::memcpy(accumulator.accumulation[Perspective], biases,
+        if (!psqtOnly)
-                    HalfDimensions * sizeof(BiasType));
+            std::memcpy(accumulator.accumulation[Perspective], biases,
                        HalfDimensions * sizeof(BiasType));
        for (std::size_t k = 0; k < PSQTBuckets; ++k)
            accumulator.psqtAccumulation[Perspective][k] = 0;
        for (const auto index : active)
        {
-            const IndexType offset = HalfDimensions * index;
+            if (!psqtOnly)
-
+            {
-            for (IndexType j = 0; j < HalfDimensions; ++j)
+                const IndexType offset = HalfDimensions * index;
-                accumulator.accumulation[Perspective][j] += weights[offset + j];
+                for (IndexType j = 0; j < HalfDimensions; ++j)
                    accumulator.accumulation[Perspective][j] += weights[offset + j];
            }
            for (std::size_t k = 0; k < PSQTBuckets; ++k)
                accumulator.psqtAccumulation[Perspective][k] +=
@@ -656,7 +708,7 @@ class FeatureTransformer {
    }
    template<Color Perspective>
-    void hint_common_access_for_perspective(const Position& pos) const {
+    void hint_common_access_for_perspective(const Position& pos, bool psqtOnly) const {
        // Works like update_accumulator, but performs less work.
        // Updates ONLY the accumulator for pos.
@@ -664,27 +716,31 @@ class FeatureTransformer {
        // Look for a usable accumulator of an earlier position. We keep track
        // of the estimated gain in terms of features to be added/subtracted.
        // Fast early exit.
-        if ((pos.state()->*accPtr).computed[Perspective])
+        if ((pos.state()->*accPtr).computed[Perspective]
            || (psqtOnly && (pos.state()->*accPtr).computedPSQT[Perspective]))
            return;
-        auto [oldest_st, _] = try_find_computed_accumulator<Perspective>(pos);
+        auto [oldest_st, _] = try_find_computed_accumulator<Perspective>(pos, psqtOnly);
-        if ((oldest_st->*accPtr).computed[Perspective])
+        if ((oldest_st->*accPtr).computed[Perspective]
            || (psqtOnly && (oldest_st->*accPtr).computedPSQT[Perspective]))
        {
            // Only update current position accumulator to minimize work.
            StateInfo* states_to_update[2] = {pos.state(), nullptr};
-            update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update);
+            update_accumulator_incremental<Perspective, 2>(pos, oldest_st, states_to_update,
                                                           psqtOnly);
        }
        else
-            update_accumulator_refresh<Perspective>(pos);
+            update_accumulator_refresh<Perspective>(pos, psqtOnly);
    }
    template<Color Perspective>
-    void update_accumulator(const Position& pos) const {
+    void update_accumulator(const Position& pos, bool psqtOnly) const {
-        auto [oldest_st, next] = try_find_computed_accumulator<Perspective>(pos);
+        auto [oldest_st, next] = try_find_computed_accumulator<Perspective>(pos, psqtOnly);
-        if ((oldest_st->*accPtr).computed[Perspective])
+        if ((oldest_st->*accPtr).computed[Perspective]
            || (psqtOnly && (oldest_st->*accPtr).computedPSQT[Perspective]))
        {
            if (next == nullptr)
                return;
@@ -697,12 +753,11 @@ class FeatureTransformer {
            StateInfo* states_to_update[3] = {next, next == pos.state() ? nullptr : pos.state(),
                                              nullptr};
-            update_accumulator_incremental<Perspective, 3>(pos, oldest_st, states_to_update);
+            update_accumulator_incremental<Perspective, 3>(pos, oldest_st, states_to_update,
                                                           psqtOnly);
        }
        else
-        {
+            update_accumulator_refresh<Perspective>(pos, psqtOnly);
            update_accumulator_refresh<Perspective>(pos);
        }
    }
    alignas(CacheLineSize) BiasType biases[HalfDimensions];
@@ -0,0 +1,203 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 // Code for calculating NNUE evaluation function
 #include "nnue_misc.h"
 #include <cmath>
 #include <cstdlib>
 #include <cstring>
 #include <iomanip>
 #include <iosfwd>
 #include <iostream>
 #include <sstream>
 #include <string_view>
 #include "../evaluate.h"
 #include "../position.h"
 #include "../types.h"
 #include "../uci.h"
 #include "network.h"
 #include "nnue_accumulator.h"
 namespace Stockfish::Eval::NNUE {
 constexpr std::string_view PieceToChar(" PNBRQK  pnbrqk");
 void hint_common_parent_position(const Position& pos, const Networks& networks) {
    int simpleEvalAbs = std::abs(simple_eval(pos, pos.side_to_move()));
    if (simpleEvalAbs > Eval::SmallNetThreshold)
        networks.small.hint_common_access(pos, simpleEvalAbs > Eval::PsqtOnlyThreshold);
    else
        networks.big.hint_common_access(pos, false);
 }
 namespace {
 // Renders a Value as a sign plus four characters of (centi)pawns and writes
 // the result into buffer[0..4]. No terminator is written, so the buffer must
 // have capacity for at least 5 chars.
 //   cp >= 10000 : three whole-pawn digits followed by a blank
 //   cp >=  1000 : two whole-pawn digits, '.', one decimal
 //   otherwise   : one whole-pawn digit, '.', two decimals
 void format_cp_compact(Value v, char* buffer, const Position& pos) {
    char sign = ' ';
    if (v < 0)
        sign = '-';
    else if (v > 0)
        sign = '+';
    buffer[0] = sign;

    const int cp = std::abs(UCI::to_cp(v, pos));

    if (cp >= 10000)
    {
        buffer[1] = '0' + cp / 10000;
        buffer[2] = '0' + (cp % 10000) / 1000;
        buffer[3] = '0' + (cp % 1000) / 100;
        buffer[4] = ' ';
        return;
    }

    if (cp >= 1000)
    {
        buffer[1] = '0' + cp / 1000;
        buffer[2] = '0' + (cp % 1000) / 100;
        buffer[3] = '.';
        buffer[4] = '0' + (cp % 100) / 10;
        return;
    }

    buffer[1] = '0' + cp / 100;
    buffer[2] = '.';
    buffer[3] = '0' + (cp % 100) / 10;
    buffer[4] = '0' + cp % 10;
 }
 // Appends a Value to the stream expressed in pawns: a sign character
 // ('-', '+' or a blank for zero) followed by the magnitude in fixed notation,
 // right-aligned in a field of width 6 and always keeping two decimals.
 void format_cp_aligned_dot(Value v, std::stringstream& stream, const Position& pos) {
    char sign = ' ';
    if (v < 0)
        sign = '-';
    else if (v > 0)
        sign = '+';

    const double magnitude = std::abs(0.01 * UCI::to_cp(v, pos));

    stream << sign;
    // The width only applies to the next formatted output, i.e. the magnitude
    stream << std::setiosflags(std::ios::fixed) << std::setw(6) << std::setprecision(2);
    stream << magnitude;
 }
 }
 // Returns a string with the value of each piece on a board,
 // and a table for (PSQT, Layers) values bucket by bucket.
 //
 // pos is taken by non-const reference because every non-king piece is
 // temporarily removed from the board and then put back while its value is
 // estimated; the big-network accumulator flags of the current state are
 // reset along the way. Only networks.big is queried here.
 std::string trace(Position& pos, const Eval::NNUE::Networks& networks) {
    std::stringstream ss;
    // ASCII canvas: 8 ranks of 3 rows plus a closing border row, and 8 files of
    // 8 columns plus a closing border column and a '\0' terminator per row.
    char board[3 * 8 + 1][8 * 8 + 2];
    std::memset(board, ' ', sizeof(board));
    for (int row = 0; row < 3 * 8 + 1; ++row)
        board[row][8 * 8 + 1] = '\0';
    // A lambda to output one box of the board
    auto writeSquare = [&board, &pos](File file, Rank rank, Piece pc, Value value) {
        // (x, y) is the top-left corner of the box; rank 8 is drawn at the top
        const int x = int(file) * 8;
        const int y = (7 - int(rank)) * 3;
        // Horizontal edges
        for (int i = 1; i < 8; ++i)
            board[y][x + i] = board[y + 3][x + i] = '-';
        // Vertical edges
        for (int i = 1; i < 3; ++i)
            board[y + i][x] = board[y + i][x + 8] = '|';
        // Corners
        board[y][x] = board[y][x + 8] = board[y + 3][x + 8] = board[y + 3][x] = '+';
        // First inner row: the piece letter; second inner row: its value
        if (pc != NO_PIECE)
            board[y + 1][x + 4] = PieceToChar[pc];
        if (value != VALUE_NONE)
            format_cp_compact(value, &board[y + 2][x + 2], pos);
    };
    // We estimate the value of each piece by doing a differential evaluation from
    // the current base eval, simulating the removal of the piece from its square.
    Value base = networks.big.evaluate(pos);
    // Negated when Black is to move, so the sign does not depend on the side to
    // move (presumably evaluate() is side-to-move relative -- confirm)
    base       = pos.side_to_move() == WHITE ? base : -base;
    for (File f = FILE_A; f <= FILE_H; ++f)
        for (Rank r = RANK_1; r <= RANK_8; ++r)
        {
            Square sq = make_square(f, r);
            Piece  pc = pos.piece_on(sq);
            Value  v  = VALUE_NONE;
            // Kings are skipped: they are drawn but get no value
            if (pc != NO_PIECE && type_of(pc) != KING)
            {
                auto st = pos.state();
                pos.remove_piece(sq);
                // The board changed behind the accumulator's back, so mark the
                // big-network accumulator (both perspectives) as not computed
                st->accumulatorBig.computed[WHITE]       = st->accumulatorBig.computed[BLACK] =
                  st->accumulatorBig.computedPSQT[WHITE] = st->accumulatorBig.computedPSQT[BLACK] =
                    false;
                Value eval = networks.big.evaluate(pos);
                eval       = pos.side_to_move() == WHITE ? eval : -eval;
                // Piece value = evaluation with the piece minus evaluation without it
                v          = base - eval;
                pos.put_piece(pc, sq);
                // Invalidate again: the accumulator now reflects the board
                // without the piece that was just restored
                st->accumulatorBig.computed[WHITE]       = st->accumulatorBig.computed[BLACK] =
                  st->accumulatorBig.computedPSQT[WHITE] = st->accumulatorBig.computedPSQT[BLACK] =
                    false;
            }
            writeSquare(f, r, pc, v);
        }
    ss << " NNUE derived piece values:\n";
    for (int row = 0; row < 3 * 8 + 1; ++row)
        ss << board[row] << '\n';
    ss << '\n';
    // Per-bucket breakdown of the big network on the unmodified position
    auto t = networks.big.trace_evaluate(pos);
    ss << " NNUE network contributions "
       << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl
       << "+------------+------------+------------+------------+\n"
       << "|   Bucket   |  Material  | Positional |   Total    |\n"
       << "|            |   (PSQT)   |  (Layers)  |            |\n"
       << "+------------+------------+------------+------------+\n";
    // One table row per bucket: index, PSQT part, positional part and their sum
    for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket)
    {
        ss << "|  " << bucket << "        ";
        ss << " |  ";
        format_cp_aligned_dot(t.psqt[bucket], ss, pos);
        ss << "  "
           << " |  ";
        format_cp_aligned_dot(t.positional[bucket], ss, pos);
        ss << "  "
           << " |  ";
        format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], ss, pos);
        ss << "  "
           << " |";
        if (bucket == t.correctBucket)
            ss << " <-- this bucket is used";
        ss << '\n';
    }
    ss << "+------------+------------+------------+------------+\n";
    return ss.str();
 }
 }  // namespace Stockfish::Eval::NNUE
@@ -0,0 +1,63 @@
 /*
  Stockfish, a UCI chess playing engine derived from Glaurung 2.1
  Copyright (C) 2004-2024 The Stockfish developers (see AUTHORS file)
  Stockfish is free software: you can redistribute it and/or modify
  it under the terms of the GNU General Public License as published by
  the Free Software Foundation, either version 3 of the License, or
  (at your option) any later version.
  Stockfish is distributed in the hope that it will be useful,
  but WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
  GNU General Public License for more details.
  You should have received a copy of the GNU General Public License
  along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 #ifndef NNUE_MISC_H_INCLUDED
 #define NNUE_MISC_H_INCLUDED
 #include <cstddef>
 #include <string>
 #include "../types.h"
 #include "nnue_architecture.h"
 namespace Stockfish {
 class Position;
 namespace Eval::NNUE {
 // Names and description associated with one evaluation network file.
 struct EvalFile {
    // Default net name, will use one of the EvalFileDefaultName* macros defined
    // in evaluate.h
    std::string defaultName;
    // Selected net name, either via uci option or default
    std::string current;
    // Net description extracted from the net file
    std::string netDescription;
 };
 // Per-bucket breakdown of a network evaluation, as printed by trace().
 struct NnueEvalTrace {
    // Both arrays below are sized by LayerStacks and share one bucket index,
    // which is only meaningful while the two bucket counts are equal.
    static_assert(LayerStacks == PSQTBuckets);
    // PSQT ("Material") contribution of each bucket
    Value       psqt[LayerStacks];
    // Layers ("Positional") contribution of each bucket
    Value       positional[LayerStacks];
    // Index of the bucket reported as the one in use
    std::size_t correctBucket;
 };
 struct Networks;
 std::string trace(Position& pos, const Networks& networks);
 void        hint_common_parent_position(const Position& pos, const Networks& networks);
 }  // namespace Stockfish::Eval::NNUE
 }  // namespace Stockfish
 #endif  // #ifndef NNUE_MISC_H_INCLUDED
@@ -21,6 +21,7 @@
 #include <cstdint>
 #include "cluster.h"
 #include "movegen.h"
 #include "position.h"
 #include "types.h"
@@ -50,7 +51,7 @@ uint64_t perft(Position& pos, Depth depth) {
            nodes += cnt;
            pos.undo_move(m);
        }
-        if (Root)
+        if (Root && Cluster::is_root())
            sync_cout << UCI::move(m, pos.is_chess960()) << ": " << cnt << sync_endl;
    }
    return nodes;
@@ -62,7 +63,8 @@ inline void perft(const std::string& fen, Depth depth, bool isChess960) {
    p.set(fen, isChess960, &states->back());
    uint64_t nodes = perft<true>(p, depth);
-    sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
+    if (Cluster::is_root())
        sync_cout << "\nNodes searched: " << nodes << "\n" << sync_endl;
 }
 }
@@ -680,10 +680,14 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
    ++st->pliesFromNull;
    // Used by NNUE
-    st->accumulatorBig.computed[WHITE]     = st->accumulatorBig.computed[BLACK] =
+    st->accumulatorBig.computed[WHITE]             = st->accumulatorBig.computed[BLACK] =
-      st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false;
+      st->accumulatorBig.computedPSQT[WHITE]       = st->accumulatorBig.computedPSQT[BLACK] =
-    auto& dp                                                                      = st->dirtyPiece;
+        st->accumulatorSmall.computed[WHITE]       = st->accumulatorSmall.computed[BLACK] =
-    dp.dirty_num                                                                  = 1;
+          st->accumulatorSmall.computedPSQT[WHITE] = st->accumulatorSmall.computedPSQT[BLACK] =
            false;
    auto& dp     = st->dirtyPiece;
    dp.dirty_num = 1;
    Color  us       = sideToMove;
    Color  them     = ~us;
@@ -965,10 +969,13 @@ void Position::do_null_move(StateInfo& newSt, TranspositionTable& tt) {
    newSt.previous = st;
    st             = &newSt;
-    st->dirtyPiece.dirty_num               = 0;
+    st->dirtyPiece.dirty_num                 = 0;
-    st->dirtyPiece.piece[0]                = NO_PIECE;  // Avoid checks in UpdateAccumulator()
+    st->dirtyPiece.piece[0]                  = NO_PIECE;  // Avoid checks in UpdateAccumulator()
-    st->accumulatorBig.computed[WHITE]     = st->accumulatorBig.computed[BLACK] =
+    st->accumulatorBig.computed[WHITE]       = st->accumulatorBig.computed[BLACK] =
-      st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] = false;
+      st->accumulatorBig.computedPSQT[WHITE] = st->accumulatorBig.computedPSQT[BLACK] =
        st->accumulatorSmall.computed[WHITE] = st->accumulatorSmall.computed[BLACK] =
          st->accumulatorSmall.computedPSQT[WHITE] = st->accumulatorSmall.computedPSQT[BLACK] =
            false;
    if (st->epSquare != SQ_NONE)
    {
@@ -27,15 +27,16 @@
 #include <cstdlib>
 #include <initializer_list>
 #include <iostream>
 #include <utility>
 #include <sstream>
 #include <utility>
 #include "cluster.h"
 #include "evaluate.h"
 #include "misc.h"
 #include "movegen.h"
 #include "movepick.h"
 #include "nnue/evaluate_nnue.h"
 #include "nnue/nnue_common.h"
 #include "nnue/nnue_misc.h"
 #include "position.h"
 #include "syzygy/tbprobe.h"
 #include "thread.h"
@@ -53,11 +54,16 @@ using namespace Search;
 namespace {
 static constexpr double EvalLevel[10] = {1.043, 1.017, 0.952, 1.009, 0.971,
                                         1.002, 0.992, 0.947, 1.046, 1.001};
 // Futility margin
-Value futility_margin(Depth d, bool noTtCutNode, bool improving) {
+Value futility_margin(Depth d, bool noTtCutNode, bool improving, bool oppWorsening) {
-    Value futilityMult = 117 - 44 * noTtCutNode;
+    Value futilityMult       = 118 - 44 * noTtCutNode;
-    return (futilityMult * d - 3 * futilityMult / 2 * improving);
+    Value improvingDeduction = 53 * improving * futilityMult / 32;
    Value worseningDeduction = (309 + 47 * improving) * oppWorsening * futilityMult / 1024;
    return futilityMult * d - improvingDeduction - worseningDeduction;
 }
 constexpr int futility_move_count(bool improving, Depth depth) {
@@ -67,15 +73,15 @@ constexpr int futility_move_count(bool improving, Depth depth) {
 // Add correctionHistory value to raw staticEval and guarantee evaluation does not hit the tablebase range
 Value to_corrected_static_eval(Value v, const Worker& w, const Position& pos) {
    auto cv = w.correctionHistory[pos.side_to_move()][pawn_structure_index<Correction>(pos)];
-    v += cv * std::abs(cv) / 12475;
+    v += cv * std::abs(cv) / 11175;
    return std::clamp(v, VALUE_TB_LOSS_IN_MAX_PLY + 1, VALUE_TB_WIN_IN_MAX_PLY - 1);
 }
 // History and stats update bonus, based on depth
-int stat_bonus(Depth d) { return std::min(246 * d - 351, 1136); }
+int stat_bonus(Depth d) { return std::clamp(245 * d - 320, 0, 1296); }
 // History and stats update malus, based on depth
-int stat_malus(Depth d) { return std::min(519 * d - 306, 1258); }
+int stat_malus(Depth d) { return (d < 4 ? 554 * d - 303 : 1203); }
 // Add a small random component to draw evaluations to avoid 3-fold blindness
 Value value_draw(size_t nodes) { return VALUE_DRAW - 1 + Value(nodes & 0x2); }
@@ -133,7 +139,8 @@ Search::Worker::Worker(SharedState&                    sharedState,
    manager(std::move(sm)),
    options(sharedState.options),
    threads(sharedState.threads),
-    tt(sharedState.tt) {
+    tt(sharedState.tt),
    networks(sharedState.networks) {
    clear();
 }
@@ -151,8 +158,10 @@ void Search::Worker::start_searching() {
    if (rootMoves.empty())
    {
        rootMoves.emplace_back(Move::none());
-        sync_cout << "info depth 0 score "
+        if (Cluster::is_root())
-                  << UCI::value(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW) << sync_endl;
+            sync_cout << "info depth 0 score "
                      << UCI::to_score(rootPos.checkers() ? -VALUE_MATE : VALUE_DRAW, rootPos)
                      << sync_endl;
    }
    else
    {
@@ -166,12 +175,17 @@ void Search::Worker::start_searching() {
    // GUI sends a "stop" or "ponderhit" command. We therefore simply wait here
    // until the GUI sends one of those commands.
    while (!threads.stop && (main_manager()->ponder || limits.infinite))
-    {}  // Busy wait for a stop or a ponder reset
+    {
        Cluster::signals_poll(threads);
    }  // Busy wait for a stop or a ponder reset
    // Stop the threads if not already stopped (also raise the stop if
    // "ponderhit" just reset threads.ponder).
    threads.stop = true;
    // Signal and synchronize all other ranks
    Cluster::signals_sync(threads);
    // Wait until all threads have finished
    threads.wait_for_search_finished();
@@ -179,31 +193,50 @@ void Search::Worker::start_searching() {
    // the available ones before exiting.
    if (limits.npmsec)
        main_manager()->tm.advance_nodes_time(limits.inc[rootPos.side_to_move()]
-                                              - threads.nodes_searched());
+                                              - Cluster::nodes_searched(threads));
    Worker* bestThread = this;
    Skill   skill =
      Skill(options["Skill Level"], options["UCI_LimitStrength"] ? int(options["UCI_Elo"]) : 0);
-    if (int(options["MultiPV"]) == 1 && !limits.depth && !skill.enabled()
+    if (int(options["MultiPV"]) == 1 && !limits.depth && !limits.mate && !skill.enabled()
        && rootMoves[0].pv[0] != Move::none())
        bestThread = threads.get_best_thread()->worker.get();
    // Prepare PVLine and ponder move
    std::string PVLine = main_manager()->pv(*bestThread, threads, tt, bestThread->completedDepth);
    main_manager()->bestPreviousScore        = bestThread->rootMoves[0].score;
    main_manager()->bestPreviousAverageScore = bestThread->rootMoves[0].averageScore;
-    // Send again PV info if we have a new best thread
+    Move bestMove   = bestThread->rootMoves[0].pv[0];
-    if (bestThread != this)
+    Move ponderMove = Move::none();
        sync_cout << main_manager()->pv(*bestThread, threads, tt, bestThread->completedDepth)
                  << sync_endl;
    sync_cout << "bestmove " << UCI::move(bestThread->rootMoves[0].pv[0], rootPos.is_chess960());
    if (bestThread->rootMoves[0].pv.size() > 1
        || bestThread->rootMoves[0].extract_ponder_from_tt(tt, rootPos))
-        std::cout << " ponder " << UCI::move(bestThread->rootMoves[0].pv[1], rootPos.is_chess960());
+        ponderMove = bestThread->rootMoves[0].pv[1];
-    std::cout << sync_endl;
+    // Exchange info as needed
    Cluster::MoveInfo mi{bestMove.raw(), ponderMove.raw(), bestThread->completedDepth,
                         bestThread->rootMoves[0].score, Cluster::rank()};
    Cluster::pick_moves(mi, PVLine);
    main_manager()->bestPreviousScore = static_cast<Value>(mi.score);
    if (Cluster::is_root())
    {
        // Send again PV info if we have a new best thread/rank
        if (bestThread != this || mi.rank != 0)
            sync_cout << PVLine << sync_endl;
        bestMove   = static_cast<Move>(mi.move);
        ponderMove = static_cast<Move>(mi.ponder);
        if (ponderMove != Move::none())
            sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << " ponder "
                      << UCI::move(ponderMove, rootPos.is_chess960()) << sync_endl;
        else
            sync_cout << "bestmove " << UCI::move(bestMove, rootPos.is_chess960()) << sync_endl;
    }
 }
 // Main iterative deepening loop. It calls search()
@@ -211,7 +244,7 @@ void Search::Worker::start_searching() {
 // consumed, the user stops the search, or the maximum search depth is reached.
 void Search::Worker::iterative_deepening() {
-    SearchManager* mainThread = (thread_idx == 0 ? main_manager() : nullptr);
+    SearchManager* mainThread = (is_mainthread() ? main_manager() : nullptr);
    Move pv[MAX_PLY + 1];
@@ -265,7 +298,7 @@ void Search::Worker::iterative_deepening() {
    // Iterative deepening loop until requested to stop or the target depth is reached
    while (++rootDepth < MAX_PLY && !threads.stop
-           && !(limits.depth && mainThread && rootDepth > limits.depth))
+           && !(limits.depth && mainThread && Cluster::is_root() && rootDepth > limits.depth))
    {
        // Age out PV variability metric
        if (mainThread)
@@ -298,12 +331,12 @@ void Search::Worker::iterative_deepening() {
            // Reset aspiration window starting size
            Value avg = rootMoves[pvIdx].averageScore;
-            delta     = 9 + avg * avg / 12487;
+            delta     = 10 + avg * avg / 12493;
            alpha     = std::max(avg - delta, -VALUE_INFINITE);
            beta      = std::min(avg + delta, VALUE_INFINITE);
            // Adjust optimism based on root move's averageScore (~4 Elo)
-            optimism[us]  = 134 * avg / (std::abs(avg) + 97);
+            optimism[us]  = 132 * avg / (std::abs(avg) + 89);
            optimism[~us] = -optimism[us];
            // Start with a small aspiration window and, in the case of a fail
@@ -334,9 +367,14 @@ void Search::Worker::iterative_deepening() {
                // When failing high/low give some update (without cluttering
                // the UI) before a re-search.
-                if (mainThread && multiPV == 1 && (bestValue <= alpha || bestValue >= beta)
+                if (Cluster::is_root() && mainThread && multiPV == 1
-                    && mainThread->tm.elapsed(threads.nodes_searched()) > 3000)
+                    && (bestValue <= alpha || bestValue >= beta)
                    && mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
                {
                    sync_cout << main_manager()->pv(*this, threads, tt, rootDepth) << sync_endl;
                    Cluster::cluster_info(threads, rootDepth,
                                          mainThread->tm.elapsed(Cluster::nodes_searched(threads)));
                }
                // In case of failing low/high increase aspiration window and
                // re-search, otherwise exit the loop.
@@ -365,15 +403,19 @@ void Search::Worker::iterative_deepening() {
            // Sort the PV lines searched so far and update the GUI
            std::stable_sort(rootMoves.begin() + pvFirst, rootMoves.begin() + pvIdx + 1);
-            if (mainThread
+            if (Cluster::is_root() && mainThread
                && (threads.stop || pvIdx + 1 == multiPV
-                    || mainThread->tm.elapsed(threads.nodes_searched()) > 3000)
+                    || mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
                // A thread that aborted search can have mated-in/TB-loss PV and score
                // that cannot be trusted, i.e. it can be delayed or refuted if we would have
                // had time to fully search other root-moves. Thus we suppress this output and
                // below pick a proven score/PV for this thread (from the previous iteration).
                && !(threads.abortedSearch && rootMoves[0].uciScore <= VALUE_TB_LOSS_IN_MAX_PLY))
            {
                sync_cout << main_manager()->pv(*this, threads, tt, rootDepth) << sync_endl;
                Cluster::cluster_info(threads, rootDepth,
                                      mainThread->tm.elapsed(Cluster::nodes_searched(threads)) + 1);
            }
        }
        if (!threads.stop)
@@ -397,14 +439,18 @@ void Search::Worker::iterative_deepening() {
            lastBestMoveDepth = rootDepth;
        }
        // Have we found a "mate in x"?
        if (limits.mate && bestValue >= VALUE_MATE_IN_MAX_PLY
            && VALUE_MATE - bestValue <= 2 * limits.mate)
            threads.stop = true;
        if (!mainThread)
            continue;
        // Have we found a "mate in x"?
        if (limits.mate && rootMoves[0].score == rootMoves[0].uciScore
            && ((rootMoves[0].score >= VALUE_MATE_IN_MAX_PLY
                 && VALUE_MATE - rootMoves[0].score <= 2 * limits.mate)
                || (rootMoves[0].score != -VALUE_INFINITE
                    && rootMoves[0].score <= VALUE_MATED_IN_MAX_PLY
                    && VALUE_MATE + rootMoves[0].score <= 2 * limits.mate)))
            threads.stop = true;
        // If the skill level is enabled and time is up, pick a sub-optimal best move
        if (skill.enabled() && skill.time_to_pick(rootDepth))
            skill.pick_best(rootMoves, multiPV);
@@ -419,36 +465,33 @@ void Search::Worker::iterative_deepening() {
        // Do we have time for the next iteration? Can we stop searching now?
        if (limits.use_time_management() && !threads.stop && !mainThread->stopOnPonderhit)
        {
-            auto bestmove    = rootMoves[0].pv[0];
+            int nodesEffort = rootMoves[0].effort * 100 / std::max(size_t(1), size_t(nodes));
            int  nodesEffort = effort[bestmove.from_sq()][bestmove.to_sq()] * 100
                            / std::max(size_t(1), size_t(nodes));
-            double fallingEval = (66 + 14 * (mainThread->bestPreviousAverageScore - bestValue)
+            double fallingEval = (1067 + 223 * (mainThread->bestPreviousAverageScore - bestValue)
-                                  + 6 * (mainThread->iterValue[iterIdx] - bestValue))
+                                  + 97 * (mainThread->iterValue[iterIdx] - bestValue))
-                               / 616.6;
+                               / 10000.0;
-            fallingEval = std::clamp(fallingEval, 0.51, 1.51);
+            fallingEval = std::clamp(fallingEval, 0.580, 1.667);
            // If the bestMove is stable over several iterations, reduce time accordingly
-            timeReduction    = lastBestMoveDepth + 8 < completedDepth ? 1.56 : 0.69;
+            timeReduction    = lastBestMoveDepth + 8 < completedDepth ? 1.495 : 0.687;
-            double reduction = (1.4 + mainThread->previousTimeReduction) / (2.17 * timeReduction);
+            double reduction = (1.48 + mainThread->previousTimeReduction) / (2.17 * timeReduction);
-            double bestMoveInstability = 1 + 1.79 * totBestMoveChanges / threads.size();
+            double bestMoveInstability = 1 + 1.88 * totBestMoveChanges / threads.size();
            int    el                  = std::clamp((bestValue + 750) / 150, 0, 9);
-            double totalTime =
+            double totalTime = mainThread->tm.optimum() * fallingEval * reduction
-              mainThread->tm.optimum() * fallingEval * reduction * bestMoveInstability;
+                             * bestMoveInstability * EvalLevel[el];
            // Cap used time in case of a single legal move for a better viewer experience
            if (rootMoves.size() == 1)
                totalTime = std::min(500.0, totalTime);
-            if (completedDepth >= 10 && nodesEffort >= 95
+            if (completedDepth >= 10 && nodesEffort >= 97
-                && mainThread->tm.elapsed(threads.nodes_searched()) > totalTime * 3 / 4
+                && mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > totalTime * 0.739
                && !mainThread->ponder)
            {
                threads.stop = true;
            }
            // Stop the search if we have exceeded the totalTime
-            if (mainThread->tm.elapsed(threads.nodes_searched()) > totalTime)
+            if (mainThread->tm.elapsed(Cluster::nodes_searched(threads)) > totalTime)
            {
                // If we are allowed to ponder do not stop the search now but
                // keep pondering until the GUI sends "ponderhit" or "stop".
@@ -457,11 +500,10 @@ void Search::Worker::iterative_deepening() {
                else
                    threads.stop = true;
            }
            else if (!mainThread->ponder
                     && mainThread->tm.elapsed(threads.nodes_searched()) > totalTime * 0.50)
                threads.increaseDepth = false;
            else
-                threads.increaseDepth = true;
+                threads.increaseDepth =
                  mainThread->ponder
                  || mainThread->tm.elapsed(Cluster::nodes_searched(threads)) <= totalTime * 0.506;
        }
        mainThread->iterValue[iterIdx] = bestValue;
@@ -491,10 +533,10 @@ void Search::Worker::clear() {
        for (StatsType c : {NoCaptures, Captures})
            for (auto& to : continuationHistory[inCheck][c])
                for (auto& h : to)
-                    h->fill(-71);
+                    h->fill(-67);
    for (size_t i = 1; i < reductions.size(); ++i)
-        reductions[i] = int((18.79 + std::log(size_t(options["Threads"])) / 2) * std::log(i));
+        reductions[i] = int((19.80 + std::log(size_t(options["Threads"])) / 2) * std::log(i));
 }
@@ -533,7 +575,7 @@ Value Search::Worker::search(
    Move     ttMove, move, excludedMove, bestMove;
    Depth    extension, newDepth;
    Value    bestValue, value, ttValue, eval, maxValue, probCutBeta;
-    bool     givesCheck, improving, priorCapture;
+    bool     givesCheck, improving, priorCapture, opponentWorsening;
    bool     capture, moveCountPruning, ttCapture;
    Piece    movedPiece;
    int      moveCount, captureCount, quietCount;
@@ -560,8 +602,9 @@ Value Search::Worker::search(
        // Step 2. Check for aborted search and immediate draw
        if (threads.stop.load(std::memory_order_relaxed) || pos.is_draw(ss->ply)
            || ss->ply >= MAX_PLY)
-            return (ss->ply >= MAX_PLY && !ss->inCheck) ? evaluate(pos, thisThread->optimism[us])
+            return (ss->ply >= MAX_PLY && !ss->inCheck)
-                                                        : value_draw(thisThread->nodes);
+                   ? evaluate(networks, pos, thisThread->optimism[us])
                   : value_draw(thisThread->nodes);
        // Step 3. Mate distance pruning. Even if we mate at the next move our score
        // would be at best mate_in(ss->ply + 1), but if alpha is already bigger because
@@ -607,20 +650,17 @@ Value Search::Worker::search(
        && (tte->bound() & (ttValue >= beta ? BOUND_LOWER : BOUND_UPPER)))
    {
        // If ttMove is quiet, update move sorting heuristics on TT hit (~2 Elo)
-        if (ttMove)
+        if (ttMove && ttValue >= beta)
        {
-            if (ttValue >= beta)
+            // Bonus for a quiet ttMove that fails high (~2 Elo)
-            {
+            if (!ttCapture)
-                // Bonus for a quiet ttMove that fails high (~2 Elo)
+                update_quiet_stats(pos, ss, *this, ttMove, stat_bonus(depth));
                if (!ttCapture)
                    update_quiet_stats(pos, ss, *this, ttMove, stat_bonus(depth));
-                // Extra penalty for early quiet moves of
+            // Extra penalty for early quiet moves of
-                // the previous ply (~0 Elo on STC, ~2 Elo on LTC).
+            // the previous ply (~1 Elo on STC, ~2 Elo on LTC)
-                if (prevSq != SQ_NONE && (ss - 1)->moveCount <= 2 && !priorCapture)
+            if (prevSq != SQ_NONE && (ss - 1)->moveCount <= 2 && !priorCapture)
-                    update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq,
+                update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq,
-                                                  -stat_malus(depth + 1));
+                                              -stat_malus(depth + 1));
            }
        }
        // Partial workaround for the graph history interaction problem
@@ -666,9 +706,9 @@ Value Search::Worker::search(
                if (b == BOUND_EXACT || (b == BOUND_LOWER ? value >= beta : value <= alpha))
                {
-                    tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, b,
+                    Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(value, ss->ply),
-                              std::min(MAX_PLY - 1, depth + 6), Move::none(), VALUE_NONE,
+                                  ss->ttPv, b, std::min(MAX_PLY - 1, depth + 6), Move::none(),
-                              tt.generation());
+                                  VALUE_NONE, tt.generation());
                    return value;
                }
@@ -697,7 +737,7 @@ Value Search::Worker::search(
    {
        // Providing the hint that this node's accumulator will be used often
        // brings significant Elo gain (~13 Elo).
-        Eval::NNUE::hint_common_parent_position(pos);
+        Eval::NNUE::hint_common_parent_position(pos, networks);
        unadjustedStaticEval = eval = ss->staticEval;
    }
    else if (ss->ttHit)
@@ -705,9 +745,9 @@ Value Search::Worker::search(
        // Never assume anything about values stored in TT
        unadjustedStaticEval = tte->eval();
        if (unadjustedStaticEval == VALUE_NONE)
-            unadjustedStaticEval = evaluate(pos, thisThread->optimism[us]);
+            unadjustedStaticEval = evaluate(networks, pos, thisThread->optimism[us]);
        else if (PvNode)
-            Eval::NNUE::hint_common_parent_position(pos);
+            Eval::NNUE::hint_common_parent_position(pos, networks);
        ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
@@ -717,23 +757,23 @@ Value Search::Worker::search(
    }
    else
    {
-        unadjustedStaticEval = evaluate(pos, thisThread->optimism[us]);
+        unadjustedStaticEval = evaluate(networks, pos, thisThread->optimism[us]);
        ss->staticEval = eval = to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
        // Static evaluation is saved as it was before adjustment by correction history
-        tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, Move::none(),
+        Cluster::save(tt, threads, thisThread, tte, posKey, VALUE_NONE, ss->ttPv, BOUND_NONE,
-                  unadjustedStaticEval, tt.generation());
+                      DEPTH_NONE, Move::none(), unadjustedStaticEval, tt.generation());
    }
    // Use static evaluation difference to improve quiet move ordering (~9 Elo)
    if (((ss - 1)->currentMove).is_ok() && !(ss - 1)->inCheck && !priorCapture)
    {
-        int bonus = std::clamp(-14 * int((ss - 1)->staticEval + ss->staticEval), -1723, 1455);
+        int bonus = std::clamp(-13 * int((ss - 1)->staticEval + ss->staticEval), -1578, 1291);
        bonus     = bonus > 0 ? 2 * bonus : bonus / 2;
        thisThread->mainHistory[~us][((ss - 1)->currentMove).from_to()] << bonus;
        if (type_of(pos.piece_on(prevSq)) != PAWN && ((ss - 1)->currentMove).type_of() != PROMOTION)
            thisThread->pawnHistory[pawn_structure_index(pos)][pos.piece_on(prevSq)][prevSq]
-              << bonus / 4;
+              << bonus / 2;
    }
    // Set up the improving flag, which is true if current static evaluation is
@@ -745,11 +785,13 @@ Value Search::Worker::search(
                ? ss->staticEval > (ss - 2)->staticEval
                : (ss - 4)->staticEval != VALUE_NONE && ss->staticEval > (ss - 4)->staticEval;
    opponentWorsening = ss->staticEval + (ss - 1)->staticEval > 2;
    // Step 7. Razoring (~1 Elo)
    // If eval is really low check with qsearch if it can exceed alpha, if it can't,
    // return a fail low.
    // Adjust razor margin according to cutoffCnt. (~1 Elo)
-    if (eval < alpha - 438 - (332 - 154 * ((ss + 1)->cutoffCnt > 3)) * depth * depth)
+    if (eval < alpha - 488 - (289 - 142 * ((ss + 1)->cutoffCnt > 3)) * depth * depth)
    {
        value = qsearch<NonPV>(pos, ss, alpha - 1, alpha);
        if (value < alpha)
@@ -758,24 +800,23 @@ Value Search::Worker::search(
    // Step 8. Futility pruning: child node (~40 Elo)
    // The depth condition is important for mate finding.
-    if (!ss->ttPv && depth < 11
+    if (!ss->ttPv && depth < 12
-        && eval - futility_margin(depth, cutNode && !ss->ttHit, improving)
+        && eval - futility_margin(depth, cutNode && !ss->ttHit, improving, opponentWorsening)
-               - (ss - 1)->statScore / 314
+               - (ss - 1)->statScore / 267
             >= beta
-        && eval >= beta && eval < 30016  // smaller than TB wins
+        && eval >= beta && eval < VALUE_TB_WIN_IN_MAX_PLY && (!ttMove || ttCapture))
        && (!ttMove || ttCapture))
        return beta > VALUE_TB_LOSS_IN_MAX_PLY ? (eval + beta) / 2 : eval;
    // Step 9. Null move search with verification search (~35 Elo)
-    if (!PvNode && (ss - 1)->currentMove != Move::null() && (ss - 1)->statScore < 16620
+    if (!PvNode && (ss - 1)->currentMove != Move::null() && (ss - 1)->statScore < 16878
-        && eval >= beta && eval >= ss->staticEval && ss->staticEval >= beta - 21 * depth + 330
+        && eval >= beta && ss->staticEval >= beta - 20 * depth + 314 && !excludedMove
-        && !excludedMove && pos.non_pawn_material(us) && ss->ply >= thisThread->nmpMinPly
+        && pos.non_pawn_material(us) && ss->ply >= thisThread->nmpMinPly
        && beta > VALUE_TB_LOSS_IN_MAX_PLY)
    {
        assert(eval - beta >= 0);
        // Null move dynamic reduction based on depth and eval
-        Depth R = std::min(int(eval - beta) / 154, 6) + depth / 3 + 4;
+        Depth R = std::min(int(eval - beta) / 144, 6) + depth / 3 + 4;
        ss->currentMove         = Move::null();
        ss->continuationHistory = &thisThread->continuationHistory[0][0][NO_PIECE][0];
@@ -808,13 +849,11 @@ Value Search::Worker::search(
    }
    // Step 10. Internal iterative reductions (~9 Elo)
-    // For PV nodes without a ttMove, we decrease depth by 2,
+    // For PV nodes without a ttMove, we decrease depth by 3.
    // or by 4 if the current position is present in the TT and
    // the stored depth is greater than or equal to the current depth.
    // Use qsearch if depth <= 0.
    if (PvNode && !ttMove)
-        depth -= 2 + 2 * (ss->ttHit && tte->depth() >= depth);
+        depth -= 3;
    // Use qsearch if depth <= 0.
    if (depth <= 0)
        return qsearch<PV>(pos, ss, alpha, beta);
@@ -825,7 +864,7 @@ Value Search::Worker::search(
    // Step 11. ProbCut (~10 Elo)
    // If we have a good enough capture (or queen promotion) and a reduced search returns a value
    // much above beta, we can (almost) safely prune the previous move.
-    probCutBeta = beta + 181 - 68 * improving;
+    probCutBeta = beta + 170 - 64 * improving;
    if (
      !PvNode && depth > 3
      && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY
@@ -868,20 +907,21 @@ Value Search::Worker::search(
                if (value >= probCutBeta)
                {
                    // Save ProbCut data into transposition table
-                    tte->save(posKey, value_to_tt(value, ss->ply), ss->ttPv, BOUND_LOWER, depth - 3,
+                    Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(value, ss->ply),
-                              move, unadjustedStaticEval, tt.generation());
+                                  ss->ttPv, BOUND_LOWER, depth - 3, move, unadjustedStaticEval,
                                  tt.generation());
                    return std::abs(value) < VALUE_TB_WIN_IN_MAX_PLY ? value - (probCutBeta - beta)
                                                                     : value;
                }
            }
-        Eval::NNUE::hint_common_parent_position(pos);
+        Eval::NNUE::hint_common_parent_position(pos, networks);
    }
 moves_loop:  // When in check, search starts here
    // Step 12. A small Probcut idea, when we are in check (~4 Elo)
-    probCutBeta = beta + 452;
+    probCutBeta = beta + 409;
    if (ss->inCheck && !PvNode && ttCapture && (tte->bound() & BOUND_LOWER)
        && tte->depth() >= depth - 4 && ttValue >= probCutBeta
        && std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY && std::abs(beta) < VALUE_TB_WIN_IN_MAX_PLY)
@@ -926,8 +966,8 @@ moves_loop:  // When in check, search starts here
        ss->moveCount = ++moveCount;
-        if (rootNode && is_mainthread()
+        if (rootNode && Cluster::is_root() && is_mainthread()
-            && main_manager()->tm.elapsed(threads.nodes_searched()) > 3000)
+            && main_manager()->tm.elapsed(Cluster::nodes_searched(threads)) > 3000)
            sync_cout << "info depth " << depth << " currmove "
                      << UCI::move(move, pos.is_chess960()) << " currmovenumber "
                      << moveCount + thisThread->pvIdx << sync_endl;
@@ -964,7 +1004,7 @@ moves_loop:  // When in check, search starts here
                {
                    Piece capturedPiece = pos.piece_on(move.to_sq());
                    int   futilityEval =
-                      ss->staticEval + 277 + 292 * lmrDepth + PieceValue[capturedPiece]
+                      ss->staticEval + 297 + 284 * lmrDepth + PieceValue[capturedPiece]
                      + thisThread->captureHistory[movedPiece][move.to_sq()][type_of(capturedPiece)]
                          / 7;
                    if (futilityEval < alpha)
@@ -972,7 +1012,7 @@ moves_loop:  // When in check, search starts here
                }
                // SEE based pruning for captures and checks (~11 Elo)
-                if (!pos.see_ge(move, -197 * depth))
+                if (!pos.see_ge(move, -203 * depth))
                    continue;
            }
            else
@@ -984,24 +1024,29 @@ moves_loop:  // When in check, search starts here
                  + thisThread->pawnHistory[pawn_structure_index(pos)][movedPiece][move.to_sq()];
                // Continuation history based pruning (~2 Elo)
-                if (lmrDepth < 6 && history < -4211 * depth)
+                if (lmrDepth < 6 && history < -4040 * depth)
                    continue;
                history += 2 * thisThread->mainHistory[us][move.from_to()];
-                lmrDepth += history / 6437;
+                lmrDepth += history / 5637;
                Value futilityValue =
                  ss->staticEval + (bestValue < ss->staticEval - 59 ? 141 : 58) + 125 * lmrDepth;
                // Futility pruning: parent node (~13 Elo)
-                if (!ss->inCheck && lmrDepth < 15
+                if (!ss->inCheck && lmrDepth < 15 && futilityValue <= alpha)
-                    && ss->staticEval + (bestValue < ss->staticEval - 57 ? 144 : 57)
+                {
-                           + 121 * lmrDepth
+                    if (bestValue <= futilityValue && std::abs(bestValue) < VALUE_TB_WIN_IN_MAX_PLY
-                         <= alpha)
+                        && futilityValue < VALUE_TB_WIN_IN_MAX_PLY)
                        bestValue = (bestValue + futilityValue * 3) / 4;
                    continue;
                }
                lmrDepth = std::max(lmrDepth, 0);
                // Prune moves with negative SEE (~4 Elo)
-                if (!pos.see_ge(move, -26 * lmrDepth * lmrDepth))
+                if (!pos.see_ge(move, -27 * lmrDepth * lmrDepth))
                    continue;
            }
        }
@@ -1025,7 +1070,7 @@ moves_loop:  // When in check, search starts here
                && std::abs(ttValue) < VALUE_TB_WIN_IN_MAX_PLY && (tte->bound() & BOUND_LOWER)
                && tte->depth() >= depth - 3)
            {
-                Value singularBeta  = ttValue - (60 + 54 * (ss->ttPv && !PvNode)) * depth / 64;
+                Value singularBeta  = ttValue - (58 + 58 * (ss->ttPv && !PvNode)) * depth / 64;
                Depth singularDepth = newDepth / 2;
                ss->excludedMove = move;
@@ -1040,9 +1085,12 @@ moves_loop:  // When in check, search starts here
                    // We make sure to limit the extensions in some way to avoid a search explosion
                    if (!PvNode && ss->multipleExtensions <= 16)
                    {
-                        extension = 2 + (value < singularBeta - 78 && !ttCapture);
+                        extension = 2 + (value < singularBeta - 22 && !ttCapture);
-                        depth += depth < 16;
+                        depth += depth < 14;
                    }
                    if (PvNode && !ttCapture && ss->multipleExtensions <= 5
                        && value < singularBeta - 37)
                        extension = 2;
                }
                // Multi-cut pruning
@@ -1061,7 +1109,7 @@ moves_loop:  // When in check, search starts here
                // If the ttMove is assumed to fail high over current beta (~7 Elo)
                else if (ttValue >= beta)
-                    extension = -2 - !PvNode;
+                    extension = -3;
                // If we are on a cutNode but the ttMove is not assumed to fail high over current beta (~1 Elo)
                else if (cutNode)
@@ -1072,11 +1120,11 @@ moves_loop:  // When in check, search starts here
                    extension = -1;
            }
-            // Recapture extensions (~1 Elo)
+            // Recapture extensions (~0 Elo on STC, ~1 Elo on LTC)
            else if (PvNode && move == ttMove && move.to_sq() == prevSq
                     && thisThread->captureHistory[movedPiece][move.to_sq()]
                                                  [type_of(pos.piece_on(move.to_sq()))]
-                          > 4394)
+                          > 4026)
                extension = 1;
        }
@@ -1110,14 +1158,10 @@ moves_loop:  // When in check, search starts here
        if (ttCapture)
            r++;
-        // Decrease reduction for PvNodes (~3 Elo)
+        // Decrease reduction for PvNodes (~0 Elo on STC, ~2 Elo on LTC)
        if (PvNode)
            r--;
        // Increase reduction on repetition (~1 Elo)
        if (move == (ss - 4)->currentMove && pos.has_repeated())
            r += 2;
        // Increase reduction if next ply has a lot of fail high (~5 Elo)
        if ((ss + 1)->cutoffCnt > 3)
            r++;
@@ -1130,10 +1174,10 @@ moves_loop:  // When in check, search starts here
        ss->statScore = 2 * thisThread->mainHistory[us][move.from_to()]
                      + (*contHist[0])[movedPiece][move.to_sq()]
                      + (*contHist[1])[movedPiece][move.to_sq()]
-                      + (*contHist[3])[movedPiece][move.to_sq()] - 4392;
+                      + (*contHist[3])[movedPiece][move.to_sq()] - 4723;
        // Decrease/increase reduction for moves with a good/bad history (~8 Elo)
-        r -= ss->statScore / 14189;
+        r -= ss->statScore / 13659;
        // Step 17. Late moves reduction / extension (LMR, ~117 Elo)
        if (depth >= 2 && moveCount > 1 + rootNode)
@@ -1152,7 +1196,7 @@ moves_loop:  // When in check, search starts here
            {
                // Adjust full-depth search based on LMR results - if the result
                // was good enough search deeper, if it was bad enough search shallower.
-                const bool doDeeperSearch    = value > (bestValue + 49 + 2 * newDepth);  // (~1 Elo)
+                const bool doDeeperSearch    = value > (bestValue + 47 + 2 * newDepth);  // (~1 Elo)
                const bool doShallowerSearch = value < bestValue + newDepth;             // (~2 Elo)
                newDepth += doDeeperSearch - doShallowerSearch;
@@ -1172,7 +1216,7 @@ moves_loop:  // When in check, search starts here
        // Step 18. Full-depth search when LMR is skipped
        else if (!PvNode || moveCount > 1)
        {
-            // Increase reduction if ttMove is not present (~1 Elo)
+            // Increase reduction if ttMove is not present (~6 Elo)
            if (!ttMove)
                r += 2;
@@ -1193,9 +1237,6 @@ moves_loop:  // When in check, search starts here
        // Step 19. Undo move
        pos.undo_move(move);
        if (rootNode)
            effort[move.from_sq()][move.to_sq()] += nodes - nodeCount;
        assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
        // Step 20. Check for a new best move
@@ -1210,6 +1251,8 @@ moves_loop:  // When in check, search starts here
            RootMove& rm =
              *std::find(thisThread->rootMoves.begin(), thisThread->rootMoves.end(), move);
            rm.effort += nodes - nodeCount;
            rm.averageScore =
              rm.averageScore != -VALUE_INFINITE ? (2 * value + rm.averageScore) / 3 : value;
@@ -1271,7 +1314,7 @@ moves_loop:  // When in check, search starts here
                else
                {
                    // Reduce other moves if we have found at least one score improvement (~2 Elo)
-                    if (depth > 2 && depth < 13 && beta < 13652 && value > -12761)
+                    if (depth > 2 && depth < 12 && beta < 14206 && value > -12077)
                        depth -= 2;
                    assert(depth > 0);
@@ -1314,8 +1357,9 @@ moves_loop:  // When in check, search starts here
    // Bonus for prior countermove that caused the fail low
    else if (!priorCapture && prevSq != SQ_NONE)
    {
-        int bonus = (depth > 5) + (PvNode || cutNode) + ((ss - 1)->statScore < -15736)
+        int bonus = (depth > 5) + (PvNode || cutNode) + ((ss - 1)->statScore < -14963)
-                  + ((ss - 1)->moveCount > 11);
+                  + ((ss - 1)->moveCount > 11)
                  + (!ss->inCheck && bestValue <= ss->staticEval - 150);
        update_continuation_histories(ss - 1, pos.piece_on(prevSq), prevSq,
                                      stat_bonus(depth) * bonus);
        thisThread->mainHistory[~us][((ss - 1)->currentMove).from_to()]
@@ -1333,11 +1377,12 @@ moves_loop:  // When in check, search starts here
    // Write gathered information in transposition table
    // Static evaluation is saved as it was before correction history
    if (!excludedMove && !(rootNode && thisThread->pvIdx))
-        tte->save(posKey, value_to_tt(bestValue, ss->ply), ss->ttPv,
+        Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply),
-                  bestValue >= beta    ? BOUND_LOWER
+                      ss->ttPv,
-                  : PvNode && bestMove ? BOUND_EXACT
+                      bestValue >= beta    ? BOUND_LOWER
-                                       : BOUND_UPPER,
+                      : PvNode && bestMove ? BOUND_EXACT
-                  depth, bestMove, unadjustedStaticEval, tt.generation());
+                                           : BOUND_UPPER,
                      depth, bestMove, unadjustedStaticEval, tt.generation());
    // Adjust correction history
    if (!ss->inCheck && (!bestMove || !pos.capture(bestMove))
@@ -1408,8 +1453,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
    // Step 2. Check for an immediate draw or maximum ply reached
    if (pos.is_draw(ss->ply) || ss->ply >= MAX_PLY)
-        return (ss->ply >= MAX_PLY && !ss->inCheck) ? evaluate(pos, thisThread->optimism[us])
+        return (ss->ply >= MAX_PLY && !ss->inCheck)
-                                                    : VALUE_DRAW;
+               ? evaluate(networks, pos, thisThread->optimism[us])
               : VALUE_DRAW;
    assert(0 <= ss->ply && ss->ply < MAX_PLY);
@@ -1440,7 +1486,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
            // Never assume anything about values stored in TT
            unadjustedStaticEval = tte->eval();
            if (unadjustedStaticEval == VALUE_NONE)
-                unadjustedStaticEval = evaluate(pos, thisThread->optimism[us]);
+                unadjustedStaticEval = evaluate(networks, pos, thisThread->optimism[us]);
            ss->staticEval = bestValue =
              to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
@@ -1453,7 +1499,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
        {
            // In case of null move search, use previous static eval with a different sign
            unadjustedStaticEval = (ss - 1)->currentMove != Move::null()
-                                   ? evaluate(pos, thisThread->optimism[us])
+                                   ? evaluate(networks, pos, thisThread->optimism[us])
                                   : -(ss - 1)->staticEval;
            ss->staticEval       = bestValue =
              to_corrected_static_eval(unadjustedStaticEval, *thisThread, pos);
@@ -1463,8 +1509,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
        if (bestValue >= beta)
        {
            if (!ss->ttHit)
-                tte->save(posKey, value_to_tt(bestValue, ss->ply), false, BOUND_LOWER, DEPTH_NONE,
+                Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply),
-                          Move::none(), unadjustedStaticEval, tt.generation());
+                              false, BOUND_LOWER, DEPTH_NONE, Move::none(), unadjustedStaticEval,
                              tt.generation());
            return bestValue;
        }
@@ -1472,7 +1519,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
        if (bestValue > alpha)
            alpha = bestValue;
-        futilityBase = ss->staticEval + 206;
+        futilityBase = ss->staticEval + 226;
    }
    const PieceToHistory* contHist[] = {(ss - 1)->continuationHistory,
@@ -1552,7 +1599,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
                continue;
            // Do not search moves with bad enough SEE values (~5 Elo)
-            if (!pos.see_ge(move, -74))
+            if (!pos.see_ge(move, -78))
                continue;
        }
@@ -1609,9 +1656,9 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
    // Save gathered info in transposition table
    // Static evaluation is saved as it was before adjustment by correction history
-    tte->save(posKey, value_to_tt(bestValue, ss->ply), pvHit,
+    Cluster::save(tt, threads, thisThread, tte, posKey, value_to_tt(bestValue, ss->ply), pvHit,
-              bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove,
+                  bestValue >= beta ? BOUND_LOWER : BOUND_UPPER, ttDepth, bestMove,
-              unadjustedStaticEval, tt.generation());
+                  unadjustedStaticEval, tt.generation());
    assert(bestValue > -VALUE_INFINITE && bestValue < VALUE_INFINITE);
@@ -1620,7 +1667,7 @@ Value Search::Worker::qsearch(Position& pos, Stack* ss, Value alpha, Value beta,
 Depth Search::Worker::reduction(bool i, Depth d, int mn, int delta) {
    int reductionScale = reductions[d] * reductions[mn];
-    return (reductionScale + 1118 - delta * 793 / rootDelta) / 1024 + (!i && reductionScale > 863);
+    return (reductionScale + 1107 - delta * 725 / rootDelta) / 1024 + (!i && reductionScale > 956);
 }
 namespace {
@@ -1709,7 +1756,7 @@ void update_all_stats(const Position& pos,
    if (!pos.capture_stage(bestMove))
    {
-        int bestMoveBonus = bestValue > beta + 166 ? quietMoveBonus      // larger bonus
+        int bestMoveBonus = bestValue > beta + 168 ? quietMoveBonus      // larger bonus
                                                   : stat_bonus(depth);  // smaller bonus
        // Increase stats for the best move in case it was a quiet move
@@ -1837,7 +1884,7 @@ void SearchManager::check_time(Search::Worker& worker) {
    static TimePoint lastInfoTime = now();
-    TimePoint elapsed = tm.elapsed(worker.threads.nodes_searched());
+    TimePoint elapsed = tm.elapsed(Cluster::nodes_searched(worker.threads));
    TimePoint tick    = worker.limits.startTime + elapsed;
    if (tick - lastInfoTime >= 1000)
@@ -1846,6 +1893,9 @@ void SearchManager::check_time(Search::Worker& worker) {
        dbg_print();
    }
    // poll on MPI signals
    Cluster::signals_poll(worker.threads);
    // We should not stop pondering until told so by the GUI
    if (ponder)
        return;
@@ -1856,7 +1906,8 @@ void SearchManager::check_time(Search::Worker& worker) {
      worker.completedDepth >= 1
      && ((worker.limits.use_time_management() && (elapsed > tm.maximum() || stopOnPonderhit))
          || (worker.limits.movetime && elapsed >= worker.limits.movetime)
-          || (worker.limits.nodes && worker.threads.nodes_searched() >= worker.limits.nodes)))
+          || (worker.limits.nodes
              && Cluster::nodes_searched(worker.threads) >= worker.limits.nodes)))
        worker.threads.stop = worker.threads.abortedSearch = true;
 }
@@ -1866,13 +1917,13 @@ std::string SearchManager::pv(const Search::Worker&     worker,
                              Depth                     depth) const {
    std::stringstream ss;
-    const auto  nodes     = threads.nodes_searched();
+    const auto  nodes     = Cluster::nodes_searched(threads);
    const auto& rootMoves = worker.rootMoves;
    const auto& pos       = worker.rootPos;
    size_t      pvIdx     = worker.pvIdx;
    TimePoint   time      = tm.elapsed(nodes) + 1;
    size_t      multiPV   = std::min(size_t(worker.options["MultiPV"]), rootMoves.size());
-    uint64_t    tbHits    = threads.tb_hits() + (worker.tbConfig.rootInTB ? rootMoves.size() : 0);
+    uint64_t tbHits = Cluster::tb_hits(threads) + (worker.tbConfig.rootInTB ? rootMoves.size() : 0);
    for (size_t i = 0; i < multiPV; ++i)
    {
@@ -1895,10 +1946,10 @@ std::string SearchManager::pv(const Search::Worker&     worker,
        ss << "info"
           << " depth " << d << " seldepth " << rootMoves[i].selDepth << " multipv " << i + 1
-           << " score " << UCI::value(v);
+           << " score " << UCI::to_score(v, pos);
        if (worker.options["UCI_ShowWDL"])
-            ss << UCI::wdl(v, pos.game_ply());
+            ss << UCI::wdl(v, pos);
        if (i == pvIdx && !tb && updated)  // tablebase- and previous-scores are exact
            ss << (rootMoves[i].scoreLowerbound
@@ -25,9 +25,11 @@
 #include <cstddef>
 #include <cstdint>
 #include <memory>
 #include <vector>
 #include <string>
 #include <vector>
 #include <mutex>
 #include "cluster.h"
 #include "misc.h"
 #include "movepick.h"
 #include "position.h"
@@ -37,6 +39,10 @@
 namespace Stockfish {
 // Forward declaration only. The uses visible in this header (SharedState
 // and Worker) hold the networks as `const Eval::NNUE::Networks&`.
 namespace Eval::NNUE {
 struct Networks;
 }
 // Different node types, used as a template parameter
 enum NodeType {
    NonPV,
@@ -85,6 +91,7 @@ struct RootMove {
        return m.score != score ? m.score < score : m.previousScore < previousScore;
    }
    uint64_t          effort          = 0;
    Value             score           = -VALUE_INFINITE;
    Value             previousScore   = -VALUE_INFINITE;
    Value             averageScore    = -VALUE_INFINITE;
@@ -109,30 +116,36 @@ struct LimitsType {
        time[WHITE] = time[BLACK] = inc[WHITE] = inc[BLACK] = npmsec = movetime = TimePoint(0);
        movestogo = depth = mate = perft = infinite = 0;
        nodes                                       = 0;
        ponderMode                                  = false;
    }
-    bool use_time_management() const { return time[WHITE] || time[BLACK]; }
+    bool use_time_management() const { return Cluster::is_root() && (time[WHITE] || time[BLACK]); }
    std::vector<Move> searchmoves;
    TimePoint         time[COLOR_NB], inc[COLOR_NB], npmsec, movetime, startTime;
    int               movestogo, depth, mate, perft, infinite;
    uint64_t          nodes;
    bool              ponderMode;
 };
 // The UCI stores the uci options, thread pool, and transposition table.
 // This struct is used to easily forward data to the Search::Worker class.
 struct SharedState {
-    SharedState(const OptionsMap&   optionsMap,
+    SharedState(const OptionsMap&           optionsMap,
-                ThreadPool&         threadPool,
+                ThreadPool&                 threadPool,
-                TranspositionTable& transpositionTable) :
+                TranspositionTable&         transpositionTable,
                const Eval::NNUE::Networks& nets) :
        options(optionsMap),
        threads(threadPool),
-        tt(transpositionTable) {}
+        tt(transpositionTable),
        networks(nets) {}
-    const OptionsMap&   options;
+
-    ThreadPool&         threads;
+    const OptionsMap&           options;
-    TranspositionTable& tt;
+    ThreadPool&                 threads;
    TranspositionTable&         tt;
    const Eval::NNUE::Networks& networks;
 };
 class Worker;
@@ -174,6 +187,7 @@ class NullSearchManager: public ISearchManager {
    void check_time(Search::Worker&) override {}
 };
 // Search::Worker is the class that does the actual search.
 // It is instantiated once per thread, and it is responsible for keeping track
 // of the search history, and storing data required for the search.
@@ -199,6 +213,28 @@ class Worker {
    PawnHistory           pawnHistory;
    CorrectionHistory     correctionHistory;
 #ifdef USE_MPI
    // Per-worker Cluster::TTCache of Cluster::TTCacheSize, bundled with a
    // mutex. Only compiled in when USE_MPI is defined.
    // NOTE(review): the mutex presumably guards `buffer` against concurrent
    // access from the cluster code -- confirm against Cluster::save.
    struct {
        std::mutex                             mutex;
        Cluster::TTCache<Cluster::TTCacheSize> buffer = {};
    } ttCache;
 #endif
    // Per-worker counter. It is reset to 0 in ThreadPool::start_thinking and
    // aggregated across the pool by ThreadPool::TT_saves().
    // NOTE(review): presumably counts TT stores made by this worker -- confirm
    // where it is incremented.
    std::atomic<uint64_t> TTsaves;
    // Grants Cluster::save access to this class. The search code calls it in
    // place of TTEntry::save, passing the worker as the third argument; the
    // remaining parameters mirror TTEntry::save.
    friend void Cluster::save(TranspositionTable&,
                              ThreadPool&,
                              Search::Worker*,
                              TTEntry* tte,
                              Key      k,
                              Value    v,
                              bool     PvHit,
                              Bound    b,
                              Depth    d,
                              Move     m,
                              Value    ev,
                              uint8_t  generation8);
   private:
    void iterative_deepening();
@@ -219,8 +255,6 @@ class Worker {
        return static_cast<SearchManager*>(manager.get());
    }
    std::array<std::array<uint64_t, SQUARE_NB>, SQUARE_NB> effort;
    LimitsType limits;
    size_t                pvIdx, pvLast;
@@ -245,9 +279,10 @@ class Worker {
    Tablebases::Config tbConfig;
-    const OptionsMap&   options;
+    const OptionsMap&           options;
-    ThreadPool&         threads;
+    ThreadPool&                 threads;
-    TranspositionTable& tt;
+    TranspositionTable&         tt;
    const Eval::NNUE::Networks& networks;
    friend class Stockfish::ThreadPool;
    friend class SearchManager;
@@ -37,6 +37,7 @@
 #include <vector>
 #include "../bitboard.h"
 #include "../cluster.h"
 #include "../misc.h"
 #include "../movegen.h"
 #include "../position.h"
@@ -1466,7 +1467,8 @@ void Tablebases::init(const std::string& paths) {
        }
    }
-    sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
+    if (Cluster::is_root())
        sync_cout << "info string Found " << TBTables.size() << " tablebases" << sync_endl;
 }
 // Probe the WDL table for a particular position.
@@ -24,8 +24,8 @@
 #include <memory>
 #include <unordered_map>
 #include <utility>
 #include <array>
 #include "cluster.h"
 #include "misc.h"
 #include "movegen.h"
 #include "search.h"
@@ -62,6 +62,7 @@ Thread::~Thread() {
    stdThread.join();
 }
 // Wakes up the thread that will start the search
 void Thread::start_searching() {
    mutex.lock();
@@ -91,7 +92,7 @@ void Thread::idle_loop() {
    // just check if running threads are below a threshold, in this case, all this
    // NUMA machinery is not needed.
    if (nthreads > 8)
-        WinProcGroup::bindThisThread(idx);
+        WinProcGroup::bind_this_thread(idx);
    while (true)
    {
@@ -109,6 +110,14 @@ void Thread::idle_loop() {
    }
 }
 // Returns the manager of the main thread's worker as a SearchManager*.
 // The static_cast is unchecked, so this relies on the main thread's manager
 // actually being a SearchManager.
 Search::SearchManager* ThreadPool::main_manager() {
    return static_cast<Search::SearchManager*>(main_thread()->worker.get()->manager.get());
 }
 // Pool-wide totals of the per-worker counters `nodes`, `tbHits` and
 // `TTsaves`, each obtained through ThreadPool::accumulate().
 // NOTE(review): accumulate() is not shown here; presumably it sums the given
 // Worker member over all threads in this pool (local process only) -- confirm.
 uint64_t ThreadPool::nodes_searched() const { return accumulate(&Search::Worker::nodes); }
 uint64_t ThreadPool::tb_hits() const { return accumulate(&Search::Worker::tbHits); }
 uint64_t ThreadPool::TT_saves() const { return accumulate(&Search::Worker::TTsaves); }
 // Creates/destroys threads to match the requested number.
 // Created and launched threads will immediately go to sleep in idle_loop.
 // Upon resizing, threads are recreated to allow for binding if necessary.
@@ -140,6 +149,9 @@ void ThreadPool::set(Search::SharedState sharedState) {
        // Reallocate the hash with the new threadpool size
        sharedState.tt.resize(sharedState.options["Hash"], requested);
        // Adjust cluster buffers
        Cluster::ttSendRecvBuff_resize(requested);
    }
 }
@@ -163,13 +175,12 @@ void ThreadPool::clear() {
 void ThreadPool::start_thinking(const OptionsMap&  options,
                                Position&          pos,
                                StateListPtr&      states,
-                                Search::LimitsType limits,
+                                Search::LimitsType limits) {
                                bool               ponderMode) {
    main_thread()->wait_for_search_finished();
    main_manager()->stopOnPonderhit = stop = abortedSearch = false;
-    main_manager()->ponder                                 = ponderMode;
+    main_manager()->ponder                                 = limits.ponderMode;
    increaseDepth = true;
@@ -199,14 +210,16 @@ void ThreadPool::start_thinking(const OptionsMap&  options,
        th->worker->limits = limits;
        th->worker->nodes = th->worker->tbHits = th->worker->nmpMinPly =
          th->worker->bestMoveChanges          = 0;
        th->worker->TTsaves                    = 0;
        th->worker->rootDepth = th->worker->completedDepth = 0;
        th->worker->rootMoves                              = rootMoves;
        th->worker->rootPos.set(pos.fen(), pos.is_chess960(), &th->worker->rootState);
        th->worker->rootState = setupStates->back();
        th->worker->tbConfig  = tbConfig;
        th->worker->effort    = {};
    }
    Cluster::signals_init();
    main_thread()->start_searching();
 }
@@ -27,12 +27,14 @@
 #include <mutex>
 #include <vector>
 #include "movepick.h"
 #include "position.h"
 #include "search.h"
 #include "thread_win32_osx.h"
 namespace Stockfish {
 class OptionsMap;
 using Value = int;
@@ -79,20 +81,18 @@ class ThreadPool {
        }
    }
-    void
+    void start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType);
    start_thinking(const OptionsMap&, Position&, StateListPtr&, Search::LimitsType, bool = false);
    void clear();
    void set(Search::SharedState);
-    Search::SearchManager* main_manager() const {
+    Search::SearchManager* main_manager();
-        return static_cast<Search::SearchManager*>(main_thread()->worker.get()->manager.get());
+    Thread*                main_thread() const { return threads.front(); }
-    };
+    uint64_t               nodes_searched() const;
-    Thread*  main_thread() const { return threads.front(); }
+    uint64_t               tb_hits() const;
-    uint64_t nodes_searched() const { return accumulate(&Search::Worker::nodes); }
+    uint64_t               TT_saves() const;
-    uint64_t tb_hits() const { return accumulate(&Search::Worker::tbHits); }
+    Thread*                get_best_thread() const;
-    Thread*  get_best_thread() const;
+    void                   start_searching();
-    void     start_searching();
+    void                   wait_for_search_finished() const;
    void     wait_for_search_finished() const;
    std::atomic_bool stop, abortedSearch, increaseDepth;
@@ -84,27 +84,33 @@ void TimeManagement::init(Search::LimitsType& limits,
    // Maximum move horizon of 50 moves
    int mtg = limits.movestogo ? std::min(limits.movestogo, 50) : 50;
    // if less than one second, gradually reduce mtg
    if (limits.time[us] < 1000 && (double(mtg) / limits.time[us] > 0.05))
    {
        mtg = limits.time[us] * 0.05;
    }
    // Make sure timeLeft is > 0 since we may use it as a divisor
    TimePoint timeLeft = std::max(TimePoint(1), limits.time[us] + limits.inc[us] * (mtg - 1)
                                                  - moveOverhead * (2 + mtg));
    // x basetime (+ z increment)
-    // If there is a healthy increment, timeLeft can exceed actual available
+    // If there is a healthy increment, timeLeft can exceed the actual available
-    // game time for the current move, so also cap to 20% of available game time.
+    // game time for the current move, so also cap to a percentage of available game time.
    if (limits.movestogo == 0)
    {
        // Use extra time with larger increments
-        double optExtra = std::clamp(1.0 + 12.5 * limits.inc[us] / limits.time[us], 1.0, 1.11);
+        double optExtra = limits.inc[us] < 500 ? 1.0 : 1.13;
        // Calculate time constants based on current time left.
        double optConstant =
-          std::min(0.00334 + 0.0003 * std::log10(limits.time[us] / 1000.0), 0.0049);
+          std::min(0.00308 + 0.000319 * std::log10(limits.time[us] / 1000.0), 0.00506);
-        double maxConstant = std::max(3.4 + 3.0 * std::log10(limits.time[us] / 1000.0), 2.76);
+        double maxConstant = std::max(3.39 + 3.01 * std::log10(limits.time[us] / 1000.0), 2.93);
-        optScale = std::min(0.0120 + std::pow(ply + 3.1, 0.44) * optConstant,
+        optScale = std::min(0.0122 + std::pow(ply + 2.95, 0.462) * optConstant,
-                            0.21 * limits.time[us] / double(timeLeft))
+                            0.213 * limits.time[us] / double(timeLeft))
                 * optExtra;
-        maxScale = std::min(6.9, maxConstant + ply / 12.2);
+        maxScale = std::min(6.64, maxConstant + ply / 12.0);
    }
    // x moves in y seconds (+ z increment)
@@ -117,7 +123,7 @@ void TimeManagement::init(Search::LimitsType& limits,
    // Limit the maximum possible time for this move
    optimumTime = TimePoint(optScale * timeLeft);
    maximumTime =
-      TimePoint(std::min(0.84 * limits.time[us] - moveOverhead, maxScale * optimumTime)) - 10;
+      TimePoint(std::min(0.825 * limits.time[us] - moveOverhead, maxScale * optimumTime)) - 10;
    if (options["Ponder"])
        optimumTime += optimumTime / 4;
@@ -22,8 +22,8 @@
 #include <cstddef>
 #include <cstdint>
 #include "cluster.h"
 #include "misc.h"
 #include "types.h"
 namespace Stockfish {
@@ -19,6 +19,7 @@
 #include "tt.h"
 #include <cassert>
 #include <cstdint>
 #include <cstdlib>
 #include <cstring>
 #include <iostream>
@@ -53,6 +54,18 @@ void TTEntry::save(
 }
 // Returns the age of this entry relative to the table generation passed in
 // as generation8. The difference is masked with GENERATION_MASK, so the bits
 // of genBound8 below the generation field do not contribute and the result
 // is a multiple of TranspositionTable::GENERATION_DELTA.
 uint8_t TTEntry::relative_age(const uint8_t generation8) const {
    // Due to our packed storage format for generation and its cyclic
    // nature we add GENERATION_CYCLE (256 is the modulus, plus what
    // is needed to keep the unrelated lowest n bits from affecting
    // the result) to calculate the entry age correctly even after
    // generation8 overflows into the next cycle.
    return (TranspositionTable::GENERATION_CYCLE + generation8 - genBound8)
         & TranspositionTable::GENERATION_MASK;
 }
 // Sets the size of the transposition table,
 // measured in megabytes. Transposition table consists of a power of 2 number
 // of clusters and each cluster consists of ClusterSize number of TTEntry.
@@ -82,7 +95,7 @@ void TranspositionTable::clear(size_t threadCount) {
        threads.emplace_back([this, idx, threadCount]() {
            // Thread binding gives faster search on systems with a first-touch policy
            if (threadCount > 8)
-                WinProcGroup::bindThisThread(idx);
+                WinProcGroup::bind_this_thread(idx);
            // Each thread will zero its part of the hash table
            const size_t stride = size_t(clusterCount / threadCount), start = size_t(stride * idx),
@@ -111,24 +124,18 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
    for (int i = 0; i < ClusterSize; ++i)
        if (tte[i].key16 == key16 || !tte[i].depth8)
        {
-            tte[i].genBound8 =
+            constexpr uint8_t lowerBits = GENERATION_DELTA - 1;
              uint8_t(generation8 | (tte[i].genBound8 & (GENERATION_DELTA - 1)));  // Refresh
-            return found = bool(tte[i].depth8), &tte[i];
+            // Refresh with new generation, keeping the lower bits the same.
            tte[i].genBound8 = uint8_t(generation8 | (tte[i].genBound8 & lowerBits));
            return found     = bool(tte[i].depth8), &tte[i];
        }
    // Find an entry to be replaced according to the replacement strategy
    TTEntry* replace = tte;
    for (int i = 1; i < ClusterSize; ++i)
-        // Due to our packed storage format for generation and its cyclic
+        if (replace->depth8 - replace->relative_age(generation8) * 2
-        // nature we add GENERATION_CYCLE (256 is the modulus, plus what
+            > tte[i].depth8 - tte[i].relative_age(generation8) * 2)
        // is needed to keep the unrelated lowest n bits from affecting
        // the result) to calculate the entry age correctly even after
        // generation8 overflows into the next cycle.
        if (replace->depth8
              - ((GENERATION_CYCLE + generation8 - replace->genBound8) & GENERATION_MASK)
            > tte[i].depth8
                - ((GENERATION_CYCLE + generation8 - tte[i].genBound8) & GENERATION_MASK))
            replace = &tte[i];
    return found = false, replace;
@@ -137,7 +144,7 @@ TTEntry* TranspositionTable::probe(const Key key, bool& found) const {
 // Returns an approximation of the hashtable
 // occupation during a search. The hash is x permill full, as per UCI protocol.
-
+// Only counts entries which match the current generation.
 int TranspositionTable::hashfull() const {
    int cnt = 0;
@@ -27,16 +27,21 @@
 namespace Stockfish {
-// TTEntry struct is the 10 bytes transposition table entry, defined as below:
+namespace Cluster {
-//
+void init();
-// key        16 bit
+}
-// depth       8 bit
+
-// generation  5 bit
+/// TTEntry struct is the 10 bytes transposition table entry, defined as below:
-// pv node     1 bit
+///
-// bound type  2 bit
+/// key        16 bit
-// move       16 bit
+/// depth       8 bit
-// value      16 bit
+/// generation  5 bit
-// eval value 16 bit
+/// pv node     1 bit
 /// bound type  2 bit
 /// move       16 bit
 /// value      16 bit
 /// eval value 16 bit
 struct TTEntry {
    Move  move() const { return Move(move16); }
@@ -46,9 +51,13 @@ struct TTEntry {
    bool  is_pv() const { return bool(genBound8 & 0x4); }
    Bound bound() const { return Bound(genBound8 & 0x3); }
    void  save(Key k, Value v, bool pv, Bound b, Depth d, Move m, Value ev, uint8_t generation8);
    // The returned age is a multiple of TranspositionTable::GENERATION_DELTA
    uint8_t relative_age(const uint8_t generation8) const;
   private:
    friend class TranspositionTable;
    friend void Cluster::init();
    uint16_t key16;
    uint8_t  depth8;
@@ -66,6 +75,8 @@ struct TTEntry {
 // prefetched when possible.
 class TranspositionTable {
    friend void Cluster::init();
    static constexpr int ClusterSize = 3;
    struct Cluster {
@@ -76,16 +87,25 @@ class TranspositionTable {
    static_assert(sizeof(Cluster) == 32, "Unexpected Cluster size");
    // Constants used to refresh the hash table periodically
-    static constexpr unsigned GENERATION_BITS = 3;  // nb of bits reserved for other things
+
-    static constexpr int      GENERATION_DELTA =
+    // We have 8 bits available where the lowest 3 bits are
-      (1 << GENERATION_BITS);  // increment for generation field
+    // reserved for other things.
-    static constexpr int GENERATION_CYCLE = 255 + (1 << GENERATION_BITS);  // cycle length
+    static constexpr unsigned GENERATION_BITS = 3;
-    static constexpr int GENERATION_MASK =
+    // increment for generation field
-      (0xFF << GENERATION_BITS) & 0xFF;  // mask to pull out generation number
+    static constexpr int GENERATION_DELTA = (1 << GENERATION_BITS);
    // cycle length
    static constexpr int GENERATION_CYCLE = 255 + GENERATION_DELTA;
    // mask to pull out generation number
    static constexpr int GENERATION_MASK = (0xFF << GENERATION_BITS) & 0xFF;
   public:
    ~TranspositionTable() { aligned_large_pages_free(table); }
-    void new_search() { generation8 += GENERATION_DELTA; }  // Lower bits are used for other things
+
    // Advances the table's current generation by one step. The step is
    // GENERATION_DELTA (1 << GENERATION_BITS), so the low GENERATION_BITS
    // bits of generation8 are never modified by this call.
    void new_search() {
        // increment by delta to keep lower bits as is
        generation8 += GENERATION_DELTA;
    }
    TTEntry* probe(const Key key, bool& found) const;
    int      hashfull() const;
    void     resize(size_t mbSize, int threadCount);
@@ -30,10 +30,39 @@ using std::string;
 namespace Stockfish {
-bool                              Tune::update_on_last;
+bool          Tune::update_on_last;
-const Option*                     LastOption = nullptr;
+const Option* LastOption = nullptr;
-OptionsMap*                       Tune::options;
+OptionsMap*   Tune::options;
-static std::map<std::string, int> TuneResults;
+
 namespace {
 std::map<std::string, int> TuneResults;
 // Handler attached to every option created by make_option(). Calls
 // Tune::read_options() either on every invocation or, when
 // Tune::update_on_last is set, only when invoked for LastOption (the option
 // most recently registered by make_option()).
 void on_tune(const Option& o) {
    if (!Tune::update_on_last || LastOption == &o)
        Tune::read_options();
 }
 // Registers option `n` in *options with value v, bounds r(v).first and
 // r(v).second, and on_tune as its handler, then records it as LastOption
 // and prints one comma-separated line describing it to std::cout.
 // Does nothing when the range returned by r(v) is empty (first == second).
 void make_option(OptionsMap* options, const string& n, int v, const SetRange& r) {
    // Do not generate option when there is nothing to tune (i.e. min = max)
    if (r(v).first == r(v).second)
        return;
    // A value stored in TuneResults under this name replaces the supplied v;
    // the r(v) calls below are evaluated with the replaced value.
    if (TuneResults.count(n))
        v = TuneResults[n];
    (*options)[n] << Option(v, r(v).first, r(v).second, on_tune);
    LastOption = &((*options)[n]);
    // Print formatted parameters, ready to be copy-pasted in Fishtest
    // Fields: name, value, min, max, (max - min) / 20.0, and the literal "0.0020".
    std::cout << n << "," << v << "," << r(v).first << "," << r(v).second << ","
              << (r(v).second - r(v).first) / 20.0 << ","
              << "0.0020" << std::endl;
 }
 }
 string Tune::next(string& names, bool pop) {
@@ -54,29 +83,6 @@ string Tune::next(string& names, bool pop) {
    return name;
 }
 // Option handler with internal linkage. Calls Tune::read_options() either
 // on every invocation or, when Tune::update_on_last is set, only when
 // invoked for LastOption.
 static void on_tune(const Option& o) {
    if (!Tune::update_on_last || LastOption == &o)
        Tune::read_options();
 }
 // Registers option `n` in *options with value v, bounds r(v).first and
 // r(v).second, and on_tune as its handler, then records it as LastOption
 // and prints one comma-separated line describing it to std::cout.
 // Does nothing when the range returned by r(v) is empty (first == second).
 static void make_option(OptionsMap* options, const string& n, int v, const SetRange& r) {
    // Do not generate option when there is nothing to tune (i.e. min = max)
    if (r(v).first == r(v).second)
        return;
    // A value stored in TuneResults under this name replaces the supplied v.
    if (TuneResults.count(n))
        v = TuneResults[n];
    (*options)[n] << Option(v, r(v).first, r(v).second, on_tune);
    LastOption = &((*options)[n]);
    // Print formatted parameters, ready to be copy-pasted in Fishtest
    // Fields: name, value, min, max, (max - min) / 20.0, and the literal "0.0020".
    std::cout << n << "," << v << "," << r(v).first << "," << r(v).second << ","
              << (r(v).second - r(v).first) / 20.0 << ","
              << "0.0020" << std::endl;
 }
 template<>
 void Tune::Entry<int>::init_option() {
@@ -22,43 +22,47 @@
 #include <cassert>
 #include <cctype>
 #include <cmath>
 #include <cstdint>
 #include <cstdlib>
 #include <deque>
 #include <memory>
 #include <optional>
 #include <sstream>
 #include <utility>
 #include <vector>
 #include <cstdint>
 #include "benchmark.h"
 #include "cluster.h"
 #include "evaluate.h"
 #include "movegen.h"
-#include "nnue/evaluate_nnue.h"
+#include "nnue/network.h"
-#include "nnue/nnue_architecture.h"
+#include "nnue/nnue_common.h"
 #include "perft.h"
 #include "position.h"
 #include "search.h"
 #include "syzygy/tbprobe.h"
 #include "types.h"
 #include "ucioption.h"
 #include "perft.h"
 namespace Stockfish {
-constexpr auto StartFEN             = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
+constexpr auto StartFEN  = "rnbqkbnr/pppppppp/8/8/8/8/PPPPPPPP/RNBQKBNR w KQkq - 0 1";
-constexpr int  NormalizeToPawnValue = 356;
+constexpr int  MaxHashMB = Is64Bit ? 33554432 : 2048;
-constexpr int  MaxHashMB            = Is64Bit ? 33554432 : 2048;
+
 namespace NN = Eval::NNUE;
 UCI::UCI(int argc, char** argv) :
    networks(NN::Networks(
      NN::NetworkBig({EvalFileDefaultNameBig, "None", ""}, NN::EmbeddedNNUEType::BIG),
      NN::NetworkSmall({EvalFileDefaultNameSmall, "None", ""}, NN::EmbeddedNNUEType::SMALL))),
    cli(argc, argv) {
    evalFiles = {{Eval::NNUE::Big, {"EvalFile", EvalFileDefaultNameBig, "None", ""}},
                 {Eval::NNUE::Small, {"EvalFileSmall", EvalFileDefaultNameSmall, "None", ""}}};
    options["Debug Log File"] << Option("", [](const Option& o) { start_logger(o); });
    options["Threads"] << Option(1, 1, 1024, [this](const Option&) {
-        threads.set({options, threads, tt});
+        threads.set({options, threads, tt, networks});
    });
    options["Hash"] << Option(16, 1, MaxHashMB, [this](const Option& o) {
@@ -80,14 +84,17 @@ UCI::UCI(int argc, char** argv) :
    options["SyzygyProbeDepth"] << Option(1, 1, 100);
    options["Syzygy50MoveRule"] << Option(true);
    options["SyzygyProbeLimit"] << Option(7, 0, 7);
-    options["EvalFile"] << Option(EvalFileDefaultNameBig, [this](const Option&) {
+    options["EvalFile"] << Option(EvalFileDefaultNameBig, [this](const Option& o) {
-        evalFiles = Eval::NNUE::load_networks(cli.binaryDirectory, options, evalFiles);
+        networks.big.load(cli.binaryDirectory, o);
    });
-    options["EvalFileSmall"] << Option(EvalFileDefaultNameSmall, [this](const Option&) {
+    options["EvalFileSmall"] << Option(EvalFileDefaultNameSmall, [this](const Option& o) {
-        evalFiles = Eval::NNUE::load_networks(cli.binaryDirectory, options, evalFiles);
+        networks.small.load(cli.binaryDirectory, o);
    });
-    threads.set({options, threads, tt});
+    networks.big.load(cli.binaryDirectory, options["EvalFile"]);
    networks.small.load(cli.binaryDirectory, options["EvalFileSmall"]);
    threads.set({options, threads, tt, networks});
    search_clear();  // After threads are up
 }
@@ -106,7 +113,8 @@ void UCI::loop() {
    do
    {
        if (cli.argc == 1
-            && !getline(std::cin, cmd))  // Wait for an input or an end-of-file (EOF) indication
+            && !Cluster::getline(std::cin,
                                 cmd))  // Wait for an input or an end-of-file (EOF) indication
            cmd = "quit";
        std::istringstream is(cmd);
@@ -124,7 +132,7 @@ void UCI::loop() {
        else if (token == "ponderhit")
            threads.main_manager()->ponder = false;  // Switch to the normal search
-        else if (token == "uci")
+        else if (token == "uci" && Cluster::is_root())
            sync_cout << "id name " << engine_info(true) << "\n"
                      << options << "\nuciok" << sync_endl;
@@ -136,7 +144,7 @@ void UCI::loop() {
            position(pos, is, states);
        else if (token == "ucinewgame")
            search_clear();
-        else if (token == "isready")
+        else if (token == "isready" && Cluster::is_root())
            sync_cout << "readyok" << sync_endl;
        // Add custom non-UCI commands, mainly for debugging purposes.
@@ -145,21 +153,28 @@ void UCI::loop() {
            pos.flip();
        else if (token == "bench")
            bench(pos, is, states);
-        else if (token == "d")
+        else if (token == "d" && Cluster::is_root())
            sync_cout << pos << sync_endl;
-        else if (token == "eval")
+        else if (token == "eval" && Cluster::is_root())
            trace_eval(pos);
-        else if (token == "compiler")
+        else if (token == "compiler" && Cluster::is_root())
            sync_cout << compiler_info() << sync_endl;
-        else if (token == "export_net")
+        else if (token == "export_net" && Cluster::is_root())
        {
-            std::optional<std::string> filename;
+            std::pair<std::optional<std::string>, std::string> files[2];
-            std::string                f;
+
-            if (is >> std::skipws >> f)
+            if (is >> std::skipws >> files[0].second)
-                filename = f;
+                files[0].first = files[0].second;
-            Eval::NNUE::save_eval(filename, Eval::NNUE::Big, evalFiles);
+
            if (is >> std::skipws >> files[1].second)
                files[1].first = files[1].second;
            networks.big.save(files[0].first);
            networks.small.save(files[1].first);
        }
-        else if (token == "--help" || token == "help" || token == "--license" || token == "license")
+        else if ((token == "--help" || token == "help" || token == "--license"
                  || token == "license")
                 && Cluster::is_root())
            sync_cout
              << "\nStockfish is a powerful chess engine for playing and analyzing."
                 "\nIt is released as free software licensed under the GNU GPLv3 License."
@@ -168,18 +183,16 @@ void UCI::loop() {
                 "\nFor any further information, visit https://github.com/official-stockfish/Stockfish#readme"
                 "\nor read the corresponding README.md and Copying.txt files distributed along with this program.\n"
              << sync_endl;
-        else if (!token.empty() && token[0] != '#')
+        else if (!token.empty() && token[0] != '#' && Cluster::is_root())
            sync_cout << "Unknown command: '" << cmd << "'. Type help for more information."
                      << sync_endl;
    } while (token != "quit" && cli.argc == 1);  // The command-line arguments are one-shot
 }
-void UCI::go(Position& pos, std::istringstream& is, StateListPtr& states) {
+Search::LimitsType UCI::parse_limits(const Position& pos, std::istream& is) {
    Search::LimitsType limits;
    std::string        token;
    bool               ponderMode = false;
    limits.startTime = now();  // The search starts as early as possible
@@ -211,9 +224,17 @@ void UCI::go(Position& pos, std::istringstream& is, StateListPtr& states) {
        else if (token == "infinite")
            limits.infinite = 1;
        else if (token == "ponder")
-            ponderMode = true;
+            limits.ponderMode = true;
-    Eval::NNUE::verify(options, evalFiles);
+    return limits;
 }
 void UCI::go(Position& pos, std::istringstream& is, StateListPtr& states) {
    Search::LimitsType limits = parse_limits(pos, is);
    networks.big.verify(options["EvalFile"]);
    networks.small.verify(options["EvalFileSmall"]);
    if (limits.perft)
    {
@@ -221,7 +242,7 @@ void UCI::go(Position& pos, std::istringstream& is, StateListPtr& states) {
        return;
    }
-    threads.start_thinking(options, pos, states, limits, ponderMode);
+    threads.start_thinking(options, pos, states, limits);
 }
 void UCI::bench(Position& pos, std::istream& args, StateListPtr& states) {
@@ -242,15 +263,16 @@ void UCI::bench(Position& pos, std::istream& args, StateListPtr& states) {
        if (token == "go" || token == "eval")
        {
-            std::cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")"
+            if (Cluster::is_root())
-                      << std::endl;
+                std::cerr << "\nPosition: " << cnt++ << '/' << num << " (" << pos.fen() << ")"
                          << std::endl;
            if (token == "go")
            {
                go(pos, is, states);
                threads.main_thread()->wait_for_search_finished();
-                nodes += threads.nodes_searched();
+                nodes += Cluster::nodes_searched(threads);
            }
-            else
+            else if (Cluster::is_root())
                trace_eval(pos);
        }
        else if (token == "setoption")
@@ -268,9 +290,10 @@ void UCI::bench(Position& pos, std::istream& args, StateListPtr& states) {
    dbg_print();
-    std::cerr << "\n==========================="
+    if (Cluster::is_root())
-              << "\nTotal time (ms) : " << elapsed << "\nNodes searched  : " << nodes
+        std::cerr << "\n==========================="
-              << "\nNodes/second    : " << 1000 * nodes / elapsed << std::endl;
+                  << "\nTotal time (ms) : " << elapsed << "\nNodes searched  : " << nodes
                  << "\nNodes/second    : " << 1000 * nodes / elapsed << std::endl;
 }
 void UCI::trace_eval(Position& pos) {
@@ -278,9 +301,11 @@ void UCI::trace_eval(Position& pos) {
    Position     p;
    p.set(pos.fen(), options["UCI_Chess960"], &states->back());
-    Eval::NNUE::verify(options, evalFiles);
+    networks.big.verify(options["EvalFile"]);
    networks.small.verify(options["EvalFileSmall"]);
-    sync_cout << "\n" << Eval::trace(p) << sync_endl;
+
    sync_cout << "\n" << Eval::trace(p, networks) << sync_endl;
 }
 void UCI::search_clear() {
@@ -324,15 +349,49 @@ void UCI::position(Position& pos, std::istringstream& is, StateListPtr& states)
    }
 }
-int UCI::to_cp(Value v) { return 100 * v / NormalizeToPawnValue; }
+namespace {
-std::string UCI::value(Value v) {
+struct WinRateParams {
    double a;  // p_a(material): the eval at which the modelled win rate is exactly 50%
    double b;  // p_b(material): divisor of (a - eval) inside the logistic's exponent
 };
 WinRateParams win_rate_params(const Position& pos) {
    // Material total with the usual weights: pawn 1, knight 3, bishop 3, rook 5, queen 9.
    const int pieceSum = pos.count<PAWN>() + 3 * pos.count<KNIGHT>() + 3 * pos.count<BISHOP>()
                       + 5 * pos.count<ROOK>() + 9 * pos.count<QUEEN>();
    // The fit only covers material counts in [10, 78]; 58 is the anchor, so x == 1 there.
    const double x = std::clamp(pieceSum, 10, 78) / 58.0;
    // Cubic coefficients of p_a and p_b, highest power first,
    // see github.com/official-stockfish/WDL_model
    constexpr double as[] = {-185.71965483, 504.85014385, -438.58295743, 474.04604627};
    constexpr double bs[] = {89.23542728, -137.02141296, 73.28669021, 47.53376190};
    // Horner evaluation of a cubic at x, with the same operation order for both parameters.
    const auto cubic = [x](const double(&c)[4]) {
        return (((c[0] * x + c[1]) * x + c[2]) * x) + c[3];
    };
    return {cubic(as), cubic(bs)};
 }
 // Logistic win rate model: 1 / (1 + exp((a - eval) / b)), with a = p_a(material)
 // and b = p_b(material). It fits the LTC fishtest statistics rather accurately.
 int win_rate_model(Value v, const Position& pos) {
    const WinRateParams params   = win_rate_params(pos);
    const double        exponent = (params.a - double(v)) / params.b;
    const double        perMille = 1000 / (1 + std::exp(exponent));
    // Adding 0.5 before truncation rounds to the nearest integer (per mille units).
    return int(0.5 + perMille);
 }
 }
 std::string UCI::to_score(Value v, const Position& pos) {
    assert(-VALUE_INFINITE < v && v < VALUE_INFINITE);
    std::stringstream ss;
    if (std::abs(v) < VALUE_TB_WIN_IN_MAX_PLY)
-        ss << "cp " << to_cp(v);
+        ss << "cp " << to_cp(v, pos);
    else if (std::abs(v) <= VALUE_TB)
    {
        const int ply = VALUE_TB - std::abs(v);  // recompute ss->ply
@@ -344,6 +403,30 @@ std::string UCI::value(Value v) {
    return ss.str();
 }
 // Converts a Value into an integer centipawn number.
 // Mate and similar special scores get no dedicated treatment here.
 int UCI::to_cp(Value v, const Position& pos) {
    // In general, the score can be defined via the WDL as
    // (log(1/L - 1) - log(1/W - 1)) / (log(1/L - 1) + log(1/W - 1)).
    // Based on our win_rate_model, this simply yields v / a.
    const double a = win_rate_params(pos).a;
    return std::round(100 * int(v) / a);
 }
 std::string UCI::wdl(Value v, const Position& pos) {
    // Win rate for v and for -v, both in per mille; the draw share is what remains of 1000.
    const int win  = win_rate_model(v, pos);
    const int loss = win_rate_model(-v, pos);
    const int draw = 1000 - win - loss;
    std::stringstream out;
    out << " wdl " << win << " " << draw << " " << loss;
    return out.str();
 }
 std::string UCI::square(Square s) {
    // Two-character name: file letter counted from 'a', then rank digit counted from '1'.
    const char fileChar = char('a' + file_of(s));
    const char rankChar = char('1' + rank_of(s));
    return std::string{fileChar, rankChar};
 }
@@ -369,41 +452,6 @@ std::string UCI::move(Move m, bool chess960) {
    return move;
 }
 namespace {
 // Win rate model: the probability of winning (in per mille units) for a given
 // eval and game ply. It fits the LTC fishtest statistics rather accurately.
 int win_rate_model(Value v, int ply) {
    // Coefficients of the third-order polynomial fits, based on the fishtest data,
    // for the two parameters that transform eval to the argument of a logistic
    // function.
    constexpr double as[] = {-1.06249702, 7.42016937, 0.89425629, 348.60356174};
    constexpr double bs[] = {-5.33122190, 39.57831533, -90.84473771, 123.40620748};
    // Enforce that NormalizeToPawnValue corresponds to a 50% win rate at move 32.
    static_assert(NormalizeToPawnValue == int(0.5 + as[0] + as[1] + as[2] + as[3]));
    // The fitted model only uses data for moves in [8, 120], and is anchored at move 32.
    const double x = std::clamp(ply / 2 + 1, 8, 120) / 32.0;
    // Horner evaluation of a cubic at x, with the same operation order for both parameters.
    const auto cubic = [x](const double(&c)[4]) {
        return (((c[0] * x + c[1]) * x + c[2]) * x) + c[3];
    };
    const double a = cubic(as);
    const double b = cubic(bs);
    // Return the win rate in per mille units, rounded to the nearest integer.
    return int(0.5 + 1000 / (1 + std::exp((a - double(v)) / b)));
 }
 }
 std::string UCI::wdl(Value v, int ply) {
    // Win rate for v and for -v, both in per mille; the draw share is what remains of 1000.
    const int win  = win_rate_model(v, ply);
    const int loss = win_rate_model(-v, ply);
    const int draw = 1000 - win - loss;
    std::stringstream out;
    out << " wdl " << win << " " << draw << " " << loss;
    return out.str();
 }
 Move UCI::to_move(const Position& pos, std::string& str) {
    if (str.length() == 5)
@@ -21,21 +21,17 @@
 #include <iostream>
 #include <string>
 #include <unordered_map>
 #include "evaluate.h"
 #include "misc.h"
 #include "nnue/network.h"
 #include "position.h"
 #include "search.h"
 #include "thread.h"
 #include "tt.h"
 #include "ucioption.h"
 namespace Stockfish {
 namespace Eval::NNUE {
 enum NetSize : int;
 }
 class Move;
 enum Square : int;
 using Value = int;
@@ -46,18 +42,19 @@ class UCI {
    void loop();
-    static int         to_cp(Value v);
+    static int         to_cp(Value v, const Position& pos);
-    static std::string value(Value v);
+    static std::string to_score(Value v, const Position& pos);
    static std::string square(Square s);
    static std::string move(Move m, bool chess960);
-    static std::string wdl(Value v, int ply);
+    static std::string wdl(Value v, const Position& pos);
    static Move        to_move(const Position& pos, std::string& str);
-    const std::string& workingDirectory() const { return cli.workingDirectory; }
+    static Search::LimitsType parse_limits(const Position& pos, std::istream& is);
-    OptionsMap options;
+    const std::string& working_directory() const { return cli.workingDirectory; }
-    std::unordered_map<Eval::NNUE::NetSize, Eval::EvalFile> evalFiles;
+    OptionsMap           options;
    Eval::NNUE::Networks networks;
   private:
    TranspositionTable tt;
@@ -25,6 +25,7 @@
 #include <sstream>
 #include <utility>
 #include "cluster.h"
 #include "misc.h"
 namespace Stockfish {
@@ -51,7 +52,7 @@ void OptionsMap::setoption(std::istringstream& is) {
    if (options_map.count(name))
        options_map[name] = value;
-    else
+    else if (Cluster::is_root())
        sync_cout << "No such option: " << name << sync_endl;
 }
@@ -8,6 +8,13 @@ error()
 }
 trap 'error ${LINENO}' ERR
 # Since Linux Kernel 6.5 we are getting false positives from the CI,
 # lower the ASLR entropy to disable ASLR, which works as a temporary workaround.
 # https://github.com/google/sanitizers/issues/1716
 # https://bugs.launchpad.net/ubuntu/+source/linux/+bug/2056762
 sudo sysctl -w vm.mmap_rnd_bits=28
 # define suitable post and prefixes for testing options
 case $1 in
  --valgrind)