diff --git a/.github/workflows/stockfish.yml b/.github/workflows/stockfish.yml
new file mode 100644
index 00000000..e50930f8
--- /dev/null
+++ b/.github/workflows/stockfish.yml
@@ -0,0 +1,276 @@
+name: Stockfish
+on:
+  push:
+    branches:
+      - master
+      - tools
+      - github_ci
+  pull_request:
+    branches:
+      - master
+      - tools
+jobs:
+  Stockfish:
+    name: ${{ matrix.config.name }}
+    runs-on: ${{ matrix.config.os }}
+    env:
+      COMPILER: ${{ matrix.config.compiler }}
+      COMP: ${{ matrix.config.comp }}
+    strategy:
+      matrix:
+        config:
+          - {
+              name: "Ubuntu 20.04 GCC",
+              os: ubuntu-20.04,
+              compiler: g++,
+              comp: gcc,
+              run_expensive_tests: true,
+              run_32bit_tests: true,
+              run_64bit_tests: true,
+              shell: 'bash {0}'
+            }
+          - {
+              name: "Ubuntu 20.04 Clang",
+              os: ubuntu-20.04,
+              compiler: clang++,
+              comp: clang,
+              run_expensive_tests: false,
+              run_32bit_tests: true,
+              run_64bit_tests: true,
+              shell: 'bash {0}'
+            }
+          - {
+              name: "MacOS 10.15 Apple Clang",
+              os: macos-10.15,
+              compiler: clang++,
+              comp: clang,
+              run_expensive_tests: false,
+              run_32bit_tests: false,
+              run_64bit_tests: true,
+              shell: 'bash {0}'
+            }
+          - {
+              name: "MacOS 10.15 GCC 10",
+              os: macos-10.15,
+              compiler: g++-10,
+              comp: gcc,
+              run_expensive_tests: false,
+              run_32bit_tests: false,
+              run_64bit_tests: true,
+              shell: 'bash {0}'
+            }
+          - {
+              name: "Windows 2019 Mingw-w64 GCC x86_64",
+              os: windows-2019,
+              compiler: g++,
+              comp: gcc,
+              run_expensive_tests: false,
+              run_32bit_tests: false,
+              run_64bit_tests: true,
+              msys_sys: 'mingw64',
+              msys_env: 'x86_64',
+              shell: 'msys2 {0}'
+            }
+          - {
+              name: "Windows 2019 Mingw-w64 GCC i686",
+              os: windows-2019,
+              compiler: g++,
+              comp: gcc,
+              run_expensive_tests: false,
+              run_32bit_tests: true,
+              run_64bit_tests: false,
+              msys_sys: 'mingw32',
+              msys_env: 'i686',
+              shell: 'msys2 {0}'
+            }
+
+    defaults:
+      run:
+        working-directory: src  # every `run` step of this job executes inside src/
+        shell: ${{ matrix.config.shell }}  # shell template taken from the matrix entry ('bash {0}' or 'msys2 {0}')
+    steps:
+      - uses: actions/checkout@v2
+        with:
+          fetch-depth: 0
+
+      - name: Download required linux packages
+        if: runner.os == 'Linux'
+        run: |
+          sudo apt update
+          sudo apt install expect valgrind g++-multilib
+
+      - name: Setup msys and install required packages
+        if: runner.os == 'Windows'
+        uses: msys2/setup-msys2@v2
+        with:
+          msystem: ${{matrix.config.msys_sys}}
+          install: mingw-w64-${{matrix.config.msys_env}}-gcc make git expect
+
+      - name: Download the used network from the fishtest framework
+        run: |
+          make net
+
+      - name: Extract the bench number from the commit history
+        run: |
+          git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig
+          [ -s git_sig ] && echo "benchref=$(cat git_sig)" >> $GITHUB_ENV && echo "Reference bench:" $(cat git_sig) || echo "No bench found"
+
+      - name: Check compiler
+        run: |
+          $COMPILER -v
+
+      - name: Test help target
+        run: |
+          make help
+
+      # x86-32 tests
+
+      - name: Test debug x86-32 build
+        if: ${{ matrix.config.run_32bit_tests }}
+        run: |
+          export CXXFLAGS="-D_GLIBCXX_DEBUG"
+          make clean
+          make -j2 ARCH=x86-32 optimize=no debug=yes build
+          ../tests/signature.sh $benchref
+
+      - name: Test x86-32 build
+        if: ${{ matrix.config.run_32bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-32 build
+          ../tests/signature.sh $benchref
+
+      - name: Test x86-32-sse41-popcnt build
+        if: ${{ matrix.config.run_32bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-32-sse41-popcnt build
+          ../tests/signature.sh $benchref
+
+      - name: Test x86-32-sse2 build
+        if: ${{ matrix.config.run_32bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-32-sse2 build
+          ../tests/signature.sh $benchref
+
+      - name: Test general-32 build
+        if: ${{ matrix.config.run_32bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=general-32 build
+          ../tests/signature.sh $benchref
+
+      # x86-64 tests
+
+      - name: Test debug x86-64-modern build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          export CXXFLAGS="-D_GLIBCXX_DEBUG"
+          make clean
+          make -j2 ARCH=x86-64-modern optimize=no debug=yes build
+          ../tests/signature.sh $benchref
+
+      - name: Test x86-64-modern build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-modern build
+          ../tests/signature.sh $benchref
+
+      - name: Test x86-64-ssse3 build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-ssse3 build
+          ../tests/signature.sh $benchref
+
+      - name: Test x86-64-sse3-popcnt build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-sse3-popcnt build
+          ../tests/signature.sh $benchref
+
+      - name: Test x86-64 build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64 build
+          ../tests/signature.sh $benchref
+
+      - name: Test general-64 build
+        if: ${{ matrix.config.run_64bit_tests }}  # same gate and syntax as the other 64-bit steps
+        run: |
+          make clean
+          make -j2 ARCH=general-64 build
+          ../tests/signature.sh $benchref
+
+      # x86-64 with newer extensions tests
+
+      - name: Compile x86-64-avx2 build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-avx2 build
+
+      - name: Compile x86-64-bmi2 build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-bmi2 build
+
+      - name: Compile x86-64-avx512 build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-avx512 build
+
+      - name: Compile x86-64-vnni512 build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-vnni512 build
+
+      - name: Compile x86-64-vnni256 build
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          make clean
+          make -j2 ARCH=x86-64-vnni256 build
+
+      # Other tests
+
+      - name: Check perft and search reproducibility
+        if: ${{ matrix.config.run_64bit_tests }}
+        run: |
+          set -e  # 'bash {0}' does not enable errexit, so a perft.sh failure would otherwise be masked
+          make clean && make -j2 ARCH=x86-64-modern build
+          ../tests/perft.sh
+          ../tests/reprosearch.sh
+
+      # Instrumented runs (valgrind and sanitizers)
+
+      - name: Run under valgrind
+        if: ${{ matrix.config.run_expensive_tests }}
+        run: |
+          set -e  # 'bash {0}' does not enable errexit; without this only the last command's status counts
+          export CXXFLAGS="-O1 -fno-inline"
+          make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null
+          ../tests/instrumented.sh --valgrind
+          ../tests/instrumented.sh --valgrind-thread
+
+      - name: Run with UB sanitizer
+        if: ${{ matrix.config.run_expensive_tests }}
+        run: |
+          export CXXFLAGS="-O1 -fno-inline"
+          make clean
+          make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null
+          ../tests/instrumented.sh --sanitizer-undefined
+
+      - name: Run with thread sanitizer
+        if: ${{ matrix.config.run_expensive_tests }}
+        run: |
+          export CXXFLAGS="-O1 -fno-inline"
+          make clean
+          make -j2 ARCH=x86-64-modern sanitize=thread optimize=no debug=yes build > /dev/null
+          ../tests/instrumented.sh --sanitizer-thread
diff --git a/.travis.yml b/.travis.yml
deleted file mode 100644
index 377796f7..00000000
--- a/.travis.yml
+++ /dev/null
@@ -1,75 +0,0 @@
-language: cpp
-dist: focal
-
-matrix:
-  include:
-    - os: linux
-      compiler: gcc
-      addons:
-        apt:
-          packages: ['g++-multilib', 'valgrind', 'expect', 'curl']
-      env:
-        - COMPILER=g++
-        - COMP=gcc
-
-branches:
-  only:
-   - master
-
-before_script:
-  - cd src
-
-script:
-  # Download net
-  - make net
-
-  # Obtain bench reference from git log
-  - git log HEAD | grep "\b[Bb]ench[ :]\+[0-9]\{7\}" | head -n 1 | sed "s/[^0-9]*\([0-9]*\).*/\1/g" > git_sig
-  - export benchref=$(cat git_sig)
-  - echo "Reference bench:" $benchref
-
-  # Compiler version string
-  - $COMPILER -v
-
-  # test help target
-  - make help
-
-  # Verify bench number against various builds
-  - export CXXFLAGS="-Werror -D_GLIBCXX_DEBUG"
-  - make clean && make -j2 ARCH=x86-64-modern optimize=no debug=yes build && ../tests/signature.sh $benchref
-  - export CXXFLAGS="-Werror"
-  - make clean && make -j2 ARCH=x86-64-modern build && ../tests/signature.sh $benchref
-  - make clean && make -j2 ARCH=x86-64-ssse3 build && ../tests/signature.sh $benchref
-  - make clean && make -j2 ARCH=x86-64-sse3-popcnt build && ../tests/signature.sh $benchref
-  - make clean && make -j2 ARCH=x86-64 build && ../tests/signature.sh $benchref
-  # TODO avoid _mm_malloc
-  # - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then make clean && make -j2 ARCH=general-64 build && ../tests/signature.sh $benchref; fi
-  - make clean && make -j2 ARCH=x86-64-modern profile-build && ../tests/signature.sh $benchref
-
-  # compile only for some more advanced architectures (might not run in travis)
-  - make clean && make -j2 ARCH=x86-64-avx2 blas=yes build
-
-  - make clean && make -j2 ARCH=x86-64-avx2 build
-  - make clean && make -j2 ARCH=x86-64-bmi2 build
-  - make clean && make -j2 ARCH=x86-64-avx512 build
-  - make clean && make -j2 ARCH=x86-64-vnni512 build
-  - make clean && make -j2 ARCH=x86-64-vnni256 build
-
-  #
-  # Check perft and reproducible search
-  - make clean && make -j2 ARCH=x86-64-modern build
-  - ../tests/perft.sh
-  - ../tests/reprosearch.sh
-
-  #
-  # Valgrind
-  #
-  - export CXXFLAGS="-O1 -fno-inline"
-  - make clean && make -j2 ARCH=x86-64-modern debug=yes optimize=no build > /dev/null && ../tests/instrumented.sh --valgrind
-  - ../tests/instrumented.sh --valgrind-thread
-
-  #
-  # Sanitizer
-  #
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=undefined optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-undefined
-  - make clean && make -j2 ARCH=x86-64-modern sanitize=thread    optimize=no debug=yes build > /dev/null && ../tests/instrumented.sh --sanitizer-thread
diff --git a/AUTHORS b/AUTHORS
index 7165363f..4d72314f 100644
--- a/AUTHORS
+++ b/AUTHORS
@@ -1,4 +1,4 @@
-# List of authors for Stockfish, as of May 17, 2021
+# List of authors for Stockfish, as of June 14, 2021
 
 # Founders of the Stockfish project and fishtest infrastructure
 Tord Romstad (romstad)
@@ -69,6 +69,7 @@ gamander
 Gary Heckman (gheckman)
 George Sobala (gsobala)
 gguliash
+Giacomo Lorenzetti (G-Lorenz)
 Gian-Carlo Pascutto (gcp)
 Gontran Lemaire (gonlem)
 Goodkov Vasiliy Aleksandrovich (goodkov)
@@ -96,6 +97,7 @@ Joost VandeVondele (vondele)
 Jörg Oster (joergoster)
 Joseph Ellis (jhellis3)
 Joseph R. Prostko
+Julian Willemer (NightlyKing)
 jundery
 Justin Blanchard (UncombedCoconut)
 Kelly Wilson
@@ -106,6 +108,7 @@ Kojirion
 Krystian Kuzniarek (kuzkry)
 Leonardo Ljubičić (ICCF World Champion)
 Leonid Pechenik (lp--)
+Liam Keegan (lkeegan)
 Linus Arver (listx)
 loco-loco
 Lub van den Berg (ElbertoOne)
diff --git a/README.md b/README.md
index 19d5a229..467dd3c3 100644
--- a/README.md
+++ b/README.md
@@ -1,6 +1,6 @@
 ## Overview
 
-[![Build Status](https://travis-ci.org/official-stockfish/Stockfish.svg?branch=master)](https://travis-ci.org/official-stockfish/Stockfish)
+[![Build Status](https://github.com/official-stockfish/Stockfish/actions/workflows/stockfish.yml/badge.svg)](https://github.com/official-stockfish/Stockfish/actions)
 [![Build Status](https://ci.appveyor.com/api/projects/status/github/official-stockfish/Stockfish?branch=master&svg=true)](https://ci.appveyor.com/project/mcostalba/stockfish/branch/master)
 
 [Stockfish](https://stockfishchess.org) is a free, powerful UCI chess engine
@@ -35,12 +35,14 @@ This distribution of Stockfish consists of the following files:
 
 ## The UCI protocol and available options
 
-The Universal Chess Interface (UCI) is a standard protocol used to communicate with a chess engine,
-and is the recommended way to do so for typical graphical user interfaces (GUI) or chess tools.
+The Universal Chess Interface (UCI) is a standard protocol used to communicate with
+a chess engine, and is the recommended way to do so for typical graphical user interfaces
+(GUI) or chess tools. Stockfish implements the majority of its options as described
+in [the UCI protocol](https://www.shredderchess.com/download/div/uci.zip).
 
-Stockfish implements most commands as described in [the UCI protocol](https://www.shredderchess.com/download/div/uci.zip)
-
-For users, the following UCI options, which can typically be set via a GUI, are available in Stockfish:
+Developers can see the default values for UCI options available in Stockfish by typing
+`./stockfish uci` in a terminal, but the majority of users will typically see them and
+change them via a chess GUI. This is a list of available UCI options in Stockfish:
 
   * #### Threads
     The number of CPU threads used for searching a position. For best performance, set
@@ -118,14 +120,6 @@ For users, the following UCI options, which can typically be set via a GUI, are
     Limit Syzygy tablebase probing to positions with at most this many pieces left
     (including kings and pawns).
 
-  * #### Contempt
-    A positive value for contempt favors middle game positions and avoids draws,
-    effective for the classical evaluation only.
-
-  * #### Analysis Contempt
-    By default, contempt is set to prefer the side to move. Set this option to "White"
-    or "Black" to analyse with contempt for that side, or "Off" to disable contempt.
-
   * #### Move Overhead
     Assume a time delay of x ms due to network and GUI overheads. This is useful to
     avoid losses on time in those cases.
@@ -143,9 +137,9 @@ For users, the following UCI options, which can typically be set via a GUI, are
 
 For developers the following non-standard commands might be of interest, mainly useful for debugging:
 
-  * #### bench ttSize threads limit fenFile limitType evalType
-    Performs a standard benchmark using various options. The signature or standard node
-    count is obtained using all defaults. `bench` is currently `bench 16 1 13 default depth mixed`.
+  * #### bench *ttSize threads limit fenFile limitType evalType*
+    Performs a standard benchmark using various options. The signature of a version (standard node
+    count) is obtained using all defaults. `bench` is currently `bench 16 1 13 default depth mixed`.
 
   * #### compiler
     Give information about the compiler and environment used for building a binary.
diff --git a/Top CPU Contributors.txt b/Top CPU Contributors.txt
index f5347ea1..dacc5781 100644
--- a/Top CPU Contributors.txt	
+++ b/Top CPU Contributors.txt	
@@ -1,189 +1,205 @@
-Contributors to Fishtest with >10,000 CPU hours, as of Feb 15, 2021.
+Contributors to Fishtest with >10,000 CPU hours, as of Jun 29, 2021.
 Thank you!
 
-Username                CPU Hours       Games played
-----------------------------------------------------
-noobpwnftw               23930906         1560559941
-dew                       1169948           70333008
-mlang                      957168           61657446
-mibere                     703840           46867607
-tvijlbrief                 517888           33379462
-JojoM                      515404           30334272
-cw                         443276           29385549
-crunchy                    427035           27344275
-grandphish2                425794           26347253
-fastgm                     414133           24519696
-gvreuls                    377843           24708884
-CSU_Dynasty                338718           23030006
-Fisherman                  326795           21820747
-TueRens                    313730           19490246
-ctoks                      298442           20052551
-velislav                   270519           17355456
-bcross                     241064           17196165
-glinscott                  217799           13780820
-nordlandia                 211692           13484886
-bking_US                   198894           11876016
-drabel                     191096           13129722
-leszek                     189170           11446821
-mgrabiak                   187153           12013300
-robal                      181389           11539242
-Thanar                     179852           12365359
-vdv                        175274            9889046
-spams                      157128           10319326
-marrco                     150292            9401741
-sqrt2                      147963            9724586
-CoffeeOne                  137086            5022516
-vdbergh                    137041            8926915
-malala                     136182            8002293
-mhoram                     132780            8398229
-xoto                       124729            8652088
-davar                      122092            7960001
-dsmith                     122059            7570238
-Data                       113305            8220352
-BrunoBanani                112960            7436849
-pemo                       109598            5036441
-Dantist                    106768            6431396
-MaZePallas                 102741            6630419
-ElbertoOne                  99028            7023771
-brabos                      92118            6186135
-linrock                     90903            6708639
-psk                         89957            5984901
-sunu                        88614            6020673
-sterni1971                  86948            5613788
-Vizvezdenec                 83761            5344740
-BRAVONE                     81239            5054681
-nssy                        76497            5259388
-cuistot                     76366            4370584
-racerschmacer               75753            5442626
-teddybaer                   75125            5407666
-Pking_cda                   73776            5293873
-0x3C33                      73133            4670293
-jromang                     72117            5054915
-solarlight                  70517            5028306
-dv8silencer                 70287            3883992
-Bobo1239                    68515            4652287
-manap                       66273            4121774
-tinker                      64321            4268390
-robnjr                      57262            4053117
-Freja                       56938            3733019
-ttruscott                   56010            3680085
-rkl                         54986            4150767
-renouve                     53811            3501516
-finfish                     51360            3370515
-eva42                       51272            3599691
-rap                         49985            3219146
-pb00067                     49727            3298270
-amicic                      49691            3042481
-ronaldjerum                 47654            3240695
-bigpen0r                    47278            3291647
-biffhero                    46564            3111352
-VoyagerOne                  45476            3452465
-eastorwest                  45033            3071805
-speedycpu                   43842            3003273
-jbwiebe                     43305            2805433
-Antihistamine               41788            2761312
-mhunt                       41735            2691355
-homyur                      39893            2850481
-gri                         39871            2515779
-oryx                        38282            2944400
-Spprtr                      38157            2470529
-SC                          37290            2731014
-csnodgrass                  36207            2688994
-jmdana                      36157            2210661
-strelock                    34716            2074055
-Garf                        33800            2747562
-skiminki                    33515            2055584
-EthanOConnor                33370            2090311
-slakovv                     32915            2021889
-yurikvelo                   32600            2255966
-Prcuvu                      30377            2170122
-manapbk                     30326            1770143
-anst                        30301            2190091
-jkiiski                     30136            1904470
-hyperbolic.tom              29840            2017394
-Pyafue                      29650            1902349
-qurashee                    27758            1509620
-OuaisBla                    27636            1578800
-chriswk                     26902            1868317
-achambord                   26582            1767323
-Fifis                       26376            1776853
-Patrick_G                   26276            1801617
-yorkman                     26193            1992080
-SFTUser                     25182            1675689
-nabildanial                 24942            1519409
-Sharaf_DG                   24765            1786697
-ncfish1                     24411            1520927
-agg177                      23890            1395014
-JanErik                     23408            1703875
-Isidor                      23388            1680691
-Norabor                     23164            1591830
-cisco2015                   22895            1762069
-Zirie                       22542            1472937
-team-oh                     22272            1636708
-MazeOfGalious               21978            1629593
-sg4032                      21945            1643065
-ianh2105                    21725            1632562
-xor12                       21628            1680365
-dex                         21612            1467203
-nesoneg                     21494            1463031
-jjoshua2                    20997            1422689
-horst.prack                 20878            1465656
-0xB00B1ES                   20590            1208666
-sphinx                      20515            1352368
-j3corre                     20405            941444
-Adrian.Schmidt123           20316            1281436
-Ente                        20017            1432602
-wei                         19973            1745989
-rstoesser                   19569            1293588
-eudhan                      19274            1283717
-jundery                     18445            1115855
-iisiraider                  18247            1101015
-ville                       17883            1384026
-chris                       17698            1487385
-purplefishies               17595            1092533
-DMBK                        17357            1279152
-DragonLord                  17014            1162790
-dju                         16515             929427
-IgorLeMasson                16064            1147232
-ako027ako                   15671            1173203
-Nikolay.IT                  15154            1068349
-Andrew Grant                15114             895539
-OssumOpossum                14857            1007129
-enedene                     14476             905279
-bpfliegel                   14298             884523
-jpulman                     13982             870599
-joster                      13794             950160
-Nesa92                      13786            1114691
-crocogoat                   13753            1114622
-Hjax                        13535             915487
-Dark_wizzie                 13422            1007152
-mpx86                       12941             693640
-mabichito                   12903             749391
-thijsk                      12886             722107
-AdrianSA                    12860             804972
-Flopzee                     12698             894821
-fatmurphy                   12547             853210
-scuzzi                      12511             845761
-Karby                       12429             735880
-SapphireBrand               12416             969604
-modolief                    12386             896470
-pgontarz                    12151             848794
-stocky                      11954             699440
-mschmidt                    11941             803401
-infinity                    11470             727027
-torbjo                      11395             729145
-Thomas A. Anderson          11372             732094
-d64                         11263             789184
-Maxim                       11129             804704
-snicolet                    11106             869170
-MooTheCow                   11008             694942
-savage84                    10965             641068
-Rudolphous                  10915             741268
-Wolfgang                    10809             580032
-rpngn                       10712             688203
-basepi                      10637             744851
-michaelrpg                  10409             735127
-dzjp                        10343             732529
-ali-al-zhrani               10324             726502
-ols                         10259             570669
-lbraesch                    10252             647825
+Username                 CPU Hours       Games played
+-----------------------------------------------------
+noobpwnftw                27649494         1834734733
+mlang                      1426107           89454622
+dew                        1380910           82831648
+mibere                      703840           46867607
+grandphish2                 692707           41737913
+tvijlbrief                  669642           42371594
+JojoM                       597778           35297180
+TueRens                     519226           31823562
+cw                          458421           30307421
+fastgm                      439667           25950040
+gvreuls                     436599           28177460
+crunchy                     427035           27344275
+CSU_Dynasty                 374765           25106278
+Fisherman                   326901           21822979
+ctoks                       325477           21767943
+velislav                    295343           18844324
+linrock                     292789           10624427
+bcross                      278584           19488961
+okrout                      262818           13803272
+pemo                        245982           11376085
+glinscott                   217799           13780820
+leszek                      212346           12959025
+nordlandia                  211692           13484886
+bking_US                    198894           11876016
+drabel                      196463           13450602
+robal                       195473           12375650
+mgrabiak                    187226           12016564
+Dantist                     183202           10990484
+Thanar                      179852           12365359
+vdv                         175274            9889046
+spams                       157128           10319326
+marrco                      150295            9402141
+sqrt2                       147963            9724586
+mhoram                      141278            8901241
+CoffeeOne                   137100            5024116
+vdbergh                     137041            8926915
+malala                      136182            8002293
+xoto                        133702            9156676
+davar                       122092            7960001
+dsmith                      122059            7570238
+Data                        113305            8220352
+BrunoBanani                 112960            7436849
+MaZePallas                  102823            6633619
+sterni1971                  100532            5880772
+ElbertoOne                   99028            7023771
+brabos                       92118            6186135
+oz                           92100            6486640
+psk                          89957            5984901
+amicic                       89156            5392305
+sunu                         88851            6028873
+Vizvezdenec                  83761            5344740
+0x3C33                       82614            5271253
+BRAVONE                      81239            5054681
+racerschmacer                80899            5759262
+cuistot                      80300            4606144
+nssy                         76497            5259388
+teddybaer                    75125            5407666
+Pking_cda                    73776            5293873
+jromang                      72192            5057715
+solarlight                   70517            5028306
+dv8silencer                  70287            3883992
+Bobo1239                     68515            4652287
+manap                        66273            4121774
+skiminki                     65088            4023328
+tinker                       64333            4268790
+sschnee                      60767            3500800
+qurashee                     57344            3168264
+robnjr                       57262            4053117
+Freja                        56938            3733019
+ttruscott                    56010            3680085
+rkl                          55132            4164467
+renouve                      53811            3501516
+finfish                      51360            3370515
+eva42                        51272            3599691
+rap                          49985            3219146
+pb00067                      49727            3298270
+ronaldjerum                  47654            3240695
+bigpen0r                     47653            3335327
+eastorwest                   47585            3221629
+biffhero                     46564            3111352
+VoyagerOne                   45476            3452465
+yurikvelo                    44834            3034550
+speedycpu                    43842            3003273
+jbwiebe                      43305            2805433
+Spprtr                       42279            2680153
+DesolatedDodo                42007            2447516
+Antihistamine                41788            2761312
+mhunt                        41735            2691355
+homyur                       39893            2850481
+gri                          39871            2515779
+Fifis                        38776            2529121
+oryx                         38724            2966648
+SC                           37290            2731014
+csnodgrass                   36207            2688994
+jmdana                       36157            2210661
+strelock                     34716            2074055
+rpngn                        33951            2057395
+Garf                         33922            2751802
+EthanOConnor                 33370            2090311
+slakovv                      32915            2021889
+manapbk                      30987            1810399
+Prcuvu                       30377            2170122
+anst                         30301            2190091
+jkiiski                      30136            1904470
+hyperbolic.tom               29840            2017394
+Pyafue                       29650            1902349
+Wolfgang                     29260            1658936
+zeryl                        28156            1579911
+OuaisBla                     27636            1578800
+DMBK                         27051            1999456
+chriswk                      26902            1868317
+achambord                    26582            1767323
+Patrick_G                    26276            1801617
+yorkman                      26193            1992080
+SFTUser                      25182            1675689
+nabildanial                  24942            1519409
+Sharaf_DG                    24765            1786697
+ncfish1                      24411            1520927
+rodneyc                      24227            1409514
+agg177                       23890            1395014
+JanErik                      23408            1703875
+Isidor                       23388            1680691
+Norabor                      23164            1591830
+cisco2015                    22897            1762669
+Zirie                        22542            1472937
+team-oh                      22272            1636708
+MazeOfGalious                21978            1629593
+sg4032                       21947            1643265
+ianh2105                     21725            1632562
+xor12                        21628            1680365
+dex                          21612            1467203
+nesoneg                      21494            1463031
+sphinx                       21211            1384728
+jjoshua2                     21001            1423089
+horst.prack                  20878            1465656
+Ente                         20865            1477066
+0xB00B1ES                    20590            1208666
+j3corre                      20405             941444
+Adrian.Schmidt123            20316            1281436
+wei                          19973            1745989
+MaxKlaxxMiner                19850            1009176
+rstoesser                    19569            1293588
+gopeto                       19491            1174952
+eudhan                       19274            1283717
+jundery                      18445            1115855
+megaman7de                   18377            1067540
+iisiraider                   18247            1101015
+ville                        17883            1384026
+chris                        17698            1487385
+purplefishies                17595            1092533
+dju                          17353             978595
+DragonLord                   17014            1162790
+IgorLeMasson                 16064            1147232
+ako027ako                    15671            1173203
+chuckstablers                15289             891576
+Nikolay.IT                   15154            1068349
+Andrew Grant                 15114             895539
+OssumOpossum                 14857            1007129
+Karby                        14808             867120
+enedene                      14476             905279
+bpfliegel                    14298             884523
+mpx86                        14019             759568
+jpulman                      13982             870599
+crocogoat                    13803            1117422
+joster                       13794             950160
+Nesa92                       13786            1114691
+Hjax                         13535             915487
+jsys14                       13459             785000
+Dark_wizzie                  13422            1007152
+mabichito                    12903             749391
+thijsk                       12886             722107
+AdrianSA                     12860             804972
+Flopzee                      12698             894821
+fatmurphy                    12547             853210
+Rudolphous                   12520             832340
+scuzzi                       12511             845761
+SapphireBrand                12416             969604
+modolief                     12386             896470
+Machariel                    12335             810784
+pgontarz                     12151             848794
+stocky                       11954             699440
+mschmidt                     11941             803401
+Maxim                        11543             836024
+infinity                     11470             727027
+torbjo                       11395             729145
+Thomas A. Anderson           11372             732094
+savage84                     11358             670860
+d64                          11263             789184
+MooTheCow                    11237             720174
+snicolet                     11106             869170
+ali-al-zhrani                11086             767926
+AndreasKrug                  10875             887457
+pirt                         10806             836519
+basepi                       10637             744851
+michaelrpg                   10508             739039
+dzjp                         10343             732529
+aga                          10302             622975
+ols                          10259             570669
+lbraesch                     10252             647825
+FormazChar                   10059             757283
diff --git a/src/Makefile b/src/Makefile
index d3cea8de..4cfefe77 100644
--- a/src/Makefile
+++ b/src/Makefile
@@ -50,7 +50,7 @@ SRCS = benchmark.cpp bitbase.cpp bitboard.cpp endgame.cpp evaluate.cpp main.cpp
 	material.cpp misc.cpp movegen.cpp movepick.cpp pawns.cpp position.cpp psqt.cpp \
 	search.cpp thread.cpp timeman.cpp tt.cpp uci.cpp ucioption.cpp tune.cpp syzygy/tbprobe.cpp \
 	nnue/evaluate_nnue.cpp \
-	nnue/features/half_ka_v2.cpp \
+	nnue/features/half_ka_v2_hm.cpp \
 	tools/validate_training_data.cpp \
 	tools/sfen_packer.cpp \
 	tools/training_data_generator.cpp \
@@ -72,9 +72,11 @@ VPATH = syzygy:nnue:nnue/features:eval:extra:tools
 # ----------------------------------------------------------------------------
 #
 # debug = yes/no      --- -DNDEBUG         --- Enable/Disable debug mode
-# sanitize = undefined/thread/no (-fsanitize )
+# sanitize = none/<sanitizer> ... (-fsanitize )
 #                     --- ( undefined )    --- enable undefined behavior checks
-#                     --- ( thread    )    --- enable threading error  checks
+#                     --- ( thread    )    --- enable threading error checks
+#                     --- ( address   )    --- enable memory access checks
+#                     --- ...etc...        --- see compiler documentation for supported sanitizers
 # optimize = yes/no   --- (-O3/-fast etc.) --- Enable/Disable optimizations
 # arch = (name)       --- (-arch)          --- Target architecture
 # bits = 64/32        --- -DIS_64BIT       --- 64-/32-bit operating system
@@ -95,6 +97,10 @@ VPATH = syzygy:nnue:nnue/features:eval:extra:tools
 # Note that Makefile is space sensitive, so when adding new architectures
 # or modifying existing flags, you have to make sure there are no extra spaces
 # at the end of the line for flag values.
+#
+# Example of use for these flags:
+# make build ARCH=x86-64-avx512 debug=yes sanitize="address undefined"
+
 
 ### 2.1. General and architecture defaults
 
@@ -116,7 +122,7 @@ endif
 
 optimize = yes
 debug = no
-sanitize = no
+sanitize = none
 bits = 64
 prefetch = no
 popcnt = no
@@ -392,10 +398,12 @@ ifeq ($(COMP),clang)
 	ifneq ($(KERNEL),Darwin)
 	ifneq ($(KERNEL),OpenBSD)
 	ifneq ($(KERNEL),FreeBSD)
+	ifneq ($(RTLIB),compiler-rt)
 		LDFLAGS += -latomic
 	endif
 	endif
 	endif
+	endif
 
 	ifeq ($(arch),$(filter $(arch),armv7 armv8))
 		ifeq ($(OS),Android)
@@ -409,8 +417,12 @@ ifeq ($(COMP),clang)
 endif
 
 ifeq ($(KERNEL),Darwin)
-	CXXFLAGS += -arch $(arch) -mmacosx-version-min=10.15
-	LDFLAGS += -arch $(arch) -mmacosx-version-min=10.15
+	CXXFLAGS += -mmacosx-version-min=10.15
+	LDFLAGS += -mmacosx-version-min=10.15
+	ifneq ($(arch),any)
+		CXXFLAGS += -arch $(arch)
+		LDFLAGS += -arch $(arch)
+	endif
 	XCRUN = xcrun
 endif
 
@@ -484,10 +496,10 @@ else
 	CXXFLAGS += -g
 endif
 
-### 3.2.3 Debugging with undefined behavior sanitizers
-ifneq ($(sanitize),no)
-        CXXFLAGS += -g3 -fsanitize=$(sanitize)
-        LDFLAGS += -fsanitize=$(sanitize)
+### 3.2.2 Debugging with undefined behavior sanitizers
+ifneq ($(sanitize),none)
+        CXXFLAGS += -g3 $(addprefix -fsanitize=,$(sanitize))
+        LDFLAGS += $(addprefix -fsanitize=,$(sanitize))
 endif
 
 ### 3.3 Optimization
@@ -806,7 +818,9 @@ profileclean:
 	@rm -rf profdir
 	@rm -f bench.txt *.gcda *.gcno ./syzygy/*.gcda ./nnue/*.gcda ./nnue/features/*.gcda *.s ./tools/*.gcda ./extra/*.gcda ./eval/*.gcda
 	@rm -f stockfish.profdata *.profraw
-	@rm -f $(PGO_TRAINING_DATA_FILE)
+	@rm -f stockfish.exe.lto_wrapper_args
+	@rm -f stockfish.exe.ltrans.out
+	@rm -f ./-lstdc++.res
 
 default:
 	help
@@ -849,7 +863,6 @@ config-sanity: net
 	@echo "Testing config sanity. If this fails, try 'make help' ..."
 	@echo ""
 	@test "$(debug)" = "yes" || test "$(debug)" = "no"
-	@test "$(sanitize)" = "undefined" || test "$(sanitize)" = "thread" || test "$(sanitize)" = "address" || test "$(sanitize)" = "no"
 	@test "$(optimize)" = "yes" || test "$(optimize)" = "no"
 	@test "$(SUPPORTED_ARCH)" = "true"
 	@test "$(arch)" = "any" || test "$(arch)" = "x86_64" || test "$(arch)" = "i386" || \
@@ -889,14 +902,15 @@ clang-profile-use:
 	all
 
 gcc-profile-make:
+	@mkdir -p profdir
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS='-fprofile-generate' \
+	EXTRACXXFLAGS='-fprofile-generate=profdir' \
 	EXTRALDFLAGS='-lgcov' \
 	all
 
 gcc-profile-use:
 	$(MAKE) ARCH=$(ARCH) COMP=$(COMP) \
-	EXTRACXXFLAGS='-fprofile-use -fno-peel-loops -fno-tracer' \
+	EXTRACXXFLAGS='-fprofile-use=profdir -fno-peel-loops -fno-tracer' \
 	EXTRALDFLAGS='-lgcov' \
 	all
 
diff --git a/src/evaluate.cpp b/src/evaluate.cpp
index ccb7436b..74474363 100644
--- a/src/evaluate.cpp
+++ b/src/evaluate.cpp
@@ -128,30 +128,6 @@ namespace Eval {
         }
   }
 
-  /// NNUE::export_net() exports the currently loaded network to a file
-  void NNUE::export_net(const std::optional<std::string>& filename) {
-    std::string actualFilename;
-
-    if (filename.has_value())
-        actualFilename = filename.value();
-    else
-    {
-        if (eval_file_loaded != EvalFileDefaultName)
-        {
-             sync_cout << "Failed to export a net. A non-embedded net can only be saved if the filename is specified." << sync_endl;
-             return;
-        }
-        actualFilename = EvalFileDefaultName;
-    }
-
-    ofstream stream(actualFilename, std::ios_base::binary);
-
-    if (save_eval(stream))
-        sync_cout << "Network saved successfully to " << actualFilename << "." << sync_endl;
-    else
-        sync_cout << "Failed to export a net." << sync_endl;
-  }
-
   /// NNUE::verify() verifies that the last net used was loaded successfully
   void NNUE::verify() {
 
@@ -218,7 +194,7 @@ namespace Trace {
     else
         os << scores[t][WHITE] << " | " << scores[t][BLACK];
 
-    os << " | " << scores[t][WHITE] - scores[t][BLACK] << "\n";
+    os << " | " << scores[t][WHITE] - scores[t][BLACK] << " |\n";
     return os;
   }
 }
@@ -228,11 +204,9 @@ using namespace Trace;
 namespace {
 
   // Threshold for lazy and space evaluation
-  constexpr Value LazyThreshold1 =  Value(1565);
-  constexpr Value LazyThreshold2 =  Value(1102);
-  constexpr Value SpaceThreshold = Value(11551);
-  constexpr Value NNUEThreshold1 =   Value(682);
-  constexpr Value NNUEThreshold2 =   Value(176);
+  constexpr Value LazyThreshold1    =  Value(3130);
+  constexpr Value LazyThreshold2    =  Value(2204);
+  constexpr Value SpaceThreshold    =  Value(11551);
 
   // KingAttackWeights[PieceType] contains king attack weights by piece type
   constexpr int KingAttackWeights[PIECE_TYPE_NB] = { 0, 0, 81, 52, 44, 10 };
@@ -1018,7 +992,7 @@ namespace {
     // Initialize score by reading the incrementally updated scores included in
     // the position object (material + piece square tables) and the material
     // imbalance. Score is computed internally from the white point of view.
-    Score score = pos.psq_score() + me->imbalance() + pos.this_thread()->contempt;
+    Score score = pos.psq_score() + me->imbalance() + pos.this_thread()->trend;
 
     // Probe the pawn hash table
     pe = Pawns::probe(pos);
@@ -1026,7 +1000,7 @@ namespace {
 
     // Early exit if score is high
     auto lazy_skip = [&](Value lazyThreshold) {
-        return abs(mg_value(score) + eg_value(score)) / 2 > lazyThreshold + pos.non_pawn_material() / 64;
+        return abs(mg_value(score) + eg_value(score)) > lazyThreshold + pos.non_pawn_material() / 32;
     };
 
     if (lazy_skip(LazyThreshold1))
@@ -1140,8 +1114,9 @@ Value Eval::evaluate(const Position& pos) {
       // Scale and shift NNUE for compatibility with search and classical evaluation
       auto  adjusted_NNUE = [&]()
       {
-
-         int scale = 903 + 28 * pos.count<PAWN>() + 28 * pos.non_pawn_material() / 1024;
+         int scale =   883
+                     + 32 * pos.count<PAWN>()
+                     + 32 * pos.non_pawn_material() / 1024;
 
          Value nnue = NNUE::evaluate(pos, true) * scale / 1024;
 
@@ -1151,30 +1126,14 @@ Value Eval::evaluate(const Position& pos) {
          return nnue;
       };
 
-      // If there is PSQ imbalance we use the classical eval. We also introduce
-      // a small probability of using the classical eval when PSQ imbalance is small.
+      // If there is PSQ imbalance we use the classical eval, but we switch to
+      // NNUE eval faster when shuffling or if the material on the board is high.
+      int r50 = pos.rule50_count();
       Value psq = Value(abs(eg_value(pos.psq_score())));
-      int   r50 = 16 + pos.rule50_count();
-      bool  largePsq = psq * 16 > (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50;
-      bool  classical = largePsq;
+      bool classical = psq * 5 > (850 + pos.non_pawn_material() / 64) * (5 + r50);
 
-      // Use classical evaluation for really low piece endgames.
-      // One critical case is the draw for bishop + A/H file pawn vs naked king.
-      bool lowPieceEndgame =   pos.non_pawn_material() == BishopValueMg
-                            || (pos.non_pawn_material() < 2 * RookValueMg && pos.count<PAWN>() < 2);
-
-      v = classical || lowPieceEndgame ? Evaluation<NO_TRACE>(pos).value()
-                                       : adjusted_NNUE();
-
-      // If the classical eval is small and imbalance large, use NNUE nevertheless.
-      // For the case of opposite colored bishops, switch to NNUE eval with small
-      // probability if the classical eval is less than the threshold.
-      if (    largePsq
-          && !lowPieceEndgame
-          && (   abs(v) * 16 < NNUEThreshold2 * r50
-              || (   pos.opposite_bishops()
-                  && abs(v) * 16 < (NNUEThreshold1 + pos.non_pawn_material() / 64) * r50)))
-          v = adjusted_NNUE();
+      v = classical ? Evaluation<NO_TRACE>(pos).value()  // classical
+                    : adjusted_NNUE();                   // NNUE
   }
 
   // Damp down the evaluation linearly when shuffling
@@ -1191,7 +1150,7 @@ Value Eval::evaluate(const Position& pos) {
 /// descriptions and values of each evaluation term. Useful for debugging.
 /// Trace scores are from white's point of view
 
-std::string Eval::trace(const Position& pos) {
+std::string Eval::trace(Position& pos) {
 
   if (pos.checkers())
       return "Final evaluation: none (in check)";
@@ -1203,44 +1162,53 @@ std::string Eval::trace(const Position& pos) {
 
   std::memset(scores, 0, sizeof(scores));
 
-  pos.this_thread()->contempt = SCORE_ZERO; // Reset any dynamic contempt
+  pos.this_thread()->trend = SCORE_ZERO; // Reset any dynamic contempt
 
   v = Evaluation<TRACE>(pos).value();
 
   ss << std::showpoint << std::noshowpos << std::fixed << std::setprecision(2)
-     << "     Term    |    White    |    Black    |    Total   \n"
-     << "             |   MG    EG  |   MG    EG  |   MG    EG \n"
-     << " ------------+-------------+-------------+------------\n"
-     << "    Material | " << Term(MATERIAL)
-     << "   Imbalance | " << Term(IMBALANCE)
-     << "       Pawns | " << Term(PAWN)
-     << "     Knights | " << Term(KNIGHT)
-     << "     Bishops | " << Term(BISHOP)
-     << "       Rooks | " << Term(ROOK)
-     << "      Queens | " << Term(QUEEN)
-     << "    Mobility | " << Term(MOBILITY)
-     << " King safety | " << Term(KING)
-     << "     Threats | " << Term(THREAT)
-     << "      Passed | " << Term(PASSED)
-     << "       Space | " << Term(SPACE)
-     << "    Winnable | " << Term(WINNABLE)
-     << " ------------+-------------+-------------+------------\n"
-     << "       Total | " << Term(TOTAL);
-
-  v = pos.side_to_move() == WHITE ? v : -v;
-
-  ss << "\nClassical evaluation: " << to_cp(v) << " (white side)\n";
+     << " Contributing terms for the classical eval:\n"
+     << "+------------+-------------+-------------+-------------+\n"
+     << "|    Term    |    White    |    Black    |    Total    |\n"
+     << "|            |   MG    EG  |   MG    EG  |   MG    EG  |\n"
+     << "+------------+-------------+-------------+-------------+\n"
+     << "|   Material | " << Term(MATERIAL)
+     << "|  Imbalance | " << Term(IMBALANCE)
+     << "|      Pawns | " << Term(PAWN)
+     << "|    Knights | " << Term(KNIGHT)
+     << "|    Bishops | " << Term(BISHOP)
+     << "|      Rooks | " << Term(ROOK)
+     << "|     Queens | " << Term(QUEEN)
+     << "|   Mobility | " << Term(MOBILITY)
+     << "|King safety | " << Term(KING)
+     << "|    Threats | " << Term(THREAT)
+     << "|     Passed | " << Term(PASSED)
+     << "|      Space | " << Term(SPACE)
+     << "|   Winnable | " << Term(WINNABLE)
+     << "+------------+-------------+-------------+-------------+\n"
+     << "|      Total | " << Term(TOTAL)
+     << "+------------+-------------+-------------+-------------+\n";
 
+  if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
+      ss << '\n' << NNUE::trace(pos) << '\n';
+
+  ss << std::showpoint << std::showpos << std::fixed << std::setprecision(2) << std::setw(15);
+
+  v = pos.side_to_move() == WHITE ? v : -v;
+  ss << "\nClassical evaluation   " << to_cp(v) << " (white side)\n";
   if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
   {
-      v = NNUE::evaluate(pos);
+      v = NNUE::evaluate(pos, false);
       v = pos.side_to_move() == WHITE ? v : -v;
-      ss << "\nNNUE evaluation:      " << to_cp(v) << " (white side)\n";
+      ss << "NNUE evaluation        " << to_cp(v) << " (white side)\n";
   }
 
   v = evaluate(pos);
   v = pos.side_to_move() == WHITE ? v : -v;
-  ss << "\nFinal evaluation:     " << to_cp(v) << " (white side)\n";
+  ss << "Final evaluation       " << to_cp(v) << " (white side)";
+  if (NNUE::useNNUE != NNUE::UseNNUEMode::False)
+     ss << " [with scaled NNUE, hybrid, ...]";
+  ss << "\n";
 
   return ss.str();
 }
diff --git a/src/evaluate.h b/src/evaluate.h
index fa16a93d..8c91b807 100644
--- a/src/evaluate.h
+++ b/src/evaluate.h
@@ -30,13 +30,13 @@ class Position;
 
 namespace Eval {
 
-  std::string trace(const Position& pos);
+  std::string trace(Position& pos);
   Value evaluate(const Position& pos);
 
   // The default net name MUST follow the format nn-[SHA256 first 12 digits].nnue
   // for the build process (profile-build and fishtest) to work. Do not change the
   // name of the macro, as it is used in the Makefile.
-  #define EvalFileDefaultName   "nn-7756374aaed3.nnue"
+  #define EvalFileDefaultName   "nn-e8321e467bf6.nnue"
 
   namespace NNUE {
     enum struct UseNNUEMode
@@ -49,13 +49,17 @@ namespace Eval {
     extern UseNNUEMode useNNUE;
     extern std::string eval_file_loaded;
 
+    std::string trace(Position& pos);
     Value evaluate(const Position& pos, bool adjusted = false);
+
+    void init();
+    void verify();
+
     bool load_eval(std::string name, std::istream& stream);
     bool save_eval(std::ostream& stream);
-    void init();
-    void export_net(const std::optional<std::string>& filename);
-    void verify();
-  }
+    bool save_eval(const std::optional<std::string>& filename);
+
+  } // namespace NNUE
 
 } // namespace Eval
 
diff --git a/src/misc.cpp b/src/misc.cpp
index 7a5559ce..d17e5c7e 100644
--- a/src/misc.cpp
+++ b/src/misc.cpp
@@ -380,6 +380,7 @@ void std_aligned_free(void* ptr) {
 static void* aligned_large_pages_alloc_windows(size_t allocSize) {
 
   #if !defined(_WIN64)
+    (void)allocSize; // suppress unused-parameter compiler warning
     return nullptr;
   #else
 
diff --git a/src/misc.h b/src/misc.h
index 1a574c58..99b8c3bb 100644
--- a/src/misc.h
+++ b/src/misc.h
@@ -89,9 +89,10 @@ std::ostream& operator<<(std::ostream&, SyncCout);
 #define sync_cout std::cout << IO_LOCK
 #define sync_endl std::endl << IO_UNLOCK
 
-// `ptr` must point to an array of size at least
-// `sizeof(T) * N + alignment` bytes, where `N` is the
-// number of elements in the array.
+
+// align_ptr_up() : get the first aligned element of an array.
+// ptr must point to an array of size at least `sizeof(T) * N + alignment` bytes,
+// where N is the number of elements in the array.
 template <uintptr_t Alignment, typename T>
 T* align_ptr_up(T* ptr)
 {
@@ -101,6 +102,12 @@ T* align_ptr_up(T* ptr)
   return reinterpret_cast<T*>(reinterpret_cast<char*>((ptrint + (Alignment - 1)) / Alignment * Alignment));
 }
 
+
+// IsLittleEndian : true if and only if the binary is compiled for a little-endian target
+static inline const union { uint32_t i; char c[4]; } Le = { 0x01020304 };
+static inline const bool IsLittleEndian = (Le.c[0] == 4);
+
+
 template <typename T>
 class ValueListInserter {
 public:
diff --git a/src/movegen.cpp b/src/movegen.cpp
index bb81aeac..5f3ba90a 100644
--- a/src/movegen.cpp
+++ b/src/movegen.cpp
@@ -26,21 +26,16 @@ namespace Stockfish {
 namespace {
 
   template<GenType Type, Direction D>
-  ExtMove* make_promotions(ExtMove* moveList, Square to, Square ksq) {
+  ExtMove* make_promotions(ExtMove* moveList, Square to) {
 
     if (Type == CAPTURES || Type == EVASIONS || Type == NON_EVASIONS)
-    {
         *moveList++ = make<PROMOTION>(to - D, to, QUEEN);
-        if (attacks_bb<KNIGHT>(to) & ksq)
-            *moveList++ = make<PROMOTION>(to - D, to, KNIGHT);
-    }
 
     if (Type == QUIETS || Type == EVASIONS || Type == NON_EVASIONS)
     {
         *moveList++ = make<PROMOTION>(to - D, to, ROOK);
         *moveList++ = make<PROMOTION>(to - D, to, BISHOP);
-        if (!(attacks_bb<KNIGHT>(to) & ksq))
-            *moveList++ = make<PROMOTION>(to - D, to, KNIGHT);
+        *moveList++ = make<PROMOTION>(to - D, to, KNIGHT);
     }
 
     return moveList;
@@ -57,7 +52,6 @@ namespace {
     constexpr Direction UpRight  = (Us == WHITE ? NORTH_EAST : SOUTH_WEST);
     constexpr Direction UpLeft   = (Us == WHITE ? NORTH_WEST : SOUTH_EAST);
 
-    const Square ksq = pos.square<KING>(Them);
     const Bitboard emptySquares = Type == QUIETS || Type == QUIET_CHECKS ? target : ~pos.pieces();
     const Bitboard enemies      = Type == EVASIONS ? pos.checkers()
                                 : Type == CAPTURES ? target : pos.pieces(Them);
@@ -82,6 +76,7 @@ namespace {
             // To make a quiet check, you either make a direct check by pushing a pawn
             // or push a blocker pawn that is not on the same file as the enemy king.
             // Discovered check promotion has been already generated amongst the captures.
+            Square ksq = pos.square<KING>(Them);
             Bitboard dcCandidatePawns = pos.blockers_for_king(Them) & ~file_bb(ksq);
             b1 &= pawn_attacks_bb(Them, ksq) | shift<   Up>(dcCandidatePawns);
             b2 &= pawn_attacks_bb(Them, ksq) | shift<Up+Up>(dcCandidatePawns);
@@ -111,13 +106,13 @@ namespace {
             b3 &= target;
 
         while (b1)
-            moveList = make_promotions<Type, UpRight>(moveList, pop_lsb(b1), ksq);
+            moveList = make_promotions<Type, UpRight>(moveList, pop_lsb(b1));
 
         while (b2)
-            moveList = make_promotions<Type, UpLeft >(moveList, pop_lsb(b2), ksq);
+            moveList = make_promotions<Type, UpLeft >(moveList, pop_lsb(b2));
 
         while (b3)
-            moveList = make_promotions<Type, Up     >(moveList, pop_lsb(b3), ksq);
+            moveList = make_promotions<Type, Up     >(moveList, pop_lsb(b3));
     }
 
     // Standard and en passant captures
@@ -206,6 +201,7 @@ namespace {
         moveList = generate_moves<Us,   ROOK, Checks>(pos, moveList, target);
         moveList = generate_moves<Us,  QUEEN, Checks>(pos, moveList, target);
     }
+
     if (!Checks || pos.blockers_for_king(~Us) & ksq)
     {
         Bitboard b = attacks_bb<KING>(ksq) & (Type == EVASIONS ? ~pos.pieces(Us) : target);
@@ -227,10 +223,10 @@ namespace {
 } // namespace
 
 
-/// <CAPTURES>     Generates all pseudo-legal captures plus queen and checking knight promotions
-/// <QUIETS>       Generates all pseudo-legal non-captures and underpromotions (except checking knight)
+/// <CAPTURES>     Generates all pseudo-legal captures plus queen promotions
+/// <QUIETS>       Generates all pseudo-legal non-captures and underpromotions
 /// <EVASIONS>     Generates all pseudo-legal check evasions when the side to move is in check
-/// <QUIET_CHECKS> Generates all pseudo-legal non-captures giving check, except castling
+/// <QUIET_CHECKS> Generates all pseudo-legal non-captures giving check, except castling and promotions
 /// <NON_EVASIONS> Generates all pseudo-legal captures and non-captures
 ///
 /// Returns a pointer to the end of the move list.
diff --git a/src/movepick.cpp b/src/movepick.cpp
index 4ff4cff4..20640fe2 100644
--- a/src/movepick.cpp
+++ b/src/movepick.cpp
@@ -111,7 +111,7 @@ void MovePicker::score() {
                    +     (*continuationHistory[1])[pos.moved_piece(m)][to_sq(m)]
                    +     (*continuationHistory[3])[pos.moved_piece(m)][to_sq(m)]
                    +     (*continuationHistory[5])[pos.moved_piece(m)][to_sq(m)]
-                   + (ply < MAX_LPH ? std::min(4, depth / 3) * (*lowPlyHistory)[ply][from_to(m)] : 0);
+                   + (ply < MAX_LPH ? 6 * (*lowPlyHistory)[ply][from_to(m)] : 0);
 
       else // Type == EVASIONS
       {
diff --git a/src/nnue/evaluate_nnue.cpp b/src/nnue/evaluate_nnue.cpp
index cee77fe9..891f8faa 100644
--- a/src/nnue/evaluate_nnue.cpp
+++ b/src/nnue/evaluate_nnue.cpp
@@ -20,6 +20,9 @@
 
 #include <iostream>
 #include <set>
+#include <sstream>
+#include <iomanip>
+#include <fstream>
 
 #include "../evaluate.h"
 #include "../position.h"
@@ -158,29 +161,214 @@ namespace Stockfish::Eval::NNUE {
     ASSERT_ALIGNED(buffer, alignment);
 
     const std::size_t bucket = (pos.count<ALL_PIECES>() - 1) / 4;
-    const auto [psqt, lazy] = featureTransformer->transform(pos, transformedFeatures, bucket);
+    const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket);
+    const auto output = network[bucket]->propagate(transformedFeatures, buffer);
 
-    if (lazy)
-      return static_cast<Value>(psqt / OutputScale);
-    else
-    {
+    int materialist = psqt;
+    int positional  = output[0];
+
+    int delta_npm = abs(pos.non_pawn_material(WHITE) - pos.non_pawn_material(BLACK));
+    int entertainment = (adjusted && delta_npm <= BishopValueMg - KnightValueMg ? 7 : 0);
+
+    int A = 128 - entertainment;
+    int B = 128 + entertainment;
+
+    int sum = (A * materialist + B * positional) / 128;
+
+    return static_cast<Value>( sum / OutputScale );
+  }
+
+  struct NnueEvalTrace {
+    static_assert(LayerStacks == PSQTBuckets);
+
+    Value psqt[LayerStacks];
+    Value positional[LayerStacks];
+    std::size_t correctBucket;
+  };
+
+  static NnueEvalTrace trace_evaluate(const Position& pos) {
+
+    // We manually align the arrays on the stack because with gcc < 9.3
+    // overaligning stack variables with alignas() doesn't work correctly.
+
+    constexpr uint64_t alignment = CacheLineSize;
+
+#if defined(ALIGNAS_ON_STACK_VARIABLES_BROKEN)
+    TransformedFeatureType transformedFeaturesUnaligned[
+      FeatureTransformer::BufferSize + alignment / sizeof(TransformedFeatureType)];
+    char bufferUnaligned[Network::BufferSize + alignment];
+
+    auto* transformedFeatures = align_ptr_up<alignment>(&transformedFeaturesUnaligned[0]);
+    auto* buffer = align_ptr_up<alignment>(&bufferUnaligned[0]);
+#else
+    alignas(alignment)
+      TransformedFeatureType transformedFeatures[FeatureTransformer::BufferSize];
+    alignas(alignment) char buffer[Network::BufferSize];
+#endif
+
+    ASSERT_ALIGNED(transformedFeatures, alignment);
+    ASSERT_ALIGNED(buffer, alignment);
+
+    NnueEvalTrace t{};
+    t.correctBucket = (pos.count<ALL_PIECES>() - 1) / 4;
+    for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket) {
+      const auto psqt = featureTransformer->transform(pos, transformedFeatures, bucket);
       const auto output = network[bucket]->propagate(transformedFeatures, buffer);
 
       int materialist = psqt;
       int positional  = output[0];
 
-      int delta_npm = abs(pos.non_pawn_material(WHITE) - pos.non_pawn_material(BLACK));
-      int entertainment = (adjusted && delta_npm <= BishopValueMg - KnightValueMg ? 7 : 0);
+      t.psqt[bucket] = static_cast<Value>( materialist / OutputScale );
+      t.positional[bucket] = static_cast<Value>( positional / OutputScale );
+    }
 
-      int A = 128 - entertainment;
-      int B = 128 + entertainment;
+    return t;
+  }
 
-      int sum = (A * materialist + B * positional) / 128;
+  static const std::string PieceToChar(" PNBRQK  pnbrqk");
 
-      return static_cast<Value>( sum / OutputScale );
+
+  // format_cp_compact() converts a Value into (centi)pawns and writes it in a buffer.
+  // The buffer must have capacity for at least 5 chars.
+  static void format_cp_compact(Value v, char* buffer) {
+
+    buffer[0] = (v < 0 ? '-' : v > 0 ? '+' : ' ');
+
+    int cp = std::abs(100 * v / PawnValueEg);
+    if (cp >= 10000)
+    {
+        buffer[1] = '0' + cp / 10000; cp %= 10000;
+        buffer[2] = '0' + cp / 1000; cp %= 1000;
+        buffer[3] = '0' + cp / 100; cp %= 100;
+        buffer[4] = ' ';
+    }
+    else if (cp >= 1000)
+    {
+        buffer[1] = '0' + cp / 1000; cp %= 1000;
+        buffer[2] = '0' + cp / 100; cp %= 100;
+        buffer[3] = '.';
+        buffer[4] = '0' + cp / 10;
+    }
+    else
+    {
+        buffer[1] = '0' + cp / 100; cp %= 100;
+        buffer[2] = '.';
+        buffer[3] = '0' + cp / 10; cp %= 10;
+        buffer[4] = '0' + cp / 1;
     }
   }
 
+
+  // format_cp_aligned_dot() converts a Value into (centi)pawns and writes it in a buffer,
+  // always keeping two decimals. The buffer must hold at least 8 chars (sign, 6 chars, NUL).
+  static void format_cp_aligned_dot(Value v, char* buffer) {
+
+    buffer[0] = (v < 0 ? '-' : v > 0 ? '+' : ' ');
+
+    double cp = 1.0 * std::abs(int(v)) / PawnValueEg;
+    snprintf(&buffer[1], 7, "%6.2f", cp); // bounded: never writes past 6 chars + NUL
+  }
+
+
+  // trace() returns a string with the value of each piece on a board,
+  // and a table for (PSQT, Layers) values bucket by bucket.
+
+  std::string trace(Position& pos) {
+
+    std::stringstream ss;
+
+    char board[3*8+1][8*8+2];
+    std::memset(board, ' ', sizeof(board));
+    for (int row = 0; row < 3*8+1; ++row)
+      board[row][8*8+1] = '\0';
+
+    // A lambda to output one box of the board
+    auto writeSquare = [&board](File file, Rank rank, Piece pc, Value value) {
+
+      const int x = ((int)file) * 8;
+      const int y = (7 - (int)rank) * 3;
+      for (int i = 1; i < 8; ++i)
+         board[y][x+i] = board[y+3][x+i] = '-';
+      for (int i = 1; i < 3; ++i)
+         board[y+i][x] = board[y+i][x+8] = '|';
+      board[y][x] = board[y][x+8] = board[y+3][x+8] = board[y+3][x] = '+';
+      if (pc != NO_PIECE)
+        board[y+1][x+4] = PieceToChar[pc];
+      if (value != VALUE_NONE)
+        format_cp_compact(value, &board[y+2][x+2]);
+    };
+
+    // We estimate the value of each piece by doing a differential evaluation from
+    // the current base eval, simulating the removal of the piece from its square.
+    Value base = evaluate(pos);
+    base = pos.side_to_move() == WHITE ? base : -base;
+
+    for (File f = FILE_A; f <= FILE_H; ++f)
+      for (Rank r = RANK_1; r <= RANK_8; ++r)
+      {
+        Square sq = make_square(f, r);
+        Piece pc = pos.piece_on(sq);
+        Value v = VALUE_NONE;
+
+        if (pc != NO_PIECE && type_of(pc) != KING)
+        {
+          auto st = pos.state();
+
+          pos.remove_piece(sq);
+          st->accumulator.computed[WHITE] = false;
+          st->accumulator.computed[BLACK] = false;
+
+          Value eval = evaluate(pos);
+          eval = pos.side_to_move() == WHITE ? eval : -eval;
+          v = base - eval;
+
+          pos.put_piece(pc, sq);
+          st->accumulator.computed[WHITE] = false;
+          st->accumulator.computed[BLACK] = false;
+        }
+
+        writeSquare(f, r, pc, v);
+      }
+
+    ss << " NNUE derived piece values:\n";
+    for (int row = 0; row < 3*8+1; ++row)
+        ss << board[row] << '\n';
+    ss << '\n';
+
+    auto t = trace_evaluate(pos);
+
+    ss << " NNUE network contributions "
+       << (pos.side_to_move() == WHITE ? "(White to move)" : "(Black to move)") << std::endl
+       << "+------------+------------+------------+------------+\n"
+       << "|   Bucket   |  Material  | Positional |   Total    |\n"
+       << "|            |   (PSQT)   |  (Layers)  |            |\n"
+       << "+------------+------------+------------+------------+\n";
+
+    for (std::size_t bucket = 0; bucket < LayerStacks; ++bucket)
+    {
+      char buffer[3][8];
+      std::memset(buffer, '\0', sizeof(buffer));
+
+      format_cp_aligned_dot(t.psqt[bucket], buffer[0]);
+      format_cp_aligned_dot(t.positional[bucket], buffer[1]);
+      format_cp_aligned_dot(t.psqt[bucket] + t.positional[bucket], buffer[2]);
+
+      ss <<  "|  " << bucket    << "        "
+         << " |  " << buffer[0] << "  "
+         << " |  " << buffer[1] << "  "
+         << " |  " << buffer[2] << "  "
+         << " |";
+      if (bucket == t.correctBucket)
+          ss << " <-- this bucket is used";
+      ss << '\n';
+    }
+
+    ss << "+------------+------------+------------+------------+\n";
+
+    return ss.str();
+  }
+
+
   // Load eval, from a file stream or a memory stream
   bool load_eval(std::string name, std::istream& stream) {
 
@@ -198,4 +386,35 @@ namespace Stockfish::Eval::NNUE {
     return write_parameters(stream);
   }
 
+  /// Save eval, to a file given by its name
+  bool save_eval(const std::optional<std::string>& filename) {
+
+    std::string actualFilename;
+    std::string msg;
+
+    if (filename.has_value())
+        actualFilename = filename.value();
+    else
+    {
+        if (eval_file_loaded != EvalFileDefaultName)
+        {
+             msg = "Failed to export a net. A non-embedded net can only be saved if the filename is specified";
+
+             sync_cout << msg << sync_endl;
+             return false;
+        }
+        actualFilename = EvalFileDefaultName;
+    }
+
+    std::ofstream stream(actualFilename, std::ios_base::binary);
+    bool saved = save_eval(stream);
+
+    msg = saved ? "Network saved successfully to " + actualFilename
+                : "Failed to export a net";
+
+    sync_cout << msg << sync_endl;
+    return saved;
+  }
+
+
 } // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/features/half_ka_v2.cpp b/src/nnue/features/half_ka_v2_hm.cpp
similarity index 68%
rename from src/nnue/features/half_ka_v2.cpp
rename to src/nnue/features/half_ka_v2_hm.cpp
index 57f43e50..098a6d60 100644
--- a/src/nnue/features/half_ka_v2.cpp
+++ b/src/nnue/features/half_ka_v2_hm.cpp
@@ -16,31 +16,32 @@
   along with this program.  If not, see <http://www.gnu.org/licenses/>.
 */
 
-//Definition of input features HalfKAv2 of NNUE evaluation function
+//Definition of input features HalfKAv2_hm of NNUE evaluation function
 
-#include "half_ka_v2.h"
+#include "half_ka_v2_hm.h"
 
 #include "../../position.h"
 
 namespace Stockfish::Eval::NNUE::Features {
 
   // Orient a square according to perspective (rotates by 180 for black)
-  inline Square HalfKAv2::orient(Color perspective, Square s) {
-    return Square(int(s) ^ (bool(perspective) * 56));
+  inline Square HalfKAv2_hm::orient(Color perspective, Square s, Square ksq) {
+    return Square(int(s) ^ (bool(perspective) * SQ_A8) ^ ((file_of(ksq) < FILE_E) * SQ_H1));
   }
 
   // Index of a feature for a given king position and another piece on some square
-  inline IndexType HalfKAv2::make_index(Color perspective, Square s, Piece pc, Square ksq) {
-    return IndexType(orient(perspective, s) + PieceSquareIndex[perspective][pc] + PS_NB * ksq);
+  inline IndexType HalfKAv2_hm::make_index(Color perspective, Square s, Piece pc, Square ksq) {
+    Square o_ksq = orient(perspective, ksq, ksq);
+    return IndexType(orient(perspective, s, ksq) + PieceSquareIndex[perspective][pc] + PS_NB * KingBuckets[o_ksq]);
   }
 
   // Get a list of indices for active features
-  void HalfKAv2::append_active_indices(
+  void HalfKAv2_hm::append_active_indices(
     const Position& pos,
     Color perspective,
     ValueListInserter<IndexType> active
   ) {
-    Square ksq = orient(perspective, pos.square<KING>(perspective));
+    Square ksq = pos.square<KING>(perspective);
     Bitboard bb = pos.pieces();
     while (bb)
     {
@@ -52,7 +53,7 @@ namespace Stockfish::Eval::NNUE::Features {
 
   // append_changed_indices() : get a list of indices for recently changed features
 
-  void HalfKAv2::append_changed_indices(
+  void HalfKAv2_hm::append_changed_indices(
     Square ksq,
     StateInfo* st,
     Color perspective,
@@ -60,25 +61,24 @@ namespace Stockfish::Eval::NNUE::Features {
     ValueListInserter<IndexType> added
   ) {
     const auto& dp = st->dirtyPiece;
-    Square oriented_ksq = orient(perspective, ksq);
     for (int i = 0; i < dp.dirty_num; ++i) {
       Piece pc = dp.piece[i];
       if (dp.from[i] != SQ_NONE)
-        removed.push_back(make_index(perspective, dp.from[i], pc, oriented_ksq));
+        removed.push_back(make_index(perspective, dp.from[i], pc, ksq));
       if (dp.to[i] != SQ_NONE)
-        added.push_back(make_index(perspective, dp.to[i], pc, oriented_ksq));
+        added.push_back(make_index(perspective, dp.to[i], pc, ksq));
     }
   }
 
-  int HalfKAv2::update_cost(StateInfo* st) {
+  int HalfKAv2_hm::update_cost(StateInfo* st) {
     return st->dirtyPiece.dirty_num;
   }
 
-  int HalfKAv2::refresh_cost(const Position& pos) {
+  int HalfKAv2_hm::refresh_cost(const Position& pos) {
     return pos.count<ALL_PIECES>();
   }
 
-  bool HalfKAv2::requires_refresh(StateInfo* st, Color perspective) {
+  bool HalfKAv2_hm::requires_refresh(StateInfo* st, Color perspective) {
     return st->dirtyPiece.piece[0] == make_piece(perspective, KING);
   }
 
diff --git a/src/nnue/features/half_ka_v2.h b/src/nnue/features/half_ka_v2_hm.h
similarity index 80%
rename from src/nnue/features/half_ka_v2.h
rename to src/nnue/features/half_ka_v2_hm.h
index e4b2edd9..2c1144f6 100644
--- a/src/nnue/features/half_ka_v2.h
+++ b/src/nnue/features/half_ka_v2_hm.h
@@ -18,8 +18,8 @@
 
 //Definition of input features HalfKP of NNUE evaluation function
 
-#ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
-#define NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
+#ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
+#define NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
 
 #include "../nnue_common.h"
 
@@ -32,9 +32,9 @@ namespace Stockfish {
 
 namespace Stockfish::Eval::NNUE::Features {
 
-  // Feature HalfKAv2: Combination of the position of own king
-  // and the position of pieces
-  class HalfKAv2 {
+  // Feature HalfKAv2_hm: Combination of the position of own king
+  // and the position of pieces. Position is mirrored such that the king is always on files e..h.
+  class HalfKAv2_hm {
 
     // unique number for each piece type on each square
     enum {
@@ -63,21 +63,32 @@ namespace Stockfish::Eval::NNUE::Features {
     };
 
     // Orient a square according to perspective (rotates by 180 for black)
-    static Square orient(Color perspective, Square s);
+    static Square orient(Color perspective, Square s, Square ksq);
 
     // Index of a feature for a given king position and another piece on some square
     static IndexType make_index(Color perspective, Square s, Piece pc, Square ksq);
 
    public:
     // Feature name
-    static constexpr const char* Name = "HalfKAv2(Friend)";
+    static constexpr const char* Name = "HalfKAv2_hm(Friend)";
 
     // Hash value embedded in the evaluation file
-    static constexpr std::uint32_t HashValue = 0x5f234cb8u;
+    static constexpr std::uint32_t HashValue = 0x7f234cb8u;
 
     // Number of feature dimensions
     static constexpr IndexType Dimensions =
-        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB);
+        static_cast<IndexType>(SQUARE_NB) * static_cast<IndexType>(PS_NB) / 2;
+
+    static constexpr int KingBuckets[64] = {
+      -1, -1, -1, -1, 31, 30, 29, 28,
+      -1, -1, -1, -1, 27, 26, 25, 24,
+      -1, -1, -1, -1, 23, 22, 21, 20,
+      -1, -1, -1, -1, 19, 18, 17, 16,
+      -1, -1, -1, -1, 15, 14, 13, 12,
+      -1, -1, -1, -1, 11, 10, 9, 8,
+      -1, -1, -1, -1, 7, 6, 5, 4,
+      -1, -1, -1, -1, 3, 2, 1, 0
+    };
 
     // Maximum number of simultaneously active features.
     static constexpr IndexType MaxActiveDimensions = 32;
@@ -108,4 +119,4 @@ namespace Stockfish::Eval::NNUE::Features {
 
 }  // namespace Stockfish::Eval::NNUE::Features
 
-#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_H_INCLUDED
+#endif // #ifndef NNUE_FEATURES_HALF_KA_V2_HM_H_INCLUDED
diff --git a/src/nnue/layers/affine_transform.h b/src/nnue/layers/affine_transform.h
index 9a3b778e..d1318368 100644
--- a/src/nnue/layers/affine_transform.h
+++ b/src/nnue/layers/affine_transform.h
@@ -46,6 +46,11 @@ namespace Stockfish::Eval::NNUE::Layers {
 #elif defined (USE_SSSE3)
     static constexpr const IndexType OutputSimdWidth = SimdWidth / 4;
 #endif
+#if defined (USE_AVX512)
+    static constexpr const IndexType InputSimdWidth = SimdWidth * 2;
+#elif defined (USE_SSSE3)
+    static constexpr const IndexType InputSimdWidth = SimdWidth;
+#endif
 
     // Size of forward propagation buffer used in this layer
     static constexpr std::size_t SelfBufferSize =
@@ -72,6 +77,15 @@ namespace Stockfish::Eval::NNUE::Layers {
       for (std::size_t i = 0; i < OutputDimensions * PaddedInputDimensions; ++i)
 #if !defined (USE_SSSE3)
         weights[i] = read_little_endian<WeightType>(stream);
+#elif defined (USE_VNNI) || defined (USE_AVX512)
+        if constexpr (OutputDimensions <= 8 && OutputDimensions != 1)
+            weights[i] = read_little_endian<WeightType>(stream);
+        else
+            weights[
+              (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
+              i / PaddedInputDimensions * 4 +
+              i % 4
+            ] = read_little_endian<WeightType>(stream);
 #else
         weights[
           (i / 4) % (PaddedInputDimensions / 4) * OutputDimensions * 4 +
@@ -108,7 +122,6 @@ namespace Stockfish::Eval::NNUE::Layers {
 
       return !stream.fail();
     }
-
     // Forward propagation
     const OutputType* propagate(
         const TransformedFeatureType* transformedFeatures, char* buffer) const {
@@ -123,6 +136,40 @@ namespace Stockfish::Eval::NNUE::Layers {
         return _mm512_reduce_add_epi32(sum) + bias;
       };
 
+      [[maybe_unused]] auto m512_hadd128x16_interleave = [](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3) -> __m512i {
+
+        __m512i sum01a = _mm512_unpacklo_epi32(sum0, sum1);
+        __m512i sum01b = _mm512_unpackhi_epi32(sum0, sum1);
+
+        __m512i sum23a = _mm512_unpacklo_epi32(sum2, sum3);
+        __m512i sum23b = _mm512_unpackhi_epi32(sum2, sum3);
+
+        __m512i sum01 = _mm512_add_epi32(sum01a, sum01b);
+        __m512i sum23 = _mm512_add_epi32(sum23a, sum23b);
+
+        __m512i sum0123a = _mm512_unpacklo_epi64(sum01, sum23);
+        __m512i sum0123b = _mm512_unpackhi_epi64(sum01, sum23);
+
+        return _mm512_add_epi32(sum0123a, sum0123b);
+      };
+
+      [[maybe_unused]] auto m512_haddx4 = [m512_hadd128x16_interleave](
+        __m512i sum0, __m512i sum1, __m512i sum2, __m512i sum3, __m128i bias) -> __m128i {
+
+        __m512i sum = m512_hadd128x16_interleave(sum0, sum1, sum2, sum3);
+
+        __m256i sum256lo = _mm512_castsi512_si256(sum);
+        __m256i sum256hi = _mm512_extracti64x4_epi64(sum, 1);
+
+        sum256lo = _mm256_add_epi32(sum256lo, sum256hi);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum256lo);
+        __m128i sum128hi = _mm256_extracti128_si256(sum256lo, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+
       [[maybe_unused]] auto m512_add_dpbusd_epi32 = [=](__m512i& acc, __m512i a, __m512i b) {
 #if defined (USE_VNNI)
         acc = _mm512_dpbusd_epi32(acc, a, b);
@@ -133,6 +180,19 @@ namespace Stockfish::Eval::NNUE::Layers {
 #endif
       };
 
+      [[maybe_unused]] auto m512_add_dpbusd_epi32x2 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1) {
+#if defined (USE_VNNI)
+        acc = _mm512_dpbusd_epi32(acc, a0, b0);
+        acc = _mm512_dpbusd_epi32(acc, a1, b1);
+#else
+        __m512i product0 = _mm512_maddubs_epi16(a0, b0);
+        __m512i product1 = _mm512_maddubs_epi16(a1, b1);
+        product0 = _mm512_adds_epi16(product0, product1);
+        product0 = _mm512_madd_epi16(product0, Ones512);
+        acc = _mm512_add_epi32(acc, product0);
+#endif
+      };
+
       [[maybe_unused]] auto m512_add_dpbusd_epi32x4 = [=](__m512i& acc, __m512i a0, __m512i b0, __m512i a1, __m512i b1,
                                                                         __m512i a2, __m512i b2, __m512i a3, __m512i b3) {
 #if defined (USE_VNNI)
@@ -165,6 +225,18 @@ namespace Stockfish::Eval::NNUE::Layers {
         return _mm_cvtsi128_si32(sum128) + bias;
       };
 
+      [[maybe_unused]] auto m256_haddx4 = [](__m256i sum0, __m256i sum1, __m256i sum2, __m256i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm256_hadd_epi32(sum0, sum1);
+        sum2 = _mm256_hadd_epi32(sum2, sum3);
+
+        sum0 = _mm256_hadd_epi32(sum0, sum2);
+
+        __m128i sum128lo = _mm256_castsi256_si128(sum0);
+        __m128i sum128hi = _mm256_extracti128_si256(sum0, 1);
+
+        return _mm_add_epi32(_mm_add_epi32(sum128lo, sum128hi), bias);
+      };
+
       [[maybe_unused]] auto m256_add_dpbusd_epi32 = [=](__m256i& acc, __m256i a, __m256i b) {
 #if defined (USE_VNNI)
         acc = _mm256_dpbusd_epi32(acc, a, b);
@@ -175,6 +247,19 @@ namespace Stockfish::Eval::NNUE::Layers {
 #endif
       };
 
+      [[maybe_unused]] auto m256_add_dpbusd_epi32x2 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1) {
+#if defined (USE_VNNI)
+        acc = _mm256_dpbusd_epi32(acc, a0, b0);
+        acc = _mm256_dpbusd_epi32(acc, a1, b1);
+#else
+        __m256i product0 = _mm256_maddubs_epi16(a0, b0);
+        __m256i product1 = _mm256_maddubs_epi16(a1, b1);
+        product0 = _mm256_adds_epi16(product0, product1);
+        product0 = _mm256_madd_epi16(product0, Ones256);
+        acc = _mm256_add_epi32(acc, product0);
+#endif
+      };
+
       [[maybe_unused]] auto m256_add_dpbusd_epi32x4 = [=](__m256i& acc, __m256i a0, __m256i b0, __m256i a1, __m256i b1,
                                                                         __m256i a2, __m256i b2, __m256i a3, __m256i b3) {
 #if defined (USE_VNNI)
@@ -206,12 +291,27 @@ namespace Stockfish::Eval::NNUE::Layers {
         return _mm_cvtsi128_si32(sum) + bias;
       };
 
+      [[maybe_unused]] auto m128_haddx4 = [](__m128i sum0, __m128i sum1, __m128i sum2, __m128i sum3, __m128i bias) -> __m128i {
+        sum0 = _mm_hadd_epi32(sum0, sum1);
+        sum2 = _mm_hadd_epi32(sum2, sum3);
+        sum0 = _mm_hadd_epi32(sum0, sum2);
+        return _mm_add_epi32(sum0, bias);
+      };
+
       [[maybe_unused]] auto m128_add_dpbusd_epi32 = [=](__m128i& acc, __m128i a, __m128i b) {
         __m128i product0 = _mm_maddubs_epi16(a, b);
         product0 = _mm_madd_epi16(product0, Ones128);
         acc = _mm_add_epi32(acc, product0);
       };
 
+      [[maybe_unused]] auto m128_add_dpbusd_epi32x2 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1) {
+        __m128i product0 = _mm_maddubs_epi16(a0, b0);
+        __m128i product1 = _mm_maddubs_epi16(a1, b1);
+        product0 = _mm_adds_epi16(product0, product1);
+        product0 = _mm_madd_epi16(product0, Ones128);
+        acc = _mm_add_epi32(acc, product0);
+      };
+
       [[maybe_unused]] auto m128_add_dpbusd_epi32x4 = [=](__m128i& acc, __m128i a0, __m128i b0, __m128i a1, __m128i b1,
                                                                         __m128i a2, __m128i b2, __m128i a3, __m128i b3) {
         __m128i product0 = _mm_maddubs_epi16(a0, b0);
@@ -231,45 +331,130 @@ namespace Stockfish::Eval::NNUE::Layers {
       using vec_t = __m512i;
       #define vec_setzero _mm512_setzero_si512
       #define vec_set_32 _mm512_set1_epi32
-      auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32;
-      auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4;
-      auto& vec_hadd = m512_hadd;
+      [[maybe_unused]] auto& vec_add_dpbusd_32 = m512_add_dpbusd_epi32;
+      [[maybe_unused]] auto& vec_add_dpbusd_32x2 = m512_add_dpbusd_epi32x2;
+      [[maybe_unused]] auto& vec_add_dpbusd_32x4 = m512_add_dpbusd_epi32x4;
+      [[maybe_unused]] auto& vec_hadd = m512_hadd;
+      [[maybe_unused]] auto& vec_haddx4 = m512_haddx4;
 #elif defined (USE_AVX2)
       using vec_t = __m256i;
       #define vec_setzero _mm256_setzero_si256
       #define vec_set_32 _mm256_set1_epi32
-      auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32;
-      auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4;
-      auto& vec_hadd = m256_hadd;
+      [[maybe_unused]] auto& vec_add_dpbusd_32 = m256_add_dpbusd_epi32;
+      [[maybe_unused]] auto& vec_add_dpbusd_32x2 = m256_add_dpbusd_epi32x2;
+      [[maybe_unused]] auto& vec_add_dpbusd_32x4 = m256_add_dpbusd_epi32x4;
+      [[maybe_unused]] auto& vec_hadd = m256_hadd;
+      [[maybe_unused]] auto& vec_haddx4 = m256_haddx4;
 #elif defined (USE_SSSE3)
       using vec_t = __m128i;
       #define vec_setzero _mm_setzero_si128
       #define vec_set_32 _mm_set1_epi32
-      auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32;
-      auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4;
-      auto& vec_hadd = m128_hadd;
+      [[maybe_unused]] auto& vec_add_dpbusd_32 = m128_add_dpbusd_epi32;
+      [[maybe_unused]] auto& vec_add_dpbusd_32x2 = m128_add_dpbusd_epi32x2;
+      [[maybe_unused]] auto& vec_add_dpbusd_32x4 = m128_add_dpbusd_epi32x4;
+      [[maybe_unused]] auto& vec_hadd = m128_hadd;
+      [[maybe_unused]] auto& vec_haddx4 = m128_haddx4;
 #endif
 
 #if defined (USE_SSSE3)
-      // Different layout, we process 4 inputs at a time, always.
-      static_assert(InputDimensions % 4 == 0);
-
       const auto output = reinterpret_cast<OutputType*>(buffer);
       const auto inputVector = reinterpret_cast<const vec_t*>(input);
+#endif
 
-      static_assert(OutputDimensions % OutputSimdWidth == 0 || OutputDimensions == 1);
+#if defined (USE_VNNI) || defined (USE_AVX512)
+
+      static_assert(OutputDimensions == 1 || OutputDimensions % 4 == 0);
 
       // OutputDimensions is either 1 or a multiple of SimdWidth
       // because then it is also an input dimension.
-      if constexpr (OutputDimensions % OutputSimdWidth == 0)
+      if constexpr (OutputDimensions <= 8 && OutputDimensions != 1)
       {
-          constexpr IndexType NumChunks = InputDimensions / 4;
+          constexpr IndexType NumChunks = PaddedInputDimensions / InputSimdWidth;
 
+          static_assert(NumChunks % 2 == 0);
+
+          const auto input_vec = reinterpret_cast<const vec_t*>(input);
+          const auto bias_vec = reinterpret_cast<const __m128i*>(biases);
+          auto out_vec = reinterpret_cast<__m128i*>(output);
+
+          vec_t regs[OutputDimensions];
+          for (IndexType k = 0; k < OutputDimensions; ++k)
+            regs[k] = vec_setzero();
+
+          for (IndexType i = 0; i < NumChunks / 2; ++i)
+          {
+              const vec_t in0 = input_vec[i * 2 + 0];
+              const vec_t in1 = input_vec[i * 2 + 1];
+              for (IndexType k = 0; k < OutputDimensions; ++k)
+              {
+                  const vec_t w0 = reinterpret_cast<const vec_t*>(&weights[k * PaddedInputDimensions])[i * 2 + 0];
+                  const vec_t w1 = reinterpret_cast<const vec_t*>(&weights[k * PaddedInputDimensions])[i * 2 + 1];
+                  vec_add_dpbusd_32(regs[k], in0, w0);
+                  vec_add_dpbusd_32(regs[k], in1, w1);
+              }
+          }
+
+          for (IndexType k = 0; k < OutputDimensions / 4; ++k)
+          {
+            out_vec[k] = vec_haddx4(
+              regs[k * 4 + 0],
+              regs[k * 4 + 1],
+              regs[k * 4 + 2],
+              regs[k * 4 + 3],
+              bias_vec[k]
+            );
+          }
+      }
+      else if constexpr (InputDimensions == 8)
+      {
+          const auto input32 = reinterpret_cast<const std::int32_t*>(input);
+          __m256i* outptr = reinterpret_cast<__m256i*>(output);
+          std::memcpy(output, biases, OutputDimensions * sizeof(OutputType));
+
+          const __m256i in0 = _mm256_set1_epi32(input32[0]);
+          const __m256i in1 = _mm256_set1_epi32(input32[1]);
+          const auto col0 = reinterpret_cast<const __m256i*>(&weights[0]);
+          const auto col1 = reinterpret_cast<const __m256i*>(&weights[OutputDimensions * 4]);
+          for (IndexType j = 0; j * 8 < OutputDimensions; ++j)
+              m256_add_dpbusd_epi32x2(outptr[j], in0, col0[j], in1, col1[j]);
+      }
+      else
+
+#elif defined (USE_SSSE3)
+
+      if constexpr (OutputDimensions % OutputSimdWidth == 0 && InputDimensions == 8)
+      {
           const auto input32 = reinterpret_cast<const std::int32_t*>(input);
           vec_t* outptr = reinterpret_cast<vec_t*>(output);
           std::memcpy(output, biases, OutputDimensions * sizeof(OutputType));
 
-          for (int i = 0; i < (int)NumChunks - 3; i += 4)
+          const vec_t in0 = vec_set_32(input32[0]);
+          const vec_t in1 = vec_set_32(input32[1]);
+          const auto col0 = reinterpret_cast<const vec_t*>(&weights[0]);
+          const auto col1 = reinterpret_cast<const vec_t*>(&weights[OutputDimensions * 4]);
+          for (IndexType j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
+              vec_add_dpbusd_32x2(outptr[j], in0, col0[j], in1, col1[j]);
+      }
+      else
+
+#endif
+
+#if defined (USE_SSSE3)
+
+      if constexpr (OutputDimensions % OutputSimdWidth == 0)
+      {
+          static_assert(InputDimensions % 16 == 0);
+
+          constexpr IndexType NumChunks = InputDimensions / 4;
+          constexpr IndexType NumRegs = OutputDimensions / OutputSimdWidth;
+
+          const auto input32 = reinterpret_cast<const std::int32_t*>(input);
+          const vec_t* biasvec = reinterpret_cast<const vec_t*>(biases);
+          vec_t outs[NumRegs];
+          for (IndexType k = 0; k < NumRegs; ++k)
+              outs[k] = biasvec[k];
+
+          for (IndexType i = 0; i < NumChunks; i += 4)
           {
               const vec_t in0 = vec_set_32(input32[i + 0]);
               const vec_t in1 = vec_set_32(input32[i + 1]);
@@ -279,12 +464,18 @@ namespace Stockfish::Eval::NNUE::Layers {
               const auto col1 = reinterpret_cast<const vec_t*>(&weights[(i + 1) * OutputDimensions * 4]);
               const auto col2 = reinterpret_cast<const vec_t*>(&weights[(i + 2) * OutputDimensions * 4]);
               const auto col3 = reinterpret_cast<const vec_t*>(&weights[(i + 3) * OutputDimensions * 4]);
-              for (int j = 0; j * OutputSimdWidth < OutputDimensions; ++j)
-                  vec_add_dpbusd_32x4(outptr[j], in0, col0[j], in1, col1[j], in2, col2[j], in3, col3[j]);
+              for (IndexType k = 0; k < NumRegs; ++k)
+                  vec_add_dpbusd_32x4(outs[k], in0, col0[k], in1, col1[k], in2, col2[k], in3, col3[k]);
           }
+
+          vec_t* outptr = reinterpret_cast<vec_t*>(output);
+          for (IndexType k = 0; k < NumRegs; ++k)
+              outptr[k] = outs[k];
       }
       else if constexpr (OutputDimensions == 1)
       {
+          static_assert(InputDimensions % 4 == 0);
+
 #if defined (USE_AVX512)
           if constexpr (PaddedInputDimensions % (SimdWidth * 2) != 0)
           {
@@ -329,8 +520,8 @@ namespace Stockfish::Eval::NNUE::Layers {
 
 #if defined(USE_SSE2)
       // At least a multiple of 16, with SSE2.
-      static_assert(InputDimensions % SimdWidth == 0);
-      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+      static_assert(PaddedInputDimensions % SimdWidth == 0);
+      constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
       const __m128i Zeros = _mm_setzero_si128();
       const auto inputVector = reinterpret_cast<const __m128i*>(input);
 
@@ -341,8 +532,8 @@ namespace Stockfish::Eval::NNUE::Layers {
       const auto inputVector = reinterpret_cast<const __m64*>(input);
 
 #elif defined(USE_NEON)
-      static_assert(InputDimensions % SimdWidth == 0);
-      constexpr IndexType NumChunks = InputDimensions / SimdWidth;
+      static_assert(PaddedInputDimensions % SimdWidth == 0);
+      constexpr IndexType NumChunks = PaddedInputDimensions / SimdWidth;
       const auto inputVector = reinterpret_cast<const int8x8_t*>(input);
 #endif
 
@@ -415,6 +606,13 @@ namespace Stockfish::Eval::NNUE::Layers {
       _mm_empty();
 #endif
 
+#endif
+
+#if (!defined (USE_SSSE3) && defined (USE_SSE2)) || defined (USE_NEON)
+      static_assert(SimdWidth <= 16, "Otherwise we run outside of the padding for the output.");
+      if constexpr (SimdWidth > OutputDimensions && OutputDimensions != 1)
+          for (IndexType i = OutputDimensions; i < SimdWidth; ++i)
+            output[i] = 0;
 #endif
 
       return output;
diff --git a/src/nnue/nnue_accumulator.h b/src/nnue/nnue_accumulator.h
index e24902c4..d41ecf95 100644
--- a/src/nnue/nnue_accumulator.h
+++ b/src/nnue/nnue_accumulator.h
@@ -25,14 +25,11 @@
 
 namespace Stockfish::Eval::NNUE {
 
-  // The accumulator of a StateInfo without parent is set to the INIT state
-  enum AccumulatorState { EMPTY, COMPUTED, INIT };
-
   // Class that holds the result of affine transformation of input features
   struct alignas(CacheLineSize) Accumulator {
     std::int16_t accumulation[2][TransformedFeatureDimensions];
     std::int32_t psqtAccumulation[2][PSQTBuckets];
-    AccumulatorState state[2];
+    bool computed[2];
   };
 
 }  // namespace Stockfish::Eval::NNUE
diff --git a/src/nnue/nnue_architecture.h b/src/nnue/nnue_architecture.h
index 879a39cd..193a197d 100644
--- a/src/nnue/nnue_architecture.h
+++ b/src/nnue/nnue_architecture.h
@@ -23,7 +23,7 @@
 
 #include "nnue_common.h"
 
-#include "features/half_ka_v2.h"
+#include "features/half_ka_v2_hm.h"
 
 #include "layers/input_slice.h"
 #include "layers/affine_transform.h"
@@ -32,10 +32,10 @@
 namespace Stockfish::Eval::NNUE {
 
   // Input features used in evaluation function
-  using FeatureSet = Features::HalfKAv2;
+  using FeatureSet = Features::HalfKAv2_hm;
 
   // Number of input feature dimensions after conversion
-  constexpr IndexType TransformedFeatureDimensions = 512;
+  constexpr IndexType TransformedFeatureDimensions = 1024;
   constexpr IndexType PSQTBuckets = 8;
   constexpr IndexType LayerStacks = 8;
 
@@ -43,7 +43,7 @@ namespace Stockfish::Eval::NNUE {
 
     // Define network structure
     using InputLayer = InputSlice<TransformedFeatureDimensions * 2>;
-    using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 16>>;
+    using HiddenLayer1 = ClippedReLU<AffineTransform<InputLayer, 8>>;
     using HiddenLayer2 = ClippedReLU<AffineTransform<HiddenLayer1, 32>>;
     using OutputLayer = AffineTransform<HiddenLayer2, 1>;
 
diff --git a/src/nnue/nnue_common.h b/src/nnue/nnue_common.h
index 26f7267f..efc33fb8 100644
--- a/src/nnue/nnue_common.h
+++ b/src/nnue/nnue_common.h
@@ -26,6 +26,8 @@
 #include <cstring>
 #include <iostream>
 
+#include "../misc.h"  // for IsLittleEndian
+
 #if defined(USE_AVX2)
 #include <immintrin.h>
 
@@ -88,37 +90,77 @@ namespace Stockfish::Eval::NNUE {
   // necessary to return a result with the byte ordering of the compiling machine.
   template <typename IntType>
   inline IntType read_little_endian(std::istream& stream) {
-
       IntType result;
-      std::uint8_t u[sizeof(IntType)];
-      typename std::make_unsigned<IntType>::type v = 0;
 
-      stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
-      for (std::size_t i = 0; i < sizeof(IntType); ++i)
-          v = (v << 8) | u[sizeof(IntType) - i - 1];
+      if (IsLittleEndian)
+          stream.read(reinterpret_cast<char*>(&result), sizeof(IntType));
+      else
+      {
+          std::uint8_t u[sizeof(IntType)];
+          typename std::make_unsigned<IntType>::type v = 0;
+
+          stream.read(reinterpret_cast<char*>(u), sizeof(IntType));
+          for (std::size_t i = 0; i < sizeof(IntType); ++i)
+              v = (v << 8) | u[sizeof(IntType) - i - 1];
+
+          std::memcpy(&result, &v, sizeof(IntType));
+      }
 
-      std::memcpy(&result, &v, sizeof(IntType));
       return result;
   }
 
+  // write_little_endian() is our utility to write an integer (signed or unsigned, any size)
+  // to a stream in little-endian order. We swap the byte order before the write if
+  // necessary to always write in little endian order, independently of the byte
+  // ordering of the compiling machine.
   template <typename IntType>
   inline void write_little_endian(std::ostream& stream, IntType value) {
 
-      std::uint8_t u[sizeof(IntType)];
-      typename std::make_unsigned<IntType>::type v = value;
+      if (IsLittleEndian)
+          stream.write(reinterpret_cast<const char*>(&value), sizeof(IntType));
+      else
+      {
+          std::uint8_t u[sizeof(IntType)];
+          typename std::make_unsigned<IntType>::type v = value;
 
-      std::size_t i = 0;
-      // if constexpr to silence the warning about shift by 8
-      if constexpr (sizeof(IntType) > 1) {
-        for (; i + 1 < sizeof(IntType); ++i) {
-            u[i] = v;
-            v >>= 8;
-        }
+          std::size_t i = 0;
+          // if constexpr to silence the warning about shift by 8
+          if constexpr (sizeof(IntType) > 1)
+          {
+            for (; i + 1 < sizeof(IntType); ++i)
+            {
+                u[i] = v;
+                v >>= 8;
+            }
+          }
+          u[i] = v;
+
+          stream.write(reinterpret_cast<char*>(u), sizeof(IntType));
       }
-      u[i] = v;
-
-      stream.write(reinterpret_cast<char*>(u), sizeof(IntType));
   }
+
+  // read_little_endian(s, out, N) : read integers in bulk from a little endian stream.
+  // This reads N integers from stream s and puts them in array out.
+  template <typename IntType>
+  inline void read_little_endian(std::istream& stream, IntType* out, std::size_t count) {
+      if (IsLittleEndian)
+          stream.read(reinterpret_cast<char*>(out), sizeof(IntType) * count);
+      else
+          for (std::size_t i = 0; i < count; ++i)
+              out[i] = read_little_endian<IntType>(stream);
+  }
+
+  // write_little_endian(s, values, N) : write integers in bulk to a little endian stream.
+  // This takes N integers from array values and writes them on stream s.
+  template <typename IntType>
+  inline void write_little_endian(std::ostream& stream, const IntType* values, std::size_t count) {
+      if (IsLittleEndian)
+          stream.write(reinterpret_cast<const char*>(values), sizeof(IntType) * count);
+      else
+          for (std::size_t i = 0; i < count; ++i)
+              write_little_endian<IntType>(stream, values[i]);
+  }
+
 }  // namespace Stockfish::Eval::NNUE
 
 #endif // #ifndef NNUE_COMMON_H_INCLUDED
diff --git a/src/nnue/nnue_feature_transformer.h b/src/nnue/nnue_feature_transformer.h
index 85ab8481..47fe9c06 100644
--- a/src/nnue/nnue_feature_transformer.h
+++ b/src/nnue/nnue_feature_transformer.h
@@ -31,12 +31,17 @@
 
 namespace Stockfish::Eval::NNUE {
 
+  using BiasType       = std::int16_t;
+  using WeightType     = std::int16_t;
+  using PSQTWeightType = std::int32_t;
+
   // If vector instructions are enabled, we update and refresh the
   // accumulator tile by tile such that each tile fits in the CPU's
   // vector registers.
   #define VECTOR
 
-  static_assert(PSQTBuckets == 8, "Assumed by the current choice of constants.");
+  static_assert(PSQTBuckets % 8 == 0,
+    "Per feature PSQT values cannot be processed at granularity lower than 8 at a time.");
 
   #ifdef USE_AVX512
   typedef __m512i vec_t;
@@ -50,8 +55,7 @@ namespace Stockfish::Eval::NNUE {
   #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
   #define vec_zero_psqt() _mm256_setzero_si256()
-  static constexpr IndexType NumRegs = 8; // only 8 are needed
-  static constexpr IndexType NumPsqtRegs = 1;
+  #define NumRegistersSIMD 32
 
   #elif USE_AVX2
   typedef __m256i vec_t;
@@ -65,8 +69,7 @@ namespace Stockfish::Eval::NNUE {
   #define vec_add_psqt_32(a,b) _mm256_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm256_sub_epi32(a,b)
   #define vec_zero_psqt() _mm256_setzero_si256()
-  static constexpr IndexType NumRegs = 16;
-  static constexpr IndexType NumPsqtRegs = 1;
+  #define NumRegistersSIMD 16
 
   #elif USE_SSE2
   typedef __m128i vec_t;
@@ -80,8 +83,7 @@ namespace Stockfish::Eval::NNUE {
   #define vec_add_psqt_32(a,b) _mm_add_epi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm_sub_epi32(a,b)
   #define vec_zero_psqt() _mm_setzero_si128()
-  static constexpr IndexType NumRegs = Is64Bit ? 16 : 8;
-  static constexpr IndexType NumPsqtRegs = 2;
+  #define NumRegistersSIMD (Is64Bit ? 16 : 8)
 
   #elif USE_MMX
   typedef __m64 vec_t;
@@ -95,8 +97,7 @@ namespace Stockfish::Eval::NNUE {
   #define vec_add_psqt_32(a,b) _mm_add_pi32(a,b)
   #define vec_sub_psqt_32(a,b) _mm_sub_pi32(a,b)
   #define vec_zero_psqt() _mm_setzero_si64()
-  static constexpr IndexType NumRegs = 8;
-  static constexpr IndexType NumPsqtRegs = 4;
+  #define NumRegistersSIMD 8
 
   #elif USE_NEON
   typedef int16x8_t vec_t;
@@ -110,14 +111,61 @@ namespace Stockfish::Eval::NNUE {
   #define vec_add_psqt_32(a,b) vaddq_s32(a,b)
   #define vec_sub_psqt_32(a,b) vsubq_s32(a,b)
   #define vec_zero_psqt() psqt_vec_t{0}
-  static constexpr IndexType NumRegs = 16;
-  static constexpr IndexType NumPsqtRegs = 2;
+  #define NumRegistersSIMD 16
 
   #else
   #undef VECTOR
 
   #endif
 
+
+  #ifdef VECTOR
+
+      // Compute optimal SIMD register count for feature transformer accumulation.
+
+      // We use __m* types as template arguments, which causes GCC to emit warnings
+      // about losing some attribute information. This is irrelevant to us as we
+      // only take their size, so the following pragmas are harmless.
+      #pragma GCC diagnostic push
+      #pragma GCC diagnostic ignored "-Wignored-attributes"
+
+      // Registers to use per tile: enough to hold all NumLanes lanes when that fits
+      // in MaxRegisters, otherwise the largest divisor of that count which does fit.
+      template <typename SIMDRegisterType, typename LaneType,
+                int      NumLanes,         int      MaxRegisters>
+      static constexpr int BestRegisterCount()
+      {
+          constexpr auto RegisterSize = sizeof(SIMDRegisterType); // scoped; a #define would leak
+          constexpr auto LaneSize     = sizeof(LaneType);
+
+          static_assert(RegisterSize >= LaneSize);
+          static_assert(MaxRegisters <= NumRegistersSIMD);
+          static_assert(MaxRegisters > 0);
+          static_assert(NumRegistersSIMD > 0);
+          static_assert(RegisterSize % LaneSize == 0);
+          static_assert((NumLanes * LaneSize) % RegisterSize == 0);
+
+          const int ideal = (NumLanes * LaneSize) / RegisterSize;
+          if (ideal <= MaxRegisters)
+            return ideal;
+
+          // Look for the largest divisor of the ideal register count that does not exceed MaxRegisters
+          for (int divisor = MaxRegisters; divisor > 1; --divisor)
+            if (ideal % divisor == 0)
+              return divisor;
+
+          return 1;
+      }
+
+      static constexpr int NumRegs     = BestRegisterCount<vec_t, WeightType, TransformedFeatureDimensions, NumRegistersSIMD>();
+      static constexpr int NumPsqtRegs = BestRegisterCount<psqt_vec_t, PSQTWeightType, PSQTBuckets, NumRegistersSIMD>();
+
+      #pragma GCC diagnostic pop
+
+  #endif
+
+
+
   // Input feature converter
   class FeatureTransformer {
 
@@ -125,8 +173,6 @@ namespace Stockfish::Eval::NNUE {
     // Number of output dimensions for one side
     static constexpr IndexType HalfDimensions = TransformedFeatureDimensions;
 
-    static constexpr int LazyThreshold = 1400;
-
     #ifdef VECTOR
     static constexpr IndexType TileHeight = NumRegs * sizeof(vec_t) / 2;
     static constexpr IndexType PsqtTileHeight = NumPsqtRegs * sizeof(psqt_vec_t) / 4;
@@ -153,26 +199,26 @@ namespace Stockfish::Eval::NNUE {
 
     // Read network parameters
     bool read_parameters(std::istream& stream) {
-      for (std::size_t i = 0; i < HalfDimensions; ++i)
-        biases[i] = read_little_endian<BiasType>(stream);
-      for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i)
-        weights[i] = read_little_endian<WeightType>(stream);
-      for (std::size_t i = 0; i < PSQTBuckets * InputDimensions; ++i)
-        psqtWeights[i] = read_little_endian<PSQTWeightType>(stream);
+
+      read_little_endian<BiasType      >(stream, biases     , HalfDimensions                  );
+      read_little_endian<WeightType    >(stream, weights    , HalfDimensions * InputDimensions);
+      read_little_endian<PSQTWeightType>(stream, psqtWeights, PSQTBuckets    * InputDimensions);
+
       return !stream.fail();
     }
 
     // Write network parameters
     bool write_parameters(std::ostream& stream) const {
-      for (std::size_t i = 0; i < HalfDimensions; ++i)
-        write_little_endian<BiasType>(stream, biases[i]);
-      for (std::size_t i = 0; i < HalfDimensions * InputDimensions; ++i)
-        write_little_endian<WeightType>(stream, weights[i]);
+
+      write_little_endian<BiasType      >(stream, biases     , HalfDimensions                  );
+      write_little_endian<WeightType    >(stream, weights    , HalfDimensions * InputDimensions);
+      write_little_endian<PSQTWeightType>(stream, psqtWeights, PSQTBuckets    * InputDimensions);
+
       return !stream.fail();
     }
 
     // Convert input features
-    std::pair<std::int32_t, bool> transform(const Position& pos, OutputType* output, int bucket) const {
+    std::int32_t transform(const Position& pos, OutputType* output, int bucket) const {
       update_accumulator(pos, WHITE);
       update_accumulator(pos, BLACK);
 
@@ -181,121 +227,144 @@ namespace Stockfish::Eval::NNUE {
       const auto& psqtAccumulation = pos.state()->accumulator.psqtAccumulation;
 
       const auto psqt = (
-            psqtAccumulation[static_cast<int>(perspectives[0])][bucket]
-          - psqtAccumulation[static_cast<int>(perspectives[1])][bucket]
+            psqtAccumulation[perspectives[0]][bucket]
+          - psqtAccumulation[perspectives[1]][bucket]
         ) / 2;
 
-      if (abs(psqt) > LazyThreshold * OutputScale)
-        return { psqt, true };
 
   #if defined(USE_AVX512)
+
       constexpr IndexType NumChunks = HalfDimensions / (SimdWidth * 2);
       static_assert(HalfDimensions % (SimdWidth * 2) == 0);
       const __m512i Control = _mm512_setr_epi64(0, 2, 4, 6, 1, 3, 5, 7);
       const __m512i Zero = _mm512_setzero_si512();
 
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m512i*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m512i sum0 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 0]);
+              __m512i sum1 = _mm512_load_si512(&reinterpret_cast<const __m512i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 1]);
+
+              _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
+                                 _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
+          }
+      }
+      return psqt;
+
   #elif defined(USE_AVX2)
+
       constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       constexpr int Control = 0b11011000;
       const __m256i Zero = _mm256_setzero_si256();
 
-  #elif defined(USE_SSE2)
-      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m256i*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m256i sum0 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 0]);
+              __m256i sum1 = _mm256_load_si256(&reinterpret_cast<const __m256i*>
+                                              (accumulation[perspectives[p]])[j * 2 + 1]);
 
-  #ifdef USE_SSE41
+              _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(
+                                 _mm256_max_epi8(_mm256_packs_epi16(sum0, sum1), Zero), Control));
+          }
+      }
+      return psqt;
+
+  #elif defined(USE_SSE2)
+
+      #ifdef USE_SSE41
+      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       const __m128i Zero = _mm_setzero_si128();
-  #else
+      #else
+      constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       const __m128i k0x80s = _mm_set1_epi8(-128);
-  #endif
+      #endif
+
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m128i*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>
+                                           (accumulation[perspectives[p]])[j * 2 + 0]);
+              __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>
+                                           (accumulation[perspectives[p]])[j * 2 + 1]);
+              const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
+
+              #ifdef USE_SSE41
+              _mm_store_si128(&out[j], _mm_max_epi8(packedbytes, Zero));
+              #else
+              _mm_store_si128(&out[j], _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s));
+              #endif
+          }
+      }
+      return psqt;
 
   #elif defined(USE_MMX)
+
       constexpr IndexType NumChunks = HalfDimensions / SimdWidth;
       const __m64 k0x80s = _mm_set1_pi8(-128);
 
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          auto out = reinterpret_cast<__m64*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              __m64 sum0 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 0]);
+              __m64 sum1 = *(&reinterpret_cast<const __m64*>(accumulation[perspectives[p]])[j * 2 + 1]);
+              const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
+              out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
+          }
+      }
+      _mm_empty();
+      return psqt;
+
   #elif defined(USE_NEON)
+
       constexpr IndexType NumChunks = HalfDimensions / (SimdWidth / 2);
       const int8x8_t Zero = {0};
-  #endif
-
-      for (IndexType p = 0; p < 2; ++p) {
-        const IndexType offset = HalfDimensions * p;
-
-  #if defined(USE_AVX512)
-        auto out = reinterpret_cast<__m512i*>(&output[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j) {
-          __m512i sum0 = _mm512_load_si512(
-              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 0]);
-          __m512i sum1 = _mm512_load_si512(
-              &reinterpret_cast<const __m512i*>(accumulation[perspectives[p]])[j * 2 + 1]);
-          _mm512_store_si512(&out[j], _mm512_permutexvar_epi64(Control,
-              _mm512_max_epi8(_mm512_packs_epi16(sum0, sum1), Zero)));
-        }
-
-  #elif defined(USE_AVX2)
-        auto out = reinterpret_cast<__m256i*>(&output[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j) {
-          __m256i sum0 = _mm256_load_si256(
-              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 0]);
-          __m256i sum1 = _mm256_load_si256(
-              &reinterpret_cast<const __m256i*>(accumulation[perspectives[p]])[j * 2 + 1]);
-          _mm256_store_si256(&out[j], _mm256_permute4x64_epi64(_mm256_max_epi8(
-              _mm256_packs_epi16(sum0, sum1), Zero), Control));
-        }
-
-  #elif defined(USE_SSE2)
-        auto out = reinterpret_cast<__m128i*>(&output[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j) {
-          __m128i sum0 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]])[j * 2 + 0]);
-          __m128i sum1 = _mm_load_si128(&reinterpret_cast<const __m128i*>(
-              accumulation[perspectives[p]])[j * 2 + 1]);
-      const __m128i packedbytes = _mm_packs_epi16(sum0, sum1);
-
-          _mm_store_si128(&out[j],
-
-  #ifdef USE_SSE41
-              _mm_max_epi8(packedbytes, Zero)
-  #else
-              _mm_subs_epi8(_mm_adds_epi8(packedbytes, k0x80s), k0x80s)
-  #endif
-
-          );
-        }
-
-  #elif defined(USE_MMX)
-        auto out = reinterpret_cast<__m64*>(&output[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j) {
-          __m64 sum0 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]])[j * 2 + 0]);
-          __m64 sum1 = *(&reinterpret_cast<const __m64*>(
-              accumulation[perspectives[p]])[j * 2 + 1]);
-          const __m64 packedbytes = _mm_packs_pi16(sum0, sum1);
-          out[j] = _mm_subs_pi8(_mm_adds_pi8(packedbytes, k0x80s), k0x80s);
-        }
-
-  #elif defined(USE_NEON)
-        const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
-        for (IndexType j = 0; j < NumChunks; ++j) {
-          int16x8_t sum = reinterpret_cast<const int16x8_t*>(
-              accumulation[perspectives[p]])[j];
-          out[j] = vmax_s8(vqmovn_s16(sum), Zero);
-        }
-
-  #else
-        for (IndexType j = 0; j < HalfDimensions; ++j) {
-          BiasType sum = accumulation[static_cast<int>(perspectives[p])][j];
-          output[offset + j] = static_cast<OutputType>(
-              std::max<int>(0, std::min<int>(127, sum)));
-        }
-  #endif
 
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          const auto out = reinterpret_cast<int8x8_t*>(&output[offset]);
+          for (IndexType j = 0; j < NumChunks; ++j)
+          {
+              int16x8_t sum = reinterpret_cast<const int16x8_t*>(accumulation[perspectives[p]])[j];
+              out[j] = vmax_s8(vqmovn_s16(sum), Zero);
+          }
       }
-  #if defined(USE_MMX)
-      _mm_empty();
+      return psqt;
+
+  #else
+
+      for (IndexType p = 0; p < 2; ++p)
+      {
+          const IndexType offset = HalfDimensions * p;
+          for (IndexType j = 0; j < HalfDimensions; ++j)
+          {
+              BiasType sum = accumulation[perspectives[p]][j];
+              output[offset + j] = static_cast<OutputType>(std::max<int>(0, std::min<int>(127, sum)));
+          }
+      }
+      return psqt;
+
   #endif
 
-      return { psqt, false };
-    }
+   } // end of function transform()
+
+
 
    private:
     void update_accumulator(const Position& pos, const Color perspective) const {
@@ -317,7 +386,7 @@ namespace Stockfish::Eval::NNUE {
       // of the estimated gain in terms of features to be added/subtracted.
       StateInfo *st = pos.state(), *next = nullptr;
       int gain = FeatureSet::refresh_cost(pos);
-      while (st->accumulator.state[perspective] == EMPTY)
+      while (st->previous && !st->accumulator.computed[perspective])
       {
         // This governs when a full feature refresh is needed and how many
         // updates are better than just one full refresh.
@@ -328,7 +397,7 @@ namespace Stockfish::Eval::NNUE {
         st = st->previous;
       }
 
-      if (st->accumulator.state[perspective] == COMPUTED)
+      if (st->accumulator.computed[perspective])
       {
         if (next == nullptr)
           return;
@@ -346,8 +415,8 @@ namespace Stockfish::Eval::NNUE {
             ksq, st2, perspective, removed[1], added[1]);
 
         // Mark the accumulators as computed.
-        next->accumulator.state[perspective] = COMPUTED;
-        pos.state()->accumulator.state[perspective] = COMPUTED;
+        next->accumulator.computed[perspective] = true;
+        pos.state()->accumulator.computed[perspective] = true;
 
         // Now update the accumulators listed in states_to_update[], where the last element is a sentinel.
         StateInfo *states_to_update[3] =
@@ -467,7 +536,7 @@ namespace Stockfish::Eval::NNUE {
       {
         // Refresh the accumulator
         auto& accumulator = pos.state()->accumulator;
-        accumulator.state[perspective] = COMPUTED;
+        accumulator.computed[perspective] = true;
         IndexList active;
         FeatureSet::append_active_indices(pos, perspective, active);
 
@@ -539,10 +608,6 @@ namespace Stockfish::Eval::NNUE {
   #endif
     }
 
-    using BiasType = std::int16_t;
-    using WeightType = std::int16_t;
-    using PSQTWeightType = std::int32_t;
-
     alignas(CacheLineSize) BiasType biases[HalfDimensions];
     alignas(CacheLineSize) WeightType weights[HalfDimensions * InputDimensions];
     alignas(CacheLineSize) PSQTWeightType psqtWeights[InputDimensions * PSQTBuckets];
diff --git a/src/position.cpp b/src/position.cpp
index b497196d..6bb2edb4 100644
--- a/src/position.cpp
+++ b/src/position.cpp
@@ -256,8 +256,6 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
       set_castling_right(c, rsq);
   }
 
-  set_state(st);
-
   // 4. En passant square.
   // Ignore if square is invalid or not on side to move relative rank 6.
   bool enpassant = false;
@@ -271,24 +269,12 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
       // a) side to move have a pawn threatening epSquare
       // b) there is an enemy pawn in front of epSquare
       // c) there is no piece on epSquare or behind epSquare
-      // d) enemy pawn didn't block a check of its own color by moving forward
       enpassant = pawn_attacks_bb(~sideToMove, st->epSquare) & pieces(sideToMove, PAWN)
                && (pieces(~sideToMove, PAWN) & (st->epSquare + pawn_push(~sideToMove)))
-               && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove))))
-               && (   file_of(square<KING>(sideToMove)) == file_of(st->epSquare)
-                   || !(blockers_for_king(sideToMove) & (st->epSquare + pawn_push(~sideToMove))));
+               && !(pieces() & (st->epSquare | (st->epSquare + pawn_push(sideToMove))));
   }
 
-  // It's necessary for st->previous to be intialized in this way because legality check relies on its existence
-  if (enpassant) {
-      st->previous = new StateInfo();
-      remove_piece(st->epSquare - pawn_push(sideToMove));
-      st->previous->checkersBB = attackers_to(square<KING>(~sideToMove)) & pieces(sideToMove);
-      st->previous->blockersForKing[WHITE] = slider_blockers(pieces(BLACK), square<KING>(WHITE), st->previous->pinners[BLACK]);
-      st->previous->blockersForKing[BLACK] = slider_blockers(pieces(WHITE), square<KING>(BLACK), st->previous->pinners[WHITE]);
-      put_piece(make_piece(~sideToMove, PAWN), st->epSquare - pawn_push(sideToMove));
-  }
-  else
+  if (!enpassant)
       st->epSquare = SQ_NONE;
 
   // 5-6. Halfmove clock and fullmove number
@@ -300,8 +286,7 @@ Position& Position::set(const string& fenStr, bool isChess960, StateInfo* si, Th
 
   chess960 = isChess960;
   thisThread = th;
-  st->accumulator.state[WHITE] = Eval::NNUE::INIT;
-  st->accumulator.state[BLACK] = Eval::NNUE::INIT;
+  set_state(st);
 
   assert(pos_is_ok());
 
@@ -522,11 +507,23 @@ bool Position::legal(Move m) const {
   assert(color_of(moved_piece(m)) == us);
   assert(piece_on(square<KING>(us)) == make_piece(us, KING));
 
-  // st->previous->blockersForKing consider capsq as empty.
-  // If pinned, it has to move along the king ray.
+  // En passant captures are a tricky special case. Because they are rather
+  // uncommon, we do it simply by testing whether the king is attacked after
+  // the move is made.
   if (type_of(m) == EN_PASSANT)
-      return   !(st->previous->blockersForKing[sideToMove] & from)
-            || aligned(from, to, square<KING>(us));
+  {
+      Square ksq = square<KING>(us);
+      Square capsq = to - pawn_push(us);
+      Bitboard occupied = (pieces() ^ from ^ capsq) | to;
+
+      assert(to == ep_square());
+      assert(moved_piece(m) == make_piece(us, PAWN));
+      assert(piece_on(capsq) == make_piece(~us, PAWN));
+      assert(piece_on(to) == NO_PIECE);
+
+      return   !(attacks_bb<  ROOK>(ksq, occupied) & pieces(~us, QUEEN, ROOK))
+            && !(attacks_bb<BISHOP>(ksq, occupied) & pieces(~us, QUEEN, BISHOP));
+  }
 
   // Castling moves generation does not check if the castling path is clear of
   // enemy attacks, it is delayed at a later time: now!
@@ -659,15 +656,18 @@ bool Position::gives_check(Move m) const {
   case PROMOTION:
       return attacks_bb(promotion_type(m), to, pieces() ^ from) & square<KING>(~sideToMove);
 
-  // The double-pushed pawn blocked a check? En Passant will remove the blocker.
-  // The only discovery check that wasn't handle is through capsq and fromsq
-  // So the King must be in the same rank as fromsq to consider this possibility.
-  // st->previous->blockersForKing consider capsq as empty.
+  // En passant capture with check? We have already handled the case
+  // of direct checks and ordinary discovered check, so the only case we
+  // need to handle is the unusual case of a discovered check through
+  // the captured pawn.
   case EN_PASSANT:
-      return st->previous->checkersBB
-          || (   rank_of(square<KING>(~sideToMove)) == rank_of(from)
-              && st->previous->blockersForKing[~sideToMove] & from);
+  {
+      Square capsq = make_square(file_of(to), rank_of(from));
+      Bitboard b = (pieces() ^ from ^ capsq) | to;
 
+      return  (attacks_bb<  ROOK>(square<KING>(~sideToMove), b) & pieces(sideToMove, QUEEN, ROOK))
+            | (attacks_bb<BISHOP>(square<KING>(~sideToMove), b) & pieces(sideToMove, QUEEN, BISHOP));
+  }
   default: //CASTLING
   {
       // Castling is encoded as 'king captures the rook'
@@ -707,8 +707,8 @@ void Position::do_move(Move m, StateInfo& newSt, bool givesCheck) {
   ++st->pliesFromNull;
 
   // Used by NNUE
-  st->accumulator.state[WHITE] = Eval::NNUE::EMPTY;
-  st->accumulator.state[BLACK] = Eval::NNUE::EMPTY;
+  st->accumulator.computed[WHITE] = false;
+  st->accumulator.computed[BLACK] = false;
   auto& dp = st->dirtyPiece;
   dp.dirty_num = 1;
 
@@ -1009,8 +1009,8 @@ void Position::do_null_move(StateInfo& newSt) {
   // Used by NNUE
   st->dirtyPiece.dirty_num = 0;
   st->dirtyPiece.piece[0] = NO_PIECE; // Avoid checks in UpdateAccumulator()
-  st->accumulator.state[WHITE] = Eval::NNUE::EMPTY;
-  st->accumulator.state[BLACK] = Eval::NNUE::EMPTY;
+  st->accumulator.computed[WHITE] = false;
+  st->accumulator.computed[BLACK] = false;
 
   if (st->epSquare != SQ_NONE)
   {
@@ -1086,8 +1086,9 @@ bool Position::see_ge(Move m, Value threshold) const {
   if (swap <= 0)
       return true;
 
+  assert(color_of(piece_on(from)) == sideToMove);
   Bitboard occupied = pieces() ^ from ^ to;
-  Color stm = color_of(piece_on(from));
+  Color stm = sideToMove;
   Bitboard attackers = attackers_to(to, occupied);
   Bitboard stmAttackers, bb;
   int res = 1;
diff --git a/src/position.h b/src/position.h
index c0193a9f..20f999bc 100644
--- a/src/position.h
+++ b/src/position.h
@@ -197,6 +197,9 @@ public:
 
   // Returns the position of the ball on the c side.
   Square king_square(Color c) const { return lsb(pieces(c, KING)); }
+  
+  void put_piece(Piece pc, Square s);
+  void remove_piece(Square s);
 
 private:
   // Initialization helpers (used while setting up a position)
@@ -205,8 +208,6 @@ private:
   void set_check_info(StateInfo* si) const;
 
   // Other helpers
-  void put_piece(Piece pc, Square s);
-  void remove_piece(Square s);
   void move_piece(Square from, Square to);
   template<bool Do>
   void do_castling(Color us, Square from, Square& to, Square& rfrom, Square& rto);
@@ -413,7 +414,7 @@ inline void Position::remove_piece(Square s) {
   byTypeBB[ALL_PIECES] ^= s;
   byTypeBB[type_of(pc)] ^= s;
   byColorBB[color_of(pc)] ^= s;
-  /* board[s] = NO_PIECE;  Not needed, overwritten by the capturing one */
+  board[s] = NO_PIECE;
   pieceCount[pc]--;
   pieceCount[make_piece(color_of(pc), ALL_PIECES)]--;
   psq -= PSQT::psq[pc][s];
diff --git a/src/search.cpp b/src/search.cpp
index be137f33..b4b3303b 100644
--- a/src/search.cpp
+++ b/src/search.cpp
@@ -54,7 +54,7 @@ bool Search::prune_at_shallow_depth = true;
 namespace {
 
   // Different node types, used as a template parameter
-  enum NodeType { NonPV, PV };
+  enum NodeType { NonPV, PV, Root };
 
   constexpr uint64_t TtHitAverageWindow     = 4096;
   constexpr uint64_t TtHitAverageResolution = 1024;
@@ -97,10 +97,10 @@ namespace {
     Move best = MOVE_NONE;
   };
 
-  template <NodeType NT>
+  template <NodeType nodeType>
   Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode);
 
-  template <NodeType NT>
+  template <NodeType nodeType>
   Value qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth = 0);
 
   Value value_to_tt(Value v, int ply);
@@ -147,7 +147,7 @@ namespace {
 void Search::init() {
 
   for (int i = 1; i < MAX_MOVES; ++i)
-      Reductions[i] = int((21.3 + 2 * std::log(Threads.size())) * std::log(i + 0.25 * std::log(i)));
+      Reductions[i] = int(21.9 * std::log(i));
 }
 
 
@@ -248,7 +248,7 @@ void Thread::search() {
   // To allow access to (ss-7) up to (ss+2), the stack must be oversized.
   // The former is needed to allow update_continuation_histories(ss-1, ...),
   // which accesses its argument at ss-6, also near the root.
-  // The latter is needed for statScores and killer initialization.
+  // The latter is needed for statScore and killer initialization.
   Stack stack[MAX_PLY+10], *ss = stack+7;
   Move  pv[MAX_PLY+1];
   Value bestValue, alpha, beta, delta;
@@ -263,6 +263,9 @@ void Thread::search() {
   for (int i = 7; i > 0; i--)
       (ss-i)->continuationHistory = &this->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel
 
+  for (int i = 0; i <= MAX_PLY + 2; ++i)
+      (ss+i)->ply = i;
+
   ss->pv = pv;
 
   bestValue = delta = alpha = -VALUE_INFINITE;
@@ -307,19 +310,7 @@ void Thread::search() {
   multiPV = std::min(multiPV, rootMoves.size());
   ttHitAverage = TtHitAverageWindow * TtHitAverageResolution / 2;
 
-  int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns
-
-  // In analysis mode, adjust contempt in accordance with user preference
-  if (Limits.infinite || Options["UCI_AnalyseMode"])
-      ct =  Options["Analysis Contempt"] == "Off"  ? 0
-          : Options["Analysis Contempt"] == "Both" ? ct
-          : Options["Analysis Contempt"] == "White" && us == BLACK ? -ct
-          : Options["Analysis Contempt"] == "Black" && us == WHITE ? -ct
-          : ct;
-
-  // Evaluation score is from the white point of view
-  contempt = (us == WHITE ?  make_score(ct, ct / 2)
-                          : -make_score(ct, ct / 2));
+  trend = SCORE_ZERO;
 
   int searchAgainCounter = 0;
 
@@ -365,11 +356,11 @@ void Thread::search() {
               alpha = std::max(prev - delta,-VALUE_INFINITE);
               beta  = std::min(prev + delta, VALUE_INFINITE);
 
-              // Adjust contempt based on root move's previousScore (dynamic contempt)
-              int dct = ct + (113 - ct / 2) * prev / (abs(prev) + 147);
+              // Adjust trend based on root move's previousScore (dynamic contempt)
+              int tr = 113 * prev / (abs(prev) + 147);
 
-              contempt = (us == WHITE ?  make_score(dct, dct / 2)
-                                      : -make_score(dct, dct / 2));
+              trend = (us == WHITE ?  make_score(tr, tr / 2)
+                                   : -make_score(tr, tr / 2));
           }
 
           // Start with a small aspiration window and, in the case of a fail
@@ -379,7 +370,7 @@ void Thread::search() {
           while (true)
           {
               Depth adjustedDepth = std::max(1, rootDepth - failedHighCnt - searchAgainCounter);
-              bestValue = Stockfish::search<PV>(rootPos, ss, alpha, beta, adjustedDepth, false);
+              bestValue = Stockfish::search<Root>(rootPos, ss, alpha, beta, adjustedDepth, false);
 
               // Bring the best move to the front. It is critical that sorting
               // is done with a stable algorithm because all the values but the
@@ -475,8 +466,8 @@ void Thread::search() {
               totBestMoveChanges += th->bestMoveChanges;
               th->bestMoveChanges = 0;
           }
-          double bestMoveInstability = 1 + 2 * totBestMoveChanges / Threads.size();
-
+          double bestMoveInstability = 1.073 + std::max(1.0, 2.25 - 9.9 / rootDepth)
+                                              * totBestMoveChanges / Threads.size();
           double totalTime = Time.optimum() * fallingEval * reduction * bestMoveInstability;
 
           // Cap used time in case of a single legal move for a better viewer experience in tournaments
@@ -522,18 +513,18 @@ namespace {
 
   // search<>() is the main search function for both PV and non-PV nodes
 
-  template <NodeType NT>
+  template <NodeType nodeType>
   Value search(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth, bool cutNode) {
 
-    constexpr bool PvNode = NT == PV;
-    const bool rootNode = PvNode && ss->ply == 0;
+    constexpr bool PvNode = nodeType != NonPV;
+    constexpr bool rootNode = nodeType == Root;
     const Depth maxNextDepth = rootNode ? depth : depth + 1;
 
     // Check if we have an upcoming move which draws by repetition, or
     // if the opponent had an alternative move earlier to this position.
-    if (   pos.rule50_count() >= 3
+    if (   !rootNode
+        && pos.rule50_count() >= 3
         && alpha < VALUE_DRAW
-        && !rootNode
         && pos.has_game_cycle(ss->ply))
     {
         alpha = value_draw(pos.this_thread());
@@ -543,7 +534,7 @@ namespace {
 
     // Dive into quiescence search when the depth reaches zero
     if (depth <= 0)
-        return qsearch<NT>(pos, ss, alpha, beta);
+        return qsearch<PvNode ? PV : NonPV>(pos, ss, alpha, beta);
 
     assert(-VALUE_INFINITE <= alpha && alpha < beta && beta <= VALUE_INFINITE);
     assert(PvNode || (alpha == beta - 1));
@@ -559,7 +550,7 @@ namespace {
     Move ttMove, move, excludedMove, bestMove;
     Depth extension, newDepth;
     Value bestValue, value, ttValue, eval, maxValue, probCutBeta;
-    bool formerPv, givesCheck, improving, didLMR, priorCapture;
+    bool givesCheck, improving, didLMR, priorCapture;
     bool captureOrPromotion, doFullDepthSearch, moveCountPruning,
          ttCapture, singularQuietLMR;
     Piece movedPiece;
@@ -605,11 +596,11 @@ namespace {
 
     assert(0 <= ss->ply && ss->ply < MAX_PLY);
 
-    (ss+1)->ply = ss->ply + 1;
-    (ss+1)->ttPv = false;
+    (ss+1)->ttPv         = false;
     (ss+1)->excludedMove = bestMove = MOVE_NONE;
-    (ss+2)->killers[0] = (ss+2)->killers[1] = MOVE_NONE;
-    Square prevSq = to_sq((ss-1)->currentMove);
+    (ss+2)->killers[0]   = (ss+2)->killers[1] = MOVE_NONE;
+    ss->doubleExtensions = (ss-1)->doubleExtensions;
+    Square prevSq        = to_sq((ss-1)->currentMove);
 
     // Initialize statScore to zero for the grandchildren of the current position.
     // So statScore is shared between all grandchildren and only the first grandchild
@@ -630,7 +621,6 @@ namespace {
             : ss->ttHit    ? tte->move() : MOVE_NONE;
     if (!excludedMove)
         ss->ttPv = PvNode || (ss->ttHit && tte->is_pv());
-    formerPv = ss->ttPv && !PvNode;
 
     // Update low ply history for previous move if we are near root and position is or has been in PV
     if (   ss->ttPv
@@ -768,6 +758,7 @@ namespace {
             ss->staticEval = eval = -(ss-1)->staticEval;
 
         // Save static evaluation into transposition table
+        if(!excludedMove)
         tte->save(posKey, VALUE_NONE, ss->ttPv, BOUND_NONE, DEPTH_NONE, MOVE_NONE, eval);
     }
 
@@ -786,7 +777,8 @@ namespace {
                ? ss->staticEval > (ss-4)->staticEval || (ss-4)->staticEval == VALUE_NONE
                : ss->staticEval > (ss-2)->staticEval;
 
-    // Step 7. Futility pruning: child node (~50 Elo)
+    // Step 7. Futility pruning: child node (~50 Elo).
+    // The depth condition is important for mate finding.
     if (   !PvNode
         &&  depth < 9
         &&  eval - futility_margin(depth, improving) >= beta
@@ -915,7 +907,7 @@ namespace {
         && !ttMove)
         depth -= 2;
 
-moves_loop: // When in check, search starts from here
+moves_loop: // When in check, search starts here
 
     ttCapture = ttMove && pos.capture_or_promotion(ttMove);
 
@@ -950,6 +942,7 @@ moves_loop: // When in check, search starts from here
 
     value = bestValue;
     singularQuietLMR = moveCountPruning = false;
+    bool doubleExtension = false;
 
     // Indicate PvNodes that will probably fail low if the node was searched
     // at a depth equal or greater than the current depth, and the result of this search was a fail low.
@@ -998,7 +991,7 @@ moves_loop: // When in check, search starts from here
       // Calculate new depth for this move
       newDepth = depth - 1;
 
-      // Step 13. Pruning at shallow depth (~200 Elo)
+      // Step 13. Pruning at shallow depth (~200 Elo). Depth conditions are important for mate finding.
       if (  !rootNode
           && (PvNode ? prune_at_shallow_depth : true)
           && pos.non_pawn_material(us)
@@ -1027,22 +1020,18 @@ moves_loop: // When in check, search starts from here
           {
               // Continuation history based pruning (~20 Elo)
               if (   lmrDepth < 5
-                  && (*contHist[0])[movedPiece][to_sq(move)] < CounterMovePruneThreshold
-                  && (*contHist[1])[movedPiece][to_sq(move)] < CounterMovePruneThreshold)
+                  && (*contHist[0])[movedPiece][to_sq(move)] < 23 - 23 * depth * depth
+                  && (*contHist[1])[movedPiece][to_sq(move)] < 23 - 23 * depth * depth)
                   continue;
 
               // Futility pruning: parent node (~5 Elo)
-              if (   lmrDepth < 7
-                  && !ss->inCheck
-                  && ss->staticEval + 174 + 157 * lmrDepth <= alpha
-                  &&  (*contHist[0])[movedPiece][to_sq(move)]
-                    + (*contHist[1])[movedPiece][to_sq(move)]
-                    + (*contHist[3])[movedPiece][to_sq(move)]
-                    + (*contHist[5])[movedPiece][to_sq(move)] / 3 < 28255)
+              if (   !ss->inCheck
+                  && lmrDepth < 7
+                  && ss->staticEval + 174 + 157 * lmrDepth <= alpha)
                   continue;
 
               // Prune moves with negative SEE (~20 Elo)
-              if (!pos.see_ge(move, Value(-(30 - std::min(lmrDepth, 18)) * lmrDepth * lmrDepth)))
+              if (!pos.see_ge(move, Value(-21 * lmrDepth * lmrDepth - 21 * lmrDepth)))
                   continue;
           }
       }
@@ -1054,17 +1043,17 @@ moves_loop: // When in check, search starts from here
       // then that move is singular and should be extended. To verify this we do
       // a reduced search on all the other moves but the ttMove and if the
       // result is lower than ttValue minus a margin, then we will extend the ttMove.
-      if (    depth >= 7
+      if (   !rootNode
+          &&  depth >= 7
           &&  move == ttMove
-          && !rootNode
           && !excludedMove // Avoid recursive singular search
        /* &&  ttValue != VALUE_NONE Already implicit in the next condition */
           &&  abs(ttValue) < VALUE_KNOWN_WIN
           && (tte->bound() & BOUND_LOWER)
           &&  tte->depth() >= depth - 3)
       {
-          Value singularBeta = ttValue - ((formerPv + 4) * depth) / 2;
-          Depth singularDepth = (depth - 1 + 3 * formerPv) / 2;
+          Value singularBeta = ttValue - 2 * depth;
+          Depth singularDepth = (depth - 1) / 2;
 
           ss->excludedMove = move;
           value = search<NonPV>(pos, ss, singularBeta - 1, singularBeta, singularDepth, cutNode);
@@ -1074,8 +1063,15 @@ moves_loop: // When in check, search starts from here
           {
               extension = 1;
               singularQuietLMR = !ttCapture;
-              if (!PvNode && value < singularBeta - 93)
+
+              // Avoid search explosion by limiting the number of double extensions to at most 3
+              if (   !PvNode
+                  && value < singularBeta - 93
+                  && ss->doubleExtensions < 3)
+              {
                   extension = 2;
+                  doubleExtension = true;
+              }
           }
 
           // Multi-cut pruning
@@ -1098,9 +1094,14 @@ moves_loop: // When in check, search starts from here
                   return beta;
           }
       }
+      else if (   givesCheck
+               && depth > 6
+               && abs(ss->staticEval) > Value(100))
+          extension = 1;
 
       // Add extension to new depth
       newDepth += extension;
+      ss->doubleExtensions = (ss-1)->doubleExtensions + (extension == 2);
 
       // Speculative prefetch as early as possible
       prefetch(TT.first_entry(pos.key_after(move)));
@@ -1122,12 +1123,15 @@ moves_loop: // When in check, search starts from here
       if (    depth >= 3
           &&  moveCount > 1 + 2 * rootNode
           && (  !captureOrPromotion
-              || cutNode
-              || (!PvNode && !formerPv))
+              || (cutNode && (ss-1)->moveCount > 1)
+              || !ss->ttPv)
           && (!PvNode || ss->ply > 1 || thisThread->id() % 4 != 3))
       {
           Depth r = reduction(improving, depth, moveCount);
 
+          if (PvNode)
+              r--;
+
           // Decrease reduction if the ttHit running average is large (~0 Elo)
           if (thisThread->ttHitAverage > 537 * TtHitAverageResolution * TtHitAverageWindow / 1024)
               r--;
@@ -1140,7 +1144,6 @@ moves_loop: // When in check, search starts from here
 
           // Increase reduction at root and non-PV nodes when the best move does not change frequently
           if (   (rootNode || !PvNode)
-              && thisThread->rootDepth > 10
               && thisThread->bestMoveChanges <= 2)
               r++;
 
@@ -1152,31 +1155,27 @@ moves_loop: // When in check, search starts from here
           if (singularQuietLMR)
               r--;
 
-          if (!captureOrPromotion)
-          {
-              // Increase reduction if ttMove is a capture (~3 Elo)
-              if (ttCapture)
-                  r++;
+          // Increase reduction for cut nodes (~3 Elo)
+          if (cutNode && move != ss->killers[0])
+              r += 2;
 
-              // Increase reduction for cut nodes (~3 Elo)
-              if (cutNode)
-                  r += 2;
+          // Increase reduction if ttMove is a capture (~3 Elo)
+          if (ttCapture)
+              r++;
 
-              ss->statScore =  thisThread->mainHistory[us][from_to(move)]
-                             + (*contHist[0])[movedPiece][to_sq(move)]
-                             + (*contHist[1])[movedPiece][to_sq(move)]
-                             + (*contHist[3])[movedPiece][to_sq(move)]
-                             - 4923;
+          ss->statScore =  thisThread->mainHistory[us][from_to(move)]
+                         + (*contHist[0])[movedPiece][to_sq(move)]
+                         + (*contHist[1])[movedPiece][to_sq(move)]
+                         + (*contHist[3])[movedPiece][to_sq(move)]
+                         - 4923;
 
-              // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
-              if (!ss->inCheck)
-                  r -= ss->statScore / 14721;
-          }
+          // Decrease/increase reduction for moves with a good/bad history (~30 Elo)
+          r -= ss->statScore / 14721;
 
           // In general we want to cap the LMR depth search at newDepth. But if
           // reductions are really negative and movecount is low, we allow this move
-          // to be searched deeper than the first move.
-          Depth d = std::clamp(newDepth - r, 1, newDepth + (r < -1 && moveCount <= 5));
+          // to be searched deeper than the first move, unless ttMove was extended by 2.
+          Depth d = std::clamp(newDepth - r, 1, newDepth + (r < -1 && moveCount <= 5 && !doubleExtension));
 
           value = -search<NonPV>(pos, ss+1, -(alpha+1), -alpha, d, true);
 
@@ -1274,7 +1273,6 @@ moves_loop: // When in check, search starts from here
               else
               {
                   assert(value >= beta); // Fail high
-                  ss->statScore = 0;
                   break;
               }
           }
@@ -1348,10 +1346,11 @@ moves_loop: // When in check, search starts from here
 
   // qsearch() is the quiescence search function, which is called by the main search
   // function with zero depth, or recursively with further decreasing depth per call.
-  template <NodeType NT>
+  template <NodeType nodeType>
   Value qsearch(Position& pos, Stack* ss, Value alpha, Value beta, Depth depth) {
 
-    constexpr bool PvNode = NT == PV;
+    static_assert(nodeType != Root);
+    constexpr bool PvNode = nodeType == PV;
 
     assert(alpha >= -VALUE_INFINITE && alpha < beta && beta <= VALUE_INFINITE);
     assert(PvNode || (alpha == beta - 1));
@@ -1377,7 +1376,6 @@ moves_loop: // When in check, search starts from here
     }
 
     Thread* thisThread = pos.this_thread();
-    (ss+1)->ply = ss->ply + 1;
     bestMove = MOVE_NONE;
     ss->inCheck = pos.checkers();
     moveCount = 0;
@@ -1458,7 +1456,7 @@ moves_loop: // When in check, search starts from here
 
     // Initialize a MovePicker object for the current position, and prepare
     // to search the moves. Because the depth is <= 0 here, only captures,
-    // queen and checking knight promotions, and other checks(only if depth >= DEPTH_QS_CHECKS)
+    // queen promotions, and other checks (only if depth >= DEPTH_QS_CHECKS)
     // will be generated.
     MovePicker mp(pos, ttMove, depth, &thisThread->mainHistory,
                                       &thisThread->captureHistory,
@@ -1470,6 +1468,10 @@ moves_loop: // When in check, search starts from here
     {
       assert(is_ok(move));
 
+      // Check for legality
+      if (!pos.legal(move))
+          continue;
+
       givesCheck = pos.gives_check(move);
       captureOrPromotion = pos.capture_or_promotion(move);
 
@@ -1508,13 +1510,6 @@ moves_loop: // When in check, search starts from here
       // Speculative prefetch as early as possible
       prefetch(TT.first_entry(pos.key_after(move)));
 
-      // Check for legality just before making the move
-      if (!pos.legal(move))
-      {
-          moveCount--;
-          continue;
-      }
-
       ss->currentMove = move;
       ss->continuationHistory = &thisThread->continuationHistory[ss->inCheck]
                                                                 [captureOrPromotion]
@@ -1530,7 +1525,7 @@ moves_loop: // When in check, search starts from here
 
       // Make and search the move
       pos.do_move(move, st, givesCheck);
-      value = -qsearch<NT>(pos, ss+1, -beta, -alpha, depth - 1);
+      value = -qsearch<nodeType>(pos, ss+1, -beta, -alpha, depth - 1);
       pos.undo_move(move);
 
       assert(value > -VALUE_INFINITE && value < VALUE_INFINITE);
@@ -1977,20 +1972,8 @@ namespace Search
 
       // th->clear();
 
-      int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns
-      Color us = pos.side_to_move();
-
-      // In analysis mode, adjust contempt in accordance with user preference
-      if (Limits.infinite || Options["UCI_AnalyseMode"])
-        ct = Options["Analysis Contempt"] == "Off" ? 0
-        : Options["Analysis Contempt"] == "Both" ? ct
-        : Options["Analysis Contempt"] == "White" && us == BLACK ? -ct
-        : Options["Analysis Contempt"] == "Black" && us == WHITE ? -ct
-        : ct;
-
       // Evaluation score is from the white point of view
-      th->contempt = (us == WHITE ? make_score(ct, ct / 2)
-        : -make_score(ct, ct / 2));
+      th->trend = make_score(0, 0);
 
       for (int i = 7; i > 0; i--)
           (ss - i)->continuationHistory = &th->continuationHistory[0][0][NO_PIECE][0]; // Use as a sentinel
@@ -2153,7 +2136,7 @@ namespace Search
         while (true)
         {
           Depth adjustedDepth = std::max(1, rootDepth);
-          bestValue = Stockfish::search<PV>(pos, ss, alpha, beta, adjustedDepth, false);
+          bestValue = Stockfish::search<Root>(pos, ss, alpha, beta, adjustedDepth, false);
 
           stable_sort(rootMoves.begin() + pvIdx, rootMoves.end());
           //my_stable_sort(pos.this_thread()->thread_id(),&rootMoves[0] + pvIdx, rootMoves.size() - pvIdx);
@@ -3093,20 +3076,8 @@ namespace Search
         for (int i = 1; i <= MAX_PLY; ++i)
           (stack + i)->ply = i;
 
-        int ct = int(Options["Contempt"]) * PawnValueEg / 100; // From centipawns
-        Color us = pos.side_to_move();
-
-        // In analysis mode, adjust contempt in accordance with user preference
-        if (Limits.infinite || Options["UCI_AnalyseMode"])
-          ct =   Options["Analysis Contempt"] == "Off"  ? 0
-               : Options["Analysis Contempt"] == "Both" ? ct
-               : Options["Analysis Contempt"] == "White" && us == BLACK ? -ct
-               : Options["Analysis Contempt"] == "Black" && us == WHITE ? -ct
-               : ct;
-
         // Evaluation score is from the white point of view
-        th->contempt = (us == WHITE ?  make_score(ct, ct / 2)
-                                    : -make_score(ct, ct / 2));
+        th->trend = make_score(0, 0);
 
         create_new_root(pos);
 
diff --git a/src/search.h b/src/search.h
index 609f19a7..36bcb18b 100644
--- a/src/search.h
+++ b/src/search.h
@@ -54,6 +54,7 @@ struct Stack {
   bool inCheck;
   bool ttPv;
   bool ttHit;
+  int doubleExtensions;
 };
 
 
diff --git a/src/syzygy/tbprobe.cpp b/src/syzygy/tbprobe.cpp
index 57c9204b..f382edbc 100644
--- a/src/syzygy/tbprobe.cpp
+++ b/src/syzygy/tbprobe.cpp
@@ -105,9 +105,6 @@ template<> inline void swap_endian<uint8_t>(uint8_t&) {}
 
 template<typename T, int LE> T number(void* addr)
 {
-    static const union { uint32_t i; char c[4]; } Le = { 0x01020304 };
-    static const bool IsLittleEndian = (Le.c[0] == 4);
-
     T v;
 
     if ((uintptr_t)addr & (alignof(T) - 1)) // Unaligned pointer (very rare)
@@ -1539,6 +1536,14 @@ bool Tablebases::root_probe(Position& pos, Search::RootMoves& rootMoves) {
             WDLScore wdl = -probe_wdl(pos, &result);
             dtz = dtz_before_zeroing(wdl);
         }
+        else if (pos.is_draw(1))
+        {
+            // In case a root move leads to a draw by repetition or
+            // 50-move rule, we set dtz to zero. Note: since we are
+            // only 1 ply from the root, this must be a true 3-fold
+            // repetition inside the game history.
+            dtz = 0;
+        }
         else
         {
             // Otherwise, take dtz for the new position and correct by 1 ply
@@ -1589,6 +1594,7 @@ bool Tablebases::root_probe_wdl(Position& pos, Search::RootMoves& rootMoves) {
 
     ProbeState result;
     StateInfo st;
+    WDLScore wdl;
 
     bool rule50 = Options["Syzygy50MoveRule"];
 
@@ -1597,7 +1603,10 @@ bool Tablebases::root_probe_wdl(Position& pos, Search::RootMoves& rootMoves) {
     {
         pos.do_move(m.pv[0], st);
 
-        WDLScore wdl = -probe_wdl(pos, &result);
+        if (pos.is_draw(1))
+            wdl = WDLDraw;
+        else
+            wdl = -probe_wdl(pos, &result);
 
         pos.undo_move(m.pv[0]);
 
diff --git a/src/thread.h b/src/thread.h
index 0989f4ba..c0218577 100644
--- a/src/thread.h
+++ b/src/thread.h
@@ -102,7 +102,7 @@ public:
   LowPlyHistory lowPlyHistory;
   CapturePieceToHistory captureHistory;
   ContinuationHistory continuationHistory[2][2];
-  Score contempt;
+  Score trend;
   int failedHighCnt;
   bool rootInTB;
   int Cardinality;
diff --git a/src/tools/sfen_packer.cpp b/src/tools/sfen_packer.cpp
index 8182503c..7a6fb979 100644
--- a/src/tools/sfen_packer.cpp
+++ b/src/tools/sfen_packer.cpp
@@ -260,8 +260,8 @@ namespace Stockfish::Tools {
 
         pos.clear();
         std::memset(si, 0, sizeof(StateInfo));
-        si->accumulator.state[WHITE] = Eval::NNUE::INIT;
-        si->accumulator.state[BLACK] = Eval::NNUE::INIT;
+        si->accumulator.computed[WHITE] = false;
+        si->accumulator.computed[BLACK] = false;
         pos.st = si;
 
         // Active color
diff --git a/src/tools/training_data_generator.cpp b/src/tools/training_data_generator.cpp
index 45781dbb..0c4f8d82 100644
--- a/src/tools/training_data_generator.cpp
+++ b/src/tools/training_data_generator.cpp
@@ -812,10 +812,8 @@ namespace Stockfish::Tools
                 is >> params.seed;
             else if (token == "set_recommended_uci_options")
             {
-                UCI::setoption("Contempt", "0");
                 UCI::setoption("Skill Level", "20");
                 UCI::setoption("UCI_Chess960", "false");
-                UCI::setoption("UCI_AnalyseMode", "false");
                 UCI::setoption("UCI_LimitStrength", "false");
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "true");
diff --git a/src/tools/training_data_generator_nonpv.cpp b/src/tools/training_data_generator_nonpv.cpp
index 278259c6..04bab4a2 100644
--- a/src/tools/training_data_generator_nonpv.cpp
+++ b/src/tools/training_data_generator_nonpv.cpp
@@ -434,10 +434,8 @@ namespace Stockfish::Tools
                 params.smart_fen_skipping = true;
             else if (token == "set_recommended_uci_options")
             {
-                UCI::setoption("Contempt", "0");
                 UCI::setoption("Skill Level", "20");
                 UCI::setoption("UCI_Chess960", "false");
-                UCI::setoption("UCI_AnalyseMode", "false");
                 UCI::setoption("UCI_LimitStrength", "false");
                 UCI::setoption("PruneAtShallowDepth", "false");
                 UCI::setoption("EnableTranspositionTable", "true");
diff --git a/src/uci.cpp b/src/uci.cpp
index 887c8a21..b1d385d0 100644
--- a/src/uci.cpp
+++ b/src/uci.cpp
@@ -200,35 +200,39 @@ namespace {
 
 } // namespace
 
-void UCI::setoption(const std::string& name, const std::string& value)
-{
-    if (Options.count(name))
-        Options[name] = value;
-    else
-        sync_cout << "No such option: " << name << sync_endl;
-}
+namespace UCI {
 
-// The win rate model returns the probability (per mille) of winning given an eval
-// and a game-ply. The model fits rather accurately the LTC fishtest statistics.
-int win_rate_model(Value v, int ply) {
+  void setoption(const std::string& name, const std::string& value)
+  {
+      if (Options.count(name))
+          Options[name] = value;
+      else
+          sync_cout << "No such option: " << name << sync_endl;
+  }
 
-   // The model captures only up to 240 plies, so limit input (and rescale)
-   double m = std::min(240, ply) / 64.0;
+  // The win rate model returns the probability (per mille) of winning given an eval
+  // and a game-ply. The model fits the LTC fishtest statistics rather accurately.
+  int win_rate_model(Value v, int ply) {
 
-   // Coefficients of a 3rd order polynomial fit based on fishtest data
-   // for two parameters needed to transform eval to the argument of a
-   // logistic function.
-   double as[] = {-8.24404295, 64.23892342, -95.73056462, 153.86478679};
-   double bs[] = {-3.37154371, 28.44489198, -56.67657741,  72.05858751};
-   double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
-   double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
+     // The model captures only up to 240 plies, so limit input (and rescale)
+     double m = std::min(240, ply) / 64.0;
 
-   // Transform eval to centipawns with limited range
-   double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+     // Coefficients of a 3rd order polynomial fit based on fishtest data
+     // for two parameters needed to transform eval to the argument of a
+     // logistic function.
+     double as[] = {-3.68389304,  30.07065921, -60.52878723, 149.53378557};
+     double bs[] = {-2.0181857,   15.85685038, -29.83452023,  47.59078827};
+     double a = (((as[0] * m + as[1]) * m + as[2]) * m) + as[3];
+     double b = (((bs[0] * m + bs[1]) * m + bs[2]) * m) + bs[3];
 
-   // Return win rate in per mille (rounded to nearest)
-   return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
-}
+     // Transform eval to centipawns with limited range
+     double x = std::clamp(double(100 * v) / PawnValueEg, -1000.0, 1000.0);
+
+     // Return win rate in per mille (rounded to nearest)
+     return int(0.5 + 1000 / (1 + std::exp((a - x) / b)));
+  }
+
+} // namespace UCI
 
 // --------------------
 // Call qsearch(),search() directly for testing
@@ -348,16 +352,16 @@ void UCI::loop(int argc, char* argv[]) {
       else if (token == "d")        sync_cout << pos << sync_endl;
       else if (token == "eval")     trace_eval(pos);
       else if (token == "compiler") sync_cout << compiler_info() << sync_endl;
-      else if (token == "export_net") {
+      else if (token == "export_net")
+      {
           std::optional<std::string> filename;
           std::string f;
-          if (is >> skipws >> f) {
-            filename = f;
-          }
-          Eval::NNUE::export_net(filename);
+          if (is >> skipws >> f)
+              filename = f;
+          Eval::NNUE::save_eval(filename);
       }
       else if (token == "generate_training_data") Tools::generate_training_data(is);
-      else if (token == "generate_training_data") Tools::generate_training_data_nonpv(is);
+      else if (token == "generate_training_data_nonpv") Tools::generate_training_data_nonpv(is);
       else if (token == "convert") Tools::convert(is);
       else if (token == "validate_training_data") Tools::validate_training_data(is);
       else if (token == "convert_bin") Tools::convert_bin(is);
diff --git a/src/ucioption.cpp b/src/ucioption.cpp
index c42c38c9..5af78ec4 100644
--- a/src/ucioption.cpp
+++ b/src/ucioption.cpp
@@ -68,8 +68,6 @@ void init(OptionsMap& o) {
   constexpr int MaxHashMB = Is64Bit ? 33554432 : 2048;
 
   o["Debug Log File"]        << Option("", on_logger);
-  o["Contempt"]              << Option(24, -100, 100);
-  o["Analysis Contempt"]     << Option("Both var Off var White var Black var Both", "Both");
   o["Threads"]               << Option(1, 1, 512, on_threads);
   o["Hash"]                  << Option(16, 1, MaxHashMB, on_hash_size);
   o["Clear Hash"]            << Option(on_clear_hash);
diff --git a/tests/instrumented.sh b/tests/instrumented.sh
index 518d1087..545fb1c0 100755
--- a/tests/instrumented.sh
+++ b/tests/instrumented.sh
@@ -13,7 +13,7 @@ case $1 in
   --valgrind)
     echo "valgrind testing started"
     prefix=''
-    exeprefix='valgrind --error-exitcode=42'
+    exeprefix='valgrind --error-exitcode=42 --errors-for-leak-kinds=all --leak-check=full'
     postfix='1>/dev/null'
     threads="1"
     bench_depth=5
@@ -110,7 +110,7 @@ cat << EOF > game.exp
  expect "bestmove"
 
  send "position fen 5rk1/1K4p1/8/8/3B4/8/8/8 b - - 0 1\n"
- send "go depth $go_depth\n"
+ send "go depth 10\n"
  expect "bestmove"
 
  send "quit\n"
@@ -192,7 +192,7 @@ cat << EOF > data_generation02.exp
  exit \$value
 EOF
 
-for exp in game.exp data_generation01.exe data_generation02.exp
+for exp in game.exp data_generation01.exp data_generation02.exp
 do
 
   echo "$prefix expect $exp $postfix"